Browse Source

ipn/{ipnlocal,localapi},wgengine{,/magicsock}: plumb health.Tracker

Down to 25 health.Global users. After this remains controlclient &
net/dns & wgengine/router.

Updates #11874
Updates #4136

Change-Id: I6dd1856e3d9bf523bdd44b60fb3b8f7501d5dc0d
Signed-off-by: Brad Fitzpatrick <[email protected]>
Brad Fitzpatrick 1 year ago
parent
commit
6d69fc137f

+ 1 - 0
cmd/derper/depaware.txt

@@ -138,6 +138,7 @@ tailscale.com/cmd/derper dependencies: (generated by github.com/tailscale/depawa
         tailscale.com/types/structs                                  from tailscale.com/ipn+
         tailscale.com/types/tkatype                                  from tailscale.com/client/tailscale+
         tailscale.com/types/views                                    from tailscale.com/ipn+
+        tailscale.com/util/cibuild                                   from tailscale.com/health
         tailscale.com/util/clientmetric                              from tailscale.com/net/netmon+
         tailscale.com/util/cloudenv                                  from tailscale.com/hostinfo+
    W    tailscale.com/util/cmpver                                    from tailscale.com/net/tshttpproxy

+ 1 - 0
cmd/tailscale/depaware.txt

@@ -142,6 +142,7 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep
         tailscale.com/types/structs                                  from tailscale.com/ipn+
         tailscale.com/types/tkatype                                  from tailscale.com/types/key+
         tailscale.com/types/views                                    from tailscale.com/tailcfg+
+        tailscale.com/util/cibuild                                   from tailscale.com/health
         tailscale.com/util/clientmetric                              from tailscale.com/net/netcheck+
         tailscale.com/util/cloudenv                                  from tailscale.com/net/dnscache+
         tailscale.com/util/cmpver                                    from tailscale.com/net/tshttpproxy+

+ 1 - 0
cmd/tailscaled/depaware.txt

@@ -358,6 +358,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
         tailscale.com/types/structs                                  from tailscale.com/control/controlclient+
         tailscale.com/types/tkatype                                  from tailscale.com/tka+
         tailscale.com/types/views                                    from tailscale.com/ipn/ipnlocal+
+        tailscale.com/util/cibuild                                   from tailscale.com/health
         tailscale.com/util/clientmetric                              from tailscale.com/control/controlclient+
         tailscale.com/util/cloudenv                                  from tailscale.com/net/dns/resolver+
         tailscale.com/util/cmpver                                    from tailscale.com/net/dns+

+ 1 - 0
cmd/tailscaled/tailscaled.go

@@ -651,6 +651,7 @@ func tryEngine(logf logger.Logf, sys *tsd.System, name string) (onlyNetstack boo
 	conf := wgengine.Config{
 		ListenPort:    args.port,
 		NetMon:        sys.NetMon.Get(),
+		HealthTracker: sys.HealthTracker(),
 		Dialer:        sys.Dialer.Get(),
 		SetSubsystem:  sys.Set,
 		ControlKnobs:  sys.ControlKnobs(),

+ 7 - 0
health/health.go

@@ -9,6 +9,7 @@ import (
 	"errors"
 	"fmt"
 	"net/http"
+	"os"
 	"runtime"
 	"sort"
 	"sync"
@@ -18,6 +19,7 @@ import (
 	"tailscale.com/envknob"
 	"tailscale.com/tailcfg"
 	"tailscale.com/types/opt"
+	"tailscale.com/util/cibuild"
 	"tailscale.com/util/mak"
 	"tailscale.com/util/multierr"
 	"tailscale.com/util/set"
@@ -152,6 +154,11 @@ func (t *Tracker) nil() bool {
 	if t != nil {
 		return false
 	}
+	if cibuild.On() {
+		stack := make([]byte, 1<<10)
+		stack = stack[:runtime.Stack(stack, false)]
+		fmt.Fprintf(os.Stderr, "## WARNING: (non-fatal) nil health.Tracker (being strict in CI):\n%s\n", stack)
+	}
 	// TODO(bradfitz): open source our "unexpected" package
 	// and use it here to capture samples of stacks where
 	// t is nil.

+ 10 - 0
ipn/ipnlocal/local.go

@@ -327,6 +327,16 @@ type LocalBackend struct {
 	outgoingFiles map[string]*ipn.OutgoingFile
 }
 
+// HealthTracker returns the health tracker for the backend.
+func (b *LocalBackend) HealthTracker() *health.Tracker {
+	return b.health
+}
+
+// NetMon returns the network monitor for the backend.
+func (b *LocalBackend) NetMon() *netmon.Monitor {
+	return b.sys.NetMon.Get()
+}
+
 type updateStatus struct {
 	started bool
 }

+ 5 - 6
ipn/ipnlocal/network-lock.go

@@ -20,7 +20,6 @@ import (
 	"path/filepath"
 	"time"
 
-	"tailscale.com/health"
 	"tailscale.com/health/healthmsg"
 	"tailscale.com/ipn"
 	"tailscale.com/ipn/ipnstate"
@@ -59,11 +58,11 @@ type tkaState struct {
 // b.mu must be held.
 func (b *LocalBackend) tkaFilterNetmapLocked(nm *netmap.NetworkMap) {
 	if b.tka == nil && !b.capTailnetLock {
-		health.Global.SetTKAHealth(nil)
+		b.health.SetTKAHealth(nil)
 		return
 	}
 	if b.tka == nil {
-		health.Global.SetTKAHealth(nil)
+		b.health.SetTKAHealth(nil)
 		return // TKA not enabled.
 	}
 
@@ -117,9 +116,9 @@ func (b *LocalBackend) tkaFilterNetmapLocked(nm *netmap.NetworkMap) {
 
 	// Check that we ourselves are not locked out, report a health issue if so.
 	if nm.SelfNode.Valid() && b.tka.authority.NodeKeyAuthorized(nm.SelfNode.Key(), nm.SelfNode.KeySignature().AsSlice()) != nil {
-		health.Global.SetTKAHealth(errors.New(healthmsg.LockedOut))
+		b.health.SetTKAHealth(errors.New(healthmsg.LockedOut))
 	} else {
-		health.Global.SetTKAHealth(nil)
+		b.health.SetTKAHealth(nil)
 	}
 }
 
@@ -188,7 +187,7 @@ func (b *LocalBackend) tkaSyncIfNeeded(nm *netmap.NetworkMap, prefs ipn.PrefsVie
 				b.logf("Disablement failed, leaving TKA enabled. Error: %v", err)
 			} else {
 				isEnabled = false
-				health.Global.SetTKAHealth(nil)
+				b.health.SetTKAHealth(nil)
 			}
 		} else {
 			return fmt.Errorf("[bug] unreachable invariant of wantEnabled w/ isEnabled")

+ 1 - 1
ipn/ipnserver/server.go

@@ -199,7 +199,7 @@ func (s *Server) serveHTTP(w http.ResponseWriter, r *http.Request) {
 	defer onDone()
 
 	if strings.HasPrefix(r.URL.Path, "/localapi/") {
-		lah := localapi.NewHandler(lb, s.logf, s.netMon, s.backendLogID)
+		lah := localapi.NewHandler(lb, s.logf, s.backendLogID)
 		lah.PermitRead, lah.PermitWrite = s.localAPIPermissions(ci)
 		lah.PermitCert = s.connCanFetchCerts(ci)
 		lah.ConnIdentity = ci

+ 2 - 2
ipn/localapi/debugderp.go

@@ -140,7 +140,7 @@ func (h *Handler) serveDebugDERPRegion(w http.ResponseWriter, r *http.Request) {
 	}
 
 	checkSTUN4 := func(derpNode *tailcfg.DERPNode) {
-		u4, err := nettype.MakePacketListenerWithNetIP(netns.Listener(h.logf, h.netMon)).ListenPacket(ctx, "udp4", ":0")
+		u4, err := nettype.MakePacketListenerWithNetIP(netns.Listener(h.logf, h.b.NetMon())).ListenPacket(ctx, "udp4", ":0")
 		if err != nil {
 			st.Errors = append(st.Errors, fmt.Sprintf("Error creating IPv4 STUN listener: %v", err))
 			return
@@ -249,7 +249,7 @@ func (h *Handler) serveDebugDERPRegion(w http.ResponseWriter, r *http.Request) {
 		serverPubKeys := make(map[key.NodePublic]bool)
 		for i := range 5 {
 			func() {
-				rc := derphttp.NewRegionClient(fakePrivKey, h.logf, h.netMon, func() *tailcfg.DERPRegion {
+				rc := derphttp.NewRegionClient(fakePrivKey, h.logf, h.b.NetMon(), func() *tailcfg.DERPRegion {
 					return &tailcfg.DERPRegion{
 						RegionID:   reg.RegionID,
 						RegionCode: reg.RegionCode,

+ 4 - 6
ipn/localapi/localapi.go

@@ -36,7 +36,6 @@ import (
 	"tailscale.com/clientupdate"
 	"tailscale.com/drive"
 	"tailscale.com/envknob"
-	"tailscale.com/health"
 	"tailscale.com/hostinfo"
 	"tailscale.com/ipn"
 	"tailscale.com/ipn/ipnauth"
@@ -156,8 +155,8 @@ var (
 
 // NewHandler creates a new LocalAPI HTTP handler. All parameters except netMon
 // are required (if non-nil it's used to do faster interface lookups).
-func NewHandler(b *ipnlocal.LocalBackend, logf logger.Logf, netMon *netmon.Monitor, logID logid.PublicID) *Handler {
-	return &Handler{b: b, logf: logf, netMon: netMon, backendLogID: logID, clock: tstime.StdClock{}}
+func NewHandler(b *ipnlocal.LocalBackend, logf logger.Logf, logID logid.PublicID) *Handler {
+	return &Handler{b: b, logf: logf, backendLogID: logID, clock: tstime.StdClock{}}
 }
 
 type Handler struct {
@@ -188,7 +187,6 @@ type Handler struct {
 
 	b            *ipnlocal.LocalBackend
 	logf         logger.Logf
-	netMon       *netmon.Monitor // optional; nil means interfaces will be looked up on-demand
 	backendLogID logid.PublicID
 	clock        tstime.Clock
 }
@@ -358,7 +356,7 @@ func (h *Handler) serveBugReport(w http.ResponseWriter, r *http.Request) {
 	}
 	hi, _ := json.Marshal(hostinfo.New())
 	h.logf("user bugreport hostinfo: %s", hi)
-	if err := health.Global.OverallError(); err != nil {
+	if err := h.b.HealthTracker().OverallError(); err != nil {
 		h.logf("user bugreport health: %s", err.Error())
 	} else {
 		h.logf("user bugreport health: ok")
@@ -748,7 +746,7 @@ func (h *Handler) serveDebugPortmap(w http.ResponseWriter, r *http.Request) {
 	done := make(chan bool, 1)
 
 	var c *portmapper.Client
-	c = portmapper.NewClient(logger.WithPrefix(logf, "portmapper: "), h.netMon, debugKnobs, h.b.ControlKnobs(), func() {
+	c = portmapper.NewClient(logger.WithPrefix(logf, "portmapper: "), h.b.NetMon(), debugKnobs, h.b.ControlKnobs(), func() {
 		logf("portmapping changed.")
 		logf("have mapping: %v", c.HaveMapping())
 

+ 8 - 7
tsnet/tsnet.go

@@ -233,7 +233,7 @@ func (s *Server) Loopback() (addr string, proxyCred, localAPICred string, err er
 		// out the CONNECT code from tailscaled/proxy.go that uses
 		// httputil.ReverseProxy and adding auth support.
 		go func() {
-			lah := localapi.NewHandler(s.lb, s.logf, s.netMon, s.logid)
+			lah := localapi.NewHandler(s.lb, s.logf, s.logid)
 			lah.PermitWrite = true
 			lah.PermitRead = true
 			lah.RequiredPassword = s.localAPICred
@@ -517,11 +517,12 @@ func (s *Server) start() (reterr error) {
 	sys := new(tsd.System)
 	s.dialer = &tsdial.Dialer{Logf: logf} // mutated below (before used)
 	eng, err := wgengine.NewUserspaceEngine(logf, wgengine.Config{
-		ListenPort:   s.Port,
-		NetMon:       s.netMon,
-		Dialer:       s.dialer,
-		SetSubsystem: sys.Set,
-		ControlKnobs: sys.ControlKnobs(),
+		ListenPort:    s.Port,
+		NetMon:        s.netMon,
+		Dialer:        s.dialer,
+		SetSubsystem:  sys.Set,
+		ControlKnobs:  sys.ControlKnobs(),
+		HealthTracker: sys.HealthTracker(),
 	})
 	if err != nil {
 		return err
@@ -606,7 +607,7 @@ func (s *Server) start() (reterr error) {
 	go s.printAuthURLLoop()
 
 	// Run the localapi handler, to allow fetching LetsEncrypt certs.
-	lah := localapi.NewHandler(lb, logf, s.netMon, s.logid)
+	lah := localapi.NewHandler(lb, logf, s.logid)
 	lah.PermitWrite = true
 	lah.PermitRead = true
 

+ 12 - 12
wgengine/magicsock/derp.go

@@ -165,7 +165,7 @@ func (c *Conn) maybeSetNearestDERP(report *netcheck.Report) (preferredDERP int)
 	if testenv.InTest() && !checkControlHealthDuringNearestDERPInTests {
 		connectedToControl = true
 	} else {
-		connectedToControl = health.Global.GetInPollNetMap()
+		connectedToControl = c.health.GetInPollNetMap()
 	}
 	if !connectedToControl {
 		c.mu.Lock()
@@ -201,12 +201,12 @@ func (c *Conn) setNearestDERP(derpNum int) (wantDERP bool) {
 	defer c.mu.Unlock()
 	if !c.wantDerpLocked() {
 		c.myDerp = 0
-		health.Global.SetMagicSockDERPHome(0, c.homeless)
+		c.health.SetMagicSockDERPHome(0, c.homeless)
 		return false
 	}
 	if c.homeless {
 		c.myDerp = 0
-		health.Global.SetMagicSockDERPHome(0, c.homeless)
+		c.health.SetMagicSockDERPHome(0, c.homeless)
 		return false
 	}
 	if derpNum == c.myDerp {
@@ -217,7 +217,7 @@ func (c *Conn) setNearestDERP(derpNum int) (wantDERP bool) {
 		metricDERPHomeChange.Add(1)
 	}
 	c.myDerp = derpNum
-	health.Global.SetMagicSockDERPHome(derpNum, c.homeless)
+	c.health.SetMagicSockDERPHome(derpNum, c.homeless)
 
 	if c.privateKey.IsZero() {
 		// No private key yet, so DERP connections won't come up anyway.
@@ -400,7 +400,7 @@ func (c *Conn) derpWriteChanOfAddr(addr netip.AddrPort, peer key.NodePublic) cha
 		}
 		return derpMap.Regions[regionID]
 	})
-	dc.HealthTracker = health.Global
+	dc.HealthTracker = c.health
 
 	dc.SetCanAckPings(true)
 	dc.NotePreferred(c.myDerp == regionID)
@@ -526,8 +526,8 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d
 		return n
 	}
 
-	defer health.Global.SetDERPRegionConnectedState(regionID, false)
-	defer health.Global.SetDERPRegionHealth(regionID, "")
+	defer c.health.SetDERPRegionConnectedState(regionID, false)
+	defer c.health.SetDERPRegionHealth(regionID, "")
 
 	// peerPresent is the set of senders we know are present on this
 	// connection, based on messages we've received from the server.
@@ -539,7 +539,7 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d
 	for {
 		msg, connGen, err := dc.RecvDetail()
 		if err != nil {
-			health.Global.SetDERPRegionConnectedState(regionID, false)
+			c.health.SetDERPRegionConnectedState(regionID, false)
 			// Forget that all these peers have routes.
 			for peer := range peerPresent {
 				delete(peerPresent, peer)
@@ -577,14 +577,14 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d
 
 		now := time.Now()
 		if lastPacketTime.IsZero() || now.Sub(lastPacketTime) > frameReceiveRecordRate {
-			health.Global.NoteDERPRegionReceivedFrame(regionID)
+			c.health.NoteDERPRegionReceivedFrame(regionID)
 			lastPacketTime = now
 		}
 
 		switch m := msg.(type) {
 		case derp.ServerInfoMessage:
-			health.Global.SetDERPRegionConnectedState(regionID, true)
-			health.Global.SetDERPRegionHealth(regionID, "") // until declared otherwise
+			c.health.SetDERPRegionConnectedState(regionID, true)
+			c.health.SetDERPRegionHealth(regionID, "") // until declared otherwise
 			c.logf("magicsock: derp-%d connected; connGen=%v", regionID, connGen)
 			continue
 		case derp.ReceivedPacket:
@@ -624,7 +624,7 @@ func (c *Conn) runDerpReader(ctx context.Context, derpFakeAddr netip.AddrPort, d
 			}()
 			continue
 		case derp.HealthMessage:
-			health.Global.SetDERPRegionHealth(regionID, m.Problem)
+			c.health.SetDERPRegionHealth(regionID, m.Problem)
 			continue
 		case derp.PeerGoneMessage:
 			switch m.Reason {

+ 10 - 4
wgengine/magicsock/magicsock.go

@@ -91,6 +91,7 @@ type Conn struct {
 	testOnlyPacketListener nettype.PacketListener
 	noteRecvActivity       func(key.NodePublic) // or nil, see Options.NoteRecvActivity
 	netMon                 *netmon.Monitor      // or nil
+	health                 *health.Tracker      // or nil
 	controlKnobs           *controlknobs.Knobs  // or nil
 
 	// ================================================================
@@ -369,9 +370,13 @@ type Options struct {
 	NoteRecvActivity func(key.NodePublic)
 
 	// NetMon is the network monitor to use.
-	// With one, the portmapper won't be used.
+	// If nil, the portmapper won't be used.
 	NetMon *netmon.Monitor
 
+	// HealthTracker optionally specifies the health tracker to
+	// report errors and warnings to.
+	HealthTracker *health.Tracker
+
 	// ControlKnobs are the set of control knobs to use.
 	// If nil, they're ignored and not updated.
 	ControlKnobs *controlknobs.Knobs
@@ -463,6 +468,7 @@ func NewConn(opts Options) (*Conn, error) {
 		c.portMapper.SetGatewayLookupFunc(opts.NetMon.GatewayAndSelfIP)
 	}
 	c.netMon = opts.NetMon
+	c.health = opts.HealthTracker
 	c.onPortUpdate = opts.OnPortUpdate
 	c.getPeerByKey = opts.PeerByKeyFunc
 
@@ -666,7 +672,7 @@ func (c *Conn) updateNetInfo(ctx context.Context) (*netcheck.Report, error) {
 		// NOTE(andrew-d): I don't love that we're depending on the
 		// health package here, but I'd rather do that and not store
 		// the exact same state in two different places.
-		GetLastDERPActivity: health.Global.GetDERPRegionReceivedTime,
+		GetLastDERPActivity: c.health.GetDERPRegionReceivedTime,
 	})
 	if err != nil {
 		return nil, err
@@ -2471,7 +2477,7 @@ func (c *Conn) bindSocket(ruc *RebindingUDPConn, network string, curPortFate cur
 		}
 		ruc.setConnLocked(pconn, network, c.bind.BatchSize())
 		if network == "udp4" {
-			health.Global.SetUDP4Unbound(false)
+			c.health.SetUDP4Unbound(false)
 		}
 		return nil
 	}
@@ -2482,7 +2488,7 @@ func (c *Conn) bindSocket(ruc *RebindingUDPConn, network string, curPortFate cur
 	// we get a link change and we can try binding again.
 	ruc.setConnLocked(newBlockForeverConn(), "", c.bind.BatchSize())
 	if network == "udp4" {
-		health.Global.SetUDP4Unbound(true)
+		c.health.SetUDP4Unbound(true)
 	}
 	return fmt.Errorf("failed to bind any ports (tried %v)", ports)
 }

+ 7 - 5
wgengine/magicsock/magicsock_test.go

@@ -3113,21 +3113,23 @@ func TestMaybeSetNearestDERP(t *testing.T) {
 	}
 	for _, tt := range testCases {
 		t.Run(tt.name, func(t *testing.T) {
+			ht := new(health.Tracker)
 			c := newConn()
 			c.logf = t.Logf
 			c.myDerp = tt.old
 			c.derpMap = derpMap
+			c.health = ht
 
 			report := &netcheck.Report{PreferredDERP: tt.reportDERP}
 
-			oldConnected := health.Global.GetInPollNetMap()
+			oldConnected := ht.GetInPollNetMap()
 			if tt.connectedToControl != oldConnected {
 				if tt.connectedToControl {
-					health.Global.GotStreamedMapResponse()
-					t.Cleanup(health.Global.SetOutOfPollNetMap)
+					ht.GotStreamedMapResponse()
+					t.Cleanup(ht.SetOutOfPollNetMap)
 				} else {
-					health.Global.SetOutOfPollNetMap()
-					t.Cleanup(health.Global.GotStreamedMapResponse)
+					ht.SetOutOfPollNetMap()
+					t.Cleanup(ht.GotStreamedMapResponse)
 				}
 			}
 

+ 9 - 3
wgengine/userspace.go

@@ -98,6 +98,7 @@ type userspaceEngine struct {
 	dns              *dns.Manager
 	magicConn        *magicsock.Conn
 	netMon           *netmon.Monitor
+	health           *health.Tracker
 	netMonOwned      bool                // whether we created netMon (and thus need to close it)
 	netMonUnregister func()              // unsubscribes from changes; used regardless of netMonOwned
 	birdClient       BIRDClient          // or nil
@@ -188,6 +189,9 @@ type Config struct {
 	// If nil, a new network monitor is created.
 	NetMon *netmon.Monitor
 
+	// HealthTracker, if non-nil, is the health tracker to use.
+	HealthTracker *health.Tracker
+
 	// Dialer is the dialer to use for outbound connections.
 	// If nil, a new Dialer is created
 	Dialer *tsdial.Dialer
@@ -310,6 +314,7 @@ func NewUserspaceEngine(logf logger.Logf, conf Config) (_ Engine, reterr error)
 		birdClient:     conf.BIRDClient,
 		controlKnobs:   conf.ControlKnobs,
 		reconfigureVPN: conf.ReconfigureVPN,
+		health:         conf.HealthTracker,
 	}
 
 	if e.birdClient != nil {
@@ -372,6 +377,7 @@ func NewUserspaceEngine(logf logger.Logf, conf Config) (_ Engine, reterr error)
 		IdleFunc:         e.tundev.IdleDuration,
 		NoteRecvActivity: e.noteRecvActivity,
 		NetMon:           e.netMon,
+		HealthTracker:    e.health,
 		ControlKnobs:     conf.ControlKnobs,
 		OnPortUpdate:     onPortUpdate,
 		PeerByKeyFunc:    e.PeerByKey,
@@ -970,7 +976,7 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config,
 		e.logf("wgengine: Reconfig: configuring router")
 		e.networkLogger.ReconfigRoutes(routerCfg)
 		err := e.router.Set(routerCfg)
-		health.Global.SetRouterHealth(err)
+		e.health.SetRouterHealth(err)
 		if err != nil {
 			return err
 		}
@@ -979,7 +985,7 @@ func (e *userspaceEngine) Reconfig(cfg *wgcfg.Config, routerCfg *router.Config,
 		// assigned address.
 		e.logf("wgengine: Reconfig: configuring DNS")
 		err = e.dns.Set(*dnsCfg)
-		health.Global.SetDNSHealth(err)
+		e.health.SetDNSHealth(err)
 		if err != nil {
 			return err
 		}
@@ -1183,7 +1189,7 @@ func (e *userspaceEngine) linkChange(delta *netmon.ChangeDelta) {
 		e.logf("[v1] LinkChange: minor")
 	}
 
-	health.Global.SetAnyInterfaceUp(up)
+	e.health.SetAnyInterfaceUp(up)
 	e.magicConn.SetNetworkUp(up)
 	if !up || changed {
 		if err := e.dns.FlushCaches(); err != nil {