Jelajahi Sumber

net/dns, health: raise health warning for failing forwarded DNS queries (#12888)

updates tailscale/corp#21823

Misconfigured, broken, or blocked DNS will often present as
"internet is broken'" to the end user.  This  plumbs the health tracker
into the dns manager and forwarder and adds a health warning
with a 5 second delay that is raised on failures in the forwarder and
lowered on successes.

Signed-off-by: Jonathan Nobels <[email protected]>
Jonathan Nobels 1 tahun lalu
induk
melakukan
19b0c8a024

+ 4 - 0
health/args.go

@@ -32,4 +32,8 @@ const (
 
 	// ArgServerName provides a Warnable with the hostname of a server involved in the unhealthy state.
 	ArgServerName Arg = "server-name"
+
+	// ArgServerName provides a Warnable with comma delimited list of the hostname of the servers involved in the unhealthy state.
+	// If no nameservers were available to query, this will be an empty string.
+	ArgDNSServers Arg = "dns-servers"
 )

+ 1 - 1
net/dns/manager.go

@@ -82,7 +82,7 @@ func NewManager(logf logger.Logf, oscfg OSConfigurator, health *health.Tracker,
 
 	m := &Manager{
 		logf:     logf,
-		resolver: resolver.New(logf, linkSel, dialer, knobs),
+		resolver: resolver.New(logf, linkSel, dialer, health, knobs),
 		os:       oscfg,
 		health:   health,
 		knobs:    knobs,

+ 3 - 2
net/dns/manager_tcp_test.go

@@ -15,6 +15,7 @@ import (
 
 	"github.com/google/go-cmp/cmp"
 	dns "golang.org/x/net/dns/dnsmessage"
+	"tailscale.com/health"
 	"tailscale.com/net/netmon"
 	"tailscale.com/net/tsdial"
 	"tailscale.com/tstest"
@@ -88,7 +89,7 @@ func TestDNSOverTCP(t *testing.T) {
 			SearchDomains: fqdns("coffee.shop"),
 		},
 	}
-	m := NewManager(t.Logf, &f, nil, tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
+	m := NewManager(t.Logf, &f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
 	m.resolver.TestOnlySetHook(f.SetResolver)
 	m.Set(Config{
 		Hosts: hosts(
@@ -173,7 +174,7 @@ func TestDNSOverTCP_TooLarge(t *testing.T) {
 			SearchDomains: fqdns("coffee.shop"),
 		},
 	}
-	m := NewManager(log, &f, nil, tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
+	m := NewManager(log, &f, new(health.Tracker), tsdial.NewDialer(netmon.NewStatic()), nil, nil, "")
 	m.resolver.TestOnlySetHook(f.SetResolver)
 	m.Set(Config{
 		Hosts:         hosts("andrew.ts.com.", "1.2.3.4"),

+ 31 - 1
net/dns/resolver/forwarder.go

@@ -23,6 +23,7 @@ import (
 	dns "golang.org/x/net/dns/dnsmessage"
 	"tailscale.com/control/controlknobs"
 	"tailscale.com/envknob"
+	"tailscale.com/health"
 	"tailscale.com/net/dns/publicdns"
 	"tailscale.com/net/dnscache"
 	"tailscale.com/net/neterror"
@@ -164,6 +165,23 @@ func clampEDNSSize(packet []byte, maxSize uint16) {
 	binary.BigEndian.PutUint16(opt[3:5], maxSize)
 }
 
+// dnsForwarderFailing should be raised when the forwarder is unable to reach the
+// upstream resolvers. This is a high severity warning as it results in "no internet".
+// This warning must be cleared when the forwarder is working again.
+//
+// We allow for 5 second grace period to ensure this is not raised for spurious errors
+// under the assumption that DNS queries are relatively frequent and a subsequent
+// successful query will clear any one-off errors.
+var dnsForwarderFailing = health.Register(&health.Warnable{
+	Code:                "dns-forward-failing",
+	Title:               "DNS unavailable",
+	Severity:            health.SeverityHigh,
+	DependsOn:           []*health.Warnable{health.NetworkStatusWarnable},
+	Text:                health.StaticMessage("Tailscale can't reach the configured DNS servers. Internet connectivity may be affected."),
+	ImpactsConnectivity: true,
+	TimeToVisible:       5 * time.Second,
+})
+
 type route struct {
 	Suffix    dnsname.FQDN
 	Resolvers []resolverAndDelay
@@ -188,6 +206,7 @@ type forwarder struct {
 	netMon  *netmon.Monitor     // always non-nil
 	linkSel ForwardLinkSelector // TODO(bradfitz): remove this when tsdial.Dialer absorbs it
 	dialer  *tsdial.Dialer
+	health  *health.Tracker // always non-nil
 
 	controlKnobs *controlknobs.Knobs // or nil
 
@@ -219,7 +238,7 @@ type forwarder struct {
 	missingUpstreamRecovery func()
 }
 
-func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *forwarder {
+func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *forwarder {
 	if netMon == nil {
 		panic("nil netMon")
 	}
@@ -228,6 +247,7 @@ func newForwarder(logf logger.Logf, netMon *netmon.Monitor, linkSel ForwardLinkS
 		netMon:                  netMon,
 		linkSel:                 linkSel,
 		dialer:                  dialer,
+		health:                  health,
 		controlKnobs:            knobs,
 		missingUpstreamRecovery: func() {},
 	}
@@ -887,6 +907,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
 		resolvers = f.resolvers(domain)
 		if len(resolvers) == 0 {
 			metricDNSFwdErrorNoUpstream.Add(1)
+			f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: ""})
 			f.logf("no upstream resolvers set, returning SERVFAIL")
 
 			// Attempt to recompile the DNS configuration
@@ -909,6 +930,8 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
 			case responseChan <- res:
 				return nil
 			}
+		} else {
+			f.health.SetHealthy(dnsForwarderFailing)
 		}
 	}
 
@@ -960,6 +983,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
 				return fmt.Errorf("waiting to send response: %w", ctx.Err())
 			case responseChan <- packet{v, query.family, query.addr}:
 				metricDNSFwdSuccess.Add(1)
+				f.health.SetHealthy(dnsForwarderFailing)
 				return nil
 			}
 		case err := <-errc:
@@ -979,6 +1003,11 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
 					case <-ctx.Done():
 						metricDNSFwdErrorContext.Add(1)
 						metricDNSFwdErrorContextGotError.Add(1)
+						var resolverAddrs []string
+						for _, rr := range resolvers {
+							resolverAddrs = append(resolverAddrs, rr.name.Addr)
+						}
+						f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")})
 					case responseChan <- res:
 					}
 				}
@@ -999,6 +1028,7 @@ func (f *forwarder) forwardWithDestChan(ctx context.Context, query packet, respo
 			for _, rr := range resolvers {
 				resolverAddrs = append(resolverAddrs, rr.name.Addr)
 			}
+			f.health.SetUnhealthy(dnsForwarderFailing, health.Args{health.ArgDNSServers: strings.Join(resolverAddrs, ",")})
 			return fmt.Errorf("waiting for response or error from %v: %w", resolverAddrs, ctx.Err())
 		}
 	}

+ 2 - 1
net/dns/resolver/forwarder_test.go

@@ -24,6 +24,7 @@ import (
 	dns "golang.org/x/net/dns/dnsmessage"
 	"tailscale.com/control/controlknobs"
 	"tailscale.com/envknob"
+	"tailscale.com/health"
 	"tailscale.com/net/netmon"
 	"tailscale.com/net/tsdial"
 	"tailscale.com/types/dnstype"
@@ -457,7 +458,7 @@ func runTestQuery(tb testing.TB, port uint16, request []byte, modify func(*forwa
 	var dialer tsdial.Dialer
 	dialer.SetNetMon(netMon)
 
-	fwd := newForwarder(tb.Logf, netMon, nil, &dialer, nil)
+	fwd := newForwarder(tb.Logf, netMon, nil, &dialer, new(health.Tracker), nil)
 	if modify != nil {
 		modify(fwd)
 	}

+ 9 - 2
net/dns/resolver/tsdns.go

@@ -25,6 +25,7 @@ import (
 	dns "golang.org/x/net/dns/dnsmessage"
 	"tailscale.com/control/controlknobs"
 	"tailscale.com/envknob"
+	"tailscale.com/health"
 	"tailscale.com/net/dns/resolvconffile"
 	"tailscale.com/net/netaddr"
 	"tailscale.com/net/netmon"
@@ -202,6 +203,7 @@ type Resolver struct {
 	logf               logger.Logf
 	netMon             *netmon.Monitor  // non-nil
 	dialer             *tsdial.Dialer   // non-nil
+	health             *health.Tracker  // non-nil
 	saveConfigForTests func(cfg Config) // used in tests to capture resolver config
 	// forwarder forwards requests to upstream nameservers.
 	forwarder *forwarder
@@ -224,10 +226,14 @@ type ForwardLinkSelector interface {
 }
 
 // New returns a new resolver.
-func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, knobs *controlknobs.Knobs) *Resolver {
+// dialer and health must be non-nil.
+func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, health *health.Tracker, knobs *controlknobs.Knobs) *Resolver {
 	if dialer == nil {
 		panic("nil Dialer")
 	}
+	if health == nil {
+		panic("nil health")
+	}
 	netMon := dialer.NetMon()
 	if netMon == nil {
 		logf("nil netMon")
@@ -239,8 +245,9 @@ func New(logf logger.Logf, linkSel ForwardLinkSelector, dialer *tsdial.Dialer, k
 		hostToIP: map[dnsname.FQDN][]netip.Addr{},
 		ipToHost: map[netip.Addr]dnsname.FQDN{},
 		dialer:   dialer,
+		health:   health,
 	}
-	r.forwarder = newForwarder(r.logf, netMon, linkSel, dialer, knobs)
+	r.forwarder = newForwarder(r.logf, netMon, linkSel, dialer, health, knobs)
 	return r
 }
 

+ 3 - 1
net/dns/resolver/tsdns_test.go

@@ -23,6 +23,7 @@ import (
 
 	miekdns "github.com/miekg/dns"
 	dns "golang.org/x/net/dns/dnsmessage"
+	"tailscale.com/health"
 	"tailscale.com/net/netaddr"
 	"tailscale.com/net/netmon"
 	"tailscale.com/net/tsdial"
@@ -354,6 +355,7 @@ func newResolver(t testing.TB) *Resolver {
 	return New(t.Logf,
 		nil, // no link selector
 		tsdial.NewDialer(netmon.NewStatic()),
+		new(health.Tracker),
 		nil, // no control knobs
 	)
 }
@@ -1068,7 +1070,7 @@ func TestForwardLinkSelection(t *testing.T) {
 			return "special"
 		}
 		return ""
-	}), new(tsdial.Dialer), nil /* no control knobs */)
+	}), new(tsdial.Dialer), new(health.Tracker), nil /* no control knobs */)
 
 	// Test non-special IP.
 	if got, err := fwd.packetListener(netip.Addr{}); err != nil {