Selaa lähdekoodia

health: introduce captive-portal-detected Warnable (#12707)

Updates tailscale/tailscale#1634

This PR introduces a new `captive-portal-detected` Warnable which is set to an unhealthy state whenever a captive portal is detected on the local network, preventing Tailscale from connecting.



ipn/ipnlocal: fix captive portal loop shutdown


Change-Id: I7cafdbce68463a16260091bcec1741501a070c95

net/captivedetection: fix mutex misuse

ipn/ipnlocal: ensure that we don't fail to start the timer


Change-Id: I3e43fb19264d793e8707c5031c0898e48e3e7465

Signed-off-by: Andrew Dunham <[email protected]>
Signed-off-by: Andrea Gottardo <[email protected]>
Andrea Gottardo 1 vuosi sitten
vanhempi
sitoutus
90be06bd5b

+ 1 - 0
cmd/k8s-operator/depaware.txt

@@ -701,6 +701,7 @@ tailscale.com/cmd/k8s-operator dependencies: (generated by github.com/tailscale/
         tailscale.com/logtail/backoff                                from tailscale.com/control/controlclient+
         tailscale.com/logtail/filch                                  from tailscale.com/log/sockstatlog+
         tailscale.com/metrics                                        from tailscale.com/derp+
+        tailscale.com/net/captivedetection                           from tailscale.com/ipn/ipnlocal+
         tailscale.com/net/connstats                                  from tailscale.com/net/tstun+
         tailscale.com/net/dns                                        from tailscale.com/ipn/ipnlocal+
         tailscale.com/net/dns/publicdns                              from tailscale.com/net/dns+

+ 2 - 1
cmd/tailscale/depaware.txt

@@ -100,9 +100,10 @@ tailscale.com/cmd/tailscale dependencies: (generated by github.com/tailscale/dep
         tailscale.com/ipn/ipnstate                                   from tailscale.com/client/tailscale+
         tailscale.com/licenses                                       from tailscale.com/client/web+
         tailscale.com/metrics                                        from tailscale.com/derp
+        tailscale.com/net/captivedetection                           from tailscale.com/net/netcheck
         tailscale.com/net/dns/recursive                              from tailscale.com/net/dnsfallback
         tailscale.com/net/dnscache                                   from tailscale.com/control/controlhttp+
-        tailscale.com/net/dnsfallback                                from tailscale.com/control/controlhttp
+        tailscale.com/net/dnsfallback                                from tailscale.com/control/controlhttp+
         tailscale.com/net/flowtrack                                  from tailscale.com/net/packet
         tailscale.com/net/netaddr                                    from tailscale.com/ipn+
         tailscale.com/net/netcheck                                   from tailscale.com/cmd/tailscale/cli

+ 1 - 0
cmd/tailscaled/depaware.txt

@@ -288,6 +288,7 @@ tailscale.com/cmd/tailscaled dependencies: (generated by github.com/tailscale/de
         tailscale.com/logtail/backoff                                from tailscale.com/cmd/tailscaled+
         tailscale.com/logtail/filch                                  from tailscale.com/log/sockstatlog+
         tailscale.com/metrics                                        from tailscale.com/derp+
+        tailscale.com/net/captivedetection                           from tailscale.com/ipn/ipnlocal+
         tailscale.com/net/connstats                                  from tailscale.com/net/tstun+
         tailscale.com/net/dns                                        from tailscale.com/cmd/tailscaled+
         tailscale.com/net/dns/publicdns                              from tailscale.com/net/dns+

+ 7 - 0
control/controlknobs/controlknobs.go

@@ -99,6 +99,10 @@ type Knobs struct {
 	// DisableCryptorouting indicates that the node should not use the
 	// magicsock crypto routing feature.
 	DisableCryptorouting atomic.Bool
+
+	// DisableCaptivePortalDetection is whether the node should not perform captive portal detection
+	// automatically when the network state changes.
+	DisableCaptivePortalDetection atomic.Bool
 }
 
 // UpdateFromNodeAttributes updates k (if non-nil) based on the provided self
@@ -127,6 +131,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) {
 		disableSplitDNSWhenNoCustomResolvers = has(tailcfg.NodeAttrDisableSplitDNSWhenNoCustomResolvers)
 		disableLocalDNSOverrideViaNRPT       = has(tailcfg.NodeAttrDisableLocalDNSOverrideViaNRPT)
 		disableCryptorouting                 = has(tailcfg.NodeAttrDisableMagicSockCryptoRouting)
+		disableCaptivePortalDetection        = has(tailcfg.NodeAttrDisableCaptivePortalDetection)
 	)
 
 	if has(tailcfg.NodeAttrOneCGNATEnable) {
@@ -153,6 +158,7 @@ func (k *Knobs) UpdateFromNodeAttributes(capMap tailcfg.NodeCapMap) {
 	k.DisableSplitDNSWhenNoCustomResolvers.Store(disableSplitDNSWhenNoCustomResolvers)
 	k.DisableLocalDNSOverrideViaNRPT.Store(disableLocalDNSOverrideViaNRPT)
 	k.DisableCryptorouting.Store(disableCryptorouting)
+	k.DisableCaptivePortalDetection.Store(disableCaptivePortalDetection)
 }
 
 // AsDebugJSON returns k as something that can be marshalled with json.Marshal
@@ -180,5 +186,6 @@ func (k *Knobs) AsDebugJSON() map[string]any {
 		"DisableSplitDNSWhenNoCustomResolvers": k.DisableSplitDNSWhenNoCustomResolvers.Load(),
 		"DisableLocalDNSOverrideViaNRPT":       k.DisableLocalDNSOverrideViaNRPT.Load(),
 		"DisableCryptorouting":                 k.DisableCryptorouting.Load(),
+		"DisableCaptivePortalDetection":        k.DisableCaptivePortalDetection.Load(),
 	}
 }

+ 225 - 20
ipn/ipnlocal/local.go

@@ -60,6 +60,7 @@ import (
 	"tailscale.com/ipn/policy"
 	"tailscale.com/log/sockstatlog"
 	"tailscale.com/logpolicy"
+	"tailscale.com/net/captivedetection"
 	"tailscale.com/net/dns"
 	"tailscale.com/net/dnscache"
 	"tailscale.com/net/dnsfallback"
@@ -344,6 +345,21 @@ type LocalBackend struct {
 
 	// refreshAutoExitNode indicates if the exit node should be recomputed when the next netcheck report is available.
 	refreshAutoExitNode bool
+
+	// captiveCtx and captiveCancel are used to control captive portal
+	// detection. They are protected by 'mu' and can be changed during the
+	// lifetime of a LocalBackend.
+	//
+	// captiveCtx will always be non-nil, though it might be a canceled
+	// context. captiveCancel is non-nil if checkCaptivePortalLoop is
+	// running, and is set to nil after being canceled.
+	captiveCtx    context.Context
+	captiveCancel context.CancelFunc
+	// needsCaptiveDetection is a channel that is used to signal either
+	// that captive portal detection is required (sending true) or that the
+	// backend is healthy and captive portal detection is not required
+	// (sending false).
+	needsCaptiveDetection chan bool
 }
 
 // HealthTracker returns the health tracker for the backend.
@@ -398,27 +414,35 @@ func NewLocalBackend(logf logger.Logf, logID logid.PublicID, sys *tsd.System, lo
 	ctx, cancel := context.WithCancel(context.Background())
 	clock := tstime.StdClock{}
 
+	// Until we transition to a Running state, use a canceled context for
+	// our captive portal detection.
+	captiveCtx, captiveCancel := context.WithCancel(ctx)
+	captiveCancel()
+
 	b := &LocalBackend{
-		ctx:                 ctx,
-		ctxCancel:           cancel,
-		logf:                logf,
-		keyLogf:             logger.LogOnChange(logf, 5*time.Minute, clock.Now),
-		statsLogf:           logger.LogOnChange(logf, 5*time.Minute, clock.Now),
-		sys:                 sys,
-		health:              sys.HealthTracker(),
-		e:                   e,
-		dialer:              dialer,
-		store:               store,
-		pm:                  pm,
-		backendLogID:        logID,
-		state:               ipn.NoState,
-		portpoll:            new(portlist.Poller),
-		em:                  newExpiryManager(logf),
-		gotPortPollRes:      make(chan struct{}),
-		loginFlags:          loginFlags,
-		clock:               clock,
-		selfUpdateProgress:  make([]ipnstate.UpdateProgress, 0),
-		lastSelfUpdateState: ipnstate.UpdateFinished,
+		ctx:                   ctx,
+		ctxCancel:             cancel,
+		logf:                  logf,
+		keyLogf:               logger.LogOnChange(logf, 5*time.Minute, clock.Now),
+		statsLogf:             logger.LogOnChange(logf, 5*time.Minute, clock.Now),
+		sys:                   sys,
+		health:                sys.HealthTracker(),
+		e:                     e,
+		dialer:                dialer,
+		store:                 store,
+		pm:                    pm,
+		backendLogID:          logID,
+		state:                 ipn.NoState,
+		portpoll:              new(portlist.Poller),
+		em:                    newExpiryManager(logf),
+		gotPortPollRes:        make(chan struct{}),
+		loginFlags:            loginFlags,
+		clock:                 clock,
+		selfUpdateProgress:    make([]ipnstate.UpdateProgress, 0),
+		lastSelfUpdateState:   ipnstate.UpdateFinished,
+		captiveCtx:            captiveCtx,
+		captiveCancel:         nil, // so that we start checkCaptivePortalLoop when Running
+		needsCaptiveDetection: make(chan bool),
 	}
 	mConn.SetNetInfoCallback(b.setNetInfo)
 
@@ -669,6 +693,10 @@ func (b *LocalBackend) pauseOrResumeControlClientLocked() {
 	b.cc.SetPaused((b.state == ipn.Stopped && b.netMap != nil) || (!networkUp && !testenv.InTest() && !assumeNetworkUpdateForTest()))
 }
 
+// captivePortalDetectionInterval is the duration to wait in an unhealthy state with connectivity broken
+// before running captive portal detection.
+const captivePortalDetectionInterval = 2 * time.Second
+
 // linkChange is our network monitor callback, called whenever the network changes.
 func (b *LocalBackend) linkChange(delta *netmon.ChangeDelta) {
 	b.mu.Lock()
@@ -719,6 +747,44 @@ func (b *LocalBackend) onHealthChange(w *health.Warnable, us *health.UnhealthySt
 	b.send(ipn.Notify{
 		Health: state,
 	})
+
+	isConnectivityImpacted := false
+	for _, w := range state.Warnings {
+		// Ignore the captive portal warnable itself.
+		if w.ImpactsConnectivity && w.WarnableCode != captivePortalWarnable.Code {
+			isConnectivityImpacted = true
+			break
+		}
+	}
+
+	// captiveCtx can be changed, and is protected with 'mu'; grab that
+	// before we start our select, below.
+	//
+	// It is guaranteed to be non-nil.
+	b.mu.Lock()
+	ctx := b.captiveCtx
+	b.mu.Unlock()
+
+	// If the context is canceled, we don't need to do anything.
+	if ctx.Err() != nil {
+		return
+	}
+
+	if isConnectivityImpacted {
+		b.logf("health: connectivity impacted; triggering captive portal detection")
+
+		// Ensure that we select on captiveCtx so that we can time out
+		// triggering captive portal detection if the backend is shutdown.
+		select {
+		case b.needsCaptiveDetection <- true:
+		case <-ctx.Done():
+		}
+	} else {
+		select {
+		case b.needsCaptiveDetection <- false:
+		case <-ctx.Done():
+		}
+	}
 }
 
 // Shutdown halts the backend and all its sub-components. The backend
@@ -731,6 +797,11 @@ func (b *LocalBackend) Shutdown() {
 	}
 	b.shutdownCalled = true
 
+	if b.captiveCancel != nil {
+		b.logf("canceling captive portal context")
+		b.captiveCancel()
+	}
+
 	if b.loginFlags&controlclient.LoginEphemeral != 0 {
 		b.mu.Unlock()
 		ctx, cancel := context.WithTimeout(b.ctx, 5*time.Second)
@@ -2097,6 +2168,122 @@ func (b *LocalBackend) updateFilterLocked(netMap *netmap.NetworkMap, prefs ipn.P
 	}
 }
 
+// captivePortalWarnable is a Warnable which is set to an unhealthy state when a captive portal is detected.
+var captivePortalWarnable = health.Register(&health.Warnable{
+	Code:  "captive-portal-detected",
+	Title: "Captive portal detected",
+	// High severity, because captive portals block all traffic and require user intervention.
+	Severity:            health.SeverityHigh,
+	Text:                health.StaticMessage("This network requires you to log in using your web browser."),
+	ImpactsConnectivity: true,
+})
+
+// checkCaptivePortalLoop waits for signals on b.needsCaptiveDetection and
+// schedules captive portal detection in response, until ctx is done.
+//
+// Receiving true starts a captivePortalDetectionInterval timer unless one is
+// already pending; receiving false stops any pending timer. When the timer
+// fires, performCaptiveDetection is called once and the timer is cleared, so
+// another detection only happens after a later true signal starts a new timer.
+func (b *LocalBackend) checkCaptivePortalLoop(ctx context.Context) {
+	// tmr is non-nil only while a detection is scheduled.
+	var tmr *time.Timer
+
+	maybeStartTimer := func() {
+		// If there's an existing timer, nothing to do; just continue
+		// waiting for it to expire. Otherwise, create a new timer.
+		if tmr == nil {
+			tmr = time.NewTimer(captivePortalDetectionInterval)
+		}
+	}
+	maybeStopTimer := func() {
+		if tmr == nil {
+			return
+		}
+		// If Stop reports false, receive from the timer's channel before
+		// discarding the timer.
+		if !tmr.Stop() {
+			<-tmr.C
+		}
+		tmr = nil
+	}
+
+	for {
+		if ctx.Err() != nil {
+			maybeStopTimer()
+			return
+		}
+
+		// First, see if we have a signal on our "healthy" channel, which
+		// takes priority over an existing timer. Because a select is
+		// nondeterministic, we explicitly check this channel before
+		// entering the main select below, so that we're guaranteed to
+		// stop the timer before starting captive portal detection.
+		select {
+		case needsCaptiveDetection := <-b.needsCaptiveDetection:
+			if needsCaptiveDetection {
+				maybeStartTimer()
+			} else {
+				maybeStopTimer()
+			}
+		default:
+		}
+
+		// timerChan stays nil when no timer is pending, so its case in the
+		// select below can never be chosen.
+		var timerChan <-chan time.Time
+		if tmr != nil {
+			timerChan = tmr.C
+		}
+		select {
+		case <-ctx.Done():
+			// All done; stop the timer and then exit.
+			maybeStopTimer()
+			return
+		case <-timerChan:
+			// Kick off captive portal check
+			b.performCaptiveDetection()
+			// nil out timer to force recreation
+			tmr = nil
+		case needsCaptiveDetection := <-b.needsCaptiveDetection:
+			if needsCaptiveDetection {
+				maybeStartTimer()
+			} else {
+				// Healthy; cancel any existing timer
+				maybeStopTimer()
+			}
+		}
+	}
+}
+
+// performCaptiveDetection runs one round of captive portal detection, unless
+// shouldRunCaptivePortalDetection reports that it should be skipped, and then
+// marks captivePortalWarnable unhealthy or healthy according to the result.
+func (b *LocalBackend) performCaptiveDetection() {
+	if !b.shouldRunCaptivePortalDetection() {
+		return
+	}
+
+	detector := captivedetection.NewDetector(b.logf)
+
+	// Snapshot everything the detector needs while holding mu, so that the
+	// detection itself runs without the lock held.
+	var (
+		derpMap    *tailcfg.DERPMap
+		derpRegion int
+	)
+	b.mu.Lock()
+	if nm := b.netMap; nm != nil {
+		derpMap = nm.DERPMap
+	}
+	if hi := b.hostinfo; hi != nil && hi.NetInfo != nil {
+		derpRegion = hi.NetInfo.PreferredDERP
+	}
+	ctx, netMon := b.ctx, b.NetMon()
+	b.mu.Unlock()
+
+	if !detector.Detect(ctx, netMon, derpMap, derpRegion) {
+		b.health.SetHealthy(captivePortalWarnable)
+		return
+	}
+	b.health.SetUnhealthy(captivePortalWarnable, health.Args{})
+}
+
+// shouldRunCaptivePortalDetection reports whether captive portal detection
+// should be run: it must not have been disabled through the
+// DisableCaptivePortalDetection control knob, and the current prefs must
+// say that the user wants the backend running.
+func (b *LocalBackend) shouldRunCaptivePortalDetection() bool {
+	b.mu.Lock()
+	defer b.mu.Unlock()
+
+	if b.ControlKnobs().DisableCaptivePortalDetection.Load() {
+		return false
+	}
+	return b.pm.prefs.WantRunning()
+}
+
 // packetFilterPermitsUnlockedNodes reports any peer in peers with the
 // UnsignedPeerAPIOnly bool set true has any of its allowed IPs in the packet
 // filter.
@@ -4490,9 +4677,27 @@ func (b *LocalBackend) enterStateLockedOnEntry(newState ipn.State, unlock unlock
 	if newState == ipn.Running {
 		b.authURL = ""
 		b.authURLTime = time.Time{}
+
+		// Start a captive portal detection loop if none has been
+		// started. Create a new context if none is present, since it
+		// can be shut down if we transition away from Running.
+		if b.captiveCancel == nil {
+			b.captiveCtx, b.captiveCancel = context.WithCancel(b.ctx)
+			go b.checkCaptivePortalLoop(b.captiveCtx)
+		}
 	} else if oldState == ipn.Running {
 		// Transitioning away from running.
 		b.closePeerAPIListenersLocked()
+
+		// Stop any existing captive portal detection loop.
+		if b.captiveCancel != nil {
+			b.captiveCancel()
+			b.captiveCancel = nil
+
+			// NOTE: don't set captiveCtx to nil here, to ensure
+			// that we always have a (canceled) context to wait on
+			// in onHealthChange.
+		}
 	}
 	b.pauseOrResumeControlClientLocked()
 

+ 217 - 0
net/captivedetection/captivedetection.go

@@ -0,0 +1,217 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+// Package captivedetection provides a way to detect if the system is connected to a network that has
+// a captive portal. It does this by making HTTP requests to known captive portal detection endpoints
+// and checking if the HTTP responses indicate that a captive portal might be present.
+package captivedetection
+
+import (
+	"context"
+	"net"
+	"net/http"
+	"runtime"
+	"strings"
+	"sync"
+	"syscall"
+	"time"
+
+	"tailscale.com/net/netmon"
+	"tailscale.com/tailcfg"
+	"tailscale.com/types/logger"
+)
+
+// Detector checks whether the system is behind a captive portal.
+// Use NewDetector to create one.
+type Detector struct {
+
+	// httpClient is the HTTP client that is used for captive portal detection. It is configured
+	// to not follow redirects, have a short timeout and no keep-alive.
+	httpClient *http.Client
+	// currIfIndex is the index of the interface that is currently being used by the httpClient.
+	// It is written before each request and read by dialContext.
+	currIfIndex int
+	// mu guards currIfIndex.
+	mu sync.Mutex
+	// logf is the logger used for logging messages. It is stored exactly as passed to
+	// NewDetector and called directly, so it must be non-nil.
+	logf logger.Logf
+}
+
+// NewDetector creates a new Detector instance for captive portal detection.
+func NewDetector(logf logger.Logf) *Detector {
+	d := &Detector{logf: logf}
+	d.httpClient = &http.Client{
+		// No redirects allowed
+		CheckRedirect: func(req *http.Request, via []*http.Request) error {
+			return http.ErrUseLastResponse
+		},
+		Transport: &http.Transport{
+			DialContext:       d.dialContext,
+			DisableKeepAlives: true,
+		},
+		Timeout: Timeout,
+	}
+	return d
+}
+
+// Timeout is the timeout for captive portal detection requests. Because the captive portal intercepting our requests
+// is usually located on the LAN, this is a relatively short timeout.
+const Timeout = 3 * time.Second
+
+// Detect is the entry point to the API. It attempts to detect if the system is behind a captive portal
+// by making HTTP requests to known captive portal detection Endpoints. If any of the requests return a response code
+// or body that looks like a captive portal, Detect returns true. It returns false in all other cases, including when any
+// error occurs during a detection attempt.
+//
+// This function might take a while to return, as it will attempt to detect a captive portal on all available interfaces
+// by performing multiple HTTP requests. It should be called in a separate goroutine if you want to avoid blocking.
+func (d *Detector) Detect(ctx context.Context, netMon *netmon.Monitor, derpMap *tailcfg.DERPMap, preferredDERPRegionID int) (found bool) {
+	return d.detectCaptivePortalWithGOOS(ctx, netMon, derpMap, preferredDERPRegionID, runtime.GOOS)
+}
+
+// detectCaptivePortalWithGOOS is the implementation of Detect, with the
+// operating system name passed in as goos rather than read from runtime.GOOS.
+//
+// It returns false without probing when netMon reports that no interface is
+// up. Otherwise it probes each eligible interface in turn and returns true as
+// soon as one of them appears to be behind a captive portal.
+func (d *Detector) detectCaptivePortalWithGOOS(ctx context.Context, netMon *netmon.Monitor, derpMap *tailcfg.DERPMap, preferredDERPRegionID int, goos string) (found bool) {
+	ifState := netMon.InterfaceState()
+	if !ifState.AnyInterfaceUp() {
+		d.logf("[v2] DetectCaptivePortal: no interfaces up, returning false")
+		return false
+	}
+
+	endpoints := availableEndpoints(derpMap, preferredDERPRegionID, d.logf, goos)
+
+	// Here we try detecting a captive portal using *all* available interfaces on the system
+	// that are up, are not loopback, are not excluded by name and have at least one address.
+	// We consider a captive portal found when any interface reports that one may exist.
+	// This is necessary because most systems have multiple interfaces, and most importantly
+	// on macOS no default route interface is set until the user has accepted the captive
+	// portal alert thrown by the system. If no default route interface is known, we need to
+	// try with anything that might remotely resemble a Wi-Fi interface.
+	for ifName, i := range ifState.Interface {
+		if !i.IsUp() || i.IsLoopback() || interfaceNameDoesNotNeedCaptiveDetection(ifName, goos) {
+			continue
+		}
+		addrs, err := i.Addrs()
+		if err != nil {
+			d.logf("[v1] DetectCaptivePortal: failed to get addresses for interface %s: %v", ifName, err)
+			continue
+		}
+		if len(addrs) == 0 {
+			continue
+		}
+		d.logf("[v2] attempting to do captive portal detection on interface %s", ifName)
+		res := d.detectOnInterface(ctx, i.Index, endpoints)
+		if res {
+			// Only ifName is passed: the format string has a single verb.
+			d.logf("DetectCaptivePortal(found=true,ifName=%s)", ifName)
+			return true
+		}
+	}
+
+	d.logf("DetectCaptivePortal(found=false)")
+	return false
+}
+
+// interfaceNameDoesNotNeedCaptiveDetection reports whether the interface named
+// ifName should be skipped during captive portal detection. The name is
+// lowercased and compared against a list of excluded prefixes: a common list
+// used for every goos, plus extra prefixes when goos is "windows" or when it
+// is "darwin" or "ios".
+func interfaceNameDoesNotNeedCaptiveDetection(ifName string, goos string) bool {
+	// Prefixes below are all lowercase, so compare against a lowercased name.
+	ifName = strings.ToLower(ifName)
+	excludedPrefixes := []string{"tailscale", "tun", "tap", "docker", "kube", "wg"}
+	if goos == "windows" {
+		excludedPrefixes = append(excludedPrefixes, "loopback", "tunnel", "ppp", "isatap", "teredo", "6to4")
+	} else if goos == "darwin" || goos == "ios" {
+		excludedPrefixes = append(excludedPrefixes, "awdl", "bridge", "ap", "utun", "tap", "llw", "anpi", "lo", "stf", "gif", "xhc")
+	}
+	for _, prefix := range excludedPrefixes {
+		if strings.HasPrefix(ifName, prefix) {
+			return true
+		}
+	}
+	return false
+}
+
+// detectOnInterface reports whether or not we think the system is behind a
+// captive portal on the interface with index ifIndex. It concurrently checks
+// up to the first five of the given endpoints with verifyCaptivePortalEndpoint
+// and returns true as soon as any of them reports a captive portal.
+//
+// Endpoints whose check fails with an error are logged and otherwise ignored.
+// The boolean return is whether we think we have a captive portal.
+func (d *Detector) detectOnInterface(ctx context.Context, ifIndex int, endpoints []Endpoint) bool {
+	defer d.httpClient.CloseIdleConnections()
+
+	d.logf("[v2] %d available captive portal detection endpoints: %v", len(endpoints), endpoints)
+
+	// We try to detect the captive portal more quickly by making requests to multiple endpoints concurrently.
+	var wg sync.WaitGroup
+	// resultCh is buffered with one slot per endpoint, so a send never blocks
+	// even after this function has already returned.
+	resultCh := make(chan bool, len(endpoints))
+
+	for i, e := range endpoints {
+		if i >= 5 {
+			// Try a maximum of 5 endpoints; stop starting new checks once we run out of attempts.
+			break
+		}
+		wg.Add(1)
+		go func(endpoint Endpoint) {
+			defer wg.Done()
+			found, err := d.verifyCaptivePortalEndpoint(ctx, endpoint, ifIndex)
+			if err != nil {
+				d.logf("[v1] checkCaptivePortalEndpoint failed with endpoint %v: %v", endpoint, err)
+				return
+			}
+			if found {
+				resultCh <- true
+			}
+		}(e)
+	}
+
+	// Close resultCh once every check has finished, so that the range below
+	// ends when no endpoint reported a captive portal.
+	go func() {
+		wg.Wait()
+		close(resultCh)
+	}()
+
+	for result := range resultCh {
+		if result {
+			// If any of the endpoints seems to be a captive portal, we consider the system to be behind one.
+			return true
+		}
+	}
+
+	return false
+}
+
+// verifyCaptivePortalEndpoint checks if the given Endpoint is a captive portal by making an HTTP request to the
+// given Endpoint URL using the interface with index ifIndex, and checking if the response looks like a captive portal.
+//
+// It returns a non-nil error, with found set to false, if the request cannot be built or fails.
+func (d *Detector) verifyCaptivePortalEndpoint(ctx context.Context, e Endpoint, ifIndex int) (found bool, err error) {
+	req, err := http.NewRequestWithContext(ctx, "GET", e.URL.String(), nil)
+	if err != nil {
+		return false, err
+	}
+
+	// Attach the Tailscale challenge header if the endpoint supports it. Not all captive portal detection endpoints
+	// support this, so we only attach it if the endpoint does.
+	if e.SupportsTailscaleChallenge {
+		// Note: the set of valid characters in a challenge and the total
+		// length is limited; see isChallengeChar in cmd/derper for more
+		// details.
+		chal := "ts_" + e.URL.Host
+		req.Header.Set("X-Tailscale-Challenge", chal)
+	}
+
+	// Record the interface index for dialContext, which reads it when the
+	// HTTP client opens its connection. The field is shared by every call on
+	// this Detector, so concurrent calls are expected to pass the same ifIndex.
+	d.mu.Lock()
+	d.currIfIndex = ifIndex
+	d.mu.Unlock()
+
+	// Make the actual request, and check if the response looks like a captive portal or not.
+	r, err := d.httpClient.Do(req)
+	if err != nil {
+		return false, err
+	}
+
+	// responseLooksLikeCaptive closes the response body.
+	return e.responseLooksLikeCaptive(r, d.logf), nil
+}
+
+// dialContext is the DialContext function installed on the Detector's HTTP
+// transport. It dials addr on network, asking setSocketInterfaceIndex to bind
+// the new socket to the interface most recently stored in currIfIndex.
+func (d *Detector) dialContext(ctx context.Context, network, addr string) (net.Conn, error) {
+	// Hold mu only long enough to read currIfIndex. Keeping it locked for the
+	// whole dial would serialize the connections that detectOnInterface
+	// starts concurrently, and block writers of currIfIndex for the duration
+	// of a slow dial.
+	d.mu.Lock()
+	ifIndex := d.currIfIndex
+	d.mu.Unlock()
+
+	dl := net.Dialer{
+		Control: func(network, address string, c syscall.RawConn) error {
+			return setSocketInterfaceIndex(c, ifIndex, d.logf)
+		},
+	}
+
+	return dl.DialContext(ctx, network, addr)
+}

+ 58 - 0
net/captivedetection/captivedetection_test.go

@@ -0,0 +1,58 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+package captivedetection
+
+import (
+	"context"
+	"runtime"
+	"sync"
+	"testing"
+
+	"tailscale.com/net/netmon"
+)
+
+// TestAvailableEndpointsAlwaysAtLeastTwo checks that, even without a DERP map,
+// availableEndpoints returns at least two endpoints and that every one of them
+// uses the plain "http" scheme.
+func TestAvailableEndpointsAlwaysAtLeastTwo(t *testing.T) {
+	endpoints := availableEndpoints(nil, 0, t.Logf, runtime.GOOS)
+	if len(endpoints) == 0 {
+		t.Errorf("Expected non-empty AvailableEndpoints, got an empty slice instead")
+	}
+	if len(endpoints) == 1 {
+		t.Errorf("Expected at least two AvailableEndpoints for redundancy, got only one instead")
+	}
+	for _, e := range endpoints {
+		if e.URL.Scheme != "http" {
+			// Report the scheme actually found rather than assuming HTTPS.
+			t.Errorf("Expected HTTP URL in Endpoint %v, got scheme %q", e, e.URL.Scheme)
+		}
+	}
+}
+
+func TestDetectCaptivePortalReturnsFalse(t *testing.T) {
+	d := NewDetector(t.Logf)
+	found := d.Detect(context.Background(), netmon.NewStatic(), nil, 0)
+	if found {
+		t.Errorf("DetectCaptivePortal returned true, expected false.")
+	}
+}
+
+// TestAllEndpointsAreUpAndReturnExpectedResponse calls
+// verifyCaptivePortalEndpoint concurrently for every endpoint returned by
+// availableEndpoints, and fails if any call returns an error or reports a
+// captive portal.
+//
+// NOTE(review): verifyCaptivePortalEndpoint performs real HTTP requests, so
+// this test presumably needs network access that is not behind a captive
+// portal; confirm how it is expected to behave in sandboxed CI.
+func TestAllEndpointsAreUpAndReturnExpectedResponse(t *testing.T) {
+	d := NewDetector(t.Logf)
+	endpoints := availableEndpoints(nil, 0, t.Logf, runtime.GOOS)
+
+	var wg sync.WaitGroup
+	for _, e := range endpoints {
+		wg.Add(1)
+		go func(endpoint Endpoint) {
+			defer wg.Done()
+			found, err := d.verifyCaptivePortalEndpoint(context.Background(), endpoint, 0)
+			if err != nil {
+				t.Errorf("verifyCaptivePortalEndpoint failed with endpoint %v: %v", endpoint, err)
+			}
+			if found {
+				t.Errorf("verifyCaptivePortalEndpoint with endpoint %v says we're behind a captive portal, but we aren't", endpoint)
+			}
+		}(e)
+	}
+
+	wg.Wait()
+}

+ 178 - 0
net/captivedetection/endpoints.go

@@ -0,0 +1,178 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+package captivedetection
+
+import (
+	"cmp"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"slices"
+
+	"go4.org/mem"
+	"tailscale.com/net/dnsfallback"
+	"tailscale.com/tailcfg"
+	"tailscale.com/types/logger"
+)
+
+// EndpointProvider is an enum that represents the source of an Endpoint.
+type EndpointProvider int
+
+const (
+	// DERPMapPreferred is used for an endpoint that is a DERP node contained in the current preferred DERP region,
+	// as provided by the DERPMap.
+	DERPMapPreferred EndpointProvider = iota
+	// DERPMapOther is used for an endpoint that is a DERP node, but not contained in the current preferred DERP region.
+	DERPMapOther
+	// Tailscale is used for endpoints that are the Tailscale coordination server or admin console.
+	Tailscale
+)
+
+// String returns the name of the provider constant, or "EndpointProvider(N)"
+// for a value N that is not one of the declared constants.
+func (p EndpointProvider) String() string {
+	switch p {
+	case DERPMapPreferred:
+		return "DERPMapPreferred"
+	case Tailscale:
+		return "Tailscale"
+	case DERPMapOther:
+		return "DERPMapOther"
+	default:
+		return fmt.Sprintf("EndpointProvider(%d)", p)
+	}
+}
+
+// Endpoint represents a URL that can be used to detect a captive portal, along with the expected
+// result of the HTTP request.
+type Endpoint struct {
+	// URL is the URL that we make an HTTP request to as part of the captive portal detection process.
+	URL *url.URL
+	// StatusCode is the HTTP status code that we expect to see in the response.
+	StatusCode int
+	// ExpectedContent is a string that we expect to see contained in the response body. If this is non-empty,
+	// we will check that the response body contains this string. If it is empty, we will not check the response body
+	// and only check the status code.
+	ExpectedContent string
+	// SupportsTailscaleChallenge is true if the endpoint will return the sent value of the X-Tailscale-Challenge
+	// HTTP header in its HTTP response.
+	SupportsTailscaleChallenge bool
+	// Provider is the source of the endpoint. This is used to prioritize certain endpoints over others
+	// (for example, a DERP node in the preferred region should always be used first).
+	Provider EndpointProvider
+}
+
+// String returns a human-readable description of e that lists every field.
+func (e Endpoint) String() string {
+	return fmt.Sprintf("Endpoint{URL=%q, StatusCode=%d, ExpectedContent=%q, SupportsTailscaleChallenge=%v, Provider=%s}", e.URL, e.StatusCode, e.ExpectedContent, e.SupportsTailscaleChallenge, e.Provider.String())
+}
+
+// Equal reports whether e and other have the same field values. The URLs are
+// compared by their string form rather than by pointer.
+//
+// NOTE(review): both URLs are dereferenced via String, so this presumably
+// requires e.URL and other.URL to be non-nil; confirm against callers.
+func (e Endpoint) Equal(other Endpoint) bool {
+	return e.URL.String() == other.URL.String() &&
+		e.StatusCode == other.StatusCode &&
+		e.ExpectedContent == other.ExpectedContent &&
+		e.SupportsTailscaleChallenge == other.SupportsTailscaleChallenge &&
+		e.Provider == other.Provider
+}
+
+// availableEndpoints builds the list of Endpoints that captive portal
+// detection can probe with HTTP requests. The result is sorted by Provider,
+// so DERP nodes in the preferred region come first, then other DERP nodes,
+// then the Tailscale-hosted endpoints.
+func availableEndpoints(derpMap *tailcfg.DERPMap, preferredDERPRegionID int, logf logger.Logf, goos string) []Endpoint {
+	candidates := []Endpoint{}
+
+	// Early in client startup LocalBackend has no DERPMap yet; fall back to
+	// the static one provided by dnsfallback.
+	if derpMap == nil || len(derpMap.Regions) == 0 {
+		logf("captivedetection: current DERPMap is empty, using map from dnsfallback")
+		derpMap = dnsfallback.GetDERPMap()
+	}
+
+	// DERP nodes are addressed by IP rather than by hostname so that the
+	// probe does not depend on DNS resolution.
+	for _, region := range derpMap.Regions {
+		if region.Avoid {
+			continue
+		}
+		provider := DERPMapOther
+		if region.RegionID == preferredDERPRegionID {
+			provider = DERPMapPreferred
+		}
+		for _, node := range region.Nodes {
+			if node.IPv4 == "" || !node.CanPort80 {
+				continue
+			}
+			rawURL := "http://" + node.IPv4 + "/generate_204"
+			parsed, err := url.Parse(rawURL)
+			if err != nil {
+				logf("captivedetection: failed to parse DERP node URL %q: %v", rawURL, err)
+				continue
+			}
+			candidates = append(candidates, Endpoint{
+				URL:                        parsed,
+				StatusCode:                 http.StatusNoContent,
+				SupportsTailscaleChallenge: true,
+				Provider:                   provider,
+			})
+		}
+	}
+
+	// Also offer the default Tailscale coordination server and admin console.
+	// These are likely to be blocked on some networks.
+	for _, rawURL := range []string{
+		"http://controlplane.tailscale.com/generate_204",
+		"http://login.tailscale.com/generate_204",
+	} {
+		parsed, err := url.Parse(rawURL)
+		if err != nil {
+			logf("captivedetection: failed to parse Tailscale URL %q: %v", rawURL, err)
+			continue
+		}
+		candidates = append(candidates, Endpoint{
+			URL:        parsed,
+			StatusCode: http.StatusNoContent,
+			Provider:   Tailscale,
+		})
+	}
+
+	// Order by provider: preferred-region DERP nodes, then other DERP nodes,
+	// then Tailscale endpoints.
+	slices.SortFunc(candidates, func(a, b Endpoint) int {
+		return cmp.Compare(a.Provider, b.Provider)
+	})
+
+	return candidates
+}
+
+// responseLooksLikeCaptive reports whether the given HTTP response deviates from what the Endpoint expects,
+// which is taken as a sign of a captive portal. It returns true when the status code differs from e.StatusCode,
+// when the endpoint supports the Tailscale challenge and the X-Tailscale-Response header is not the expected
+// value, or when e.ExpectedContent is non-empty and not found in the first 4096 bytes of the body.
+// It returns false otherwise, including when reading the body fails. The response body is always closed.
+func (e Endpoint) responseLooksLikeCaptive(r *http.Response, logf logger.Logf) bool {
+	defer r.Body.Close()
+
+	// Check the status code first.
+	if r.StatusCode != e.StatusCode {
+		logf("[v1] unexpected status code in captive portal response: want=%d, got=%d", e.StatusCode, r.StatusCode)
+		return true
+	}
+
+	// If the endpoint supports the Tailscale challenge header, check that the response contains the expected header.
+	if e.SupportsTailscaleChallenge {
+		expectedResponse := "response ts_" + e.URL.Host
+		hasResponse := r.Header.Get("X-Tailscale-Response") == expectedResponse
+		if !hasResponse {
+			// The response did not contain the expected X-Tailscale-Response header, which means we are most likely
+			// behind a captive portal (somebody is tampering with the response headers).
+			logf("captive portal check response did not contain expected X-Tailscale-Response header: want=%q, got=%q", expectedResponse, r.Header.Get("X-Tailscale-Response"))
+			return true
+		}
+	}
+
+	// If we don't have an expected content string, we don't need to check the response body.
+	if e.ExpectedContent == "" {
+		return false
+	}
+
+	// Read at most 4096 bytes of the response body and check if it contains the expected content.
+	b, err := io.ReadAll(io.LimitReader(r.Body, 4096))
+	if err != nil {
+		// A read failure is not treated as evidence of a captive portal.
+		logf("reading captive portal check response body failed: %v", err)
+		return false
+	}
+	hasExpectedContent := mem.Contains(mem.B(b), mem.S(e.ExpectedContent))
+	if !hasExpectedContent {
+		// The response body did not contain the expected content, that means we are most likely behind a captive portal.
+		logf("[v1] captive portal check response body did not contain expected content: want=%q", e.ExpectedContent)
+		return true
+	}
+
+	// If we got here, the response looks good.
+	return false
+}

+ 19 - 0
net/captivedetection/rawconn.go

@@ -0,0 +1,19 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build !(ios || darwin)
+
+package captivedetection
+
+import (
+	"syscall"
+
+	"tailscale.com/types/logger"
+)
+
+// setSocketInterfaceIndex is the stub used when neither the ios nor the darwin build tag is set (see the build constraint at the top of this file).
+// It does not touch the socket: all arguments are ignored and it always returns nil.
+func setSocketInterfaceIndex(c syscall.RawConn, ifIndex int, logf logger.Logf) error {
+	// No-op on non-Darwin platforms: no socket option is set here.
+	return nil
+}

+ 24 - 0
net/captivedetection/rawconn_apple.go

@@ -0,0 +1,24 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+//go:build ios || darwin
+
+package captivedetection
+
+import (
+	"syscall"
+
+	"golang.org/x/sys/unix"
+	"tailscale.com/types/logger"
+)
+
+// setSocketInterfaceIndex sets the IP_BOUND_IF socket option on the given RawConn to ifIndex, which forces the socket to use that interface.
+// A setsockopt failure is only logged via logf; the returned error is solely the result of c.Control. NOTE(review): the option is set at the IPPROTO_IP level, presumably only IPv4 sockets are expected here; confirm against callers.
+func setSocketInterfaceIndex(c syscall.RawConn, ifIndex int, logf logger.Logf) error {
+	return c.Control((func(fd uintptr) {
+		err := unix.SetsockoptInt(int(fd), unix.IPPROTO_IP, unix.IP_BOUND_IF, ifIndex)
+		if err != nil {
+			logf("captivedetection: failed to set IP_BOUND_IF (ifIndex=%d): %v", ifIndex, err)
+		}
+	}))
+}

+ 7 - 4
net/dnsfallback/dnsfallback.go

@@ -219,7 +219,7 @@ func lookup(ctx context.Context, host string, logf logger.Logf, ht *health.Track
 		ip      netip.Addr
 	}
 
-	dm := getDERPMap()
+	dm := GetDERPMap()
 
 	var cands4, cands6 []nameIP
 	for _, dr := range dm.Regions {
@@ -310,9 +310,12 @@ func bootstrapDNSMap(ctx context.Context, serverName string, serverIP netip.Addr
 // https://derp10.tailscale.com/bootstrap-dns
 type dnsMap map[string][]netip.Addr
 
-// getDERPMap returns some DERP map. The DERP servers also run a fallback
-// DNS server.
-func getDERPMap() *tailcfg.DERPMap {
+// GetDERPMap returns a fallback DERP map that is always available, useful for basic
+// bootstrapping purposes. The dynamically updated DERP map in LocalBackend should
+// always be preferred over this. Use this DERP map only when the control plane is
+// unreachable or hasn't been reached yet. The DERP servers in the returned map also
+// run a fallback DNS server.
+func GetDERPMap() *tailcfg.DERPMap {
 	dm := getStaticDERPMap()
 
 	// Merge in any DERP servers from the cached map that aren't in the

+ 2 - 2
net/dnsfallback/dnsfallback_test.go

@@ -18,7 +18,7 @@ import (
 )
 
 func TestGetDERPMap(t *testing.T) {
-	dm := getDERPMap()
+	dm := GetDERPMap()
 	if dm == nil {
 		t.Fatal("nil")
 	}
@@ -78,7 +78,7 @@ func TestCache(t *testing.T) {
 	}
 
 	// Verify that our DERP map is merged with the cache.
-	dm := getDERPMap()
+	dm := GetDERPMap()
 	region, ok := dm.Regions[99]
 	if !ok {
 		t.Fatal("expected region 99")

+ 3 - 76
net/netcheck/netcheck.go

@@ -14,13 +14,11 @@ import (
 	"io"
 	"log"
 	"maps"
-	"math/rand/v2"
 	"net"
 	"net/http"
 	"net/netip"
 	"runtime"
 	"sort"
-	"strings"
 	"sync"
 	"syscall"
 	"time"
@@ -28,6 +26,7 @@ import (
 	"github.com/tcnksm/go-httpstat"
 	"tailscale.com/derp/derphttp"
 	"tailscale.com/envknob"
+	"tailscale.com/net/captivedetection"
 	"tailscale.com/net/dnscache"
 	"tailscale.com/net/neterror"
 	"tailscale.com/net/netmon"
@@ -847,11 +846,8 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe
 
 		tmr := time.AfterFunc(c.captivePortalDelay(), func() {
 			defer close(ch)
-			found, err := c.checkCaptivePortal(ctx, dm, preferredDERP)
-			if err != nil {
-				c.logf("[v1] checkCaptivePortal: %v", err)
-				return
-			}
+			d := captivedetection.NewDetector(c.logf)
+			found := d.Detect(ctx, c.NetMon, dm, preferredDERP)
 			rs.report.CaptivePortal.Set(found)
 		})
 
@@ -988,75 +984,6 @@ func (c *Client) finishAndStoreReport(rs *reportState, dm *tailcfg.DERPMap) *Rep
 	return report
 }
 
-var noRedirectClient = &http.Client{
-	// No redirects allowed
-	CheckRedirect: func(req *http.Request, via []*http.Request) error {
-		return http.ErrUseLastResponse
-	},
-
-	// Remaining fields are the same as the default client.
-	Transport: http.DefaultClient.Transport,
-	Jar:       http.DefaultClient.Jar,
-	Timeout:   http.DefaultClient.Timeout,
-}
-
-// checkCaptivePortal reports whether or not we think the system is behind a
-// captive portal, detected by making a request to a URL that we know should
-// return a "204 No Content" response and checking if that's what we get.
-//
-// The boolean return is whether we think we have a captive portal.
-func (c *Client) checkCaptivePortal(ctx context.Context, dm *tailcfg.DERPMap, preferredDERP int) (bool, error) {
-	defer noRedirectClient.CloseIdleConnections()
-
-	// If we have a preferred DERP region with more than one node, try
-	// that; otherwise, pick a random one not marked as "Avoid".
-	if preferredDERP == 0 || dm.Regions[preferredDERP] == nil ||
-		(preferredDERP != 0 && len(dm.Regions[preferredDERP].Nodes) == 0) {
-		rids := make([]int, 0, len(dm.Regions))
-		for id, reg := range dm.Regions {
-			if reg == nil || reg.Avoid || len(reg.Nodes) == 0 {
-				continue
-			}
-			rids = append(rids, id)
-		}
-		if len(rids) == 0 {
-			return false, nil
-		}
-		preferredDERP = rids[rand.IntN(len(rids))]
-	}
-
-	node := dm.Regions[preferredDERP].Nodes[0]
-
-	if strings.HasSuffix(node.HostName, tailcfg.DotInvalid) {
-		// Don't try to connect to invalid hostnames. This occurred in tests:
-		// https://github.com/tailscale/tailscale/issues/6207
-		// TODO(bradfitz,andrew-d): how to actually handle this nicely?
-		return false, nil
-	}
-
-	req, err := http.NewRequestWithContext(ctx, "GET", "http://"+node.HostName+"/generate_204", nil)
-	if err != nil {
-		return false, err
-	}
-
-	// Note: the set of valid characters in a challenge and the total
-	// length is limited; see isChallengeChar in cmd/derper for more
-	// details.
-	chal := "ts_" + node.HostName
-	req.Header.Set("X-Tailscale-Challenge", chal)
-	r, err := noRedirectClient.Do(req)
-	if err != nil {
-		return false, err
-	}
-	defer r.Body.Close()
-
-	expectedResponse := "response " + chal
-	validResponse := r.Header.Get("X-Tailscale-Response") == expectedResponse
-
-	c.logf("[v2] checkCaptivePortal url=%q status_code=%d valid_response=%v", req.URL.String(), r.StatusCode, validResponse)
-	return r.StatusCode != 204 || !validResponse, nil
-}
-
 // runHTTPOnlyChecks is the netcheck done by environments that can
 // only do HTTP requests, such as ws/wasm.
 func (c *Client) runHTTPOnlyChecks(ctx context.Context, last *Report, rs *reportState, dm *tailcfg.DERPMap) error {

+ 0 - 50
net/netcheck/netcheck_test.go

@@ -15,14 +15,12 @@ import (
 	"sort"
 	"strconv"
 	"strings"
-	"sync/atomic"
 	"testing"
 	"time"
 
 	"tailscale.com/net/netmon"
 	"tailscale.com/net/stun/stuntest"
 	"tailscale.com/tailcfg"
-	"tailscale.com/tstest"
 	"tailscale.com/tstest/nettest"
 )
 
@@ -778,54 +776,6 @@ func TestSortRegions(t *testing.T) {
 	}
 }
 
-func TestNoCaptivePortalWhenUDP(t *testing.T) {
-	nettest.SkipIfNoNetwork(t) // empirically. not sure why.
-
-	// Override noRedirectClient to handle the /generate_204 endpoint
-	var generate204Called atomic.Bool
-	tr := RoundTripFunc(func(req *http.Request) *http.Response {
-		if !strings.HasSuffix(req.URL.String(), "/generate_204") {
-			panic("bad URL: " + req.URL.String())
-		}
-		generate204Called.Store(true)
-		return &http.Response{
-			StatusCode: http.StatusNoContent,
-			Header:     make(http.Header),
-		}
-	})
-
-	tstest.Replace(t, &noRedirectClient.Transport, http.RoundTripper(tr))
-
-	stunAddr, cleanup := stuntest.Serve(t)
-	defer cleanup()
-
-	c := newTestClient(t)
-	c.testEnoughRegions = 1
-	// Set the delay long enough that we have time to cancel it
-	// when our STUN probe succeeds.
-	c.testCaptivePortalDelay = 10 * time.Second
-
-	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
-	defer cancel()
-
-	if err := c.Standalone(ctx, "127.0.0.1:0"); err != nil {
-		t.Fatal(err)
-	}
-
-	r, err := c.GetReport(ctx, stuntest.DERPMapOf(stunAddr.String()), nil)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	// Should not have called our captive portal function.
-	if generate204Called.Load() {
-		t.Errorf("captive portal check called; expected no call")
-	}
-	if r.CaptivePortal != "" {
-		t.Errorf("got CaptivePortal=%q, want empty", r.CaptivePortal)
-	}
-}
-
 type RoundTripFunc func(req *http.Request) *http.Response
 
 func (f RoundTripFunc) RoundTrip(req *http.Request) (*http.Response, error) {

+ 6 - 1
tailcfg/tailcfg.go

@@ -145,7 +145,8 @@ type CapabilityVersion int
 //   - 100: 2024-06-18: Client supports filtertype.Match.SrcCaps (issue #12542)
 //   - 101: 2024-07-01: Client supports SSH agent forwarding when handling connections with /bin/su
 //   - 102: 2024-07-12: NodeAttrDisableMagicSockCryptoRouting support
-const CurrentCapabilityVersion CapabilityVersion = 102
+//   - 103: 2024-07-24: Client supports NodeAttrDisableCaptivePortalDetection
+const CurrentCapabilityVersion CapabilityVersion = 103
 
 type StableID string
 
@@ -2327,6 +2328,10 @@ const (
 	// NodeAttrDisableMagicSockCryptoRouting disables the use of the
 	// magicsock cryptorouting hook. See tailscale/corp#20732.
 	NodeAttrDisableMagicSockCryptoRouting NodeCapability = "disable-magicsock-crypto-routing"
+
+	// NodeAttrDisableCaptivePortalDetection instructs the client to not perform captive portal detection
+	// automatically when the network state changes.
+	NodeAttrDisableCaptivePortalDetection NodeCapability = "disable-captive-portal-detection"
 )
 
 // SetDNSRequest is a request to add a DNS record.