Browse Source

net/netcheck: ensure prior preferred DERP is always in netchecks

In an environment with unstable latency, such as upstream bufferbloat,
there are cases where a full netcheck could drop the prior preferred
DERP (likely home DERP) from future netcheck probe plans. This will then
likely result in a home DERP having a missing sample on the next
incremental netcheck, ultimately resulting in a home DERP move.

This change does not fix our overall response to highly unstable
latency, but it is an incremental improvement to prevent single spurious
samples during a full netcheck from alone triggering a flapping
condition, as now the prior changes to include historical latency will
still provide the desired resistance, and the home DERP should not move
unless latency is consistently worse over a 5 minute period.

Note that there is a nomenclature and semantics issue remaining in the
difference between a report preferred DERP and a home DERP. A report
preferred DERP is aspirational, it is what will be picked as a home DERP
if a home DERP connection needs to be established. A node's home DERP may
be different than a recent preferred DERP, in which case a lot of
netcheck logic is fallible. In future enhancements much of the DERP move
logic should move to consider the home DERP, rather than recent report
preferred DERP.

Updates #8603
Updates #13969

Signed-off-by: James Tucker <[email protected]>
James Tucker 1 year ago
parent
commit
e1e22785b4
2 changed files with 93 additions and 17 deletions
  1. 53 15
      net/netcheck/netcheck.go
  2. 40 2
      net/netcheck/netcheck_test.go

+ 53 - 15
net/netcheck/netcheck.go

@@ -392,10 +392,11 @@ type probePlan map[string][]probe
 // sortRegions returns the regions of dm first sorted
 // from fastest to slowest (based on the 'last' report),
 // end in regions that have no data.
-func sortRegions(dm *tailcfg.DERPMap, last *Report) (prev []*tailcfg.DERPRegion) {
+func sortRegions(dm *tailcfg.DERPMap, last *Report, preferredDERP int) (prev []*tailcfg.DERPRegion) {
 	prev = make([]*tailcfg.DERPRegion, 0, len(dm.Regions))
 	for _, reg := range dm.Regions {
-		if reg.Avoid {
+		// include an otherwise avoided region if it is the current preferred region
+		if reg.Avoid && reg.RegionID != preferredDERP {
 			continue
 		}
 		prev = append(prev, reg)
@@ -420,9 +421,19 @@ func sortRegions(dm *tailcfg.DERPMap, last *Report) (prev []*tailcfg.DERPRegion)
 // a full report, all regions are scanned.)
 const numIncrementalRegions = 3
 
-// makeProbePlan generates the probe plan for a DERPMap, given the most
-// recent report and whether IPv6 is configured on an interface.
-func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (plan probePlan) {
+// makeProbePlan generates the probe plan for a DERPMap, given the most recent
+// report and the current home DERP. preferredDERP is passed independently of
+// last (report) because last is currently nil'd to indicate a desire for a full
+// netcheck.
+//
+// TODO(raggi,jwhited): refactor the callers and this function to be more clear
+// about full vs. incremental netchecks, and remove the need for the history
+// hiding. This was avoided in an incremental change due to exactly this kind of
+// distant coupling.
+// TODO(raggi): change "preferred DERP" from a historical report to "home
+// DERP" as in what DERP is the current home connection, this would further
+// reduce flap events.
+func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report, preferredDERP int) (plan probePlan) {
 	if last == nil || len(last.RegionLatency) == 0 {
 		return makeProbePlanInitial(dm, ifState)
 	}
@@ -433,9 +444,34 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl
 	had4 := len(last.RegionV4Latency) > 0
 	had6 := len(last.RegionV6Latency) > 0
 	hadBoth := have6if && had4 && had6
-	for ri, reg := range sortRegions(dm, last) {
-		if ri == numIncrementalRegions {
-			break
+	// #13969 ensure that the home region is always probed.
+	// If a netcheck has unstable latency, such as a user with large amounts of
+	// bufferbloat or a highly congested connection, there are cases where a full
+	// netcheck may observe a one-off high latency to the current home DERP. Prior
+	// to the forced inclusion of the home DERP, this would result in an
+	// incremental netcheck following such an event to cause a home DERP move, with
+	// restoration back to the home DERP on the next full netcheck ~5 minutes later
+	// - which is highly disruptive when it causes shifts in geo routed subnet
+	// routers. By always including the home DERP in the incremental netcheck, we
+	// ensure that the home DERP is always probed, even if it observed a recent
+	// poor latency sample. This inclusion enables the latency history checks in
+	// home DERP selection to still take effect.
+	// planContainsHome indicates whether the home DERP has been added to the probePlan,
+	// if there is no prior home, then there's no home to additionally include.
+	planContainsHome := preferredDERP == 0
+	for ri, reg := range sortRegions(dm, last, preferredDERP) {
+		regIsHome := reg.RegionID == preferredDERP
+		if ri >= numIncrementalRegions {
+			// planned at least numIncrementalRegions regions and that includes the
+			// last home region (or there was none), plan complete.
+			if planContainsHome {
+				break
+			}
+			// planned at least numIncrementalRegions regions, but not the home region,
+			// check if this is the home region, if not, skip it.
+			if !regIsHome {
+				continue
+			}
 		}
 		var p4, p6 []probe
 		do4 := have4if
@@ -446,7 +482,7 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl
 		tries := 1
 		isFastestTwo := ri < 2
 
-		if isFastestTwo {
+		if isFastestTwo || regIsHome {
 			tries = 2
 		} else if hadBoth {
 			// For dual stack machines, make the 3rd & slower nodes alternate
@@ -457,14 +493,15 @@ func makeProbePlan(dm *tailcfg.DERPMap, ifState *netmon.State, last *Report) (pl
 				do4, do6 = false, true
 			}
 		}
-		if !isFastestTwo && !had6 {
+		if !regIsHome && !isFastestTwo && !had6 {
 			do6 = false
 		}
 
-		if reg.RegionID == last.PreferredDERP {
+		if regIsHome {
 			// But if we already had a DERP home, try extra hard to
 			// make sure it's there so we don't flip flop around.
 			tries = 4
+			planContainsHome = true
 		}
 
 		for try := 0; try < tries; try++ {
@@ -789,9 +826,10 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe
 	c.curState = rs
 	last := c.last
 
-	// Even if we're doing a non-incremental update, we may want to try our
-	// preferred DERP region for captive portal detection. Save that, if we
-	// have it.
+	// Extract preferredDERP from the last report, if available. This will be used
+	// in captive portal detection and DERP flapping suppression. Ideally this would
+	// be the current active home DERP rather than the last report preferred DERP,
+	// but only the latter is presently available.
 	var preferredDERP int
 	if last != nil {
 		preferredDERP = last.PreferredDERP
@@ -848,7 +886,7 @@ func (c *Client) GetReport(ctx context.Context, dm *tailcfg.DERPMap, opts *GetRe
 
 	var plan probePlan
 	if opts == nil || !opts.OnlyTCP443 {
-		plan = makeProbePlan(dm, ifState, last)
+		plan = makeProbePlan(dm, ifState, last, preferredDERP)
 	}
 
 	// If we're doing a full probe, also check for a captive portal. We

+ 40 - 2
net/netcheck/netcheck_test.go

@@ -590,6 +590,40 @@ func TestMakeProbePlan(t *testing.T) {
 				"region-3-v4": []probe{p("3a", 4)},
 			},
 		},
+		{
+			// #13969: ensure that the prior/current home region is always included in
+			// probe plans, so that we don't flap between regions due to a single major
+			// netcheck having excluded the home region due to a spuriously high sample.
+			name:    "ensure_home_region_inclusion",
+			dm:      basicMap,
+			have6if: true,
+			last: &Report{
+				RegionLatency: map[int]time.Duration{
+					1: 50 * time.Millisecond,
+					2: 20 * time.Millisecond,
+					3: 30 * time.Millisecond,
+					4: 40 * time.Millisecond,
+				},
+				RegionV4Latency: map[int]time.Duration{
+					1: 50 * time.Millisecond,
+					2: 20 * time.Millisecond,
+				},
+				RegionV6Latency: map[int]time.Duration{
+					3: 30 * time.Millisecond,
+					4: 40 * time.Millisecond,
+				},
+				PreferredDERP: 1,
+			},
+			want: probePlan{
+				"region-1-v4": []probe{p("1a", 4), p("1a", 4, 60*ms), p("1a", 4, 220*ms), p("1a", 4, 330*ms)},
+				"region-1-v6": []probe{p("1a", 6), p("1a", 6, 60*ms), p("1a", 6, 220*ms), p("1a", 6, 330*ms)},
+				"region-2-v4": []probe{p("2a", 4), p("2b", 4, 24*ms)},
+				"region-2-v6": []probe{p("2a", 6), p("2b", 6, 24*ms)},
+				"region-3-v4": []probe{p("3a", 4), p("3b", 4, 36*ms)},
+				"region-3-v6": []probe{p("3a", 6), p("3b", 6, 36*ms)},
+				"region-4-v4": []probe{p("4a", 4)},
+			},
+		},
 	}
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
@@ -597,7 +631,11 @@ func TestMakeProbePlan(t *testing.T) {
 				HaveV6: tt.have6if,
 				HaveV4: !tt.no4,
 			}
-			got := makeProbePlan(tt.dm, ifState, tt.last)
+			preferredDERP := 0
+			if tt.last != nil {
+				preferredDERP = tt.last.PreferredDERP
+			}
+			got := makeProbePlan(tt.dm, ifState, tt.last, preferredDERP)
 			if !reflect.DeepEqual(got, tt.want) {
 				t.Errorf("unexpected plan; got:\n%v\nwant:\n%v\n", got, tt.want)
 			}
@@ -770,7 +808,7 @@ func TestSortRegions(t *testing.T) {
 	report.RegionLatency[3] = time.Second * time.Duration(6)
 	report.RegionLatency[4] = time.Second * time.Duration(0)
 	report.RegionLatency[5] = time.Second * time.Duration(2)
-	sortedMap := sortRegions(unsortedMap, report)
+	sortedMap := sortRegions(unsortedMap, report, 0)
 
 	// Sorting by latency this should result in rid: 5, 2, 1, 3
 	// rid 4 with latency 0 should be at the end