1 年間前 · 00a4504cf1
--- a/cmd/derpprobe/derpprobe.go
+++ b/cmd/derpprobe/derpprobe.go
@@ -18,19 +18,21 @@ import (
 
				 )
			
 
				 
			
 
				 var (
			
 
				-	derpMapURL       = flag.String("derp-map", "https://login.tailscale.com/derpmap/default", "URL to DERP map (https:// or file://) or 'local' to use the local tailscaled's DERP map")
			
 
				-	versionFlag      = flag.Bool("version", false, "print version and exit")
			
 
				-	listen           = flag.String("listen", ":8030", "HTTP listen address")
			
 
				-	probeOnce        = flag.Bool("once", false, "probe once and print results, then exit; ignores the listen flag")
			
 
				-	spread           = flag.Bool("spread", true, "whether to spread probing over time")
			
 
				-	interval         = flag.Duration("interval", 15*time.Second, "probe interval")
			
 
				-	meshInterval     = flag.Duration("mesh-interval", 15*time.Second, "mesh probe interval")
			
 
				-	stunInterval     = flag.Duration("stun-interval", 15*time.Second, "STUN probe interval")
			
 
				-	tlsInterval      = flag.Duration("tls-interval", 15*time.Second, "TLS probe interval")
			
 
				-	bwInterval       = flag.Duration("bw-interval", 0, "bandwidth probe interval (0 = no bandwidth probing)")
			
 
				-	bwSize           = flag.Int64("bw-probe-size-bytes", 1_000_000, "bandwidth probe size")
			
 
				-	bwTUNIPv4Address = flag.String("bw-tun-ipv4-addr", "", "if specified, bandwidth probes will be performed over a TUN device at this address in order to exercise TCP-in-TCP in similar fashion to TCP over Tailscale via DERP. We will use a /30 subnet including this IP address.")
			
 
				-	regionCode       = flag.String("region-code", "", "probe only this region (e.g. 'lax'); if left blank, all regions will be probed")
			
 
				+	derpMapURL         = flag.String("derp-map", "https://login.tailscale.com/derpmap/default", "URL to DERP map (https:// or file://) or 'local' to use the local tailscaled's DERP map")
			
 
				+	versionFlag        = flag.Bool("version", false, "print version and exit")
			
 
				+	listen             = flag.String("listen", ":8030", "HTTP listen address")
			
 
				+	probeOnce          = flag.Bool("once", false, "probe once and print results, then exit; ignores the listen flag")
			
 
				+	spread             = flag.Bool("spread", true, "whether to spread probing over time")
			
 
				+	interval           = flag.Duration("interval", 15*time.Second, "probe interval")
			
 
				+	meshInterval       = flag.Duration("mesh-interval", 15*time.Second, "mesh probe interval")
			
 
				+	stunInterval       = flag.Duration("stun-interval", 15*time.Second, "STUN probe interval")
			
 
				+	tlsInterval        = flag.Duration("tls-interval", 15*time.Second, "TLS probe interval")
			
 
				+	bwInterval         = flag.Duration("bw-interval", 0, "bandwidth probe interval (0 = no bandwidth probing)")
			
 
				+	bwSize             = flag.Int64("bw-probe-size-bytes", 1_000_000, "bandwidth probe size")
			
 
				+	bwTUNIPv4Address   = flag.String("bw-tun-ipv4-addr", "", "if specified, bandwidth probes will be performed over a TUN device at this address in order to exercise TCP-in-TCP in similar fashion to TCP over Tailscale via DERP; we will use a /30 subnet including this IP address")
			
 
				+	qdPacketsPerSecond = flag.Int("qd-packets-per-second", 0, "if greater than 0, queuing delay will be measured continuously using 260 byte packets (approximate size of a CallMeMaybe packet) sent at this rate per second")
			
 
				+	qdPacketTimeout    = flag.Duration("qd-packet-timeout", 5*time.Second, "queuing delay packets arriving after this period of time from being sent are treated like dropped packets and don't count toward queuing delay timings")
			
 
				+	regionCode         = flag.String("region-code", "", "probe only this region (e.g. 'lax'); if left blank, all regions will be probed")
			
 
				 )
			
 
				 
			
 
				 func main() {
			
@@ -45,6 +47,7 @@ func main() {
 
				 		prober.WithMeshProbing(*meshInterval),
			
 
				 		prober.WithSTUNProbing(*stunInterval),
			
 
				 		prober.WithTLSProbing(*tlsInterval),
			
 
				+		prober.WithQueuingDelayProbing(*qdPacketsPerSecond, *qdPacketTimeout),
			
 
				 	}
			
 
				 	if *bwInterval > 0 {
			
 
				 		opts = append(opts, prober.WithBandwidthProbing(*bwInterval, *bwSize, *bwTUNIPv4Address))
			
@@ -107,7 +110,7 @@ func getOverallStatus(p *prober.Prober) (o overallStatus) {
 
				 			// Do not show probes that have not finished yet.
			
 
				 			continue
			
 
				 		}
			
 
				-		if i.Result {
			
 
				+		if i.Status == prober.ProbeStatusSucceeded {
			
 
				 			o.addGoodf("%s: %s", p, i.Latency)
			
 
				 		} else {
			
 
				 			o.addBadf("%s: %s", p, i.Error)
			
--- a/prober/derp.go
+++ b/prober/derp.go
@@ -8,6 +8,7 @@ import (
 
				 	"cmp"
			
 
				 	"context"
			
 
				 	crand "crypto/rand"
			
 
				+	"encoding/binary"
			
 
				 	"encoding/json"
			
 
				 	"errors"
			
 
				 	"expvar"
			
@@ -17,6 +18,7 @@ import (
 
				 	"net"
			
 
				 	"net/http"
			
 
				 	"net/netip"
			
 
				+	"slices"
			
 
				 	"strconv"
			
 
				 	"strings"
			
 
				 	"sync"
			
@@ -53,6 +55,10 @@ type derpProber struct {
 
				 	bwProbeSize     int64
			
 
				 	bwTUNIPv4Prefix *netip.Prefix // or nil to not use TUN
			
 
				 
			
 
				+	// Optional queuing delay probing.
			
 
				+	qdPacketsPerSecond int // in packets per second
			
 
				+	qdPacketTimeout    time.Duration
			
 
				+
			
 
				 	// Optionally restrict probes to a single regionCode.
			
 
				 	regionCode string
			
 
				 
			
@@ -64,6 +70,7 @@ type derpProber struct {
 
				 	udpProbeFn  func(string, int) ProbeClass
			
 
				 	meshProbeFn func(string, string) ProbeClass
			
 
				 	bwProbeFn   func(string, string, int64) ProbeClass
			
 
				+	qdProbeFn   func(string, string, int, time.Duration) ProbeClass
			
 
				 
			
 
				 	sync.Mutex
			
 
				 	lastDERPMap   *tailcfg.DERPMap
			
@@ -93,6 +100,16 @@ func WithBandwidthProbing(interval time.Duration, size int64, tunAddress string)
 
				 	}
			
 
				 }
			
 
				 
			
 
				// WithQueuingDelayProbing configures queuing delay probing.
				//
				// qdPacketsPerSecond is the number of probe packets sent per second; queuing
				// delay probes are only registered when it is greater than 0.
				// qdPacketTimeout is the amount of time after which a sent packet is
				// considered to have timed out.
				func WithQueuingDelayProbing(qdPacketsPerSecond int, qdPacketTimeout time.Duration) DERPOpt {
					return func(d *derpProber) {
						d.qdPacketsPerSecond = qdPacketsPerSecond
						d.qdPacketTimeout = qdPacketTimeout
					}
				}
			
 
				+
			
 
				 // WithMeshProbing enables mesh probing. When enabled, a small message will be
			
 
				 // transferred through each DERP server and each pair of DERP servers.
			
 
				 func WithMeshProbing(interval time.Duration) DERPOpt {
			
@@ -147,6 +164,7 @@ func DERP(p *Prober, derpMapURL string, opts ...DERPOpt) (*derpProber, error) {
 
				 	d.udpProbeFn = d.ProbeUDP
			
 
				 	d.meshProbeFn = d.probeMesh
			
 
				 	d.bwProbeFn = d.probeBandwidth
			
 
				+	d.qdProbeFn = d.probeQueuingDelay
			
 
				 	return d, nil
			
 
				 }
			
 
				 
			
@@ -213,7 +231,7 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
 
				 					}
			
 
				 				}
			
 
				 
			
 
				-				if d.bwInterval > 0 && d.bwProbeSize > 0 {
			
 
				+				if d.bwInterval != 0 && d.bwProbeSize > 0 {
			
 
				 					n := fmt.Sprintf("derp/%s/%s/%s/bw", region.RegionCode, server.Name, to.Name)
			
 
				 					wantProbes[n] = true
			
 
				 					if d.probes[n] == nil {
			
@@ -225,6 +243,15 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
 
				 						d.probes[n] = d.p.Run(n, d.bwInterval, labels, d.bwProbeFn(server.Name, to.Name, d.bwProbeSize))
			
 
				 					}
			
 
				 				}
			
 
				+
			
 
				+				if d.qdPacketsPerSecond > 0 {
			
 
				+					n := fmt.Sprintf("derp/%s/%s/%s/qd", region.RegionCode, server.Name, to.Name)
			
 
				+					wantProbes[n] = true
			
 
				+					if d.probes[n] == nil {
			
 
				+						log.Printf("adding DERP queuing delay probe for %s->%s (%s)", server.Name, to.Name, region.RegionName)
			
 
				+						d.probes[n] = d.p.Run(n, -10*time.Second, labels, d.qdProbeFn(server.Name, to.Name, d.qdPacketsPerSecond, d.qdPacketTimeout))
			
 
				+					}
			
 
				+				}
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
@@ -240,7 +267,7 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
 
				 	return nil
			
 
				 }
			
 
				 
			
 
				-// probeMesh returs a probe class that sends a test packet through a pair of DERP
			
 
				+// probeMesh returns a probe class that sends a test packet through a pair of DERP
			
 
				 // servers (or just one server, if 'from' and 'to' are the same). 'from' and 'to'
			
 
				 // are expected to be names (DERPNode.Name) of two DERP servers in the same region.
			
 
				 func (d *derpProber) probeMesh(from, to string) ProbeClass {
			
@@ -263,7 +290,7 @@ func (d *derpProber) probeMesh(from, to string) ProbeClass {
 
				 	}
			
 
				 }
			
 
				 
			
 
				-// probeBandwidth returs a probe class that sends a payload of a given size
			
 
				+// probeBandwidth returns a probe class that sends a payload of a given size
			
 
				 // through a pair of DERP servers (or just one server, if 'from' and 'to' are
			
 
				 // the same). 'from' and 'to' are expected to be names (DERPNode.Name) of two
			
 
				 // DERP servers in the same region.
			
@@ -295,6 +322,193 @@ func (d *derpProber) probeBandwidth(from, to string, size int64) ProbeClass {
 
				 	}
			
 
				 }
			
 
				 
			
 
				// probeQueuingDelay returns a probe class that continuously sends packets
				// through a pair of DERP servers (or just one server, if 'from' and 'to' are
				// the same) at a rate of `packetsPerSecond` packets per second in order to
				// measure queuing delays. Packets arriving after `packetTimeout` don't contribute
				// to the queuing delay measurement and are recorded as dropped. 'from' and 'to' are
				// expected to be names (DERPNode.Name) of two DERP servers in the same region,
				// and may refer to the same server.
				//
				// The returned class exports two custom metrics: a counter of dropped
				// packets and a histogram of the measured delays in seconds.
				func (d *derpProber) probeQueuingDelay(from, to string, packetsPerSecond int, packetTimeout time.Duration) ProbeClass {
					derpPath := "mesh"
					if from == to {
						derpPath = "single"
					}
					var packetsDropped expvar.Float
					qdh := newHistogram([]float64{.005, .01, .025, .05, .1, .25, .5, 1})
					return ProbeClass{
						Probe: func(ctx context.Context) error {
							fromN, toN, err := d.getNodePair(from, to)
							if err != nil {
								return err
							}
							return derpProbeQueuingDelay(ctx, d.lastDERPMap, fromN, toN, packetsPerSecond, packetTimeout, &packetsDropped, qdh)
						},
						Class:  "derp_qd",
						Labels: Labels{"derp_path": derpPath},
						Metrics: func(l prometheus.Labels) []prometheus.Metric {
							// Take a private snapshot while holding the histogram lock. The
							// const histogram keeps a reference to the map it is given and
							// reads it after this function has returned, so handing it the
							// live map would race with concurrent add() calls.
							//
							// The histogram records how many observations landed in each
							// individual bucket, whereas a Prometheus const histogram wants
							// cumulative counts per upper bound; convert while copying.
							// qdh.buckets is sorted in ascending order by newHistogram.
							qdh.mx.Lock()
							count, sum := qdh.count, qdh.sum
							cumulative := make(map[float64]uint64, len(qdh.buckets))
							var running uint64
							for _, upperBound := range qdh.buckets {
								running += qdh.bucketedCounts[upperBound]
								cumulative[upperBound] = running
							}
							qdh.mx.Unlock()

							return []prometheus.Metric{
								prometheus.MustNewConstMetric(prometheus.NewDesc("derp_qd_probe_dropped_packets", "Total packets dropped", nil, l), prometheus.CounterValue, packetsDropped.Value()),
								prometheus.MustNewConstHistogram(prometheus.NewDesc("derp_qd_probe_delays_seconds", "Distribution of queuing delays", nil, l), count, sum, cumulative),
							}
						},
					}
				}
			
 
				+
			
 
				// derpProbeQueuingDelay opens one local DERP client towards 'from' and one
				// towards 'to' (which may be the same server) and then streams packets
				// between them to measure queuing delays. Both clients are closed before
				// the function returns. A failure of the measurement loop is returned
				// wrapped with the short public keys of the two clients.
				func derpProbeQueuingDelay(ctx context.Context, dm *tailcfg.DERPMap, from, to *tailcfg.DERPNode, packetsPerSecond int, packetTimeout time.Duration, packetsDropped *expvar.Float, qdh *histogram) (err error) {
					// How long to let the sending node learn about the receiving client
					// when they sit on different nodes of the region; pretty arbitrary.
					const meshSettleDelay = 100 * time.Millisecond

					// This probe uses clients with isProber=false to avoid spamming the derper
					// logs with every packet sent by the queuing delay probe.
					sender, err := newConn(ctx, dm, from, false)
					if err != nil {
						return err
					}
					defer sender.Close()

					receiver, err := newConn(ctx, dm, to, false)
					if err != nil {
						return err
					}
					defer receiver.Close()

					if from.Name != to.Name {
						time.Sleep(meshSettleDelay)
					}

					runErr := runDerpProbeQueuingDelayContinously(ctx, from, to, sender, receiver, packetsPerSecond, packetTimeout, packetsDropped, qdh)
					if runErr == nil {
						return nil
					}
					// Record pubkeys on failed probes to aid investigation.
					return fmt.Errorf("%s -> %s: %w",
						sender.SelfPublicKey().ShortString(),
						receiver.SelfPublicKey().ShortString(), runErr)
				}
			
 
				+
			
 
				// runDerpProbeQueuingDelayContinously sends a 260 byte packet from fromc to
				// toc packetsPerSecond times per second and records, for every packet that
				// arrives while its transmission record is still retained, the time between
				// sending and receiving in qdh. Packets whose record had to be evicted to
				// make room for newer ones are counted in packetsDropped.
				//
				// It runs until ctx is done, a send fails, or receiving fails, and always
				// returns a non-nil error unless the receiver stops without one. Both
				// clients are closed and both worker goroutines have exited by the time it
				// returns.
				func runDerpProbeQueuingDelayContinously(ctx context.Context, from, to *tailcfg.DERPNode, fromc, toc *derphttp.Client, packetsPerSecond int, packetTimeout time.Duration, packetsDropped *expvar.Float, qdh *histogram) error {
					// Make sure all goroutines have finished.
					var wg sync.WaitGroup
					defer wg.Wait()

					// Close the clients to make sure goroutines that are reading/writing from them terminate.
					defer fromc.Close()
					defer toc.Close()

					// The send interval is derived by dividing by packetsPerSecond, and a
					// ticker needs a positive period, so reject rates that would panic.
					if packetsPerSecond <= 0 {
						return fmt.Errorf("invalid packets per second %d: must be positive", packetsPerSecond)
					}
					sendInterval := time.Second / time.Duration(packetsPerSecond)
					if sendInterval <= 0 {
						return fmt.Errorf("invalid packets per second %d: too large", packetsPerSecond)
					}

					// Each packet starts with its big-endian sequence number.
					const seqSize = 8

					type txRecord struct {
						at  time.Time
						seq uint64
					}
					// txRecords is sized to hold enough transmission records to keep timings
					// for packets up to their timeout. As records age out of the front of this
					// list, if the associated packet arrives, we won't have a txRecord for it
					// and will consider it to have timed out.
					//
					// The size is rounded up and never less than one, so that sub-second
					// timeouts still leave room for a record instead of yielding an empty
					// list that cannot be trimmed.
					maxRecords := max(1, int((time.Duration(packetsPerSecond)*packetTimeout+time.Second-1)/time.Second))
					txRecords := make([]txRecord, 0, maxRecords)
					var txRecordsMu sync.Mutex

					// Send the packets.
					sendErrC := make(chan error, 1)
					// TODO: construct a disco CallMeMaybe in the same fashion as magicsock, e.g. magic bytes, src pub, seal payload.
					// DERP server handling of disco may vary from non-disco, and we may want to measure queue delay of both.
					pkt := make([]byte, 260) // the same size as a CallMeMaybe packet observed on a Tailscale client.
					if _, err := crand.Read(pkt); err != nil {
						return fmt.Errorf("generating packet payload: %w", err)
					}

					wg.Add(1)
					go func() {
						defer wg.Done()
						t := time.NewTicker(sendInterval)
						defer t.Stop()

						seq := uint64(0)
						for {
							select {
							case <-ctx.Done():
								return
							case <-t.C:
								txRecordsMu.Lock()
								if len(txRecords) >= maxRecords {
									// The oldest packet has been outstanding for the whole
									// retention window; give up on it.
									txRecords = slices.Delete(txRecords, 0, 1)
									packetsDropped.Add(1)
								}
								txRecords = append(txRecords, txRecord{time.Now(), seq})
								txRecordsMu.Unlock()
								binary.BigEndian.PutUint64(pkt, seq)
								seq++
								if err := fromc.Send(toc.SelfPublicKey(), pkt); err != nil {
									sendErrC <- fmt.Errorf("sending packet %w", err)
									return
								}
							}
						}
					}()

					// Receive the packets.
					recvFinishedC := make(chan error, 1)
					wg.Add(1)
					go func() {
						defer wg.Done()
						defer close(recvFinishedC) // to break out of 'select' below.
						for {
							m, err := toc.Recv()
							if err != nil {
								recvFinishedC <- err
								return
							}
							switch v := m.(type) {
							case derp.ReceivedPacket:
								now := time.Now()
								if v.Source != fromc.SelfPublicKey() {
									recvFinishedC <- fmt.Errorf("got data packet from unexpected source, %v", v.Source)
									return
								}
								// Decoding the sequence number needs at least seqSize bytes.
								if len(v.Data) < seqSize {
									recvFinishedC <- fmt.Errorf("got short data packet of %d bytes", len(v.Data))
									return
								}
								seq := binary.BigEndian.Uint64(v.Data)
								txRecordsMu.Lock()
								// Records are appended in increasing sequence order, so the
								// scan can stop at the first record that is not older than
								// the received packet.
								for i, record := range txRecords {
									if record.seq < seq {
										continue
									}
									if record.seq == seq {
										rtt := now.Sub(record.at)
										qdh.add(rtt.Seconds())
										txRecords = slices.Delete(txRecords, i, i+1)
									}
									// Otherwise no sent time was found: probably a late
									// arrival already recorded as a drop by the sender when
									// its record was deleted.
									break
								}
								txRecordsMu.Unlock()

							case derp.KeepAliveMessage:
								// Silently ignore.

							default:
								log.Printf("%v: ignoring Recv frame type %T", to.Name, v)
								// Loop.
							}
						}
					}()

					select {
					case <-ctx.Done():
						return fmt.Errorf("timeout: %w", ctx.Err())
					case err := <-sendErrC:
						return fmt.Errorf("error sending via %q: %w", from.Name, err)
					case err := <-recvFinishedC:
						if err != nil {
							return fmt.Errorf("error receiving from %q: %w", to.Name, err)
						}
					}
					return nil
				}
			
 
				+
			
 
				 // getNodePair returns DERPNode objects for two DERP servers based on their
			
 
				 // short names.
			
 
				 func (d *derpProber) getNodePair(n1, n2 string) (ret1, ret2 *tailcfg.DERPNode, _ error) {
			
@@ -573,6 +787,8 @@ func runDerpProbeNodePair(ctx context.Context, from, to *tailcfg.DERPNode, fromc
 
				 					recvc <- fmt.Errorf("got data packet %d from unexpected source, %v", idx, v.Source)
			
 
				 					return
			
 
				 				}
			
 
				+				// This assumes that the packets are received reliably and in order.
			
 
				+				// The DERP protocol does not guarantee this, but this probe assumes it.
			
 
				 				if got, want := v.Data, pkts[idx]; !bytes.Equal(got, want) {
			
 
				 					recvc <- fmt.Errorf("unexpected data packet %d (out of %d)", idx, len(pkts))
			
 
				 					return
			
--- a/prober/histogram.go
+++ b/prober/histogram.go
@@ -0,0 +1,50 @@
 
				+// Copyright (c) Tailscale Inc & AUTHORS
			
 
				+// SPDX-License-Identifier: BSD-3-Clause
			
 
				+
			
 
				+package prober
			
 
				+
			
 
				+import (
			
 
				+	"slices"
			
 
				+	"sync"
			
 
				+)
			
 
				+
			
 
				// histogram serves as an adapter to the Prometheus histogram datatype.
				// The prober framework passes labels at custom metric collection time that
				// it expects to be coupled with the returned metrics. See ProbeClass.Metrics
				// and its call sites. Native prometheus histograms cannot be collected while
				// injecting more labels. Instead we use this type and pass observations +
				// collection labels to prometheus.MustNewConstHistogram() at prometheus
				// metric collection time.
				//
				// All fields are guarded by mx.
				type histogram struct {
					// count is the total number of observations.
					count uint64
					// sum is the sum of all observed values.
					sum float64
					// buckets holds the bucket upper bounds in ascending order.
					buckets []float64
					// bucketedCounts maps a bucket's upper bound to the number of
					// observations that fell into that bucket alone, i.e. the counts are
					// not cumulative. Observations above the largest bound are only
					// reflected in count and sum.
					bucketedCounts map[float64]uint64
					mx             sync.Mutex
				}

				// newHistogram constructs a histogram that buckets data based on the given
				// slice of upper bounds. The bounds may be given in any order; the slice
				// passed in is copied and left untouched.
				func newHistogram(buckets []float64) *histogram {
					// Sort a private copy so that the caller's slice is not reordered
					// behind its back.
					sorted := slices.Clone(buckets)
					slices.Sort(sorted)
					return &histogram{
						buckets:        sorted,
						bucketedCounts: make(map[float64]uint64, len(sorted)),
					}
				}

				// add records the observation v: it bumps count and sum, and increments
				// the counter of the smallest bucket whose upper bound is not exceeded
				// by v, if there is one.
				func (h *histogram) add(v float64) {
					h.mx.Lock()
					defer h.mx.Unlock()

					h.count++
					h.sum += v

					for _, b := range h.buckets {
						if v > b {
							continue
						}
						h.bucketedCounts[b]++
						return
					}
				}
			
--- a/prober/histogram_test.go
+++ b/prober/histogram_test.go
@@ -0,0 +1,29 @@
 
				+// Copyright (c) Tailscale Inc & AUTHORS
			
 
				+// SPDX-License-Identifier: BSD-3-Clause
			
 
				+
			
 
				+package prober
			
 
				+
			
 
				+import (
			
 
				+	"testing"
			
 
				+
			
 
				+	"github.com/google/go-cmp/cmp"
			
 
				+)
			
 
				+
			
 
				// TestHistogram feeds a two-bucket histogram values below, on and above
				// its bounds and checks the resulting count, sum and per-bucket counters.
				func TestHistogram(t *testing.T) {
					h := newHistogram([]float64{1, 2})
					for _, v := range []float64{0.5, 1, 1.5, 2, 2.5} {
						h.add(v)
					}

					checks := []struct {
						what      string
						got, want any
					}{
						{"count", h.count, uint64(5)},
						{"sum", h.sum, 7.5},
						{"bucketedCounts", h.bucketedCounts, map[float64]uint64{1: 2, 2: 2}},
					}
					for _, c := range checks {
						if diff := cmp.Diff(c.got, c.want); diff != "" {
							t.Errorf("wrong %s; (-got+want):%v", c.what, diff)
						}
					}
				}
			
--- a/prober/prober.go
+++ b/prober/prober.go
@@ -94,6 +94,9 @@ func newForTest(now func() time.Time, newTicker func(time.Duration) ticker) *Pro
 
				 
			
 
				 // Run executes probe class function every interval, and exports probe results under probeName.
			
 
				 //
			
 
				+// If interval is negative, the probe will run continuously. If it encounters a failure while
			
 
				+// running continuously, it will pause for -1*interval and then retry.
			
 
				+//
			
 
				 // Registering a probe under an already-registered name panics.
			
 
				 func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc ProbeClass) *Probe {
			
 
				 	p.mu.Lock()
			
@@ -256,6 +259,11 @@ type Probe struct {
 
				 	latencyHist *ring.Ring
			
 
				 }
			
 
				 
			
 
				// IsContinuous reports whether this probe runs continuously, which is
				// signalled by a negative interval.
				func (p *Probe) IsContinuous() bool {
					negativeInterval := p.interval < 0
					return negativeInterval
				}
			
 
				+
			
 
				 // Close shuts down the Probe and unregisters it from its Prober.
			
 
				 // It is safe to Run a new probe of the same name after Close returns.
			
 
				 func (p *Probe) Close() error {
			
@@ -288,6 +296,22 @@ func (p *Probe) loop() {
 
				 		return
			
 
				 	}
			
 
				 
			
 
				+	if p.IsContinuous() {
			
 
				+		// Probe function is going to run continuously.
			
 
				+		for {
			
 
				+			p.run()
			
 
				+			// Wait and then retry if probe fails. We use the inverse of the
			
 
				+			// configured negative interval as our sleep period.
			
 
				+			// TODO(percy):implement exponential backoff, possibly using logtail/backoff.
			
 
				+			select {
			
 
				+			case <-time.After(-1 * p.interval):
			
 
				+				p.run()
			
 
				+			case <-p.ctx.Done():
			
 
				+				return
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				 	p.tick = p.prober.newTicker(p.interval)
			
 
				 	defer p.tick.Stop()
			
 
				 	for {
			
@@ -323,9 +347,13 @@ func (p *Probe) run() (pi ProbeInfo, err error) {
 
				 			p.recordEnd(err)
			
 
				 		}
			
 
				 	}()
			
 
				-	timeout := time.Duration(float64(p.interval) * 0.8)
			
 
				-	ctx, cancel := context.WithTimeout(p.ctx, timeout)
			
 
				-	defer cancel()
			
 
				+	ctx := p.ctx
			
 
				+	if !p.IsContinuous() {
			
 
				+		timeout := time.Duration(float64(p.interval) * 0.8)
			
 
				+		var cancel func()
			
 
				+		ctx, cancel = context.WithTimeout(ctx, timeout)
			
 
				+		defer cancel()
			
 
				+	}
			
 
				 
			
 
				 	err = p.probeClass.Probe(ctx)
			
 
				 	p.recordEnd(err)
			
@@ -365,6 +393,16 @@ func (p *Probe) recordEnd(err error) {
 
				 	p.successHist = p.successHist.Next()
			
 
				 }
			
 
				 
			
 
				// ProbeStatus indicates the status of a probe.
				type ProbeStatus string

				// The possible values of ProbeStatus. They are declared with the
				// ProbeStatus type so that they cannot be mixed up with arbitrary strings.
				const (
					// ProbeStatusUnknown is reported when none of the other states applies,
					// e.g. for a probe that has not run yet.
					ProbeStatusUnknown ProbeStatus = "unknown"
					// ProbeStatusRunning is reported while the latest run has started but
					// not ended.
					ProbeStatusRunning ProbeStatus = "running"
					// ProbeStatusFailed is reported when the latest run ended with an error.
					ProbeStatusFailed ProbeStatus = "failed"
					// ProbeStatusSucceeded is reported when the latest run succeeded.
					ProbeStatusSucceeded ProbeStatus = "succeeded"
				)
			
 
				+
			
 
				 // ProbeInfo is a snapshot of the configuration and state of a Probe.
			
 
				 type ProbeInfo struct {
			
 
				 	Name            string
			
@@ -374,7 +412,7 @@ type ProbeInfo struct {
 
				 	Start           time.Time
			
 
				 	End             time.Time
			
 
				 	Latency         time.Duration
			
 
				-	Result          bool
			
 
				+	Status          ProbeStatus
			
 
				 	Error           string
			
 
				 	RecentResults   []bool
			
 
				 	RecentLatencies []time.Duration
			
@@ -402,6 +440,10 @@ func (pb ProbeInfo) RecentMedianLatency() time.Duration {
 
				 	return pb.RecentLatencies[len(pb.RecentLatencies)/2]
			
 
				 }
			
 
				 
			
 
				// Continuous reports whether the probe described by pb runs continuously,
				// which is signalled by a negative Interval.
				func (pb ProbeInfo) Continuous() bool {
					return pb.Interval < 0
				}
			
 
				+
			
 
				 // ProbeInfo returns the state of all probes.
			
 
				 func (p *Prober) ProbeInfo() map[string]ProbeInfo {
			
 
				 	out := map[string]ProbeInfo{}
			
@@ -429,9 +471,14 @@ func (probe *Probe) probeInfoLocked() ProbeInfo {
 
				 		Labels:   probe.metricLabels,
			
 
				 		Start:    probe.start,
			
 
				 		End:      probe.end,
			
 
				-		Result:   probe.succeeded,
			
 
				 	}
			
 
				-	if probe.lastErr != nil {
			
 
				+	inf.Status = ProbeStatusUnknown
			
 
				+	if probe.end.Before(probe.start) {
			
 
				+		inf.Status = ProbeStatusRunning
			
 
				+	} else if probe.succeeded {
			
 
				+		inf.Status = ProbeStatusSucceeded
			
 
				+	} else if probe.lastErr != nil {
			
 
				+		inf.Status = ProbeStatusFailed
			
 
				 		inf.Error = probe.lastErr.Error()
			
 
				 	}
			
 
				 	if probe.latency > 0 {
			
@@ -467,7 +514,7 @@ func (p *Prober) RunHandler(w http.ResponseWriter, r *http.Request) error {
 
				 	p.mu.Lock()
			
 
				 	probe, ok := p.probes[name]
			
 
				 	p.mu.Unlock()
			
 
				-	if !ok {
			
 
				+	if !ok || probe.IsContinuous() {
			
 
				 		return tsweb.Error(http.StatusNotFound, fmt.Sprintf("unknown probe %q", name), nil)
			
 
				 	}
			
 
				 
			
@@ -531,7 +578,8 @@ func (p *Probe) Collect(ch chan<- prometheus.Metric) {
 
				 	if !p.start.IsZero() {
			
 
				 		ch <- prometheus.MustNewConstMetric(p.mStartTime, prometheus.GaugeValue, float64(p.start.Unix()))
			
 
				 	}
			
 
				-	if p.end.IsZero() {
			
 
				+	// For periodic probes that haven't ended, don't collect probe metrics yet.
			
 
				+	if p.end.IsZero() && !p.IsContinuous() {
			
 
				 		return
			
 
				 	}
			
 
				 	ch <- prometheus.MustNewConstMetric(p.mEndTime, prometheus.GaugeValue, float64(p.end.Unix()))
			
--- a/prober/prober_test.go
+++ b/prober/prober_test.go
@@ -316,7 +316,7 @@ func TestProberProbeInfo(t *testing.T) {
 
				 			Interval:        probeInterval,
			
 
				 			Labels:          map[string]string{"class": "", "name": "probe1"},
			
 
				 			Latency:         500 * time.Millisecond,
			
 
				-			Result:          true,
			
 
				+			Status:          ProbeStatusSucceeded,
			
 
				 			RecentResults:   []bool{true},
			
 
				 			RecentLatencies: []time.Duration{500 * time.Millisecond},
			
 
				 		},
			
@@ -324,6 +324,7 @@ func TestProberProbeInfo(t *testing.T) {
 
				 			Name:            "probe2",
			
 
				 			Interval:        probeInterval,
			
 
				 			Labels:          map[string]string{"class": "", "name": "probe2"},
			
 
				+			Status:          ProbeStatusFailed,
			
 
				 			Error:           "error2",
			
 
				 			RecentResults:   []bool{false},
			
 
				 			RecentLatencies: nil, // no latency for failed probes
			
@@ -349,7 +350,7 @@ func TestProbeInfoRecent(t *testing.T) {
 
				 	}{
			
 
				 		{
			
 
				 			name:                    "no_runs",
			
 
				-			wantProbeInfo:           ProbeInfo{},
			
 
				+			wantProbeInfo:           ProbeInfo{Status: ProbeStatusUnknown},
			
 
				 			wantRecentSuccessRatio:  0,
			
 
				 			wantRecentMedianLatency: 0,
			
 
				 		},
			
@@ -358,7 +359,7 @@ func TestProbeInfoRecent(t *testing.T) {
 
				 			results: []probeResult{{latency: 100 * time.Millisecond, err: nil}},
			
 
				 			wantProbeInfo: ProbeInfo{
			
 
				 				Latency:         100 * time.Millisecond,
			
 
				-				Result:          true,
			
 
				+				Status:          ProbeStatusSucceeded,
			
 
				 				RecentResults:   []bool{true},
			
 
				 				RecentLatencies: []time.Duration{100 * time.Millisecond},
			
 
				 			},
			
@@ -369,7 +370,7 @@ func TestProbeInfoRecent(t *testing.T) {
 
				 			name:    "single_failure",
			
 
				 			results: []probeResult{{latency: 100 * time.Millisecond, err: errors.New("error123")}},
			
 
				 			wantProbeInfo: ProbeInfo{
			
 
				-				Result:          false,
			
 
				+				Status:          ProbeStatusFailed,
			
 
				 				RecentResults:   []bool{false},
			
 
				 				RecentLatencies: nil,
			
 
				 				Error:           "error123",
			
@@ -390,7 +391,7 @@ func TestProbeInfoRecent(t *testing.T) {
 
				 				{latency: 80 * time.Millisecond, err: nil},
			
 
				 			},
			
 
				 			wantProbeInfo: ProbeInfo{
			
 
				-				Result:        true,
			
 
				+				Status:        ProbeStatusSucceeded,
			
 
				 				Latency:       80 * time.Millisecond,
			
 
				 				RecentResults: []bool{false, true, true, false, true, true, false, true},
			
 
				 				RecentLatencies: []time.Duration{
			
@@ -420,7 +421,7 @@ func TestProbeInfoRecent(t *testing.T) {
 
				 				{latency: 110 * time.Millisecond, err: nil},
			
 
				 			},
			
 
				 			wantProbeInfo: ProbeInfo{
			
 
				-				Result:        true,
			
 
				+				Status:        ProbeStatusSucceeded,
			
 
				 				Latency:       110 * time.Millisecond,
			
 
				 				RecentResults: []bool{true, true, true, true, true, true, true, true, true, true},
			
 
				 				RecentLatencies: []time.Duration{
			
@@ -483,7 +484,7 @@ func TestProberRunHandler(t *testing.T) {
 
				 				ProbeInfo: ProbeInfo{
			
 
				 					Name:          "success",
			
 
				 					Interval:      probeInterval,
			
 
				-					Result:        true,
			
 
				+					Status:        ProbeStatusSucceeded,
			
 
				 					RecentResults: []bool{true, true},
			
 
				 				},
			
 
				 				PreviousSuccessRatio: 1,
			
@@ -498,7 +499,7 @@ func TestProberRunHandler(t *testing.T) {
 
				 				ProbeInfo: ProbeInfo{
			
 
				 					Name:          "failure",
			
 
				 					Interval:      probeInterval,
			
 
				-					Result:        false,
			
 
				+					Status:        ProbeStatusFailed,
			
 
				 					Error:         "error123",
			
 
				 					RecentResults: []bool{false, false},
			
 
				 				},
			
--- a/prober/status.go
+++ b/prober/status.go
@@ -62,8 +62,9 @@ func (p *Prober) StatusHandler(opts ...statusHandlerOpt) tsweb.ReturnHandlerFunc
 
				 	return func(w http.ResponseWriter, r *http.Request) error {
			
 
				 		type probeStatus struct {
			
 
				 			ProbeInfo
			
 
				-			TimeSinceLast time.Duration
			
 
				-			Links         map[string]template.URL
			
 
				+			TimeSinceLastStart time.Duration
			
 
				+			TimeSinceLastEnd   time.Duration
			
 
				+			Links              map[string]template.URL
			
 
				 		}
			
 
				 		vars := struct {
			
 
				 			Title           string
			
@@ -81,12 +82,15 @@ func (p *Prober) StatusHandler(opts ...statusHandlerOpt) tsweb.ReturnHandlerFunc
 
				 
			
 
				 		for name, info := range p.ProbeInfo() {
			
 
				 			vars.TotalProbes++
			
 
				-			if !info.Result {
			
 
				+			if info.Error != "" {
			
 
				 				vars.UnhealthyProbes++
			
 
				 			}
			
 
				 			s := probeStatus{ProbeInfo: info}
			
 
				+			if !info.Start.IsZero() {
			
 
				+				s.TimeSinceLastStart = time.Since(info.Start).Truncate(time.Second)
			
 
				+			}
			
 
				 			if !info.End.IsZero() {
			
 
				-				s.TimeSinceLast = time.Since(info.End).Truncate(time.Second)
			
 
				+				s.TimeSinceLastEnd = time.Since(info.End).Truncate(time.Second)
			
 
				 			}
			
 
				 			for textTpl, urlTpl := range params.probeLinks {
			
 
				 				text, err := renderTemplate(textTpl, info)
			
--- a/prober/status.html
+++ b/prober/status.html
@@ -73,8 +73,9 @@
 
				             <th>Name</th>
			
 
				             <th>Probe Class & Labels</th>
			
 
				             <th>Interval</th>
			
 
				-            <th>Last Attempt</th>
			
 
				-            <th>Success</th>
			
 
				+            <th>Last Finished</th>
			
 
				+            <th>Last Started</th>
			
 
				+            <th>Status</th>
			
 
				             <th>Latency</th>
			
 
				             <th>Last Error</th>
			
 
				         </tr></thead>
			
@@ -85,9 +86,11 @@
 
				                 {{$name}}
			
 
				                 {{range $text, $url := $probeInfo.Links}}
			
 
				                 <br/>
			
 
				-                <button onclick="location.href='{{$url}}';" type="button">
			
 
				-                    {{$text}}
			
 
				-                </button>
			
 
				+                {{if not $probeInfo.Continuous}}
			
 
				+                    <button onclick="location.href='{{$url}}';" type="button">
			
 
				+                        {{$text}}
			
 
				+                    </button>
			
 
				+                {{end}}
			
 
				                 {{end}}
			
 
				             </td>
			
 
				             <td>{{$probeInfo.Class}}<br/>
			
@@ -97,28 +100,48 @@
 
				                 {{end}}
			
 
				                 </div>
			
 
				             </td>
			
 
				-            <td>{{$probeInfo.Interval}}</td>
			
 
				-            <td data-sort="{{$probeInfo.TimeSinceLast.Milliseconds}}">
			
 
				-                {{if $probeInfo.TimeSinceLast}}
			
 
				-                    {{$probeInfo.TimeSinceLast.String}} ago<br/>
			
 
				+            <td>
			
 
				+                {{if $probeInfo.Continuous}}
			
 
				+                    Continuous
			
 
				+                {{else}}
			
 
				+                    {{$probeInfo.Interval}}
			
 
				+                {{end}}
			
 
				+            </td>
			
 
				+            <td data-sort="{{$probeInfo.TimeSinceLastEnd.Milliseconds}}">
			
 
				+                {{if $probeInfo.TimeSinceLastEnd}}
			
 
				+                    {{$probeInfo.TimeSinceLastEnd.String}} ago<br/>
			
 
				                     <span class="small">{{$probeInfo.End.Format "2006-01-02T15:04:05Z07:00"}}</span>
			
 
				                 {{else}}
			
 
				                     Never
			
 
				                 {{end}}
			
 
				             </td>
			
 
				+            <td data-sort="{{$probeInfo.TimeSinceLastStart.Milliseconds}}">
			
 
				+                {{if $probeInfo.TimeSinceLastStart}}
			
 
				+                    {{$probeInfo.TimeSinceLastStart.String}} ago<br/>
			
 
				+                    <span class="small">{{$probeInfo.Start.Format "2006-01-02T15:04:05Z07:00"}}</span>
			
 
				+                {{else}}
			
 
				+                    Never
			
 
				+                {{end}}
			
 
				+            </td>
			
 
				             <td>
			
 
				-                {{if $probeInfo.Result}}
			
 
				-                    {{$probeInfo.Result}}
			
 
				+                {{if $probeInfo.Error}}
			
 
				+                    <span class="error">{{$probeInfo.Status}}</span>
			
 
				                 {{else}}
			
 
				-                    <span class="error">{{$probeInfo.Result}}</span>
			
 
				+                    {{$probeInfo.Status}}
			
 
				                 {{end}}<br/>
			
 
				-                <div class="small">Recent: {{$probeInfo.RecentResults}}</div>
			
 
				-                <div class="small">Mean: {{$probeInfo.RecentSuccessRatio}}</div>
			
 
				+                {{if not $probeInfo.Continuous}}
			
 
				+                    <div class="small">Recent: {{$probeInfo.RecentResults}}</div>
			
 
				+                    <div class="small">Mean: {{$probeInfo.RecentSuccessRatio}}</div>
			
 
				+                {{end}}
			
 
				             </td>
			
 
				             <td data-sort="{{$probeInfo.Latency.Milliseconds}}">
			
 
				-                {{$probeInfo.Latency.String}}
			
 
				-                <div class="small">Recent: {{$probeInfo.RecentLatencies}}</div>
			
 
				-                <div class="small">Median: {{$probeInfo.RecentMedianLatency}}</div>
			
 
				+                {{if $probeInfo.Continuous}}
			
 
				+                    n/a
			
 
				+                {{else}}
			
 
				+                    {{$probeInfo.Latency.String}}
			
 
				+                    <div class="small">Recent: {{$probeInfo.RecentLatencies}}</div>
			
 
				+                    <div class="small">Median: {{$probeInfo.RecentMedianLatency}}</div>
			
 
				+                {{end}}
			
 
				             </td>
			
 
				             <td class="small">{{$probeInfo.Error}}</td>
			
 
				         </tr>