ソースを参照

cmd/derpprobe,prober: add ability to perform continuous queuing delay measurements against DERP servers

This new type of probe sends DERP packets sized similarly to CallMeMaybe packets
at a rate of 10 packets per second. It records the round-trip times in a Prometheus
histogram. It also keeps track of how many packets are dropped. Packets that fail to
arrive within 5 seconds are considered dropped.

Updates tailscale/corp#24522

Signed-off-by: Percy Wegmann <[email protected]>
Percy Wegmann 1 年間 前
コミット
00a4504cf1
8 ファイル変更428 行追加54 行削除
  1. 17 14
      cmd/derpprobe/derpprobe.go
  2. 219 3
      prober/derp.go
  3. 50 0
      prober/histogram.go
  4. 29 0
      prober/histogram_test.go
  5. 56 8
      prober/prober.go
  6. 9 8
      prober/prober_test.go
  7. 8 4
      prober/status.go
  8. 40 17
      prober/status.html

+ 17 - 14
cmd/derpprobe/derpprobe.go

@@ -18,19 +18,21 @@ import (
 )
 
 var (
-	derpMapURL       = flag.String("derp-map", "https://login.tailscale.com/derpmap/default", "URL to DERP map (https:// or file://) or 'local' to use the local tailscaled's DERP map")
-	versionFlag      = flag.Bool("version", false, "print version and exit")
-	listen           = flag.String("listen", ":8030", "HTTP listen address")
-	probeOnce        = flag.Bool("once", false, "probe once and print results, then exit; ignores the listen flag")
-	spread           = flag.Bool("spread", true, "whether to spread probing over time")
-	interval         = flag.Duration("interval", 15*time.Second, "probe interval")
-	meshInterval     = flag.Duration("mesh-interval", 15*time.Second, "mesh probe interval")
-	stunInterval     = flag.Duration("stun-interval", 15*time.Second, "STUN probe interval")
-	tlsInterval      = flag.Duration("tls-interval", 15*time.Second, "TLS probe interval")
-	bwInterval       = flag.Duration("bw-interval", 0, "bandwidth probe interval (0 = no bandwidth probing)")
-	bwSize           = flag.Int64("bw-probe-size-bytes", 1_000_000, "bandwidth probe size")
-	bwTUNIPv4Address = flag.String("bw-tun-ipv4-addr", "", "if specified, bandwidth probes will be performed over a TUN device at this address in order to exercise TCP-in-TCP in similar fashion to TCP over Tailscale via DERP. We will use a /30 subnet including this IP address.")
-	regionCode       = flag.String("region-code", "", "probe only this region (e.g. 'lax'); if left blank, all regions will be probed")
+	derpMapURL         = flag.String("derp-map", "https://login.tailscale.com/derpmap/default", "URL to DERP map (https:// or file://) or 'local' to use the local tailscaled's DERP map")
+	versionFlag        = flag.Bool("version", false, "print version and exit")
+	listen             = flag.String("listen", ":8030", "HTTP listen address")
+	probeOnce          = flag.Bool("once", false, "probe once and print results, then exit; ignores the listen flag")
+	spread             = flag.Bool("spread", true, "whether to spread probing over time")
+	interval           = flag.Duration("interval", 15*time.Second, "probe interval")
+	meshInterval       = flag.Duration("mesh-interval", 15*time.Second, "mesh probe interval")
+	stunInterval       = flag.Duration("stun-interval", 15*time.Second, "STUN probe interval")
+	tlsInterval        = flag.Duration("tls-interval", 15*time.Second, "TLS probe interval")
+	bwInterval         = flag.Duration("bw-interval", 0, "bandwidth probe interval (0 = no bandwidth probing)")
+	bwSize             = flag.Int64("bw-probe-size-bytes", 1_000_000, "bandwidth probe size")
+	bwTUNIPv4Address   = flag.String("bw-tun-ipv4-addr", "", "if specified, bandwidth probes will be performed over a TUN device at this address in order to exercise TCP-in-TCP in similar fashion to TCP over Tailscale via DERP; we will use a /30 subnet including this IP address")
+	qdPacketsPerSecond = flag.Int("qd-packets-per-second", 0, "if greater than 0, queuing delay will be measured continuously using 260 byte packets (approximate size of a CallMeMaybe packet) sent at this rate per second")
+	qdPacketTimeout    = flag.Duration("qd-packet-timeout", 5*time.Second, "queuing delay packets arriving after this period of time from being sent are treated like dropped packets and don't count toward queuing delay timings")
+	regionCode         = flag.String("region-code", "", "probe only this region (e.g. 'lax'); if left blank, all regions will be probed")
 )
 
 func main() {
@@ -45,6 +47,7 @@ func main() {
 		prober.WithMeshProbing(*meshInterval),
 		prober.WithSTUNProbing(*stunInterval),
 		prober.WithTLSProbing(*tlsInterval),
+		prober.WithQueuingDelayProbing(*qdPacketsPerSecond, *qdPacketTimeout),
 	}
 	if *bwInterval > 0 {
 		opts = append(opts, prober.WithBandwidthProbing(*bwInterval, *bwSize, *bwTUNIPv4Address))
@@ -107,7 +110,7 @@ func getOverallStatus(p *prober.Prober) (o overallStatus) {
 			// Do not show probes that have not finished yet.
 			continue
 		}
-		if i.Result {
+		if i.Status == prober.ProbeStatusSucceeded {
 			o.addGoodf("%s: %s", p, i.Latency)
 		} else {
 			o.addBadf("%s: %s", p, i.Error)

+ 219 - 3
prober/derp.go

@@ -8,6 +8,7 @@ import (
 	"cmp"
 	"context"
 	crand "crypto/rand"
+	"encoding/binary"
 	"encoding/json"
 	"errors"
 	"expvar"
@@ -17,6 +18,7 @@ import (
 	"net"
 	"net/http"
 	"net/netip"
+	"slices"
 	"strconv"
 	"strings"
 	"sync"
@@ -53,6 +55,10 @@ type derpProber struct {
 	bwProbeSize     int64
 	bwTUNIPv4Prefix *netip.Prefix // or nil to not use TUN
 
+	// Optional queuing delay probing.
+	qdPacketsPerSecond int // in packets per second
+	qdPacketTimeout    time.Duration
+
 	// Optionally restrict probes to a single regionCode.
 	regionCode string
 
@@ -64,6 +70,7 @@ type derpProber struct {
 	udpProbeFn  func(string, int) ProbeClass
 	meshProbeFn func(string, string) ProbeClass
 	bwProbeFn   func(string, string, int64) ProbeClass
+	qdProbeFn   func(string, string, int, time.Duration) ProbeClass
 
 	sync.Mutex
 	lastDERPMap   *tailcfg.DERPMap
@@ -93,6 +100,16 @@ func WithBandwidthProbing(interval time.Duration, size int64, tunAddress string)
 	}
 }
 
+// WithQueuingDelayProbing enables/disables queuing delay probing. qdPacketsPerSecond
+// is the number of packets sent per second. qdPacketTimeout is the amount of time
+// after which a sent packet is considered to have timed out.
+func WithQueuingDelayProbing(qdPacketsPerSecond int, qdPacketTimeout time.Duration) DERPOpt {
+	return func(d *derpProber) {
+		d.qdPacketsPerSecond = qdPacketsPerSecond
+		d.qdPacketTimeout = qdPacketTimeout
+	}
+}
+
 // WithMeshProbing enables mesh probing. When enabled, a small message will be
 // transferred through each DERP server and each pair of DERP servers.
 func WithMeshProbing(interval time.Duration) DERPOpt {
@@ -147,6 +164,7 @@ func DERP(p *Prober, derpMapURL string, opts ...DERPOpt) (*derpProber, error) {
 	d.udpProbeFn = d.ProbeUDP
 	d.meshProbeFn = d.probeMesh
 	d.bwProbeFn = d.probeBandwidth
+	d.qdProbeFn = d.probeQueuingDelay
 	return d, nil
 }
 
@@ -213,7 +231,7 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
 					}
 				}
 
-				if d.bwInterval > 0 && d.bwProbeSize > 0 {
+				if d.bwInterval != 0 && d.bwProbeSize > 0 {
 					n := fmt.Sprintf("derp/%s/%s/%s/bw", region.RegionCode, server.Name, to.Name)
 					wantProbes[n] = true
 					if d.probes[n] == nil {
@@ -225,6 +243,15 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
 						d.probes[n] = d.p.Run(n, d.bwInterval, labels, d.bwProbeFn(server.Name, to.Name, d.bwProbeSize))
 					}
 				}
+
+				if d.qdPacketsPerSecond > 0 {
+					n := fmt.Sprintf("derp/%s/%s/%s/qd", region.RegionCode, server.Name, to.Name)
+					wantProbes[n] = true
+					if d.probes[n] == nil {
+						log.Printf("adding DERP queuing delay probe for %s->%s (%s)", server.Name, to.Name, region.RegionName)
+						d.probes[n] = d.p.Run(n, -10*time.Second, labels, d.qdProbeFn(server.Name, to.Name, d.qdPacketsPerSecond, d.qdPacketTimeout))
+					}
+				}
 			}
 		}
 	}
@@ -240,7 +267,7 @@ func (d *derpProber) probeMapFn(ctx context.Context) error {
 	return nil
 }
 
-// probeMesh returs a probe class that sends a test packet through a pair of DERP
+// probeMesh returns a probe class that sends a test packet through a pair of DERP
 // servers (or just one server, if 'from' and 'to' are the same). 'from' and 'to'
 // are expected to be names (DERPNode.Name) of two DERP servers in the same region.
 func (d *derpProber) probeMesh(from, to string) ProbeClass {
@@ -263,7 +290,7 @@ func (d *derpProber) probeMesh(from, to string) ProbeClass {
 	}
 }
 
-// probeBandwidth returs a probe class that sends a payload of a given size
+// probeBandwidth returns a probe class that sends a payload of a given size
 // through a pair of DERP servers (or just one server, if 'from' and 'to' are
 // the same). 'from' and 'to' are expected to be names (DERPNode.Name) of two
 // DERP servers in the same region.
@@ -295,6 +322,193 @@ func (d *derpProber) probeBandwidth(from, to string, size int64) ProbeClass {
 	}
 }
 
+// probeQueuingDelay returns a probe class that continuously sends packets
+// through a pair of DERP servers (or just one server, if 'from' and 'to' are
+// the same) at a rate of `packetsPerSecond` packets per second in order to
+// measure queuing delays. Packets arriving after `packetTimeout` don't contribute
+// to the queuing delay measurement and are recorded as dropped. 'from' and 'to' are
+// expected to be names (DERPNode.Name) of two DERP servers in the same region,
+// and may refer to the same server.
+//
+// The returned class exports two custom metrics: a counter of dropped packets
+// and a histogram of the observed delays in seconds.
+func (d *derpProber) probeQueuingDelay(from, to string, packetsPerSecond int, packetTimeout time.Duration) ProbeClass {
+	derpPath := "mesh"
+	if from == to {
+		derpPath = "single"
+	}
+	var packetsDropped expvar.Float
+	// Bucket upper bounds are in seconds, matching the rtt.Seconds() values
+	// that the probe feeds into the histogram.
+	qdh := newHistogram([]float64{.005, .01, .025, .05, .1, .25, .5, 1})
+	return ProbeClass{
+		Probe: func(ctx context.Context) error {
+			fromN, toN, err := d.getNodePair(from, to)
+			if err != nil {
+				return err
+			}
+			return derpProbeQueuingDelay(ctx, d.lastDERPMap, fromN, toN, packetsPerSecond, packetTimeout, &packetsDropped, qdh)
+		},
+		Class:  "derp_qd",
+		Labels: Labels{"derp_path": derpPath},
+		Metrics: func(l prometheus.Labels) []prometheus.Metric {
+			// Take a private snapshot while holding the lock. The const
+			// histogram keeps a reference to the map it is given and reads it
+			// later, when the metric is written out, so handing it the live
+			// bucketedCounts map would race with concurrent calls to add.
+			//
+			// The histogram type stores per-bucket counts, whereas Prometheus
+			// expects each bucket to hold the cumulative count of observations
+			// less than or equal to its upper bound, so accumulate while
+			// copying. Observations above the largest bound are only part of
+			// count, which Prometheus reports as the +Inf bucket.
+			qdh.mx.Lock()
+			count, sum := qdh.count, qdh.sum
+			cumulative := make(map[float64]uint64, len(qdh.buckets))
+			var running uint64
+			for _, upperBound := range qdh.buckets {
+				running += qdh.bucketedCounts[upperBound]
+				cumulative[upperBound] = running
+			}
+			qdh.mx.Unlock()
+
+			return []prometheus.Metric{
+				prometheus.MustNewConstMetric(prometheus.NewDesc("derp_qd_probe_dropped_packets", "Total packets dropped", nil, l), prometheus.CounterValue, float64(packetsDropped.Value())),
+				prometheus.MustNewConstHistogram(prometheus.NewDesc("derp_qd_probe_delays_seconds", "Distribution of queuing delays", nil, l), count, sum, cumulative),
+			}
+		},
+	}
+}
+
+// derpProbeQueuingDelay continuously sends data between two local DERP clients
+// connected to two DERP servers in order to measure queuing delays. From and to
+// can be the same server.
+//
+// It opens one client connection per node, runs the continuous send/receive
+// loop, and closes both connections before returning. packetsPerSecond,
+// packetTimeout, packetsDropped and qdh are passed through unchanged to
+// runDerpProbeQueuingDelayContinously. Any error from that loop is wrapped
+// with the short public keys of both clients.
+func derpProbeQueuingDelay(ctx context.Context, dm *tailcfg.DERPMap, from, to *tailcfg.DERPNode, packetsPerSecond int, packetTimeout time.Duration, packetsDropped *expvar.Float, qdh *histogram) (err error) {
+	// This probe uses clients with isProber=false to avoid spamming the derper
+	// logs with every packet sent by the queuing delay probe.
+	fromc, err := newConn(ctx, dm, from, false)
+	if err != nil {
+		return err
+	}
+	defer fromc.Close()
+	toc, err := newConn(ctx, dm, to, false)
+	if err != nil {
+		return err
+	}
+	defer toc.Close()
+
+	// Wait a bit for from's node to hear about to existing on the
+	// other node in the region, in the case where the two nodes
+	// are different.
+	// NOTE(review): this sleep does not observe ctx cancellation.
+	if from.Name != to.Name {
+		time.Sleep(100 * time.Millisecond) // pretty arbitrary
+	}
+
+	if err := runDerpProbeQueuingDelayContinously(ctx, from, to, fromc, toc, packetsPerSecond, packetTimeout, packetsDropped, qdh); err != nil {
+		// Record pubkeys on failed probes to aid investigation.
+		return fmt.Errorf("%s -> %s: %w",
+			fromc.SelfPublicKey().ShortString(),
+			toc.SelfPublicKey().ShortString(), err)
+	}
+	return nil
+}
+
+// runDerpProbeQueuingDelayContinously sends sequence-numbered packets from
+// fromc to toc at packetsPerSecond packets per second and records the time
+// between sending and receiving each packet in qdh (in seconds). A packet
+// whose transmission record has been evicted before it arrives is counted in
+// packetsDropped and does not contribute to qdh.
+//
+// It blocks until ctx is done, sending fails, or receiving fails or reports an
+// unexpected packet, and always returns a non-nil error in those cases. Both
+// clients are closed and both goroutines have exited by the time it returns.
+// from and to are only used to label errors and log lines.
+func runDerpProbeQueuingDelayContinously(ctx context.Context, from, to *tailcfg.DERPNode, fromc, toc *derphttp.Client, packetsPerSecond int, packetTimeout time.Duration, packetsDropped *expvar.Float, qdh *histogram) error {
+	// A non-positive rate would make the ticker interval below divide by zero.
+	if packetsPerSecond <= 0 {
+		return fmt.Errorf("invalid packets per second %d: must be greater than 0", packetsPerSecond)
+	}
+
+	// Make sure all goroutines have finished.
+	var wg sync.WaitGroup
+	defer wg.Wait()
+
+	// Close the clients to make sure goroutines that are reading/writing from them terminate.
+	defer fromc.Close()
+	defer toc.Close()
+
+	type txRecord struct {
+		at  time.Time
+		seq uint64
+	}
+	// txRecords is sized to hold enough transmission records to keep timings
+	// for packets up to their timeout. As records age out of the front of this
+	// list, if the associated packet arrives, we won't have a txRecord for it
+	// and will consider it to have timed out.
+	//
+	// The capacity is computed on the full duration rather than on whole
+	// seconds so that sub-second timeouts are not truncated to zero, and it is
+	// kept at 1 or more because the sender evicts the first record whenever
+	// the list is full, which requires at least one record to exist.
+	txCap := int(time.Duration(packetsPerSecond) * packetTimeout / time.Second)
+	if txCap < 1 {
+		txCap = 1
+	}
+	txRecords := make([]txRecord, 0, txCap)
+	var txRecordsMu sync.Mutex
+
+	// Send the packets.
+	sendErrC := make(chan error, 1)
+	// TODO: construct a disco CallMeMaybe in the same fashion as magicsock, e.g. magic bytes, src pub, seal payload.
+	// DERP server handling of disco may vary from non-disco, and we may want to measure queue delay of both.
+	pkt := make([]byte, 260) // the same size as a CallMeMaybe packet observed on a Tailscale client.
+	crand.Read(pkt)
+
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		t := time.NewTicker(time.Second / time.Duration(packetsPerSecond))
+		defer t.Stop()
+
+		seq := uint64(0)
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-t.C:
+				txRecordsMu.Lock()
+				if len(txRecords) == cap(txRecords) {
+					// The oldest outstanding packet has not arrived in time:
+					// forget it and count it as dropped.
+					txRecords = slices.Delete(txRecords, 0, 1)
+					packetsDropped.Add(1)
+				}
+				txRecords = append(txRecords, txRecord{time.Now(), seq})
+				txRecordsMu.Unlock()
+				// The first 8 bytes of the payload carry the sequence number.
+				binary.BigEndian.PutUint64(pkt, seq)
+				seq++
+				if err := fromc.Send(toc.SelfPublicKey(), pkt); err != nil {
+					sendErrC <- fmt.Errorf("sending packet: %w", err)
+					return
+				}
+			}
+		}
+	}()
+
+	// Receive the packets.
+	recvFinishedC := make(chan error, 1)
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		defer close(recvFinishedC) // to break out of 'select' below.
+		for {
+			m, err := toc.Recv()
+			if err != nil {
+				recvFinishedC <- err
+				return
+			}
+			switch v := m.(type) {
+			case derp.ReceivedPacket:
+				now := time.Now()
+				if v.Source != fromc.SelfPublicKey() {
+					recvFinishedC <- fmt.Errorf("got data packet from unexpected source, %v", v.Source)
+					return
+				}
+				// Decoding the sequence number needs 8 bytes; a shorter
+				// payload would otherwise panic.
+				if len(v.Data) < 8 {
+					recvFinishedC <- fmt.Errorf("got short data packet (%d bytes) from %v", len(v.Data), v.Source)
+					return
+				}
+				seq := binary.BigEndian.Uint64(v.Data)
+				txRecordsMu.Lock()
+			findTxRecord:
+				for i, record := range txRecords {
+					switch {
+					case record.seq == seq:
+						rtt := now.Sub(record.at)
+						qdh.add(rtt.Seconds())
+						txRecords = slices.Delete(txRecords, i, i+1)
+						break findTxRecord
+					case record.seq > seq:
+						// No sent time found, probably a late arrival already
+						// recorded as drop by sender when deleted.
+						break findTxRecord
+					case record.seq < seq:
+						continue
+					}
+				}
+				txRecordsMu.Unlock()
+
+			case derp.KeepAliveMessage:
+				// Silently ignore.
+
+			default:
+				log.Printf("%v: ignoring Recv frame type %T", to.Name, v)
+				// Loop.
+			}
+		}
+	}()
+
+	select {
+	case <-ctx.Done():
+		return fmt.Errorf("timeout: %w", ctx.Err())
+	case err := <-sendErrC:
+		return fmt.Errorf("error sending via %q: %w", from.Name, err)
+	case err := <-recvFinishedC:
+		if err != nil {
+			return fmt.Errorf("error receiving from %q: %w", to.Name, err)
+		}
+	}
+	return nil
+}
+
 // getNodePair returns DERPNode objects for two DERP servers based on their
 // short names.
 func (d *derpProber) getNodePair(n1, n2 string) (ret1, ret2 *tailcfg.DERPNode, _ error) {
@@ -573,6 +787,8 @@ func runDerpProbeNodePair(ctx context.Context, from, to *tailcfg.DERPNode, fromc
 					recvc <- fmt.Errorf("got data packet %d from unexpected source, %v", idx, v.Source)
 					return
 				}
+				// This assumes that the packets are received reliably and in order.
+				// The DERP protocol does not guarantee this, but this probe assumes it.
 				if got, want := v.Data, pkts[idx]; !bytes.Equal(got, want) {
 					recvc <- fmt.Errorf("unexpected data packet %d (out of %d)", idx, len(pkts))
 					return

+ 50 - 0
prober/histogram.go

@@ -0,0 +1,50 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+package prober
+
+import (
+	"slices"
+	"sync"
+)
+
+// histogram serves as an adapter to the Prometheus histogram datatype.
+// The prober framework passes labels at custom metric collection time that
+// it expects to be coupled with the returned metrics. See ProbeClass.Metrics
+// and its call sites. Native prometheus histograms cannot be collected while
+// injecting more labels. Instead we use this type and pass observations +
+// collection labels to prometheus.MustNewConstHistogram() at prometheus
+// metric collection time.
+type histogram struct {
+	// count is the total number of values passed to add.
+	count          uint64
+	// sum is the sum of all values passed to add.
+	sum            float64
+	// buckets holds the bucket upper bounds, sorted ascending by newHistogram.
+	buckets        []float64
+	// bucketedCounts maps a bucket upper bound to the number of values that
+	// add assigned to that bucket.
+	bucketedCounts map[float64]uint64
+	// mx is held by add while it updates the fields above.
+	mx             sync.Mutex
+}
+
+// newHistogram constructs a histogram that buckets data based on the given
+// slice of upper bounds. The bounds are copied before being sorted, so the
+// caller's slice is neither reordered nor retained.
+func newHistogram(buckets []float64) *histogram {
+	sorted := slices.Clone(buckets)
+	slices.Sort(sorted)
+	return &histogram{
+		buckets:        sorted,
+		bucketedCounts: make(map[float64]uint64, len(sorted)),
+	}
+}
+
+// add records the observation v. It increments count, adds v to sum, and
+// increments the counter of the first bucket in h.buckets whose upper bound is
+// greater than or equal to v. A value larger than every upper bound is
+// reflected only in count and sum.
+func (h *histogram) add(v float64) {
+	h.mx.Lock()
+	defer h.mx.Unlock()
+
+	h.count++
+	h.sum += v
+
+	for _, b := range h.buckets {
+		if v > b {
+			continue
+		}
+		// Only the first matching bucket is incremented, so bucketedCounts
+		// holds per-bucket counts rather than cumulative ones.
+		h.bucketedCounts[b] += 1
+		break
+	}
+}

+ 29 - 0
prober/histogram_test.go

@@ -0,0 +1,29 @@
+// Copyright (c) Tailscale Inc & AUTHORS
+// SPDX-License-Identifier: BSD-3-Clause
+
+package prober
+
+import (
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+)
+
+// TestHistogram checks count, sum and per-bucket counts after adding values
+// below, on, between and above the bucket upper bounds 1 and 2.
+func TestHistogram(t *testing.T) {
+	h := newHistogram([]float64{1, 2})
+	h.add(0.5)
+	h.add(1)
+	h.add(1.5)
+	h.add(2)
+	h.add(2.5)
+
+	if diff := cmp.Diff(h.count, uint64(5)); diff != "" {
+		t.Errorf("wrong count; (-got+want):%v", diff)
+	}
+	if diff := cmp.Diff(h.sum, 7.5); diff != "" {
+		t.Errorf("wrong sum; (-got+want):%v", diff)
+	}
+	// A value equal to an upper bound is counted in that bucket; 2.5 exceeds
+	// every bound and is therefore absent from bucketedCounts.
+	if diff := cmp.Diff(h.bucketedCounts, map[float64]uint64{1: 2, 2: 2}); diff != "" {
+		t.Errorf("wrong bucketedCounts; (-got+want):%v", diff)
+	}
+}

+ 56 - 8
prober/prober.go

@@ -94,6 +94,9 @@ func newForTest(now func() time.Time, newTicker func(time.Duration) ticker) *Pro
 
 // Run executes probe class function every interval, and exports probe results under probeName.
 //
+// If interval is negative, the probe will run continuously. If it encounters a failure while
+// running continuously, it will pause for -1*interval and then retry.
+//
 // Registering a probe under an already-registered name panics.
 func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc ProbeClass) *Probe {
 	p.mu.Lock()
@@ -256,6 +259,11 @@ type Probe struct {
 	latencyHist *ring.Ring
 }
 
+// IsContinuous indicates that this is a continuous probe, which is the case
+// when the probe's interval is negative.
+func (p *Probe) IsContinuous() bool {
+	return p.interval < 0
+}
+
 // Close shuts down the Probe and unregisters it from its Prober.
 // It is safe to Run a new probe of the same name after Close returns.
 func (p *Probe) Close() error {
@@ -288,6 +296,22 @@ func (p *Probe) loop() {
 		return
 	}
 
+	if p.IsContinuous() {
+		// Probe function is going to run continuously.
+		for {
+			p.run()
+			// Wait and then retry if probe fails. We use the inverse of the
+			// configured negative interval as our sleep period.
+			// TODO(percy):implement exponential backoff, possibly using logtail/backoff.
+			select {
+			case <-time.After(-1 * p.interval):
+				p.run()
+			case <-p.ctx.Done():
+				return
+			}
+		}
+	}
+
 	p.tick = p.prober.newTicker(p.interval)
 	defer p.tick.Stop()
 	for {
@@ -323,9 +347,13 @@ func (p *Probe) run() (pi ProbeInfo, err error) {
 			p.recordEnd(err)
 		}
 	}()
-	timeout := time.Duration(float64(p.interval) * 0.8)
-	ctx, cancel := context.WithTimeout(p.ctx, timeout)
-	defer cancel()
+	ctx := p.ctx
+	if !p.IsContinuous() {
+		timeout := time.Duration(float64(p.interval) * 0.8)
+		var cancel func()
+		ctx, cancel = context.WithTimeout(ctx, timeout)
+		defer cancel()
+	}
 
 	err = p.probeClass.Probe(ctx)
 	p.recordEnd(err)
@@ -365,6 +393,16 @@ func (p *Probe) recordEnd(err error) {
 	p.successHist = p.successHist.Next()
 }
 
+// ProbeStatus indicates the status of a probe.
+type ProbeStatus string
+
+// Possible probe status values.
+//
+// NOTE(review): these are declared as untyped string constants rather than
+// as ProbeStatus values; they are still assignable to ProbeStatus fields.
+const (
+	ProbeStatusUnknown   = "unknown"
+	ProbeStatusRunning   = "running"
+	ProbeStatusFailed    = "failed"
+	ProbeStatusSucceeded = "succeeded"
+)
+
 // ProbeInfo is a snapshot of the configuration and state of a Probe.
 type ProbeInfo struct {
 	Name            string
@@ -374,7 +412,7 @@ type ProbeInfo struct {
 	Start           time.Time
 	End             time.Time
 	Latency         time.Duration
-	Result          bool
+	Status          ProbeStatus
 	Error           string
 	RecentResults   []bool
 	RecentLatencies []time.Duration
@@ -402,6 +440,10 @@ func (pb ProbeInfo) RecentMedianLatency() time.Duration {
 	return pb.RecentLatencies[len(pb.RecentLatencies)/2]
 }
 
+// Continuous reports whether the probe described by pb is a continuous probe,
+// i.e. whether its Interval is negative.
+func (pb ProbeInfo) Continuous() bool {
+	return pb.Interval < 0
+}
+
 // ProbeInfo returns the state of all probes.
 func (p *Prober) ProbeInfo() map[string]ProbeInfo {
 	out := map[string]ProbeInfo{}
@@ -429,9 +471,14 @@ func (probe *Probe) probeInfoLocked() ProbeInfo {
 		Labels:   probe.metricLabels,
 		Start:    probe.start,
 		End:      probe.end,
-		Result:   probe.succeeded,
 	}
-	if probe.lastErr != nil {
+	inf.Status = ProbeStatusUnknown
+	if probe.end.Before(probe.start) {
+		inf.Status = ProbeStatusRunning
+	} else if probe.succeeded {
+		inf.Status = ProbeStatusSucceeded
+	} else if probe.lastErr != nil {
+		inf.Status = ProbeStatusFailed
 		inf.Error = probe.lastErr.Error()
 	}
 	if probe.latency > 0 {
@@ -467,7 +514,7 @@ func (p *Prober) RunHandler(w http.ResponseWriter, r *http.Request) error {
 	p.mu.Lock()
 	probe, ok := p.probes[name]
 	p.mu.Unlock()
-	if !ok {
+	if !ok || probe.IsContinuous() {
 		return tsweb.Error(http.StatusNotFound, fmt.Sprintf("unknown probe %q", name), nil)
 	}
 
@@ -531,7 +578,8 @@ func (p *Probe) Collect(ch chan<- prometheus.Metric) {
 	if !p.start.IsZero() {
 		ch <- prometheus.MustNewConstMetric(p.mStartTime, prometheus.GaugeValue, float64(p.start.Unix()))
 	}
-	if p.end.IsZero() {
+	// For periodic probes that haven't ended, don't collect probe metrics yet.
+	if p.end.IsZero() && !p.IsContinuous() {
 		return
 	}
 	ch <- prometheus.MustNewConstMetric(p.mEndTime, prometheus.GaugeValue, float64(p.end.Unix()))

+ 9 - 8
prober/prober_test.go

@@ -316,7 +316,7 @@ func TestProberProbeInfo(t *testing.T) {
 			Interval:        probeInterval,
 			Labels:          map[string]string{"class": "", "name": "probe1"},
 			Latency:         500 * time.Millisecond,
-			Result:          true,
+			Status:          ProbeStatusSucceeded,
 			RecentResults:   []bool{true},
 			RecentLatencies: []time.Duration{500 * time.Millisecond},
 		},
@@ -324,6 +324,7 @@ func TestProberProbeInfo(t *testing.T) {
 			Name:            "probe2",
 			Interval:        probeInterval,
 			Labels:          map[string]string{"class": "", "name": "probe2"},
+			Status:          ProbeStatusFailed,
 			Error:           "error2",
 			RecentResults:   []bool{false},
 			RecentLatencies: nil, // no latency for failed probes
@@ -349,7 +350,7 @@ func TestProbeInfoRecent(t *testing.T) {
 	}{
 		{
 			name:                    "no_runs",
-			wantProbeInfo:           ProbeInfo{},
+			wantProbeInfo:           ProbeInfo{Status: ProbeStatusUnknown},
 			wantRecentSuccessRatio:  0,
 			wantRecentMedianLatency: 0,
 		},
@@ -358,7 +359,7 @@ func TestProbeInfoRecent(t *testing.T) {
 			results: []probeResult{{latency: 100 * time.Millisecond, err: nil}},
 			wantProbeInfo: ProbeInfo{
 				Latency:         100 * time.Millisecond,
-				Result:          true,
+				Status:          ProbeStatusSucceeded,
 				RecentResults:   []bool{true},
 				RecentLatencies: []time.Duration{100 * time.Millisecond},
 			},
@@ -369,7 +370,7 @@ func TestProbeInfoRecent(t *testing.T) {
 			name:    "single_failure",
 			results: []probeResult{{latency: 100 * time.Millisecond, err: errors.New("error123")}},
 			wantProbeInfo: ProbeInfo{
-				Result:          false,
+				Status:          ProbeStatusFailed,
 				RecentResults:   []bool{false},
 				RecentLatencies: nil,
 				Error:           "error123",
@@ -390,7 +391,7 @@ func TestProbeInfoRecent(t *testing.T) {
 				{latency: 80 * time.Millisecond, err: nil},
 			},
 			wantProbeInfo: ProbeInfo{
-				Result:        true,
+				Status:        ProbeStatusSucceeded,
 				Latency:       80 * time.Millisecond,
 				RecentResults: []bool{false, true, true, false, true, true, false, true},
 				RecentLatencies: []time.Duration{
@@ -420,7 +421,7 @@ func TestProbeInfoRecent(t *testing.T) {
 				{latency: 110 * time.Millisecond, err: nil},
 			},
 			wantProbeInfo: ProbeInfo{
-				Result:        true,
+				Status:        ProbeStatusSucceeded,
 				Latency:       110 * time.Millisecond,
 				RecentResults: []bool{true, true, true, true, true, true, true, true, true, true},
 				RecentLatencies: []time.Duration{
@@ -483,7 +484,7 @@ func TestProberRunHandler(t *testing.T) {
 				ProbeInfo: ProbeInfo{
 					Name:          "success",
 					Interval:      probeInterval,
-					Result:        true,
+					Status:        ProbeStatusSucceeded,
 					RecentResults: []bool{true, true},
 				},
 				PreviousSuccessRatio: 1,
@@ -498,7 +499,7 @@ func TestProberRunHandler(t *testing.T) {
 				ProbeInfo: ProbeInfo{
 					Name:          "failure",
 					Interval:      probeInterval,
-					Result:        false,
+					Status:        ProbeStatusFailed,
 					Error:         "error123",
 					RecentResults: []bool{false, false},
 				},

+ 8 - 4
prober/status.go

@@ -62,8 +62,9 @@ func (p *Prober) StatusHandler(opts ...statusHandlerOpt) tsweb.ReturnHandlerFunc
 	return func(w http.ResponseWriter, r *http.Request) error {
 		type probeStatus struct {
 			ProbeInfo
-			TimeSinceLast time.Duration
-			Links         map[string]template.URL
+			TimeSinceLastStart time.Duration
+			TimeSinceLastEnd   time.Duration
+			Links              map[string]template.URL
 		}
 		vars := struct {
 			Title           string
@@ -81,12 +82,15 @@ func (p *Prober) StatusHandler(opts ...statusHandlerOpt) tsweb.ReturnHandlerFunc
 
 		for name, info := range p.ProbeInfo() {
 			vars.TotalProbes++
-			if !info.Result {
+			if info.Error != "" {
 				vars.UnhealthyProbes++
 			}
 			s := probeStatus{ProbeInfo: info}
+			if !info.Start.IsZero() {
+				s.TimeSinceLastStart = time.Since(info.Start).Truncate(time.Second)
+			}
 			if !info.End.IsZero() {
-				s.TimeSinceLast = time.Since(info.End).Truncate(time.Second)
+				s.TimeSinceLastEnd = time.Since(info.End).Truncate(time.Second)
 			}
 			for textTpl, urlTpl := range params.probeLinks {
 				text, err := renderTemplate(textTpl, info)

+ 40 - 17
prober/status.html

@@ -73,8 +73,9 @@
             <th>Name</th>
             <th>Probe Class & Labels</th>
             <th>Interval</th>
-            <th>Last Attempt</th>
-            <th>Success</th>
+            <th>Last Finished</th>
+            <th>Last Started</th>
+            <th>Status</th>
             <th>Latency</th>
             <th>Last Error</th>
         </tr></thead>
@@ -85,9 +86,11 @@
                 {{$name}}
                 {{range $text, $url := $probeInfo.Links}}
                 <br/>
-                <button onclick="location.href='{{$url}}';" type="button">
-                    {{$text}}
-                </button>
+                {{if not $probeInfo.Continuous}}
+                    <button onclick="location.href='{{$url}}';" type="button">
+                        {{$text}}
+                    </button>
+                {{end}}
                 {{end}}
             </td>
             <td>{{$probeInfo.Class}}<br/>
@@ -97,28 +100,48 @@
                 {{end}}
                 </div>
             </td>
-            <td>{{$probeInfo.Interval}}</td>
-            <td data-sort="{{$probeInfo.TimeSinceLast.Milliseconds}}">
-                {{if $probeInfo.TimeSinceLast}}
-                    {{$probeInfo.TimeSinceLast.String}} ago<br/>
+            <td>
+                {{if $probeInfo.Continuous}}
+                    Continuous
+                {{else}}
+                    {{$probeInfo.Interval}}
+                {{end}}
+            </td>
+            <td data-sort="{{$probeInfo.TimeSinceLastEnd.Milliseconds}}">
+                {{if $probeInfo.TimeSinceLastEnd}}
+                    {{$probeInfo.TimeSinceLastEnd.String}} ago<br/>
                     <span class="small">{{$probeInfo.End.Format "2006-01-02T15:04:05Z07:00"}}</span>
                 {{else}}
                     Never
                 {{end}}
             </td>
+            <td data-sort="{{$probeInfo.TimeSinceLastStart.Milliseconds}}">
+                {{if $probeInfo.TimeSinceLastStart}}
+                    {{$probeInfo.TimeSinceLastStart.String}} ago<br/>
+                    <span class="small">{{$probeInfo.Start.Format "2006-01-02T15:04:05Z07:00"}}</span>
+                {{else}}
+                    Never
+                {{end}}
+            </td>
             <td>
-                {{if $probeInfo.Result}}
-                    {{$probeInfo.Result}}
+                {{if $probeInfo.Error}}
+                    <span class="error">{{$probeInfo.Status}}</span>
                 {{else}}
-                    <span class="error">{{$probeInfo.Result}}</span>
+                    {{$probeInfo.Status}}
                 {{end}}<br/>
-                <div class="small">Recent: {{$probeInfo.RecentResults}}</div>
-                <div class="small">Mean: {{$probeInfo.RecentSuccessRatio}}</div>
+                {{if not $probeInfo.Continuous}}
+                    <div class="small">Recent: {{$probeInfo.RecentResults}}</div>
+                    <div class="small">Mean: {{$probeInfo.RecentSuccessRatio}}</div>
+                {{end}}
             </td>
             <td data-sort="{{$probeInfo.Latency.Milliseconds}}">
-                {{$probeInfo.Latency.String}}
-                <div class="small">Recent: {{$probeInfo.RecentLatencies}}</div>
-                <div class="small">Median: {{$probeInfo.RecentMedianLatency}}</div>
+                {{if $probeInfo.Continuous}}
+                    n/a
+                {{else}}
+                    {{$probeInfo.Latency.String}}
+                    <div class="small">Recent: {{$probeInfo.RecentLatencies}}</div>
+                    <div class="small">Median: {{$probeInfo.RecentMedianLatency}}</div>
+                {{end}}
             </td>
             <td class="small">{{$probeInfo.Error}}</td>
         </tr>