Browse Source

prober: export probe class and metrics from bandwidth prober

- Wrap each prober function into a probe class that allows associating
  metric labels and custom metrics with a given probe;
- Make sure all existing probe classes set a `class` metric label;
- Move bandwidth probe size from being a metric label to a separate
  gauge metric; this will make it possible to use it to calculate
  average used bandwidth using a PromQL query;
- Also export transfer time for the bandwidth prober (more accurate than
  the total probe time, since it excludes connection establishment
  time).

Updates tailscale/corp#17912

Signed-off-by: Anton Tolchanov <[email protected]>
Anton Tolchanov 1 year ago
parent
commit
5336362e64
10 changed files with 215 additions and 116 deletions
  1. 68 38
      prober/derp.go
  2. 9 9
      prober/derp_test.go
  3. 6 2
      prober/dns.go
  4. 8 6
      prober/dns_example_test.go
  5. 2 2
      prober/dns_test.go
  6. 8 5
      prober/http.go
  7. 69 18
      prober/prober.go
  8. 23 23
      prober/prober_test.go
  9. 6 3
      prober/tcp.go
  10. 16 10
      prober/tls.go

+ 68 - 38
prober/derp.go

@@ -10,9 +10,9 @@ import (
 	crand "crypto/rand"
 	"encoding/json"
 	"errors"
+	"expvar"
 	"fmt"
 	"log"
-	"maps"
 	"net"
 	"net/http"
 	"strconv"
@@ -20,6 +20,7 @@ import (
 	"sync"
 	"time"
 
+	"github.com/prometheus/client_golang/prometheus"
 	"tailscale.com/derp"
 	"tailscale.com/derp/derphttp"
 	"tailscale.com/net/stun"
@@ -42,11 +43,14 @@ type derpProber struct {
 	bwInterval  time.Duration
 	bwProbeSize int64
 
-	// Probe functions that can be overridden for testing.
-	tlsProbeFn  func(string) ProbeFunc
-	udpProbeFn  func(string, int) ProbeFunc
-	meshProbeFn func(string, string) ProbeFunc
-	bwProbeFn   func(string, string, int64) ProbeFunc
+	// Probe class for fetching & updating the DERP map.
+	ProbeMap ProbeClass
+
+	// Probe classes for probing individual derpers.
+	tlsProbeFn  func(string) ProbeClass
+	udpProbeFn  func(string, int) ProbeClass
+	meshProbeFn func(string, string) ProbeClass
+	bwProbeFn   func(string, string, int64) ProbeClass
 
 	sync.Mutex
 	lastDERPMap   *tailcfg.DERPMap
@@ -100,6 +104,10 @@ func DERP(p *Prober, derpMapURL string, opts ...DERPOpt) (*derpProber, error) {
 		nodes:      make(map[string]*tailcfg.DERPNode),
 		probes:     make(map[string]*Probe),
 	}
+	d.ProbeMap = ProbeClass{
+		Probe: d.probeMapFn,
+		Class: "derp_map",
+	}
 	for _, o := range opts {
 		o(d)
 	}
@@ -109,10 +117,10 @@ func DERP(p *Prober, derpMapURL string, opts ...DERPOpt) (*derpProber, error) {
 	return d, nil
 }
 
-// ProbeMap fetches the DERPMap and creates/destroys probes for each
+// probeMapFn fetches the DERPMap and creates/destroys probes for each
 // DERP server as necessary. It should get regularly executed as a
 // probe function itself.
-func (d *derpProber) ProbeMap(ctx context.Context) error {
+func (d *derpProber) probeMapFn(ctx context.Context) error {
 	if err := d.updateMap(ctx); err != nil {
 		return err
 	}
@@ -123,7 +131,7 @@ func (d *derpProber) ProbeMap(ctx context.Context) error {
 
 	for _, region := range d.lastDERPMap.Regions {
 		for _, server := range region.Nodes {
-			labels := map[string]string{
+			labels := Labels{
 				"region":    region.RegionCode,
 				"region_id": strconv.Itoa(region.RegionID),
 				"hostname":  server.HostName,
@@ -169,18 +177,11 @@ func (d *derpProber) ProbeMap(ctx context.Context) error {
 				}
 
 				if d.bwInterval > 0 && d.bwProbeSize > 0 {
-					bwLabels := maps.Clone(labels)
-					bwLabels["probe_size_bytes"] = fmt.Sprintf("%d", d.bwProbeSize)
-					if server.Name == to.Name {
-						bwLabels["derp_path"] = "single"
-					} else {
-						bwLabels["derp_path"] = "mesh"
-					}
 					n := fmt.Sprintf("derp/%s/%s/%s/bw", region.RegionCode, server.Name, to.Name)
 					wantProbes[n] = true
 					if d.probes[n] == nil {
 						log.Printf("adding DERP bandwidth probe for %s->%s (%s) %v bytes every %v", server.Name, to.Name, region.RegionName, d.bwProbeSize, d.bwInterval)
-						d.probes[n] = d.p.Run(n, d.bwInterval, bwLabels, d.bwProbeFn(server.Name, to.Name, d.bwProbeSize))
+						d.probes[n] = d.p.Run(n, d.bwInterval, labels, d.bwProbeFn(server.Name, to.Name, d.bwProbeSize))
 					}
 				}
 			}
@@ -198,32 +199,55 @@ func (d *derpProber) ProbeMap(ctx context.Context) error {
 	return nil
 }
 
-// probeMesh returs a probe func that sends a test packet through a pair of DERP
+// probeMesh returns a probe class that sends a test packet through a pair of DERP
 // servers (or just one server, if 'from' and 'to' are the same). 'from' and 'to'
 // are expected to be names (DERPNode.Name) of two DERP servers in the same region.
-func (d *derpProber) probeMesh(from, to string) ProbeFunc {
-	return func(ctx context.Context) error {
-		fromN, toN, err := d.getNodePair(from, to)
-		if err != nil {
-			return err
-		}
+func (d *derpProber) probeMesh(from, to string) ProbeClass {
+	derpPath := "mesh"
+	if from == to {
+		derpPath = "single"
+	}
+	return ProbeClass{
+		Probe: func(ctx context.Context) error {
+			fromN, toN, err := d.getNodePair(from, to)
+			if err != nil {
+				return err
+			}
 
-		dm := d.lastDERPMap
-		return derpProbeNodePair(ctx, dm, fromN, toN)
+			dm := d.lastDERPMap
+			return derpProbeNodePair(ctx, dm, fromN, toN)
+		},
+		Class:  "derp_mesh",
+		Labels: Labels{"derp_path": derpPath},
 	}
 }
 
-// probeBandwidth returs a probe func that sends a payload of a given size
+// probeBandwidth returns a probe class that sends a payload of a given size
 // through a pair of DERP servers (or just one server, if 'from' and 'to' are
 // the same). 'from' and 'to' are expected to be names (DERPNode.Name) of two
 // DERP servers in the same region.
-func (d *derpProber) probeBandwidth(from, to string, size int64) ProbeFunc {
-	return func(ctx context.Context) error {
-		fromN, toN, err := d.getNodePair(from, to)
-		if err != nil {
-			return err
-		}
-		return derpProbeBandwidth(ctx, d.lastDERPMap, fromN, toN, size)
+func (d *derpProber) probeBandwidth(from, to string, size int64) ProbeClass {
+	derpPath := "mesh"
+	if from == to {
+		derpPath = "single"
+	}
+	var transferTime expvar.Float
+	return ProbeClass{
+		Probe: func(ctx context.Context) error {
+			fromN, toN, err := d.getNodePair(from, to)
+			if err != nil {
+				return err
+			}
+			return derpProbeBandwidth(ctx, d.lastDERPMap, fromN, toN, size, &transferTime)
+		},
+		Class:  "derp_bw",
+		Labels: Labels{"derp_path": derpPath},
+		Metrics: func(l prometheus.Labels) []prometheus.Metric {
+			return []prometheus.Metric{
+				prometheus.MustNewConstMetric(prometheus.NewDesc("derp_bw_probe_size_bytes", "Payload size of the bandwidth prober", nil, l), prometheus.GaugeValue, float64(size)),
+				prometheus.MustNewConstMetric(prometheus.NewDesc("derp_bw_transfer_time_seconds_total", "Time it took to transfer data", nil, l), prometheus.CounterValue, transferTime.Value()),
+			}
+		},
 	}
 }
 
@@ -289,9 +313,12 @@ func (d *derpProber) updateMap(ctx context.Context) error {
 	return nil
 }
 
-func (d *derpProber) ProbeUDP(ipaddr string, port int) ProbeFunc {
-	return func(ctx context.Context) error {
-		return derpProbeUDP(ctx, ipaddr, port)
+func (d *derpProber) ProbeUDP(ipaddr string, port int) ProbeClass {
+	return ProbeClass{
+		Probe: func(ctx context.Context) error {
+			return derpProbeUDP(ctx, ipaddr, port)
+		},
+		Class: "derp_udp",
 	}
 }
 
@@ -347,7 +374,7 @@ func derpProbeUDP(ctx context.Context, ipStr string, port int) error {
 
 // derpProbeBandwidth sends a payload of a given size between two local
 // DERP clients connected to two DERP servers.
-func derpProbeBandwidth(ctx context.Context, dm *tailcfg.DERPMap, from, to *tailcfg.DERPNode, size int64) (err error) {
+func derpProbeBandwidth(ctx context.Context, dm *tailcfg.DERPMap, from, to *tailcfg.DERPNode, size int64, transferTime *expvar.Float) (err error) {
 	// This probe uses clients with isProber=false to avoid spamming the derper logs with every packet
 	// sent by the bandwidth probe.
 	fromc, err := newConn(ctx, dm, from, false)
@@ -368,6 +395,9 @@ func derpProbeBandwidth(ctx context.Context, dm *tailcfg.DERPMap, from, to *tail
 		time.Sleep(100 * time.Millisecond) // pretty arbitrary
 	}
 
+	start := time.Now()
+	defer func() { transferTime.Add(time.Since(start).Seconds()) }()
+
 	if err := runDerpProbeNodePair(ctx, from, to, fromc, toc, size); err != nil {
 		// Record pubkeys on failed probes to aid investigation.
 		return fmt.Errorf("%s -> %s: %w",

+ 9 - 9
prober/derp_test.go

@@ -60,16 +60,16 @@ func TestDerpProber(t *testing.T) {
 		p:            p,
 		derpMapURL:   srv.URL,
 		tlsInterval:  time.Second,
-		tlsProbeFn:   func(_ string) ProbeFunc { return func(context.Context) error { return nil } },
+		tlsProbeFn:   func(_ string) ProbeClass { return FuncProbe(func(context.Context) error { return nil }) },
 		udpInterval:  time.Second,
-		udpProbeFn:   func(_ string, _ int) ProbeFunc { return func(context.Context) error { return nil } },
+		udpProbeFn:   func(_ string, _ int) ProbeClass { return FuncProbe(func(context.Context) error { return nil }) },
 		meshInterval: time.Second,
-		meshProbeFn:  func(_, _ string) ProbeFunc { return func(context.Context) error { return nil } },
+		meshProbeFn:  func(_, _ string) ProbeClass { return FuncProbe(func(context.Context) error { return nil }) },
 		nodes:        make(map[string]*tailcfg.DERPNode),
 		probes:       make(map[string]*Probe),
 	}
-	if err := dp.ProbeMap(context.Background()); err != nil {
-		t.Errorf("unexpected ProbeMap() error: %s", err)
+	if err := dp.probeMapFn(context.Background()); err != nil {
+		t.Errorf("unexpected probeMapFn() error: %s", err)
 	}
 	if len(dp.nodes) != 2 || dp.nodes["n1"] == nil || dp.nodes["n2"] == nil {
 		t.Errorf("unexpected nodes: %+v", dp.nodes)
@@ -89,8 +89,8 @@ func TestDerpProber(t *testing.T) {
 		IPv4:     "1.1.1.1",
 		IPv6:     "::1",
 	})
-	if err := dp.ProbeMap(context.Background()); err != nil {
-		t.Errorf("unexpected ProbeMap() error: %s", err)
+	if err := dp.probeMapFn(context.Background()); err != nil {
+		t.Errorf("unexpected probeMapFn() error: %s", err)
 	}
 	if len(dp.nodes) != 3 {
 		t.Errorf("unexpected nodes: %+v", dp.nodes)
@@ -102,8 +102,8 @@ func TestDerpProber(t *testing.T) {
 
 	// Remove 2 nodes and check that probes have been destroyed.
 	dm.Regions[0].Nodes = dm.Regions[0].Nodes[:1]
-	if err := dp.ProbeMap(context.Background()); err != nil {
-		t.Errorf("unexpected ProbeMap() error: %s", err)
+	if err := dp.probeMapFn(context.Background()); err != nil {
+		t.Errorf("unexpected probeMapFn() error: %s", err)
 	}
 	if len(dp.nodes) != 1 {
 		t.Errorf("unexpected nodes: %+v", dp.nodes)

+ 6 - 2
prober/dns.go

@@ -35,8 +35,12 @@ type ForEachAddrOpts struct {
 // every time a new IP is discovered. The Probes returned will be closed if an
 // IP address is no longer in the DNS record for the given hostname. This can
 // be used to healthcheck every IP address that a hostname resolves to.
-func ForEachAddr(host string, makeProbes func(netip.Addr) []*Probe, opts ForEachAddrOpts) ProbeFunc {
-	return makeForEachAddr(host, makeProbes, opts).run
+func ForEachAddr(host string, makeProbes func(netip.Addr) []*Probe, opts ForEachAddrOpts) ProbeClass {
+	feap := makeForEachAddr(host, makeProbes, opts)
+	return ProbeClass{
+		Probe: feap.run,
+		Class: "dns_each_addr",
+	}
 }
 
 func makeForEachAddr(host string, makeProbes func(netip.Addr) []*Probe, opts ForEachAddrOpts) *forEachAddrProbe {

+ 8 - 6
prober/dns_example_test.go

@@ -89,11 +89,13 @@ func ExampleForEachAddr() {
 	<-sigCh
 }
 
-func probeLogWrapper(logf logger.Logf, pf prober.ProbeFunc) prober.ProbeFunc {
-	return func(ctx context.Context) error {
-		logf("starting probe")
-		err := pf(ctx)
-		logf("probe finished with %v", err)
-		return err
+func probeLogWrapper(logf logger.Logf, pc prober.ProbeClass) prober.ProbeClass {
+	return prober.ProbeClass{
+		Probe: func(ctx context.Context) error {
+			logf("starting probe")
+			err := pc.Probe(ctx)
+			logf("probe finished with %v", err)
+			return err
+		},
 	}
 }

+ 2 - 2
prober/dns_test.go

@@ -57,9 +57,9 @@ func TestForEachAddr(t *testing.T) {
 		registered = append(registered, addr)
 
 		// Return a probe that does nothing; we don't care about what this does.
-		probe := p.Run(fmt.Sprintf("website/%s", addr), probeInterval, nil, func(_ context.Context) error {
+		probe := p.Run(fmt.Sprintf("website/%s", addr), probeInterval, nil, FuncProbe(func(_ context.Context) error {
 			return nil
-		})
+		}))
 		return []*Probe{probe}
 	}
 

+ 8 - 5
prober/http.go

@@ -13,14 +13,17 @@ import (
 
 const maxHTTPBody = 4 << 20 // MiB
 
-// HTTP returns a Probe that healthchecks an HTTP URL.
+// HTTP returns a ProbeClass that healthchecks an HTTP URL.
 //
-// The ProbeFunc sends a GET request for url, expects an HTTP 200
+// The probe function sends a GET request for url, expects an HTTP 200
 // response, and verifies that want is present in the response
 // body.
-func HTTP(url, wantText string) ProbeFunc {
-	return func(ctx context.Context) error {
-		return probeHTTP(ctx, url, []byte(wantText))
+func HTTP(url, wantText string) ProbeClass {
+	return ProbeClass{
+		Probe: func(ctx context.Context) error {
+			return probeHTTP(ctx, url, []byte(wantText))
+		},
+		Class: "http",
 	}
 }
 

+ 69 - 18
prober/prober.go

@@ -12,6 +12,7 @@ import (
 	"fmt"
 	"hash/fnv"
 	"log"
+	"maps"
 	"math/rand"
 	"sync"
 	"time"
@@ -19,10 +20,33 @@ import (
 	"github.com/prometheus/client_golang/prometheus"
 )
 
-// ProbeFunc is a function that probes something and reports whether
-// the probe succeeded. The provided context's deadline must be obeyed
-// for correct probe scheduling.
-type ProbeFunc func(context.Context) error
+// ProbeClass defines a probe of a specific type: a probing function that will
+// be regularly run, and metric labels that will be added automatically to all
+// probes using this class.
+type ProbeClass struct {
+	// Probe is a function that probes something and reports whether the Probe
+	// succeeded. The provided context's deadline must be obeyed for correct
+	// Probe scheduling.
+	Probe func(context.Context) error
+
+	// Class defines a user-facing name of the probe class that will be used
+	// in the `class` metric label.
+	Class string
+
+	// Labels defines a set of metric labels that will be added to all metrics
+	// exposed by this probe class.
+	Labels Labels
+
+	// Metrics allows a probe class to export custom Metrics. Can be nil.
+	Metrics func(prometheus.Labels) []prometheus.Metric
+}
+
+// FuncProbe wraps a simple probe function in a ProbeClass.
+func FuncProbe(fn func(context.Context) error) ProbeClass {
+	return ProbeClass{
+		Probe: fn,
+	}
+}
 
 // a Prober manages a set of probes and keeps track of their results.
 type Prober struct {
@@ -61,17 +85,23 @@ func newForTest(now func() time.Time, newTicker func(time.Duration) ticker) *Pro
 	return p
 }
 
-// Run executes fun every interval, and exports probe results under probeName.
+// Run executes probe class function every interval, and exports probe results under probeName.
 //
 // Registering a probe under an already-registered name panics.
-func (p *Prober) Run(name string, interval time.Duration, labels map[string]string, fun ProbeFunc) *Probe {
+func (p *Prober) Run(name string, interval time.Duration, labels Labels, pc ProbeClass) *Probe {
 	p.mu.Lock()
 	defer p.mu.Unlock()
 	if _, ok := p.probes[name]; ok {
 		panic(fmt.Sprintf("probe named %q already registered", name))
 	}
 
-	l := prometheus.Labels{"name": name}
+	l := prometheus.Labels{
+		"name":  name,
+		"class": pc.Class,
+	}
+	for k, v := range pc.Labels {
+		l[k] = v
+	}
 	for k, v := range labels {
 		l[k] = v
 	}
@@ -84,10 +114,11 @@ func (p *Prober) Run(name string, interval time.Duration, labels map[string]stri
 		stopped: make(chan struct{}),
 
 		name:         name,
-		doProbe:      fun,
+		probeClass:   pc,
 		interval:     interval,
 		initialDelay: initialDelay(name, interval),
 		metrics:      prometheus.NewRegistry(),
+		metricLabels: l,
 		mInterval:    prometheus.NewDesc("interval_secs", "Probe interval in seconds", nil, l),
 		mStartTime:   prometheus.NewDesc("start_secs", "Latest probe start time (seconds since epoch)", nil, l),
 		mEndTime:     prometheus.NewDesc("end_secs", "Latest probe end time (seconds since epoch)", nil, l),
@@ -177,7 +208,7 @@ type Probe struct {
 	stopped chan struct{}      // closed when shutdown is complete
 
 	name         string
-	doProbe      ProbeFunc
+	probeClass   ProbeClass
 	interval     time.Duration
 	initialDelay time.Duration
 	tick         ticker
@@ -185,14 +216,15 @@ type Probe struct {
 	// metrics is a Prometheus metrics registry for metrics exported by this probe.
 	// Using a separate registry allows cleanly removing metrics exported by this
 	// probe when it gets unregistered.
-	metrics    *prometheus.Registry
-	mInterval  *prometheus.Desc
-	mStartTime *prometheus.Desc
-	mEndTime   *prometheus.Desc
-	mLatency   *prometheus.Desc
-	mResult    *prometheus.Desc
-	mAttempts  *prometheus.CounterVec
-	mSeconds   *prometheus.CounterVec
+	metrics      *prometheus.Registry
+	metricLabels prometheus.Labels
+	mInterval    *prometheus.Desc
+	mStartTime   *prometheus.Desc
+	mEndTime     *prometheus.Desc
+	mLatency     *prometheus.Desc
+	mResult      *prometheus.Desc
+	mAttempts    *prometheus.CounterVec
+	mSeconds     *prometheus.CounterVec
 
 	mu        sync.Mutex
 	start     time.Time     // last time doProbe started
@@ -268,7 +300,7 @@ func (p *Probe) run() {
 	ctx, cancel := context.WithTimeout(p.ctx, timeout)
 	defer cancel()
 
-	err := p.doProbe(ctx)
+	err := p.probeClass.Probe(ctx)
 	p.recordEnd(start, err)
 	if err != nil {
 		log.Printf("probe %s: %v", p.name, err)
@@ -349,6 +381,11 @@ func (p *Probe) Describe(ch chan<- *prometheus.Desc) {
 	ch <- p.mLatency
 	p.mAttempts.Describe(ch)
 	p.mSeconds.Describe(ch)
+	if p.probeClass.Metrics != nil {
+		for _, m := range p.probeClass.Metrics(p.metricLabels) {
+			ch <- m.Desc()
+		}
+	}
 }
 
 // Collect implements prometheus.Collector.
@@ -373,6 +410,11 @@ func (p *Probe) Collect(ch chan<- prometheus.Metric) {
 	}
 	p.mAttempts.Collect(ch)
 	p.mSeconds.Collect(ch)
+	if p.probeClass.Metrics != nil {
+		for _, m := range p.probeClass.Metrics(p.metricLabels) {
+			ch <- m
+		}
+	}
 }
 
 // ticker wraps a time.Ticker in a way that can be faked for tests.
@@ -401,3 +443,12 @@ func initialDelay(seed string, interval time.Duration) time.Duration {
 	r := rand.New(rand.NewSource(int64(h.Sum64()))).Float64()
 	return time.Duration(float64(interval) * r)
 }
+
+// Labels is a set of metric labels used by a prober.
+type Labels map[string]string
+
+func (l Labels) With(k, v string) Labels {
+	new := maps.Clone(l)
+	new[k] = v
+	return new
+}

+ 23 - 23
prober/prober_test.go

@@ -51,10 +51,10 @@ func TestProberTiming(t *testing.T) {
 		}
 	}
 
-	p.Run("test-probe", probeInterval, nil, func(context.Context) error {
+	p.Run("test-probe", probeInterval, nil, FuncProbe(func(context.Context) error {
 		invoked <- struct{}{}
 		return nil
-	})
+	}))
 
 	waitActiveProbes(t, p, clk, 1)
 
@@ -93,10 +93,10 @@ func TestProberTimingSpread(t *testing.T) {
 		}
 	}
 
-	probe := p.Run("test-spread-probe", probeInterval, nil, func(context.Context) error {
+	probe := p.Run("test-spread-probe", probeInterval, nil, FuncProbe(func(context.Context) error {
 		invoked <- struct{}{}
 		return nil
-	})
+	}))
 
 	waitActiveProbes(t, p, clk, 1)
 
@@ -156,12 +156,12 @@ func TestProberRun(t *testing.T) {
 	var probes []*Probe
 
 	for i := 0; i < startingProbes; i++ {
-		probes = append(probes, p.Run(fmt.Sprintf("probe%d", i), probeInterval, nil, func(context.Context) error {
+		probes = append(probes, p.Run(fmt.Sprintf("probe%d", i), probeInterval, nil, FuncProbe(func(context.Context) error {
 			mu.Lock()
 			defer mu.Unlock()
 			cnt++
 			return nil
-		}))
+		})))
 	}
 
 	checkCnt := func(want int) {
@@ -207,13 +207,13 @@ func TestPrometheus(t *testing.T) {
 	p := newForTest(clk.Now, clk.NewTicker).WithMetricNamespace("probe")
 
 	var succeed atomic.Bool
-	p.Run("testprobe", probeInterval, map[string]string{"label": "value"}, func(context.Context) error {
+	p.Run("testprobe", probeInterval, map[string]string{"label": "value"}, FuncProbe(func(context.Context) error {
 		clk.Advance(aFewMillis)
 		if succeed.Load() {
 			return nil
 		}
 		return errors.New("failing, as instructed by test")
-	})
+	}))
 
 	waitActiveProbes(t, p, clk, 1)
 
@@ -221,16 +221,16 @@ func TestPrometheus(t *testing.T) {
 		want := fmt.Sprintf(`
 # HELP probe_interval_secs Probe interval in seconds
 # TYPE probe_interval_secs gauge
-probe_interval_secs{label="value",name="testprobe"} %f
+probe_interval_secs{class="",label="value",name="testprobe"} %f
 # HELP probe_start_secs Latest probe start time (seconds since epoch)
 # TYPE probe_start_secs gauge
-probe_start_secs{label="value",name="testprobe"} %d
+probe_start_secs{class="",label="value",name="testprobe"} %d
 # HELP probe_end_secs Latest probe end time (seconds since epoch)
 # TYPE probe_end_secs gauge
-probe_end_secs{label="value",name="testprobe"} %d
+probe_end_secs{class="",label="value",name="testprobe"} %d
 # HELP probe_result Latest probe result (1 = success, 0 = failure)
 # TYPE probe_result gauge
-probe_result{label="value",name="testprobe"} 0
+probe_result{class="",label="value",name="testprobe"} 0
 `, probeInterval.Seconds(), epoch.Unix(), epoch.Add(aFewMillis).Unix())
 		return testutil.GatherAndCompare(p.metrics, strings.NewReader(want),
 			"probe_interval_secs", "probe_start_secs", "probe_end_secs", "probe_result")
@@ -248,19 +248,19 @@ probe_result{label="value",name="testprobe"} 0
 		want := fmt.Sprintf(`
 # HELP probe_interval_secs Probe interval in seconds
 # TYPE probe_interval_secs gauge
-probe_interval_secs{label="value",name="testprobe"} %f
+probe_interval_secs{class="",label="value",name="testprobe"} %f
 # HELP probe_start_secs Latest probe start time (seconds since epoch)
 # TYPE probe_start_secs gauge
-probe_start_secs{label="value",name="testprobe"} %d
+probe_start_secs{class="",label="value",name="testprobe"} %d
 # HELP probe_end_secs Latest probe end time (seconds since epoch)
 # TYPE probe_end_secs gauge
-probe_end_secs{label="value",name="testprobe"} %d
+probe_end_secs{class="",label="value",name="testprobe"} %d
 # HELP probe_latency_millis Latest probe latency (ms)
 # TYPE probe_latency_millis gauge
-probe_latency_millis{label="value",name="testprobe"} %d
+probe_latency_millis{class="",label="value",name="testprobe"} %d
 # HELP probe_result Latest probe result (1 = success, 0 = failure)
 # TYPE probe_result gauge
-probe_result{label="value",name="testprobe"} 1
+probe_result{class="",label="value",name="testprobe"} 1
 `, probeInterval.Seconds(), start.Unix(), end.Unix(), aFewMillis.Milliseconds())
 		return testutil.GatherAndCompare(p.metrics, strings.NewReader(want),
 			"probe_interval_secs", "probe_start_secs", "probe_end_secs", "probe_latency_millis", "probe_result")
@@ -274,14 +274,14 @@ func TestOnceMode(t *testing.T) {
 	clk := newFakeTime()
 	p := newForTest(clk.Now, clk.NewTicker).WithOnce(true)
 
-	p.Run("probe1", probeInterval, nil, func(context.Context) error { return nil })
-	p.Run("probe2", probeInterval, nil, func(context.Context) error { return fmt.Errorf("error2") })
-	p.Run("probe3", probeInterval, nil, func(context.Context) error {
-		p.Run("probe4", probeInterval, nil, func(context.Context) error {
+	p.Run("probe1", probeInterval, nil, FuncProbe(func(context.Context) error { return nil }))
+	p.Run("probe2", probeInterval, nil, FuncProbe(func(context.Context) error { return fmt.Errorf("error2") }))
+	p.Run("probe3", probeInterval, nil, FuncProbe(func(context.Context) error {
+		p.Run("probe4", probeInterval, nil, FuncProbe(func(context.Context) error {
 			return fmt.Errorf("error4")
-		})
+		}))
 		return nil
-	})
+	}))
 
 	p.Wait()
 	wantCount := 4

+ 6 - 3
prober/tcp.go

@@ -12,9 +12,12 @@ import (
 // TCP returns a Probe that healthchecks a TCP endpoint.
 //
 // The ProbeFunc reports whether it can successfully connect to addr.
-func TCP(addr string) ProbeFunc {
-	return func(ctx context.Context) error {
-		return probeTCP(ctx, addr)
+func TCP(addr string) ProbeClass {
+	return ProbeClass{
+		Probe: func(ctx context.Context) error {
+			return probeTCP(ctx, addr)
+		},
+		Class: "tcp",
 	}
 }
 

+ 16 - 10
prober/tls.go

@@ -27,22 +27,28 @@ const expiresSoon = 7 * 24 * time.Hour // 7 days from now
 // The ProbeFunc connects to a hostPort (host:port string), does a TLS
 // handshake, verifies that the hostname matches the presented certificate,
 // checks certificate validity time and OCSP revocation status.
-func TLS(hostPort string) ProbeFunc {
-	return func(ctx context.Context) error {
-		certDomain, _, err := net.SplitHostPort(hostPort)
-		if err != nil {
-			return err
-		}
-		return probeTLS(ctx, certDomain, hostPort)
+func TLS(hostPort string) ProbeClass {
+	return ProbeClass{
+		Probe: func(ctx context.Context) error {
+			certDomain, _, err := net.SplitHostPort(hostPort)
+			if err != nil {
+				return err
+			}
+			return probeTLS(ctx, certDomain, hostPort)
+		},
+		Class: "tls",
 	}
 }
 
 // TLSWithIP is like TLS, but dials the provided dialAddr instead
 // of using DNS resolution. The certDomain is the expected name in
 // the cert (and the SNI name to send).
-func TLSWithIP(certDomain string, dialAddr netip.AddrPort) ProbeFunc {
-	return func(ctx context.Context) error {
-		return probeTLS(ctx, certDomain, dialAddr.String())
+func TLSWithIP(certDomain string, dialAddr netip.AddrPort) ProbeClass {
+	return ProbeClass{
+		Probe: func(ctx context.Context) error {
+			return probeTLS(ctx, certDomain, dialAddr.String())
+		},
+		Class: "tls",
 	}
 }