Explorar el Código

cmd/strelaypoolsrv: Prevent scraped metrics moving backwards (#5068)

Audrius Butkevicius hace 7 años
padre
commit
93fdd1c012
Se han modificado 3 ficheros con 61 adiciones y 3 borrados
  1. 1 1
      cmd/strelaypoolsrv/main.go
  2. 39 2
      cmd/strelaypoolsrv/stats.go
  3. 21 0
      cmd/strelaypoolsrv/stats_test.go

+ 1 - 1
cmd/strelaypoolsrv/main.go

@@ -496,7 +496,7 @@ func handleRelayTest(request request) {
 
 	mut.Lock()
 	if stats != nil {
-		updateMetrics(request.relay.uri.Host, stats, location)
+		updateMetrics(request.relay.uri.Host, *stats, location)
 	}
 	request.relay.Stats = stats
 	request.relay.StatsRetrieved = time.Now()

+ 39 - 2
cmd/strelaypoolsrv/stats.go

@@ -44,6 +44,8 @@ var (
 	relayGlobalRate         = makeGauge("relay_global_rate", "Global rate applied on the whole relay", "relay")
 	relayBuildInfo          = makeGauge("relay_build_info", "Build information about a relay", "relay", "go_version", "go_os", "go_arch")
 	relayLocationInfo       = makeGauge("relay_location_info", "Location information about a relay", "relay", "city", "country", "continent")
+
+	lastStats = make(map[string]stats)
 )
 
 func makeGauge(name string, help string, labels ...string) *prometheus.GaugeVec {
@@ -142,7 +144,7 @@ func refreshStats() {
 		if result.stats == nil {
 			deleteMetrics(result.relay.uri.Host)
 		} else {
-			updateMetrics(result.relay.uri.Host, result.stats, result.relay.Location)
+			updateMetrics(result.relay.uri.Host, *result.stats, result.relay.Location)
 		}
 	}
 	mut.Unlock()
@@ -182,13 +184,18 @@ func fetchStats(relay *relay) *stats {
 	return &stats
 }
 
-func updateMetrics(host string, stats *stats, location location) {
+func updateMetrics(host string, stats stats, location location) {
 	if stats.GoVersion != "" || stats.GoOS != "" || stats.GoArch != "" {
 		relayBuildInfo.WithLabelValues(host, stats.GoVersion, stats.GoOS, stats.GoArch).Add(1)
 	}
 	if location.City != "" || location.Country != "" || location.Continent != "" {
 		relayLocationInfo.WithLabelValues(host, location.City, location.Country, location.Continent).Add(1)
 	}
+
+	if lastStat, ok := lastStats[host]; ok {
+		stats = mergeStats(stats, lastStat)
+	}
+
 	relayUptime.WithLabelValues(host).Set(float64(stats.UptimeSeconds))
 	relayPendingSessionKeys.WithLabelValues(host).Set(float64(stats.PendingSessionKeys))
 	relayActiveSessions.WithLabelValues(host).Set(float64(stats.ActiveSessions))
@@ -198,6 +205,7 @@ func updateMetrics(host string, stats *stats, location location) {
 	relayGoRoutines.WithLabelValues(host).Set(float64(stats.GoRoutines))
 	relaySessionRate.WithLabelValues(host).Set(float64(stats.Options.SessionRate))
 	relayGlobalRate.WithLabelValues(host).Set(float64(stats.Options.GlobalRate))
+	lastStats[host] = stats
 }
 
 func deleteMetrics(host string) {
@@ -210,4 +218,33 @@ func deleteMetrics(host string) {
 	relayGoRoutines.DeleteLabelValues(host)
 	relaySessionRate.DeleteLabelValues(host)
 	relayGlobalRate.DeleteLabelValues(host)
+	delete(lastStats, host)
+}
+
+// Due to some unexplainable behaviour, some of the numbers sometimes travel slightly backwards (by less than 1%)
+// This happens between scrapes, which is 30s, so this can't be a race.
+// This causes prometheus to assume a "rate reset", hence causes phenomenal spikes.
+// One of the number that moves backwards is BytesProxied, which atomically increments a counter with numeric value
+// returned by net.Conn.Read(). I don't think that can return a negative value, so I have no idea what's going on.
+func mergeStats(new stats, old stats) stats {
+	new.UptimeSeconds = mergeValue(new.UptimeSeconds, old.UptimeSeconds)
+	new.PendingSessionKeys = mergeValue(new.PendingSessionKeys, old.PendingSessionKeys)
+	new.ActiveSessions = mergeValue(new.ActiveSessions, old.ActiveSessions)
+	new.Connections = mergeValue(new.Connections, old.Connections)
+	new.Proxies = mergeValue(new.Proxies, old.Proxies)
+	new.BytesProxied = mergeValue(new.BytesProxied, old.BytesProxied)
+	new.GoRoutines = mergeValue(new.GoRoutines, old.GoRoutines)
+	new.Options.SessionRate = mergeValue(new.Options.SessionRate, old.Options.SessionRate)
+	new.Options.GlobalRate = mergeValue(new.Options.GlobalRate, old.Options.GlobalRate)
+	return new
+}
+
+func mergeValue(new, old int) int {
+	if new >= old {
+		return new // normal increase
+	}
+	if float64(new) > 0.99*float64(old) {
+		return old // slight backward movement
+	}
+	return new // reset (relay restart)
 }

+ 21 - 0
cmd/strelaypoolsrv/stats_test.go

@@ -0,0 +1,21 @@
+// Copyright (C) 2015 Audrius Butkevicius and Contributors (see the CONTRIBUTORS file).
+
+package main
+
+import (
+	"testing"
+)
+
+func TestMerge(t *testing.T) {
+	if mergeValue(1001, 1000) != 1001 {
+		t.Error("the computer says no")
+	}
+
+	if mergeValue(999, 1000) != 1000 {
+		t.Error("the computer says no")
+	}
+
+	if mergeValue(1, 1000) != 1 {
+		t.Error("the computer says no")
+	}
+}