prober_test.go 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. // Copyright (c) Tailscale Inc & AUTHORS
  2. // SPDX-License-Identifier: BSD-3-Clause
  3. package prober
  4. import (
  5. "context"
  6. "errors"
  7. "fmt"
  8. "strings"
  9. "sync"
  10. "sync/atomic"
  11. "testing"
  12. "time"
  13. "github.com/prometheus/client_golang/prometheus/testutil"
  14. "tailscale.com/tstest"
  15. )
  16. const (
  17. probeInterval = 8 * time.Second // So expvars that are integer numbers of seconds change
  18. halfProbeInterval = probeInterval / 2
  19. quarterProbeInterval = probeInterval / 4
  20. convergenceTimeout = time.Second
  21. convergenceSleep = time.Millisecond
  22. aFewMillis = 20 * time.Millisecond
  23. )
  24. var epoch = time.Unix(0, 0)
  25. func TestProberTiming(t *testing.T) {
  26. clk := newFakeTime()
  27. p := newForTest(clk.Now, clk.NewTicker)
  28. invoked := make(chan struct{}, 1)
  29. notCalled := func() {
  30. t.Helper()
  31. select {
  32. case <-invoked:
  33. t.Fatal("probe was invoked earlier than expected")
  34. default:
  35. }
  36. }
  37. called := func() {
  38. t.Helper()
  39. select {
  40. case <-invoked:
  41. case <-time.After(2 * time.Second):
  42. t.Fatal("probe wasn't invoked as expected")
  43. }
  44. }
  45. p.Run("test-probe", probeInterval, nil, func(context.Context) error {
  46. invoked <- struct{}{}
  47. return nil
  48. })
  49. waitActiveProbes(t, p, clk, 1)
  50. called()
  51. notCalled()
  52. clk.Advance(probeInterval + halfProbeInterval)
  53. called()
  54. notCalled()
  55. clk.Advance(quarterProbeInterval)
  56. notCalled()
  57. clk.Advance(probeInterval)
  58. called()
  59. notCalled()
  60. }
  61. func TestProberTimingSpread(t *testing.T) {
  62. clk := newFakeTime()
  63. p := newForTest(clk.Now, clk.NewTicker).WithSpread(true)
  64. invoked := make(chan struct{}, 1)
  65. notCalled := func() {
  66. t.Helper()
  67. select {
  68. case <-invoked:
  69. t.Fatal("probe was invoked earlier than expected")
  70. default:
  71. }
  72. }
  73. called := func() {
  74. t.Helper()
  75. select {
  76. case <-invoked:
  77. case <-time.After(2 * time.Second):
  78. t.Fatal("probe wasn't invoked as expected")
  79. }
  80. }
  81. probe := p.Run("test-spread-probe", probeInterval, nil, func(context.Context) error {
  82. invoked <- struct{}{}
  83. return nil
  84. })
  85. waitActiveProbes(t, p, clk, 1)
  86. notCalled()
  87. // Name of the probe (test-spread-probe) has been chosen to ensure that
  88. // the initial delay is smaller than half of the probe interval.
  89. clk.Advance(halfProbeInterval)
  90. called()
  91. notCalled()
  92. // We need to wait until the main (non-initial) ticker in Probe.loop is
  93. // waiting, or we could race and advance the test clock between when
  94. // the initial delay ticker completes and before the ticker for the
  95. // main loop is created. In this race, we'd first advance the test
  96. // clock, then the ticker would be registered, and the test would fail
  97. // because that ticker would never be fired.
  98. err := tstest.WaitFor(convergenceTimeout, func() error {
  99. clk.Lock()
  100. defer clk.Unlock()
  101. for _, tick := range clk.tickers {
  102. tick.Lock()
  103. stopped, interval := tick.stopped, tick.interval
  104. tick.Unlock()
  105. if stopped {
  106. continue
  107. }
  108. // Test for the main loop, not the initialDelay
  109. if interval == probe.interval {
  110. return nil
  111. }
  112. }
  113. return fmt.Errorf("no ticker with interval %d found", probe.interval)
  114. })
  115. if err != nil {
  116. t.Fatal(err)
  117. }
  118. clk.Advance(quarterProbeInterval)
  119. notCalled()
  120. clk.Advance(probeInterval)
  121. called()
  122. notCalled()
  123. }
  124. func TestProberRun(t *testing.T) {
  125. clk := newFakeTime()
  126. p := newForTest(clk.Now, clk.NewTicker)
  127. var (
  128. mu sync.Mutex
  129. cnt int
  130. )
  131. const startingProbes = 100
  132. var probes []*Probe
  133. for i := 0; i < startingProbes; i++ {
  134. probes = append(probes, p.Run(fmt.Sprintf("probe%d", i), probeInterval, nil, func(context.Context) error {
  135. mu.Lock()
  136. defer mu.Unlock()
  137. cnt++
  138. return nil
  139. }))
  140. }
  141. checkCnt := func(want int) {
  142. t.Helper()
  143. err := tstest.WaitFor(convergenceTimeout, func() error {
  144. mu.Lock()
  145. defer mu.Unlock()
  146. if cnt == want {
  147. cnt = 0
  148. return nil
  149. }
  150. return fmt.Errorf("wrong number of probe counter increments, got %d want %d", cnt, want)
  151. })
  152. if err != nil {
  153. t.Fatal(err)
  154. }
  155. }
  156. waitActiveProbes(t, p, clk, startingProbes)
  157. checkCnt(startingProbes)
  158. clk.Advance(probeInterval + halfProbeInterval)
  159. checkCnt(startingProbes)
  160. if c, err := testutil.GatherAndCount(p.metrics, "prober_result"); c != startingProbes || err != nil {
  161. t.Fatalf("expected %d prober_result metrics; got %d (error %s)", startingProbes, c, err)
  162. }
  163. keep := startingProbes / 2
  164. for i := keep; i < startingProbes; i++ {
  165. probes[i].Close()
  166. }
  167. waitActiveProbes(t, p, clk, keep)
  168. clk.Advance(probeInterval)
  169. checkCnt(keep)
  170. if c, err := testutil.GatherAndCount(p.metrics, "prober_result"); c != keep || err != nil {
  171. t.Fatalf("expected %d prober_result metrics; got %d (error %s)", keep, c, err)
  172. }
  173. }
  174. func TestPrometheus(t *testing.T) {
  175. clk := newFakeTime()
  176. p := newForTest(clk.Now, clk.NewTicker).WithMetricNamespace("probe")
  177. var succeed atomic.Bool
  178. p.Run("testprobe", probeInterval, map[string]string{"label": "value"}, func(context.Context) error {
  179. clk.Advance(aFewMillis)
  180. if succeed.Load() {
  181. return nil
  182. }
  183. return errors.New("failing, as instructed by test")
  184. })
  185. waitActiveProbes(t, p, clk, 1)
  186. err := tstest.WaitFor(convergenceTimeout, func() error {
  187. want := fmt.Sprintf(`
  188. # HELP probe_interval_secs Probe interval in seconds
  189. # TYPE probe_interval_secs gauge
  190. probe_interval_secs{label="value",name="testprobe"} %f
  191. # HELP probe_start_secs Latest probe start time (seconds since epoch)
  192. # TYPE probe_start_secs gauge
  193. probe_start_secs{label="value",name="testprobe"} %d
  194. # HELP probe_end_secs Latest probe end time (seconds since epoch)
  195. # TYPE probe_end_secs gauge
  196. probe_end_secs{label="value",name="testprobe"} %d
  197. # HELP probe_result Latest probe result (1 = success, 0 = failure)
  198. # TYPE probe_result gauge
  199. probe_result{label="value",name="testprobe"} 0
  200. `, probeInterval.Seconds(), epoch.Unix(), epoch.Add(aFewMillis).Unix())
  201. return testutil.GatherAndCompare(p.metrics, strings.NewReader(want),
  202. "probe_interval_secs", "probe_start_secs", "probe_end_secs", "probe_result")
  203. })
  204. if err != nil {
  205. t.Fatal(err)
  206. }
  207. succeed.Store(true)
  208. clk.Advance(probeInterval + halfProbeInterval)
  209. err = tstest.WaitFor(convergenceTimeout, func() error {
  210. start := epoch.Add(probeInterval + halfProbeInterval)
  211. end := start.Add(aFewMillis)
  212. want := fmt.Sprintf(`
  213. # HELP probe_interval_secs Probe interval in seconds
  214. # TYPE probe_interval_secs gauge
  215. probe_interval_secs{label="value",name="testprobe"} %f
  216. # HELP probe_start_secs Latest probe start time (seconds since epoch)
  217. # TYPE probe_start_secs gauge
  218. probe_start_secs{label="value",name="testprobe"} %d
  219. # HELP probe_end_secs Latest probe end time (seconds since epoch)
  220. # TYPE probe_end_secs gauge
  221. probe_end_secs{label="value",name="testprobe"} %d
  222. # HELP probe_latency_millis Latest probe latency (ms)
  223. # TYPE probe_latency_millis gauge
  224. probe_latency_millis{label="value",name="testprobe"} %d
  225. # HELP probe_result Latest probe result (1 = success, 0 = failure)
  226. # TYPE probe_result gauge
  227. probe_result{label="value",name="testprobe"} 1
  228. `, probeInterval.Seconds(), start.Unix(), end.Unix(), aFewMillis.Milliseconds())
  229. return testutil.GatherAndCompare(p.metrics, strings.NewReader(want),
  230. "probe_interval_secs", "probe_start_secs", "probe_end_secs", "probe_latency_millis", "probe_result")
  231. })
  232. if err != nil {
  233. t.Fatal(err)
  234. }
  235. }
  236. func TestOnceMode(t *testing.T) {
  237. clk := newFakeTime()
  238. p := newForTest(clk.Now, clk.NewTicker).WithOnce(true)
  239. p.Run("probe1", probeInterval, nil, func(context.Context) error { return nil })
  240. p.Run("probe2", probeInterval, nil, func(context.Context) error { return fmt.Errorf("error2") })
  241. p.Run("probe3", probeInterval, nil, func(context.Context) error {
  242. p.Run("probe4", probeInterval, nil, func(context.Context) error {
  243. return fmt.Errorf("error4")
  244. })
  245. return nil
  246. })
  247. p.Wait()
  248. wantCount := 4
  249. for _, metric := range []string{"prober_result", "prober_end_secs"} {
  250. if c, err := testutil.GatherAndCount(p.metrics, metric); c != wantCount || err != nil {
  251. t.Fatalf("expected %d %s metrics; got %d (error %s)", wantCount, metric, c, err)
  252. }
  253. }
  254. }
  255. type fakeTicker struct {
  256. ch chan time.Time
  257. interval time.Duration
  258. sync.Mutex
  259. next time.Time
  260. stopped bool
  261. }
  262. func (t *fakeTicker) Chan() <-chan time.Time {
  263. return t.ch
  264. }
  265. func (t *fakeTicker) Stop() {
  266. t.Lock()
  267. defer t.Unlock()
  268. t.stopped = true
  269. }
  270. func (t *fakeTicker) fire(now time.Time) {
  271. t.Lock()
  272. defer t.Unlock()
  273. // Slight deviation from the stdlib ticker: time.Ticker will
  274. // adjust t.next to make up for missed ticks, whereas we tick on a
  275. // fixed interval regardless of receiver behavior. In our case
  276. // this is fine, since we're using the ticker as a wakeup
  277. // mechanism and not a precise timekeeping system.
  278. select {
  279. case t.ch <- now:
  280. default:
  281. }
  282. for now.After(t.next) {
  283. t.next = t.next.Add(t.interval)
  284. }
  285. }
  286. type fakeTime struct {
  287. sync.Mutex
  288. *sync.Cond
  289. curTime time.Time
  290. tickers []*fakeTicker
  291. }
  292. func newFakeTime() *fakeTime {
  293. ret := &fakeTime{
  294. curTime: epoch,
  295. }
  296. ret.Cond = &sync.Cond{L: &ret.Mutex}
  297. return ret
  298. }
  299. func (t *fakeTime) Now() time.Time {
  300. t.Lock()
  301. defer t.Unlock()
  302. ret := t.curTime
  303. return ret
  304. }
  305. func (t *fakeTime) NewTicker(d time.Duration) ticker {
  306. t.Lock()
  307. defer t.Unlock()
  308. ret := &fakeTicker{
  309. ch: make(chan time.Time, 1),
  310. interval: d,
  311. next: t.curTime.Add(d),
  312. }
  313. t.tickers = append(t.tickers, ret)
  314. t.Cond.Broadcast()
  315. return ret
  316. }
  317. func (t *fakeTime) Advance(d time.Duration) {
  318. t.Lock()
  319. defer t.Unlock()
  320. t.curTime = t.curTime.Add(d)
  321. for _, tick := range t.tickers {
  322. if t.curTime.After(tick.next) {
  323. tick.fire(t.curTime)
  324. }
  325. }
  326. }
  327. func (t *fakeTime) activeTickers() (count int) {
  328. t.Lock()
  329. defer t.Unlock()
  330. for _, tick := range t.tickers {
  331. if !tick.stopped {
  332. count += 1
  333. }
  334. }
  335. return
  336. }
  337. func waitActiveProbes(t *testing.T, p *Prober, clk *fakeTime, want int) {
  338. t.Helper()
  339. err := tstest.WaitFor(convergenceTimeout, func() error {
  340. if got := p.activeProbes(); got != want {
  341. return fmt.Errorf("installed probe count is %d, want %d", got, want)
  342. }
  343. if got := clk.activeTickers(); got != want {
  344. return fmt.Errorf("active ticker count is %d, want %d", got, want)
  345. }
  346. return nil
  347. })
  348. if err != nil {
  349. t.Fatal(err)
  350. }
  351. }