failurereporting.go 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
  1. // Copyright (C) 2020 The Syncthing Authors.
  2. //
  3. // This Source Code Form is subject to the terms of the Mozilla Public
  4. // License, v. 2.0. If a copy of the MPL was not distributed with this file,
  5. // You can obtain one at https://mozilla.org/MPL/2.0/.
  6. package ur
  7. import (
  8. "bytes"
  9. "context"
  10. "encoding/json"
  11. "log/slog"
  12. "net/http"
  13. "runtime/pprof"
  14. "strings"
  15. "time"
  16. "github.com/syncthing/syncthing/internal/slogutil"
  17. "github.com/syncthing/syncthing/lib/build"
  18. "github.com/syncthing/syncthing/lib/config"
  19. "github.com/syncthing/syncthing/lib/dialer"
  20. "github.com/syncthing/syncthing/lib/events"
  21. "github.com/syncthing/syncthing/lib/svcutil"
  22. "github.com/syncthing/syncthing/lib/tlsutil"
  23. "github.com/thejerf/suture/v4"
  24. )
  25. var (
  26. // When a specific failure first occurs, it is delayed by minDelay. If
  27. // more of the same failures occurs those are further delayed and
  28. // aggregated for maxDelay.
  29. minDelay = 10 * time.Second
  30. maxDelay = time.Minute
  31. sendTimeout = time.Minute
  32. finalSendTimeout = svcutil.ServiceTimeout / 2
  33. evChanClosed = "failure event channel closed"
  34. invalidEventDataType = "failure event data is not a string"
  35. )
  36. type FailureReport struct {
  37. FailureData
  38. Count int
  39. Version string
  40. }
  41. type FailureData struct {
  42. Description string
  43. Goroutines string
  44. Extra map[string]string
  45. }
  46. func FailureDataWithGoroutines(description string) FailureData {
  47. var buf strings.Builder
  48. pprof.Lookup("goroutine").WriteTo(&buf, 1)
  49. return FailureData{
  50. Description: description,
  51. Goroutines: buf.String(),
  52. Extra: make(map[string]string),
  53. }
  54. }
  55. type FailureHandler interface {
  56. suture.Service
  57. config.Committer
  58. }
  59. func NewFailureHandler(cfg config.Wrapper, evLogger events.Logger) FailureHandler {
  60. return &failureHandler{
  61. cfg: cfg,
  62. evLogger: evLogger,
  63. optsChan: make(chan config.OptionsConfiguration),
  64. buf: make(map[string]*failureStat),
  65. }
  66. }
  67. type failureHandler struct {
  68. cfg config.Wrapper
  69. evLogger events.Logger
  70. optsChan chan config.OptionsConfiguration
  71. buf map[string]*failureStat
  72. }
  73. type failureStat struct {
  74. first, last time.Time
  75. count int
  76. data FailureData
  77. }
  78. func (h *failureHandler) Serve(ctx context.Context) error {
  79. cfg := h.cfg.Subscribe(h)
  80. defer h.cfg.Unsubscribe(h)
  81. url, sub, evChan := h.applyOpts(cfg.Options, nil)
  82. var err error
  83. timer := time.NewTimer(minDelay)
  84. resetTimer := make(chan struct{})
  85. for err == nil {
  86. select {
  87. case opts := <-h.optsChan:
  88. url, sub, evChan = h.applyOpts(opts, sub)
  89. case e, ok := <-evChan:
  90. if !ok {
  91. // Just to be safe - shouldn't ever happen, as
  92. // evChan is set to nil when unsubscribing.
  93. h.addReport(FailureData{Description: evChanClosed}, time.Now())
  94. evChan = nil
  95. continue
  96. }
  97. var data FailureData
  98. switch d := e.Data.(type) {
  99. case string:
  100. data.Description = d
  101. case FailureData:
  102. data = d
  103. default:
  104. // Same here, shouldn't ever happen.
  105. h.addReport(FailureData{Description: invalidEventDataType}, time.Now())
  106. continue
  107. }
  108. h.addReport(data, e.Time)
  109. case <-timer.C:
  110. reports := make([]FailureReport, 0, len(h.buf))
  111. now := time.Now()
  112. for descr, stat := range h.buf {
  113. if now.Sub(stat.last) > minDelay || now.Sub(stat.first) > maxDelay {
  114. reports = append(reports, newFailureReport(stat))
  115. delete(h.buf, descr)
  116. }
  117. }
  118. if len(reports) > 0 {
  119. // Lets keep process events/configs while it might be timing out for a while
  120. go func() {
  121. sendFailureReports(ctx, reports, url)
  122. select {
  123. case resetTimer <- struct{}{}:
  124. case <-ctx.Done():
  125. }
  126. }()
  127. } else {
  128. timer.Reset(minDelay)
  129. }
  130. case <-resetTimer:
  131. timer.Reset(minDelay)
  132. case <-ctx.Done():
  133. err = ctx.Err()
  134. }
  135. }
  136. if sub != nil {
  137. sub.Unsubscribe()
  138. if len(h.buf) > 0 {
  139. reports := make([]FailureReport, 0, len(h.buf))
  140. for _, stat := range h.buf {
  141. reports = append(reports, newFailureReport(stat))
  142. }
  143. timeout, cancel := context.WithTimeout(context.Background(), finalSendTimeout)
  144. defer cancel()
  145. sendFailureReports(timeout, reports, url)
  146. }
  147. }
  148. return err
  149. }
  150. func (h *failureHandler) applyOpts(opts config.OptionsConfiguration, sub events.Subscription) (string, events.Subscription, <-chan events.Event) {
  151. // Sub nil checks just for safety - config updates can be racy.
  152. url := opts.CRURL + "/failure"
  153. if opts.URAccepted > 0 {
  154. if sub == nil {
  155. sub = h.evLogger.Subscribe(events.Failure)
  156. }
  157. return url, sub, sub.C()
  158. }
  159. if sub != nil {
  160. sub.Unsubscribe()
  161. }
  162. return url, nil, nil
  163. }
  164. func (h *failureHandler) addReport(data FailureData, evTime time.Time) {
  165. if stat, ok := h.buf[data.Description]; ok {
  166. stat.last = evTime
  167. stat.count++
  168. return
  169. }
  170. h.buf[data.Description] = &failureStat{
  171. first: evTime,
  172. last: evTime,
  173. count: 1,
  174. data: data,
  175. }
  176. }
  177. func (h *failureHandler) CommitConfiguration(from, to config.Configuration) bool {
  178. if from.Options.CREnabled != to.Options.CREnabled || from.Options.CRURL != to.Options.CRURL {
  179. h.optsChan <- to.Options
  180. }
  181. return true
  182. }
  183. func (*failureHandler) String() string {
  184. return "FailureHandler"
  185. }
  186. func sendFailureReports(ctx context.Context, reports []FailureReport, url string) {
  187. var b bytes.Buffer
  188. if err := json.NewEncoder(&b).Encode(reports); err != nil {
  189. panic(err)
  190. }
  191. client := &http.Client{
  192. Transport: &http.Transport{
  193. DialContext: dialer.DialContext,
  194. Proxy: http.ProxyFromEnvironment,
  195. TLSClientConfig: tlsutil.SecureDefaultWithTLS12(),
  196. },
  197. }
  198. reqCtx, reqCancel := context.WithTimeout(ctx, sendTimeout)
  199. defer reqCancel()
  200. req, err := http.NewRequestWithContext(reqCtx, http.MethodPost, url, &b)
  201. if err != nil {
  202. slog.WarnContext(ctx, "Failed to send failure report", slogutil.Error(err))
  203. return
  204. }
  205. req.Header.Set("Content-Type", "application/json")
  206. resp, err := client.Do(req)
  207. if err != nil {
  208. slog.WarnContext(ctx, "Failed to send failure report", slogutil.Error(err))
  209. return
  210. }
  211. resp.Body.Close()
  212. }
  213. func newFailureReport(stat *failureStat) FailureReport {
  214. return FailureReport{
  215. FailureData: stat.data,
  216. Count: stat.count,
  217. Version: build.LongVersion,
  218. }
  219. }