failurereporting.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239
  1. // Copyright (C) 2020 The Syncthing Authors.
  2. //
  3. // This Source Code Form is subject to the terms of the Mozilla Public
  4. // License, v. 2.0. If a copy of the MPL was not distributed with this file,
  5. // You can obtain one at https://mozilla.org/MPL/2.0/.
  6. package ur
  7. import (
  8. "bytes"
  9. "context"
  10. "encoding/json"
  11. "net/http"
  12. "runtime/pprof"
  13. "strings"
  14. "time"
  15. "github.com/syncthing/syncthing/lib/build"
  16. "github.com/syncthing/syncthing/lib/config"
  17. "github.com/syncthing/syncthing/lib/dialer"
  18. "github.com/syncthing/syncthing/lib/events"
  19. "github.com/syncthing/syncthing/lib/svcutil"
  20. "github.com/thejerf/suture/v4"
  21. )
  22. var (
  23. // When a specific failure first occurs, it is delayed by minDelay. If
  24. // more of the same failures occurs those are further delayed and
  25. // aggregated for maxDelay.
  26. minDelay = 10 * time.Second
  27. maxDelay = time.Minute
  28. sendTimeout = time.Minute
  29. finalSendTimeout = svcutil.ServiceTimeout / 2
  30. evChanClosed = "failure event channel closed"
  31. invalidEventDataType = "failure event data is not a string"
  32. )
  33. type FailureReport struct {
  34. FailureData
  35. Count int
  36. Version string
  37. }
  38. type FailureData struct {
  39. Description string
  40. Goroutines string
  41. Extra map[string]string
  42. }
  43. func FailureDataWithGoroutines(description string) FailureData {
  44. var buf strings.Builder
  45. pprof.Lookup("goroutine").WriteTo(&buf, 1)
  46. return FailureData{
  47. Description: description,
  48. Goroutines: buf.String(),
  49. Extra: make(map[string]string),
  50. }
  51. }
  52. type FailureHandler interface {
  53. suture.Service
  54. config.Committer
  55. }
  56. func NewFailureHandler(cfg config.Wrapper, evLogger events.Logger) FailureHandler {
  57. return &failureHandler{
  58. cfg: cfg,
  59. evLogger: evLogger,
  60. optsChan: make(chan config.OptionsConfiguration),
  61. buf: make(map[string]*failureStat),
  62. }
  63. }
  64. type failureHandler struct {
  65. cfg config.Wrapper
  66. evLogger events.Logger
  67. optsChan chan config.OptionsConfiguration
  68. buf map[string]*failureStat
  69. }
  70. type failureStat struct {
  71. first, last time.Time
  72. count int
  73. data FailureData
  74. }
  75. func (h *failureHandler) Serve(ctx context.Context) error {
  76. cfg := h.cfg.Subscribe(h)
  77. defer h.cfg.Unsubscribe(h)
  78. url, sub, evChan := h.applyOpts(cfg.Options, nil)
  79. var err error
  80. timer := time.NewTimer(minDelay)
  81. resetTimer := make(chan struct{})
  82. for err == nil {
  83. select {
  84. case opts := <-h.optsChan:
  85. url, sub, evChan = h.applyOpts(opts, sub)
  86. case e, ok := <-evChan:
  87. if !ok {
  88. // Just to be safe - shouldn't ever happen, as
  89. // evChan is set to nil when unsubscribing.
  90. h.addReport(FailureData{Description: evChanClosed}, time.Now())
  91. evChan = nil
  92. continue
  93. }
  94. var data FailureData
  95. switch d := e.Data.(type) {
  96. case string:
  97. data.Description = d
  98. case FailureData:
  99. data = d
  100. default:
  101. // Same here, shouldn't ever happen.
  102. h.addReport(FailureData{Description: invalidEventDataType}, time.Now())
  103. continue
  104. }
  105. h.addReport(data, e.Time)
  106. case <-timer.C:
  107. reports := make([]FailureReport, 0, len(h.buf))
  108. now := time.Now()
  109. for descr, stat := range h.buf {
  110. if now.Sub(stat.last) > minDelay || now.Sub(stat.first) > maxDelay {
  111. reports = append(reports, newFailureReport(stat))
  112. delete(h.buf, descr)
  113. }
  114. }
  115. if len(reports) > 0 {
  116. // Lets keep process events/configs while it might be timing out for a while
  117. go func() {
  118. sendFailureReports(ctx, reports, url)
  119. select {
  120. case resetTimer <- struct{}{}:
  121. case <-ctx.Done():
  122. }
  123. }()
  124. } else {
  125. timer.Reset(minDelay)
  126. }
  127. case <-resetTimer:
  128. timer.Reset(minDelay)
  129. case <-ctx.Done():
  130. err = ctx.Err()
  131. }
  132. }
  133. if sub != nil {
  134. sub.Unsubscribe()
  135. if len(h.buf) > 0 {
  136. reports := make([]FailureReport, 0, len(h.buf))
  137. for _, stat := range h.buf {
  138. reports = append(reports, newFailureReport(stat))
  139. }
  140. timeout, cancel := context.WithTimeout(context.Background(), finalSendTimeout)
  141. defer cancel()
  142. sendFailureReports(timeout, reports, url)
  143. }
  144. }
  145. return err
  146. }
  147. func (h *failureHandler) applyOpts(opts config.OptionsConfiguration, sub events.Subscription) (string, events.Subscription, <-chan events.Event) {
  148. // Sub nil checks just for safety - config updates can be racy.
  149. url := opts.CRURL + "/failure"
  150. if opts.URAccepted > 0 {
  151. if sub == nil {
  152. sub = h.evLogger.Subscribe(events.Failure)
  153. }
  154. return url, sub, sub.C()
  155. }
  156. if sub != nil {
  157. sub.Unsubscribe()
  158. }
  159. return url, nil, nil
  160. }
  161. func (h *failureHandler) addReport(data FailureData, evTime time.Time) {
  162. if stat, ok := h.buf[data.Description]; ok {
  163. stat.last = evTime
  164. stat.count++
  165. return
  166. }
  167. h.buf[data.Description] = &failureStat{
  168. first: evTime,
  169. last: evTime,
  170. count: 1,
  171. data: data,
  172. }
  173. }
  174. func (h *failureHandler) CommitConfiguration(from, to config.Configuration) bool {
  175. if from.Options.CREnabled != to.Options.CREnabled || from.Options.CRURL != to.Options.CRURL {
  176. h.optsChan <- to.Options
  177. }
  178. return true
  179. }
  180. func (*failureHandler) String() string {
  181. return "FailureHandler"
  182. }
  183. func sendFailureReports(ctx context.Context, reports []FailureReport, url string) {
  184. var b bytes.Buffer
  185. if err := json.NewEncoder(&b).Encode(reports); err != nil {
  186. panic(err)
  187. }
  188. client := &http.Client{
  189. Transport: &http.Transport{
  190. DialContext: dialer.DialContext,
  191. Proxy: http.ProxyFromEnvironment,
  192. },
  193. }
  194. reqCtx, reqCancel := context.WithTimeout(ctx, sendTimeout)
  195. defer reqCancel()
  196. req, err := http.NewRequestWithContext(reqCtx, http.MethodPost, url, &b)
  197. if err != nil {
  198. l.Infoln("Failed to send failure report:", err)
  199. return
  200. }
  201. req.Header.Set("Content-Type", "application/json")
  202. resp, err := client.Do(req)
  203. if err != nil {
  204. l.Infoln("Failed to send failure report:", err)
  205. return
  206. }
  207. resp.Body.Close()
  208. }
  209. func newFailureReport(stat *failureStat) FailureReport {
  210. return FailureReport{
  211. FailureData: stat.data,
  212. Count: stat.count,
  213. Version: build.LongVersion,
  214. }
  215. }