failurereporting.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. // Copyright (C) 2020 The Syncthing Authors.
  2. //
  3. // This Source Code Form is subject to the terms of the Mozilla Public
  4. // License, v. 2.0. If a copy of the MPL was not distributed with this file,
  5. // You can obtain one at https://mozilla.org/MPL/2.0/.
  6. package ur
  7. import (
  8. "bytes"
  9. "context"
  10. "encoding/json"
  11. "net/http"
  12. "runtime/pprof"
  13. "strings"
  14. "time"
  15. "github.com/syncthing/syncthing/lib/build"
  16. "github.com/syncthing/syncthing/lib/config"
  17. "github.com/syncthing/syncthing/lib/dialer"
  18. "github.com/syncthing/syncthing/lib/events"
  19. "github.com/syncthing/syncthing/lib/svcutil"
  20. "github.com/thejerf/suture/v4"
  21. )
  22. var (
  23. // When a specific failure first occurs, it is delayed by minDelay. If
  24. // more of the same failures occurs those are further delayed and
  25. // aggregated for maxDelay.
  26. minDelay = 10 * time.Second
  27. maxDelay = time.Minute
  28. sendTimeout = time.Minute
  29. finalSendTimeout = svcutil.ServiceTimeout / 2
  30. evChanClosed = "failure event channel closed"
  31. invalidEventDataType = "failure event data is not a string"
  32. )
// FailureReport is a single entry of the JSON payload posted by
// sendFailureReports. It is built by newFailureReport from an aggregated
// failureStat.
type FailureReport struct {
	// FailureData is the data of the first occurrence of the failure.
	FailureData
	// Count is the number of occurrences aggregated into this report.
	Count int
	// Version is set from build.LongVersion when the report is created.
	Version string
}
// FailureData describes a failure. It can be used as the data of a failure
// event; Serve accepts either a plain string or a FailureData there.
type FailureData struct {
	// Description identifies the failure; occurrences with the same
	// description are aggregated into one report.
	Description string
	// Goroutines holds a goroutine dump when the value was created by
	// FailureDataWithGoroutines, otherwise it is left as provided.
	Goroutines string
	// Extra is not populated in this file.
	// NOTE(review): presumably free-form additional context set by callers - confirm.
	Extra map[string]string
}
  43. func FailureDataWithGoroutines(description string) FailureData {
  44. var buf strings.Builder
  45. pprof.Lookup("goroutine").WriteTo(&buf, 1)
  46. return FailureData{
  47. Description: description,
  48. Goroutines: buf.String(),
  49. }
  50. }
// FailureHandler is the interface returned by NewFailureHandler: a suture
// service that is also a config committer, so that it can be subscribed to
// configuration changes.
type FailureHandler interface {
	suture.Service
	config.Committer
}
  55. func NewFailureHandler(cfg config.Wrapper, evLogger events.Logger) FailureHandler {
  56. return &failureHandler{
  57. cfg: cfg,
  58. evLogger: evLogger,
  59. optsChan: make(chan config.OptionsConfiguration),
  60. buf: make(map[string]*failureStat),
  61. }
  62. }
// failureHandler is the FailureHandler implementation created by
// NewFailureHandler.
type failureHandler struct {
	// cfg is subscribed to for the duration of Serve.
	cfg config.Wrapper
	// evLogger is where the failure event subscription is created.
	evLogger events.Logger
	// optsChan carries new options from CommitConfiguration to Serve.
	optsChan chan config.OptionsConfiguration
	// buf holds the not yet reported failures, keyed by description. In this
	// file it is only touched from Serve (directly and through addReport).
	buf map[string]*failureStat
}
// failureStat aggregates the occurrences of one failure until it is
// reported.
type failureStat struct {
	// first and last are the times of the first and the most recent
	// occurrence.
	first, last time.Time
	// count is the number of occurrences seen so far.
	count int
	// data is the failure data of the first occurrence.
	data FailureData
}
  74. func (h *failureHandler) Serve(ctx context.Context) error {
  75. cfg := h.cfg.Subscribe(h)
  76. defer h.cfg.Unsubscribe(h)
  77. url, sub, evChan := h.applyOpts(cfg.Options, nil)
  78. var err error
  79. timer := time.NewTimer(minDelay)
  80. resetTimer := make(chan struct{})
  81. for err == nil {
  82. select {
  83. case opts := <-h.optsChan:
  84. url, sub, evChan = h.applyOpts(opts, sub)
  85. case e, ok := <-evChan:
  86. if !ok {
  87. // Just to be safe - shouldn't ever happen, as
  88. // evChan is set to nil when unsubscribing.
  89. h.addReport(FailureData{Description: evChanClosed}, time.Now())
  90. evChan = nil
  91. continue
  92. }
  93. var data FailureData
  94. switch d := e.Data.(type) {
  95. case string:
  96. data.Description = d
  97. case FailureData:
  98. data = d
  99. default:
  100. // Same here, shouldn't ever happen.
  101. h.addReport(FailureData{Description: invalidEventDataType}, time.Now())
  102. continue
  103. }
  104. h.addReport(data, e.Time)
  105. case <-timer.C:
  106. reports := make([]FailureReport, 0, len(h.buf))
  107. now := time.Now()
  108. for descr, stat := range h.buf {
  109. if now.Sub(stat.last) > minDelay || now.Sub(stat.first) > maxDelay {
  110. reports = append(reports, newFailureReport(stat))
  111. delete(h.buf, descr)
  112. }
  113. }
  114. if len(reports) > 0 {
  115. // Lets keep process events/configs while it might be timing out for a while
  116. go func() {
  117. sendFailureReports(ctx, reports, url)
  118. select {
  119. case resetTimer <- struct{}{}:
  120. case <-ctx.Done():
  121. }
  122. }()
  123. } else {
  124. timer.Reset(minDelay)
  125. }
  126. case <-resetTimer:
  127. timer.Reset(minDelay)
  128. case <-ctx.Done():
  129. err = ctx.Err()
  130. }
  131. }
  132. if sub != nil {
  133. sub.Unsubscribe()
  134. if len(h.buf) > 0 {
  135. reports := make([]FailureReport, 0, len(h.buf))
  136. for _, stat := range h.buf {
  137. reports = append(reports, newFailureReport(stat))
  138. }
  139. timeout, cancel := context.WithTimeout(context.Background(), finalSendTimeout)
  140. defer cancel()
  141. sendFailureReports(timeout, reports, url)
  142. }
  143. }
  144. return err
  145. }
  146. func (h *failureHandler) applyOpts(opts config.OptionsConfiguration, sub events.Subscription) (string, events.Subscription, <-chan events.Event) {
  147. // Sub nil checks just for safety - config updates can be racy.
  148. url := opts.CRURL + "/failure"
  149. if opts.URAccepted > 0 {
  150. if sub == nil {
  151. sub = h.evLogger.Subscribe(events.Failure)
  152. }
  153. return url, sub, sub.C()
  154. }
  155. if sub != nil {
  156. sub.Unsubscribe()
  157. }
  158. return url, nil, nil
  159. }
  160. func (h *failureHandler) addReport(data FailureData, evTime time.Time) {
  161. if stat, ok := h.buf[data.Description]; ok {
  162. stat.last = evTime
  163. stat.count++
  164. return
  165. }
  166. h.buf[data.Description] = &failureStat{
  167. first: evTime,
  168. last: evTime,
  169. count: 1,
  170. data: data,
  171. }
  172. }
  173. func (h *failureHandler) CommitConfiguration(from, to config.Configuration) bool {
  174. if from.Options.CREnabled != to.Options.CREnabled || from.Options.CRURL != to.Options.CRURL {
  175. h.optsChan <- to.Options
  176. }
  177. return true
  178. }
  179. func (*failureHandler) String() string {
  180. return "FailureHandler"
  181. }
  182. func sendFailureReports(ctx context.Context, reports []FailureReport, url string) {
  183. var b bytes.Buffer
  184. if err := json.NewEncoder(&b).Encode(reports); err != nil {
  185. panic(err)
  186. }
  187. client := &http.Client{
  188. Transport: &http.Transport{
  189. DialContext: dialer.DialContext,
  190. Proxy: http.ProxyFromEnvironment,
  191. },
  192. }
  193. reqCtx, reqCancel := context.WithTimeout(ctx, sendTimeout)
  194. defer reqCancel()
  195. req, err := http.NewRequestWithContext(reqCtx, http.MethodPost, url, &b)
  196. if err != nil {
  197. l.Infoln("Failed to send failure report:", err)
  198. return
  199. }
  200. req.Header.Set("Content-Type", "application/json")
  201. resp, err := client.Do(req)
  202. if err != nil {
  203. l.Infoln("Failed to send failure report:", err)
  204. return
  205. }
  206. resp.Body.Close()
  207. }
  208. func newFailureReport(stat *failureStat) FailureReport {
  209. return FailureReport{
  210. FailureData: stat.data,
  211. Count: stat.count,
  212. Version: build.LongVersion,
  213. }
  214. }