failurereporting.go 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. // Copyright (C) 2020 The Syncthing Authors.
  2. //
  3. // This Source Code Form is subject to the terms of the Mozilla Public
  4. // License, v. 2.0. If a copy of the MPL was not distributed with this file,
  5. // You can obtain one at https://mozilla.org/MPL/2.0/.
  6. package ur
  7. import (
  8. "bytes"
  9. "context"
  10. "encoding/json"
  11. "net/http"
  12. "runtime/pprof"
  13. "strings"
  14. "time"
  15. "github.com/syncthing/syncthing/lib/build"
  16. "github.com/syncthing/syncthing/lib/config"
  17. "github.com/syncthing/syncthing/lib/dialer"
  18. "github.com/syncthing/syncthing/lib/events"
  19. "github.com/syncthing/syncthing/lib/svcutil"
  20. "github.com/syncthing/syncthing/lib/tlsutil"
  21. "github.com/thejerf/suture/v4"
  22. )
  23. var (
  24. // When a specific failure first occurs, it is delayed by minDelay. If
  25. // more of the same failures occurs those are further delayed and
  26. // aggregated for maxDelay.
  27. minDelay = 10 * time.Second
  28. maxDelay = time.Minute
  29. sendTimeout = time.Minute
  30. finalSendTimeout = svcutil.ServiceTimeout / 2
  31. evChanClosed = "failure event channel closed"
  32. invalidEventDataType = "failure event data is not a string"
  33. )
  34. type FailureReport struct {
  35. FailureData
  36. Count int
  37. Version string
  38. }
  39. type FailureData struct {
  40. Description string
  41. Goroutines string
  42. Extra map[string]string
  43. }
  44. func FailureDataWithGoroutines(description string) FailureData {
  45. var buf strings.Builder
  46. pprof.Lookup("goroutine").WriteTo(&buf, 1)
  47. return FailureData{
  48. Description: description,
  49. Goroutines: buf.String(),
  50. Extra: make(map[string]string),
  51. }
  52. }
  53. type FailureHandler interface {
  54. suture.Service
  55. config.Committer
  56. }
  57. func NewFailureHandler(cfg config.Wrapper, evLogger events.Logger) FailureHandler {
  58. return &failureHandler{
  59. cfg: cfg,
  60. evLogger: evLogger,
  61. optsChan: make(chan config.OptionsConfiguration),
  62. buf: make(map[string]*failureStat),
  63. }
  64. }
  65. type failureHandler struct {
  66. cfg config.Wrapper
  67. evLogger events.Logger
  68. optsChan chan config.OptionsConfiguration
  69. buf map[string]*failureStat
  70. }
  71. type failureStat struct {
  72. first, last time.Time
  73. count int
  74. data FailureData
  75. }
  76. func (h *failureHandler) Serve(ctx context.Context) error {
  77. cfg := h.cfg.Subscribe(h)
  78. defer h.cfg.Unsubscribe(h)
  79. url, sub, evChan := h.applyOpts(cfg.Options, nil)
  80. var err error
  81. timer := time.NewTimer(minDelay)
  82. resetTimer := make(chan struct{})
  83. for err == nil {
  84. select {
  85. case opts := <-h.optsChan:
  86. url, sub, evChan = h.applyOpts(opts, sub)
  87. case e, ok := <-evChan:
  88. if !ok {
  89. // Just to be safe - shouldn't ever happen, as
  90. // evChan is set to nil when unsubscribing.
  91. h.addReport(FailureData{Description: evChanClosed}, time.Now())
  92. evChan = nil
  93. continue
  94. }
  95. var data FailureData
  96. switch d := e.Data.(type) {
  97. case string:
  98. data.Description = d
  99. case FailureData:
  100. data = d
  101. default:
  102. // Same here, shouldn't ever happen.
  103. h.addReport(FailureData{Description: invalidEventDataType}, time.Now())
  104. continue
  105. }
  106. h.addReport(data, e.Time)
  107. case <-timer.C:
  108. reports := make([]FailureReport, 0, len(h.buf))
  109. now := time.Now()
  110. for descr, stat := range h.buf {
  111. if now.Sub(stat.last) > minDelay || now.Sub(stat.first) > maxDelay {
  112. reports = append(reports, newFailureReport(stat))
  113. delete(h.buf, descr)
  114. }
  115. }
  116. if len(reports) > 0 {
  117. // Lets keep process events/configs while it might be timing out for a while
  118. go func() {
  119. sendFailureReports(ctx, reports, url)
  120. select {
  121. case resetTimer <- struct{}{}:
  122. case <-ctx.Done():
  123. }
  124. }()
  125. } else {
  126. timer.Reset(minDelay)
  127. }
  128. case <-resetTimer:
  129. timer.Reset(minDelay)
  130. case <-ctx.Done():
  131. err = ctx.Err()
  132. }
  133. }
  134. if sub != nil {
  135. sub.Unsubscribe()
  136. if len(h.buf) > 0 {
  137. reports := make([]FailureReport, 0, len(h.buf))
  138. for _, stat := range h.buf {
  139. reports = append(reports, newFailureReport(stat))
  140. }
  141. timeout, cancel := context.WithTimeout(context.Background(), finalSendTimeout)
  142. defer cancel()
  143. sendFailureReports(timeout, reports, url)
  144. }
  145. }
  146. return err
  147. }
  148. func (h *failureHandler) applyOpts(opts config.OptionsConfiguration, sub events.Subscription) (string, events.Subscription, <-chan events.Event) {
  149. // Sub nil checks just for safety - config updates can be racy.
  150. url := opts.CRURL + "/failure"
  151. if opts.URAccepted > 0 {
  152. if sub == nil {
  153. sub = h.evLogger.Subscribe(events.Failure)
  154. }
  155. return url, sub, sub.C()
  156. }
  157. if sub != nil {
  158. sub.Unsubscribe()
  159. }
  160. return url, nil, nil
  161. }
  162. func (h *failureHandler) addReport(data FailureData, evTime time.Time) {
  163. if stat, ok := h.buf[data.Description]; ok {
  164. stat.last = evTime
  165. stat.count++
  166. return
  167. }
  168. h.buf[data.Description] = &failureStat{
  169. first: evTime,
  170. last: evTime,
  171. count: 1,
  172. data: data,
  173. }
  174. }
  175. func (h *failureHandler) CommitConfiguration(from, to config.Configuration) bool {
  176. if from.Options.CREnabled != to.Options.CREnabled || from.Options.CRURL != to.Options.CRURL {
  177. h.optsChan <- to.Options
  178. }
  179. return true
  180. }
  181. func (*failureHandler) String() string {
  182. return "FailureHandler"
  183. }
  184. func sendFailureReports(ctx context.Context, reports []FailureReport, url string) {
  185. var b bytes.Buffer
  186. if err := json.NewEncoder(&b).Encode(reports); err != nil {
  187. panic(err)
  188. }
  189. client := &http.Client{
  190. Transport: &http.Transport{
  191. DialContext: dialer.DialContext,
  192. Proxy: http.ProxyFromEnvironment,
  193. TLSClientConfig: tlsutil.SecureDefaultWithTLS12(),
  194. },
  195. }
  196. reqCtx, reqCancel := context.WithTimeout(ctx, sendTimeout)
  197. defer reqCancel()
  198. req, err := http.NewRequestWithContext(reqCtx, http.MethodPost, url, &b)
  199. if err != nil {
  200. l.Infoln("Failed to send failure report:", err)
  201. return
  202. }
  203. req.Header.Set("Content-Type", "application/json")
  204. resp, err := client.Do(req)
  205. if err != nil {
  206. l.Infoln("Failed to send failure report:", err)
  207. return
  208. }
  209. resp.Body.Close()
  210. }
  211. func newFailureReport(stat *failureStat) FailureReport {
  212. return FailureReport{
  213. FailureData: stat.data,
  214. Count: stat.count,
  215. Version: build.LongVersion,
  216. }
  217. }