monitor.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581
  1. // Copyright (C) 2014 The Syncthing Authors.
  2. //
  3. // This Source Code Form is subject to the terms of the Mozilla Public
  4. // License, v. 2.0. If a copy of the MPL was not distributed with this file,
  5. // You can obtain one at https://mozilla.org/MPL/2.0/.
  6. package main
  7. import (
  8. "bufio"
  9. "context"
  10. "fmt"
  11. "io"
  12. "os"
  13. "os/exec"
  14. "os/signal"
  15. "path/filepath"
  16. "runtime"
  17. "strings"
  18. "syscall"
  19. "time"
  20. "github.com/syncthing/syncthing/lib/events"
  21. "github.com/syncthing/syncthing/lib/fs"
  22. "github.com/syncthing/syncthing/lib/locations"
  23. "github.com/syncthing/syncthing/lib/osutil"
  24. "github.com/syncthing/syncthing/lib/protocol"
  25. "github.com/syncthing/syncthing/lib/sync"
  26. "github.com/syncthing/syncthing/lib/syncthing"
  27. )
  28. var (
  29. stdoutFirstLines []string // The first 10 lines of stdout
  30. stdoutLastLines []string // The last 50 lines of stdout
  31. stdoutMut = sync.NewMutex()
  32. )
  33. const (
  34. countRestarts = 4
  35. loopThreshold = 60 * time.Second
  36. logFileAutoCloseDelay = 5 * time.Second
  37. logFileMaxOpenTime = time.Minute
  38. panicUploadMaxWait = 30 * time.Second
  39. panicUploadNoticeWait = 10 * time.Second
  40. )
  41. func monitorMain(runtimeOptions RuntimeOptions) {
  42. l.SetPrefix("[monitor] ")
  43. var dst io.Writer = os.Stdout
  44. logFile := runtimeOptions.logFile
  45. if logFile != "-" {
  46. if expanded, err := fs.ExpandTilde(logFile); err == nil {
  47. logFile = expanded
  48. }
  49. var fileDst io.Writer
  50. if runtimeOptions.logMaxSize > 0 {
  51. open := func(name string) (io.WriteCloser, error) {
  52. return newAutoclosedFile(name, logFileAutoCloseDelay, logFileMaxOpenTime), nil
  53. }
  54. fileDst = newRotatedFile(logFile, open, int64(runtimeOptions.logMaxSize), runtimeOptions.logMaxFiles)
  55. } else {
  56. fileDst = newAutoclosedFile(logFile, logFileAutoCloseDelay, logFileMaxOpenTime)
  57. }
  58. if runtime.GOOS == "windows" {
  59. // Translate line breaks to Windows standard
  60. fileDst = osutil.ReplacingWriter{
  61. Writer: fileDst,
  62. From: '\n',
  63. To: []byte{'\r', '\n'},
  64. }
  65. }
  66. // Log to both stdout and file.
  67. dst = io.MultiWriter(dst, fileDst)
  68. l.Infof(`Log output saved to file "%s"`, logFile)
  69. }
  70. args := os.Args
  71. var restarts [countRestarts]time.Time
  72. stopSign := make(chan os.Signal, 1)
  73. signal.Notify(stopSign, os.Interrupt, sigTerm)
  74. restartSign := make(chan os.Signal, 1)
  75. sigHup := syscall.Signal(1)
  76. signal.Notify(restartSign, sigHup)
  77. childEnv := childEnv()
  78. first := true
  79. for {
  80. maybeReportPanics()
  81. if t := time.Since(restarts[0]); t < loopThreshold {
  82. l.Warnf("%d restarts in %v; not retrying further", countRestarts, t)
  83. os.Exit(syncthing.ExitError.AsInt())
  84. }
  85. copy(restarts[0:], restarts[1:])
  86. restarts[len(restarts)-1] = time.Now()
  87. cmd := exec.Command(args[0], args[1:]...)
  88. cmd.Env = childEnv
  89. stderr, err := cmd.StderrPipe()
  90. if err != nil {
  91. panic(err)
  92. }
  93. stdout, err := cmd.StdoutPipe()
  94. if err != nil {
  95. panic(err)
  96. }
  97. l.Debugln("Starting syncthing")
  98. err = cmd.Start()
  99. if err != nil {
  100. l.Warnln("Error starting the main Syncthing process:", err)
  101. panic("Error starting the main Syncthing process")
  102. }
  103. stdoutMut.Lock()
  104. stdoutFirstLines = make([]string, 0, 10)
  105. stdoutLastLines = make([]string, 0, 50)
  106. stdoutMut.Unlock()
  107. wg := sync.NewWaitGroup()
  108. wg.Add(1)
  109. go func() {
  110. copyStderr(stderr, dst)
  111. wg.Done()
  112. }()
  113. wg.Add(1)
  114. go func() {
  115. copyStdout(stdout, dst)
  116. wg.Done()
  117. }()
  118. exit := make(chan error)
  119. go func() {
  120. wg.Wait()
  121. exit <- cmd.Wait()
  122. }()
  123. stopped := false
  124. select {
  125. case s := <-stopSign:
  126. l.Infof("Signal %d received; exiting", s)
  127. cmd.Process.Signal(sigTerm)
  128. err = <-exit
  129. stopped = true
  130. case s := <-restartSign:
  131. l.Infof("Signal %d received; restarting", s)
  132. cmd.Process.Signal(sigHup)
  133. err = <-exit
  134. case err = <-exit:
  135. }
  136. if err == nil {
  137. // Successful exit indicates an intentional shutdown
  138. os.Exit(syncthing.ExitSuccess.AsInt())
  139. }
  140. if exiterr, ok := err.(*exec.ExitError); ok {
  141. exitCode := exiterr.ExitCode()
  142. if stopped || runtimeOptions.noRestart {
  143. os.Exit(exitCode)
  144. }
  145. if exitCode == syncthing.ExitUpgrade.AsInt() {
  146. // Restart the monitor process to release the .old
  147. // binary as part of the upgrade process.
  148. l.Infoln("Restarting monitor...")
  149. if err = restartMonitor(args); err != nil {
  150. l.Warnln("Restart:", err)
  151. }
  152. os.Exit(exitCode)
  153. }
  154. }
  155. if runtimeOptions.noRestart {
  156. os.Exit(syncthing.ExitError.AsInt())
  157. }
  158. l.Infoln("Syncthing exited:", err)
  159. time.Sleep(1 * time.Second)
  160. if first {
  161. // Let the next child process know that this is not the first time
  162. // it's starting up.
  163. childEnv = append(childEnv, "STRESTART=yes")
  164. first = false
  165. }
  166. }
  167. }
  168. func copyStderr(stderr io.Reader, dst io.Writer) {
  169. br := bufio.NewReader(stderr)
  170. var panicFd *os.File
  171. defer func() {
  172. if panicFd != nil {
  173. _ = panicFd.Close()
  174. maybeReportPanics()
  175. }
  176. }()
  177. for {
  178. line, err := br.ReadString('\n')
  179. if err != nil {
  180. return
  181. }
  182. if panicFd == nil {
  183. dst.Write([]byte(line))
  184. if strings.Contains(line, "SIGILL") {
  185. l.Warnln(`
  186. *******************************************************************************
  187. * Crash due to illegal instruction detected. This is most likely due to a CPU *
  188. * incompatibility with the high performance hashing package. Switching to the *
  189. * standard hashing package instead. Please report this issue at: *
  190. * *
  191. * https://github.com/syncthing/syncthing/issues *
  192. * *
  193. * Include the details of your CPU. *
  194. *******************************************************************************
  195. `)
  196. os.Setenv("STHASHING", "standard")
  197. return
  198. }
  199. if strings.HasPrefix(line, "panic:") || strings.HasPrefix(line, "fatal error:") {
  200. panicFd, err = os.Create(locations.GetTimestamped(locations.PanicLog))
  201. if err != nil {
  202. l.Warnln("Create panic log:", err)
  203. continue
  204. }
  205. l.Warnf("Panic detected, writing to \"%s\"", panicFd.Name())
  206. if strings.Contains(line, "leveldb") && strings.Contains(line, "corrupt") {
  207. l.Warnln(`
  208. *********************************************************************************
  209. * Crash due to corrupt database. *
  210. * *
  211. * This crash usually occurs due to one of the following reasons: *
  212. * - Syncthing being stopped abruptly (killed/loss of power) *
  213. * - Bad hardware (memory/disk issues) *
  214. * - Software that affects disk writes (SSD caching software and simillar) *
  215. * *
  216. * Please see the following URL for instructions on how to recover: *
  217. * https://docs.syncthing.net/users/faq.html#my-syncthing-database-is-corrupt *
  218. *********************************************************************************
  219. `)
  220. } else {
  221. l.Warnln("Please check for existing issues with similar panic message at https://github.com/syncthing/syncthing/issues/")
  222. l.Warnln("If no issue with similar panic message exists, please create a new issue with the panic log attached")
  223. }
  224. stdoutMut.Lock()
  225. for _, line := range stdoutFirstLines {
  226. panicFd.WriteString(line)
  227. }
  228. panicFd.WriteString("...\n")
  229. for _, line := range stdoutLastLines {
  230. panicFd.WriteString(line)
  231. }
  232. stdoutMut.Unlock()
  233. }
  234. panicFd.WriteString("Panic at " + time.Now().Format(time.RFC3339) + "\n")
  235. }
  236. if panicFd != nil {
  237. panicFd.WriteString(line)
  238. }
  239. }
  240. }
  241. func copyStdout(stdout io.Reader, dst io.Writer) {
  242. br := bufio.NewReader(stdout)
  243. for {
  244. line, err := br.ReadString('\n')
  245. if err != nil {
  246. return
  247. }
  248. stdoutMut.Lock()
  249. if len(stdoutFirstLines) < cap(stdoutFirstLines) {
  250. stdoutFirstLines = append(stdoutFirstLines, line)
  251. } else {
  252. if l := len(stdoutLastLines); l == cap(stdoutLastLines) {
  253. stdoutLastLines = stdoutLastLines[:l-1]
  254. }
  255. stdoutLastLines = append(stdoutLastLines, line)
  256. }
  257. stdoutMut.Unlock()
  258. dst.Write([]byte(line))
  259. }
  260. }
  261. func restartMonitor(args []string) error {
  262. if runtime.GOOS != "windows" {
  263. // syscall.Exec is the cleanest way to restart on Unixes as it
  264. // replaces the current process with the new one, keeping the pid and
  265. // controlling terminal and so on
  266. return restartMonitorUnix(args)
  267. }
  268. // but it isn't supported on Windows, so there we start a normal
  269. // exec.Command and return.
  270. return restartMonitorWindows(args)
  271. }
  272. func restartMonitorUnix(args []string) error {
  273. if !strings.ContainsRune(args[0], os.PathSeparator) {
  274. // The path to the binary doesn't contain a slash, so it should be
  275. // found in $PATH.
  276. binary, err := exec.LookPath(args[0])
  277. if err != nil {
  278. return err
  279. }
  280. args[0] = binary
  281. }
  282. return syscall.Exec(args[0], args, os.Environ())
  283. }
  284. func restartMonitorWindows(args []string) error {
  285. cmd := exec.Command(args[0], args[1:]...)
  286. // Retain the standard streams
  287. cmd.Stderr = os.Stderr
  288. cmd.Stdout = os.Stdout
  289. cmd.Stdin = os.Stdin
  290. return cmd.Start()
  291. }
  292. // rotatedFile keeps a set of rotating logs. There will be the base file plus up
  293. // to maxFiles rotated ones, each ~ maxSize bytes large.
  294. type rotatedFile struct {
  295. name string
  296. create createFn
  297. maxSize int64 // bytes
  298. maxFiles int
  299. currentFile io.WriteCloser
  300. currentSize int64
  301. }
  302. // the createFn should act equivalently to os.Create
  303. type createFn func(name string) (io.WriteCloser, error)
  304. func newRotatedFile(name string, create createFn, maxSize int64, maxFiles int) *rotatedFile {
  305. return &rotatedFile{
  306. name: name,
  307. create: create,
  308. maxSize: maxSize,
  309. maxFiles: maxFiles,
  310. }
  311. }
  312. func (r *rotatedFile) Write(bs []byte) (int, error) {
  313. // Check if we're about to exceed the max size, and if so close this
  314. // file so we'll start on a new one.
  315. if r.currentSize+int64(len(bs)) > r.maxSize {
  316. r.currentFile.Close()
  317. r.currentFile = nil
  318. r.currentSize = 0
  319. }
  320. // If we have no current log, rotate old files out of the way and create
  321. // a new one.
  322. if r.currentFile == nil {
  323. r.rotate()
  324. fd, err := r.create(r.name)
  325. if err != nil {
  326. return 0, err
  327. }
  328. r.currentFile = fd
  329. }
  330. n, err := r.currentFile.Write(bs)
  331. r.currentSize += int64(n)
  332. return n, err
  333. }
  334. func (r *rotatedFile) rotate() {
  335. // The files are named "name", "name.0", "name.1", ...
  336. // "name.(r.maxFiles-1)". Increase the numbers on the
  337. // suffixed ones.
  338. for i := r.maxFiles - 1; i > 0; i-- {
  339. from := numberedFile(r.name, i-1)
  340. to := numberedFile(r.name, i)
  341. err := os.Rename(from, to)
  342. if err != nil && !os.IsNotExist(err) {
  343. fmt.Println("LOG: Rotating logs:", err)
  344. }
  345. }
  346. // Rename the base to base.0
  347. err := os.Rename(r.name, numberedFile(r.name, 0))
  348. if err != nil && !os.IsNotExist(err) {
  349. fmt.Println("LOG: Rotating logs:", err)
  350. }
  351. }
  352. // numberedFile adds the number between the file name and the extension.
  353. func numberedFile(name string, num int) string {
  354. ext := filepath.Ext(name) // contains the dot
  355. withoutExt := name[:len(name)-len(ext)]
  356. return fmt.Sprintf("%s.%d%s", withoutExt, num, ext)
  357. }
  358. // An autoclosedFile is an io.WriteCloser that opens itself for appending on
  359. // Write() and closes itself after an interval of no writes (closeDelay) or
  360. // when the file has been open for too long (maxOpenTime). A call to Write()
  361. // will return any error that happens on the resulting Open() call too. Errors
  362. // on automatic Close() calls are silently swallowed...
  363. type autoclosedFile struct {
  364. name string // path to write to
  365. closeDelay time.Duration // close after this long inactivity
  366. maxOpenTime time.Duration // or this long after opening
  367. fd io.WriteCloser // underlying WriteCloser
  368. opened time.Time // timestamp when the file was last opened
  369. closed chan struct{} // closed on Close(), stops the closerLoop
  370. closeTimer *time.Timer // fires closeDelay after a write
  371. mut sync.Mutex
  372. }
  373. func newAutoclosedFile(name string, closeDelay, maxOpenTime time.Duration) *autoclosedFile {
  374. f := &autoclosedFile{
  375. name: name,
  376. closeDelay: closeDelay,
  377. maxOpenTime: maxOpenTime,
  378. mut: sync.NewMutex(),
  379. closed: make(chan struct{}),
  380. closeTimer: time.NewTimer(time.Minute),
  381. }
  382. go f.closerLoop()
  383. return f
  384. }
  385. func (f *autoclosedFile) Write(bs []byte) (int, error) {
  386. f.mut.Lock()
  387. defer f.mut.Unlock()
  388. // Make sure the file is open for appending
  389. if err := f.ensureOpen(); err != nil {
  390. return 0, err
  391. }
  392. // If we haven't run into the maxOpenTime, postpone close for another
  393. // closeDelay
  394. if time.Since(f.opened) < f.maxOpenTime {
  395. f.closeTimer.Reset(f.closeDelay)
  396. }
  397. return f.fd.Write(bs)
  398. }
  399. func (f *autoclosedFile) Close() error {
  400. f.mut.Lock()
  401. defer f.mut.Unlock()
  402. // Stop the timer and closerLoop() routine
  403. f.closeTimer.Stop()
  404. close(f.closed)
  405. // Close the file, if it's open
  406. if f.fd != nil {
  407. return f.fd.Close()
  408. }
  409. return nil
  410. }
  411. // Must be called with f.mut held!
  412. func (f *autoclosedFile) ensureOpen() error {
  413. if f.fd != nil {
  414. // File is already open
  415. return nil
  416. }
  417. // We open the file for write only, and create it if it doesn't exist.
  418. flags := os.O_WRONLY | os.O_CREATE
  419. if f.opened.IsZero() {
  420. // This is the first time we are opening the file. We should truncate
  421. // it to better emulate an os.Create() call.
  422. flags |= os.O_TRUNC
  423. } else {
  424. // The file was already opened once, so we should append to it.
  425. flags |= os.O_APPEND
  426. }
  427. fd, err := os.OpenFile(f.name, flags, 0644)
  428. if err != nil {
  429. return err
  430. }
  431. f.fd = fd
  432. f.opened = time.Now()
  433. return nil
  434. }
  435. func (f *autoclosedFile) closerLoop() {
  436. for {
  437. select {
  438. case <-f.closeTimer.C:
  439. // Close the file when the timer expires.
  440. f.mut.Lock()
  441. if f.fd != nil {
  442. f.fd.Close() // errors, schmerrors
  443. f.fd = nil
  444. }
  445. f.mut.Unlock()
  446. case <-f.closed:
  447. return
  448. }
  449. }
  450. }
  451. // Returns the desired child environment, properly filtered and added to.
  452. func childEnv() []string {
  453. var env []string
  454. for _, str := range os.Environ() {
  455. if strings.HasPrefix(str, "STNORESTART=") {
  456. continue
  457. }
  458. if strings.HasPrefix(str, "STMONITORED=") {
  459. continue
  460. }
  461. env = append(env, str)
  462. }
  463. env = append(env, "STMONITORED=yes")
  464. return env
  465. }
  466. // maybeReportPanics tries to figure out if crash reporting is on or off,
  467. // and reports any panics it can find if it's enabled. We spend at most
  468. // panicUploadMaxWait uploading panics...
  469. func maybeReportPanics() {
  470. // Try to get a config to see if/where panics should be reported.
  471. cfg, err := loadOrDefaultConfig(protocol.EmptyDeviceID, events.NoopLogger)
  472. if err != nil {
  473. l.Warnln("Couldn't load config; not reporting crash")
  474. return
  475. }
  476. // Bail if we're not supposed to report panics.
  477. opts := cfg.Options()
  478. if !opts.CREnabled {
  479. return
  480. }
  481. // Set up a timeout on the whole operation.
  482. ctx, cancel := context.WithTimeout(context.Background(), panicUploadMaxWait)
  483. defer cancel()
  484. // Print a notice if the upload takes a long time.
  485. go func() {
  486. select {
  487. case <-ctx.Done():
  488. return
  489. case <-time.After(panicUploadNoticeWait):
  490. l.Warnln("Uploading crash reports is taking a while, please wait...")
  491. }
  492. }()
  493. // Report the panics.
  494. dir := locations.GetBaseDir(locations.ConfigBaseDir)
  495. uploadPanicLogs(ctx, opts.CRURL, dir)
  496. }