auto.go 17 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719
  1. // Copyright (c) 2020 Tailscale Inc & AUTHORS All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package controlclient
  5. import (
  6. "context"
  7. "fmt"
  8. "sync"
  9. "time"
  10. "tailscale.com/health"
  11. "tailscale.com/logtail/backoff"
  12. "tailscale.com/tailcfg"
  13. "tailscale.com/types/empty"
  14. "tailscale.com/types/key"
  15. "tailscale.com/types/logger"
  16. "tailscale.com/types/netmap"
  17. "tailscale.com/types/persist"
  18. "tailscale.com/types/structs"
  19. )
// LoginGoal describes the login or logout outcome that the auth routine
// has been asked to reach. Auto stores the current goal in its loginGoal
// field; a nil goal there means no login activity is desired.
type LoginGoal struct {
	_            structs.Incomparable
	wantLoggedIn bool                 // true if we *want* to be logged in
	token        *tailcfg.Oauth2Token // oauth token to use when logging in
	flags        LoginFlags           // flags to use when logging in
	url          string               // auth url that needs to be visited

	// loggedOutResult, if non-nil, is offered the result of a logout
	// attempt (nil on success) by sendLogoutError, using a non-blocking send.
	loggedOutResult chan<- error
}
  28. func (g *LoginGoal) sendLogoutError(err error) {
  29. if g.loggedOutResult == nil {
  30. return
  31. }
  32. select {
  33. case g.loggedOutResult <- err:
  34. default:
  35. }
  36. }
// Auto connects to a tailcontrol server for a node.
// It's a concrete implementation of the Client interface.
//
// Start launches two goroutines, authRoutine and mapRoutine, which
// coordinate through the fields below.
type Auto struct {
	direct  *Direct          // our interface to the server APIs
	timeNow func() time.Time // clock; opts.TimeNow, or time.Now when that is nil
	logf    logger.Logf

	// expiry and closed are declared above mu, but the code in this file
	// only reads and writes them with mu held.
	expiry *time.Time // Expiry of the most recently received netmap; nil until one arrives
	closed bool       // set by Shutdown; once set, canceled contexts are not replaced

	newMapCh              chan struct{} // readable when we must restart a map request (created with capacity 1)
	unregisterHealthWatch func()        // returned by health.RegisterWatcher; called by Shutdown

	mu sync.Mutex // mutex guards the following fields

	statusFunc     func(Status)    // called to update Client status
	paused         bool            // whether we should stop making HTTP requests
	unpauseWaiters []chan struct{} // closed and cleared by SetPaused(false)
	loggedIn       bool            // true if currently logged in
	loginGoal      *LoginGoal      // non-nil if some login activity is desired
	synced         bool            // true if our netmap is up-to-date
	// hostinfo is copied into every Status by sendStatus.
	// NOTE(review): nothing in this file assigns it — confirm whether it is
	// set elsewhere in the package or is always nil.
	hostinfo        *tailcfg.Hostinfo
	inPollNetMap    bool  // true if currently running a PollNetMap
	inLiteMapUpdate bool  // true if a lite (non-streaming) map request is outstanding
	inSendStatus    int   // number of sendStatus calls currently in progress
	state           State // most recent state, as set by authRoutine and mapRoutine

	authCtx    context.Context // context used for auth requests
	mapCtx     context.Context // context used for netmap requests
	authCancel func()          // cancel the auth context
	mapCancel  func()          // cancel the netmap context
	quit       chan struct{}   // when closed, goroutines should all exit
	authDone   chan struct{}   // when closed, auth goroutine is done
	mapDone    chan struct{}   // when closed, map goroutine is done
}
  67. // New creates and starts a new Auto.
  68. func New(opts Options) (*Auto, error) {
  69. c, err := NewNoStart(opts)
  70. if c != nil {
  71. c.Start()
  72. }
  73. return c, err
  74. }
  75. // NewNoStart creates a new Auto, but without calling Start on it.
  76. func NewNoStart(opts Options) (*Auto, error) {
  77. direct, err := NewDirect(opts)
  78. if err != nil {
  79. return nil, err
  80. }
  81. if opts.Logf == nil {
  82. opts.Logf = func(fmt string, args ...interface{}) {}
  83. }
  84. if opts.TimeNow == nil {
  85. opts.TimeNow = time.Now
  86. }
  87. c := &Auto{
  88. direct: direct,
  89. timeNow: opts.TimeNow,
  90. logf: opts.Logf,
  91. newMapCh: make(chan struct{}, 1),
  92. quit: make(chan struct{}),
  93. authDone: make(chan struct{}),
  94. mapDone: make(chan struct{}),
  95. }
  96. c.authCtx, c.authCancel = context.WithCancel(context.Background())
  97. c.mapCtx, c.mapCancel = context.WithCancel(context.Background())
  98. c.unregisterHealthWatch = health.RegisterWatcher(c.onHealthChange)
  99. return c, nil
  100. }
  101. func (c *Auto) onHealthChange(sys health.Subsystem, err error) {
  102. if sys == health.SysOverall {
  103. return
  104. }
  105. c.logf("controlclient: restarting map request for %q health change to new state: %v", sys, err)
  106. c.cancelMapSafely()
  107. }
  108. // SetPaused controls whether HTTP activity should be paused.
  109. //
  110. // The client can be paused and unpaused repeatedly, unlike Start and Shutdown, which can only be used once.
  111. func (c *Auto) SetPaused(paused bool) {
  112. c.mu.Lock()
  113. defer c.mu.Unlock()
  114. if paused == c.paused {
  115. return
  116. }
  117. c.logf("setPaused(%v)", paused)
  118. c.paused = paused
  119. if paused {
  120. // Only cancel the map routine. (The auth routine isn't expensive
  121. // so it's fine to keep it running.)
  122. c.cancelMapLocked()
  123. } else {
  124. for _, ch := range c.unpauseWaiters {
  125. close(ch)
  126. }
  127. c.unpauseWaiters = nil
  128. }
  129. }
  130. // Start starts the client's goroutines.
  131. //
  132. // It should only be called for clients created by NewNoStart.
  133. func (c *Auto) Start() {
  134. go c.authRoutine()
  135. go c.mapRoutine()
  136. }
  137. // sendNewMapRequest either sends a new OmitPeers, non-streaming map request
  138. // (to just send Hostinfo/Netinfo/Endpoints info, while keeping an existing
  139. // streaming response open), or start a new streaming one if necessary.
  140. //
  141. // It should be called whenever there's something new to tell the server.
  142. func (c *Auto) sendNewMapRequest() {
  143. c.mu.Lock()
  144. // If we're not already streaming a netmap, or if we're already stuck
  145. // in a lite update, then tear down everything and start a new stream
  146. // (which starts by sending a new map request)
  147. if !c.inPollNetMap || c.inLiteMapUpdate || !c.loggedIn {
  148. c.mu.Unlock()
  149. c.cancelMapSafely()
  150. return
  151. }
  152. // Otherwise, send a lite update that doesn't keep a
  153. // long-running stream response.
  154. defer c.mu.Unlock()
  155. c.inLiteMapUpdate = true
  156. ctx, cancel := context.WithTimeout(c.mapCtx, 10*time.Second)
  157. go func() {
  158. defer cancel()
  159. t0 := time.Now()
  160. err := c.direct.SendLiteMapUpdate(ctx)
  161. d := time.Since(t0).Round(time.Millisecond)
  162. c.mu.Lock()
  163. c.inLiteMapUpdate = false
  164. c.mu.Unlock()
  165. if err == nil {
  166. c.logf("[v1] successful lite map update in %v", d)
  167. return
  168. }
  169. if ctx.Err() == nil {
  170. c.logf("lite map update after %v: %v", d, err)
  171. }
  172. // Fall back to restarting the long-polling map
  173. // request (the old heavy way) if the lite update
  174. // failed for any reason.
  175. c.cancelMapSafely()
  176. }()
  177. }
  178. func (c *Auto) cancelAuth() {
  179. c.mu.Lock()
  180. if c.authCancel != nil {
  181. c.authCancel()
  182. }
  183. if !c.closed {
  184. c.authCtx, c.authCancel = context.WithCancel(context.Background())
  185. }
  186. c.mu.Unlock()
  187. }
  188. func (c *Auto) cancelMapLocked() {
  189. if c.mapCancel != nil {
  190. c.mapCancel()
  191. }
  192. if !c.closed {
  193. c.mapCtx, c.mapCancel = context.WithCancel(context.Background())
  194. }
  195. }
  196. func (c *Auto) cancelMapUnsafely() {
  197. c.mu.Lock()
  198. c.cancelMapLocked()
  199. c.mu.Unlock()
  200. }
  201. func (c *Auto) cancelMapSafely() {
  202. c.mu.Lock()
  203. defer c.mu.Unlock()
  204. c.logf("[v1] cancelMapSafely: synced=%v", c.synced)
  205. if c.inPollNetMap {
  206. // received at least one netmap since the last
  207. // interruption. That means the server has already
  208. // fully processed our last request, which might
  209. // include UpdateEndpoints(). Interrupt it and try
  210. // again.
  211. c.cancelMapLocked()
  212. } else {
  213. // !synced means we either haven't done a netmap
  214. // request yet, or it hasn't answered yet. So the
  215. // server is in an undefined state. If we send
  216. // another netmap request too soon, it might race
  217. // with the last one, and if we're very unlucky,
  218. // the new request will be applied before the old one,
  219. // and the wrong endpoints will get registered. We
  220. // have to tell the client to abort politely, only
  221. // after it receives a response to its existing netmap
  222. // request.
  223. select {
  224. case c.newMapCh <- struct{}{}:
  225. c.logf("[v1] cancelMapSafely: wrote to channel")
  226. default:
  227. // if channel write failed, then there was already
  228. // an outstanding newMapCh request. One is enough,
  229. // since it'll always use the latest endpoints.
  230. c.logf("[v1] cancelMapSafely: channel was full")
  231. }
  232. }
  233. }
// authRoutine is the goroutine body that drives login and logout.
//
// Each pass of the loop snapshots c.loginGoal and c.authCtx under c.mu and
// then acts on that snapshot:
//   - with no goal, it blocks until the auth context is canceled;
//   - with a logout goal, it calls TryLogout;
//   - with a login goal, it calls TryLogin, or WaitLoginURL when the goal
//     already carries a URL.
//
// Errors are logged, reported through sendStatus unless the snapshot
// context has been canceled, and followed by a call to bo.BackOff before
// the next pass.
//
// It returns once c.quit is closed, and closes c.authDone on the way out.
func (c *Auto) authRoutine() {
	defer close(c.authDone)
	bo := backoff.NewBackoff("authRoutine", c.logf, 30*time.Second)

	for {
		// Snapshot the goal and context for this pass. Login, StartLogout
		// and Logout replace c.loginGoal and then call cancelAuth, which
		// cancels this ctx and installs a new one for the next pass.
		c.mu.Lock()
		goal := c.loginGoal
		ctx := c.authCtx
		if goal != nil {
			c.logf("authRoutine: %s; wantLoggedIn=%v", c.state, goal.wantLoggedIn)
		} else {
			c.logf("authRoutine: %s; goal=nil paused=%v", c.state, c.paused)
		}
		c.mu.Unlock()

		// Non-blocking check for shutdown.
		select {
		case <-c.quit:
			c.logf("[v1] authRoutine: quit")
			return
		default:
		}

		// report logs err and forwards it to the status callback.
		report := func(err error, msg string) {
			c.logf("[v1] %s: %v", msg, err)
			// don't send status updates for context errors,
			// since context cancelation is always on purpose.
			if ctx.Err() == nil {
				c.sendStatus("authRoutine-report", err, "", nil)
			}
		}

		if goal == nil {
			// Wait for user to Login or Logout.
			<-ctx.Done()
			c.logf("[v1] authRoutine: context done.")
			continue
		}

		if !goal.wantLoggedIn {
			err := c.direct.TryLogout(ctx)
			// Offer the result (nil on success) to a waiting Logout call.
			goal.sendLogoutError(err)
			if err != nil {
				report(err, "TryLogout")
				bo.BackOff(ctx, err)
				continue
			}

			// success
			// NOTE(review): loginGoal is cleared unconditionally here, even
			// if it was replaced while TryLogout was running — confirm that
			// is intended.
			c.mu.Lock()
			c.loggedIn = false
			c.loginGoal = nil
			c.state = StateNotAuthenticated
			c.synced = false
			c.mu.Unlock()

			c.sendStatus("authRoutine-wantout", nil, "", nil)
			// presumably a nil error resets the backoff — confirm against
			// the backoff package.
			bo.BackOff(ctx, nil)
		} else { // ie. goal.wantLoggedIn
			c.mu.Lock()
			if goal.url != "" {
				c.state = StateURLVisitRequired
			} else {
				c.state = StateAuthenticating
			}
			c.mu.Unlock()

			var url string
			var err error
			var f string // name of the call made, used when reporting an error
			if goal.url != "" {
				url, err = c.direct.WaitLoginURL(ctx, goal.url)
				f = "WaitLoginURL"
			} else {
				url, err = c.direct.TryLogin(ctx, goal.token, goal.flags)
				f = "TryLogin"
			}
			if err != nil {
				report(err, f)
				bo.BackOff(ctx, err)
				continue
			}
			if url != "" {
				// goal.url ought to be empty here.
				// However, not all control servers get this right,
				// and logging about it here just generates noise.
				//
				// Replace the goal with one that waits on the returned
				// URL; the next pass will call WaitLoginURL with it.
				c.mu.Lock()
				c.loginGoal = &LoginGoal{
					wantLoggedIn: true,
					flags:        LoginDefault,
					url:          url,
				}
				c.state = StateURLVisitRequired
				c.synced = false
				c.mu.Unlock()

				// err is nil at this point (checked above).
				c.sendStatus("authRoutine-url", err, url, nil)
				bo.BackOff(ctx, err)
				continue
			}

			// success
			c.mu.Lock()
			c.loggedIn = true
			c.loginGoal = nil
			c.state = StateAuthenticated
			c.mu.Unlock()

			c.sendStatus("authRoutine-success", nil, "", nil)
			// Ask mapRoutine to restart its map request now that we are
			// logged in.
			c.cancelMapSafely()
			bo.BackOff(ctx, nil)
		}
	}
}
  336. // Expiry returns the credential expiration time, or the zero time if
  337. // the expiration time isn't known. Used in tests only.
  338. func (c *Auto) Expiry() *time.Time {
  339. c.mu.Lock()
  340. defer c.mu.Unlock()
  341. return c.expiry
  342. }
// Direct returns the underlying direct client object that c uses for its
// server requests. The returned pointer is c's own instance, not a copy.
// Used in tests only.
func (c *Auto) Direct() *Direct {
	return c.direct
}
  348. // unpausedChanLocked returns a new channel that is closed when the
  349. // current Auto pause is unpaused.
  350. //
  351. // c.mu must be held
  352. func (c *Auto) unpausedChanLocked() <-chan struct{} {
  353. unpaused := make(chan struct{})
  354. c.unpauseWaiters = append(c.unpauseWaiters, unpaused)
  355. return unpaused
  356. }
// mapRoutine is the goroutine body that keeps a netmap poll running
// against the control server.
//
// Each pass of the loop does one of the following:
//   - while paused, it parks until SetPaused(false) closes its waiter
//     channel or c.quit is closed;
//   - while not logged in, it marks the client unsynced and blocks until
//     the map context is canceled or newMapCh is signaled;
//   - otherwise it runs c.direct.PollNetMap, publishing each received
//     netmap through sendStatus, and calls bo.BackOff before the next pass.
//
// It returns once c.quit is closed, and closes c.mapDone on the way out.
func (c *Auto) mapRoutine() {
	defer close(c.mapDone)
	bo := backoff.NewBackoff("mapRoutine", c.logf, 30*time.Second)

	for {
		c.mu.Lock()
		if c.paused {
			// Register for the unpause notification while still holding
			// the lock, then wait outside it.
			unpaused := c.unpausedChanLocked()
			c.mu.Unlock()
			c.logf("mapRoutine: awaiting unpause")
			select {
			case <-unpaused:
				c.logf("mapRoutine: unpaused")
			case <-c.quit:
				c.logf("mapRoutine: quit")
				return
			}
			continue
		}
		c.logf("mapRoutine: %s", c.state)
		// Snapshot the login state and context for this pass.
		loggedIn := c.loggedIn
		ctx := c.mapCtx
		c.mu.Unlock()

		// Non-blocking check for shutdown.
		select {
		case <-c.quit:
			c.logf("mapRoutine: quit")
			return
		default:
		}

		// report logs err and forwards it, wrapped with msg, to the
		// status callback.
		report := func(err error, msg string) {
			c.logf("[v1] %s: %v", msg, err)
			err = fmt.Errorf("%s: %w", msg, err)
			// don't send status updates for context errors,
			// since context cancelation is always on purpose.
			if ctx.Err() == nil {
				c.sendStatus("mapRoutine1", err, "", nil)
			}
		}

		if !loggedIn {
			// Wait for something interesting to happen
			c.mu.Lock()
			c.synced = false
			// c.state is set by authRoutine()
			c.mu.Unlock()

			// c.quit is not watched here; Shutdown wakes this select by
			// canceling the map context after closing c.quit.
			select {
			case <-ctx.Done():
				c.logf("mapRoutine: context done.")
			case <-c.newMapCh:
				c.logf("mapRoutine: new map needed while idle.")
			}
		} else {
			// Be sure this is false when we're not inside
			// PollNetMap, so that cancelMapSafely() can notify
			// us correctly.
			c.mu.Lock()
			c.inPollNetMap = false
			c.mu.Unlock()
			health.SetInPollNetMap(false)

			// The callback below is invoked with each netmap received by
			// PollNetMap.
			err := c.direct.PollNetMap(ctx, -1, func(nm *netmap.NetworkMap) {
				health.SetInPollNetMap(true)
				c.mu.Lock()

				// A token in newMapCh means cancelMapSafely asked for a
				// restart before this poll had delivered a netmap.
				select {
				case <-c.newMapCh:
					c.logf("[v1] mapRoutine: new map request during PollNetMap. canceling.")
					c.cancelMapLocked()

					// Don't emit this netmap; we're
					// about to request a fresh one.
					c.mu.Unlock()
					return
				default:
				}

				c.synced = true
				c.inPollNetMap = true
				if c.loggedIn {
					c.state = StateSynchronized
				}
				// Copy the expiry so c.expiry does not point into nm.
				exp := nm.Expiry
				c.expiry = &exp
				stillAuthed := c.loggedIn
				state := c.state

				c.mu.Unlock()

				c.logf("[v1] mapRoutine: netmap received: %s", state)
				if stillAuthed {
					c.sendStatus("mapRoutine-got-netmap", nil, "", nm)
				}
			})

			// PollNetMap has returned: we are no longer polling or synced.
			health.SetInPollNetMap(false)
			c.mu.Lock()
			c.synced = false
			c.inPollNetMap = false
			if c.state == StateSynchronized {
				c.state = StateAuthenticated
			}
			paused := c.paused
			c.mu.Unlock()

			if paused {
				// Skip error reporting and backoff; the next pass parks
				// in the paused branch at the top of the loop.
				c.logf("mapRoutine: paused")
				continue
			}

			if err != nil {
				report(err, "PollNetMap")
				bo.BackOff(ctx, err)
				continue
			}
			// presumably a nil error resets the backoff — confirm against
			// the backoff package.
			bo.BackOff(ctx, nil)
		}
	}
}
  464. func (c *Auto) AuthCantContinue() bool {
  465. if c == nil {
  466. return true
  467. }
  468. c.mu.Lock()
  469. defer c.mu.Unlock()
  470. return !c.loggedIn && (c.loginGoal == nil || c.loginGoal.url != "")
  471. }
  472. // SetStatusFunc sets fn as the callback to run on any status change.
  473. func (c *Auto) SetStatusFunc(fn func(Status)) {
  474. c.mu.Lock()
  475. c.statusFunc = fn
  476. c.mu.Unlock()
  477. }
  478. func (c *Auto) SetHostinfo(hi *tailcfg.Hostinfo) {
  479. if hi == nil {
  480. panic("nil Hostinfo")
  481. }
  482. if !c.direct.SetHostinfo(hi) {
  483. // No changes. Don't log.
  484. return
  485. }
  486. // Send new Hostinfo to server
  487. c.sendNewMapRequest()
  488. }
  489. func (c *Auto) SetNetInfo(ni *tailcfg.NetInfo) {
  490. if ni == nil {
  491. panic("nil NetInfo")
  492. }
  493. if !c.direct.SetNetInfo(ni) {
  494. return
  495. }
  496. c.logf("NetInfo: %v", ni)
  497. // Send new Hostinfo (which includes NetInfo) to server
  498. c.sendNewMapRequest()
  499. }
  500. func (c *Auto) sendStatus(who string, err error, url string, nm *netmap.NetworkMap) {
  501. c.mu.Lock()
  502. state := c.state
  503. loggedIn := c.loggedIn
  504. synced := c.synced
  505. statusFunc := c.statusFunc
  506. hi := c.hostinfo
  507. c.inSendStatus++
  508. c.mu.Unlock()
  509. c.logf("[v1] sendStatus: %s: %v", who, state)
  510. var p *persist.Persist
  511. var loginFin, logoutFin *empty.Message
  512. if state == StateAuthenticated {
  513. loginFin = new(empty.Message)
  514. }
  515. if state == StateNotAuthenticated {
  516. logoutFin = new(empty.Message)
  517. }
  518. if nm != nil && loggedIn && synced {
  519. pp := c.direct.GetPersist()
  520. p = &pp
  521. } else {
  522. // don't send netmap status, as it's misleading when we're
  523. // not logged in.
  524. nm = nil
  525. }
  526. new := Status{
  527. LoginFinished: loginFin,
  528. LogoutFinished: logoutFin,
  529. URL: url,
  530. Persist: p,
  531. NetMap: nm,
  532. Hostinfo: hi,
  533. State: state,
  534. Err: err,
  535. }
  536. if statusFunc != nil {
  537. statusFunc(new)
  538. }
  539. c.mu.Lock()
  540. c.inSendStatus--
  541. c.mu.Unlock()
  542. }
  543. func (c *Auto) Login(t *tailcfg.Oauth2Token, flags LoginFlags) {
  544. c.logf("client.Login(%v, %v)", t != nil, flags)
  545. c.mu.Lock()
  546. c.loginGoal = &LoginGoal{
  547. wantLoggedIn: true,
  548. token: t,
  549. flags: flags,
  550. }
  551. c.mu.Unlock()
  552. c.cancelAuth()
  553. }
  554. func (c *Auto) StartLogout() {
  555. c.logf("client.StartLogout()")
  556. c.mu.Lock()
  557. c.loginGoal = &LoginGoal{
  558. wantLoggedIn: false,
  559. }
  560. c.mu.Unlock()
  561. c.cancelAuth()
  562. }
  563. func (c *Auto) Logout(ctx context.Context) error {
  564. c.logf("client.Logout()")
  565. errc := make(chan error, 1)
  566. c.mu.Lock()
  567. c.loginGoal = &LoginGoal{
  568. wantLoggedIn: false,
  569. loggedOutResult: errc,
  570. }
  571. c.mu.Unlock()
  572. c.cancelAuth()
  573. timer := time.NewTimer(10 * time.Second)
  574. defer timer.Stop()
  575. select {
  576. case err := <-errc:
  577. return err
  578. case <-ctx.Done():
  579. return ctx.Err()
  580. case <-timer.C:
  581. return context.DeadlineExceeded
  582. }
  583. }
  584. // UpdateEndpoints sets the client's discovered endpoints and sends
  585. // them to the control server if they've changed.
  586. //
  587. // It does not retain the provided slice.
  588. //
  589. // The localPort field is unused except for integration tests in
  590. // another repo.
  591. func (c *Auto) UpdateEndpoints(localPort uint16, endpoints []tailcfg.Endpoint) {
  592. changed := c.direct.SetEndpoints(localPort, endpoints)
  593. if changed {
  594. c.sendNewMapRequest()
  595. }
  596. }
  597. func (c *Auto) Shutdown() {
  598. c.logf("client.Shutdown()")
  599. c.mu.Lock()
  600. inSendStatus := c.inSendStatus
  601. closed := c.closed
  602. if !closed {
  603. c.closed = true
  604. c.statusFunc = nil
  605. }
  606. c.mu.Unlock()
  607. c.logf("client.Shutdown: inSendStatus=%v", inSendStatus)
  608. if !closed {
  609. c.unregisterHealthWatch()
  610. close(c.quit)
  611. c.cancelAuth()
  612. <-c.authDone
  613. c.cancelMapUnsafely()
  614. <-c.mapDone
  615. c.logf("Client.Shutdown done.")
  616. }
  617. }
  618. // NodePublicKey returns the node public key currently in use. This is
  619. // used exclusively in tests.
  620. func (c *Auto) TestOnlyNodePublicKey() key.NodePublic {
  621. priv := c.direct.GetPersist()
  622. return priv.PrivateNodeKey.Public()
  623. }
  624. func (c *Auto) TestOnlySetAuthKey(authkey string) {
  625. c.direct.mu.Lock()
  626. defer c.direct.mu.Unlock()
  627. c.direct.authKey = authkey
  628. }
// TestOnlyTimeNow returns the current time according to c's clock
// function: the TimeNow option given at construction, or time.Now when
// that option was nil. As the name says, it is meant for tests only.
func (c *Auto) TestOnlyTimeNow() time.Time {
	return c.timeNow()
}
// SetDNS sends the SetDNSRequest request to the control plane server,
// requesting a DNS record be created or updated.
//
// It delegates to the underlying Direct client and returns that call's
// error unchanged.
func (c *Auto) SetDNS(ctx context.Context, req *tailcfg.SetDNSRequest) error {
	return c.direct.SetDNS(ctx, req)
}