client.go 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. // Copyright (c) Tailscale Inc & AUTHORS
  2. // SPDX-License-Identifier: BSD-3-Clause
  3. //go:build !js
  4. // Package controlhttp implements the Tailscale 2021 control protocol
  5. // base transport over HTTP.
  6. //
  7. // This tunnels the protocol in control/controlbase over HTTP with a
  8. // variety of compatibility fallbacks for handling picky or deep
  9. // inspecting proxies.
  10. //
  11. // In the happy path, a client makes a single cleartext HTTP request
  12. // to the server, the server responds with 101 Switching Protocols,
  13. // and the control base protocol takes place over plain TCP.
  14. //
  15. // In the compatibility path, the client does the above over HTTPS,
  16. // resulting in double encryption (once for the control transport, and
  17. // once for the outer TLS layer).
  18. package controlhttp
  19. import (
  20. "context"
  21. "crypto/tls"
  22. "encoding/base64"
  23. "errors"
  24. "fmt"
  25. "io"
  26. "math"
  27. "net"
  28. "net/http"
  29. "net/http/httptrace"
  30. "net/netip"
  31. "net/url"
  32. "runtime"
  33. "sort"
  34. "sync/atomic"
  35. "time"
  36. "tailscale.com/control/controlbase"
  37. "tailscale.com/envknob"
  38. "tailscale.com/health"
  39. "tailscale.com/net/dnscache"
  40. "tailscale.com/net/dnsfallback"
  41. "tailscale.com/net/netutil"
  42. "tailscale.com/net/sockstats"
  43. "tailscale.com/net/tlsdial"
  44. "tailscale.com/net/tshttpproxy"
  45. "tailscale.com/syncs"
  46. "tailscale.com/tailcfg"
  47. "tailscale.com/tstime"
  48. "tailscale.com/util/multierr"
  49. )
// stdDialer is a zero-value net.Dialer. tryURLUpgrade uses its DialContext
// method when the Dialer does not supply its own dial function.
var stdDialer net.Dialer
  51. // Dial connects to the HTTP server at this Dialer's Host:HTTPPort, requests to
  52. // switch to the Tailscale control protocol, and returns an established control
  53. // protocol connection.
  54. //
  55. // If Dial fails to connect using HTTP, it also tries to tunnel over TLS to the
  56. // Dialer's Host:HTTPSPort as a compatibility fallback.
  57. //
  58. // The provided ctx is only used for the initial connection, until
  59. // Dial returns. It does not affect the connection once established.
  60. func (a *Dialer) Dial(ctx context.Context) (*ClientConn, error) {
  61. if a.Hostname == "" {
  62. return nil, errors.New("required Dialer.Hostname empty")
  63. }
  64. return a.dial(ctx)
  65. }
  66. func (a *Dialer) logf(format string, args ...any) {
  67. if a.Logf != nil {
  68. a.Logf(format, args...)
  69. }
  70. }
  71. func (a *Dialer) getProxyFunc() func(*http.Request) (*url.URL, error) {
  72. if a.proxyFunc != nil {
  73. return a.proxyFunc
  74. }
  75. return tshttpproxy.ProxyFromEnvironment
  76. }
  77. // httpsFallbackDelay is how long we'll wait for a.HTTPPort to work before
  78. // starting to try a.HTTPSPort.
  79. func (a *Dialer) httpsFallbackDelay() time.Duration {
  80. if forceNoise443() {
  81. return time.Nanosecond
  82. }
  83. if v := a.testFallbackDelay; v != 0 {
  84. return v
  85. }
  86. return 500 * time.Millisecond
  87. }
// Register the TS_USE_CONTROL_DIAL_PLAN envknob during package
// initialization. dial reads the knob through envknob.BoolDefaultTrue to
// decide whether the dial plan is used.
var _ = envknob.RegisterBool("TS_USE_CONTROL_DIAL_PLAN") // to record at init time whether it's in use
  89. func (a *Dialer) dial(ctx context.Context) (*ClientConn, error) {
  90. // If we don't have a dial plan, just fall back to dialing the single
  91. // host we know about.
  92. useDialPlan := envknob.BoolDefaultTrue("TS_USE_CONTROL_DIAL_PLAN")
  93. if !useDialPlan || a.DialPlan == nil || len(a.DialPlan.Candidates) == 0 {
  94. return a.dialHost(ctx, netip.Addr{})
  95. }
  96. candidates := a.DialPlan.Candidates
  97. // Otherwise, we try dialing per the plan. Store the highest priority
  98. // in the list, so that if we get a connection to one of those
  99. // candidates we can return quickly.
  100. var highestPriority int = math.MinInt
  101. for _, c := range candidates {
  102. if c.Priority > highestPriority {
  103. highestPriority = c.Priority
  104. }
  105. }
  106. // This context allows us to cancel in-flight connections if we get a
  107. // highest-priority connection before we're all done.
  108. ctx, cancel := context.WithCancel(ctx)
  109. defer cancel()
  110. // Now, for each candidate, kick off a dial in parallel.
  111. type dialResult struct {
  112. conn *ClientConn
  113. err error
  114. addr netip.Addr
  115. priority int
  116. }
  117. resultsCh := make(chan dialResult, len(candidates))
  118. var pending atomic.Int32
  119. pending.Store(int32(len(candidates)))
  120. for _, c := range candidates {
  121. go func(ctx context.Context, c tailcfg.ControlIPCandidate) {
  122. var (
  123. conn *ClientConn
  124. err error
  125. )
  126. // Always send results back to our channel.
  127. defer func() {
  128. resultsCh <- dialResult{conn, err, c.IP, c.Priority}
  129. if pending.Add(-1) == 0 {
  130. close(resultsCh)
  131. }
  132. }()
  133. // If non-zero, wait the configured start timeout
  134. // before we do anything.
  135. if c.DialStartDelaySec > 0 {
  136. a.logf("[v2] controlhttp: waiting %.2f seconds before dialing %q @ %v", c.DialStartDelaySec, a.Hostname, c.IP)
  137. if a.Clock == nil {
  138. a.Clock = tstime.StdClock{}
  139. }
  140. tmr, tmrChannel := a.Clock.NewTimer(time.Duration(c.DialStartDelaySec * float64(time.Second)))
  141. defer tmr.Stop()
  142. select {
  143. case <-ctx.Done():
  144. err = ctx.Err()
  145. return
  146. case <-tmrChannel:
  147. }
  148. }
  149. // Now, create a sub-context with the given timeout and
  150. // try dialing the provided host.
  151. ctx, cancel := context.WithTimeout(ctx, time.Duration(c.DialTimeoutSec*float64(time.Second)))
  152. defer cancel()
  153. // This will dial, and the defer above sends it back to our parent.
  154. a.logf("[v2] controlhttp: trying to dial %q @ %v", a.Hostname, c.IP)
  155. conn, err = a.dialHost(ctx, c.IP)
  156. }(ctx, c)
  157. }
  158. var results []dialResult
  159. for res := range resultsCh {
  160. // If we get a response that has the highest priority, we don't
  161. // need to wait for any of the other connections to finish; we
  162. // can just return this connection.
  163. //
  164. // TODO(andrew): we could make this better by keeping track of
  165. // the highest remaining priority dynamically, instead of just
  166. // checking for the highest total
  167. if res.priority == highestPriority && res.conn != nil {
  168. a.logf("[v1] controlhttp: high-priority success dialing %q @ %v from dial plan", a.Hostname, res.addr)
  169. // Drain the channel and any existing connections in
  170. // the background.
  171. go func() {
  172. for _, res := range results {
  173. if res.conn != nil {
  174. res.conn.Close()
  175. }
  176. }
  177. for res := range resultsCh {
  178. if res.conn != nil {
  179. res.conn.Close()
  180. }
  181. }
  182. if a.drainFinished != nil {
  183. close(a.drainFinished)
  184. }
  185. }()
  186. return res.conn, nil
  187. }
  188. // This isn't a highest-priority result, so just store it until
  189. // we're done.
  190. results = append(results, res)
  191. }
  192. // After we finish this function, close any remaining open connections.
  193. defer func() {
  194. for _, result := range results {
  195. // Note: below, we nil out the returned connection (if
  196. // any) in the slice so we don't close it.
  197. if result.conn != nil {
  198. result.conn.Close()
  199. }
  200. }
  201. // We don't drain asynchronously after this point, so notify our
  202. // channel when we return.
  203. if a.drainFinished != nil {
  204. close(a.drainFinished)
  205. }
  206. }()
  207. // Sort by priority, then take the first non-error response.
  208. sort.Slice(results, func(i, j int) bool {
  209. // NOTE: intentionally inverted so that the highest priority
  210. // item comes first
  211. return results[i].priority > results[j].priority
  212. })
  213. var (
  214. conn *ClientConn
  215. errs []error
  216. )
  217. for i, result := range results {
  218. if result.err != nil {
  219. errs = append(errs, result.err)
  220. continue
  221. }
  222. a.logf("[v1] controlhttp: succeeded dialing %q @ %v from dial plan", a.Hostname, result.addr)
  223. conn = result.conn
  224. results[i].conn = nil // so we don't close it in the defer
  225. return conn, nil
  226. }
  227. merr := multierr.New(errs...)
  228. // If we get here, then we didn't get anywhere with our dial plan; fall back to just using DNS.
  229. a.logf("controlhttp: failed dialing using DialPlan, falling back to DNS; errs=%s", merr.Error())
  230. return a.dialHost(ctx, netip.Addr{})
  231. }
// The TS_FORCE_NOISE_443 envknob forces the controlclient noise dialer to
// always use port 443 HTTPS connections to the controlplane and not try the
// port 80 HTTP fast path.
//
// This is currently (2023-01-17) needed for Docker Desktop's "VPNKit" proxy
// that breaks port 80 for us post-Noise-handshake, causing us to never try port
// 443. Until one of Docker's proxy and/or this package's port 443 fallback is
// fixed, this is a workaround. It might also be useful for future debugging.
var forceNoise443 = envknob.RegisterBool("TS_FORCE_NOISE_443")

// The TS_DEBUG_NOISE_DIAL envknob makes dialHost log every noise dial attempt
// (URL and address) before it starts and again with its result.
var debugNoiseDial = envknob.RegisterBool("TS_DEBUG_NOISE_DIAL")
  242. // dialHost connects to the configured Dialer.Hostname and upgrades the
  243. // connection into a controlbase.Conn. If addr is valid, then no DNS is used
  244. // and the connection will be made to the provided address.
  245. func (a *Dialer) dialHost(ctx context.Context, addr netip.Addr) (*ClientConn, error) {
  246. // Create one shared context used by both port 80 and port 443 dials.
  247. // If port 80 is still in flight when 443 returns, this deferred cancel
  248. // will stop the port 80 dial.
  249. ctx, cancel := context.WithCancel(ctx)
  250. defer cancel()
  251. ctx = sockstats.WithSockStats(ctx, sockstats.LabelControlClientDialer, a.logf)
  252. // u80 and u443 are the URLs we'll try to hit over HTTP or HTTPS,
  253. // respectively, in order to do the HTTP upgrade to a net.Conn over which
  254. // we'll speak Noise.
  255. u80 := &url.URL{
  256. Scheme: "http",
  257. Host: net.JoinHostPort(a.Hostname, strDef(a.HTTPPort, "80")),
  258. Path: serverUpgradePath,
  259. }
  260. u443 := &url.URL{
  261. Scheme: "https",
  262. Host: net.JoinHostPort(a.Hostname, strDef(a.HTTPSPort, "443")),
  263. Path: serverUpgradePath,
  264. }
  265. type tryURLRes struct {
  266. u *url.URL // input (the URL conn+err are for/from)
  267. conn *ClientConn // result (mutually exclusive with err)
  268. err error
  269. }
  270. ch := make(chan tryURLRes) // must be unbuffered
  271. try := func(u *url.URL) {
  272. if debugNoiseDial() {
  273. a.logf("trying noise dial (%v, %v) ...", u, addr)
  274. }
  275. cbConn, err := a.dialURL(ctx, u, addr)
  276. if debugNoiseDial() {
  277. a.logf("noise dial (%v, %v) = (%v, %v)", u, addr, cbConn, err)
  278. }
  279. select {
  280. case ch <- tryURLRes{u, cbConn, err}:
  281. case <-ctx.Done():
  282. if cbConn != nil {
  283. cbConn.Close()
  284. }
  285. }
  286. }
  287. // Start the plaintext HTTP attempt first, unless disabled by the envknob.
  288. if !forceNoise443() {
  289. go try(u80)
  290. }
  291. // In case outbound port 80 blocked or MITM'ed poorly, start a backup timer
  292. // to dial port 443 if port 80 doesn't either succeed or fail quickly.
  293. if a.Clock == nil {
  294. a.Clock = tstime.StdClock{}
  295. }
  296. try443Timer := a.Clock.AfterFunc(a.httpsFallbackDelay(), func() { try(u443) })
  297. defer try443Timer.Stop()
  298. var err80, err443 error
  299. for {
  300. select {
  301. case <-ctx.Done():
  302. return nil, fmt.Errorf("connection attempts aborted by context: %w", ctx.Err())
  303. case res := <-ch:
  304. if res.err == nil {
  305. return res.conn, nil
  306. }
  307. switch res.u {
  308. case u80:
  309. // Connecting over plain HTTP failed; assume it's an HTTP proxy
  310. // being difficult and see if we can get through over HTTPS.
  311. err80 = res.err
  312. // Stop the fallback timer and run it immediately. We don't use
  313. // Timer.Reset(0) here because on AfterFuncs, that can run it
  314. // again.
  315. if try443Timer.Stop() {
  316. go try(u443)
  317. } // else we lost the race and it started already which is what we want
  318. case u443:
  319. err443 = res.err
  320. default:
  321. panic("invalid")
  322. }
  323. if err80 != nil && err443 != nil {
  324. return nil, fmt.Errorf("all connection attempts failed (HTTP: %v, HTTPS: %v)", err80, err443)
  325. }
  326. }
  327. }
  328. }
  329. // dialURL attempts to connect to the given URL.
  330. func (a *Dialer) dialURL(ctx context.Context, u *url.URL, addr netip.Addr) (*ClientConn, error) {
  331. init, cont, err := controlbase.ClientDeferred(a.MachineKey, a.ControlKey, a.ProtocolVersion)
  332. if err != nil {
  333. return nil, err
  334. }
  335. netConn, err := a.tryURLUpgrade(ctx, u, addr, init)
  336. if err != nil {
  337. return nil, err
  338. }
  339. cbConn, err := cont(ctx, netConn)
  340. if err != nil {
  341. netConn.Close()
  342. return nil, err
  343. }
  344. return &ClientConn{
  345. Conn: cbConn,
  346. }, nil
  347. }
  348. // resolver returns a.DNSCache if non-nil or a new *dnscache.Resolver
  349. // otherwise.
  350. func (a *Dialer) resolver() *dnscache.Resolver {
  351. if a.DNSCache != nil {
  352. return a.DNSCache
  353. }
  354. return &dnscache.Resolver{
  355. Forward: dnscache.Get().Forward,
  356. LookupIPFallback: dnsfallback.MakeLookupFunc(a.logf, a.NetMon),
  357. UseLastGood: true,
  358. Logf: a.Logf, // not a.logf method; we want to propagate nil-ness
  359. }
  360. }
  361. func isLoopback(a net.Addr) bool {
  362. if ta, ok := a.(*net.TCPAddr); ok {
  363. return ta.IP.IsLoopback()
  364. }
  365. return false
  366. }
// macOSScreenTime is the health Warnable for the case where macOS Screen Time
// appears to be blocking Tailscale. On darwin, tryURLUpgrade marks it
// unhealthy when an upgrade attempt fails after one of its dialed connections
// had loopback addresses on both ends, and marks it healthy otherwise.
var macOSScreenTime = health.Register(&health.Warnable{
	Code:     "macos-screen-time",
	Severity: health.SeverityHigh,
	Title:    "Tailscale blocked by Screen Time",
	// Text ignores its arguments and always returns the same remediation hint.
	Text: func(args health.Args) string {
		return "macOS Screen Time seems to be blocking Tailscale. Try disabling Screen Time in System Settings > Screen Time > Content & Privacy > Access to Web Content."
	},
	ImpactsConnectivity: true,
})
  376. // tryURLUpgrade connects to u, and tries to upgrade it to a net.Conn. If addr
  377. // is valid, then no DNS is used and the connection will be made to the
  378. // provided address.
  379. //
  380. // Only the provided ctx is used, not a.ctx.
  381. func (a *Dialer) tryURLUpgrade(ctx context.Context, u *url.URL, addr netip.Addr, init []byte) (_ net.Conn, retErr error) {
  382. var dns *dnscache.Resolver
  383. // If we were provided an address to dial, then create a resolver that just
  384. // returns that value; otherwise, fall back to DNS.
  385. if addr.IsValid() {
  386. dns = &dnscache.Resolver{
  387. SingleHostStaticResult: []netip.Addr{addr},
  388. SingleHost: u.Hostname(),
  389. Logf: a.Logf, // not a.logf method; we want to propagate nil-ness
  390. }
  391. } else {
  392. dns = a.resolver()
  393. }
  394. var dialer dnscache.DialContextFunc
  395. if a.Dialer != nil {
  396. dialer = a.Dialer
  397. } else {
  398. dialer = stdDialer.DialContext
  399. }
  400. // On macOS, see if Screen Time is blocking things.
  401. if runtime.GOOS == "darwin" {
  402. var proxydIntercepted atomic.Bool // intercepted by macOS webfilterproxyd
  403. origDialer := dialer
  404. dialer = func(ctx context.Context, network, address string) (net.Conn, error) {
  405. c, err := origDialer(ctx, network, address)
  406. if err != nil {
  407. return nil, err
  408. }
  409. if isLoopback(c.LocalAddr()) && isLoopback(c.RemoteAddr()) {
  410. proxydIntercepted.Store(true)
  411. }
  412. return c, nil
  413. }
  414. defer func() {
  415. if retErr != nil && proxydIntercepted.Load() {
  416. a.HealthTracker.SetUnhealthy(macOSScreenTime, nil)
  417. retErr = fmt.Errorf("macOS Screen Time is blocking network access: %w", retErr)
  418. } else {
  419. a.HealthTracker.SetHealthy(macOSScreenTime)
  420. }
  421. }()
  422. }
  423. tr := http.DefaultTransport.(*http.Transport).Clone()
  424. defer tr.CloseIdleConnections()
  425. tr.Proxy = a.getProxyFunc()
  426. tshttpproxy.SetTransportGetProxyConnectHeader(tr)
  427. tr.DialContext = dnscache.Dialer(dialer, dns)
  428. // Disable HTTP2, since h2 can't do protocol switching.
  429. tr.TLSClientConfig.NextProtos = []string{}
  430. tr.TLSNextProto = map[string]func(string, *tls.Conn) http.RoundTripper{}
  431. tr.TLSClientConfig = tlsdial.Config(a.Hostname, a.HealthTracker, tr.TLSClientConfig)
  432. if !tr.TLSClientConfig.InsecureSkipVerify {
  433. panic("unexpected") // should be set by tlsdial.Config
  434. }
  435. verify := tr.TLSClientConfig.VerifyConnection
  436. if verify == nil {
  437. panic("unexpected") // should be set by tlsdial.Config
  438. }
  439. // Demote all cert verification errors to log messages. We don't actually
  440. // care about the TLS security (because we just do the Noise crypto atop whatever
  441. // connection we get, including HTTP port 80 plaintext) so this permits
  442. // middleboxes to MITM their users. All they'll see is some Noise.
  443. tr.TLSClientConfig.VerifyConnection = func(cs tls.ConnectionState) error {
  444. if err := verify(cs); err != nil && a.Logf != nil && !a.omitCertErrorLogging {
  445. a.Logf("warning: TLS cert verificication for %q failed: %v", a.Hostname, err)
  446. }
  447. return nil // regardless
  448. }
  449. tr.DialTLSContext = dnscache.TLSDialer(dialer, dns, tr.TLSClientConfig)
  450. tr.DisableCompression = true
  451. // (mis)use httptrace to extract the underlying net.Conn from the
  452. // transport. The transport handles 101 Switching Protocols correctly,
  453. // such that the Conn will not be reused or kept alive by the transport
  454. // once the response has been handed back from RoundTrip.
  455. //
  456. // In theory, the machinery of net/http should make it such that
  457. // the trace callback happens-before we get the response, but
  458. // there's no promise of that. So, to make sure, we use a buffered
  459. // channel as a synchronization step to avoid data races.
  460. //
  461. // Note that even though we're able to extract a net.Conn via this
  462. // mechanism, we must still keep using the eventual resp.Body to
  463. // read from, because it includes a buffer we can't get rid of. If
  464. // the server never sends any data after sending the HTTP
  465. // response, we could get away with it, but violating this
  466. // assumption leads to very mysterious transport errors (lockups,
  467. // unexpected EOFs...), and we're bound to forget someday and
  468. // introduce a protocol optimization at a higher level that starts
  469. // eagerly transmitting from the server.
  470. var lastConn syncs.AtomicValue[net.Conn]
  471. trace := httptrace.ClientTrace{
  472. // Even though we only make a single HTTP request which should
  473. // require a single connection, the context (with the attached
  474. // trace configuration) might be used by our custom dialer to
  475. // make other HTTP requests (e.g. BootstrapDNS). We only care
  476. // about the last connection made, which should be the one to
  477. // the control server.
  478. GotConn: func(info httptrace.GotConnInfo) {
  479. lastConn.Store(info.Conn)
  480. },
  481. }
  482. ctx = httptrace.WithClientTrace(ctx, &trace)
  483. req := &http.Request{
  484. Method: "POST",
  485. URL: u,
  486. Header: http.Header{
  487. "Upgrade": []string{upgradeHeaderValue},
  488. "Connection": []string{"upgrade"},
  489. handshakeHeaderName: []string{base64.StdEncoding.EncodeToString(init)},
  490. },
  491. }
  492. req = req.WithContext(ctx)
  493. resp, err := tr.RoundTrip(req)
  494. if err != nil {
  495. return nil, err
  496. }
  497. if resp.StatusCode != http.StatusSwitchingProtocols {
  498. return nil, fmt.Errorf("unexpected HTTP response: %s", resp.Status)
  499. }
  500. // From here on, the underlying net.Conn is ours to use, but there
  501. // is still a read buffer attached to it within resp.Body. So, we
  502. // must direct I/O through resp.Body, but we can still use the
  503. // underlying net.Conn for stuff like deadlines.
  504. switchedConn := lastConn.Load()
  505. if switchedConn == nil {
  506. resp.Body.Close()
  507. return nil, fmt.Errorf("httptrace didn't provide a connection")
  508. }
  509. if next := resp.Header.Get("Upgrade"); next != upgradeHeaderValue {
  510. resp.Body.Close()
  511. return nil, fmt.Errorf("server switched to unexpected protocol %q", next)
  512. }
  513. rwc, ok := resp.Body.(io.ReadWriteCloser)
  514. if !ok {
  515. resp.Body.Close()
  516. return nil, errors.New("http Transport did not provide a writable body")
  517. }
  518. return netutil.NewAltReadWriteCloserConn(rwc, switchedConn), nil
  519. }