epoll.c

/* Copyright libuv contributors. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "uv.h"
#include "internal.h"

#include <errno.h>
#include <sys/epoll.h>


int uv__epoll_init(uv_loop_t* loop) {
  int fd;

  fd = epoll_create1(O_CLOEXEC);

  /* epoll_create1() can fail either because it's not implemented (old kernel)
   * or because it doesn't understand the O_CLOEXEC flag.
   */
  if (fd == -1 && (errno == ENOSYS || errno == EINVAL)) {
    fd = epoll_create(256);

    if (fd != -1)
      uv__cloexec(fd, 1);
  }

  loop->backend_fd = fd;
  if (fd == -1)
    return UV__ERR(errno);

  return 0;
}
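

/* Invalidate any events already reported for |fd| so that uv__io_poll() skips
 * them, and remove the descriptor from the epoll set. Keeping stale entries
 * around would risk dispatching events for a recycled file descriptor.
 */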
void uv__platform_invalidate_fd(uv_loop_t* loop, int fd) {
  struct epoll_event* events;
  struct epoll_event dummy;
  uintptr_t i;
  uintptr_t nfds;

  assert(loop->watchers != NULL);
  assert(fd >= 0);

  events = (struct epoll_event*) loop->watchers[loop->nwatchers];
  nfds = (uintptr_t) loop->watchers[loop->nwatchers + 1];
  if (events != NULL)
    /* Invalidate events with same file descriptor */
    for (i = 0; i < nfds; i++)
      if (events[i].data.fd == fd)
        events[i].data.fd = -1;

  /* Remove the file descriptor from the epoll.
   * This avoids a problem where the same file description remains open
   * in another process, causing repeated junk epoll events.
   *
   * We pass in a dummy epoll_event, to work around a bug in old kernels.
   */
  if (loop->backend_fd >= 0) {
    /* Work around a bug in kernels 3.10 to 3.19 where passing a struct that
     * has the EPOLLWAKEUP flag set generates spurious audit syslog warnings.
     */
    memset(&dummy, 0, sizeof(dummy));
    epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &dummy);
  }
}
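

/* Check that |fd| is something epoll can actually watch by adding it to the
 * epoll set and, if that succeeds, removing it again straight away. Returns 0
 * if the descriptor is pollable, otherwise a negative errno value.
 */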
int uv__io_check_fd(uv_loop_t* loop, int fd) {
  struct epoll_event e;
  int rc;

  memset(&e, 0, sizeof(e));
  e.events = POLLIN;
  e.data.fd = -1;

  rc = 0;
  if (epoll_ctl(loop->backend_fd, EPOLL_CTL_ADD, fd, &e))
    if (errno != EEXIST)
      rc = UV__ERR(errno);

  if (rc == 0)
    if (epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, &e))
      abort();

  return rc;
}
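

/* Poll for I/O: flush pending watcher registrations to the kernel with
 * epoll_ctl(), wait for events with epoll_pwait()/epoll_wait(), then dispatch
 * them to the watchers' callbacks. |timeout| is in milliseconds; -1 means
 * wait indefinitely and 0 means do not block.
 */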
void uv__io_poll(uv_loop_t* loop, int timeout) {
  /* A bug in kernels < 2.6.37 makes timeouts larger than ~30 minutes
   * effectively infinite on 32-bit architectures. To avoid blocking
   * indefinitely, we cap the timeout and poll again if necessary.
   *
   * Note that "30 minutes" is a simplification because it depends on
   * the value of CONFIG_HZ. The magic constant assumes CONFIG_HZ=1200,
   * that being the largest value I have seen in the wild (and only once.)
   */
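  /* (For reference: 1789569 == INT32_MAX / 1200 rounded down, which is
   * presumably where the magic constant comes from.)
   */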
  static const int max_safe_timeout = 1789569;
  static int no_epoll_pwait_cached;
  static int no_epoll_wait_cached;
  int no_epoll_pwait;
  int no_epoll_wait;
  struct epoll_event events[1024];
  struct epoll_event* pe;
  struct epoll_event e;
  int real_timeout;
  QUEUE* q;
  uv__io_t* w;
  sigset_t sigset;
  uint64_t sigmask;
  uint64_t base;
  int have_signals;
  int nevents;
  int count;
  int nfds;
  int fd;
  int op;
  int i;
  int user_timeout;
  int reset_timeout;

  if (loop->nfds == 0) {
    assert(QUEUE_EMPTY(&loop->watcher_queue));
    return;
  }

  memset(&e, 0, sizeof(e));

  while (!QUEUE_EMPTY(&loop->watcher_queue)) {
    q = QUEUE_HEAD(&loop->watcher_queue);
    QUEUE_REMOVE(q);
    QUEUE_INIT(q);

    w = QUEUE_DATA(q, uv__io_t, watcher_queue);
    assert(w->pevents != 0);
    assert(w->fd >= 0);
    assert(w->fd < (int) loop->nwatchers);

    e.events = w->pevents;
    e.data.fd = w->fd;

    if (w->events == 0)
      op = EPOLL_CTL_ADD;
    else
      op = EPOLL_CTL_MOD;

    /* XXX Future optimization: do EPOLL_CTL_MOD lazily if we stop watching
     * events, skip the syscall and squelch the events after epoll_wait().
     */
    if (epoll_ctl(loop->backend_fd, op, w->fd, &e)) {
      if (errno != EEXIST)
        abort();

      assert(op == EPOLL_CTL_ADD);

      /* We've reactivated a file descriptor that's been watched before. */
      if (epoll_ctl(loop->backend_fd, EPOLL_CTL_MOD, w->fd, &e))
        abort();
    }

    w->events = w->pevents;
  }
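
  /* When the loop was configured with UV_LOOP_BLOCK_SIGPROF, block SIGPROF
   * around the wait below so a sampling profiler does not cause extra
   * wakeups. Note that |sigmask| is only ever tested against zero here; the
   * actual signal set is carried in |sigset|.
   */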
  sigmask = 0;
  if (loop->flags & UV_LOOP_BLOCK_SIGPROF) {
    sigemptyset(&sigset);
    sigaddset(&sigset, SIGPROF);
    sigmask |= 1 << (SIGPROF - 1);
  }

  assert(timeout >= -1);
  base = loop->time;
  count = 48; /* Benchmarks suggest this gives the best throughput. */
  real_timeout = timeout;

  if (uv__get_internal_fields(loop)->flags & UV_METRICS_IDLE_TIME) {
    reset_timeout = 1;
    user_timeout = timeout;
    timeout = 0;
  } else {
    reset_timeout = 0;
    user_timeout = 0;
  }

  /* You could argue there is a dependency between these two but
   * ultimately we don't care about their ordering with respect
   * to one another. Worst case, we make a few system calls that
   * could have been avoided because another thread already knows
   * they fail with ENOSYS. Hardly the end of the world.
   */
  no_epoll_pwait = uv__load_relaxed(&no_epoll_pwait_cached);
  no_epoll_wait = uv__load_relaxed(&no_epoll_wait_cached);

  for (;;) {
    /* Only need to set the provider_entry_time if timeout != 0. The function
     * will return early if the loop isn't configured with UV_METRICS_IDLE_TIME.
     */
    if (timeout != 0)
      uv__metrics_set_provider_entry_time(loop);

    /* See the comment for max_safe_timeout for an explanation of why
     * this is necessary. Executive summary: kernel bug workaround.
     */
    if (sizeof(int32_t) == sizeof(long) && timeout >= max_safe_timeout)
      timeout = max_safe_timeout;

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_BLOCK, &sigset, NULL))
        abort();

    if (no_epoll_wait != 0 || (sigmask != 0 && no_epoll_pwait == 0)) {
      nfds = epoll_pwait(loop->backend_fd,
                         events,
                         ARRAY_SIZE(events),
                         timeout,
                         &sigset);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_pwait_cached, 1);
        no_epoll_pwait = 1;
      }
    } else {
      nfds = epoll_wait(loop->backend_fd,
                        events,
                        ARRAY_SIZE(events),
                        timeout);
      if (nfds == -1 && errno == ENOSYS) {
        uv__store_relaxed(&no_epoll_wait_cached, 1);
        no_epoll_wait = 1;
      }
    }

    if (sigmask != 0 && no_epoll_pwait != 0)
      if (pthread_sigmask(SIG_UNBLOCK, &sigset, NULL))
        abort();

    /* Update loop->time unconditionally. It's tempting to skip the update when
     * timeout == 0 (i.e. non-blocking poll) but there is no guarantee that the
     * operating system didn't reschedule our process while in the syscall.
     */
    SAVE_ERRNO(uv__update_time(loop));

    if (nfds == 0) {
      assert(timeout != -1);

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* We may have been inside the system call for longer than |timeout|
       * milliseconds so we need to update the timestamp to avoid drift.
       */
      goto update_timeout;
    }

    if (nfds == -1) {
      if (errno == ENOSYS) {
        /* epoll_wait() or epoll_pwait() failed, try the other system call. */
        assert(no_epoll_wait == 0 || no_epoll_pwait == 0);
        continue;
      }

      if (errno != EINTR)
        abort();

      if (reset_timeout != 0) {
        timeout = user_timeout;
        reset_timeout = 0;
      }

      if (timeout == -1)
        continue;

      if (timeout == 0)
        return;

      /* Interrupted by a signal. Update timeout and poll again. */
      goto update_timeout;
    }

    have_signals = 0;
    nevents = 0;
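
    /* Stash the events array and its length in the watchers table so that
     * uv__platform_invalidate_fd() can find and invalidate entries we have
     * not dispatched yet if a callback below closes a watched descriptor.
     */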
    {
      /* Squelch a -Waddress-of-packed-member warning with gcc >= 9. */
      union {
        struct epoll_event* events;
        uv__io_t* watchers;
      } x;

      x.events = events;
      assert(loop->watchers != NULL);
      loop->watchers[loop->nwatchers] = x.watchers;
      loop->watchers[loop->nwatchers + 1] = (void*) (uintptr_t) nfds;
    }

    for (i = 0; i < nfds; i++) {
      pe = events + i;
      fd = pe->data.fd;

      /* Skip invalidated events, see uv__platform_invalidate_fd */
      if (fd == -1)
        continue;

      assert(fd >= 0);
      assert((unsigned) fd < loop->nwatchers);

      w = loop->watchers[fd];

      if (w == NULL) {
        /* File descriptor that we've stopped watching, disarm it.
         *
         * Ignore all errors because we may be racing with another thread
         * when the file descriptor is closed.
         */
        epoll_ctl(loop->backend_fd, EPOLL_CTL_DEL, fd, pe);
        continue;
      }

      /* Give users only events they're interested in. Prevents spurious
       * callbacks when a previous callback invocation in this loop has stopped
       * the current watcher. Also, filters out events that users have not
       * requested us to watch.
       */
      pe->events &= w->pevents | POLLERR | POLLHUP;

      /* Work around an epoll quirk where it sometimes reports just the
       * EPOLLERR or EPOLLHUP event. In order to force the event loop to
       * move forward, we merge in the read/write events that the watcher
       * is interested in; uv__read() and uv__write() will then deal with
       * the error or hangup in the usual fashion.
       *
       * Note to self: happens when epoll reports EPOLLIN|EPOLLHUP, the user
       * reads the available data, calls uv_read_stop(), then sometime later
       * calls uv_read_start() again. By then, libuv has forgotten about the
       * hangup and the kernel won't report EPOLLIN again because there's
       * nothing left to read. If anything, libuv is to blame here. The
       * current hack is just a quick bandaid; to properly fix it, libuv
       * needs to remember the error/hangup event. We should get that for
       * free when we switch over to edge-triggered I/O.
       */
      if (pe->events == POLLERR || pe->events == POLLHUP)
        pe->events |=
          w->pevents & (POLLIN | POLLOUT | UV__POLLRDHUP | UV__POLLPRI);

      if (pe->events != 0) {
        /* Run signal watchers last. This also affects child process watchers
         * because those are implemented in terms of signal watchers.
         */
        if (w == &loop->signal_io_watcher) {
          have_signals = 1;
        } else {
          uv__metrics_update_idle_time(loop);
          w->cb(loop, w, pe->events);
        }

        nevents++;
      }
    }

    if (reset_timeout != 0) {
      timeout = user_timeout;
      reset_timeout = 0;
    }

    if (have_signals != 0) {
      uv__metrics_update_idle_time(loop);
      loop->signal_io_watcher.cb(loop, &loop->signal_io_watcher, POLLIN);
    }

    loop->watchers[loop->nwatchers] = NULL;
    loop->watchers[loop->nwatchers + 1] = NULL;

    if (have_signals != 0)
      return;  /* Event loop should cycle now so don't poll again. */

    if (nevents != 0) {
      if (nfds == ARRAY_SIZE(events) && --count != 0) {
        /* Poll for more events but don't block this time. */
        timeout = 0;
        continue;
      }
      return;
    }

    if (timeout == 0)
      return;

    if (timeout == -1)
      continue;
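
    /* We get here when the wait was cut short or produced no callbacks while
     * a finite timeout is still pending: subtract the time actually spent in
     * the system call and go around again with the remainder.
     */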
update_timeout:
    assert(timeout > 0);

    real_timeout -= (loop->time - base);
    if (real_timeout <= 0)
      return;

    timeout = real_timeout;
  }
}