aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Davide Libenzi <davidel@xmailserver.org> 2009-03-31 18:24:10 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2009-04-01 11:59:18 -0400
commit: 5071f97ec6d74f006072de0ce89b67c8792fe5a1 (patch)
tree: cb20ebd79c6c146c73d321b4558f8176a0cf06c9
parent: 3cdbbeebb77348176bd6a03fd86e11bc281c529e (diff)
epoll: fix epoll's own poll
Fix a bug inside the epoll's f_op->poll() code, which returns POLLIN even though there are no actual ready monitored fds. The bug shows up if you add an epoll fd inside another fd container (poll, select, epoll). The problem is that the callback-based wake ups used by epoll do not carry (patches will follow, to fix this) any information about the events that actually happened. So the callback code, since it can't call the file* ->poll() inside the callback, chains the file* into a ready-list. So, suppose you added an fd with EPOLLOUT only, and some data shows up on the fd: the file* mapped by the fd will be added into the ready-list (via wakeup callback). During normal epoll_wait() use, this condition is sorted out at the time we're actually able to call the file*'s f_op->poll(). Inside the old epoll's f_op->poll() though, only a quick check !list_empty(ready-list) was performed, and this could have led to reporting POLLIN even though no ready fds would show up at a following epoll_wait(). In order to correctly report the ready status for an epoll fd, the ready-list must be checked to see if any really available fd+event would be ready in a following epoll_wait(). This operation (calling f_op->poll() from inside f_op->poll()), like wake ups, must be handled with care because epoll fds can be added to other epoll fds. Test code: /* * epoll_test by Davide Libenzi (Simple code to test epoll internals) * Copyright (C) 2008 Davide Libenzi * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
* * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * Davide Libenzi <davidel@xmailserver.org> * */ #include <sys/types.h> #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <signal.h> #include <limits.h> #include <poll.h> #include <sys/epoll.h> #include <sys/wait.h> #define EPWAIT_TIMEO (1 * 1000) #ifndef POLLRDHUP #define POLLRDHUP 0x2000 #endif #define EPOLL_MAX_CHAIN 100L #define EPOLL_TF_LOOP (1 << 0) struct epoll_test_cfg { long size; long flags; }; static int xepoll_create(int n) { int epfd; if ((epfd = epoll_create(n)) == -1) { perror("epoll_create"); exit(2); } return epfd; } static void xepoll_ctl(int epfd, int cmd, int fd, struct epoll_event *evt) { if (epoll_ctl(epfd, cmd, fd, evt) < 0) { perror("epoll_ctl"); exit(3); } } static void xpipe(int *fds) { if (pipe(fds)) { perror("pipe"); exit(4); } } static pid_t xfork(void) { pid_t pid; if ((pid = fork()) == (pid_t) -1) { perror("pipe"); exit(5); } return pid; } static int run_forked_proc(int (*proc)(void *), void *data) { int status; pid_t pid; if ((pid = xfork()) == 0) exit((*proc)(data)); if (waitpid(pid, &status, 0) != pid) { perror("waitpid"); return -1; } return WIFEXITED(status) ? 
WEXITSTATUS(status): -2; } static int check_events(int fd, int timeo) { struct pollfd pfd; fprintf(stdout, "Checking events for fd %d\n", fd); memset(&pfd, 0, sizeof(pfd)); pfd.fd = fd; pfd.events = POLLIN | POLLOUT; if (poll(&pfd, 1, timeo) < 0) { perror("poll()"); return 0; } if (pfd.revents & POLLIN) fprintf(stdout, "\tPOLLIN\n"); if (pfd.revents & POLLOUT) fprintf(stdout, "\tPOLLOUT\n"); if (pfd.revents & POLLERR) fprintf(stdout, "\tPOLLERR\n"); if (pfd.revents & POLLHUP) fprintf(stdout, "\tPOLLHUP\n"); if (pfd.revents & POLLRDHUP) fprintf(stdout, "\tPOLLRDHUP\n"); return pfd.revents; } static int epoll_test_tty(void *data) { int epfd, ifd = fileno(stdin), res; struct epoll_event evt; if (check_events(ifd, 0) != POLLOUT) { fprintf(stderr, "Something is cooking on STDIN (%d)\n", ifd); return 1; } epfd = xepoll_create(1); fprintf(stdout, "Created epoll fd (%d)\n", epfd); memset(&evt, 0, sizeof(evt)); evt.events = EPOLLIN; xepoll_ctl(epfd, EPOLL_CTL_ADD, ifd, &evt); if (check_events(epfd, 0) & POLLIN) { res = epoll_wait(epfd, &evt, 1, 0); if (res == 0) { fprintf(stderr, "Epoll fd (%d) is ready when it shouldn't!\n", epfd); return 2; } } return 0; } static int epoll_wakeup_chain(void *data) { struct epoll_test_cfg *tcfg = data; int i, res, epfd, bfd, nfd, pfds[2]; pid_t pid; struct epoll_event evt; memset(&evt, 0, sizeof(evt)); evt.events = EPOLLIN; epfd = bfd = xepoll_create(1); for (i = 0; i < tcfg->size; i++) { nfd = xepoll_create(1); xepoll_ctl(bfd, EPOLL_CTL_ADD, nfd, &evt); bfd = nfd; } xpipe(pfds); if (tcfg->flags & EPOLL_TF_LOOP) { xepoll_ctl(bfd, EPOLL_CTL_ADD, epfd, &evt); /* * If we're testing for loop, we want that the wakeup * triggered by the write to the pipe done in the child * process, triggers a fake event. So we add the pipe * read size with EPOLLOUT events. This will trigger * an addition to the ready-list, but no real events * will be there. 
Then the epoll kernel code will proceed * in calling f_op->poll() of the epfd, triggering the * loop we want to test. */ evt.events = EPOLLOUT; } xepoll_ctl(bfd, EPOLL_CTL_ADD, pfds[0], &evt); /* * The pipe write must come after the poll(2) call inside * check_events(). This tests the nested wakeup code in * fs/eventpoll.c:ep_poll_safewake() * By having the check_events() (hence poll(2)) happen first, * we have poll wait queue filled up, and the write(2) in the * child will trigger the wakeup chain. */ if ((pid = xfork()) == 0) { sleep(1); write(pfds[1], "w", 1); exit(0); } res = check_events(epfd, 2000) & POLLIN; if (waitpid(pid, NULL, 0) != pid) { perror("waitpid"); return -1; } return res; } static int epoll_poll_chain(void *data) { struct epoll_test_cfg *tcfg = data; int i, res, epfd, bfd, nfd, pfds[2]; pid_t pid; struct epoll_event evt; memset(&evt, 0, sizeof(evt)); evt.events = EPOLLIN; epfd = bfd = xepoll_create(1); for (i = 0; i < tcfg->size; i++) { nfd = xepoll_create(1); xepoll_ctl(bfd, EPOLL_CTL_ADD, nfd, &evt); bfd = nfd; } xpipe(pfds); if (tcfg->flags & EPOLL_TF_LOOP) { xepoll_ctl(bfd, EPOLL_CTL_ADD, epfd, &evt); /* * If we're testing for loop, we want that the wakeup * triggered by the write to the pipe done in the child * process, triggers a fake event. So we add the pipe * read side with EPOLLOUT events. This will trigger * an addition to the ready-list, but no real events * will be there. Then the epoll kernel code will proceed * in calling f_op->poll() of the epfd, triggering the * loop we want to test. */ evt.events = EPOLLOUT; } xepoll_ctl(bfd, EPOLL_CTL_ADD, pfds[0], &evt); /* * The pipe write must come before the poll(2) call inside * check_events(). This tests the nested f_op->poll calls code in * fs/eventpoll.c:ep_eventpoll_poll() * By having the pipe write(2) happen first, we make the kernel * epoll code to load the ready lists, and the following poll(2) * done inside check_events() will test nested poll code in * ep_eventpoll_poll(). 
*/ if ((pid = xfork()) == 0) { write(pfds[1], "w", 1); exit(0); } sleep(1); res = check_events(epfd, 1000) & POLLIN; if (waitpid(pid, NULL, 0) != pid) { perror("waitpid"); return -1; } return res; } int main(int ac, char **av) { int error; struct epoll_test_cfg tcfg; fprintf(stdout, "\n********** Testing TTY events\n"); error = run_forked_proc(epoll_test_tty, NULL); fprintf(stdout, error == 0 ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = 3; tcfg.flags = 0; fprintf(stdout, "\n********** Testing short wakeup chain\n"); error = run_forked_proc(epoll_wakeup_chain, &tcfg); fprintf(stdout, error == POLLIN ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = EPOLL_MAX_CHAIN; tcfg.flags = 0; fprintf(stdout, "\n********** Testing long wakeup chain (HOLD ON)\n"); error = run_forked_proc(epoll_wakeup_chain, &tcfg); fprintf(stdout, error == 0 ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = 3; tcfg.flags = 0; fprintf(stdout, "\n********** Testing short poll chain\n"); error = run_forked_proc(epoll_poll_chain, &tcfg); fprintf(stdout, error == POLLIN ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = EPOLL_MAX_CHAIN; tcfg.flags = 0; fprintf(stdout, "\n********** Testing long poll chain (HOLD ON)\n"); error = run_forked_proc(epoll_poll_chain, &tcfg); fprintf(stdout, error == 0 ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = 3; tcfg.flags = EPOLL_TF_LOOP; fprintf(stdout, "\n********** Testing loopy wakeup chain (HOLD ON)\n"); error = run_forked_proc(epoll_wakeup_chain, &tcfg); fprintf(stdout, error == 0 ? "********** OK\n": "********** FAIL (%d)\n", error); tcfg.size = 3; tcfg.flags = EPOLL_TF_LOOP; fprintf(stdout, "\n********** Testing loopy poll chain (HOLD ON)\n"); error = run_forked_proc(epoll_poll_chain, &tcfg); fprintf(stdout, error == 0 ? 
"********** OK\n": "********** FAIL (%d)\n", error); return 0; } Signed-off-by: Davide Libenzi <davidel@xmailserver.org> Cc: Pavel Pisa <pisa@cmp.felk.cvut.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- fs/eventpoll.c 511
1 files changed, 304 insertions, 207 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c5c424f23fd5..8a23a91e1377 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * fs/eventpoll.c (Efficent event polling implementation) 2 * fs/eventpoll.c (Efficient event retrieval implementation)
3 * Copyright (C) 2001,...,2007 Davide Libenzi 3 * Copyright (C) 2001,...,2009 Davide Libenzi
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -92,8 +92,8 @@
92/* Epoll private bits inside the event mask */ 92/* Epoll private bits inside the event mask */
93#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) 93#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
94 94
95/* Maximum number of poll wake up nests we are allowing */ 95/* Maximum number of nesting allowed inside epoll sets */
96#define EP_MAX_POLLWAKE_NESTS 4 96#define EP_MAX_NESTS 4
97 97
98/* Maximum msec timeout value storeable in a long int */ 98/* Maximum msec timeout value storeable in a long int */
99#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) 99#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +110,21 @@ struct epoll_filefd {
110}; 110};
111 111
112/* 112/*
113 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". 113 * Structure used to track possible nested calls, for too deep recursions
114 * It is used to keep track on all tasks that are currently inside the wake_up() code 114 * and loop cycles.
115 * to 1) short-circuit the one coming from the same task and same wait queue head
116 * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
117 * 3) let go the ones coming from other tasks.
118 */ 115 */
119struct wake_task_node { 116struct nested_call_node {
120 struct list_head llink; 117 struct list_head llink;
121 struct task_struct *task; 118 struct task_struct *task;
122 wait_queue_head_t *wq; 119 void *cookie;
123}; 120};
124 121
125/* 122/*
126 * This is used to implement the safe poll wake up avoiding to reenter 123 * This structure is used as collector for nested calls, to check for
127 * the poll callback from inside wake_up(). 124 * maximum recursion depth and loop cycles.
128 */ 125 */
129struct poll_safewake { 126struct nested_calls {
130 struct list_head wake_task_list; 127 struct list_head tasks_call_list;
131 spinlock_t lock; 128 spinlock_t lock;
132}; 129};
133 130
@@ -231,6 +228,12 @@ struct ep_pqueue {
231 struct epitem *epi; 228 struct epitem *epi;
232}; 229};
233 230
231/* Used by the ep_send_events() function as callback private data */
232struct ep_send_events_data {
233 int maxevents;
234 struct epoll_event __user *events;
235};
236
234/* 237/*
235 * Configuration options available inside /proc/sys/fs/epoll/ 238 * Configuration options available inside /proc/sys/fs/epoll/
236 */ 239 */
@@ -242,8 +245,11 @@ static int max_user_watches __read_mostly;
242 */ 245 */
243static DEFINE_MUTEX(epmutex); 246static DEFINE_MUTEX(epmutex);
244 247
245/* Safe wake up implementation */ 248/* Used for safe wake up implementation */
246static struct poll_safewake psw; 249static struct nested_calls poll_safewake_ncalls;
250
251/* Used to call file's f_op->poll() under the nested calls boundaries */
252static struct nested_calls poll_readywalk_ncalls;
247 253
248/* Slab cache used to allocate "struct epitem" */ 254/* Slab cache used to allocate "struct epitem" */
249static struct kmem_cache *epi_cache __read_mostly; 255static struct kmem_cache *epi_cache __read_mostly;
@@ -312,64 +318,96 @@ static inline int ep_op_has_event(int op)
312} 318}
313 319
314/* Initialize the poll safe wake up structure */ 320/* Initialize the poll safe wake up structure */
315static void ep_poll_safewake_init(struct poll_safewake *psw) 321static void ep_nested_calls_init(struct nested_calls *ncalls)
316{ 322{
317 323 INIT_LIST_HEAD(&ncalls->tasks_call_list);
318 INIT_LIST_HEAD(&psw->wake_task_list); 324 spin_lock_init(&ncalls->lock);
319 spin_lock_init(&psw->lock);
320} 325}
321 326
322/* 327/**
323 * Perform a safe wake up of the poll wait list. The problem is that 328 * ep_call_nested - Perform a bound (possibly) nested call, by checking
324 * with the new callback'd wake up system, it is possible that the 329 * that the recursion limit is not exceeded, and that
325 * poll callback is reentered from inside the call to wake_up() done 330 * the same nested call (by the meaning of same cookie) is
326 * on the poll wait queue head. The rule is that we cannot reenter the 331 * not re-entered.
327 * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, 332 *
328 * and we cannot reenter the same wait queue head at all. This will 333 * @ncalls: Pointer to the nested_calls structure to be used for this call.
329 * enable to have a hierarchy of epoll file descriptor of no more than 334 * @max_nests: Maximum number of allowed nesting calls.
330 * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock 335 * @nproc: Nested call core function pointer.
331 * because this one gets called by the poll callback, that in turn is called 336 * @priv: Opaque data to be passed to the @nproc callback.
332 * from inside a wake_up(), that might be called from irq context. 337 * @cookie: Cookie to be used to identify this nested call.
338 *
339 * Returns: Returns the code returned by the @nproc callback, or -1 if
340 * the maximum recursion limit has been exceeded.
333 */ 341 */
334static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) 342static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
343 int (*nproc)(void *, void *, int), void *priv,
344 void *cookie)
335{ 345{
336 int wake_nests = 0; 346 int error, call_nests = 0;
337 unsigned long flags; 347 unsigned long flags;
338 struct task_struct *this_task = current; 348 struct task_struct *this_task = current;
339 struct list_head *lsthead = &psw->wake_task_list; 349 struct list_head *lsthead = &ncalls->tasks_call_list;
340 struct wake_task_node *tncur; 350 struct nested_call_node *tncur;
341 struct wake_task_node tnode; 351 struct nested_call_node tnode;
342 352
343 spin_lock_irqsave(&psw->lock, flags); 353 spin_lock_irqsave(&ncalls->lock, flags);
344 354
345 /* Try to see if the current task is already inside this wakeup call */ 355 /*
356 * Try to see if the current task is already inside this wakeup call.
357 * We use a list here, since the population inside this set is always
358 * very much limited.
359 */
346 list_for_each_entry(tncur, lsthead, llink) { 360 list_for_each_entry(tncur, lsthead, llink) {
347 361 if (tncur->task == this_task &&
348 if (tncur->wq == wq || 362 (tncur->cookie == cookie || ++call_nests > max_nests)) {
349 (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
350 /* 363 /*
351 * Ops ... loop detected or maximum nest level reached. 364 * Ops ... loop detected or maximum nest level reached.
352 * We abort this wake by breaking the cycle itself. 365 * We abort this wake by breaking the cycle itself.
353 */ 366 */
354 spin_unlock_irqrestore(&psw->lock, flags); 367 spin_unlock_irqrestore(&ncalls->lock, flags);
355 return; 368
369 return -1;
356 } 370 }
357 } 371 }
358 372
359 /* Add the current task to the list */ 373 /* Add the current task and cookie to the list */
360 tnode.task = this_task; 374 tnode.task = this_task;
361 tnode.wq = wq; 375 tnode.cookie = cookie;
362 list_add(&tnode.llink, lsthead); 376 list_add(&tnode.llink, lsthead);
363 377
364 spin_unlock_irqrestore(&psw->lock, flags); 378 spin_unlock_irqrestore(&ncalls->lock, flags);
365 379
366 /* Do really wake up now */ 380 /* Call the nested function */
367 wake_up_nested(wq, 1 + wake_nests); 381 error = (*nproc)(priv, cookie, call_nests);
368 382
369 /* Remove the current task from the list */ 383 /* Remove the current task from the list */
370 spin_lock_irqsave(&psw->lock, flags); 384 spin_lock_irqsave(&ncalls->lock, flags);
371 list_del(&tnode.llink); 385 list_del(&tnode.llink);
372 spin_unlock_irqrestore(&psw->lock, flags); 386 spin_unlock_irqrestore(&ncalls->lock, flags);
387
388 return error;
389}
390
391static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
392{
393 wake_up_nested((wait_queue_head_t *) cookie, 1 + call_nests);
394 return 0;
395}
396
397/*
398 * Perform a safe wake up of the poll wait list. The problem is that
399 * with the new callback'd wake up system, it is possible that the
400 * poll callback is reentered from inside the call to wake_up() done
401 * on the poll wait queue head. The rule is that we cannot reenter the
402 * wake up code from the same task more than EP_MAX_NESTS times,
403 * and we cannot reenter the same wait queue head at all. This will
404 * enable to have a hierarchy of epoll file descriptor of no more than
405 * EP_MAX_NESTS deep.
406 */
407static void ep_poll_safewake(wait_queue_head_t *wq)
408{
409 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
410 ep_poll_wakeup_proc, NULL, wq);
373} 411}
374 412
375/* 413/*
@@ -397,6 +435,104 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
397 } 435 }
398} 436}
399 437
438/**
439 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
440 * the scan code, to call f_op->poll(). Also allows for
441 * O(NumReady) performance.
442 *
443 * @ep: Pointer to the epoll private data structure.
444 * @sproc: Pointer to the scan callback.
445 * @priv: Private opaque data passed to the @sproc callback.
446 *
447 * Returns: The same integer error code returned by the @sproc callback.
448 */
449static int ep_scan_ready_list(struct eventpoll *ep,
450 int (*sproc)(struct eventpoll *,
451 struct list_head *, void *),
452 void *priv)
453{
454 int error, pwake = 0;
455 unsigned long flags;
456 struct epitem *epi, *nepi;
457 struct list_head txlist;
458
459 INIT_LIST_HEAD(&txlist);
460
461 /*
462 * We need to lock this because we could be hit by
463 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
464 */
465 mutex_lock(&ep->mtx);
466
467 /*
468 * Steal the ready list, and re-init the original one to the
469 * empty list. Also, set ep->ovflist to NULL so that events
470 * happening while looping w/out locks, are not lost. We cannot
471 * have the poll callback to queue directly on ep->rdllist,
472 * because we want the "sproc" callback to be able to do it
473 * in a lockless way.
474 */
475 spin_lock_irqsave(&ep->lock, flags);
476 list_splice(&ep->rdllist, &txlist);
477 INIT_LIST_HEAD(&ep->rdllist);
478 ep->ovflist = NULL;
479 spin_unlock_irqrestore(&ep->lock, flags);
480
481 /*
482 * Now call the callback function.
483 */
484 error = (*sproc)(ep, &txlist, priv);
485
486 spin_lock_irqsave(&ep->lock, flags);
487 /*
488 * During the time we spent inside the "sproc" callback, some
489 * other events might have been queued by the poll callback.
490 * We re-insert them inside the main ready-list here.
491 */
492 for (nepi = ep->ovflist; (epi = nepi) != NULL;
493 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
494 /*
495 * We need to check if the item is already in the list.
496 * During the "sproc" callback execution time, items are
497 * queued into ->ovflist but the "txlist" might already
498 * contain them, and the list_splice() below takes care of them.
499 */
500 if (!ep_is_linked(&epi->rdllink))
501 list_add_tail(&epi->rdllink, &ep->rdllist);
502 }
503 /*
504 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
505 * releasing the lock, events will be queued in the normal way inside
506 * ep->rdllist.
507 */
508 ep->ovflist = EP_UNACTIVE_PTR;
509
510 /*
511 * Quickly re-inject items left on "txlist".
512 */
513 list_splice(&txlist, &ep->rdllist);
514
515 if (!list_empty(&ep->rdllist)) {
516 /*
517 * Wake up (if active) both the eventpoll wait list and the ->poll()
518 * wait list (delayed after we release the lock).
519 */
520 if (waitqueue_active(&ep->wq))
521 wake_up_locked(&ep->wq);
522 if (waitqueue_active(&ep->poll_wait))
523 pwake++;
524 }
525 spin_unlock_irqrestore(&ep->lock, flags);
526
527 mutex_unlock(&ep->mtx);
528
529 /* We have to call this outside the lock */
530 if (pwake)
531 ep_poll_safewake(&ep->poll_wait);
532
533 return error;
534}
535
400/* 536/*
401 * Removes a "struct epitem" from the eventpoll RB tree and deallocates 537 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
402 * all the associated resources. Must be called with "mtx" held. 538 * all the associated resources. Must be called with "mtx" held.
@@ -447,7 +583,7 @@ static void ep_free(struct eventpoll *ep)
447 583
448 /* We need to release all tasks waiting for these file */ 584 /* We need to release all tasks waiting for these file */
449 if (waitqueue_active(&ep->poll_wait)) 585 if (waitqueue_active(&ep->poll_wait))
450 ep_poll_safewake(&psw, &ep->poll_wait); 586 ep_poll_safewake(&ep->poll_wait);
451 587
452 /* 588 /*
453 * We need to lock this because we could be hit by 589 * We need to lock this because we could be hit by
@@ -496,22 +632,49 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
496 return 0; 632 return 0;
497} 633}
498 634
635static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv)
636{
637 struct epitem *epi, *tmp;
638
639 list_for_each_entry_safe(epi, tmp, head, rdllink) {
640 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
641 epi->event.events)
642 return POLLIN | POLLRDNORM;
643 else
644 /*
645 * Item has been dropped into the ready list by the poll
646 * callback, but it's not actually ready, as far as
647 * caller requested events goes. We can remove it here.
648 */
649 list_del_init(&epi->rdllink);
650 }
651
652 return 0;
653}
654
655static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
656{
657 return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
658}
659
499static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 660static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
500{ 661{
501 unsigned int pollflags = 0; 662 int pollflags;
502 unsigned long flags;
503 struct eventpoll *ep = file->private_data; 663 struct eventpoll *ep = file->private_data;
504 664
505 /* Insert inside our poll wait queue */ 665 /* Insert inside our poll wait queue */
506 poll_wait(file, &ep->poll_wait, wait); 666 poll_wait(file, &ep->poll_wait, wait);
507 667
508 /* Check our condition */ 668 /*
509 spin_lock_irqsave(&ep->lock, flags); 669 * Proceed to find out if wanted events are really available inside
510 if (!list_empty(&ep->rdllist)) 670 * the ready list. This needs to be done under ep_call_nested()
511 pollflags = POLLIN | POLLRDNORM; 671 * supervision, since the call to f_op->poll() done on listed files
512 spin_unlock_irqrestore(&ep->lock, flags); 672 * could re-enter here.
673 */
674 pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
675 ep_poll_readyevents_proc, ep, ep);
513 676
514 return pollflags; 677 return pollflags != -1 ? pollflags: 0;
515} 678}
516 679
517/* File callbacks that implement the eventpoll file behaviour */ 680/* File callbacks that implement the eventpoll file behaviour */
@@ -541,7 +704,7 @@ void eventpoll_release_file(struct file *file)
541 * We don't want to get "file->f_lock" because it is not 704 * We don't want to get "file->f_lock" because it is not
542 * necessary. It is not necessary because we're in the "struct file" 705 * necessary. It is not necessary because we're in the "struct file"
543 * cleanup path, and this means that noone is using this file anymore. 706 * cleanup path, and this means that noone is using this file anymore.
544 * So, for example, epoll_ctl() cannot hit here sicne if we reach this 707 * So, for example, epoll_ctl() cannot hit here since if we reach this
545 * point, the file counter already went to zero and fget() would fail. 708 * point, the file counter already went to zero and fget() would fail.
546 * The only hit might come from ep_free() but by holding the mutex 709 * The only hit might come from ep_free() but by holding the mutex
547 * will correctly serialize the operation. We do need to acquire 710 * will correctly serialize the operation. We do need to acquire
@@ -670,12 +833,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
670 } 833 }
671 834
672 /* If this file is already in the ready list we exit soon */ 835 /* If this file is already in the ready list we exit soon */
673 if (ep_is_linked(&epi->rdllink)) 836 if (!ep_is_linked(&epi->rdllink))
674 goto is_linked; 837 list_add_tail(&epi->rdllink, &ep->rdllist);
675
676 list_add_tail(&epi->rdllink, &ep->rdllist);
677 838
678is_linked:
679 /* 839 /*
680 * Wake up ( if active ) both the eventpoll wait list and the ->poll() 840 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
681 * wait list. 841 * wait list.
@@ -690,7 +850,7 @@ out_unlock:
690 850
691 /* We have to call this outside the lock */ 851 /* We have to call this outside the lock */
692 if (pwake) 852 if (pwake)
693 ep_poll_safewake(&psw, &ep->poll_wait); 853 ep_poll_safewake(&ep->poll_wait);
694 854
695 return 1; 855 return 1;
696} 856}
@@ -712,10 +872,9 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
712 add_wait_queue(whead, &pwq->wait); 872 add_wait_queue(whead, &pwq->wait);
713 list_add_tail(&pwq->llink, &epi->pwqlist); 873 list_add_tail(&pwq->llink, &epi->pwqlist);
714 epi->nwait++; 874 epi->nwait++;
715 } else { 875 } else
716 /* We have to signal that an error occurred */ 876 /* We have to signal that an error occurred */
717 epi->nwait = -1; 877 epi->nwait = -1;
718 }
719} 878}
720 879
721static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) 880static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
@@ -817,7 +976,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
817 976
818 /* We have to call this outside the lock */ 977 /* We have to call this outside the lock */
819 if (pwake) 978 if (pwake)
820 ep_poll_safewake(&psw, &ep->poll_wait); 979 ep_poll_safewake(&ep->poll_wait);
821 980
822 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n", 981 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
823 current, ep, tfile, fd)); 982 current, ep, tfile, fd));
@@ -891,137 +1050,74 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
891 1050
892 /* We have to call this outside the lock */ 1051 /* We have to call this outside the lock */
893 if (pwake) 1052 if (pwake)
894 ep_poll_safewake(&psw, &ep->poll_wait); 1053 ep_poll_safewake(&ep->poll_wait);
895 1054
896 return 0; 1055 return 0;
897} 1056}
898 1057
899static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, 1058static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, void *priv)
900 int maxevents)
901{ 1059{
902 int eventcnt, error = -EFAULT, pwake = 0; 1060 struct ep_send_events_data *esed = priv;
903 unsigned int revents; 1061 int eventcnt;
904 unsigned long flags; 1062 unsigned int revents;
905 struct epitem *epi, *nepi; 1063 struct epitem *epi;
906 struct list_head txlist; 1064 struct epoll_event __user *uevent;
907
908 INIT_LIST_HEAD(&txlist);
909
910 /*
911 * We need to lock this because we could be hit by
912 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
913 */
914 mutex_lock(&ep->mtx);
915
916 /*
917 * Steal the ready list, and re-init the original one to the
918 * empty list. Also, set ep->ovflist to NULL so that events
919 * happening while looping w/out locks, are not lost. We cannot
920 * have the poll callback to queue directly on ep->rdllist,
921 * because we are doing it in the loop below, in a lockless way.
922 */
923 spin_lock_irqsave(&ep->lock, flags);
924 list_splice(&ep->rdllist, &txlist);
925 INIT_LIST_HEAD(&ep->rdllist);
926 ep->ovflist = NULL;
927 spin_unlock_irqrestore(&ep->lock, flags);
928 1065
929 /* 1066 /*
930 * We can loop without lock because this is a task private list. 1067 * We can loop without lock because we are passed a task private list.
931 * We just splice'd out the ep->rdllist in ep_collect_ready_items(). 1068 * Items cannot vanish during the loop because ep_scan_ready_list() is
932 * Items cannot vanish during the loop because we are holding "mtx". 1069 * holding "mtx" during this call.
933 */ 1070 */
934 for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { 1071 for (eventcnt = 0, uevent = esed->events;
935 epi = list_first_entry(&txlist, struct epitem, rdllink); 1072 !list_empty(head) && eventcnt < esed->maxevents;) {
1073 epi = list_first_entry(head, struct epitem, rdllink);
936 1074
937 list_del_init(&epi->rdllink); 1075 list_del_init(&epi->rdllink);
938 1076
939 /* 1077 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
940 * Get the ready file event set. We can safely use the file 1078 epi->event.events;
941 * because we are holding the "mtx" and this will guarantee 1079
942 * that both the file and the item will not vanish. 1080 /*
943 */ 1081 * If the event mask intersect the caller-requested one,
944 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); 1082 * deliver the event to userspace. Again, ep_scan_ready_list()
945 revents &= epi->event.events; 1083 * is holding "mtx", so no operations coming from userspace
946 1084 * can change the item.
947 /* 1085 */
948 * Is the event mask intersect the caller-requested one, 1086 if (revents) {
949 * deliver the event to userspace. Again, we are holding 1087 if (__put_user(revents, &uevent->events) ||
950 * "mtx", so no operations coming from userspace can change 1088 __put_user(epi->event.data, &uevent->data))
951 * the item. 1089 return eventcnt ? eventcnt: -EFAULT;
952 */ 1090 eventcnt++;
953 if (revents) { 1091 uevent++;
954 if (__put_user(revents, 1092 if (epi->event.events & EPOLLONESHOT)
955 &events[eventcnt].events) || 1093 epi->event.events &= EP_PRIVATE_BITS;
956 __put_user(epi->event.data, 1094 else if (!(epi->event.events & EPOLLET))
957 &events[eventcnt].data)) 1095 /*
958 goto errxit; 1096 * If this file has been added with Level Trigger
959 if (epi->event.events & EPOLLONESHOT) 1097 * mode, we need to insert back inside the ready
960 epi->event.events &= EP_PRIVATE_BITS; 1098 * list, so that the next call to epoll_wait()
961 eventcnt++; 1099 * will check again the events availability.
962 } 1100 * At this point, noone can insert into ep->rdllist
963 /* 1101 * besides us. The epoll_ctl() callers are locked
964 * At this point, noone can insert into ep->rdllist besides 1102 * out by ep_scan_ready_list() holding "mtx" and
965 * us. The epoll_ctl() callers are locked out by us holding 1103 * the poll callback will queue them in ep->ovflist.
966 * "mtx" and the poll callback will queue them in ep->ovflist. 1104 */
967 */ 1105 list_add_tail(&epi->rdllink, &ep->rdllist);
968 if (!(epi->event.events & EPOLLET) && 1106 }
969 (revents & epi->event.events)) 1107 }
970 list_add_tail(&epi->rdllink, &ep->rdllist); 1108
971 } 1109 return eventcnt;
972 error = 0; 1110}
973
974errxit:
975
976 spin_lock_irqsave(&ep->lock, flags);
977 /*
978 * During the time we spent in the loop above, some other events
979 * might have been queued by the poll callback. We re-insert them
980 * inside the main ready-list here.
981 */
982 for (nepi = ep->ovflist; (epi = nepi) != NULL;
983 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
984 /*
985 * If the above loop quit with errors, the epoll item might still
986 * be linked to "txlist", and the list_splice() done below will
987 * take care of those cases.
988 */
989 if (!ep_is_linked(&epi->rdllink))
990 list_add_tail(&epi->rdllink, &ep->rdllist);
991 }
992 /*
993 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
994 * releasing the lock, events will be queued in the normal way inside
995 * ep->rdllist.
996 */
997 ep->ovflist = EP_UNACTIVE_PTR;
998
999 /*
1000 * In case of error in the event-send loop, or in case the number of
1001 * ready events exceeds the userspace limit, we need to splice the
1002 * "txlist" back inside ep->rdllist.
1003 */
1004 list_splice(&txlist, &ep->rdllist);
1005
1006 if (!list_empty(&ep->rdllist)) {
1007 /*
1008 * Wake up (if active) both the eventpoll wait list and the ->poll()
1009 * wait list (delayed after we release the lock).
1010 */
1011 if (waitqueue_active(&ep->wq))
1012 wake_up_locked(&ep->wq);
1013 if (waitqueue_active(&ep->poll_wait))
1014 pwake++;
1015 }
1016 spin_unlock_irqrestore(&ep->lock, flags);
1017 1111
1018 mutex_unlock(&ep->mtx); 1112static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
1113 int maxevents)
1114{
1115 struct ep_send_events_data esed;
1019 1116
1020 /* We have to call this outside the lock */ 1117 esed.maxevents = maxevents;
1021 if (pwake) 1118 esed.events = events;
1022 ep_poll_safewake(&psw, &ep->poll_wait);
1023 1119
1024 return eventcnt == 0 ? error: eventcnt; 1120 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1025} 1121}
1026 1122
1027static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1123static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1033,7 +1129,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1033 wait_queue_t wait; 1129 wait_queue_t wait;
1034 1130
1035 /* 1131 /*
1036 * Calculate the timeout by checking for the "infinite" value ( -1 ) 1132 * Calculate the timeout by checking for the "infinite" value (-1)
1037 * and the overflow condition. The passed timeout is in milliseconds, 1133 * and the overflow condition. The passed timeout is in milliseconds,
1038 * that why (t * HZ) / 1000. 1134 * that why (t * HZ) / 1000.
1039 */ 1135 */
@@ -1076,9 +1172,8 @@ retry:
1076 1172
1077 set_current_state(TASK_RUNNING); 1173 set_current_state(TASK_RUNNING);
1078 } 1174 }
1079
1080 /* Is it worth to try to dig for events ? */ 1175 /* Is it worth to try to dig for events ? */
1081 eavail = !list_empty(&ep->rdllist); 1176 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
1082 1177
1083 spin_unlock_irqrestore(&ep->lock, flags); 1178 spin_unlock_irqrestore(&ep->lock, flags);
1084 1179
@@ -1099,41 +1194,40 @@ retry:
1099 */ 1194 */
1100SYSCALL_DEFINE1(epoll_create1, int, flags) 1195SYSCALL_DEFINE1(epoll_create1, int, flags)
1101{ 1196{
1102 int error, fd = -1; 1197 int error;
1103 struct eventpoll *ep; 1198 struct eventpoll *ep = NULL;
1104 1199
1105 /* Check the EPOLL_* constant for consistency. */ 1200 /* Check the EPOLL_* constant for consistency. */
1106 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1201 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1107 1202
1108 if (flags & ~EPOLL_CLOEXEC)
1109 return -EINVAL;
1110
1111 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", 1203 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1112 current, flags)); 1204 current, flags));
1113 1205
1206 error = -EINVAL;
1207 if (flags & ~EPOLL_CLOEXEC)
1208 goto error_return;
1209
1114 /* 1210 /*
1115 * Create the internal data structure ( "struct eventpoll" ). 1211 * Create the internal data structure ("struct eventpoll").
1116 */ 1212 */
1117 error = ep_alloc(&ep); 1213 error = ep_alloc(&ep);
1118 if (error < 0) { 1214 if (error < 0)
1119 fd = error;
1120 goto error_return; 1215 goto error_return;
1121 }
1122 1216
1123 /* 1217 /*
1124 * Creates all the items needed to setup an eventpoll file. That is, 1218 * Creates all the items needed to setup an eventpoll file. That is,
1125 * a file structure and a free file descriptor. 1219 * a file structure and a free file descriptor.
1126 */ 1220 */
1127 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1221 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1128 flags & O_CLOEXEC); 1222 flags & O_CLOEXEC);
1129 if (fd < 0) 1223 if (error < 0)
1130 ep_free(ep); 1224 ep_free(ep);
1131 1225
1132error_return: 1226error_return:
1133 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 1227 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1134 current, flags, fd)); 1228 current, flags, error));
1135 1229
1136 return fd; 1230 return error;
1137} 1231}
1138 1232
1139SYSCALL_DEFINE1(epoll_create, int, size) 1233SYSCALL_DEFINE1(epoll_create, int, size)
@@ -1359,7 +1453,10 @@ static int __init eventpoll_init(void)
1359 EP_ITEM_COST; 1453 EP_ITEM_COST;
1360 1454
1361 /* Initialize the structure used to perform safe poll wait head wake ups */ 1455 /* Initialize the structure used to perform safe poll wait head wake ups */
1362 ep_poll_safewake_init(&psw); 1456 ep_nested_calls_init(&poll_safewake_ncalls);
1457
1458 /* Initialize the structure used to perform file's f_op->poll() calls */
1459 ep_nested_calls_init(&poll_readywalk_ncalls);
1363 1460
1364 /* Allocates slab cache used to allocate "struct epitem" items */ 1461 /* Allocates slab cache used to allocate "struct epitem" items */
1365 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 1462 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),