aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/eventpoll.c511
1 files changed, 304 insertions, 207 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c5c424f23fd5..8a23a91e1377 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * fs/eventpoll.c (Efficent event polling implementation) 2 * fs/eventpoll.c (Efficient event retrieval implementation)
3 * Copyright (C) 2001,...,2007 Davide Libenzi 3 * Copyright (C) 2001,...,2009 Davide Libenzi
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -92,8 +92,8 @@
92/* Epoll private bits inside the event mask */ 92/* Epoll private bits inside the event mask */
93#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) 93#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
94 94
95/* Maximum number of poll wake up nests we are allowing */ 95/* Maximum nesting depth allowed inside epoll sets */
96#define EP_MAX_POLLWAKE_NESTS 4 96#define EP_MAX_NESTS 4
97 97
98/* Maximum msec timeout value storeable in a long int */ 98/* Maximum msec timeout value storeable in a long int */
99#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) 99#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +110,21 @@ struct epoll_filefd {
110}; 110};
111 111
112/* 112/*
113 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". 113 * Structure used to track possible nested calls, for too deep recursions
114 * It is used to keep track on all tasks that are currently inside the wake_up() code 114 * and loop cycles.
115 * to 1) short-circuit the one coming from the same task and same wait queue head
116 * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
117 * 3) let go the ones coming from other tasks.
118 */ 115 */
119struct wake_task_node { 116struct nested_call_node {
120 struct list_head llink; 117 struct list_head llink;
121 struct task_struct *task; 118 struct task_struct *task;
122 wait_queue_head_t *wq; 119 void *cookie;
123}; 120};
124 121
125/* 122/*
126 * This is used to implement the safe poll wake up avoiding to reenter 123 * This structure is used as collector for nested calls, to check for
127 * the poll callback from inside wake_up(). 124 * maximum recursion depth and loop cycles.
128 */ 125 */
129struct poll_safewake { 126struct nested_calls {
130 struct list_head wake_task_list; 127 struct list_head tasks_call_list;
131 spinlock_t lock; 128 spinlock_t lock;
132}; 129};
133 130
@@ -231,6 +228,12 @@ struct ep_pqueue {
231 struct epitem *epi; 228 struct epitem *epi;
232}; 229};
233 230
231/* Used by the ep_send_events() function as callback private data */
232struct ep_send_events_data {
233 int maxevents;
234 struct epoll_event __user *events;
235};
236
234/* 237/*
235 * Configuration options available inside /proc/sys/fs/epoll/ 238 * Configuration options available inside /proc/sys/fs/epoll/
236 */ 239 */
@@ -242,8 +245,11 @@ static int max_user_watches __read_mostly;
242 */ 245 */
243static DEFINE_MUTEX(epmutex); 246static DEFINE_MUTEX(epmutex);
244 247
245/* Safe wake up implementation */ 248/* Used for safe wake up implementation */
246static struct poll_safewake psw; 249static struct nested_calls poll_safewake_ncalls;
250
251/* Used to call file's f_op->poll() under the nested calls boundaries */
252static struct nested_calls poll_readywalk_ncalls;
247 253
248/* Slab cache used to allocate "struct epitem" */ 254/* Slab cache used to allocate "struct epitem" */
249static struct kmem_cache *epi_cache __read_mostly; 255static struct kmem_cache *epi_cache __read_mostly;
@@ -312,64 +318,96 @@ static inline int ep_op_has_event(int op)
312} 318}
313 319
314/* Initialize the poll safe wake up structure */ 320/* Initialize the poll safe wake up structure */
315static void ep_poll_safewake_init(struct poll_safewake *psw) 321static void ep_nested_calls_init(struct nested_calls *ncalls)
316{ 322{
317 323 INIT_LIST_HEAD(&ncalls->tasks_call_list);
318 INIT_LIST_HEAD(&psw->wake_task_list); 324 spin_lock_init(&ncalls->lock);
319 spin_lock_init(&psw->lock);
320} 325}
321 326
322/* 327/**
323 * Perform a safe wake up of the poll wait list. The problem is that 328 * ep_call_nested - Perform a bound (possibly) nested call, by checking
324 * with the new callback'd wake up system, it is possible that the 329 * that the recursion limit is not exceeded, and that
325 * poll callback is reentered from inside the call to wake_up() done 330 * the same nested call (by the meaning of same cookie) is
326 * on the poll wait queue head. The rule is that we cannot reenter the 331 * not re-entered.
327 * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, 332 *
328 * and we cannot reenter the same wait queue head at all. This will 333 * @ncalls: Pointer to the nested_calls structure to be used for this call.
329 * enable to have a hierarchy of epoll file descriptor of no more than 334 * @max_nests: Maximum number of allowed nesting calls.
330 * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock 335 * @nproc: Nested call core function pointer.
331 * because this one gets called by the poll callback, that in turn is called 336 * @priv: Opaque data to be passed to the @nproc callback.
332 * from inside a wake_up(), that might be called from irq context. 337 * @cookie: Cookie to be used to identify this nested call.
338 *
339 * Returns: The code returned by the @nproc callback, or -1 if
340 * the maximum recursion limit has been exceeded or a loop is detected.
333 */ 341 */
334static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) 342static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
343 int (*nproc)(void *, void *, int), void *priv,
344 void *cookie)
335{ 345{
336 int wake_nests = 0; 346 int error, call_nests = 0;
337 unsigned long flags; 347 unsigned long flags;
338 struct task_struct *this_task = current; 348 struct task_struct *this_task = current;
339 struct list_head *lsthead = &psw->wake_task_list; 349 struct list_head *lsthead = &ncalls->tasks_call_list;
340 struct wake_task_node *tncur; 350 struct nested_call_node *tncur;
341 struct wake_task_node tnode; 351 struct nested_call_node tnode;
342 352
343 spin_lock_irqsave(&psw->lock, flags); 353 spin_lock_irqsave(&ncalls->lock, flags);
344 354
345 /* Try to see if the current task is already inside this wakeup call */ 355 /*
356 * Try to see if the current task is already inside this wakeup call.
357 * We use a list here, since the population inside this set is always
358 * very much limited.
359 */
346 list_for_each_entry(tncur, lsthead, llink) { 360 list_for_each_entry(tncur, lsthead, llink) {
347 361 if (tncur->task == this_task &&
348 if (tncur->wq == wq || 362 (tncur->cookie == cookie || ++call_nests > max_nests)) {
349 (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
350 /* 363 /*
351 * Ops ... loop detected or maximum nest level reached. 364 * Ops ... loop detected or maximum nest level reached.
352 * We abort this wake by breaking the cycle itself. 365 * We abort this wake by breaking the cycle itself.
353 */ 366 */
354 spin_unlock_irqrestore(&psw->lock, flags); 367 spin_unlock_irqrestore(&ncalls->lock, flags);
355 return; 368
369 return -1;
356 } 370 }
357 } 371 }
358 372
359 /* Add the current task to the list */ 373 /* Add the current task and cookie to the list */
360 tnode.task = this_task; 374 tnode.task = this_task;
361 tnode.wq = wq; 375 tnode.cookie = cookie;
362 list_add(&tnode.llink, lsthead); 376 list_add(&tnode.llink, lsthead);
363 377
364 spin_unlock_irqrestore(&psw->lock, flags); 378 spin_unlock_irqrestore(&ncalls->lock, flags);
365 379
366 /* Do really wake up now */ 380 /* Call the nested function */
367 wake_up_nested(wq, 1 + wake_nests); 381 error = (*nproc)(priv, cookie, call_nests);
368 382
369 /* Remove the current task from the list */ 383 /* Remove the current task from the list */
370 spin_lock_irqsave(&psw->lock, flags); 384 spin_lock_irqsave(&ncalls->lock, flags);
371 list_del(&tnode.llink); 385 list_del(&tnode.llink);
372 spin_unlock_irqrestore(&psw->lock, flags); 386 spin_unlock_irqrestore(&ncalls->lock, flags);
387
388 return error;
389}
390
391static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
392{
393 wake_up_nested((wait_queue_head_t *) cookie, 1 + call_nests);
394 return 0;
395}
396
397/*
398 * Perform a safe wake up of the poll wait list. The problem is that
399 * with the new callback'd wake up system, it is possible that the
400 * poll callback is reentered from inside the call to wake_up() done
401 * on the poll wait queue head. The rule is that we cannot reenter the
402 * wake up code from the same task more than EP_MAX_NESTS times,
403 * and we cannot reenter the same wait queue head at all. This will
404 * enable to have a hierarchy of epoll file descriptor of no more than
405 * EP_MAX_NESTS deep.
406 */
407static void ep_poll_safewake(wait_queue_head_t *wq)
408{
409 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
410 ep_poll_wakeup_proc, NULL, wq);
373} 411}
374 412
375/* 413/*
@@ -397,6 +435,104 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
397 } 435 }
398} 436}
399 437
438/**
439 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
440 * the scan code, to call f_op->poll(). Also allows for
441 * O(NumReady) performance.
442 *
443 * @ep: Pointer to the epoll private data structure.
444 * @sproc: Pointer to the scan callback.
445 * @priv: Private opaque data passed to the @sproc callback.
446 *
447 * Returns: The same integer error code returned by the @sproc callback.
448 */
449static int ep_scan_ready_list(struct eventpoll *ep,
450 int (*sproc)(struct eventpoll *,
451 struct list_head *, void *),
452 void *priv)
453{
454 int error, pwake = 0;
455 unsigned long flags;
456 struct epitem *epi, *nepi;
457 struct list_head txlist;
458
459 INIT_LIST_HEAD(&txlist);
460
461 /*
462 * We need to lock this because we could be hit by
463 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
464 */
465 mutex_lock(&ep->mtx);
466
467 /*
468 * Steal the ready list, and re-init the original one to the
469 * empty list. Also, set ep->ovflist to NULL so that events
470 * happening while looping w/out locks, are not lost. We cannot
471 * have the poll callback to queue directly on ep->rdllist,
472 * because we want the "sproc" callback to be able to do it
473 * in a lockless way.
474 */
475 spin_lock_irqsave(&ep->lock, flags);
476 list_splice(&ep->rdllist, &txlist);
477 INIT_LIST_HEAD(&ep->rdllist);
478 ep->ovflist = NULL;
479 spin_unlock_irqrestore(&ep->lock, flags);
480
481 /*
482 * Now call the callback function.
483 */
484 error = (*sproc)(ep, &txlist, priv);
485
486 spin_lock_irqsave(&ep->lock, flags);
487 /*
488 * During the time we spent inside the "sproc" callback, some
489 * other events might have been queued by the poll callback.
490 * We re-insert them inside the main ready-list here.
491 */
492 for (nepi = ep->ovflist; (epi = nepi) != NULL;
493 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
494 /*
495 * We need to check if the item is already in the list.
496 * During the "sproc" callback execution time, items are
497 * queued into ->ovflist but the "txlist" might already
498 * contain them, and the list_splice() below takes care of them.
499 */
500 if (!ep_is_linked(&epi->rdllink))
501 list_add_tail(&epi->rdllink, &ep->rdllist);
502 }
503 /*
504 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
505 * releasing the lock, events will be queued in the normal way inside
506 * ep->rdllist.
507 */
508 ep->ovflist = EP_UNACTIVE_PTR;
509
510 /*
511 * Quickly re-inject items left on "txlist".
512 */
513 list_splice(&txlist, &ep->rdllist);
514
515 if (!list_empty(&ep->rdllist)) {
516 /*
517 * Wake up (if active) both the eventpoll wait list and the ->poll()
518 * wait list (delayed after we release the lock).
519 */
520 if (waitqueue_active(&ep->wq))
521 wake_up_locked(&ep->wq);
522 if (waitqueue_active(&ep->poll_wait))
523 pwake++;
524 }
525 spin_unlock_irqrestore(&ep->lock, flags);
526
527 mutex_unlock(&ep->mtx);
528
529 /* We have to call this outside the lock */
530 if (pwake)
531 ep_poll_safewake(&ep->poll_wait);
532
533 return error;
534}
535
400/* 536/*
401 * Removes a "struct epitem" from the eventpoll RB tree and deallocates 537 * Removes a "struct epitem" from the eventpoll RB tree and deallocates
402 * all the associated resources. Must be called with "mtx" held. 538 * all the associated resources. Must be called with "mtx" held.
@@ -447,7 +583,7 @@ static void ep_free(struct eventpoll *ep)
447 583
448 /* We need to release all tasks waiting for these file */ 584 /* We need to release all tasks waiting for these file */
449 if (waitqueue_active(&ep->poll_wait)) 585 if (waitqueue_active(&ep->poll_wait))
450 ep_poll_safewake(&psw, &ep->poll_wait); 586 ep_poll_safewake(&ep->poll_wait);
451 587
452 /* 588 /*
453 * We need to lock this because we could be hit by 589 * We need to lock this because we could be hit by
@@ -496,22 +632,49 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
496 return 0; 632 return 0;
497} 633}
498 634
635static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv)
636{
637 struct epitem *epi, *tmp;
638
639 list_for_each_entry_safe(epi, tmp, head, rdllink) {
640 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
641 epi->event.events)
642 return POLLIN | POLLRDNORM;
643 else
644 /*
645 * Item has been dropped into the ready list by the poll
646 * callback, but it's not actually ready, as far as
647 * caller requested events goes. We can remove it here.
648 */
649 list_del_init(&epi->rdllink);
650 }
651
652 return 0;
653}
654
655static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
656{
657 return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
658}
659
499static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 660static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
500{ 661{
501 unsigned int pollflags = 0; 662 int pollflags;
502 unsigned long flags;
503 struct eventpoll *ep = file->private_data; 663 struct eventpoll *ep = file->private_data;
504 664
505 /* Insert inside our poll wait queue */ 665 /* Insert inside our poll wait queue */
506 poll_wait(file, &ep->poll_wait, wait); 666 poll_wait(file, &ep->poll_wait, wait);
507 667
508 /* Check our condition */ 668 /*
509 spin_lock_irqsave(&ep->lock, flags); 669 * Proceed to find out if wanted events are really available inside
510 if (!list_empty(&ep->rdllist)) 670 * the ready list. This needs to be done under ep_call_nested()
511 pollflags = POLLIN | POLLRDNORM; 671 * supervision, since the call to f_op->poll() done on listed files
512 spin_unlock_irqrestore(&ep->lock, flags); 672 * could re-enter here.
673 */
674 pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
675 ep_poll_readyevents_proc, ep, ep);
513 676
514 return pollflags; 677 return pollflags != -1 ? pollflags: 0;
515} 678}
516 679
517/* File callbacks that implement the eventpoll file behaviour */ 680/* File callbacks that implement the eventpoll file behaviour */
@@ -541,7 +704,7 @@ void eventpoll_release_file(struct file *file)
541 * We don't want to get "file->f_lock" because it is not 704 * We don't want to get "file->f_lock" because it is not
542 * necessary. It is not necessary because we're in the "struct file" 705 * necessary. It is not necessary because we're in the "struct file"
543 * cleanup path, and this means that noone is using this file anymore. 706 * cleanup path, and this means that noone is using this file anymore.
544 * So, for example, epoll_ctl() cannot hit here sicne if we reach this 707 * So, for example, epoll_ctl() cannot hit here since if we reach this
545 * point, the file counter already went to zero and fget() would fail. 708 * point, the file counter already went to zero and fget() would fail.
546 * The only hit might come from ep_free() but by holding the mutex 709 * The only hit might come from ep_free() but by holding the mutex
547 * will correctly serialize the operation. We do need to acquire 710 * will correctly serialize the operation. We do need to acquire
@@ -670,12 +833,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
670 } 833 }
671 834
672 /* If this file is already in the ready list we exit soon */ 835 /* If this file is already in the ready list we exit soon */
673 if (ep_is_linked(&epi->rdllink)) 836 if (!ep_is_linked(&epi->rdllink))
674 goto is_linked; 837 list_add_tail(&epi->rdllink, &ep->rdllist);
675
676 list_add_tail(&epi->rdllink, &ep->rdllist);
677 838
678is_linked:
679 /* 839 /*
680 * Wake up ( if active ) both the eventpoll wait list and the ->poll() 840 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
681 * wait list. 841 * wait list.
@@ -690,7 +850,7 @@ out_unlock:
690 850
691 /* We have to call this outside the lock */ 851 /* We have to call this outside the lock */
692 if (pwake) 852 if (pwake)
693 ep_poll_safewake(&psw, &ep->poll_wait); 853 ep_poll_safewake(&ep->poll_wait);
694 854
695 return 1; 855 return 1;
696} 856}
@@ -712,10 +872,9 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
712 add_wait_queue(whead, &pwq->wait); 872 add_wait_queue(whead, &pwq->wait);
713 list_add_tail(&pwq->llink, &epi->pwqlist); 873 list_add_tail(&pwq->llink, &epi->pwqlist);
714 epi->nwait++; 874 epi->nwait++;
715 } else { 875 } else
716 /* We have to signal that an error occurred */ 876 /* We have to signal that an error occurred */
717 epi->nwait = -1; 877 epi->nwait = -1;
718 }
719} 878}
720 879
721static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) 880static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
@@ -817,7 +976,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
817 976
818 /* We have to call this outside the lock */ 977 /* We have to call this outside the lock */
819 if (pwake) 978 if (pwake)
820 ep_poll_safewake(&psw, &ep->poll_wait); 979 ep_poll_safewake(&ep->poll_wait);
821 980
822 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n", 981 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
823 current, ep, tfile, fd)); 982 current, ep, tfile, fd));
@@ -891,137 +1050,74 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
891 1050
892 /* We have to call this outside the lock */ 1051 /* We have to call this outside the lock */
893 if (pwake) 1052 if (pwake)
894 ep_poll_safewake(&psw, &ep->poll_wait); 1053 ep_poll_safewake(&ep->poll_wait);
895 1054
896 return 0; 1055 return 0;
897} 1056}
898 1057
899static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, 1058static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, void *priv)
900 int maxevents)
901{ 1059{
902 int eventcnt, error = -EFAULT, pwake = 0; 1060 struct ep_send_events_data *esed = priv;
903 unsigned int revents; 1061 int eventcnt;
904 unsigned long flags; 1062 unsigned int revents;
905 struct epitem *epi, *nepi; 1063 struct epitem *epi;
906 struct list_head txlist; 1064 struct epoll_event __user *uevent;
907
908 INIT_LIST_HEAD(&txlist);
909
910 /*
911 * We need to lock this because we could be hit by
912 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
913 */
914 mutex_lock(&ep->mtx);
915
916 /*
917 * Steal the ready list, and re-init the original one to the
918 * empty list. Also, set ep->ovflist to NULL so that events
919 * happening while looping w/out locks, are not lost. We cannot
920 * have the poll callback to queue directly on ep->rdllist,
921 * because we are doing it in the loop below, in a lockless way.
922 */
923 spin_lock_irqsave(&ep->lock, flags);
924 list_splice(&ep->rdllist, &txlist);
925 INIT_LIST_HEAD(&ep->rdllist);
926 ep->ovflist = NULL;
927 spin_unlock_irqrestore(&ep->lock, flags);
928 1065
929 /* 1066 /*
930 * We can loop without lock because this is a task private list. 1067 * We can loop without lock because we are passed a task private list.
931 * We just splice'd out the ep->rdllist in ep_collect_ready_items(). 1068 * Items cannot vanish during the loop because ep_scan_ready_list() is
932 * Items cannot vanish during the loop because we are holding "mtx". 1069 * holding "mtx" during this call.
933 */ 1070 */
934 for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { 1071 for (eventcnt = 0, uevent = esed->events;
935 epi = list_first_entry(&txlist, struct epitem, rdllink); 1072 !list_empty(head) && eventcnt < esed->maxevents;) {
1073 epi = list_first_entry(head, struct epitem, rdllink);
936 1074
937 list_del_init(&epi->rdllink); 1075 list_del_init(&epi->rdllink);
938 1076
939 /* 1077 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
940 * Get the ready file event set. We can safely use the file 1078 epi->event.events;
941 * because we are holding the "mtx" and this will guarantee 1079
942 * that both the file and the item will not vanish. 1080 /*
943 */ 1081 * If the event mask intersects the caller-requested one,
944 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); 1082 * deliver the event to userspace. Again, ep_scan_ready_list()
945 revents &= epi->event.events; 1083 * is holding "mtx", so no operations coming from userspace
946 1084 * can change the item.
947 /* 1085 */
948 * Is the event mask intersect the caller-requested one, 1086 if (revents) {
949 * deliver the event to userspace. Again, we are holding 1087 if (__put_user(revents, &uevent->events) ||
950 * "mtx", so no operations coming from userspace can change 1088 __put_user(epi->event.data, &uevent->data))
951 * the item. 1089 return eventcnt ? eventcnt: -EFAULT;
952 */ 1090 eventcnt++;
953 if (revents) { 1091 uevent++;
954 if (__put_user(revents, 1092 if (epi->event.events & EPOLLONESHOT)
955 &events[eventcnt].events) || 1093 epi->event.events &= EP_PRIVATE_BITS;
956 __put_user(epi->event.data, 1094 else if (!(epi->event.events & EPOLLET))
957 &events[eventcnt].data)) 1095 /*
958 goto errxit; 1096 * If this file has been added with Level Trigger
959 if (epi->event.events & EPOLLONESHOT) 1097 * mode, we need to insert back inside the ready
960 epi->event.events &= EP_PRIVATE_BITS; 1098 * list, so that the next call to epoll_wait()
961 eventcnt++; 1099 * will check again the events availability.
962 } 1100 * At this point, noone can insert into ep->rdllist
963 /* 1101 * besides us. The epoll_ctl() callers are locked
964 * At this point, noone can insert into ep->rdllist besides 1102 * out by ep_scan_ready_list() holding "mtx" and
965 * us. The epoll_ctl() callers are locked out by us holding 1103 * the poll callback will queue them in ep->ovflist.
966 * "mtx" and the poll callback will queue them in ep->ovflist. 1104 */
967 */ 1105 list_add_tail(&epi->rdllink, &ep->rdllist);
968 if (!(epi->event.events & EPOLLET) && 1106 }
969 (revents & epi->event.events)) 1107 }
970 list_add_tail(&epi->rdllink, &ep->rdllist); 1108
971 } 1109 return eventcnt;
972 error = 0; 1110}
973
974errxit:
975
976 spin_lock_irqsave(&ep->lock, flags);
977 /*
978 * During the time we spent in the loop above, some other events
979 * might have been queued by the poll callback. We re-insert them
980 * inside the main ready-list here.
981 */
982 for (nepi = ep->ovflist; (epi = nepi) != NULL;
983 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
984 /*
985 * If the above loop quit with errors, the epoll item might still
986 * be linked to "txlist", and the list_splice() done below will
987 * take care of those cases.
988 */
989 if (!ep_is_linked(&epi->rdllink))
990 list_add_tail(&epi->rdllink, &ep->rdllist);
991 }
992 /*
993 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
994 * releasing the lock, events will be queued in the normal way inside
995 * ep->rdllist.
996 */
997 ep->ovflist = EP_UNACTIVE_PTR;
998
999 /*
1000 * In case of error in the event-send loop, or in case the number of
1001 * ready events exceeds the userspace limit, we need to splice the
1002 * "txlist" back inside ep->rdllist.
1003 */
1004 list_splice(&txlist, &ep->rdllist);
1005
1006 if (!list_empty(&ep->rdllist)) {
1007 /*
1008 * Wake up (if active) both the eventpoll wait list and the ->poll()
1009 * wait list (delayed after we release the lock).
1010 */
1011 if (waitqueue_active(&ep->wq))
1012 wake_up_locked(&ep->wq);
1013 if (waitqueue_active(&ep->poll_wait))
1014 pwake++;
1015 }
1016 spin_unlock_irqrestore(&ep->lock, flags);
1017 1111
1018 mutex_unlock(&ep->mtx); 1112static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
1113 int maxevents)
1114{
1115 struct ep_send_events_data esed;
1019 1116
1020 /* We have to call this outside the lock */ 1117 esed.maxevents = maxevents;
1021 if (pwake) 1118 esed.events = events;
1022 ep_poll_safewake(&psw, &ep->poll_wait);
1023 1119
1024 return eventcnt == 0 ? error: eventcnt; 1120 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1025} 1121}
1026 1122
1027static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1123static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1033,7 +1129,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1033 wait_queue_t wait; 1129 wait_queue_t wait;
1034 1130
1035 /* 1131 /*
1036 * Calculate the timeout by checking for the "infinite" value ( -1 ) 1132 * Calculate the timeout by checking for the "infinite" value (-1)
1037 * and the overflow condition. The passed timeout is in milliseconds, 1133 * and the overflow condition. The passed timeout is in milliseconds,
1038 * that why (t * HZ) / 1000. 1134 * that why (t * HZ) / 1000.
1039 */ 1135 */
@@ -1076,9 +1172,8 @@ retry:
1076 1172
1077 set_current_state(TASK_RUNNING); 1173 set_current_state(TASK_RUNNING);
1078 } 1174 }
1079
1080 /* Is it worth to try to dig for events ? */ 1175 /* Is it worth to try to dig for events ? */
1081 eavail = !list_empty(&ep->rdllist); 1176 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
1082 1177
1083 spin_unlock_irqrestore(&ep->lock, flags); 1178 spin_unlock_irqrestore(&ep->lock, flags);
1084 1179
@@ -1099,41 +1194,40 @@ retry:
1099 */ 1194 */
1100SYSCALL_DEFINE1(epoll_create1, int, flags) 1195SYSCALL_DEFINE1(epoll_create1, int, flags)
1101{ 1196{
1102 int error, fd = -1; 1197 int error;
1103 struct eventpoll *ep; 1198 struct eventpoll *ep = NULL;
1104 1199
1105 /* Check the EPOLL_* constant for consistency. */ 1200 /* Check the EPOLL_* constant for consistency. */
1106 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1201 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1107 1202
1108 if (flags & ~EPOLL_CLOEXEC)
1109 return -EINVAL;
1110
1111 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", 1203 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1112 current, flags)); 1204 current, flags));
1113 1205
1206 error = -EINVAL;
1207 if (flags & ~EPOLL_CLOEXEC)
1208 goto error_return;
1209
1114 /* 1210 /*
1115 * Create the internal data structure ( "struct eventpoll" ). 1211 * Create the internal data structure ("struct eventpoll").
1116 */ 1212 */
1117 error = ep_alloc(&ep); 1213 error = ep_alloc(&ep);
1118 if (error < 0) { 1214 if (error < 0)
1119 fd = error;
1120 goto error_return; 1215 goto error_return;
1121 }
1122 1216
1123 /* 1217 /*
1124 * Creates all the items needed to setup an eventpoll file. That is, 1218 * Creates all the items needed to setup an eventpoll file. That is,
1125 * a file structure and a free file descriptor. 1219 * a file structure and a free file descriptor.
1126 */ 1220 */
1127 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1221 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1128 flags & O_CLOEXEC); 1222 flags & O_CLOEXEC);
1129 if (fd < 0) 1223 if (error < 0)
1130 ep_free(ep); 1224 ep_free(ep);
1131 1225
1132error_return: 1226error_return:
1133 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", 1227 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1134 current, flags, fd)); 1228 current, flags, error));
1135 1229
1136 return fd; 1230 return error;
1137} 1231}
1138 1232
1139SYSCALL_DEFINE1(epoll_create, int, size) 1233SYSCALL_DEFINE1(epoll_create, int, size)
@@ -1359,7 +1453,10 @@ static int __init eventpoll_init(void)
1359 EP_ITEM_COST; 1453 EP_ITEM_COST;
1360 1454
1361 /* Initialize the structure used to perform safe poll wait head wake ups */ 1455 /* Initialize the structure used to perform safe poll wait head wake ups */
1362 ep_poll_safewake_init(&psw); 1456 ep_nested_calls_init(&poll_safewake_ncalls);
1457
1458 /* Initialize the structure used to perform file's f_op->poll() calls */
1459 ep_nested_calls_init(&poll_readywalk_ncalls);
1363 1460
1364 /* Allocates slab cache used to allocate "struct epitem" items */ 1461 /* Allocates slab cache used to allocate "struct epitem" items */
1365 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 1462 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),