Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--  fs/eventpoll.c  614
1 file changed, 339 insertions, 275 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c5c424f23fd5..a89f370fadb5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * fs/eventpoll.c (Efficent event polling implementation) 2 * fs/eventpoll.c (Efficient event retrieval implementation)
3 * Copyright (C) 2001,...,2007 Davide Libenzi 3 * Copyright (C) 2001,...,2009 Davide Libenzi
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify 5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by 6 * it under the terms of the GNU General Public License as published by
@@ -71,29 +71,11 @@
71 * a better scalability. 71 * a better scalability.
72 */ 72 */
73 73
74#define DEBUG_EPOLL 0
75
76#if DEBUG_EPOLL > 0
77#define DPRINTK(x) printk x
78#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
79#else /* #if DEBUG_EPOLL > 0 */
80#define DPRINTK(x) (void) 0
81#define DNPRINTK(n, x) (void) 0
82#endif /* #if DEBUG_EPOLL > 0 */
83
84#define DEBUG_EPI 0
85
86#if DEBUG_EPI != 0
87#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
88#else /* #if DEBUG_EPI != 0 */
89#define EPI_SLAB_DEBUG 0
90#endif /* #if DEBUG_EPI != 0 */
91
92/* Epoll private bits inside the event mask */ 74/* Epoll private bits inside the event mask */
93#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) 75#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
94 76
95/* Maximum number of poll wake up nests we are allowing */ 77/* Maximum number of nesting allowed inside epoll sets */
96#define EP_MAX_POLLWAKE_NESTS 4 78#define EP_MAX_NESTS 4
97 79
98/* Maximum msec timeout value storeable in a long int */ 80/* Maximum msec timeout value storeable in a long int */
99#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) 81#define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +92,21 @@ struct epoll_filefd {
110}; 92};
111 93
112/* 94/*
113 * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". 95 * Structure used to track possible nested calls, for too deep recursions
114 * It is used to keep track on all tasks that are currently inside the wake_up() code 96 * and loop cycles.
115 * to 1) short-circuit the one coming from the same task and same wait queue head
116 * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
117 * 3) let go the ones coming from other tasks.
118 */ 97 */
119struct wake_task_node { 98struct nested_call_node {
120 struct list_head llink; 99 struct list_head llink;
121 struct task_struct *task; 100 void *cookie;
122 wait_queue_head_t *wq; 101 int cpu;
123}; 102};
124 103
125/* 104/*
126 * This is used to implement the safe poll wake up avoiding to reenter 105 * This structure is used as collector for nested calls, to check for
 127 * the poll callback from inside wake_up(). 106 * maximum recursion depth and loop cycles.
128 */ 107 */
129struct poll_safewake { 108struct nested_calls {
130 struct list_head wake_task_list; 109 struct list_head tasks_call_list;
131 spinlock_t lock; 110 spinlock_t lock;
132}; 111};
133 112
@@ -213,7 +192,7 @@ struct eppoll_entry {
213 struct list_head llink; 192 struct list_head llink;
214 193
215 /* The "base" pointer is set to the container "struct epitem" */ 194 /* The "base" pointer is set to the container "struct epitem" */
216 void *base; 195 struct epitem *base;
217 196
218 /* 197 /*
219 * Wait queue item that will be linked to the target file wait 198 * Wait queue item that will be linked to the target file wait
@@ -231,6 +210,12 @@ struct ep_pqueue {
231 struct epitem *epi; 210 struct epitem *epi;
232}; 211};
233 212
213/* Used by the ep_send_events() function as callback private data */
214struct ep_send_events_data {
215 int maxevents;
216 struct epoll_event __user *events;
217};
218
234/* 219/*
235 * Configuration options available inside /proc/sys/fs/epoll/ 220 * Configuration options available inside /proc/sys/fs/epoll/
236 */ 221 */
@@ -242,8 +227,11 @@ static int max_user_watches __read_mostly;
242 */ 227 */
243static DEFINE_MUTEX(epmutex); 228static DEFINE_MUTEX(epmutex);
244 229
245/* Safe wake up implementation */ 230/* Used for safe wake up implementation */
246static struct poll_safewake psw; 231static struct nested_calls poll_safewake_ncalls;
232
233/* Used to call file's f_op->poll() under the nested calls boundaries */
234static struct nested_calls poll_readywalk_ncalls;
247 235
248/* Slab cache used to allocate "struct epitem" */ 236/* Slab cache used to allocate "struct epitem" */
249static struct kmem_cache *epi_cache __read_mostly; 237static struct kmem_cache *epi_cache __read_mostly;
@@ -312,89 +300,230 @@ static inline int ep_op_has_event(int op)
312} 300}
313 301
314/* Initialize the poll safe wake up structure */ 302/* Initialize the poll safe wake up structure */
315static void ep_poll_safewake_init(struct poll_safewake *psw) 303static void ep_nested_calls_init(struct nested_calls *ncalls)
316{ 304{
317 305 INIT_LIST_HEAD(&ncalls->tasks_call_list);
318 INIT_LIST_HEAD(&psw->wake_task_list); 306 spin_lock_init(&ncalls->lock);
319 spin_lock_init(&psw->lock);
320} 307}
321 308
322/* 309/**
323 * Perform a safe wake up of the poll wait list. The problem is that 310 * ep_call_nested - Perform a bound (possibly) nested call, by checking
324 * with the new callback'd wake up system, it is possible that the 311 * that the recursion limit is not exceeded, and that
325 * poll callback is reentered from inside the call to wake_up() done 312 * the same nested call (by the meaning of same cookie) is
 326 * on the poll wait queue head. The rule is that we cannot reenter the 313 * not re-entered.
327 * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, 314 *
328 * and we cannot reenter the same wait queue head at all. This will 315 * @ncalls: Pointer to the nested_calls structure to be used for this call.
329 * enable to have a hierarchy of epoll file descriptor of no more than 316 * @max_nests: Maximum number of allowed nesting calls.
330 * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock 317 * @nproc: Nested call core function pointer.
331 * because this one gets called by the poll callback, that in turn is called 318 * @priv: Opaque data to be passed to the @nproc callback.
332 * from inside a wake_up(), that might be called from irq context. 319 * @cookie: Cookie to be used to identify this nested call.
320 *
321 * Returns: Returns the code returned by the @nproc callback, or -1 if
322 * the maximum recursion limit has been exceeded.
333 */ 323 */
334static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) 324static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
325 int (*nproc)(void *, void *, int), void *priv,
326 void *cookie)
335{ 327{
336 int wake_nests = 0; 328 int error, call_nests = 0;
337 unsigned long flags; 329 unsigned long flags;
338 struct task_struct *this_task = current; 330 int this_cpu = get_cpu();
339 struct list_head *lsthead = &psw->wake_task_list; 331 struct list_head *lsthead = &ncalls->tasks_call_list;
340 struct wake_task_node *tncur; 332 struct nested_call_node *tncur;
341 struct wake_task_node tnode; 333 struct nested_call_node tnode;
342 334
343 spin_lock_irqsave(&psw->lock, flags); 335 spin_lock_irqsave(&ncalls->lock, flags);
344 336
345 /* Try to see if the current task is already inside this wakeup call */ 337 /*
338 * Try to see if the current task is already inside this wakeup call.
339 * We use a list here, since the population inside this set is always
340 * very much limited.
341 */
346 list_for_each_entry(tncur, lsthead, llink) { 342 list_for_each_entry(tncur, lsthead, llink) {
347 343 if (tncur->cpu == this_cpu &&
348 if (tncur->wq == wq || 344 (tncur->cookie == cookie || ++call_nests > max_nests)) {
349 (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
350 /* 345 /*
351 * Ops ... loop detected or maximum nest level reached. 346 * Ops ... loop detected or maximum nest level reached.
352 * We abort this wake by breaking the cycle itself. 347 * We abort this wake by breaking the cycle itself.
353 */ 348 */
354 spin_unlock_irqrestore(&psw->lock, flags); 349 error = -1;
355 return; 350 goto out_unlock;
356 } 351 }
357 } 352 }
358 353
359 /* Add the current task to the list */ 354 /* Add the current task and cookie to the list */
360 tnode.task = this_task; 355 tnode.cpu = this_cpu;
361 tnode.wq = wq; 356 tnode.cookie = cookie;
362 list_add(&tnode.llink, lsthead); 357 list_add(&tnode.llink, lsthead);
363 358
364 spin_unlock_irqrestore(&psw->lock, flags); 359 spin_unlock_irqrestore(&ncalls->lock, flags);
365 360
366 /* Do really wake up now */ 361 /* Call the nested function */
367 wake_up_nested(wq, 1 + wake_nests); 362 error = (*nproc)(priv, cookie, call_nests);
368 363
369 /* Remove the current task from the list */ 364 /* Remove the current task from the list */
370 spin_lock_irqsave(&psw->lock, flags); 365 spin_lock_irqsave(&ncalls->lock, flags);
371 list_del(&tnode.llink); 366 list_del(&tnode.llink);
372 spin_unlock_irqrestore(&psw->lock, flags); 367 out_unlock:
368 spin_unlock_irqrestore(&ncalls->lock, flags);
369
370 put_cpu();
371 return error;
372}
373
374#ifdef CONFIG_DEBUG_LOCK_ALLOC
375static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
376 unsigned long events, int subclass)
377{
378 unsigned long flags;
379
380 spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
381 wake_up_locked_poll(wqueue, events);
382 spin_unlock_irqrestore(&wqueue->lock, flags);
383}
384#else
385static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
386 unsigned long events, int subclass)
387{
388 wake_up_poll(wqueue, events);
389}
390#endif
391
392static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
393{
394 ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
395 1 + call_nests);
396 return 0;
397}
398
399/*
400 * Perform a safe wake up of the poll wait list. The problem is that
401 * with the new callback'd wake up system, it is possible that the
402 * poll callback is reentered from inside the call to wake_up() done
403 * on the poll wait queue head. The rule is that we cannot reenter the
404 * wake up code from the same task more than EP_MAX_NESTS times,
 405 * and we cannot reenter the same wait queue head at all. This allows
 406 * a hierarchy of epoll file descriptors of no more than
407 * EP_MAX_NESTS deep.
408 */
409static void ep_poll_safewake(wait_queue_head_t *wq)
410{
411 ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
412 ep_poll_wakeup_proc, NULL, wq);
373} 413}
374 414
375/* 415/*
376 * This function unregister poll callbacks from the associated file descriptor. 416 * This function unregisters poll callbacks from the associated file
377 * Since this must be called without holding "ep->lock" the atomic exchange trick 417 * descriptor. Must be called with "mtx" held (or "epmutex" if called from
378 * will protect us from multiple unregister. 418 * ep_free).
379 */ 419 */
380static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) 420static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
381{ 421{
382 int nwait;
383 struct list_head *lsthead = &epi->pwqlist; 422 struct list_head *lsthead = &epi->pwqlist;
384 struct eppoll_entry *pwq; 423 struct eppoll_entry *pwq;
385 424
386 /* This is called without locks, so we need the atomic exchange */ 425 while (!list_empty(lsthead)) {
387 nwait = xchg(&epi->nwait, 0); 426 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
388 427
389 if (nwait) { 428 list_del(&pwq->llink);
390 while (!list_empty(lsthead)) { 429 remove_wait_queue(pwq->whead, &pwq->wait);
391 pwq = list_first_entry(lsthead, struct eppoll_entry, llink); 430 kmem_cache_free(pwq_cache, pwq);
431 }
432}
392 433
393 list_del_init(&pwq->llink); 434/**
394 remove_wait_queue(pwq->whead, &pwq->wait); 435 * ep_scan_ready_list - Scans the ready list in a way that makes possible for
395 kmem_cache_free(pwq_cache, pwq); 436 * the scan code, to call f_op->poll(). Also allows for
396 } 437 * O(NumReady) performance.
438 *
439 * @ep: Pointer to the epoll private data structure.
440 * @sproc: Pointer to the scan callback.
441 * @priv: Private opaque data passed to the @sproc callback.
442 *
443 * Returns: The same integer error code returned by the @sproc callback.
444 */
445static int ep_scan_ready_list(struct eventpoll *ep,
446 int (*sproc)(struct eventpoll *,
447 struct list_head *, void *),
448 void *priv)
449{
450 int error, pwake = 0;
451 unsigned long flags;
452 struct epitem *epi, *nepi;
453 LIST_HEAD(txlist);
454
455 /*
456 * We need to lock this because we could be hit by
457 * eventpoll_release_file() and epoll_ctl().
458 */
459 mutex_lock(&ep->mtx);
460
461 /*
462 * Steal the ready list, and re-init the original one to the
463 * empty list. Also, set ep->ovflist to NULL so that events
464 * happening while looping w/out locks, are not lost. We cannot
465 * have the poll callback to queue directly on ep->rdllist,
466 * because we want the "sproc" callback to be able to do it
467 * in a lockless way.
468 */
469 spin_lock_irqsave(&ep->lock, flags);
470 list_splice_init(&ep->rdllist, &txlist);
471 ep->ovflist = NULL;
472 spin_unlock_irqrestore(&ep->lock, flags);
473
474 /*
475 * Now call the callback function.
476 */
477 error = (*sproc)(ep, &txlist, priv);
478
479 spin_lock_irqsave(&ep->lock, flags);
480 /*
481 * During the time we spent inside the "sproc" callback, some
482 * other events might have been queued by the poll callback.
483 * We re-insert them inside the main ready-list here.
484 */
485 for (nepi = ep->ovflist; (epi = nepi) != NULL;
486 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
487 /*
488 * We need to check if the item is already in the list.
489 * During the "sproc" callback execution time, items are
490 * queued into ->ovflist but the "txlist" might already
491 * contain them, and the list_splice() below takes care of them.
492 */
493 if (!ep_is_linked(&epi->rdllink))
494 list_add_tail(&epi->rdllink, &ep->rdllist);
495 }
496 /*
497 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
498 * releasing the lock, events will be queued in the normal way inside
499 * ep->rdllist.
500 */
501 ep->ovflist = EP_UNACTIVE_PTR;
502
503 /*
504 * Quickly re-inject items left on "txlist".
505 */
506 list_splice(&txlist, &ep->rdllist);
507
508 if (!list_empty(&ep->rdllist)) {
509 /*
510 * Wake up (if active) both the eventpoll wait list and
511 * the ->poll() wait list (delayed after we release the lock).
512 */
513 if (waitqueue_active(&ep->wq))
514 wake_up_locked(&ep->wq);
515 if (waitqueue_active(&ep->poll_wait))
516 pwake++;
397 } 517 }
518 spin_unlock_irqrestore(&ep->lock, flags);
519
520 mutex_unlock(&ep->mtx);
521
522 /* We have to call this outside the lock */
523 if (pwake)
524 ep_poll_safewake(&ep->poll_wait);
525
526 return error;
398} 527}
399 528
400/* 529/*
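Note: ep_call_nested() above is the generic guard that replaces the old wake_task_node bookkeeping; ep_poll_safewake() routes through it here, and ep_eventpoll_poll() does the same later in this patch via poll_readywalk_ncalls. The sketch below only illustrates the calling pattern used inside this file: my_ncalls, my_proc and my_safe_op are invented names, and the collector would need a one-time ep_nested_calls_init(), as eventpoll_init() does at the end of the patch.

/* Illustrative sketch of the ep_call_nested() calling pattern (not part of the patch) */
static struct nested_calls my_ncalls;	/* initialized once with ep_nested_calls_init() */

static int my_proc(void *priv, void *cookie, int call_nests)
{
	/* call_nests is the current same-CPU nesting depth (0 for the outermost call) */
	return 0;
}

static void my_safe_op(void *object)
{
	/*
	 * "object" doubles as the cookie: the same cookie is never re-entered
	 * on one CPU, and same-CPU nesting is capped at EP_MAX_NESTS.  A -1
	 * return means the call was skipped for one of those reasons.
	 */
	ep_call_nested(&my_ncalls, EP_MAX_NESTS, my_proc, NULL, object);
}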
@@ -434,9 +563,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
434 563
435 atomic_dec(&ep->user->epoll_watches); 564 atomic_dec(&ep->user->epoll_watches);
436 565
437 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
438 current, ep, file));
439
440 return 0; 566 return 0;
441} 567}
442 568
@@ -447,7 +573,7 @@ static void ep_free(struct eventpoll *ep)
447 573
448 /* We need to release all tasks waiting for these file */ 574 /* We need to release all tasks waiting for these file */
449 if (waitqueue_active(&ep->poll_wait)) 575 if (waitqueue_active(&ep->poll_wait))
450 ep_poll_safewake(&psw, &ep->poll_wait); 576 ep_poll_safewake(&ep->poll_wait);
451 577
452 /* 578 /*
453 * We need to lock this because we could be hit by 579 * We need to lock this because we could be hit by
@@ -492,26 +618,54 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
492 if (ep) 618 if (ep)
493 ep_free(ep); 619 ep_free(ep);
494 620
495 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
496 return 0; 621 return 0;
497} 622}
498 623
624static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
625 void *priv)
626{
627 struct epitem *epi, *tmp;
628
629 list_for_each_entry_safe(epi, tmp, head, rdllink) {
630 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
631 epi->event.events)
632 return POLLIN | POLLRDNORM;
633 else {
634 /*
635 * Item has been dropped into the ready list by the poll
636 * callback, but it's not actually ready, as far as
637 * caller requested events goes. We can remove it here.
638 */
639 list_del_init(&epi->rdllink);
640 }
641 }
642
643 return 0;
644}
645
646static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
647{
648 return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
649}
650
499static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 651static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
500{ 652{
501 unsigned int pollflags = 0; 653 int pollflags;
502 unsigned long flags;
503 struct eventpoll *ep = file->private_data; 654 struct eventpoll *ep = file->private_data;
504 655
505 /* Insert inside our poll wait queue */ 656 /* Insert inside our poll wait queue */
506 poll_wait(file, &ep->poll_wait, wait); 657 poll_wait(file, &ep->poll_wait, wait);
507 658
508 /* Check our condition */ 659 /*
509 spin_lock_irqsave(&ep->lock, flags); 660 * Proceed to find out if wanted events are really available inside
 510 if (!list_empty(&ep->rdllist)) 661 * the ready list. This needs to be done under ep_call_nested()
511 pollflags = POLLIN | POLLRDNORM; 662 * supervision, since the call to f_op->poll() done on listed files
512 spin_unlock_irqrestore(&ep->lock, flags); 663 * could re-enter here.
664 */
665 pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
666 ep_poll_readyevents_proc, ep, ep);
513 667
514 return pollflags; 668 return pollflags != -1 ? pollflags : 0;
515} 669}
516 670
517/* File callbacks that implement the eventpoll file behaviour */ 671/* File callbacks that implement the eventpoll file behaviour */
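Note: the re-entry that the ep_call_nested() wrapper in ep_eventpoll_poll() above guards against comes from epoll sets watching other epoll sets, where f_op->poll() on the outer set ends up back in this function for the inner one. A minimal user-space illustration of that situation (hypothetical snippet, not from this patch; error handling omitted):

#include <sys/epoll.h>

int main(void)
{
	/*
	 * epfd_inner is watched by epfd_outer, so polling epfd_outer
	 * re-enters ep_eventpoll_poll() for epfd_inner -- the recursion
	 * that EP_MAX_NESTS bounds in the hunk above.
	 */
	int epfd_outer = epoll_create1(0);
	int epfd_inner = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN };

	ev.data.fd = epfd_inner;
	epoll_ctl(epfd_outer, EPOLL_CTL_ADD, epfd_inner, &ev);

	/* epoll_wait(epfd_outer, ...) now reports epfd_inner as readable
	 * only when epfd_inner itself has ready events */
	return 0;
}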
@@ -541,7 +695,7 @@ void eventpoll_release_file(struct file *file)
541 * We don't want to get "file->f_lock" because it is not 695 * We don't want to get "file->f_lock" because it is not
542 * necessary. It is not necessary because we're in the "struct file" 696 * necessary. It is not necessary because we're in the "struct file"
543 * cleanup path, and this means that noone is using this file anymore. 697 * cleanup path, and this means that noone is using this file anymore.
544 * So, for example, epoll_ctl() cannot hit here sicne if we reach this 698 * So, for example, epoll_ctl() cannot hit here since if we reach this
545 * point, the file counter already went to zero and fget() would fail. 699 * point, the file counter already went to zero and fget() would fail.
546 * The only hit might come from ep_free() but by holding the mutex 700 * The only hit might come from ep_free() but by holding the mutex
547 * will correctly serialize the operation. We do need to acquire 701 * will correctly serialize the operation. We do need to acquire
@@ -588,8 +742,6 @@ static int ep_alloc(struct eventpoll **pep)
588 742
589 *pep = ep; 743 *pep = ep;
590 744
591 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
592 current, ep));
593 return 0; 745 return 0;
594 746
595free_uid: 747free_uid:
@@ -623,9 +775,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
623 } 775 }
624 } 776 }
625 777
626 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
627 current, file, epir));
628
629 return epir; 778 return epir;
630} 779}
631 780
@@ -641,9 +790,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
641 struct epitem *epi = ep_item_from_wait(wait); 790 struct epitem *epi = ep_item_from_wait(wait);
642 struct eventpoll *ep = epi->ep; 791 struct eventpoll *ep = epi->ep;
643 792
644 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
645 current, epi->ffd.file, epi, ep));
646
647 spin_lock_irqsave(&ep->lock, flags); 793 spin_lock_irqsave(&ep->lock, flags);
648 794
649 /* 795 /*
@@ -656,6 +802,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
656 goto out_unlock; 802 goto out_unlock;
657 803
658 /* 804 /*
805 * Check the events coming with the callback. At this stage, not
806 * every device reports the events in the "key" parameter of the
807 * callback. We need to be able to handle both cases here, hence the
808 * test for "key" != NULL before the event match test.
809 */
810 if (key && !((unsigned long) key & epi->event.events))
811 goto out_unlock;
812
813 /*
659 * If we are trasfering events to userspace, we can hold no locks 814 * If we are trasfering events to userspace, we can hold no locks
660 * (because we're accessing user memory, and because of linux f_op->poll() 815 * (because we're accessing user memory, and because of linux f_op->poll()
661 * semantics). All the events that happens during that period of time are 816 * semantics). All the events that happens during that period of time are
@@ -670,12 +825,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
670 } 825 }
671 826
672 /* If this file is already in the ready list we exit soon */ 827 /* If this file is already in the ready list we exit soon */
673 if (ep_is_linked(&epi->rdllink)) 828 if (!ep_is_linked(&epi->rdllink))
674 goto is_linked; 829 list_add_tail(&epi->rdllink, &ep->rdllist);
675
676 list_add_tail(&epi->rdllink, &ep->rdllist);
677 830
678is_linked:
679 /* 831 /*
680 * Wake up ( if active ) both the eventpoll wait list and the ->poll() 832 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
681 * wait list. 833 * wait list.
@@ -690,7 +842,7 @@ out_unlock:
690 842
691 /* We have to call this outside the lock */ 843 /* We have to call this outside the lock */
692 if (pwake) 844 if (pwake)
693 ep_poll_safewake(&psw, &ep->poll_wait); 845 ep_poll_safewake(&ep->poll_wait);
694 846
695 return 1; 847 return 1;
696} 848}
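Note: the "key" tested earlier in ep_poll_callback() is the value a waker hands to __wake_up(); wake_up_poll() and wake_up_locked_poll() (used by ep_wake_up_nested() above) forward the ready event mask that way. A hedged sketch of the producer side; "mydev" is an invented example device, not anything in this patch:

#include <linux/wait.h>
#include <linux/poll.h>

struct mydev {				/* illustrative only */
	wait_queue_head_t read_wait;
};

static void mydev_data_arrived(struct mydev *dev)
{
	/*
	 * wake_up_poll() passes the ready mask as the wait queue "key", so
	 * the (key & epi->event.events) test in ep_poll_callback() skips
	 * epoll watchers that did not ask for input events.
	 */
	wake_up_poll(&dev->read_wait, POLLIN | POLLRDNORM);
}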
@@ -817,10 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
817 969
818 /* We have to call this outside the lock */ 970 /* We have to call this outside the lock */
819 if (pwake) 971 if (pwake)
820 ep_poll_safewake(&psw, &ep->poll_wait); 972 ep_poll_safewake(&ep->poll_wait);
821
822 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
823 current, ep, tfile, fd));
824 973
825 return 0; 974 return 0;
826 975
@@ -851,15 +1000,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
851{ 1000{
852 int pwake = 0; 1001 int pwake = 0;
853 unsigned int revents; 1002 unsigned int revents;
854 unsigned long flags;
855 1003
856 /* 1004 /*
857 * Set the new event interest mask before calling f_op->poll(), otherwise 1005 * Set the new event interest mask before calling f_op->poll();
858 * a potential race might occur. In fact if we do this operation inside 1006 * otherwise we might miss an event that happens between the
859 * the lock, an event might happen between the f_op->poll() call and the 1007 * f_op->poll() call and the new event set registering.
860 * new event set registering.
861 */ 1008 */
862 epi->event.events = event->events; 1009 epi->event.events = event->events;
1010 epi->event.data = event->data; /* protected by mtx */
863 1011
864 /* 1012 /*
865 * Get current event bits. We can safely use the file* here because 1013 * Get current event bits. We can safely use the file* here because
@@ -867,16 +1015,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
867 */ 1015 */
868 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); 1016 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
869 1017
870 spin_lock_irqsave(&ep->lock, flags);
871
872 /* Copy the data member from inside the lock */
873 epi->event.data = event->data;
874
875 /* 1018 /*
876 * If the item is "hot" and it is not registered inside the ready 1019 * If the item is "hot" and it is not registered inside the ready
877 * list, push it inside. 1020 * list, push it inside.
878 */ 1021 */
879 if (revents & event->events) { 1022 if (revents & event->events) {
1023 spin_lock_irq(&ep->lock);
880 if (!ep_is_linked(&epi->rdllink)) { 1024 if (!ep_is_linked(&epi->rdllink)) {
881 list_add_tail(&epi->rdllink, &ep->rdllist); 1025 list_add_tail(&epi->rdllink, &ep->rdllist);
882 1026
@@ -886,142 +1030,84 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
886 if (waitqueue_active(&ep->poll_wait)) 1030 if (waitqueue_active(&ep->poll_wait))
887 pwake++; 1031 pwake++;
888 } 1032 }
1033 spin_unlock_irq(&ep->lock);
889 } 1034 }
890 spin_unlock_irqrestore(&ep->lock, flags);
891 1035
892 /* We have to call this outside the lock */ 1036 /* We have to call this outside the lock */
893 if (pwake) 1037 if (pwake)
894 ep_poll_safewake(&psw, &ep->poll_wait); 1038 ep_poll_safewake(&ep->poll_wait);
895 1039
896 return 0; 1040 return 0;
897} 1041}
898 1042
899static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, 1043static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
900 int maxevents) 1044 void *priv)
901{ 1045{
902 int eventcnt, error = -EFAULT, pwake = 0; 1046 struct ep_send_events_data *esed = priv;
1047 int eventcnt;
903 unsigned int revents; 1048 unsigned int revents;
904 unsigned long flags; 1049 struct epitem *epi;
905 struct epitem *epi, *nepi; 1050 struct epoll_event __user *uevent;
906 struct list_head txlist;
907
908 INIT_LIST_HEAD(&txlist);
909
910 /*
911 * We need to lock this because we could be hit by
912 * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
913 */
914 mutex_lock(&ep->mtx);
915
916 /*
917 * Steal the ready list, and re-init the original one to the
918 * empty list. Also, set ep->ovflist to NULL so that events
919 * happening while looping w/out locks, are not lost. We cannot
920 * have the poll callback to queue directly on ep->rdllist,
921 * because we are doing it in the loop below, in a lockless way.
922 */
923 spin_lock_irqsave(&ep->lock, flags);
924 list_splice(&ep->rdllist, &txlist);
925 INIT_LIST_HEAD(&ep->rdllist);
926 ep->ovflist = NULL;
927 spin_unlock_irqrestore(&ep->lock, flags);
928 1051
929 /* 1052 /*
930 * We can loop without lock because this is a task private list. 1053 * We can loop without lock because we are passed a task private list.
931 * We just splice'd out the ep->rdllist in ep_collect_ready_items(). 1054 * Items cannot vanish during the loop because ep_scan_ready_list() is
932 * Items cannot vanish during the loop because we are holding "mtx". 1055 * holding "mtx" during this call.
933 */ 1056 */
934 for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { 1057 for (eventcnt = 0, uevent = esed->events;
935 epi = list_first_entry(&txlist, struct epitem, rdllink); 1058 !list_empty(head) && eventcnt < esed->maxevents;) {
1059 epi = list_first_entry(head, struct epitem, rdllink);
936 1060
937 list_del_init(&epi->rdllink); 1061 list_del_init(&epi->rdllink);
938 1062
939 /* 1063 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
940 * Get the ready file event set. We can safely use the file 1064 epi->event.events;
941 * because we are holding the "mtx" and this will guarantee
942 * that both the file and the item will not vanish.
943 */
944 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
945 revents &= epi->event.events;
946 1065
947 /* 1066 /*
948 * Is the event mask intersect the caller-requested one, 1067 * If the event mask intersect the caller-requested one,
949 * deliver the event to userspace. Again, we are holding 1068 * deliver the event to userspace. Again, ep_scan_ready_list()
950 * "mtx", so no operations coming from userspace can change 1069 * is holding "mtx", so no operations coming from userspace
951 * the item. 1070 * can change the item.
952 */ 1071 */
953 if (revents) { 1072 if (revents) {
954 if (__put_user(revents, 1073 if (__put_user(revents, &uevent->events) ||
955 &events[eventcnt].events) || 1074 __put_user(epi->event.data, &uevent->data)) {
956 __put_user(epi->event.data, 1075 list_add(&epi->rdllink, head);
957 &events[eventcnt].data)) 1076 return eventcnt ? eventcnt : -EFAULT;
958 goto errxit; 1077 }
1078 eventcnt++;
1079 uevent++;
959 if (epi->event.events & EPOLLONESHOT) 1080 if (epi->event.events & EPOLLONESHOT)
960 epi->event.events &= EP_PRIVATE_BITS; 1081 epi->event.events &= EP_PRIVATE_BITS;
961 eventcnt++; 1082 else if (!(epi->event.events & EPOLLET)) {
1083 /*
1084 * If this file has been added with Level
1085 * Trigger mode, we need to insert back inside
1086 * the ready list, so that the next call to
1087 * epoll_wait() will check again the events
 1088 * availability. At this point, no one can insert
1089 * into ep->rdllist besides us. The epoll_ctl()
1090 * callers are locked out by
1091 * ep_scan_ready_list() holding "mtx" and the
1092 * poll callback will queue them in ep->ovflist.
1093 */
1094 list_add_tail(&epi->rdllink, &ep->rdllist);
1095 }
962 } 1096 }
963 /*
964 * At this point, noone can insert into ep->rdllist besides
965 * us. The epoll_ctl() callers are locked out by us holding
966 * "mtx" and the poll callback will queue them in ep->ovflist.
967 */
968 if (!(epi->event.events & EPOLLET) &&
969 (revents & epi->event.events))
970 list_add_tail(&epi->rdllink, &ep->rdllist);
971 }
972 error = 0;
973
974errxit:
975
976 spin_lock_irqsave(&ep->lock, flags);
977 /*
978 * During the time we spent in the loop above, some other events
979 * might have been queued by the poll callback. We re-insert them
980 * inside the main ready-list here.
981 */
982 for (nepi = ep->ovflist; (epi = nepi) != NULL;
983 nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
984 /*
985 * If the above loop quit with errors, the epoll item might still
986 * be linked to "txlist", and the list_splice() done below will
987 * take care of those cases.
988 */
989 if (!ep_is_linked(&epi->rdllink))
990 list_add_tail(&epi->rdllink, &ep->rdllist);
991 } 1097 }
992 /*
993 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
994 * releasing the lock, events will be queued in the normal way inside
995 * ep->rdllist.
996 */
997 ep->ovflist = EP_UNACTIVE_PTR;
998 1098
999 /* 1099 return eventcnt;
1000 * In case of error in the event-send loop, or in case the number of 1100}
1001 * ready events exceeds the userspace limit, we need to splice the
1002 * "txlist" back inside ep->rdllist.
1003 */
1004 list_splice(&txlist, &ep->rdllist);
1005
1006 if (!list_empty(&ep->rdllist)) {
1007 /*
1008 * Wake up (if active) both the eventpoll wait list and the ->poll()
1009 * wait list (delayed after we release the lock).
1010 */
1011 if (waitqueue_active(&ep->wq))
1012 wake_up_locked(&ep->wq);
1013 if (waitqueue_active(&ep->poll_wait))
1014 pwake++;
1015 }
1016 spin_unlock_irqrestore(&ep->lock, flags);
1017 1101
1018 mutex_unlock(&ep->mtx); 1102static int ep_send_events(struct eventpoll *ep,
1103 struct epoll_event __user *events, int maxevents)
1104{
1105 struct ep_send_events_data esed;
1019 1106
1020 /* We have to call this outside the lock */ 1107 esed.maxevents = maxevents;
1021 if (pwake) 1108 esed.events = events;
1022 ep_poll_safewake(&psw, &ep->poll_wait);
1023 1109
1024 return eventcnt == 0 ? error: eventcnt; 1110 return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
1025} 1111}
1026 1112
1027static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, 1113static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1033,7 +1119,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
1033 wait_queue_t wait; 1119 wait_queue_t wait;
1034 1120
1035 /* 1121 /*
1036 * Calculate the timeout by checking for the "infinite" value ( -1 ) 1122 * Calculate the timeout by checking for the "infinite" value (-1)
1037 * and the overflow condition. The passed timeout is in milliseconds, 1123 * and the overflow condition. The passed timeout is in milliseconds,
1038 * that why (t * HZ) / 1000. 1124 * that why (t * HZ) / 1000.
1039 */ 1125 */
@@ -1076,9 +1162,8 @@ retry:
1076 1162
1077 set_current_state(TASK_RUNNING); 1163 set_current_state(TASK_RUNNING);
1078 } 1164 }
1079
1080 /* Is it worth to try to dig for events ? */ 1165 /* Is it worth to try to dig for events ? */
1081 eavail = !list_empty(&ep->rdllist); 1166 eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
1082 1167
1083 spin_unlock_irqrestore(&ep->lock, flags); 1168 spin_unlock_irqrestore(&ep->lock, flags);
1084 1169
@@ -1099,41 +1184,30 @@ retry:
1099 */ 1184 */
1100SYSCALL_DEFINE1(epoll_create1, int, flags) 1185SYSCALL_DEFINE1(epoll_create1, int, flags)
1101{ 1186{
1102 int error, fd = -1; 1187 int error;
1103 struct eventpoll *ep; 1188 struct eventpoll *ep = NULL;
1104 1189
1105 /* Check the EPOLL_* constant for consistency. */ 1190 /* Check the EPOLL_* constant for consistency. */
1106 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1191 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
1107 1192
1108 if (flags & ~EPOLL_CLOEXEC) 1193 if (flags & ~EPOLL_CLOEXEC)
1109 return -EINVAL; 1194 return -EINVAL;
1110
1111 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
1112 current, flags));
1113
1114 /* 1195 /*
1115 * Create the internal data structure ( "struct eventpoll" ). 1196 * Create the internal data structure ("struct eventpoll").
1116 */ 1197 */
1117 error = ep_alloc(&ep); 1198 error = ep_alloc(&ep);
1118 if (error < 0) { 1199 if (error < 0)
1119 fd = error; 1200 return error;
1120 goto error_return;
1121 }
1122
1123 /* 1201 /*
1124 * Creates all the items needed to setup an eventpoll file. That is, 1202 * Creates all the items needed to setup an eventpoll file. That is,
1125 * a file structure and a free file descriptor. 1203 * a file structure and a free file descriptor.
1126 */ 1204 */
1127 fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, 1205 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1128 flags & O_CLOEXEC); 1206 flags & O_CLOEXEC);
1129 if (fd < 0) 1207 if (error < 0)
1130 ep_free(ep); 1208 ep_free(ep);
1131 1209
1132error_return: 1210 return error;
1133 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
1134 current, flags, fd));
1135
1136 return fd;
1137} 1211}
1138 1212
1139SYSCALL_DEFINE1(epoll_create, int, size) 1213SYSCALL_DEFINE1(epoll_create, int, size)
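Note: with the error path simplified in the hunk above, epoll_create1() either returns a usable descriptor or a negative errno from ep_alloc()/anon_inode_getfd(), and epoll_create() keeps its old size-hint interface on top of it. A minimal user-space usage example (hedged; error handling trimmed):

#include <sys/epoll.h>

int main(void)
{
	/* EPOLL_CLOEXEC maps to O_CLOEXEC, as the BUILD_BUG_ON above checks */
	int epfd = epoll_create1(EPOLL_CLOEXEC);

	if (epfd < 0)
		return 1;
	/* ... epoll_ctl() / epoll_wait() as usual ... */
	return 0;
}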
@@ -1158,9 +1232,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1158 struct epitem *epi; 1232 struct epitem *epi;
1159 struct epoll_event epds; 1233 struct epoll_event epds;
1160 1234
1161 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
1162 current, epfd, op, fd, event));
1163
1164 error = -EFAULT; 1235 error = -EFAULT;
1165 if (ep_op_has_event(op) && 1236 if (ep_op_has_event(op) &&
1166 copy_from_user(&epds, event, sizeof(struct epoll_event))) 1237 copy_from_user(&epds, event, sizeof(struct epoll_event)))
@@ -1211,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1211 case EPOLL_CTL_ADD: 1282 case EPOLL_CTL_ADD:
1212 if (!epi) { 1283 if (!epi) {
1213 epds.events |= POLLERR | POLLHUP; 1284 epds.events |= POLLERR | POLLHUP;
1214
1215 error = ep_insert(ep, &epds, tfile, fd); 1285 error = ep_insert(ep, &epds, tfile, fd);
1216 } else 1286 } else
1217 error = -EEXIST; 1287 error = -EEXIST;
@@ -1237,8 +1307,6 @@ error_tgt_fput:
1237error_fput: 1307error_fput:
1238 fput(file); 1308 fput(file);
1239error_return: 1309error_return:
1240 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
1241 current, epfd, op, fd, event, error));
1242 1310
1243 return error; 1311 return error;
1244} 1312}
@@ -1254,9 +1322,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1254 struct file *file; 1322 struct file *file;
1255 struct eventpoll *ep; 1323 struct eventpoll *ep;
1256 1324
1257 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
1258 current, epfd, events, maxevents, timeout));
1259
1260 /* The maximum number of event must be greater than zero */ 1325 /* The maximum number of event must be greater than zero */
1261 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) 1326 if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
1262 return -EINVAL; 1327 return -EINVAL;
@@ -1293,8 +1358,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1293error_fput: 1358error_fput:
1294 fput(file); 1359 fput(file);
1295error_return: 1360error_return:
1296 DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
1297 current, epfd, events, maxevents, timeout, error));
1298 1361
1299 return error; 1362 return error;
1300} 1363}
@@ -1359,17 +1422,18 @@ static int __init eventpoll_init(void)
1359 EP_ITEM_COST; 1422 EP_ITEM_COST;
1360 1423
1361 /* Initialize the structure used to perform safe poll wait head wake ups */ 1424 /* Initialize the structure used to perform safe poll wait head wake ups */
1362 ep_poll_safewake_init(&psw); 1425 ep_nested_calls_init(&poll_safewake_ncalls);
1426
1427 /* Initialize the structure used to perform file's f_op->poll() calls */
1428 ep_nested_calls_init(&poll_readywalk_ncalls);
1363 1429
1364 /* Allocates slab cache used to allocate "struct epitem" items */ 1430 /* Allocates slab cache used to allocate "struct epitem" items */
1365 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), 1431 epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
1366 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, 1432 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
1367 NULL);
1368 1433
1369 /* Allocates slab cache used to allocate "struct eppoll_entry" */ 1434 /* Allocates slab cache used to allocate "struct eppoll_entry" */
1370 pwq_cache = kmem_cache_create("eventpoll_pwq", 1435 pwq_cache = kmem_cache_create("eventpoll_pwq",
1371 sizeof(struct eppoll_entry), 0, 1436 sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
1372 EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
1373 1437
1374 return 0; 1438 return 0;
1375} 1439}