diff options
-rw-r--r-- | fs/eventpoll.c | 511 |
1 files changed, 304 insertions, 207 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index c5c424f23fd5..8a23a91e1377 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c | |||
@@ -1,6 +1,6 @@ | |||
1 | /* | 1 | /* |
2 | * fs/eventpoll.c (Efficent event polling implementation) | 2 | * fs/eventpoll.c (Efficient event retrieval implementation) |
3 | * Copyright (C) 2001,...,2007 Davide Libenzi | 3 | * Copyright (C) 2001,...,2009 Davide Libenzi |
4 | * | 4 | * |
5 | * This program is free software; you can redistribute it and/or modify | 5 | * This program is free software; you can redistribute it and/or modify |
6 | * it under the terms of the GNU General Public License as published by | 6 | * it under the terms of the GNU General Public License as published by |
@@ -92,8 +92,8 @@ | |||
92 | /* Epoll private bits inside the event mask */ | 92 | /* Epoll private bits inside the event mask */ |
93 | #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) | 93 | #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) |
94 | 94 | ||
95 | /* Maximum number of poll wake up nests we are allowing */ | 95 | /* Maximum number of nesting allowed inside epoll sets */ |
96 | #define EP_MAX_POLLWAKE_NESTS 4 | 96 | #define EP_MAX_NESTS 4 |
97 | 97 | ||
98 | /* Maximum msec timeout value storeable in a long int */ | 98 | /* Maximum msec timeout value storeable in a long int */ |
99 | #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) | 99 | #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ) |
@@ -110,24 +110,21 @@ struct epoll_filefd { | |||
110 | }; | 110 | }; |
111 | 111 | ||
112 | /* | 112 | /* |
113 | * Node that is linked into the "wake_task_list" member of the "struct poll_safewake". | 113 | * Structure used to track possible nested calls, for too deep recursions |
114 | * It is used to keep track on all tasks that are currently inside the wake_up() code | 114 | * and loop cycles. |
115 | * to 1) short-circuit the one coming from the same task and same wait queue head | ||
116 | * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting | ||
117 | * 3) let go the ones coming from other tasks. | ||
118 | */ | 115 | */ |
119 | struct wake_task_node { | 116 | struct nested_call_node { |
120 | struct list_head llink; | 117 | struct list_head llink; |
121 | struct task_struct *task; | 118 | struct task_struct *task; |
122 | wait_queue_head_t *wq; | 119 | void *cookie; |
123 | }; | 120 | }; |
124 | 121 | ||
125 | /* | 122 | /* |
126 | * This is used to implement the safe poll wake up avoiding to reenter | 123 | * This structure is used as collector for nested calls, to check for |
127 | * the poll callback from inside wake_up(). | 124 | * maximum recursion depth and loop cycles. |
128 | */ | 125 | */ |
129 | struct poll_safewake { | 126 | struct nested_calls { |
130 | struct list_head wake_task_list; | 127 | struct list_head tasks_call_list; |
131 | spinlock_t lock; | 128 | spinlock_t lock; |
132 | }; | 129 | }; |
133 | 130 | ||
@@ -231,6 +228,12 @@ struct ep_pqueue { | |||
231 | struct epitem *epi; | 228 | struct epitem *epi; |
232 | }; | 229 | }; |
233 | 230 | ||
231 | /* Used by the ep_send_events() function as callback private data */ | ||
232 | struct ep_send_events_data { | ||
233 | int maxevents; | ||
234 | struct epoll_event __user *events; | ||
235 | }; | ||
236 | |||
234 | /* | 237 | /* |
235 | * Configuration options available inside /proc/sys/fs/epoll/ | 238 | * Configuration options available inside /proc/sys/fs/epoll/ |
236 | */ | 239 | */ |
@@ -242,8 +245,11 @@ static int max_user_watches __read_mostly; | |||
242 | */ | 245 | */ |
243 | static DEFINE_MUTEX(epmutex); | 246 | static DEFINE_MUTEX(epmutex); |
244 | 247 | ||
245 | /* Safe wake up implementation */ | 248 | /* Used for safe wake up implementation */ |
246 | static struct poll_safewake psw; | 249 | static struct nested_calls poll_safewake_ncalls; |
250 | |||
251 | /* Used to call file's f_op->poll() under the nested calls boundaries */ | ||
252 | static struct nested_calls poll_readywalk_ncalls; | ||
247 | 253 | ||
248 | /* Slab cache used to allocate "struct epitem" */ | 254 | /* Slab cache used to allocate "struct epitem" */ |
249 | static struct kmem_cache *epi_cache __read_mostly; | 255 | static struct kmem_cache *epi_cache __read_mostly; |
@@ -312,64 +318,96 @@ static inline int ep_op_has_event(int op) | |||
312 | } | 318 | } |
313 | 319 | ||
314 | /* Initialize the poll safe wake up structure */ | 320 | /* Initialize the poll safe wake up structure */ |
315 | static void ep_poll_safewake_init(struct poll_safewake *psw) | 321 | static void ep_nested_calls_init(struct nested_calls *ncalls) |
316 | { | 322 | { |
317 | 323 | INIT_LIST_HEAD(&ncalls->tasks_call_list); | |
318 | INIT_LIST_HEAD(&psw->wake_task_list); | 324 | spin_lock_init(&ncalls->lock); |
319 | spin_lock_init(&psw->lock); | ||
320 | } | 325 | } |
321 | 326 | ||
322 | /* | 327 | /** |
323 | * Perform a safe wake up of the poll wait list. The problem is that | 328 | * ep_call_nested - Perform a bound (possibly) nested call, by checking |
324 | * with the new callback'd wake up system, it is possible that the | 329 | * that the recursion limit is not exceeded, and that |
325 | * poll callback is reentered from inside the call to wake_up() done | 330 | * the same nested call (by the meaning of same cookie) is |
326 | * on the poll wait queue head. The rule is that we cannot reenter the | 331 | * not re-entered. |
327 | * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times, | 332 | * |
328 | * and we cannot reenter the same wait queue head at all. This will | 333 | * @ncalls: Pointer to the nested_calls structure to be used for this call. |
329 | * enable to have a hierarchy of epoll file descriptor of no more than | 334 | * @max_nests: Maximum number of allowed nesting calls. |
330 | * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock | 335 | * @nproc: Nested call core function pointer. |
331 | * because this one gets called by the poll callback, that in turn is called | 336 | * @priv: Opaque data to be passed to the @nproc callback. |
332 | * from inside a wake_up(), that might be called from irq context. | 337 | * @cookie: Cookie to be used to identify this nested call. |
338 | * | ||
339 | * Returns: Returns the code returned by the @nproc callback, or -1 if | ||
340 | * the maximum recursion limit has been exceeded. | ||
333 | */ | 341 | */ |
334 | static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq) | 342 | static int ep_call_nested(struct nested_calls *ncalls, int max_nests, |
343 | int (*nproc)(void *, void *, int), void *priv, | ||
344 | void *cookie) | ||
335 | { | 345 | { |
336 | int wake_nests = 0; | 346 | int error, call_nests = 0; |
337 | unsigned long flags; | 347 | unsigned long flags; |
338 | struct task_struct *this_task = current; | 348 | struct task_struct *this_task = current; |
339 | struct list_head *lsthead = &psw->wake_task_list; | 349 | struct list_head *lsthead = &ncalls->tasks_call_list; |
340 | struct wake_task_node *tncur; | 350 | struct nested_call_node *tncur; |
341 | struct wake_task_node tnode; | 351 | struct nested_call_node tnode; |
342 | 352 | ||
343 | spin_lock_irqsave(&psw->lock, flags); | 353 | spin_lock_irqsave(&ncalls->lock, flags); |
344 | 354 | ||
345 | /* Try to see if the current task is already inside this wakeup call */ | 355 | /* |
356 | * Try to see if the current task is already inside this wakeup call. | ||
357 | * We use a list here, since the population inside this set is always | ||
358 | * very much limited. | ||
359 | */ | ||
346 | list_for_each_entry(tncur, lsthead, llink) { | 360 | list_for_each_entry(tncur, lsthead, llink) { |
347 | 361 | if (tncur->task == this_task && | |
348 | if (tncur->wq == wq || | 362 | (tncur->cookie == cookie || ++call_nests > max_nests)) { |
349 | (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) { | ||
350 | /* | 363 | /* |
351 | * Ops ... loop detected or maximum nest level reached. | 364 | * Ops ... loop detected or maximum nest level reached. |
352 | * We abort this wake by breaking the cycle itself. | 365 | * We abort this wake by breaking the cycle itself. |
353 | */ | 366 | */ |
354 | spin_unlock_irqrestore(&psw->lock, flags); | 367 | spin_unlock_irqrestore(&ncalls->lock, flags); |
355 | return; | 368 | |
369 | return -1; | ||
356 | } | 370 | } |
357 | } | 371 | } |
358 | 372 | ||
359 | /* Add the current task to the list */ | 373 | /* Add the current task and cookie to the list */ |
360 | tnode.task = this_task; | 374 | tnode.task = this_task; |
361 | tnode.wq = wq; | 375 | tnode.cookie = cookie; |
362 | list_add(&tnode.llink, lsthead); | 376 | list_add(&tnode.llink, lsthead); |
363 | 377 | ||
364 | spin_unlock_irqrestore(&psw->lock, flags); | 378 | spin_unlock_irqrestore(&ncalls->lock, flags); |
365 | 379 | ||
366 | /* Do really wake up now */ | 380 | /* Call the nested function */ |
367 | wake_up_nested(wq, 1 + wake_nests); | 381 | error = (*nproc)(priv, cookie, call_nests); |
368 | 382 | ||
369 | /* Remove the current task from the list */ | 383 | /* Remove the current task from the list */ |
370 | spin_lock_irqsave(&psw->lock, flags); | 384 | spin_lock_irqsave(&ncalls->lock, flags); |
371 | list_del(&tnode.llink); | 385 | list_del(&tnode.llink); |
372 | spin_unlock_irqrestore(&psw->lock, flags); | 386 | spin_unlock_irqrestore(&ncalls->lock, flags); |
387 | |||
388 | return error; | ||
389 | } | ||
390 | |||
391 | static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) | ||
392 | { | ||
393 | wake_up_nested((wait_queue_head_t *) cookie, 1 + call_nests); | ||
394 | return 0; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * Perform a safe wake up of the poll wait list. The problem is that | ||
399 | * with the new callback'd wake up system, it is possible that the | ||
400 | * poll callback is reentered from inside the call to wake_up() done | ||
401 | * on the poll wait queue head. The rule is that we cannot reenter the | ||
402 | * wake up code from the same task more than EP_MAX_NESTS times, | ||
403 | * and we cannot reenter the same wait queue head at all. This will | ||
404 | * enable to have a hierarchy of epoll file descriptor of no more than | ||
405 | * EP_MAX_NESTS deep. | ||
406 | */ | ||
407 | static void ep_poll_safewake(wait_queue_head_t *wq) | ||
408 | { | ||
409 | ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, | ||
410 | ep_poll_wakeup_proc, NULL, wq); | ||
373 | } | 411 | } |
374 | 412 | ||
375 | /* | 413 | /* |
@@ -397,6 +435,104 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) | |||
397 | } | 435 | } |
398 | } | 436 | } |
399 | 437 | ||
438 | /** | ||
439 | * ep_scan_ready_list - Scans the ready list in a way that makes possible for | ||
440 | * the scan code, to call f_op->poll(). Also allows for | ||
441 | * O(NumReady) performance. | ||
442 | * | ||
443 | * @ep: Pointer to the epoll private data structure. | ||
444 | * @sproc: Pointer to the scan callback. | ||
445 | * @priv: Private opaque data passed to the @sproc callback. | ||
446 | * | ||
447 | * Returns: The same integer error code returned by the @sproc callback. | ||
448 | */ | ||
449 | static int ep_scan_ready_list(struct eventpoll *ep, | ||
450 | int (*sproc)(struct eventpoll *, | ||
451 | struct list_head *, void *), | ||
452 | void *priv) | ||
453 | { | ||
454 | int error, pwake = 0; | ||
455 | unsigned long flags; | ||
456 | struct epitem *epi, *nepi; | ||
457 | struct list_head txlist; | ||
458 | |||
459 | INIT_LIST_HEAD(&txlist); | ||
460 | |||
461 | /* | ||
462 | * We need to lock this because we could be hit by | ||
463 | * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). | ||
464 | */ | ||
465 | mutex_lock(&ep->mtx); | ||
466 | |||
467 | /* | ||
468 | * Steal the ready list, and re-init the original one to the | ||
469 | * empty list. Also, set ep->ovflist to NULL so that events | ||
470 | * happening while looping w/out locks, are not lost. We cannot | ||
471 | * have the poll callback to queue directly on ep->rdllist, | ||
472 | * because we want the "sproc" callback to be able to do it | ||
473 | * in a lockless way. | ||
474 | */ | ||
475 | spin_lock_irqsave(&ep->lock, flags); | ||
476 | list_splice(&ep->rdllist, &txlist); | ||
477 | INIT_LIST_HEAD(&ep->rdllist); | ||
478 | ep->ovflist = NULL; | ||
479 | spin_unlock_irqrestore(&ep->lock, flags); | ||
480 | |||
481 | /* | ||
482 | * Now call the callback function. | ||
483 | */ | ||
484 | error = (*sproc)(ep, &txlist, priv); | ||
485 | |||
486 | spin_lock_irqsave(&ep->lock, flags); | ||
487 | /* | ||
488 | * During the time we spent inside the "sproc" callback, some | ||
489 | * other events might have been queued by the poll callback. | ||
490 | * We re-insert them inside the main ready-list here. | ||
491 | */ | ||
492 | for (nepi = ep->ovflist; (epi = nepi) != NULL; | ||
493 | nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { | ||
494 | /* | ||
495 | * We need to check if the item is already in the list. | ||
496 | * During the "sproc" callback execution time, items are | ||
497 | * queued into ->ovflist but the "txlist" might already | ||
498 | * contain them, and the list_splice() below takes care of them. | ||
499 | */ | ||
500 | if (!ep_is_linked(&epi->rdllink)) | ||
501 | list_add_tail(&epi->rdllink, &ep->rdllist); | ||
502 | } | ||
503 | /* | ||
504 | * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after | ||
505 | * releasing the lock, events will be queued in the normal way inside | ||
506 | * ep->rdllist. | ||
507 | */ | ||
508 | ep->ovflist = EP_UNACTIVE_PTR; | ||
509 | |||
510 | /* | ||
511 | * Quickly re-inject items left on "txlist". | ||
512 | */ | ||
513 | list_splice(&txlist, &ep->rdllist); | ||
514 | |||
515 | if (!list_empty(&ep->rdllist)) { | ||
516 | /* | ||
517 | * Wake up (if active) both the eventpoll wait list and the ->poll() | ||
518 | * wait list (delayed after we release the lock). | ||
519 | */ | ||
520 | if (waitqueue_active(&ep->wq)) | ||
521 | wake_up_locked(&ep->wq); | ||
522 | if (waitqueue_active(&ep->poll_wait)) | ||
523 | pwake++; | ||
524 | } | ||
525 | spin_unlock_irqrestore(&ep->lock, flags); | ||
526 | |||
527 | mutex_unlock(&ep->mtx); | ||
528 | |||
529 | /* We have to call this outside the lock */ | ||
530 | if (pwake) | ||
531 | ep_poll_safewake(&ep->poll_wait); | ||
532 | |||
533 | return error; | ||
534 | } | ||
535 | |||
400 | /* | 536 | /* |
401 | * Removes a "struct epitem" from the eventpoll RB tree and deallocates | 537 | * Removes a "struct epitem" from the eventpoll RB tree and deallocates |
402 | * all the associated resources. Must be called with "mtx" held. | 538 | * all the associated resources. Must be called with "mtx" held. |
@@ -447,7 +583,7 @@ static void ep_free(struct eventpoll *ep) | |||
447 | 583 | ||
448 | /* We need to release all tasks waiting for these file */ | 584 | /* We need to release all tasks waiting for these file */ |
449 | if (waitqueue_active(&ep->poll_wait)) | 585 | if (waitqueue_active(&ep->poll_wait)) |
450 | ep_poll_safewake(&psw, &ep->poll_wait); | 586 | ep_poll_safewake(&ep->poll_wait); |
451 | 587 | ||
452 | /* | 588 | /* |
453 | * We need to lock this because we could be hit by | 589 | * We need to lock this because we could be hit by |
@@ -496,22 +632,49 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) | |||
496 | return 0; | 632 | return 0; |
497 | } | 633 | } |
498 | 634 | ||
635 | static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) | ||
636 | { | ||
637 | struct epitem *epi, *tmp; | ||
638 | |||
639 | list_for_each_entry_safe(epi, tmp, head, rdllink) { | ||
640 | if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & | ||
641 | epi->event.events) | ||
642 | return POLLIN | POLLRDNORM; | ||
643 | else | ||
644 | /* | ||
645 | * Item has been dropped into the ready list by the poll | ||
646 | * callback, but it's not actually ready, as far as | ||
647 | * caller requested events goes. We can remove it here. | ||
648 | */ | ||
649 | list_del_init(&epi->rdllink); | ||
650 | } | ||
651 | |||
652 | return 0; | ||
653 | } | ||
654 | |||
655 | static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) | ||
656 | { | ||
657 | return ep_scan_ready_list(priv, ep_read_events_proc, NULL); | ||
658 | } | ||
659 | |||
499 | static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) | 660 | static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) |
500 | { | 661 | { |
501 | unsigned int pollflags = 0; | 662 | int pollflags; |
502 | unsigned long flags; | ||
503 | struct eventpoll *ep = file->private_data; | 663 | struct eventpoll *ep = file->private_data; |
504 | 664 | ||
505 | /* Insert inside our poll wait queue */ | 665 | /* Insert inside our poll wait queue */ |
506 | poll_wait(file, &ep->poll_wait, wait); | 666 | poll_wait(file, &ep->poll_wait, wait); |
507 | 667 | ||
508 | /* Check our condition */ | 668 | /* |
509 | spin_lock_irqsave(&ep->lock, flags); | 669 | * Proceed to find out if wanted events are really available inside |
510 | if (!list_empty(&ep->rdllist)) | 670 | * the ready list. This need to be done under ep_call_nested() |
511 | pollflags = POLLIN | POLLRDNORM; | 671 | * supervision, since the call to f_op->poll() done on listed files |
512 | spin_unlock_irqrestore(&ep->lock, flags); | 672 | * could re-enter here. |
673 | */ | ||
674 | pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, | ||
675 | ep_poll_readyevents_proc, ep, ep); | ||
513 | 676 | ||
514 | return pollflags; | 677 | return pollflags != -1 ? pollflags: 0; |
515 | } | 678 | } |
516 | 679 | ||
517 | /* File callbacks that implement the eventpoll file behaviour */ | 680 | /* File callbacks that implement the eventpoll file behaviour */ |
@@ -541,7 +704,7 @@ void eventpoll_release_file(struct file *file) | |||
541 | * We don't want to get "file->f_lock" because it is not | 704 | * We don't want to get "file->f_lock" because it is not |
542 | * necessary. It is not necessary because we're in the "struct file" | 705 | * necessary. It is not necessary because we're in the "struct file" |
543 | * cleanup path, and this means that no one is using this file anymore. | 706 | * cleanup path, and this means that no one is using this file anymore. |
544 | * So, for example, epoll_ctl() cannot hit here sicne if we reach this | 707 | * So, for example, epoll_ctl() cannot hit here since if we reach this |
545 | * point, the file counter already went to zero and fget() would fail. | 708 | * point, the file counter already went to zero and fget() would fail. |
546 | * The only hit might come from ep_free() but by holding the mutex | 709 | * The only hit might come from ep_free() but by holding the mutex |
547 | * will correctly serialize the operation. We do need to acquire | 710 | * will correctly serialize the operation. We do need to acquire |
@@ -670,12 +833,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k | |||
670 | } | 833 | } |
671 | 834 | ||
672 | /* If this file is already in the ready list we exit soon */ | 835 | /* If this file is already in the ready list we exit soon */ |
673 | if (ep_is_linked(&epi->rdllink)) | 836 | if (!ep_is_linked(&epi->rdllink)) |
674 | goto is_linked; | 837 | list_add_tail(&epi->rdllink, &ep->rdllist); |
675 | |||
676 | list_add_tail(&epi->rdllink, &ep->rdllist); | ||
677 | 838 | ||
678 | is_linked: | ||
679 | /* | 839 | /* |
680 | * Wake up ( if active ) both the eventpoll wait list and the ->poll() | 840 | * Wake up ( if active ) both the eventpoll wait list and the ->poll() |
681 | * wait list. | 841 | * wait list. |
@@ -690,7 +850,7 @@ out_unlock: | |||
690 | 850 | ||
691 | /* We have to call this outside the lock */ | 851 | /* We have to call this outside the lock */ |
692 | if (pwake) | 852 | if (pwake) |
693 | ep_poll_safewake(&psw, &ep->poll_wait); | 853 | ep_poll_safewake(&ep->poll_wait); |
694 | 854 | ||
695 | return 1; | 855 | return 1; |
696 | } | 856 | } |
@@ -712,10 +872,9 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, | |||
712 | add_wait_queue(whead, &pwq->wait); | 872 | add_wait_queue(whead, &pwq->wait); |
713 | list_add_tail(&pwq->llink, &epi->pwqlist); | 873 | list_add_tail(&pwq->llink, &epi->pwqlist); |
714 | epi->nwait++; | 874 | epi->nwait++; |
715 | } else { | 875 | } else |
716 | /* We have to signal that an error occurred */ | 876 | /* We have to signal that an error occurred */ |
717 | epi->nwait = -1; | 877 | epi->nwait = -1; |
718 | } | ||
719 | } | 878 | } |
720 | 879 | ||
721 | static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) | 880 | static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) |
@@ -817,7 +976,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, | |||
817 | 976 | ||
818 | /* We have to call this outside the lock */ | 977 | /* We have to call this outside the lock */ |
819 | if (pwake) | 978 | if (pwake) |
820 | ep_poll_safewake(&psw, &ep->poll_wait); | 979 | ep_poll_safewake(&ep->poll_wait); |
821 | 980 | ||
822 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n", | 981 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n", |
823 | current, ep, tfile, fd)); | 982 | current, ep, tfile, fd)); |
@@ -891,137 +1050,74 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even | |||
891 | 1050 | ||
892 | /* We have to call this outside the lock */ | 1051 | /* We have to call this outside the lock */ |
893 | if (pwake) | 1052 | if (pwake) |
894 | ep_poll_safewake(&psw, &ep->poll_wait); | 1053 | ep_poll_safewake(&ep->poll_wait); |
895 | 1054 | ||
896 | return 0; | 1055 | return 0; |
897 | } | 1056 | } |
898 | 1057 | ||
899 | static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, | 1058 | static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) |
900 | int maxevents) | ||
901 | { | 1059 | { |
902 | int eventcnt, error = -EFAULT, pwake = 0; | 1060 | struct ep_send_events_data *esed = priv; |
903 | unsigned int revents; | 1061 | int eventcnt; |
904 | unsigned long flags; | 1062 | unsigned int revents; |
905 | struct epitem *epi, *nepi; | 1063 | struct epitem *epi; |
906 | struct list_head txlist; | 1064 | struct epoll_event __user *uevent; |
907 | |||
908 | INIT_LIST_HEAD(&txlist); | ||
909 | |||
910 | /* | ||
911 | * We need to lock this because we could be hit by | ||
912 | * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). | ||
913 | */ | ||
914 | mutex_lock(&ep->mtx); | ||
915 | |||
916 | /* | ||
917 | * Steal the ready list, and re-init the original one to the | ||
918 | * empty list. Also, set ep->ovflist to NULL so that events | ||
919 | * happening while looping w/out locks, are not lost. We cannot | ||
920 | * have the poll callback to queue directly on ep->rdllist, | ||
921 | * because we are doing it in the loop below, in a lockless way. | ||
922 | */ | ||
923 | spin_lock_irqsave(&ep->lock, flags); | ||
924 | list_splice(&ep->rdllist, &txlist); | ||
925 | INIT_LIST_HEAD(&ep->rdllist); | ||
926 | ep->ovflist = NULL; | ||
927 | spin_unlock_irqrestore(&ep->lock, flags); | ||
928 | 1065 | ||
929 | /* | 1066 | /* |
930 | * We can loop without lock because this is a task private list. | 1067 | * We can loop without lock because we are passed a task private list. |
931 | * We just splice'd out the ep->rdllist in ep_collect_ready_items(). | 1068 | * Items cannot vanish during the loop because ep_scan_ready_list() is |
932 | * Items cannot vanish during the loop because we are holding "mtx". | 1069 | * holding "mtx" during this call. |
933 | */ | 1070 | */ |
934 | for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) { | 1071 | for (eventcnt = 0, uevent = esed->events; |
935 | epi = list_first_entry(&txlist, struct epitem, rdllink); | 1072 | !list_empty(head) && eventcnt < esed->maxevents;) { |
1073 | epi = list_first_entry(head, struct epitem, rdllink); | ||
936 | 1074 | ||
937 | list_del_init(&epi->rdllink); | 1075 | list_del_init(&epi->rdllink); |
938 | 1076 | ||
939 | /* | 1077 | revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & |
940 | * Get the ready file event set. We can safely use the file | 1078 | epi->event.events; |
941 | * because we are holding the "mtx" and this will guarantee | 1079 | |
942 | * that both the file and the item will not vanish. | 1080 | /* |
943 | */ | 1081 | * If the event mask intersect the caller-requested one, |
944 | revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); | 1082 | * deliver the event to userspace. Again, ep_scan_ready_list() |
945 | revents &= epi->event.events; | 1083 | * is holding "mtx", so no operations coming from userspace |
946 | 1084 | * can change the item. | |
947 | /* | 1085 | */ |
948 | * Is the event mask intersect the caller-requested one, | 1086 | if (revents) { |
949 | * deliver the event to userspace. Again, we are holding | 1087 | if (__put_user(revents, &uevent->events) || |
950 | * "mtx", so no operations coming from userspace can change | 1088 | __put_user(epi->event.data, &uevent->data)) |
951 | * the item. | 1089 | return eventcnt ? eventcnt: -EFAULT; |
952 | */ | 1090 | eventcnt++; |
953 | if (revents) { | 1091 | uevent++; |
954 | if (__put_user(revents, | 1092 | if (epi->event.events & EPOLLONESHOT) |
955 | &events[eventcnt].events) || | 1093 | epi->event.events &= EP_PRIVATE_BITS; |
956 | __put_user(epi->event.data, | 1094 | else if (!(epi->event.events & EPOLLET)) |
957 | &events[eventcnt].data)) | 1095 | /* |
958 | goto errxit; | 1096 | * If this file has been added with Level Trigger |
959 | if (epi->event.events & EPOLLONESHOT) | 1097 | * mode, we need to insert back inside the ready |
960 | epi->event.events &= EP_PRIVATE_BITS; | 1098 | * list, so that the next call to epoll_wait() |
961 | eventcnt++; | 1099 | * will check again the events availability. |
962 | } | 1100 | * At this point, noone can insert into ep->rdllist |
963 | /* | 1101 | * besides us. The epoll_ctl() callers are locked |
964 | * At this point, noone can insert into ep->rdllist besides | 1102 | * out by ep_scan_ready_list() holding "mtx" and |
965 | * us. The epoll_ctl() callers are locked out by us holding | 1103 | * the poll callback will queue them in ep->ovflist. |
966 | * "mtx" and the poll callback will queue them in ep->ovflist. | 1104 | */ |
967 | */ | 1105 | list_add_tail(&epi->rdllink, &ep->rdllist); |
968 | if (!(epi->event.events & EPOLLET) && | 1106 | } |
969 | (revents & epi->event.events)) | 1107 | } |
970 | list_add_tail(&epi->rdllink, &ep->rdllist); | 1108 | |
971 | } | 1109 | return eventcnt; |
972 | error = 0; | 1110 | } |
973 | |||
974 | errxit: | ||
975 | |||
976 | spin_lock_irqsave(&ep->lock, flags); | ||
977 | /* | ||
978 | * During the time we spent in the loop above, some other events | ||
979 | * might have been queued by the poll callback. We re-insert them | ||
980 | * inside the main ready-list here. | ||
981 | */ | ||
982 | for (nepi = ep->ovflist; (epi = nepi) != NULL; | ||
983 | nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { | ||
984 | /* | ||
985 | * If the above loop quit with errors, the epoll item might still | ||
986 | * be linked to "txlist", and the list_splice() done below will | ||
987 | * take care of those cases. | ||
988 | */ | ||
989 | if (!ep_is_linked(&epi->rdllink)) | ||
990 | list_add_tail(&epi->rdllink, &ep->rdllist); | ||
991 | } | ||
992 | /* | ||
993 | * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after | ||
994 | * releasing the lock, events will be queued in the normal way inside | ||
995 | * ep->rdllist. | ||
996 | */ | ||
997 | ep->ovflist = EP_UNACTIVE_PTR; | ||
998 | |||
999 | /* | ||
1000 | * In case of error in the event-send loop, or in case the number of | ||
1001 | * ready events exceeds the userspace limit, we need to splice the | ||
1002 | * "txlist" back inside ep->rdllist. | ||
1003 | */ | ||
1004 | list_splice(&txlist, &ep->rdllist); | ||
1005 | |||
1006 | if (!list_empty(&ep->rdllist)) { | ||
1007 | /* | ||
1008 | * Wake up (if active) both the eventpoll wait list and the ->poll() | ||
1009 | * wait list (delayed after we release the lock). | ||
1010 | */ | ||
1011 | if (waitqueue_active(&ep->wq)) | ||
1012 | wake_up_locked(&ep->wq); | ||
1013 | if (waitqueue_active(&ep->poll_wait)) | ||
1014 | pwake++; | ||
1015 | } | ||
1016 | spin_unlock_irqrestore(&ep->lock, flags); | ||
1017 | 1111 | ||
1018 | mutex_unlock(&ep->mtx); | 1112 | static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, |
1113 | int maxevents) | ||
1114 | { | ||
1115 | struct ep_send_events_data esed; | ||
1019 | 1116 | ||
1020 | /* We have to call this outside the lock */ | 1117 | esed.maxevents = maxevents; |
1021 | if (pwake) | 1118 | esed.events = events; |
1022 | ep_poll_safewake(&psw, &ep->poll_wait); | ||
1023 | 1119 | ||
1024 | return eventcnt == 0 ? error: eventcnt; | 1120 | return ep_scan_ready_list(ep, ep_send_events_proc, &esed); |
1025 | } | 1121 | } |
1026 | 1122 | ||
1027 | static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, | 1123 | static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, |
@@ -1033,7 +1129,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, | |||
1033 | wait_queue_t wait; | 1129 | wait_queue_t wait; |
1034 | 1130 | ||
1035 | /* | 1131 | /* |
1036 | * Calculate the timeout by checking for the "infinite" value ( -1 ) | 1132 | * Calculate the timeout by checking for the "infinite" value (-1) |
1037 | * and the overflow condition. The passed timeout is in milliseconds, | 1133 | * and the overflow condition. The passed timeout is in milliseconds, |
1038 | * that's why (t * HZ) / 1000. | 1134 | * that's why (t * HZ) / 1000. |
1039 | */ | 1135 | */ |
@@ -1076,9 +1172,8 @@ retry: | |||
1076 | 1172 | ||
1077 | set_current_state(TASK_RUNNING); | 1173 | set_current_state(TASK_RUNNING); |
1078 | } | 1174 | } |
1079 | |||
1080 | /* Is it worth to try to dig for events ? */ | 1175 | /* Is it worth to try to dig for events ? */ |
1081 | eavail = !list_empty(&ep->rdllist); | 1176 | eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; |
1082 | 1177 | ||
1083 | spin_unlock_irqrestore(&ep->lock, flags); | 1178 | spin_unlock_irqrestore(&ep->lock, flags); |
1084 | 1179 | ||
@@ -1099,41 +1194,40 @@ retry: | |||
1099 | */ | 1194 | */ |
1100 | SYSCALL_DEFINE1(epoll_create1, int, flags) | 1195 | SYSCALL_DEFINE1(epoll_create1, int, flags) |
1101 | { | 1196 | { |
1102 | int error, fd = -1; | 1197 | int error; |
1103 | struct eventpoll *ep; | 1198 | struct eventpoll *ep = NULL; |
1104 | 1199 | ||
1105 | /* Check the EPOLL_* constant for consistency. */ | 1200 | /* Check the EPOLL_* constant for consistency. */ |
1106 | BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); | 1201 | BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); |
1107 | 1202 | ||
1108 | if (flags & ~EPOLL_CLOEXEC) | ||
1109 | return -EINVAL; | ||
1110 | |||
1111 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", | 1203 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", |
1112 | current, flags)); | 1204 | current, flags)); |
1113 | 1205 | ||
1206 | error = -EINVAL; | ||
1207 | if (flags & ~EPOLL_CLOEXEC) | ||
1208 | goto error_return; | ||
1209 | |||
1114 | /* | 1210 | /* |
1115 | * Create the internal data structure ( "struct eventpoll" ). | 1211 | * Create the internal data structure ("struct eventpoll"). |
1116 | */ | 1212 | */ |
1117 | error = ep_alloc(&ep); | 1213 | error = ep_alloc(&ep); |
1118 | if (error < 0) { | 1214 | if (error < 0) |
1119 | fd = error; | ||
1120 | goto error_return; | 1215 | goto error_return; |
1121 | } | ||
1122 | 1216 | ||
1123 | /* | 1217 | /* |
1124 | * Creates all the items needed to setup an eventpoll file. That is, | 1218 | * Creates all the items needed to setup an eventpoll file. That is, |
1125 | * a file structure and a free file descriptor. | 1219 | * a file structure and a free file descriptor. |
1126 | */ | 1220 | */ |
1127 | fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, | 1221 | error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, |
1128 | flags & O_CLOEXEC); | 1222 | flags & O_CLOEXEC); |
1129 | if (fd < 0) | 1223 | if (error < 0) |
1130 | ep_free(ep); | 1224 | ep_free(ep); |
1131 | 1225 | ||
1132 | error_return: | 1226 | error_return: |
1133 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", | 1227 | DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", |
1134 | current, flags, fd)); | 1228 | current, flags, error)); |
1135 | 1229 | ||
1136 | return fd; | 1230 | return error; |
1137 | } | 1231 | } |
1138 | 1232 | ||
1139 | SYSCALL_DEFINE1(epoll_create, int, size) | 1233 | SYSCALL_DEFINE1(epoll_create, int, size) |
@@ -1359,7 +1453,10 @@ static int __init eventpoll_init(void) | |||
1359 | EP_ITEM_COST; | 1453 | EP_ITEM_COST; |
1360 | 1454 | ||
1361 | /* Initialize the structure used to perform safe poll wait head wake ups */ | 1455 | /* Initialize the structure used to perform safe poll wait head wake ups */ |
1362 | ep_poll_safewake_init(&psw); | 1456 | ep_nested_calls_init(&poll_safewake_ncalls); |
1457 | |||
1458 | /* Initialize the structure used to perform file's f_op->poll() calls */ | ||
1459 | ep_nested_calls_init(&poll_readywalk_ncalls); | ||
1363 | 1460 | ||
1364 | /* Allocates slab cache used to allocate "struct epitem" items */ | 1461 | /* Allocates slab cache used to allocate "struct epitem" items */ |
1365 | epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), | 1462 | epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), |