 fs/eventpoll.c | 1034 +++++++++++++++++++++++++-----------------------------
 1 file changed, 486 insertions(+), 548 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e224abfd9197..1aad34ea61a4 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -41,7 +41,6 @@
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 
-
 /*
  * LOCKING:
  * There are three level of locking required by epoll :
@@ -74,7 +73,6 @@
  * a greater scalability.
  */
 
-
 #define DEBUG_EPOLL 0
 
 #if DEBUG_EPOLL > 0
@@ -104,7 +102,6 @@
 
 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 
-
 struct epoll_filefd {
 	struct file *file;
 	int fd;
@@ -222,36 +219,6 @@ struct ep_pqueue {
 	struct epitem *epi;
 };
 
-
-
-static void ep_poll_safewake_init(struct poll_safewake *psw);
-static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
-static int ep_alloc(struct eventpoll **pep);
-static void ep_free(struct eventpoll *ep);
-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
-static void ep_use_epitem(struct epitem *epi);
-static void ep_release_epitem(struct epitem *epi);
-static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
-				 poll_table *pt);
-static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi);
-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
-		     struct file *tfile, int fd);
-static int ep_modify(struct eventpoll *ep, struct epitem *epi,
-		     struct epoll_event *event);
-static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi);
-static int ep_unlink(struct eventpoll *ep, struct epitem *epi);
-static int ep_remove(struct eventpoll *ep, struct epitem *epi);
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key);
-static int ep_eventpoll_close(struct inode *inode, struct file *file);
-static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
-static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
-			  struct epoll_event __user *events, int maxevents);
-static int ep_events_transfer(struct eventpoll *ep,
-			      struct epoll_event __user *events,
-			      int maxevents);
-static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
-		   int maxevents, long timeout);
-
 /*
  * This semaphore is used to serialize ep_free() and eventpoll_release_file().
  */
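
The pre-declarations removed above become unnecessary once the definitions are reordered bottom-up, with every callee defined before its callers. A minimal sketch of the idea, with hypothetical names that are not from this patch:

	static int ep_helper(int x)
	{
		/* the definition itself serves as the declaration */
		return x * 2;
	}

	static int ep_caller(int x)
	{
		/* ep_helper() is already visible here, so no forward
		 * declaration is needed */
		return ep_helper(x) + 1;
	}
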
@@ -266,19 +233,6 @@ static struct kmem_cache *epi_cache __read_mostly;
 /* Slab cache used to allocate "struct eppoll_entry" */
 static struct kmem_cache *pwq_cache __read_mostly;
 
-/* File callbacks that implement the eventpoll file behaviour */
-static const struct file_operations eventpoll_fops = {
-	.release	= ep_eventpoll_close,
-	.poll		= ep_eventpoll_poll
-};
-
-
-
-/* Fast test to see if the file is an evenpoll file */
-static inline int is_file_epoll(struct file *f)
-{
-	return f->f_op == &eventpoll_fops;
-}
 
 /* Setup the structure that is used as key for the rb-tree */
 static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -347,7 +301,6 @@ static void ep_poll_safewake_init(struct poll_safewake *psw)
 	spin_lock_init(&psw->lock);
 }
 
-
 /*
  * Perform a safe wake up of the poll wait list. The problem is that
  * with the new callback'd wake up system, it is possible that the
@@ -402,325 +355,145 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
 	spin_unlock_irqrestore(&psw->lock, flags);
 }
 
-
-/*
- * This is called from eventpoll_release() to unlink files from the eventpoll
- * interface. We need to have this facility to cleanup correctly files that are
- * closed without being removed from the eventpoll interface.
- */
-void eventpoll_release_file(struct file *file)
-{
-	struct list_head *lsthead = &file->f_ep_links;
-	struct eventpoll *ep;
-	struct epitem *epi;
-
-	/*
-	 * We don't want to get "file->f_ep_lock" because it is not
-	 * necessary. It is not necessary because we're in the "struct file"
-	 * cleanup path, and this means that noone is using this file anymore.
-	 * The only hit might come from ep_free() but by holding the semaphore
-	 * will correctly serialize the operation. We do need to acquire
-	 * "ep->sem" after "epmutex" because ep_remove() requires it when called
-	 * from anywhere but ep_free().
-	 */
-	mutex_lock(&epmutex);
-
-	while (!list_empty(lsthead)) {
-		epi = list_first_entry(lsthead, struct epitem, fllink);
-
-		ep = epi->ep;
-		list_del_init(&epi->fllink);
-		down_write(&ep->sem);
-		ep_remove(ep, epi);
-		up_write(&ep->sem);
-	}
-
-	mutex_unlock(&epmutex);
-}
-
-
-/*
- * It opens an eventpoll file descriptor by suggesting a storage of "size"
- * file descriptors. The size parameter is just an hint about how to size
- * data structures. It won't prevent the user to store more than "size"
- * file descriptors inside the epoll interface. It is the kernel part of
- * the userspace epoll_create(2).
- */
-asmlinkage long sys_epoll_create(int size)
-{
-	int error, fd = -1;
-	struct eventpoll *ep;
-	struct inode *inode;
-	struct file *file;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
-		     current, size));
-
-	/*
-	 * Sanity check on the size parameter, and create the internal data
-	 * structure ( "struct eventpoll" ).
-	 */
-	error = -EINVAL;
-	if (size <= 0 || (error = ep_alloc(&ep)) != 0)
-		goto eexit_1;
-
-	/*
-	 * Creates all the items needed to setup an eventpoll file. That is,
-	 * a file structure, and inode and a free file descriptor.
-	 */
-	error = anon_inode_getfd(&fd, &inode, &file, "[eventpoll]",
-				 &eventpoll_fops, ep);
-	if (error)
-		goto eexit_2;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-		     current, size, fd));
-
-	return fd;
-
-eexit_2:
-	ep_free(ep);
-	kfree(ep);
-eexit_1:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-		     current, size, error));
-	return error;
-}
-
-
-/*
- * The following function implements the controller interface for
- * the eventpoll file that enables the insertion/removal/change of
- * file descriptors inside the interest set. It represents
- * the kernel part of the user space epoll_ctl(2).
- */
-asmlinkage long
-sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
-{
-	int error;
-	struct file *file, *tfile;
-	struct eventpoll *ep;
-	struct epitem *epi;
-	struct epoll_event epds;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
-		     current, epfd, op, fd, event));
-
-	error = -EFAULT;
-	if (ep_op_has_event(op) &&
-	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
-		goto eexit_1;
-
-	/* Get the "struct file *" for the eventpoll file */
-	error = -EBADF;
-	file = fget(epfd);
-	if (!file)
-		goto eexit_1;
-
-	/* Get the "struct file *" for the target file */
-	tfile = fget(fd);
-	if (!tfile)
-		goto eexit_2;
-
-	/* The target file descriptor must support poll */
-	error = -EPERM;
-	if (!tfile->f_op || !tfile->f_op->poll)
-		goto eexit_3;
-
-	/*
-	 * We have to check that the file structure underneath the file descriptor
-	 * the user passed to us _is_ an eventpoll file. And also we do not permit
-	 * adding an epoll file descriptor inside itself.
-	 */
-	error = -EINVAL;
-	if (file == tfile || !is_file_epoll(file))
-		goto eexit_3;
-
-	/*
-	 * At this point it is safe to assume that the "private_data" contains
-	 * our own data structure.
-	 */
-	ep = file->private_data;
-
-	down_write(&ep->sem);
-
-	/* Try to lookup the file inside our RB tree */
-	epi = ep_find(ep, tfile, fd);
-
-	error = -EINVAL;
-	switch (op) {
-	case EPOLL_CTL_ADD:
-		if (!epi) {
-			epds.events |= POLLERR | POLLHUP;
-
-			error = ep_insert(ep, &epds, tfile, fd);
-		} else
-			error = -EEXIST;
-		break;
-	case EPOLL_CTL_DEL:
-		if (epi)
-			error = ep_remove(ep, epi);
-		else
-			error = -ENOENT;
-		break;
-	case EPOLL_CTL_MOD:
-		if (epi) {
-			epds.events |= POLLERR | POLLHUP;
-			error = ep_modify(ep, epi, &epds);
-		} else
-			error = -ENOENT;
-		break;
-	}
-
-	/*
-	 * The function ep_find() increments the usage count of the structure
-	 * so, if this is not NULL, we need to release it.
-	 */
-	if (epi)
-		ep_release_epitem(epi);
-
-	up_write(&ep->sem);
-
-eexit_3:
-	fput(tfile);
-eexit_2:
-	fput(file);
-eexit_1:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
-		     current, epfd, op, fd, event, error));
-
-	return error;
-}
-
-
-/*
- * Implement the event wait interface for the eventpoll file. It is the kernel
- * part of the user space epoll_wait(2).
- */
-asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
-			       int maxevents, int timeout)
-{
-	int error;
-	struct file *file;
-	struct eventpoll *ep;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
-		     current, epfd, events, maxevents, timeout));
-
-	/* The maximum number of event must be greater than zero */
-	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
-		return -EINVAL;
-
-	/* Verify that the area passed by the user is writeable */
-	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
-		error = -EFAULT;
-		goto eexit_1;
-	}
-
-	/* Get the "struct file *" for the eventpoll file */
-	error = -EBADF;
-	file = fget(epfd);
-	if (!file)
-		goto eexit_1;
-
-	/*
-	 * We have to check that the file structure underneath the fd
-	 * the user passed to us _is_ an eventpoll file.
-	 */
-	error = -EINVAL;
-	if (!is_file_epoll(file))
-		goto eexit_2;
-
-	/*
-	 * At this point it is safe to assume that the "private_data" contains
-	 * our own data structure.
-	 */
-	ep = file->private_data;
-
-	/* Time to fish for events ... */
-	error = ep_poll(ep, events, maxevents, timeout);
-
-eexit_2:
-	fput(file);
-eexit_1:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
-		     current, epfd, events, maxevents, timeout, error));
-
-	return error;
-}
-
-
-#ifdef TIF_RESTORE_SIGMASK
-
-/*
- * Implement the event wait interface for the eventpoll file. It is the kernel
- * part of the user space epoll_pwait(2).
- */
-asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
-		int maxevents, int timeout, const sigset_t __user *sigmask,
-		size_t sigsetsize)
-{
-	int error;
-	sigset_t ksigmask, sigsaved;
-
-	/*
-	 * If the caller wants a certain signal mask to be set during the wait,
-	 * we apply it here.
-	 */
-	if (sigmask) {
-		if (sigsetsize != sizeof(sigset_t))
-			return -EINVAL;
-		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-			return -EFAULT;
-		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-	}
-
-	error = sys_epoll_wait(epfd, events, maxevents, timeout);
-
-	/*
-	 * If we changed the signal mask, we need to restore the original one.
-	 * In case we've got a signal while waiting, we do not restore the
-	 * signal mask yet, and we allow do_signal() to deliver the signal on
-	 * the way back to userspace, before the signal mask is restored.
-	 */
-	if (sigmask) {
-		if (error == -EINTR) {
-			memcpy(&current->saved_sigmask, &sigsaved,
-				sizeof(sigsaved));
-			set_thread_flag(TIF_RESTORE_SIGMASK);
-		} else
-			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-	}
-
-	return error;
-}
-
-
-#endif /* #ifdef TIF_RESTORE_SIGMASK */
-
-
-static int ep_alloc(struct eventpoll **pep)
-{
-	struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
-
-	if (!ep)
-		return -ENOMEM;
-
-	rwlock_init(&ep->lock);
-	init_rwsem(&ep->sem);
-	init_waitqueue_head(&ep->wq);
-	init_waitqueue_head(&ep->poll_wait);
-	INIT_LIST_HEAD(&ep->rdllist);
-	ep->rbr = RB_ROOT;
-
-	*pep = ep;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
-		     current, ep));
-	return 0;
-}
-
-
+/*
+ * This function unregister poll callbacks from the associated file descriptor.
+ * Since this must be called without holding "ep->lock" the atomic exchange trick
+ * will protect us from multiple unregister.
+ */
+static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
+{
+	int nwait;
+	struct list_head *lsthead = &epi->pwqlist;
+	struct eppoll_entry *pwq;
+
+	/* This is called without locks, so we need the atomic exchange */
+	nwait = xchg(&epi->nwait, 0);
+
+	if (nwait) {
+		while (!list_empty(lsthead)) {
+			pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
+
+			list_del_init(&pwq->llink);
+			remove_wait_queue(pwq->whead, &pwq->wait);
+			kmem_cache_free(pwq_cache, pwq);
+		}
+	}
+}
+
+/*
+ * Unlink the "struct epitem" from all places it might have been hooked up.
+ * This function must be called with write IRQ lock on "ep->lock".
+ */
+static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
+{
+	int error;
+
+	/*
+	 * It can happen that this one is called for an item already unlinked.
+	 * The check protect us from doing a double unlink ( crash ).
+	 */
+	error = -ENOENT;
+	if (!ep_rb_linked(&epi->rbn))
+		goto error_return;
+
+	/*
+	 * Clear the event mask for the unlinked item. This will avoid item
+	 * notifications to be sent after the unlink operation from inside
+	 * the kernel->userspace event transfer loop.
+	 */
+	epi->event.events = 0;
+
+	/*
+	 * At this point is safe to do the job, unlink the item from our rb-tree.
+	 * This operation togheter with the above check closes the door to
+	 * double unlinks.
+	 */
+	ep_rb_erase(&epi->rbn, &ep->rbr);
+
+	/*
+	 * If the item we are going to remove is inside the ready file descriptors
+	 * we want to remove it from this list to avoid stale events.
+	 */
+	if (ep_is_linked(&epi->rdllink))
+		list_del_init(&epi->rdllink);
+
+	error = 0;
+error_return:
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
+		     current, ep, epi->ffd.file, error));
+
+	return error;
+}
+
+/*
+ * Increment the usage count of the "struct epitem" making it sure
+ * that the user will have a valid pointer to reference.
+ */
+static void ep_use_epitem(struct epitem *epi)
+{
+	atomic_inc(&epi->usecnt);
+}
+
+/*
+ * Decrement ( release ) the usage count by signaling that the user
+ * has finished using the structure. It might lead to freeing the
+ * structure itself if the count goes to zero.
+ */
+static void ep_release_epitem(struct epitem *epi)
+{
+	if (atomic_dec_and_test(&epi->usecnt))
+		kmem_cache_free(epi_cache, epi);
+}
+
+/*
+ * Removes a "struct epitem" from the eventpoll RB tree and deallocates
+ * all the associated resources.
+ */
+static int ep_remove(struct eventpoll *ep, struct epitem *epi)
+{
+	int error;
+	unsigned long flags;
+	struct file *file = epi->ffd.file;
+
+	/*
+	 * Removes poll wait queue hooks. We _have_ to do this without holding
+	 * the "ep->lock" otherwise a deadlock might occur. This because of the
+	 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
+	 * queue head lock when unregistering the wait queue. The wakeup callback
+	 * will run by holding the wait queue head lock and will call our callback
+	 * that will try to get "ep->lock".
+	 */
+	ep_unregister_pollwait(ep, epi);
+
+	/* Remove the current item from the list of epoll hooks */
+	spin_lock(&file->f_ep_lock);
+	if (ep_is_linked(&epi->fllink))
+		list_del_init(&epi->fllink);
+	spin_unlock(&file->f_ep_lock);
+
+	/* We need to acquire the write IRQ lock before calling ep_unlink() */
+	write_lock_irqsave(&ep->lock, flags);
+
+	/* Really unlink the item from the RB tree */
+	error = ep_unlink(ep, epi);
+
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	if (error)
+		goto error_return;
+
+	/* At this point it is safe to free the eventpoll item */
+	ep_release_epitem(epi);
+
+	error = 0;
+error_return:
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
+		     current, ep, file, error));
+
+	return error;
+}
+
 static void ep_free(struct eventpoll *ep)
 {
 	struct rb_node *rbp;
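
The xchg() in ep_unregister_pollwait() makes the teardown idempotent: whichever path reaches it first swaps epi->nwait to zero and performs the cleanup, and any later caller sees zero and skips it. A rough userspace analogue of that pattern using C11 atomics (illustrative only, not kernel code):

	#include <stdatomic.h>
	#include <stdio.h>

	static _Atomic int nwait = 3;	/* pretend three wait queue hooks exist */

	static void unregister_pollwait(void)
	{
		/* atomic exchange: only the first caller observes nonzero */
		int n = atomic_exchange(&nwait, 0);

		if (n)
			printf("removing %d wait queue hooks\n", n);
		else
			printf("already unregistered, nothing to do\n");
	}

	int main(void)
	{
		unregister_pollwait();	/* does the real work */
		unregister_pollwait();	/* safe no-op on the second call */
		return 0;
	}
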
@@ -763,6 +536,104 @@ static void ep_free(struct eventpoll *ep)
 	mutex_unlock(&epmutex);
 }
 
+static int ep_eventpoll_release(struct inode *inode, struct file *file)
+{
+	struct eventpoll *ep = file->private_data;
+
+	if (ep) {
+		ep_free(ep);
+		kfree(ep);
+	}
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
+	return 0;
+}
+
+static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
+{
+	unsigned int pollflags = 0;
+	unsigned long flags;
+	struct eventpoll *ep = file->private_data;
+
+	/* Insert inside our poll wait queue */
+	poll_wait(file, &ep->poll_wait, wait);
+
+	/* Check our condition */
+	read_lock_irqsave(&ep->lock, flags);
+	if (!list_empty(&ep->rdllist))
+		pollflags = POLLIN | POLLRDNORM;
+	read_unlock_irqrestore(&ep->lock, flags);
+
+	return pollflags;
+}
+
+/* File callbacks that implement the eventpoll file behaviour */
+static const struct file_operations eventpoll_fops = {
+	.release	= ep_eventpoll_release,
+	.poll		= ep_eventpoll_poll
+};
+
+/* Fast test to see if the file is an evenpoll file */
+static inline int is_file_epoll(struct file *f)
+{
+	return f->f_op == &eventpoll_fops;
+}
+
+/*
+ * This is called from eventpoll_release() to unlink files from the eventpoll
+ * interface. We need to have this facility to cleanup correctly files that are
+ * closed without being removed from the eventpoll interface.
+ */
+void eventpoll_release_file(struct file *file)
+{
+	struct list_head *lsthead = &file->f_ep_links;
+	struct eventpoll *ep;
+	struct epitem *epi;
+
+	/*
+	 * We don't want to get "file->f_ep_lock" because it is not
+	 * necessary. It is not necessary because we're in the "struct file"
+	 * cleanup path, and this means that noone is using this file anymore.
+	 * The only hit might come from ep_free() but by holding the semaphore
+	 * will correctly serialize the operation. We do need to acquire
+	 * "ep->sem" after "epmutex" because ep_remove() requires it when called
+	 * from anywhere but ep_free().
+	 */
+	mutex_lock(&epmutex);
+
+	while (!list_empty(lsthead)) {
+		epi = list_first_entry(lsthead, struct epitem, fllink);
+
+		ep = epi->ep;
+		list_del_init(&epi->fllink);
+		down_write(&ep->sem);
+		ep_remove(ep, epi);
+		up_write(&ep->sem);
+	}
+
+	mutex_unlock(&epmutex);
+}
+
+static int ep_alloc(struct eventpoll **pep)
+{
+	struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+
+	if (!ep)
+		return -ENOMEM;
+
+	rwlock_init(&ep->lock);
+	init_rwsem(&ep->sem);
+	init_waitqueue_head(&ep->wq);
+	init_waitqueue_head(&ep->poll_wait);
+	INIT_LIST_HEAD(&ep->rdllist);
+	ep->rbr = RB_ROOT;
+
+	*pep = ep;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
+		     current, ep));
+	return 0;
+}
 
 /*
  * Search the file inside the eventpoll tree. It add usage count to
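
Because the eventpoll file exports a ->poll() method of its own, an epoll file descriptor can itself be watched with poll(2), select(2), or another epoll instance: ep_eventpoll_poll() reports POLLIN whenever the ready list is non-empty. A small userspace demonstration of that nesting, assuming the behaviour described by the code above:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/epoll.h>

	int main(void)
	{
		int inner = epoll_create(1);
		int outer = epoll_create(1);
		int pipefd[2];
		struct epoll_event ev = { .events = EPOLLIN };
		struct epoll_event out;

		if (inner < 0 || outer < 0 || pipe(pipefd) < 0)
			return 1;

		/* watch the read end of the pipe from the inner epoll set */
		ev.data.fd = pipefd[0];
		epoll_ctl(inner, EPOLL_CTL_ADD, pipefd[0], &ev);

		/* the inner epoll fd is itself pollable: register it outside */
		ev.data.fd = inner;
		epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev);

		write(pipefd[1], "x", 1);	/* make the pipe readable */

		/* readiness propagates: inner's rdllist is non-empty, so the
		 * inner fd reports POLLIN through its own ->poll() */
		printf("outer epoll_wait returned %d\n",
		       epoll_wait(outer, &out, 1, 1000));	/* expected: 1 */
		return 0;
	}
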
@@ -800,30 +671,58 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
 	return epir;
 }
 
-
-/*
- * Increment the usage count of the "struct epitem" making it sure
- * that the user will have a valid pointer to reference.
- */
-static void ep_use_epitem(struct epitem *epi)
-{
-
-	atomic_inc(&epi->usecnt);
-}
-
-
-/*
- * Decrement ( release ) the usage count by signaling that the user
- * has finished using the structure. It might lead to freeing the
- * structure itself if the count goes to zero.
- */
-static void ep_release_epitem(struct epitem *epi)
-{
-
-	if (atomic_dec_and_test(&epi->usecnt))
-		kmem_cache_free(epi_cache, epi);
-}
-
+/*
+ * This is the callback that is passed to the wait queue wakeup
+ * machanism. It is called by the stored file descriptors when they
+ * have events to report.
+ */
+static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	int pwake = 0;
+	unsigned long flags;
+	struct epitem *epi = ep_item_from_wait(wait);
+	struct eventpoll *ep = epi->ep;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
+		     current, epi->ffd.file, epi, ep));
+
+	write_lock_irqsave(&ep->lock, flags);
+
+	/*
+	 * If the event mask does not contain any poll(2) event, we consider the
+	 * descriptor to be disabled. This condition is likely the effect of the
+	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
+	 * until the next EPOLL_CTL_MOD will be issued.
+	 */
+	if (!(epi->event.events & ~EP_PRIVATE_BITS))
+		goto is_disabled;
+
+	/* If this file is already in the ready list we exit soon */
+	if (ep_is_linked(&epi->rdllink))
+		goto is_linked;
+
+	list_add_tail(&epi->rdllink, &ep->rdllist);
+
+is_linked:
+	/*
+	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
+	 * wait list.
+	 */
+	if (waitqueue_active(&ep->wq))
+		__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+				 TASK_INTERRUPTIBLE);
+	if (waitqueue_active(&ep->poll_wait))
+		pwake++;
+
+is_disabled:
+	write_unlock_irqrestore(&ep->lock, flags);
+
+	/* We have to call this outside the lock */
+	if (pwake)
+		ep_poll_safewake(&psw, &ep->poll_wait);
+
+	return 1;
+}
 
 /*
  * This is the callback that is used to add our wait queue to the
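
The EP_PRIVATE_BITS test in ep_poll_callback() is what implements EPOLLONESHOT: after a one-shot event has been delivered, the item's event mask is reduced to the private bits, so the callback treats the descriptor as disabled until user space re-arms it. From the user side that re-arm is an EPOLL_CTL_MOD, roughly like this sketch:

	#include <string.h>
	#include <sys/epoll.h>

	/* Re-arm a one-shot descriptor after consuming its event. */
	static void rearm_oneshot(int epfd, int fd)
	{
		struct epoll_event ev;

		memset(&ev, 0, sizeof(ev));
		ev.events = EPOLLIN | EPOLLONESHOT;
		ev.data.fd = fd;

		/* restores a real poll(2) event in the mask, so
		 * ep_poll_callback() stops treating the item as disabled */
		epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
	}
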
@@ -848,7 +747,6 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 	}
 }
 
-
 static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 {
 	int kcmp;
@@ -868,7 +766,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 	rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
-
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 		     struct file *tfile, int fd)
 {
@@ -879,7 +776,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	error = -ENOMEM;
 	if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
-		goto eexit_1;
+		goto error_return;
 
 	/* Item initialization follow here ... */
 	ep_rb_initnode(&epi->rbn);
@@ -909,7 +806,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	 * high memory pressure.
 	 */
 	if (epi->nwait < 0)
-		goto eexit_2;
+		goto error_unregister;
 
 	/* Add the current item to the list of active epoll hook for this file */
 	spin_lock(&tfile->f_ep_lock);
@@ -944,7 +841,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	return 0;
 
-eexit_2:
+error_unregister:
 	ep_unregister_pollwait(ep, epi);
 
 	/*
@@ -957,11 +854,10 @@ eexit_2:
 	write_unlock_irqrestore(&ep->lock, flags);
 
 	kmem_cache_free(epi_cache, epi);
-eexit_1:
+error_return:
 	return error;
 }
 
-
 /*
  * Modify the interest event mask by dropping an event if the new mask
  * has a match in the current file status.
@@ -1024,216 +920,6 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	return 0;
 }
 
-
-/*
- * This function unregister poll callbacks from the associated file descriptor.
- * Since this must be called without holding "ep->lock" the atomic exchange trick
- * will protect us from multiple unregister.
- */
-static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
-{
-	int nwait;
-	struct list_head *lsthead = &epi->pwqlist;
-	struct eppoll_entry *pwq;
-
-	/* This is called without locks, so we need the atomic exchange */
-	nwait = xchg(&epi->nwait, 0);
-
-	if (nwait) {
-		while (!list_empty(lsthead)) {
-			pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
-
-			list_del_init(&pwq->llink);
-			remove_wait_queue(pwq->whead, &pwq->wait);
-			kmem_cache_free(pwq_cache, pwq);
-		}
-	}
-}
-
-
-/*
- * Unlink the "struct epitem" from all places it might have been hooked up.
- * This function must be called with write IRQ lock on "ep->lock".
- */
-static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
-{
-	int error;
-
-	/*
-	 * It can happen that this one is called for an item already unlinked.
-	 * The check protect us from doing a double unlink ( crash ).
-	 */
-	error = -ENOENT;
-	if (!ep_rb_linked(&epi->rbn))
-		goto eexit_1;
-
-	/*
-	 * Clear the event mask for the unlinked item. This will avoid item
-	 * notifications to be sent after the unlink operation from inside
-	 * the kernel->userspace event transfer loop.
-	 */
-	epi->event.events = 0;
-
-	/*
-	 * At this point is safe to do the job, unlink the item from our rb-tree.
-	 * This operation togheter with the above check closes the door to
-	 * double unlinks.
-	 */
-	ep_rb_erase(&epi->rbn, &ep->rbr);
-
-	/*
-	 * If the item we are going to remove is inside the ready file descriptors
-	 * we want to remove it from this list to avoid stale events.
-	 */
-	if (ep_is_linked(&epi->rdllink))
-		list_del_init(&epi->rdllink);
-
-	error = 0;
-eexit_1:
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
-		     current, ep, epi->ffd.file, error));
-
-	return error;
-}
-
-
-/*
- * Removes a "struct epitem" from the eventpoll RB tree and deallocates
- * all the associated resources.
- */
-static int ep_remove(struct eventpoll *ep, struct epitem *epi)
-{
-	int error;
-	unsigned long flags;
-	struct file *file = epi->ffd.file;
-
-	/*
-	 * Removes poll wait queue hooks. We _have_ to do this without holding
-	 * the "ep->lock" otherwise a deadlock might occur. This because of the
-	 * sequence of the lock acquisition. Here we do "ep->lock" then the wait
-	 * queue head lock when unregistering the wait queue. The wakeup callback
-	 * will run by holding the wait queue head lock and will call our callback
-	 * that will try to get "ep->lock".
-	 */
-	ep_unregister_pollwait(ep, epi);
-
-	/* Remove the current item from the list of epoll hooks */
-	spin_lock(&file->f_ep_lock);
-	if (ep_is_linked(&epi->fllink))
-		list_del_init(&epi->fllink);
-	spin_unlock(&file->f_ep_lock);
-
-	/* We need to acquire the write IRQ lock before calling ep_unlink() */
-	write_lock_irqsave(&ep->lock, flags);
-
-	/* Really unlink the item from the RB tree */
-	error = ep_unlink(ep, epi);
-
-	write_unlock_irqrestore(&ep->lock, flags);
-
-	if (error)
-		goto eexit_1;
-
-	/* At this point it is safe to free the eventpoll item */
-	ep_release_epitem(epi);
-
-	error = 0;
-eexit_1:
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
-		     current, ep, file, error));
-
-	return error;
-}
-
-
-/*
- * This is the callback that is passed to the wait queue wakeup
- * machanism. It is called by the stored file descriptors when they
- * have events to report.
- */
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
-{
-	int pwake = 0;
-	unsigned long flags;
-	struct epitem *epi = ep_item_from_wait(wait);
-	struct eventpoll *ep = epi->ep;
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
-		     current, epi->ffd.file, epi, ep));
-
-	write_lock_irqsave(&ep->lock, flags);
-
-	/*
-	 * If the event mask does not contain any poll(2) event, we consider the
-	 * descriptor to be disabled. This condition is likely the effect of the
-	 * EPOLLONESHOT bit that disables the descriptor when an event is received,
-	 * until the next EPOLL_CTL_MOD will be issued.
-	 */
-	if (!(epi->event.events & ~EP_PRIVATE_BITS))
-		goto is_disabled;
-
-	/* If this file is already in the ready list we exit soon */
-	if (ep_is_linked(&epi->rdllink))
-		goto is_linked;
-
-	list_add_tail(&epi->rdllink, &ep->rdllist);
-
-is_linked:
-	/*
-	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
-	 * wait list.
-	 */
-	if (waitqueue_active(&ep->wq))
-		__wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
-				 TASK_INTERRUPTIBLE);
-	if (waitqueue_active(&ep->poll_wait))
-		pwake++;
-
-is_disabled:
-	write_unlock_irqrestore(&ep->lock, flags);
-
-	/* We have to call this outside the lock */
-	if (pwake)
-		ep_poll_safewake(&psw, &ep->poll_wait);
-
-	return 1;
-}
-
-
-static int ep_eventpoll_close(struct inode *inode, struct file *file)
-{
-	struct eventpoll *ep = file->private_data;
-
-	if (ep) {
-		ep_free(ep);
-		kfree(ep);
-	}
-
-	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
-	return 0;
-}
-
-
-static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
-{
-	unsigned int pollflags = 0;
-	unsigned long flags;
-	struct eventpoll *ep = file->private_data;
-
-	/* Insert inside our poll wait queue */
-	poll_wait(file, &ep->poll_wait, wait);
-
-	/* Check our condition */
-	read_lock_irqsave(&ep->lock, flags);
-	if (!list_empty(&ep->rdllist))
-		pollflags = POLLIN | POLLRDNORM;
-	read_unlock_irqrestore(&ep->lock, flags);
-
-	return pollflags;
-}
-
-
 /*
  * This function is called without holding the "ep->lock" since the call to
  * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
@@ -1345,7 +1031,6 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
 	return eventcnt == 0 ? error: eventcnt;
 }
 
-
 /*
  * Perform the transfer of events to user space.
  */
@@ -1381,7 +1066,6 @@ static int ep_events_transfer(struct eventpoll *ep,
 	return eventcnt;
 }
 
-
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
 		   int maxevents, long timeout)
 {
@@ -1451,6 +1135,260 @@ retry:
 	return res;
 }
 
+/*
+ * It opens an eventpoll file descriptor by suggesting a storage of "size"
+ * file descriptors. The size parameter is just an hint about how to size
+ * data structures. It won't prevent the user to store more than "size"
+ * file descriptors inside the epoll interface. It is the kernel part of
+ * the userspace epoll_create(2).
+ */
+asmlinkage long sys_epoll_create(int size)
+{
+	int error, fd = -1;
+	struct eventpoll *ep;
+	struct inode *inode;
+	struct file *file;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
+		     current, size));
+
+	/*
+	 * Sanity check on the size parameter, and create the internal data
+	 * structure ( "struct eventpoll" ).
+	 */
+	error = -EINVAL;
+	if (size <= 0 || (error = ep_alloc(&ep)) != 0)
+		goto error_return;
+
+	/*
+	 * Creates all the items needed to setup an eventpoll file. That is,
+	 * a file structure, and inode and a free file descriptor.
+	 */
+	error = anon_inode_getfd(&fd, &inode, &file, "[eventpoll]",
+				 &eventpoll_fops, ep);
+	if (error)
+		goto error_free;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
+		     current, size, fd));
+
+	return fd;
+
+error_free:
+	ep_free(ep);
+	kfree(ep);
+error_return:
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
+		     current, size, error));
+	return error;
+}
+
+/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set. It represents
+ * the kernel part of the user space epoll_ctl(2).
+ */
+asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
+			      struct epoll_event __user *event)
+{
+	int error;
+	struct file *file, *tfile;
+	struct eventpoll *ep;
+	struct epitem *epi;
+	struct epoll_event epds;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
+		     current, epfd, op, fd, event));
+
+	error = -EFAULT;
+	if (ep_op_has_event(op) &&
+	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
+		goto error_return;
+
+	/* Get the "struct file *" for the eventpoll file */
+	error = -EBADF;
+	file = fget(epfd);
+	if (!file)
+		goto error_return;
+
+	/* Get the "struct file *" for the target file */
+	tfile = fget(fd);
+	if (!tfile)
+		goto error_fput;
+
+	/* The target file descriptor must support poll */
+	error = -EPERM;
+	if (!tfile->f_op || !tfile->f_op->poll)
+		goto error_tgt_fput;
+
+	/*
+	 * We have to check that the file structure underneath the file descriptor
+	 * the user passed to us _is_ an eventpoll file. And also we do not permit
+	 * adding an epoll file descriptor inside itself.
+	 */
+	error = -EINVAL;
+	if (file == tfile || !is_file_epoll(file))
+		goto error_tgt_fput;
+
+	/*
+	 * At this point it is safe to assume that the "private_data" contains
+	 * our own data structure.
+	 */
+	ep = file->private_data;
+
+	down_write(&ep->sem);
+
+	/* Try to lookup the file inside our RB tree */
+	epi = ep_find(ep, tfile, fd);
+
+	error = -EINVAL;
+	switch (op) {
+	case EPOLL_CTL_ADD:
+		if (!epi) {
+			epds.events |= POLLERR | POLLHUP;
+
+			error = ep_insert(ep, &epds, tfile, fd);
+		} else
+			error = -EEXIST;
+		break;
+	case EPOLL_CTL_DEL:
+		if (epi)
+			error = ep_remove(ep, epi);
+		else
+			error = -ENOENT;
+		break;
+	case EPOLL_CTL_MOD:
+		if (epi) {
+			epds.events |= POLLERR | POLLHUP;
+			error = ep_modify(ep, epi, &epds);
+		} else
+			error = -ENOENT;
+		break;
+	}
+	/*
+	 * The function ep_find() increments the usage count of the structure
+	 * so, if this is not NULL, we need to release it.
+	 */
+	if (epi)
+		ep_release_epitem(epi);
+	up_write(&ep->sem);
+
+error_tgt_fput:
+	fput(tfile);
+error_fput:
+	fput(file);
+error_return:
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
+		     current, epfd, op, fd, event, error));
+
+	return error;
+}
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_wait(2).
+ */
+asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
+			       int maxevents, int timeout)
+{
+	int error;
+	struct file *file;
+	struct eventpoll *ep;
+
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
+		     current, epfd, events, maxevents, timeout));
+
+	/* The maximum number of event must be greater than zero */
+	if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+		return -EINVAL;
+
+	/* Verify that the area passed by the user is writeable */
+	if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
+		error = -EFAULT;
+		goto error_return;
+	}
+
+	/* Get the "struct file *" for the eventpoll file */
+	error = -EBADF;
+	file = fget(epfd);
+	if (!file)
+		goto error_return;
+
+	/*
+	 * We have to check that the file structure underneath the fd
+	 * the user passed to us _is_ an eventpoll file.
+	 */
+	error = -EINVAL;
+	if (!is_file_epoll(file))
+		goto error_fput;
+
+	/*
+	 * At this point it is safe to assume that the "private_data" contains
+	 * our own data structure.
+	 */
+	ep = file->private_data;
+
+	/* Time to fish for events ... */
+	error = ep_poll(ep, events, maxevents, timeout);
+
+error_fput:
+	fput(file);
+error_return:
+	DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
+		     current, epfd, events, maxevents, timeout, error));
+
+	return error;
+}
+
+#ifdef TIF_RESTORE_SIGMASK
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
+		int maxevents, int timeout, const sigset_t __user *sigmask,
+		size_t sigsetsize)
+{
+	int error;
+	sigset_t ksigmask, sigsaved;
+
+	/*
+	 * If the caller wants a certain signal mask to be set during the wait,
+	 * we apply it here.
+	 */
+	if (sigmask) {
+		if (sigsetsize != sizeof(sigset_t))
+			return -EINVAL;
+		if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+			return -EFAULT;
+		sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+		sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+	}
+
+	error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+	/*
+	 * If we changed the signal mask, we need to restore the original one.
+	 * In case we've got a signal while waiting, we do not restore the
+	 * signal mask yet, and we allow do_signal() to deliver the signal on
+	 * the way back to userspace, before the signal mask is restored.
+	 */
+	if (sigmask) {
+		if (error == -EINTR) {
+			memcpy(&current->saved_sigmask, &sigsaved,
+				sizeof(sigsaved));
+			set_thread_flag(TIF_RESTORE_SIGMASK);
+		} else
+			sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+	}
+
+	return error;
+}
+
+#endif /* #ifdef TIF_RESTORE_SIGMASK */
+
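
sys_epoll_pwait() exists to close the classic signal race: with plain epoll_wait(2) the mask change and the wait are two separate steps, and a signal arriving between them is delivered before the wait begins, so its handler's work can go unnoticed until some later event wakes the loop. A userspace sketch of the two patterns; the function names here are illustrative, not part of any API:

	#define _GNU_SOURCE
	#include <signal.h>
	#include <sys/epoll.h>

	/* Racy two-step version: a signal that arrives between
	 * sigprocmask() and epoll_wait() is consumed too early. */
	int wait_racy(int epfd, struct epoll_event *evs, int n,
		      const sigset_t *mask)
	{
		sigset_t old;
		int r;

		sigprocmask(SIG_SETMASK, mask, &old);	/* window opens here */
		r = epoll_wait(epfd, evs, n, -1);	/* may block, unaware a
							   signal already fired */
		sigprocmask(SIG_SETMASK, &old, NULL);
		return r;
	}

	/* Atomic version: the mask swap and the wait happen as one
	 * kernel operation, so the signal can only interrupt the wait. */
	int wait_atomic(int epfd, struct epoll_event *evs, int n,
			const sigset_t *mask)
	{
		return epoll_pwait(epfd, evs, n, -1, mask);
	}
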
 static int __init eventpoll_init(void)
 {
 	mutex_init(&epmutex);