 fs/eventpoll.c | 1034 ++++++++++++++++++++++++++++--------------------------------
 1 file changed, 486 insertions(+), 548 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index e224abfd9197..1aad34ea61a4 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -41,7 +41,6 @@
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 
-
 /*
  * LOCKING:
  * There are three level of locking required by epoll :
@@ -74,7 +73,6 @@
  * a greater scalability.
  */
 
-
 #define DEBUG_EPOLL 0
 
 #if DEBUG_EPOLL > 0
@@ -104,7 +102,6 @@
 
 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 
-
 struct epoll_filefd {
         struct file *file;
         int fd;
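
[Editor's note, not part of the patch: EP_MAX_EVENTS above caps the maxevents argument of epoll_wait(2) so that maxevents * sizeof(struct epoll_event) cannot exceed INT_MAX in the later access_ok()/copy_to_user() checks. A minimal userspace illustration of the same bound:]

#include <limits.h>
#include <stdio.h>
#include <sys/epoll.h>

/* Same arithmetic as the kernel define kept above. */
#define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))

int main(void)
{
        printf("sizeof(struct epoll_event) = %zu\n", sizeof(struct epoll_event));
        printf("EP_MAX_EVENTS = %zu\n", (size_t)EP_MAX_EVENTS);
        return 0;
}
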
@@ -222,36 +219,6 @@ struct ep_pqueue {
         struct epitem *epi;
 };
 
-
-
-static void ep_poll_safewake_init(struct poll_safewake *psw);
-static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq);
-static int ep_alloc(struct eventpoll **pep);
-static void ep_free(struct eventpoll *ep);
-static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd);
-static void ep_use_epitem(struct epitem *epi);
-static void ep_release_epitem(struct epitem *epi);
-static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
-                                 poll_table *pt);
-static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi);
-static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
-                     struct file *tfile, int fd);
-static int ep_modify(struct eventpoll *ep, struct epitem *epi,
-                     struct epoll_event *event);
-static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi);
-static int ep_unlink(struct eventpoll *ep, struct epitem *epi);
-static int ep_remove(struct eventpoll *ep, struct epitem *epi);
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key);
-static int ep_eventpoll_close(struct inode *inode, struct file *file);
-static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
-static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
-                          struct epoll_event __user *events, int maxevents);
-static int ep_events_transfer(struct eventpoll *ep,
-                              struct epoll_event __user *events,
-                              int maxevents);
-static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
-                   int maxevents, long timeout);
-
 /*
  * This semaphore is used to serialize ep_free() and eventpoll_release_file().
  */
@@ -266,19 +233,6 @@ static struct kmem_cache *epi_cache __read_mostly;
 /* Slab cache used to allocate "struct eppoll_entry" */
 static struct kmem_cache *pwq_cache __read_mostly;
 
-/* File callbacks that implement the eventpoll file behaviour */
-static const struct file_operations eventpoll_fops = {
-        .release        = ep_eventpoll_close,
-        .poll           = ep_eventpoll_poll
-};
-
-
-
-/* Fast test to see if the file is an evenpoll file */
-static inline int is_file_epoll(struct file *f)
-{
-        return f->f_op == &eventpoll_fops;
-}
 
 /* Setup the structure that is used as key for the rb-tree */
 static inline void ep_set_ffd(struct epoll_filefd *ffd,
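
[Editor's note, illustrative, not part of the patch: the rb-tree that ep_set_ffd() builds keys for is ordered first by file pointer, then by fd. A sketch of the total order such a key implies, modeled on the ep_cmp_ffd() helper this file uses for lookups:]

/* Sketch only: compare two (file, fd) keys for the eventpoll rb-tree. */
static inline int ep_cmp_ffd(struct epoll_filefd *p1, struct epoll_filefd *p2)
{
        return (p1->file > p2->file ? +1 :
                (p1->file < p2->file ? -1 : p1->fd - p2->fd));
}
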
@@ -347,7 +301,6 @@ static void ep_poll_safewake_init(struct poll_safewake *psw)
         spin_lock_init(&psw->lock);
 }
 
-
 /*
  * Perform a safe wake up of the poll wait list. The problem is that
  * with the new callback'd wake up system, it is possible that the
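
[Editor's note, illustrative, not part of the patch: the recursion hazard the comment above describes arises because one epoll set may watch another, so a wakeup delivered to the inner set can re-enter the wakeup path of the outer one. A userspace sketch of such nesting:]

#include <stdio.h>
#include <sys/epoll.h>

int main(void)
{
        int inner = epoll_create(1);
        int outer = epoll_create(1);
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = inner };

        /* Watching one epoll set from another is allowed, which is what
         * makes recursive wakeups possible in the first place. */
        if (inner >= 0 && outer >= 0 &&
            epoll_ctl(outer, EPOLL_CTL_ADD, inner, &ev) == 0)
                printf("nested epoll registered\n");
        return 0;
}
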
@@ -402,325 +355,145 @@ static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
         spin_unlock_irqrestore(&psw->lock, flags);
 }
 
-
 /*
- * This is called from eventpoll_release() to unlink files from the eventpoll
- * interface. We need to have this facility to cleanup correctly files that are
- * closed without being removed from the eventpoll interface.
+ * This function unregister poll callbacks from the associated file descriptor.
+ * Since this must be called without holding "ep->lock" the atomic exchange trick
+ * will protect us from multiple unregister.
  */
-void eventpoll_release_file(struct file *file)
+static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 {
-        struct list_head *lsthead = &file->f_ep_links;
-        struct eventpoll *ep;
-        struct epitem *epi;
+        int nwait;
+        struct list_head *lsthead = &epi->pwqlist;
+        struct eppoll_entry *pwq;
 
-        /*
-         * We don't want to get "file->f_ep_lock" because it is not
-         * necessary. It is not necessary because we're in the "struct file"
-         * cleanup path, and this means that noone is using this file anymore.
-         * The only hit might come from ep_free() but by holding the semaphore
-         * will correctly serialize the operation. We do need to acquire
-         * "ep->sem" after "epmutex" because ep_remove() requires it when called
-         * from anywhere but ep_free().
-         */
-        mutex_lock(&epmutex);
+        /* This is called without locks, so we need the atomic exchange */
+        nwait = xchg(&epi->nwait, 0);
 
-        while (!list_empty(lsthead)) {
-                epi = list_first_entry(lsthead, struct epitem, fllink);
+        if (nwait) {
+                while (!list_empty(lsthead)) {
+                        pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
 
-                ep = epi->ep;
-                list_del_init(&epi->fllink);
-                down_write(&ep->sem);
-                ep_remove(ep, epi);
-                up_write(&ep->sem);
+                        list_del_init(&pwq->llink);
+                        remove_wait_queue(pwq->whead, &pwq->wait);
+                        kmem_cache_free(pwq_cache, pwq);
+                }
         }
-
-        mutex_unlock(&epmutex);
 }
 
-
 /*
- * It opens an eventpoll file descriptor by suggesting a storage of "size"
- * file descriptors. The size parameter is just an hint about how to size
- * data structures. It won't prevent the user to store more than "size"
- * file descriptors inside the epoll interface. It is the kernel part of
- * the userspace epoll_create(2).
+ * Unlink the "struct epitem" from all places it might have been hooked up.
+ * This function must be called with write IRQ lock on "ep->lock".
  */
-asmlinkage long sys_epoll_create(int size)
+static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
 {
-        int error, fd = -1;
-        struct eventpoll *ep;
-        struct inode *inode;
-        struct file *file;
-
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
-                     current, size));
-
-        /*
-         * Sanity check on the size parameter, and create the internal data
-         * structure ( "struct eventpoll" ).
-         */
-        error = -EINVAL;
-        if (size <= 0 || (error = ep_alloc(&ep)) != 0)
-                goto eexit_1;
+        int error;
 
         /*
-         * Creates all the items needed to setup an eventpoll file. That is,
-         * a file structure, and inode and a free file descriptor.
+         * It can happen that this one is called for an item already unlinked.
+         * The check protect us from doing a double unlink ( crash ).
          */
-        error = anon_inode_getfd(&fd, &inode, &file, "[eventpoll]",
-                                 &eventpoll_fops, ep);
-        if (error)
-                goto eexit_2;
-
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-                     current, size, fd));
-
-        return fd;
-
-eexit_2:
-        ep_free(ep);
-        kfree(ep);
-eexit_1:
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-                     current, size, error));
-        return error;
-}
-
-
-/*
- * The following function implements the controller interface for
- * the eventpoll file that enables the insertion/removal/change of
- * file descriptors inside the interest set. It represents
- * the kernel part of the user space epoll_ctl(2).
- */
-asmlinkage long
-sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
-{
-        int error;
-        struct file *file, *tfile;
-        struct eventpoll *ep;
-        struct epitem *epi;
-        struct epoll_event epds;
-
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
-                     current, epfd, op, fd, event));
-
-        error = -EFAULT;
-        if (ep_op_has_event(op) &&
-            copy_from_user(&epds, event, sizeof(struct epoll_event)))
-                goto eexit_1;
-
-        /* Get the "struct file *" for the eventpoll file */
-        error = -EBADF;
-        file = fget(epfd);
-        if (!file)
-                goto eexit_1;
-
-        /* Get the "struct file *" for the target file */
-        tfile = fget(fd);
-        if (!tfile)
-                goto eexit_2;
-
-        /* The target file descriptor must support poll */
-        error = -EPERM;
-        if (!tfile->f_op || !tfile->f_op->poll)
-                goto eexit_3;
+        error = -ENOENT;
+        if (!ep_rb_linked(&epi->rbn))
+                goto error_return;
 
         /*
-         * We have to check that the file structure underneath the file descriptor
-         * the user passed to us _is_ an eventpoll file. And also we do not permit
-         * adding an epoll file descriptor inside itself.
+         * Clear the event mask for the unlinked item. This will avoid item
+         * notifications to be sent after the unlink operation from inside
+         * the kernel->userspace event transfer loop.
          */
-        error = -EINVAL;
-        if (file == tfile || !is_file_epoll(file))
-                goto eexit_3;
+        epi->event.events = 0;
 
         /*
-         * At this point it is safe to assume that the "private_data" contains
-         * our own data structure.
+         * At this point is safe to do the job, unlink the item from our rb-tree.
+         * This operation togheter with the above check closes the door to
+         * double unlinks.
          */
-        ep = file->private_data;
-
-        down_write(&ep->sem);
-
-        /* Try to lookup the file inside our RB tree */
-        epi = ep_find(ep, tfile, fd);
-
-        error = -EINVAL;
-        switch (op) {
-        case EPOLL_CTL_ADD:
-                if (!epi) {
-                        epds.events |= POLLERR | POLLHUP;
-
-                        error = ep_insert(ep, &epds, tfile, fd);
-                } else
-                        error = -EEXIST;
-                break;
-        case EPOLL_CTL_DEL:
-                if (epi)
-                        error = ep_remove(ep, epi);
-                else
-                        error = -ENOENT;
-                break;
-        case EPOLL_CTL_MOD:
-                if (epi) {
-                        epds.events |= POLLERR | POLLHUP;
-                        error = ep_modify(ep, epi, &epds);
-                } else
-                        error = -ENOENT;
-                break;
-        }
+        ep_rb_erase(&epi->rbn, &ep->rbr);
 
         /*
-         * The function ep_find() increments the usage count of the structure
-         * so, if this is not NULL, we need to release it.
+         * If the item we are going to remove is inside the ready file descriptors
+         * we want to remove it from this list to avoid stale events.
          */
-        if (epi)
-                ep_release_epitem(epi);
+        if (ep_is_linked(&epi->rdllink))
+                list_del_init(&epi->rdllink);
 
-        up_write(&ep->sem);
+        error = 0;
+error_return:
 
-eexit_3:
-        fput(tfile);
-eexit_2:
-        fput(file);
-eexit_1:
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
-                     current, epfd, op, fd, event, error));
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
+                     current, ep, epi->ffd.file, error));
 
         return error;
 }
 
-
 /*
- * Implement the event wait interface for the eventpoll file. It is the kernel
- * part of the user space epoll_wait(2).
+ * Increment the usage count of the "struct epitem" making it sure
+ * that the user will have a valid pointer to reference.
  */
-asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
-                               int maxevents, int timeout)
+static void ep_use_epitem(struct epitem *epi)
 {
-        int error;
-        struct file *file;
-        struct eventpoll *ep;
-
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
-                     current, epfd, events, maxevents, timeout));
-
-        /* The maximum number of event must be greater than zero */
-        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
-                return -EINVAL;
-
-        /* Verify that the area passed by the user is writeable */
-        if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
-                error = -EFAULT;
-                goto eexit_1;
-        }
-
-        /* Get the "struct file *" for the eventpoll file */
-        error = -EBADF;
-        file = fget(epfd);
-        if (!file)
-                goto eexit_1;
-
-        /*
-         * We have to check that the file structure underneath the fd
-         * the user passed to us _is_ an eventpoll file.
-         */
-        error = -EINVAL;
-        if (!is_file_epoll(file))
-                goto eexit_2;
-
-        /*
-         * At this point it is safe to assume that the "private_data" contains
-         * our own data structure.
-         */
-        ep = file->private_data;
-
-        /* Time to fish for events ... */
-        error = ep_poll(ep, events, maxevents, timeout);
-
-eexit_2:
-        fput(file);
-eexit_1:
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
-                     current, epfd, events, maxevents, timeout, error));
-
-        return error;
+        atomic_inc(&epi->usecnt);
 }
 
-
-#ifdef TIF_RESTORE_SIGMASK
+/*
+ * Decrement ( release ) the usage count by signaling that the user
+ * has finished using the structure. It might lead to freeing the
+ * structure itself if the count goes to zero.
+ */
+static void ep_release_epitem(struct epitem *epi)
+{
+        if (atomic_dec_and_test(&epi->usecnt))
+                kmem_cache_free(epi_cache, epi);
+}
 
 /*
- * Implement the event wait interface for the eventpoll file. It is the kernel
- * part of the user space epoll_pwait(2).
+ * Removes a "struct epitem" from the eventpoll RB tree and deallocates
+ * all the associated resources.
  */
-asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
-                int maxevents, int timeout, const sigset_t __user *sigmask,
-                size_t sigsetsize)
+static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 {
         int error;
-        sigset_t ksigmask, sigsaved;
-
-        /*
-         * If the caller wants a certain signal mask to be set during the wait,
-         * we apply it here.
-         */
-        if (sigmask) {
-                if (sigsetsize != sizeof(sigset_t))
-                        return -EINVAL;
-                if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
-                        return -EFAULT;
-                sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
-                sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
-        }
-
-        error = sys_epoll_wait(epfd, events, maxevents, timeout);
+        unsigned long flags;
+        struct file *file = epi->ffd.file;
 
         /*
-         * If we changed the signal mask, we need to restore the original one.
-         * In case we've got a signal while waiting, we do not restore the
-         * signal mask yet, and we allow do_signal() to deliver the signal on
-         * the way back to userspace, before the signal mask is restored.
+         * Removes poll wait queue hooks. We _have_ to do this without holding
+         * the "ep->lock" otherwise a deadlock might occur. This because of the
+         * sequence of the lock acquisition. Here we do "ep->lock" then the wait
+         * queue head lock when unregistering the wait queue. The wakeup callback
+         * will run by holding the wait queue head lock and will call our callback
+         * that will try to get "ep->lock".
          */
-        if (sigmask) {
-                if (error == -EINTR) {
-                        memcpy(&current->saved_sigmask, &sigsaved,
-                               sizeof(sigsaved));
-                        set_thread_flag(TIF_RESTORE_SIGMASK);
-                } else
-                        sigprocmask(SIG_SETMASK, &sigsaved, NULL);
-        }
+        ep_unregister_pollwait(ep, epi);
 
-        return error;
-}
+        /* Remove the current item from the list of epoll hooks */
+        spin_lock(&file->f_ep_lock);
+        if (ep_is_linked(&epi->fllink))
+                list_del_init(&epi->fllink);
+        spin_unlock(&file->f_ep_lock);
 
-#endif /* #ifdef TIF_RESTORE_SIGMASK */
+        /* We need to acquire the write IRQ lock before calling ep_unlink() */
+        write_lock_irqsave(&ep->lock, flags);
 
+        /* Really unlink the item from the RB tree */
+        error = ep_unlink(ep, epi);
 
-static int ep_alloc(struct eventpoll **pep)
-{
-        struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+        write_unlock_irqrestore(&ep->lock, flags);
 
-        if (!ep)
-                return -ENOMEM;
+        if (error)
+                goto error_return;
 
-        rwlock_init(&ep->lock);
-        init_rwsem(&ep->sem);
-        init_waitqueue_head(&ep->wq);
-        init_waitqueue_head(&ep->poll_wait);
-        INIT_LIST_HEAD(&ep->rdllist);
-        ep->rbr = RB_ROOT;
+        /* At this point it is safe to free the eventpoll item */
+        ep_release_epitem(epi);
 
-        *pep = ep;
+        error = 0;
+error_return:
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
+                     current, ep, file, error));
 
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
-                     current, ep));
-        return 0;
+        return error;
 }
 
-
 static void ep_free(struct eventpoll *ep)
 {
         struct rb_node *rbp;
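
[Editor's note, illustrative, not part of the patch: the "atomic exchange trick" in ep_unregister_pollwait() above makes teardown idempotent; whichever caller swaps epi->nwait to zero first performs the cleanup. A standalone C11 sketch of the same pattern, with hypothetical names:]

#include <stdatomic.h>

struct item {
        atomic_int nwait;               /* number of registered wait hooks */
};

/* Whoever swaps nwait to 0 first "wins" and does the teardown, so
 * concurrent callers can never free the hook list twice. */
static void unregister_hooks(struct item *it)
{
        int nwait = atomic_exchange(&it->nwait, 0);

        if (nwait) {
                /* ... walk and free the wait queue hooks exactly once ... */
        }
}
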
@@ -763,6 +536,104 @@ static void ep_free(struct eventpoll *ep)
         mutex_unlock(&epmutex);
 }
 
+static int ep_eventpoll_release(struct inode *inode, struct file *file)
+{
+        struct eventpoll *ep = file->private_data;
+
+        if (ep) {
+                ep_free(ep);
+                kfree(ep);
+        }
+
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
+        return 0;
+}
+
+static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
+{
+        unsigned int pollflags = 0;
+        unsigned long flags;
+        struct eventpoll *ep = file->private_data;
+
+        /* Insert inside our poll wait queue */
+        poll_wait(file, &ep->poll_wait, wait);
+
+        /* Check our condition */
+        read_lock_irqsave(&ep->lock, flags);
+        if (!list_empty(&ep->rdllist))
+                pollflags = POLLIN | POLLRDNORM;
+        read_unlock_irqrestore(&ep->lock, flags);
+
+        return pollflags;
+}
+
+/* File callbacks that implement the eventpoll file behaviour */
+static const struct file_operations eventpoll_fops = {
+        .release        = ep_eventpoll_release,
+        .poll           = ep_eventpoll_poll
+};
+
+/* Fast test to see if the file is an evenpoll file */
+static inline int is_file_epoll(struct file *f)
+{
+        return f->f_op == &eventpoll_fops;
+}
+
+/*
+ * This is called from eventpoll_release() to unlink files from the eventpoll
+ * interface. We need to have this facility to cleanup correctly files that are
+ * closed without being removed from the eventpoll interface.
+ */
+void eventpoll_release_file(struct file *file)
+{
+        struct list_head *lsthead = &file->f_ep_links;
+        struct eventpoll *ep;
+        struct epitem *epi;
+
+        /*
+         * We don't want to get "file->f_ep_lock" because it is not
+         * necessary. It is not necessary because we're in the "struct file"
+         * cleanup path, and this means that noone is using this file anymore.
+         * The only hit might come from ep_free() but by holding the semaphore
+         * will correctly serialize the operation. We do need to acquire
+         * "ep->sem" after "epmutex" because ep_remove() requires it when called
+         * from anywhere but ep_free().
+         */
+        mutex_lock(&epmutex);
+
+        while (!list_empty(lsthead)) {
+                epi = list_first_entry(lsthead, struct epitem, fllink);
+
+                ep = epi->ep;
+                list_del_init(&epi->fllink);
+                down_write(&ep->sem);
+                ep_remove(ep, epi);
+                up_write(&ep->sem);
+        }
+
+        mutex_unlock(&epmutex);
+}
+
+static int ep_alloc(struct eventpoll **pep)
+{
+        struct eventpoll *ep = kzalloc(sizeof(*ep), GFP_KERNEL);
+
+        if (!ep)
+                return -ENOMEM;
+
+        rwlock_init(&ep->lock);
+        init_rwsem(&ep->sem);
+        init_waitqueue_head(&ep->wq);
+        init_waitqueue_head(&ep->poll_wait);
+        INIT_LIST_HEAD(&ep->rdllist);
+        ep->rbr = RB_ROOT;
+
+        *pep = ep;
+
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
+                     current, ep));
+        return 0;
+}
 
 /*
  * Search the file inside the eventpoll tree. It add usage count to
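
[Editor's note, illustrative, not part of the patch: because ep_eventpoll_poll() above implements the file's ->poll hook, an epoll file descriptor can itself be waited on with poll(2) or select(2); it reports POLLIN when its ready list is non-empty. A userspace sketch:]

#include <poll.h>
#include <stdio.h>
#include <sys/epoll.h>

int main(void)
{
        int epfd = epoll_create(1);
        struct pollfd pfd = { .fd = epfd, .events = POLLIN };

        /* The empty epoll set has an empty ready list, so this times out
         * immediately and returns 0. */
        int n = poll(&pfd, 1, 0);
        printf("poll() on an epoll fd returned %d\n", n);
        return 0;
}
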
@@ -800,30 +671,58 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
         return epir;
 }
 
-
 /*
- * Increment the usage count of the "struct epitem" making it sure
- * that the user will have a valid pointer to reference.
+ * This is the callback that is passed to the wait queue wakeup
+ * machanism. It is called by the stored file descriptors when they
+ * have events to report.
  */
-static void ep_use_epitem(struct epitem *epi)
+static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
+        int pwake = 0;
+        unsigned long flags;
+        struct epitem *epi = ep_item_from_wait(wait);
+        struct eventpoll *ep = epi->ep;
 
-        atomic_inc(&epi->usecnt);
-}
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
+                     current, epi->ffd.file, epi, ep));
 
+        write_lock_irqsave(&ep->lock, flags);
 
         /*
- * Decrement ( release ) the usage count by signaling that the user
- * has finished using the structure. It might lead to freeing the
- * structure itself if the count goes to zero.
- */
-static void ep_release_epitem(struct epitem *epi)
-{
+         * If the event mask does not contain any poll(2) event, we consider the
+         * descriptor to be disabled. This condition is likely the effect of the
+         * EPOLLONESHOT bit that disables the descriptor when an event is received,
+         * until the next EPOLL_CTL_MOD will be issued.
+         */
+        if (!(epi->event.events & ~EP_PRIVATE_BITS))
+                goto is_disabled;
 
-        if (atomic_dec_and_test(&epi->usecnt))
-                kmem_cache_free(epi_cache, epi);
-}
+        /* If this file is already in the ready list we exit soon */
+        if (ep_is_linked(&epi->rdllink))
+                goto is_linked;
+
+        list_add_tail(&epi->rdllink, &ep->rdllist);
+
+is_linked:
+        /*
+         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
+         * wait list.
+         */
+        if (waitqueue_active(&ep->wq))
+                __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
+                                 TASK_INTERRUPTIBLE);
+        if (waitqueue_active(&ep->poll_wait))
+                pwake++;
+
+is_disabled:
+        write_unlock_irqrestore(&ep->lock, flags);
+
+        /* We have to call this outside the lock */
+        if (pwake)
+                ep_poll_safewake(&psw, &ep->poll_wait);
 
+        return 1;
+}
 
 /*
  * This is the callback that is used to add our wait queue to the
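
[Editor's note, illustrative, not part of the patch: the EPOLLONESHOT behaviour described in ep_poll_callback() above means a descriptor stops reporting events after the first one until userspace re-arms it. A sketch of the re-arm, assuming epfd and fd are already a valid epoll set and a registered target:]

#include <sys/epoll.h>

/* Hypothetical helper: re-enable a descriptor that EPOLLONESHOT disabled. */
static void rearm_oneshot(int epfd, int fd)
{
        struct epoll_event ev;

        ev.events = EPOLLIN | EPOLLONESHOT;     /* armed for one more event */
        ev.data.fd = fd;
        epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);
}
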
@@ -848,7 +747,6 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
         }
 }
 
-
 static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
 {
         int kcmp;
@@ -868,7 +766,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
         rb_insert_color(&epi->rbn, &ep->rbr);
 }
 
-
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
                      struct file *tfile, int fd)
 {
@@ -879,7 +776,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
         error = -ENOMEM;
         if (!(epi = kmem_cache_alloc(epi_cache, GFP_KERNEL)))
-                goto eexit_1;
+                goto error_return;
 
         /* Item initialization follow here ... */
         ep_rb_initnode(&epi->rbn);
@@ -909,7 +806,7 @@
          * high memory pressure.
          */
         if (epi->nwait < 0)
-                goto eexit_2;
+                goto error_unregister;
 
         /* Add the current item to the list of active epoll hook for this file */
         spin_lock(&tfile->f_ep_lock);
@@ -944,7 +841,7 @@
 
         return 0;
 
-eexit_2:
+error_unregister:
         ep_unregister_pollwait(ep, epi);
 
         /*
@@ -957,11 +854,10 @@ eexit_2:
         write_unlock_irqrestore(&ep->lock, flags);
 
         kmem_cache_free(epi_cache, epi);
-eexit_1:
+error_return:
         return error;
 }
 
-
 /*
  * Modify the interest event mask by dropping an event if the new mask
  * has a match in the current file status.
@@ -1024,216 +920,6 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
         return 0;
 }
 
-
-/*
- * This function unregister poll callbacks from the associated file descriptor.
- * Since this must be called without holding "ep->lock" the atomic exchange trick
- * will protect us from multiple unregister.
- */
-static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
-{
-        int nwait;
-        struct list_head *lsthead = &epi->pwqlist;
-        struct eppoll_entry *pwq;
-
-        /* This is called without locks, so we need the atomic exchange */
-        nwait = xchg(&epi->nwait, 0);
-
-        if (nwait) {
-                while (!list_empty(lsthead)) {
-                        pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
-
-                        list_del_init(&pwq->llink);
-                        remove_wait_queue(pwq->whead, &pwq->wait);
-                        kmem_cache_free(pwq_cache, pwq);
-                }
-        }
-}
-
-
-/*
- * Unlink the "struct epitem" from all places it might have been hooked up.
- * This function must be called with write IRQ lock on "ep->lock".
- */
-static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
-{
-        int error;
-
-        /*
-         * It can happen that this one is called for an item already unlinked.
-         * The check protect us from doing a double unlink ( crash ).
-         */
-        error = -ENOENT;
-        if (!ep_rb_linked(&epi->rbn))
-                goto eexit_1;
-
-        /*
-         * Clear the event mask for the unlinked item. This will avoid item
-         * notifications to be sent after the unlink operation from inside
-         * the kernel->userspace event transfer loop.
-         */
-        epi->event.events = 0;
-
-        /*
-         * At this point is safe to do the job, unlink the item from our rb-tree.
-         * This operation togheter with the above check closes the door to
-         * double unlinks.
-         */
-        ep_rb_erase(&epi->rbn, &ep->rbr);
-
-        /*
-         * If the item we are going to remove is inside the ready file descriptors
-         * we want to remove it from this list to avoid stale events.
-         */
-        if (ep_is_linked(&epi->rdllink))
-                list_del_init(&epi->rdllink);
-
-        error = 0;
-eexit_1:
-
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n",
-                     current, ep, epi->ffd.file, error));
-
-        return error;
-}
-
-
-/*
- * Removes a "struct epitem" from the eventpoll RB tree and deallocates
- * all the associated resources.
- */
-static int ep_remove(struct eventpoll *ep, struct epitem *epi)
-{
-        int error;
-        unsigned long flags;
-        struct file *file = epi->ffd.file;
-
-        /*
-         * Removes poll wait queue hooks. We _have_ to do this without holding
-         * the "ep->lock" otherwise a deadlock might occur. This because of the
-         * sequence of the lock acquisition. Here we do "ep->lock" then the wait
-         * queue head lock when unregistering the wait queue. The wakeup callback
-         * will run by holding the wait queue head lock and will call our callback
-         * that will try to get "ep->lock".
-         */
-        ep_unregister_pollwait(ep, epi);
-
-        /* Remove the current item from the list of epoll hooks */
-        spin_lock(&file->f_ep_lock);
-        if (ep_is_linked(&epi->fllink))
-                list_del_init(&epi->fllink);
-        spin_unlock(&file->f_ep_lock);
-
-        /* We need to acquire the write IRQ lock before calling ep_unlink() */
-        write_lock_irqsave(&ep->lock, flags);
-
-        /* Really unlink the item from the RB tree */
-        error = ep_unlink(ep, epi);
-
-        write_unlock_irqrestore(&ep->lock, flags);
-
-        if (error)
-                goto eexit_1;
-
-        /* At this point it is safe to free the eventpoll item */
-        ep_release_epitem(epi);
-
-        error = 0;
-eexit_1:
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p) = %d\n",
-                     current, ep, file, error));
-
-        return error;
-}
-
-
-/*
- * This is the callback that is passed to the wait queue wakeup
- * machanism. It is called by the stored file descriptors when they
- * have events to report.
- */
-static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
-{
-        int pwake = 0;
-        unsigned long flags;
-        struct epitem *epi = ep_item_from_wait(wait);
-        struct eventpoll *ep = epi->ep;
-
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
-                     current, epi->ffd.file, epi, ep));
-
-        write_lock_irqsave(&ep->lock, flags);
-
-        /*
-         * If the event mask does not contain any poll(2) event, we consider the
-         * descriptor to be disabled. This condition is likely the effect of the
-         * EPOLLONESHOT bit that disables the descriptor when an event is received,
-         * until the next EPOLL_CTL_MOD will be issued.
-         */
-        if (!(epi->event.events & ~EP_PRIVATE_BITS))
-                goto is_disabled;
-
-        /* If this file is already in the ready list we exit soon */
-        if (ep_is_linked(&epi->rdllink))
-                goto is_linked;
-
-        list_add_tail(&epi->rdllink, &ep->rdllist);
-
-is_linked:
-        /*
-         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
-         * wait list.
-         */
-        if (waitqueue_active(&ep->wq))
-                __wake_up_locked(&ep->wq, TASK_UNINTERRUPTIBLE |
-                                 TASK_INTERRUPTIBLE);
-        if (waitqueue_active(&ep->poll_wait))
-                pwake++;
-
-is_disabled:
-        write_unlock_irqrestore(&ep->lock, flags);
-
-        /* We have to call this outside the lock */
-        if (pwake)
-                ep_poll_safewake(&psw, &ep->poll_wait);
-
-        return 1;
-}
-
-
-static int ep_eventpoll_close(struct inode *inode, struct file *file)
-{
-        struct eventpoll *ep = file->private_data;
-
-        if (ep) {
-                ep_free(ep);
-                kfree(ep);
-        }
-
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
-        return 0;
-}
-
-
-static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
-{
-        unsigned int pollflags = 0;
-        unsigned long flags;
-        struct eventpoll *ep = file->private_data;
-
-        /* Insert inside our poll wait queue */
-        poll_wait(file, &ep->poll_wait, wait);
-
-        /* Check our condition */
-        read_lock_irqsave(&ep->lock, flags);
-        if (!list_empty(&ep->rdllist))
-                pollflags = POLLIN | POLLRDNORM;
-        read_unlock_irqrestore(&ep->lock, flags);
-
-        return pollflags;
-}
-
-
 /*
  * This function is called without holding the "ep->lock" since the call to
  * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
@@ -1345,7 +1031,6 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
         return eventcnt == 0 ? error: eventcnt;
 }
 
-
 /*
  * Perform the transfer of events to user space.
  */
@@ -1381,7 +1066,6 @@ static int ep_events_transfer(struct eventpoll *ep,
         return eventcnt;
 }
 
-
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                    int maxevents, long timeout)
 {
@@ -1451,6 +1135,260 @@ retry:
         return res;
 }
 
+/*
+ * It opens an eventpoll file descriptor by suggesting a storage of "size"
+ * file descriptors. The size parameter is just an hint about how to size
+ * data structures. It won't prevent the user to store more than "size"
+ * file descriptors inside the epoll interface. It is the kernel part of
+ * the userspace epoll_create(2).
+ */
+asmlinkage long sys_epoll_create(int size)
+{
+        int error, fd = -1;
+        struct eventpoll *ep;
+        struct inode *inode;
+        struct file *file;
+
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
+                     current, size));
+
+        /*
+         * Sanity check on the size parameter, and create the internal data
+         * structure ( "struct eventpoll" ).
+         */
+        error = -EINVAL;
+        if (size <= 0 || (error = ep_alloc(&ep)) != 0)
+                goto error_return;
+
+        /*
+         * Creates all the items needed to setup an eventpoll file. That is,
+         * a file structure, and inode and a free file descriptor.
+         */
+        error = anon_inode_getfd(&fd, &inode, &file, "[eventpoll]",
+                                 &eventpoll_fops, ep);
+        if (error)
+                goto error_free;
+
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
+                     current, size, fd));
+
+        return fd;
+
+error_free:
+        ep_free(ep);
+        kfree(ep);
+error_return:
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
+                     current, size, error));
+        return error;
+}
+
+/*
+ * The following function implements the controller interface for
+ * the eventpoll file that enables the insertion/removal/change of
+ * file descriptors inside the interest set. It represents
+ * the kernel part of the user space epoll_ctl(2).
+ */
+asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
+                              struct epoll_event __user *event)
+{
+        int error;
+        struct file *file, *tfile;
+        struct eventpoll *ep;
+        struct epitem *epi;
+        struct epoll_event epds;
+
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
+                     current, epfd, op, fd, event));
+
+        error = -EFAULT;
+        if (ep_op_has_event(op) &&
+            copy_from_user(&epds, event, sizeof(struct epoll_event)))
+                goto error_return;
+
+        /* Get the "struct file *" for the eventpoll file */
+        error = -EBADF;
+        file = fget(epfd);
+        if (!file)
+                goto error_return;
+
+        /* Get the "struct file *" for the target file */
+        tfile = fget(fd);
+        if (!tfile)
+                goto error_fput;
+
+        /* The target file descriptor must support poll */
+        error = -EPERM;
+        if (!tfile->f_op || !tfile->f_op->poll)
+                goto error_tgt_fput;
+
+        /*
+         * We have to check that the file structure underneath the file descriptor
+         * the user passed to us _is_ an eventpoll file. And also we do not permit
+         * adding an epoll file descriptor inside itself.
+         */
+        error = -EINVAL;
+        if (file == tfile || !is_file_epoll(file))
+                goto error_tgt_fput;
+
+        /*
+         * At this point it is safe to assume that the "private_data" contains
+         * our own data structure.
+         */
+        ep = file->private_data;
+
+        down_write(&ep->sem);
+
+        /* Try to lookup the file inside our RB tree */
+        epi = ep_find(ep, tfile, fd);
+
+        error = -EINVAL;
+        switch (op) {
+        case EPOLL_CTL_ADD:
+                if (!epi) {
+                        epds.events |= POLLERR | POLLHUP;
+
+                        error = ep_insert(ep, &epds, tfile, fd);
+                } else
+                        error = -EEXIST;
+                break;
+        case EPOLL_CTL_DEL:
+                if (epi)
+                        error = ep_remove(ep, epi);
+                else
+                        error = -ENOENT;
+                break;
+        case EPOLL_CTL_MOD:
+                if (epi) {
+                        epds.events |= POLLERR | POLLHUP;
+                        error = ep_modify(ep, epi, &epds);
+                } else
+                        error = -ENOENT;
+                break;
+        }
+        /*
+         * The function ep_find() increments the usage count of the structure
+         * so, if this is not NULL, we need to release it.
+         */
+        if (epi)
+                ep_release_epitem(epi);
+        up_write(&ep->sem);
+
+error_tgt_fput:
+        fput(tfile);
+error_fput:
+        fput(file);
+error_return:
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
+                     current, epfd, op, fd, event, error));
+
+        return error;
+}
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_wait(2).
+ */
+asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
+                               int maxevents, int timeout)
+{
+        int error;
+        struct file *file;
+        struct eventpoll *ep;
+
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
+                     current, epfd, events, maxevents, timeout));
+
+        /* The maximum number of event must be greater than zero */
+        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
+                return -EINVAL;
+
+        /* Verify that the area passed by the user is writeable */
+        if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
+                error = -EFAULT;
+                goto error_return;
+        }
+
+        /* Get the "struct file *" for the eventpoll file */
+        error = -EBADF;
+        file = fget(epfd);
+        if (!file)
+                goto error_return;
+
+        /*
+         * We have to check that the file structure underneath the fd
+         * the user passed to us _is_ an eventpoll file.
+         */
+        error = -EINVAL;
+        if (!is_file_epoll(file))
+                goto error_fput;
+
+        /*
+         * At this point it is safe to assume that the "private_data" contains
+         * our own data structure.
+         */
+        ep = file->private_data;
+
+        /* Time to fish for events ... */
+        error = ep_poll(ep, events, maxevents, timeout);
+
+error_fput:
+        fput(file);
+error_return:
+        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
+                     current, epfd, events, maxevents, timeout, error));
+
+        return error;
+}
+
+#ifdef TIF_RESTORE_SIGMASK
+
+/*
+ * Implement the event wait interface for the eventpoll file. It is the kernel
+ * part of the user space epoll_pwait(2).
+ */
+asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
+                int maxevents, int timeout, const sigset_t __user *sigmask,
+                size_t sigsetsize)
+{
+        int error;
+        sigset_t ksigmask, sigsaved;
+
+        /*
+         * If the caller wants a certain signal mask to be set during the wait,
+         * we apply it here.
+         */
+        if (sigmask) {
+                if (sigsetsize != sizeof(sigset_t))
+                        return -EINVAL;
+                if (copy_from_user(&ksigmask, sigmask, sizeof(ksigmask)))
+                        return -EFAULT;
+                sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+                sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved);
+        }
+
+        error = sys_epoll_wait(epfd, events, maxevents, timeout);
+
+        /*
+         * If we changed the signal mask, we need to restore the original one.
+         * In case we've got a signal while waiting, we do not restore the
+         * signal mask yet, and we allow do_signal() to deliver the signal on
+         * the way back to userspace, before the signal mask is restored.
+         */
+        if (sigmask) {
+                if (error == -EINTR) {
+                        memcpy(&current->saved_sigmask, &sigsaved,
+                               sizeof(sigsaved));
+                        set_thread_flag(TIF_RESTORE_SIGMASK);
+                } else
+                        sigprocmask(SIG_SETMASK, &sigsaved, NULL);
+        }
+
+        return error;
+}
+
+#endif /* #ifdef TIF_RESTORE_SIGMASK */
+
 static int __init eventpoll_init(void)
 {
         mutex_init(&epmutex);
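
[Editor's note, illustrative, not part of the patch: a minimal userspace program exercising the three syscalls moved above; the size argument of epoll_create(2) is only a sizing hint, EPOLL_CTL_ADD registers interest, and epoll_wait(2) collects ready events:]

#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
        struct epoll_event ev, events[8];
        int epfd, n, i;

        epfd = epoll_create(8);         /* "8" is a hint, not a limit */
        if (epfd < 0)
                return 1;

        ev.events = EPOLLIN;
        ev.data.fd = STDIN_FILENO;
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, STDIN_FILENO, &ev) < 0)
                return 1;

        n = epoll_wait(epfd, events, 8, 1000);  /* wait up to one second */
        for (i = 0; i < n; i++)
                printf("fd %d ready (events 0x%x)\n",
                       events[i].data.fd, events[i].events);

        close(epfd);
        return 0;
}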