Diffstat (limited to 'fs/eventpoll.c'):

 fs/eventpoll.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 73 insertions(+), 7 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index aabdfc38cf24..739b0985b398 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -34,7 +34,6 @@
 #include <linux/mutex.h>
 #include <linux/anon_inodes.h>
 #include <asm/uaccess.h>
-#include <asm/system.h>
 #include <asm/io.h>
 #include <asm/mman.h>
 #include <linux/atomic.h>
@@ -320,6 +319,11 @@ static inline int ep_is_linked(struct list_head *p)
 	return !list_empty(p);
 }
 
+static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
+{
+	return container_of(p, struct eppoll_entry, wait);
+}
+
 /* Get the "struct epitem" from a wait queue pointer */
 static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
 {
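The new ep_pwq_from_wait() helper is a plain container_of() lookup: given the wait_queue_t embedded in an eppoll_entry, it recovers the enclosing entry. A self-contained userspace illustration of the idiom (not from this patch; the structure and names are made up):

#include <stddef.h>

struct outer {
	int tag;
	int member;
};

/* Simplified container_of(): step back from a member pointer to the
 * start of the structure that embeds it. */
#define container_of(ptr, type, field) \
	((type *)((char *)(ptr) - offsetof(type, field)))

static struct outer *outer_from_member(int *m)
{
	return container_of(m, struct outer, member);
}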
@@ -422,6 +426,31 @@ out_unlock:
 	return error;
 }
 
+/*
+ * As described in commit 0ccf831cb lockdep: annotate epoll
+ * the use of wait queues used by epoll is done in a very controlled
+ * manner. Wake ups can nest inside each other, but are never done
+ * with the same locking. For example:
+ *
+ * dfd = socket(...);
+ * efd1 = epoll_create();
+ * efd2 = epoll_create();
+ * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
+ * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
+ *
+ * When a packet arrives to the device underneath "dfd", the net code will
+ * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
+ * callback wakeup entry on that queue, and the wake_up() performed by the
+ * "dfd" net code will end up in ep_poll_callback(). At this point epoll
+ * (efd1) notices that it may have some event ready, so it needs to wake up
+ * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
+ * that ends up in another wake_up(), after having checked about the
+ * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
+ * avoid stack blasting.
+ *
+ * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
+ * this special case of epoll.
+ */
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
 				     unsigned long events, int subclass)
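The scenario in the comment maps directly to a few lines of userspace code. A minimal sketch of the dfd -> efd1 -> efd2 chain it describes (illustrative only, not part of the patch):

#include <sys/epoll.h>
#include <sys/socket.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	int dfd = socket(AF_INET, SOCK_DGRAM, 0);
	int efd1 = epoll_create(1);
	int efd2 = epoll_create(1);
	struct epoll_event ev = { .events = EPOLLIN };

	if (dfd < 0 || efd1 < 0 || efd2 < 0) {
		perror("setup");
		return EXIT_FAILURE;
	}

	/* efd1 watches the socket ... */
	ev.data.fd = dfd;
	if (epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, &ev) < 0)
		perror("epoll_ctl efd1");

	/* ... and efd2 watches efd1, so wakeups nest one level deep. */
	ev.data.fd = efd1;
	if (epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, &ev) < 0)
		perror("epoll_ctl efd2");

	/* A packet arriving on dfd now wakes efd1, which in turn wakes efd2. */
	return 0;
}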
@@ -467,6 +496,18 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
 	put_cpu();
 }
 
+static void ep_remove_wait_queue(struct eppoll_entry *pwq)
+{
+	wait_queue_head_t *whead;
+
+	rcu_read_lock();
+	/* If it is cleared by POLLFREE, it should be rcu-safe */
+	whead = rcu_dereference(pwq->whead);
+	if (whead)
+		remove_wait_queue(whead, &pwq->wait);
+	rcu_read_unlock();
+}
+
 /*
  * This function unregisters poll callbacks from the associated file
  * descriptor. Must be called with "mtx" held (or "epmutex" if called from
@@ -481,7 +522,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 		pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
 
 		list_del(&pwq->llink);
-		remove_wait_queue(pwq->whead, &pwq->wait);
+		ep_remove_wait_queue(pwq);
 		kmem_cache_free(pwq_cache, pwq);
 	}
 }
@@ -682,9 +723,12 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 			       void *priv)
 {
 	struct epitem *epi, *tmp;
+	poll_table pt;
 
+	init_poll_funcptr(&pt, NULL);
 	list_for_each_entry_safe(epi, tmp, head, rdllink) {
-		if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		pt._key = epi->event.events;
+		if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 		    epi->event.events)
 			return POLLIN | POLLRDNORM;
 		else {
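Passing a poll_table with ->_key set, instead of NULL, lets the polled file see which events the caller actually cares about. A hypothetical driver-side sketch of how an f_op->poll() implementation could use that hint; the wait queue and the example_* helpers are assumptions for illustration, not kernel APIs:

static wait_queue_head_t example_waitqueue;

static unsigned int example_poll(struct file *file, poll_table *wait)
{
	unsigned int mask = 0;

	poll_wait(file, &example_waitqueue, wait);

	/* Only do the (possibly expensive) readiness checks for events
	 * the caller asked about via the poll_table's _key mask. */
	if (wait && (wait->_key & (POLLIN | POLLRDNORM)) && example_data_ready())
		mask |= POLLIN | POLLRDNORM;
	if (wait && (wait->_key & (POLLOUT | POLLWRNORM)) && example_can_write())
		mask |= POLLOUT | POLLWRNORM;

	return mask;
}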
@@ -842,6 +886,17 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	struct epitem *epi = ep_item_from_wait(wait);
 	struct eventpoll *ep = epi->ep;
 
+	if ((unsigned long)key & POLLFREE) {
+		ep_pwq_from_wait(wait)->whead = NULL;
+		/*
+		 * whead = NULL above can race with ep_remove_wait_queue()
+		 * which can do another remove_wait_queue() after us, so we
+		 * can't use __remove_wait_queue(). whead->lock is held by
+		 * the caller.
+		 */
+		list_del_init(&wait->task_list);
+	}
+
 	spin_lock_irqsave(&ep->lock, flags);
 
 	/*
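The POLLFREE key is how the owner of a wait queue head that is about to disappear (signalfd's per-sighand queue was the motivating case) can force epoll to unhook itself before the memory goes away. A sketch of the waker side under that assumption; example_free_wqh is not a real function:

static void example_free_wqh(wait_queue_head_t *wqh)
{
	/*
	 * Wake every waiter with POLLFREE in "key". ep_poll_callback()
	 * above reacts by clearing pwq->whead and unlinking its wait
	 * entry while wqh->lock is held by the wakeup path.
	 */
	wake_up_poll(wqh, POLLHUP | POLLFREE);

	/* After an RCU grace period the structure embedding *wqh can be
	 * freed; ep_remove_wait_queue() only dereferences pwq->whead
	 * under rcu_read_lock(). */
}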
@@ -960,6 +1015,10 @@ static int path_count[PATH_ARR_SIZE];
 
 static int path_count_inc(int nests)
 {
+	/* Allow an arbitrary number of depth 1 paths */
+	if (nests == 0)
+		return 0;
+
 	if (++path_count[nests] > path_limits[nests])
 		return -1;
 	return 0;
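With the early return for nests == 0, a file watched directly by an epoll instance (a depth-1 wakeup path, the common case) no longer counts against path_limits[]; only longer paths through nested epoll sets remain limited. A userspace illustration of what now goes uncounted (illustrative only):

#include <sys/epoll.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	int pfd[2], i;
	struct epoll_event ev = { .events = EPOLLIN };

	if (pipe(pfd) < 0)
		return 1;

	/* Each epoll instance watching pfd[0] directly is an independent
	 * depth-1 path; this change stops counting such paths. */
	for (i = 0; i < 100; i++) {
		int efd = epoll_create(1);

		if (efd < 0 || epoll_ctl(efd, EPOLL_CTL_ADD, pfd[0], &ev) < 0) {
			perror("epoll");
			return 1;
		}
	}
	return 0;
}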
@@ -1017,13 +1076,11 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
  */
 static int reverse_path_check(void)
 {
-	int length = 0;
 	int error = 0;
 	struct file *current_file;
 
 	/* let's call this for all tfiles */
 	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
-		length++;
 		path_count_init();
 		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
 					reverse_path_check_proc, current_file,
@@ -1065,6 +1122,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	/* Initialize the poll table using the queue callback */
 	epq.epi = epi;
 	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
+	epq.pt._key = event->events;
 
 	/*
 	 * Attach the item to the poll hooks and get current event bits.
@@ -1159,6 +1217,9 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 {
 	int pwake = 0;
 	unsigned int revents;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
 
 	/*
 	 * Set the new event interest mask before calling f_op->poll();
@@ -1166,13 +1227,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	 * f_op->poll() call and the new event set registering.
 	 */
 	epi->event.events = event->events;
+	pt._key = event->events;
 	epi->event.data = event->data; /* protected by mtx */
 
 	/*
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
-	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
 
 	/*
 	 * If the item is "hot" and it is not registered inside the ready
@@ -1207,6 +1269,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 	unsigned int revents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
 
 	/*
 	 * We can loop without lock because we are passed a task private list.
@@ -1219,7 +1284,8 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
 
 		list_del_init(&epi->rdllink);
 
-		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		pt._key = epi->event.events;
+		revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 			epi->event.events;
 
 		/*