author	Jonathan Herman <hermanjl@cs.unc.edu>	2013-01-17 16:15:55 -0500
committer	Jonathan Herman <hermanjl@cs.unc.edu>	2013-01-17 16:15:55 -0500
commit	8dea78da5cee153b8af9c07a2745f6c55057fe12 (patch)
tree	a8f4d49d63b1ecc92f2fddceba0655b2472c5bd9 /fs/eventpoll.c
parent	406089d01562f1e2bf9f089fd7637009ebaad589 (diff)
Patched in Tegra support.
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--	fs/eventpoll.c	479
1 file changed, 52 insertions(+), 427 deletions(-)
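Functionally, the hunks below remove epoll's EPOLLWAKEUP/wakeup_source handling, the reverse-path accounting that bounds wakeup storms, and the /proc fdinfo output, falling back to the older, simpler code paths. As a purely illustrative userspace sketch (not part of this patch, and assuming standard glibc headers), this is how the removed behaviour would normally be requested; on a kernel carrying this change the EPOLLWAKEUP bit is accepted but effectively ignored:

	/*
	 * Illustrative only: register an eventfd with EPOLLWAKEUP, the flag whose
	 * kernel-side handling (EP_PRIVATE_BITS, wakeup_source creation, the
	 * CAP_BLOCK_SUSPEND check) is removed by the diff below.
	 */
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/epoll.h>
	#include <sys/eventfd.h>

	#ifndef EPOLLWAKEUP
	#define EPOLLWAKEUP (1u << 29)	/* value used by kernels that define it */
	#endif

	int main(void)
	{
		int epfd = epoll_create1(0);
		int evfd = eventfd(0, 0);
		struct epoll_event ev = {
			.events = EPOLLIN | EPOLLWAKEUP,	/* no wakeup_source held on a patched kernel */
			.data.fd = evfd,
		};

		if (epfd < 0 || evfd < 0 ||
		    epoll_ctl(epfd, EPOLL_CTL_ADD, evfd, &ev) < 0) {
			perror("epoll setup");
			return EXIT_FAILURE;
		}

		/* epoll_wait(epfd, ...) proceeds as usual from here. */
		close(evfd);
		close(epfd);
		return EXIT_SUCCESS;
	}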
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9fec1836057..2d1744ab5bc 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -33,13 +33,11 @@
33#include <linux/bitops.h> 33#include <linux/bitops.h>
34#include <linux/mutex.h> 34#include <linux/mutex.h>
35#include <linux/anon_inodes.h> 35#include <linux/anon_inodes.h>
36#include <linux/device.h>
37#include <asm/uaccess.h> 36#include <asm/uaccess.h>
37#include <asm/system.h>
38#include <asm/io.h> 38#include <asm/io.h>
39#include <asm/mman.h> 39#include <asm/mman.h>
40#include <linux/atomic.h> 40#include <linux/atomic.h>
41#include <linux/proc_fs.h>
42#include <linux/seq_file.h>
43 41
44/* 42/*
45 * LOCKING: 43 * LOCKING:
@@ -90,7 +88,7 @@
90 */ 88 */
91 89
92/* Epoll private bits inside the event mask */ 90/* Epoll private bits inside the event mask */
93#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET) 91#define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
94 92
95/* Maximum number of nesting allowed inside epoll sets */ 93/* Maximum number of nesting allowed inside epoll sets */
96#define EP_MAX_NESTS 4 94#define EP_MAX_NESTS 4
@@ -157,9 +155,6 @@ struct epitem {
157 /* List header used to link this item to the "struct file" items list */ 155 /* List header used to link this item to the "struct file" items list */
158 struct list_head fllink; 156 struct list_head fllink;
159 157
160 /* wakeup_source used when EPOLLWAKEUP is set */
161 struct wakeup_source *ws;
162
163 /* The structure that describe the interested events and the source fd */ 158 /* The structure that describe the interested events and the source fd */
164 struct epoll_event event; 159 struct epoll_event event;
165}; 160};
@@ -200,17 +195,8 @@ struct eventpoll {
200 */ 195 */
201 struct epitem *ovflist; 196 struct epitem *ovflist;
202 197
203 /* wakeup_source used when ep_scan_ready_list is running */
204 struct wakeup_source *ws;
205
206 /* The user that created the eventpoll descriptor */ 198 /* The user that created the eventpoll descriptor */
207 struct user_struct *user; 199 struct user_struct *user;
208
209 struct file *file;
210
211 /* used to optimize loop detection check */
212 int visited;
213 struct list_head visited_list_link;
214}; 200};
215 201
216/* Wait structure used by the poll hooks */ 202/* Wait structure used by the poll hooks */
@@ -269,15 +255,6 @@ static struct kmem_cache *epi_cache __read_mostly;
269/* Slab cache used to allocate "struct eppoll_entry" */ 255/* Slab cache used to allocate "struct eppoll_entry" */
270static struct kmem_cache *pwq_cache __read_mostly; 256static struct kmem_cache *pwq_cache __read_mostly;
271 257
272/* Visited nodes during ep_loop_check(), so we can unset them when we finish */
273static LIST_HEAD(visited_list);
274
275/*
276 * List of files with newly added links, where we may need to limit the number
277 * of emanating paths. Protected by the epmutex.
278 */
279static LIST_HEAD(tfile_check_list);
280
281#ifdef CONFIG_SYSCTL 258#ifdef CONFIG_SYSCTL
282 259
283#include <linux/sysctl.h> 260#include <linux/sysctl.h>
@@ -299,12 +276,6 @@ ctl_table epoll_table[] = {
299}; 276};
300#endif /* CONFIG_SYSCTL */ 277#endif /* CONFIG_SYSCTL */
301 278
302static const struct file_operations eventpoll_fops;
303
304static inline int is_file_epoll(struct file *f)
305{
306 return f->f_op == &eventpoll_fops;
307}
308 279
309/* Setup the structure that is used as key for the RB tree */ 280/* Setup the structure that is used as key for the RB tree */
310static inline void ep_set_ffd(struct epoll_filefd *ffd, 281static inline void ep_set_ffd(struct epoll_filefd *ffd,
@@ -328,11 +299,6 @@ static inline int ep_is_linked(struct list_head *p)
328 return !list_empty(p); 299 return !list_empty(p);
329} 300}
330 301
331static inline struct eppoll_entry *ep_pwq_from_wait(wait_queue_t *p)
332{
333 return container_of(p, struct eppoll_entry, wait);
334}
335
336/* Get the "struct epitem" from a wait queue pointer */ 302/* Get the "struct epitem" from a wait queue pointer */
337static inline struct epitem *ep_item_from_wait(wait_queue_t *p) 303static inline struct epitem *ep_item_from_wait(wait_queue_t *p)
338{ 304{
@@ -435,31 +401,6 @@ out_unlock:
435 return error; 401 return error;
436} 402}
437 403
438/*
439 * As described in commit 0ccf831cb lockdep: annotate epoll
440 * the use of wait queues used by epoll is done in a very controlled
441 * manner. Wake ups can nest inside each other, but are never done
442 * with the same locking. For example:
443 *
444 * dfd = socket(...);
445 * efd1 = epoll_create();
446 * efd2 = epoll_create();
447 * epoll_ctl(efd1, EPOLL_CTL_ADD, dfd, ...);
448 * epoll_ctl(efd2, EPOLL_CTL_ADD, efd1, ...);
449 *
450 * When a packet arrives to the device underneath "dfd", the net code will
451 * issue a wake_up() on its poll wake list. Epoll (efd1) has installed a
452 * callback wakeup entry on that queue, and the wake_up() performed by the
453 * "dfd" net code will end up in ep_poll_callback(). At this point epoll
454 * (efd1) notices that it may have some event ready, so it needs to wake up
455 * the waiters on its poll wait list (efd2). So it calls ep_poll_safewake()
456 * that ends up in another wake_up(), after having checked about the
457 * recursion constraints. That are, no more than EP_MAX_POLLWAKE_NESTS, to
458 * avoid stack blasting.
459 *
460 * When CONFIG_DEBUG_LOCK_ALLOC is enabled, make sure lockdep can handle
461 * this special case of epoll.
462 */
463#ifdef CONFIG_DEBUG_LOCK_ALLOC 404#ifdef CONFIG_DEBUG_LOCK_ALLOC
464static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, 405static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
465 unsigned long events, int subclass) 406 unsigned long events, int subclass)
@@ -505,18 +446,6 @@ static void ep_poll_safewake(wait_queue_head_t *wq)
505 put_cpu(); 446 put_cpu();
506} 447}
507 448
508static void ep_remove_wait_queue(struct eppoll_entry *pwq)
509{
510 wait_queue_head_t *whead;
511
512 rcu_read_lock();
513 /* If it is cleared by POLLFREE, it should be rcu-safe */
514 whead = rcu_dereference(pwq->whead);
515 if (whead)
516 remove_wait_queue(whead, &pwq->wait);
517 rcu_read_unlock();
518}
519
520/* 449/*
521 * This function unregisters poll callbacks from the associated file 450 * This function unregisters poll callbacks from the associated file
522 * descriptor. Must be called with "mtx" held (or "epmutex" if called from 451 * descriptor. Must be called with "mtx" held (or "epmutex" if called from
@@ -531,7 +460,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
531 pwq = list_first_entry(lsthead, struct eppoll_entry, llink); 460 pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
532 461
533 list_del(&pwq->llink); 462 list_del(&pwq->llink);
534 ep_remove_wait_queue(pwq); 463 remove_wait_queue(pwq->whead, &pwq->wait);
535 kmem_cache_free(pwq_cache, pwq); 464 kmem_cache_free(pwq_cache, pwq);
536 } 465 }
537} 466}
@@ -597,10 +526,8 @@ static int ep_scan_ready_list(struct eventpoll *ep,
597 * queued into ->ovflist but the "txlist" might already 526 * queued into ->ovflist but the "txlist" might already
598 * contain them, and the list_splice() below takes care of them. 527 * contain them, and the list_splice() below takes care of them.
599 */ 528 */
600 if (!ep_is_linked(&epi->rdllink)) { 529 if (!ep_is_linked(&epi->rdllink))
601 list_add_tail(&epi->rdllink, &ep->rdllist); 530 list_add_tail(&epi->rdllink, &ep->rdllist);
602 __pm_stay_awake(epi->ws);
603 }
604 } 531 }
605 /* 532 /*
606 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after 533 * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
@@ -613,7 +540,6 @@ static int ep_scan_ready_list(struct eventpoll *ep,
613 * Quickly re-inject items left on "txlist". 540 * Quickly re-inject items left on "txlist".
614 */ 541 */
615 list_splice(&txlist, &ep->rdllist); 542 list_splice(&txlist, &ep->rdllist);
616 __pm_relax(ep->ws);
617 543
618 if (!list_empty(&ep->rdllist)) { 544 if (!list_empty(&ep->rdllist)) {
619 /* 545 /*
@@ -668,8 +594,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
668 list_del_init(&epi->rdllink); 594 list_del_init(&epi->rdllink);
669 spin_unlock_irqrestore(&ep->lock, flags); 595 spin_unlock_irqrestore(&ep->lock, flags);
670 596
671 wakeup_source_unregister(epi->ws);
672
673 /* At this point it is safe to free the eventpoll item */ 597 /* At this point it is safe to free the eventpoll item */
674 kmem_cache_free(epi_cache, epi); 598 kmem_cache_free(epi_cache, epi);
675 599
@@ -720,7 +644,6 @@ static void ep_free(struct eventpoll *ep)
720 mutex_unlock(&epmutex); 644 mutex_unlock(&epmutex);
721 mutex_destroy(&ep->mtx); 645 mutex_destroy(&ep->mtx);
722 free_uid(ep->user); 646 free_uid(ep->user);
723 wakeup_source_unregister(ep->ws);
724 kfree(ep); 647 kfree(ep);
725} 648}
726 649
@@ -738,12 +661,9 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
738 void *priv) 661 void *priv)
739{ 662{
740 struct epitem *epi, *tmp; 663 struct epitem *epi, *tmp;
741 poll_table pt;
742 664
743 init_poll_funcptr(&pt, NULL);
744 list_for_each_entry_safe(epi, tmp, head, rdllink) { 665 list_for_each_entry_safe(epi, tmp, head, rdllink) {
745 pt._key = epi->event.events; 666 if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
746 if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
747 epi->event.events) 667 epi->event.events)
748 return POLLIN | POLLRDNORM; 668 return POLLIN | POLLRDNORM;
749 else { 669 else {
@@ -752,7 +672,6 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
752 * callback, but it's not actually ready, as far as 672 * callback, but it's not actually ready, as far as
753 * caller requested events goes. We can remove it here. 673 * caller requested events goes. We can remove it here.
754 */ 674 */
755 __pm_relax(epi->ws);
756 list_del_init(&epi->rdllink); 675 list_del_init(&epi->rdllink);
757 } 676 }
758 } 677 }
@@ -785,39 +704,19 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
785 return pollflags != -1 ? pollflags : 0; 704 return pollflags != -1 ? pollflags : 0;
786} 705}
787 706
788#ifdef CONFIG_PROC_FS
789static int ep_show_fdinfo(struct seq_file *m, struct file *f)
790{
791 struct eventpoll *ep = f->private_data;
792 struct rb_node *rbp;
793 int ret = 0;
794
795 mutex_lock(&ep->mtx);
796 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
797 struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
798
799 ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
800 epi->ffd.fd, epi->event.events,
801 (long long)epi->event.data);
802 if (ret)
803 break;
804 }
805 mutex_unlock(&ep->mtx);
806
807 return ret;
808}
809#endif
810
811/* File callbacks that implement the eventpoll file behaviour */ 707/* File callbacks that implement the eventpoll file behaviour */
812static const struct file_operations eventpoll_fops = { 708static const struct file_operations eventpoll_fops = {
813#ifdef CONFIG_PROC_FS
814 .show_fdinfo = ep_show_fdinfo,
815#endif
816 .release = ep_eventpoll_release, 709 .release = ep_eventpoll_release,
817 .poll = ep_eventpoll_poll, 710 .poll = ep_eventpoll_poll,
818 .llseek = noop_llseek, 711 .llseek = noop_llseek,
819}; 712};
820 713
714/* Fast test to see if the file is an evenpoll file */
715static inline int is_file_epoll(struct file *f)
716{
717 return f->f_op == &eventpoll_fops;
718}
719
821/* 720/*
822 * This is called from eventpoll_release() to unlink files from the eventpoll 721 * This is called from eventpoll_release() to unlink files from the eventpoll
823 * interface. We need to have this facility to cleanup correctly files that are 722 * interface. We need to have this facility to cleanup correctly files that are
@@ -928,17 +827,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
928 struct epitem *epi = ep_item_from_wait(wait); 827 struct epitem *epi = ep_item_from_wait(wait);
929 struct eventpoll *ep = epi->ep; 828 struct eventpoll *ep = epi->ep;
930 829
931 if ((unsigned long)key & POLLFREE) {
932 ep_pwq_from_wait(wait)->whead = NULL;
933 /*
934 * whead = NULL above can race with ep_remove_wait_queue()
935 * which can do another remove_wait_queue() after us, so we
936 * can't use __remove_wait_queue(). whead->lock is held by
937 * the caller.
938 */
939 list_del_init(&wait->task_list);
940 }
941
942 spin_lock_irqsave(&ep->lock, flags); 830 spin_lock_irqsave(&ep->lock, flags);
943 831
944 /* 832 /*
@@ -969,23 +857,13 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
969 if (epi->next == EP_UNACTIVE_PTR) { 857 if (epi->next == EP_UNACTIVE_PTR) {
970 epi->next = ep->ovflist; 858 epi->next = ep->ovflist;
971 ep->ovflist = epi; 859 ep->ovflist = epi;
972 if (epi->ws) {
973 /*
974 * Activate ep->ws since epi->ws may get
975 * deactivated at any time.
976 */
977 __pm_stay_awake(ep->ws);
978 }
979
980 } 860 }
981 goto out_unlock; 861 goto out_unlock;
982 } 862 }
983 863
984 /* If this file is already in the ready list we exit soon */ 864 /* If this file is already in the ready list we exit soon */
985 if (!ep_is_linked(&epi->rdllink)) { 865 if (!ep_is_linked(&epi->rdllink))
986 list_add_tail(&epi->rdllink, &ep->rdllist); 866 list_add_tail(&epi->rdllink, &ep->rdllist);
987 __pm_stay_awake(epi->ws);
988 }
989 867
990 /* 868 /*
991 * Wake up ( if active ) both the eventpoll wait list and the ->poll() 869 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
@@ -1048,125 +926,6 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
1048 rb_insert_color(&epi->rbn, &ep->rbr); 926 rb_insert_color(&epi->rbn, &ep->rbr);
1049} 927}
1050 928
1051
1052
1053#define PATH_ARR_SIZE 5
1054/*
1055 * These are the number paths of length 1 to 5, that we are allowing to emanate
1056 * from a single file of interest. For example, we allow 1000 paths of length
1057 * 1, to emanate from each file of interest. This essentially represents the
1058 * potential wakeup paths, which need to be limited in order to avoid massive
1059 * uncontrolled wakeup storms. The common use case should be a single ep which
1060 * is connected to n file sources. In this case each file source has 1 path
1061 * of length 1. Thus, the numbers below should be more than sufficient. These
1062 * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify
1063 * and delete can't add additional paths. Protected by the epmutex.
1064 */
1065static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
1066static int path_count[PATH_ARR_SIZE];
1067
1068static int path_count_inc(int nests)
1069{
1070 /* Allow an arbitrary number of depth 1 paths */
1071 if (nests == 0)
1072 return 0;
1073
1074 if (++path_count[nests] > path_limits[nests])
1075 return -1;
1076 return 0;
1077}
1078
1079static void path_count_init(void)
1080{
1081 int i;
1082
1083 for (i = 0; i < PATH_ARR_SIZE; i++)
1084 path_count[i] = 0;
1085}
1086
1087static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
1088{
1089 int error = 0;
1090 struct file *file = priv;
1091 struct file *child_file;
1092 struct epitem *epi;
1093
1094 list_for_each_entry(epi, &file->f_ep_links, fllink) {
1095 child_file = epi->ep->file;
1096 if (is_file_epoll(child_file)) {
1097 if (list_empty(&child_file->f_ep_links)) {
1098 if (path_count_inc(call_nests)) {
1099 error = -1;
1100 break;
1101 }
1102 } else {
1103 error = ep_call_nested(&poll_loop_ncalls,
1104 EP_MAX_NESTS,
1105 reverse_path_check_proc,
1106 child_file, child_file,
1107 current);
1108 }
1109 if (error != 0)
1110 break;
1111 } else {
1112 printk(KERN_ERR "reverse_path_check_proc: "
1113 "file is not an ep!\n");
1114 }
1115 }
1116 return error;
1117}
1118
1119/**
1120 * reverse_path_check - The tfile_check_list is list of file *, which have
1121 * links that are proposed to be newly added. We need to
1122 * make sure that those added links don't add too many
1123 * paths such that we will spend all our time waking up
1124 * eventpoll objects.
1125 *
1126 * Returns: Returns zero if the proposed links don't create too many paths,
1127 * -1 otherwise.
1128 */
1129static int reverse_path_check(void)
1130{
1131 int error = 0;
1132 struct file *current_file;
1133
1134 /* let's call this for all tfiles */
1135 list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
1136 path_count_init();
1137 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1138 reverse_path_check_proc, current_file,
1139 current_file, current);
1140 if (error)
1141 break;
1142 }
1143 return error;
1144}
1145
1146static int ep_create_wakeup_source(struct epitem *epi)
1147{
1148 const char *name;
1149
1150 if (!epi->ep->ws) {
1151 epi->ep->ws = wakeup_source_register("eventpoll");
1152 if (!epi->ep->ws)
1153 return -ENOMEM;
1154 }
1155
1156 name = epi->ffd.file->f_path.dentry->d_name.name;
1157 epi->ws = wakeup_source_register(name);
1158 if (!epi->ws)
1159 return -ENOMEM;
1160
1161 return 0;
1162}
1163
1164static void ep_destroy_wakeup_source(struct epitem *epi)
1165{
1166 wakeup_source_unregister(epi->ws);
1167 epi->ws = NULL;
1168}
1169
1170/* 929/*
1171 * Must be called with "mtx" held. 930 * Must be called with "mtx" held.
1172 */ 931 */
@@ -1194,18 +953,10 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1194 epi->event = *event; 953 epi->event = *event;
1195 epi->nwait = 0; 954 epi->nwait = 0;
1196 epi->next = EP_UNACTIVE_PTR; 955 epi->next = EP_UNACTIVE_PTR;
1197 if (epi->event.events & EPOLLWAKEUP) {
1198 error = ep_create_wakeup_source(epi);
1199 if (error)
1200 goto error_create_wakeup_source;
1201 } else {
1202 epi->ws = NULL;
1203 }
1204 956
1205 /* Initialize the poll table using the queue callback */ 957 /* Initialize the poll table using the queue callback */
1206 epq.epi = epi; 958 epq.epi = epi;
1207 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc); 959 init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
1208 epq.pt._key = event->events;
1209 960
1210 /* 961 /*
1211 * Attach the item to the poll hooks and get current event bits. 962 * Attach the item to the poll hooks and get current event bits.
@@ -1236,18 +987,12 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1236 */ 987 */
1237 ep_rbtree_insert(ep, epi); 988 ep_rbtree_insert(ep, epi);
1238 989
1239 /* now check if we've created too many backpaths */
1240 error = -EINVAL;
1241 if (reverse_path_check())
1242 goto error_remove_epi;
1243
1244 /* We have to drop the new item inside our item list to keep track of it */ 990 /* We have to drop the new item inside our item list to keep track of it */
1245 spin_lock_irqsave(&ep->lock, flags); 991 spin_lock_irqsave(&ep->lock, flags);
1246 992
1247 /* If the file is already "ready" we drop it inside the ready list */ 993 /* If the file is already "ready" we drop it inside the ready list */
1248 if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { 994 if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) {
1249 list_add_tail(&epi->rdllink, &ep->rdllist); 995 list_add_tail(&epi->rdllink, &ep->rdllist);
1250 __pm_stay_awake(epi->ws);
1251 996
1252 /* Notify waiting tasks that events are available */ 997 /* Notify waiting tasks that events are available */
1253 if (waitqueue_active(&ep->wq)) 998 if (waitqueue_active(&ep->wq))
@@ -1266,14 +1011,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
1266 1011
1267 return 0; 1012 return 0;
1268 1013
1269error_remove_epi:
1270 spin_lock(&tfile->f_lock);
1271 if (ep_is_linked(&epi->fllink))
1272 list_del_init(&epi->fllink);
1273 spin_unlock(&tfile->f_lock);
1274
1275 rb_erase(&epi->rbn, &ep->rbr);
1276
1277error_unregister: 1014error_unregister:
1278 ep_unregister_pollwait(ep, epi); 1015 ep_unregister_pollwait(ep, epi);
1279 1016
@@ -1288,9 +1025,6 @@ error_unregister:
1288 list_del_init(&epi->rdllink); 1025 list_del_init(&epi->rdllink);
1289 spin_unlock_irqrestore(&ep->lock, flags); 1026 spin_unlock_irqrestore(&ep->lock, flags);
1290 1027
1291 wakeup_source_unregister(epi->ws);
1292
1293error_create_wakeup_source:
1294 kmem_cache_free(epi_cache, epi); 1028 kmem_cache_free(epi_cache, epi);
1295 1029
1296 return error; 1030 return error;
@@ -1304,50 +1038,20 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
1304{ 1038{
1305 int pwake = 0; 1039 int pwake = 0;
1306 unsigned int revents; 1040 unsigned int revents;
1307 poll_table pt;
1308
1309 init_poll_funcptr(&pt, NULL);
1310 1041
1311 /* 1042 /*
1312 * Set the new event interest mask before calling f_op->poll(); 1043 * Set the new event interest mask before calling f_op->poll();
1313 * otherwise we might miss an event that happens between the 1044 * otherwise we might miss an event that happens between the
1314 * f_op->poll() call and the new event set registering. 1045 * f_op->poll() call and the new event set registering.
1315 */ 1046 */
1316 epi->event.events = event->events; /* need barrier below */ 1047 epi->event.events = event->events;
1317 pt._key = event->events;
1318 epi->event.data = event->data; /* protected by mtx */ 1048 epi->event.data = event->data; /* protected by mtx */
1319 if (epi->event.events & EPOLLWAKEUP) {
1320 if (!epi->ws)
1321 ep_create_wakeup_source(epi);
1322 } else if (epi->ws) {
1323 ep_destroy_wakeup_source(epi);
1324 }
1325
1326 /*
1327 * The following barrier has two effects:
1328 *
1329 * 1) Flush epi changes above to other CPUs. This ensures
1330 * we do not miss events from ep_poll_callback if an
1331 * event occurs immediately after we call f_op->poll().
1332 * We need this because we did not take ep->lock while
1333 * changing epi above (but ep_poll_callback does take
1334 * ep->lock).
1335 *
1336 * 2) We also need to ensure we do not miss _past_ events
1337 * when calling f_op->poll(). This barrier also
1338 * pairs with the barrier in wq_has_sleeper (see
1339 * comments for wq_has_sleeper).
1340 *
1341 * This barrier will now guarantee ep_poll_callback or f_op->poll
1342 * (or both) will notice the readiness of an item.
1343 */
1344 smp_mb();
1345 1049
1346 /* 1050 /*
1347 * Get current event bits. We can safely use the file* here because 1051 * Get current event bits. We can safely use the file* here because
1348 * its usage count has been increased by the caller of this function. 1052 * its usage count has been increased by the caller of this function.
1349 */ 1053 */
1350 revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt); 1054 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
1351 1055
1352 /* 1056 /*
1353 * If the item is "hot" and it is not registered inside the ready 1057 * If the item is "hot" and it is not registered inside the ready
@@ -1357,7 +1061,6 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
1357 spin_lock_irq(&ep->lock); 1061 spin_lock_irq(&ep->lock);
1358 if (!ep_is_linked(&epi->rdllink)) { 1062 if (!ep_is_linked(&epi->rdllink)) {
1359 list_add_tail(&epi->rdllink, &ep->rdllist); 1063 list_add_tail(&epi->rdllink, &ep->rdllist);
1360 __pm_stay_awake(epi->ws);
1361 1064
1362 /* Notify waiting tasks that events are available */ 1065 /* Notify waiting tasks that events are available */
1363 if (waitqueue_active(&ep->wq)) 1066 if (waitqueue_active(&ep->wq))
@@ -1383,9 +1086,6 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1383 unsigned int revents; 1086 unsigned int revents;
1384 struct epitem *epi; 1087 struct epitem *epi;
1385 struct epoll_event __user *uevent; 1088 struct epoll_event __user *uevent;
1386 poll_table pt;
1387
1388 init_poll_funcptr(&pt, NULL);
1389 1089
1390 /* 1090 /*
1391 * We can loop without lock because we are passed a task private list. 1091 * We can loop without lock because we are passed a task private list.
@@ -1396,22 +1096,9 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1396 !list_empty(head) && eventcnt < esed->maxevents;) { 1096 !list_empty(head) && eventcnt < esed->maxevents;) {
1397 epi = list_first_entry(head, struct epitem, rdllink); 1097 epi = list_first_entry(head, struct epitem, rdllink);
1398 1098
1399 /*
1400 * Activate ep->ws before deactivating epi->ws to prevent
1401 * triggering auto-suspend here (in case we reactive epi->ws
1402 * below).
1403 *
1404 * This could be rearranged to delay the deactivation of epi->ws
1405 * instead, but then epi->ws would temporarily be out of sync
1406 * with ep_is_linked().
1407 */
1408 if (epi->ws && epi->ws->active)
1409 __pm_stay_awake(ep->ws);
1410 __pm_relax(epi->ws);
1411 list_del_init(&epi->rdllink); 1099 list_del_init(&epi->rdllink);
1412 1100
1413 pt._key = epi->event.events; 1101 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
1414 revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
1415 epi->event.events; 1102 epi->event.events;
1416 1103
1417 /* 1104 /*
@@ -1424,7 +1111,6 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1424 if (__put_user(revents, &uevent->events) || 1111 if (__put_user(revents, &uevent->events) ||
1425 __put_user(epi->event.data, &uevent->data)) { 1112 __put_user(epi->event.data, &uevent->data)) {
1426 list_add(&epi->rdllink, head); 1113 list_add(&epi->rdllink, head);
1427 __pm_stay_awake(epi->ws);
1428 return eventcnt ? eventcnt : -EFAULT; 1114 return eventcnt ? eventcnt : -EFAULT;
1429 } 1115 }
1430 eventcnt++; 1116 eventcnt++;
@@ -1444,7 +1130,6 @@ static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
1444 * poll callback will queue them in ep->ovflist. 1130 * poll callback will queue them in ep->ovflist.
1445 */ 1131 */
1446 list_add_tail(&epi->rdllink, &ep->rdllist); 1132 list_add_tail(&epi->rdllink, &ep->rdllist);
1447 __pm_stay_awake(epi->ws);
1448 } 1133 }
1449 } 1134 }
1450 } 1135 }
@@ -1590,36 +1275,18 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1590 int error = 0; 1275 int error = 0;
1591 struct file *file = priv; 1276 struct file *file = priv;
1592 struct eventpoll *ep = file->private_data; 1277 struct eventpoll *ep = file->private_data;
1593 struct eventpoll *ep_tovisit;
1594 struct rb_node *rbp; 1278 struct rb_node *rbp;
1595 struct epitem *epi; 1279 struct epitem *epi;
1596 1280
1597 mutex_lock_nested(&ep->mtx, call_nests + 1); 1281 mutex_lock_nested(&ep->mtx, call_nests + 1);
1598 ep->visited = 1;
1599 list_add(&ep->visited_list_link, &visited_list);
1600 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { 1282 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1601 epi = rb_entry(rbp, struct epitem, rbn); 1283 epi = rb_entry(rbp, struct epitem, rbn);
1602 if (unlikely(is_file_epoll(epi->ffd.file))) { 1284 if (unlikely(is_file_epoll(epi->ffd.file))) {
1603 ep_tovisit = epi->ffd.file->private_data;
1604 if (ep_tovisit->visited)
1605 continue;
1606 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, 1285 error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1607 ep_loop_check_proc, epi->ffd.file, 1286 ep_loop_check_proc, epi->ffd.file,
1608 ep_tovisit, current); 1287 epi->ffd.file->private_data, current);
1609 if (error != 0) 1288 if (error != 0)
1610 break; 1289 break;
1611 } else {
1612 /*
1613 * If we've reached a file that is not associated with
1614 * an ep, then we need to check if the newly added
1615 * links are going to add too many wakeup paths. We do
1616 * this by adding it to the tfile_check_list, if it's
1617 * not already there, and calling reverse_path_check()
1618 * during ep_insert().
1619 */
1620 if (list_empty(&epi->ffd.file->f_tfile_llink))
1621 list_add(&epi->ffd.file->f_tfile_llink,
1622 &tfile_check_list);
1623 } 1290 }
1624 } 1291 }
1625 mutex_unlock(&ep->mtx); 1292 mutex_unlock(&ep->mtx);
@@ -1640,31 +1307,8 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1640 */ 1307 */
1641static int ep_loop_check(struct eventpoll *ep, struct file *file) 1308static int ep_loop_check(struct eventpoll *ep, struct file *file)
1642{ 1309{
1643 int ret; 1310 return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1644 struct eventpoll *ep_cur, *ep_next;
1645
1646 ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
1647 ep_loop_check_proc, file, ep, current); 1311 ep_loop_check_proc, file, ep, current);
1648 /* clear visited list */
1649 list_for_each_entry_safe(ep_cur, ep_next, &visited_list,
1650 visited_list_link) {
1651 ep_cur->visited = 0;
1652 list_del(&ep_cur->visited_list_link);
1653 }
1654 return ret;
1655}
1656
1657static void clear_tfile_check_list(void)
1658{
1659 struct file *file;
1660
1661 /* first clear the tfile_check_list */
1662 while (!list_empty(&tfile_check_list)) {
1663 file = list_first_entry(&tfile_check_list, struct file,
1664 f_tfile_llink);
1665 list_del_init(&file->f_tfile_llink);
1666 }
1667 INIT_LIST_HEAD(&tfile_check_list);
1668} 1312}
1669 1313
1670/* 1314/*
@@ -1672,9 +1316,8 @@ static void clear_tfile_check_list(void)
1672 */ 1316 */
1673SYSCALL_DEFINE1(epoll_create1, int, flags) 1317SYSCALL_DEFINE1(epoll_create1, int, flags)
1674{ 1318{
1675 int error, fd; 1319 int error;
1676 struct eventpoll *ep = NULL; 1320 struct eventpoll *ep = NULL;
1677 struct file *file;
1678 1321
1679 /* Check the EPOLL_* constant for consistency. */ 1322 /* Check the EPOLL_* constant for consistency. */
1680 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); 1323 BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
@@ -1691,25 +1334,11 @@ SYSCALL_DEFINE1(epoll_create1, int, flags)
1691 * Creates all the items needed to setup an eventpoll file. That is, 1334 * Creates all the items needed to setup an eventpoll file. That is,
1692 * a file structure and a free file descriptor. 1335 * a file structure and a free file descriptor.
1693 */ 1336 */
1694 fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); 1337 error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
1695 if (fd < 0) {
1696 error = fd;
1697 goto out_free_ep;
1698 }
1699 file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep,
1700 O_RDWR | (flags & O_CLOEXEC)); 1338 O_RDWR | (flags & O_CLOEXEC));
1701 if (IS_ERR(file)) { 1339 if (error < 0)
1702 error = PTR_ERR(file); 1340 ep_free(ep);
1703 goto out_free_fd; 1341
1704 }
1705 ep->file = file;
1706 fd_install(fd, file);
1707 return fd;
1708
1709out_free_fd:
1710 put_unused_fd(fd);
1711out_free_ep:
1712 ep_free(ep);
1713 return error; 1342 return error;
1714} 1343}
1715 1344
@@ -1757,10 +1386,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1757 if (!tfile->f_op || !tfile->f_op->poll) 1386 if (!tfile->f_op || !tfile->f_op->poll)
1758 goto error_tgt_fput; 1387 goto error_tgt_fput;
1759 1388
1760 /* Check if EPOLLWAKEUP is allowed */
1761 if ((epds.events & EPOLLWAKEUP) && !capable(CAP_BLOCK_SUSPEND))
1762 epds.events &= ~EPOLLWAKEUP;
1763
1764 /* 1389 /*
1765 * We have to check that the file structure underneath the file descriptor 1390 * We have to check that the file structure underneath the file descriptor
1766 * the user passed to us _is_ an eventpoll file. And also we do not permit 1391 * the user passed to us _is_ an eventpoll file. And also we do not permit
@@ -1779,29 +1404,21 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1779 /* 1404 /*
1780 * When we insert an epoll file descriptor, inside another epoll file 1405 * When we insert an epoll file descriptor, inside another epoll file
1781 * descriptor, there is the change of creating closed loops, which are 1406 * descriptor, there is the change of creating closed loops, which are
1782 * better be handled here, than in more critical paths. While we are 1407 * better be handled here, than in more critical paths.
1783 * checking for loops we also determine the list of files reachable
1784 * and hang them on the tfile_check_list, so we can check that we
1785 * haven't created too many possible wakeup paths.
1786 * 1408 *
1787 * We need to hold the epmutex across both ep_insert and ep_remove 1409 * We hold epmutex across the loop check and the insert in this case, in
1788 * b/c we want to make sure we are looking at a coherent view of 1410 * order to prevent two separate inserts from racing and each doing the
1789 * epoll network. 1411 * insert "at the same time" such that ep_loop_check passes on both
1412 * before either one does the insert, thereby creating a cycle.
1790 */ 1413 */
1791 if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { 1414 if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) {
1792 mutex_lock(&epmutex); 1415 mutex_lock(&epmutex);
1793 did_lock_epmutex = 1; 1416 did_lock_epmutex = 1;
1417 error = -ELOOP;
1418 if (ep_loop_check(ep, tfile) != 0)
1419 goto error_tgt_fput;
1794 } 1420 }
1795 if (op == EPOLL_CTL_ADD) { 1421
1796 if (is_file_epoll(tfile)) {
1797 error = -ELOOP;
1798 if (ep_loop_check(ep, tfile) != 0) {
1799 clear_tfile_check_list();
1800 goto error_tgt_fput;
1801 }
1802 } else
1803 list_add(&tfile->f_tfile_llink, &tfile_check_list);
1804 }
1805 1422
1806 mutex_lock_nested(&ep->mtx, 0); 1423 mutex_lock_nested(&ep->mtx, 0);
1807 1424
@@ -1820,7 +1437,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1820 error = ep_insert(ep, &epds, tfile, fd); 1437 error = ep_insert(ep, &epds, tfile, fd);
1821 } else 1438 } else
1822 error = -EEXIST; 1439 error = -EEXIST;
1823 clear_tfile_check_list();
1824 break; 1440 break;
1825 case EPOLL_CTL_DEL: 1441 case EPOLL_CTL_DEL:
1826 if (epi) 1442 if (epi)
@@ -1839,7 +1455,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1839 mutex_unlock(&ep->mtx); 1455 mutex_unlock(&ep->mtx);
1840 1456
1841error_tgt_fput: 1457error_tgt_fput:
1842 if (did_lock_epmutex) 1458 if (unlikely(did_lock_epmutex))
1843 mutex_unlock(&epmutex); 1459 mutex_unlock(&epmutex);
1844 1460
1845 fput(tfile); 1461 fput(tfile);
@@ -1858,7 +1474,7 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1858 int, maxevents, int, timeout) 1474 int, maxevents, int, timeout)
1859{ 1475{
1860 int error; 1476 int error;
1861 struct fd f; 1477 struct file *file;
1862 struct eventpoll *ep; 1478 struct eventpoll *ep;
1863 1479
1864 /* The maximum number of event must be greater than zero */ 1480 /* The maximum number of event must be greater than zero */
@@ -1866,36 +1482,43 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
1866 return -EINVAL; 1482 return -EINVAL;
1867 1483
1868 /* Verify that the area passed by the user is writeable */ 1484 /* Verify that the area passed by the user is writeable */
1869 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) 1485 if (!access_ok(VERIFY_WRITE, events, maxevents * sizeof(struct epoll_event))) {
1870 return -EFAULT; 1486 error = -EFAULT;
1487 goto error_return;
1488 }
1871 1489
1872 /* Get the "struct file *" for the eventpoll file */ 1490 /* Get the "struct file *" for the eventpoll file */
1873 f = fdget(epfd); 1491 error = -EBADF;
1874 if (!f.file) 1492 file = fget(epfd);
1875 return -EBADF; 1493 if (!file)
1494 goto error_return;
1876 1495
1877 /* 1496 /*
1878 * We have to check that the file structure underneath the fd 1497 * We have to check that the file structure underneath the fd
1879 * the user passed to us _is_ an eventpoll file. 1498 * the user passed to us _is_ an eventpoll file.
1880 */ 1499 */
1881 error = -EINVAL; 1500 error = -EINVAL;
1882 if (!is_file_epoll(f.file)) 1501 if (!is_file_epoll(file))
1883 goto error_fput; 1502 goto error_fput;
1884 1503
1885 /* 1504 /*
1886 * At this point it is safe to assume that the "private_data" contains 1505 * At this point it is safe to assume that the "private_data" contains
1887 * our own data structure. 1506 * our own data structure.
1888 */ 1507 */
1889 ep = f.file->private_data; 1508 ep = file->private_data;
1890 1509
1891 /* Time to fish for events ... */ 1510 /* Time to fish for events ... */
1892 error = ep_poll(ep, events, maxevents, timeout); 1511 error = ep_poll(ep, events, maxevents, timeout);
1893 1512
1894error_fput: 1513error_fput:
1895 fdput(f); 1514 fput(file);
1515error_return:
1516
1896 return error; 1517 return error;
1897} 1518}
1898 1519
1520#ifdef HAVE_SET_RESTORE_SIGMASK
1521
1899/* 1522/*
1900 * Implement the event wait interface for the eventpoll file. It is the kernel 1523 * Implement the event wait interface for the eventpoll file. It is the kernel
1901 * part of the user space epoll_pwait(2). 1524 * part of the user space epoll_pwait(2).
@@ -1940,6 +1563,8 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
1940 return error; 1563 return error;
1941} 1564}
1942 1565
1566#endif /* HAVE_SET_RESTORE_SIGMASK */
1567
1943static int __init eventpoll_init(void) 1568static int __init eventpoll_init(void)
1944{ 1569{
1945 struct sysinfo si; 1570 struct sysinfo si;