author:    Davide Libenzi <davidel@xmailserver.org>  2007-05-15 04:40:41 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org>  2007-05-15 11:53:59 -0400
commit:    d47de16c7221968d3eab899d7540efa5ba77af5a (patch)
tree:      289c3dc8e4b3121a9a4b1846ae9acbd355b4b541 /fs
parent:    faa8b6c3c2e1454175609167a25ae525d075f045 (diff)
fix epoll single pass code and add wait-exclusive flag
Fixes the epoll single pass code. During the unlocked event delivery (to
userspace) pass, the poll callback can re-issue new events, and we must
receive them correctly. Since we loop in a lockless fashion and want the
pass to stay O(nready), and since we do not want to bounce the spinlock
on and off for every event, the poll callback now uses a secondary list
to queue events while we are inside the event delivery loop. The
rw_semaphore has been turned into a mutex. This patch also adds the
wait-exclusive flag, as suggested by Davi Arnaut.
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
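
The two-list scheme described above can be modeled in user space. The sketch below uses hypothetical names (post_event, deliver_events, INACTIVE) and a pthread mutex standing in for the kernel's ep->lock; it illustrates the pattern, not the kernel code itself:

```c
/* User-space model of the patch's delivery scheme: the consumer steals
 * the ready list and drains it without holding the lock; producers that
 * fire meanwhile divert their items onto a side (overflow) list, which
 * is re-injected afterwards. Hypothetical names throughout. */
#include <pthread.h>
#include <stddef.h>

#define INACTIVE ((struct item *)-1L)   /* mirrors EP_UNACTIVE_PTR */

struct item { struct item *next; int event; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER; /* models ep->lock */
static struct item *ready;                               /* models ep->rdllist */
static struct item *ovflist = INACTIVE;                  /* models ep->ovflist */

/* Producer side, models ep_poll_callback(). The kernel additionally
 * de-duplicates items (ep_is_linked()/EP_UNACTIVE_PTR checks); that is
 * omitted here for brevity. */
void post_event(struct item *it)
{
        pthread_mutex_lock(&lock);
        if (ovflist != INACTIVE) {      /* a delivery pass is running  */
                it->next = ovflist;     /* divert to the overflow list */
                ovflist = it;
        } else {
                it->next = ready;       /* normal path: straight to ready */
                ready = it;
        }
        pthread_mutex_unlock(&lock);
}

/* Consumer side, models ep_send_events(). */
void deliver_events(void (*to_user)(int))
{
        struct item *txlist, *it;

        pthread_mutex_lock(&lock);
        txlist = ready;                 /* steal the ready list        */
        ready = NULL;
        ovflist = NULL;                 /* activate the overflow list  */
        pthread_mutex_unlock(&lock);

        for (it = txlist; it; it = it->next)
                to_user(it->event);     /* may sleep; no lock held     */

        pthread_mutex_lock(&lock);
        while (ovflist) {               /* re-inject diverted events   */
                it = ovflist;
                ovflist = it->next;
                it->next = ready;
                ready = it;
        }
        ovflist = INACTIVE;             /* deactivate the overflow list */
        pthread_mutex_unlock(&lock);
}
```

The delivery loop itself stays O(nready) because the spinlock is taken only twice per pass, not once per event.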
Diffstat (limited to 'fs')
 -rw-r--r--  fs/eventpoll.c | 322
 1 file changed, 166 insertions(+), 156 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1aad34ea61a4..1dbedc71a28c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -26,7 +26,6 @@
 #include <linux/hash.h>
 #include <linux/spinlock.h>
 #include <linux/syscalls.h>
-#include <linux/rwsem.h>
 #include <linux/rbtree.h>
 #include <linux/wait.h>
 #include <linux/eventpoll.h>
@@ -39,14 +38,13 @@
 #include <asm/io.h>
 #include <asm/mman.h>
 #include <asm/atomic.h>
-#include <asm/semaphore.h>
 
 /*
  * LOCKING:
  * There are three level of locking required by epoll :
  *
  * 1) epmutex (mutex)
- * 2) ep->sem (rw_semaphore)
+ * 2) ep->mtx (mutex)
  * 3) ep->lock (rw_lock)
  *
  * The acquire order is the one listed above, from 1 to 3.
@@ -57,20 +55,20 @@
  * a spinlock. During the event transfer loop (from kernel to
  * user space) we could end up sleeping due a copy_to_user(), so
  * we need a lock that will allow us to sleep. This lock is a
- * read-write semaphore (ep->sem). It is acquired on read during
- * the event transfer loop and in write during epoll_ctl(EPOLL_CTL_DEL)
- * and during eventpoll_release_file(). Then we also need a global
- * semaphore to serialize eventpoll_release_file() and ep_free().
- * This semaphore is acquired by ep_free() during the epoll file
+ * mutex (ep->mtx). It is acquired during the event transfer loop,
+ * during epoll_ctl(EPOLL_CTL_DEL) and during eventpoll_release_file().
+ * Then we also need a global mutex to serialize eventpoll_release_file()
+ * and ep_free().
+ * This mutex is acquired by ep_free() during the epoll file
  * cleanup path and it is also acquired by eventpoll_release_file()
  * if a file has been pushed inside an epoll set and it is then
  * close()d without a previous call toepoll_ctl(EPOLL_CTL_DEL).
- * It is possible to drop the "ep->sem" and to use the global
- * semaphore "epmutex" (together with "ep->lock") to have it working,
- * but having "ep->sem" will make the interface more scalable.
+ * It is possible to drop the "ep->mtx" and to use the global
+ * mutex "epmutex" (together with "ep->lock") to have it working,
+ * but having "ep->mtx" will make the interface more scalable.
  * Events that require holding "epmutex" are very rare, while for
- * normal operations the epoll private "ep->sem" will guarantee
- * a greater scalability.
+ * normal operations the epoll private "ep->mtx" will guarantee
+ * a better scalability.
  */
 
 #define DEBUG_EPOLL 0
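
The comment above fixes a single global acquisition order. A minimal user-space illustration of that ordering (hypothetical names; a plain mutex stands in for the rw_lock, which in the kernel is IRQ-safe):

```c
/* The documented epoll lock ordering: every path that needs more than
 * one of these locks takes them in this order, so no lock-order cycle
 * (and hence no deadlock) can form. User-space sketch, not kernel code. */
#include <pthread.h>

static pthread_mutex_t epmutex = PTHREAD_MUTEX_INITIALIZER; /* 1) global, rare   */
static pthread_mutex_t ep_mtx  = PTHREAD_MUTEX_INITIALIZER; /* 2) per-instance   */
static pthread_mutex_t ep_lock = PTHREAD_MUTEX_INITIALIZER; /* 3) models rw_lock */

void ordered_path(void)
{
        pthread_mutex_lock(&epmutex);   /* ep_free() / eventpoll_release_file() */
        pthread_mutex_lock(&ep_mtx);    /* event transfer loop, ctl operations  */
        pthread_mutex_lock(&ep_lock);   /* short critical sections              */
        /* ... work ... */
        pthread_mutex_unlock(&ep_lock);
        pthread_mutex_unlock(&ep_mtx);
        pthread_mutex_unlock(&epmutex);
}
```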
@@ -102,6 +100,8 @@
 
 #define EP_MAX_EVENTS (INT_MAX / sizeof(struct epoll_event))
 
+#define EP_UNACTIVE_PTR ((void *) -1L)
+
 struct epoll_filefd {
         struct file *file;
         int fd;
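
EP_UNACTIVE_PTR is a sentinel distinct from both NULL and any real kernel address, which lets a single pointer field encode three states. A sketch of the idea (hypothetical helper name):

```c
/* Three states of a chaining pointer such as epi->next, using
 * ((void *)-1L) as a sentinel that no real object can alias. */
#include <stddef.h>

#define EP_UNACTIVE_PTR ((void *)-1L)

struct node { struct node *next; };

/* Hypothetical classifier, mirroring how the patch reads epi->next. */
int chain_state(const struct node *p)
{
        if ((const void *)p == EP_UNACTIVE_PTR)
                return 0;       /* not chained on the overflow list   */
        if (p == NULL)
                return 1;       /* chained, last element of the chain */
        return 2;               /* chained, more elements follow      */
}
```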
@@ -111,7 +111,7 @@ struct epoll_filefd {
  * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
  * It is used to keep track on all tasks that are currently inside the wake_up() code
  * to 1) short-circuit the one coming from the same task and same wait queue head
- * ( loop ) 2) allow a maximum number of epoll descriptors inclusion nesting
+ * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
  * 3) let go the ones coming from other tasks.
  */
 struct wake_task_node {
@@ -130,6 +130,48 @@ struct poll_safewake {
 };
 
 /*
+ * Each file descriptor added to the eventpoll interface will
+ * have an entry of this type linked to the "rbr" RB tree.
+ */
+struct epitem {
+        /* RB-Tree node used to link this structure to the eventpoll rb-tree */
+        struct rb_node rbn;
+
+        /* List header used to link this structure to the eventpoll ready list */
+        struct list_head rdllink;
+
+        /* The file descriptor information this item refers to */
+        struct epoll_filefd ffd;
+
+        /* Number of active wait queue attached to poll operations */
+        int nwait;
+
+        /* List containing poll wait queues */
+        struct list_head pwqlist;
+
+        /* The "container" of this item */
+        struct eventpoll *ep;
+
+        /* The structure that describe the interested events and the source fd */
+        struct epoll_event event;
+
+        /*
+         * Used to keep track of the usage count of the structure. This avoids
+         * that the structure will desappear from underneath our processing.
+         */
+        atomic_t usecnt;
+
+        /* List header used to link this item to the "struct file" items list */
+        struct list_head fllink;
+
+        /*
+         * Works together "struct eventpoll"->ovflist in keeping the
+         * single linked chain of items.
+         */
+        struct epitem *next;
+};
+
+/*
  * This structure is stored inside the "private_data" member of the file
  * structure and rapresent the main data sructure for the eventpoll
  * interface.
@@ -139,12 +181,12 @@ struct eventpoll {
         rwlock_t lock;
 
         /*
-         * This semaphore is used to ensure that files are not removed
-         * while epoll is using them. This is read-held during the event
-         * collection loop and it is write-held during the file cleanup
-         * path, the epoll file exit code and the ctl operations.
+         * This mutex is used to ensure that files are not removed
+         * while epoll is using them. This is held during the event
+         * collection loop, the file cleanup path, the epoll file exit
+         * code and the ctl operations.
          */
-        struct rw_semaphore sem;
+        struct mutex mtx;
 
         /* Wait queue used by sys_epoll_wait() */
         wait_queue_head_t wq;
@@ -157,6 +199,13 @@ struct eventpoll {
 
         /* RB-Tree root used to store monitored fd structs */
         struct rb_root rbr;
+
+        /*
+         * This is a single linked list that chains all the "struct epitem" that
+         * happened while transfering ready events to userspace w/out
+         * holding ->lock.
+         */
+        struct epitem *ovflist;
 };
 
 /* Wait structure used by the poll hooks */
@@ -177,42 +226,6 @@ struct eppoll_entry {
         wait_queue_head_t *whead;
 };
 
-/*
- * Each file descriptor added to the eventpoll interface will
- * have an entry of this type linked to the "rbr" RB tree.
- */
-struct epitem {
-        /* RB-Tree node used to link this structure to the eventpoll rb-tree */
-        struct rb_node rbn;
-
-        /* List header used to link this structure to the eventpoll ready list */
-        struct list_head rdllink;
-
-        /* The file descriptor information this item refers to */
-        struct epoll_filefd ffd;
-
-        /* Number of active wait queue attached to poll operations */
-        int nwait;
-
-        /* List containing poll wait queues */
-        struct list_head pwqlist;
-
-        /* The "container" of this item */
-        struct eventpoll *ep;
-
-        /* The structure that describe the interested events and the source fd */
-        struct epoll_event event;
-
-        /*
-         * Used to keep track of the usage count of the structure. This avoids
-         * that the structure will desappear from underneath our processing.
-         */
-        atomic_t usecnt;
-
-        /* List header used to link this item to the "struct file" items list */
-        struct list_head fllink;
-};
-
 /* Wrapper struct used by poll queueing */
 struct ep_pqueue {
         poll_table pt;
@@ -220,7 +233,7 @@ struct ep_pqueue {
 };
 
 /*
- * This semaphore is used to serialize ep_free() and eventpoll_release_file().
+ * This mutex is used to serialize ep_free() and eventpoll_release_file().
  */
 static struct mutex epmutex;
 
@@ -506,7 +519,7 @@ static void ep_free(struct eventpoll *ep)
         /*
          * We need to lock this because we could be hit by
          * eventpoll_release_file() while we're freeing the "struct eventpoll".
-         * We do not need to hold "ep->sem" here because the epoll file
+         * We do not need to hold "ep->mtx" here because the epoll file
          * is on the way to be removed and no one has references to it
          * anymore. The only hit might come from eventpoll_release_file() but
          * holding "epmutex" is sufficent here.
@@ -525,7 +538,7 @@ static void ep_free(struct eventpoll *ep)
         /*
          * Walks through the whole tree by freeing each "struct epitem". At this
          * point we are sure no poll callbacks will be lingering around, and also by
-         * write-holding "sem" we can be sure that no file cleanup code will hit
+         * holding "epmutex" we can be sure that no file cleanup code will hit
          * us during this operation. So we can avoid the lock on "ep->lock".
          */
         while ((rbp = rb_first(&ep->rbr)) != 0) {
@@ -534,6 +547,8 @@ static void ep_free(struct eventpoll *ep)
         }
 
         mutex_unlock(&epmutex);
+
+        mutex_destroy(&ep->mtx);
 }
 
 static int ep_eventpoll_release(struct inode *inode, struct file *file)
@@ -594,9 +609,9 @@ void eventpoll_release_file(struct file *file)
          * We don't want to get "file->f_ep_lock" because it is not
          * necessary. It is not necessary because we're in the "struct file"
          * cleanup path, and this means that noone is using this file anymore.
-         * The only hit might come from ep_free() but by holding the semaphore
+         * The only hit might come from ep_free() but by holding the mutex
          * will correctly serialize the operation. We do need to acquire
-         * "ep->sem" after "epmutex" because ep_remove() requires it when called
+         * "ep->mtx" after "epmutex" because ep_remove() requires it when called
          * from anywhere but ep_free().
          */
         mutex_lock(&epmutex);
@@ -606,9 +621,9 @@ void eventpoll_release_file(struct file *file)
 
                 ep = epi->ep;
                 list_del_init(&epi->fllink);
-                down_write(&ep->sem);
+                mutex_lock(&ep->mtx);
                 ep_remove(ep, epi);
-                up_write(&ep->sem);
+                mutex_unlock(&ep->mtx);
         }
 
         mutex_unlock(&epmutex);
@@ -622,11 +637,12 @@ static int ep_alloc(struct eventpoll **pep)
                 return -ENOMEM;
 
         rwlock_init(&ep->lock);
-        init_rwsem(&ep->sem);
+        mutex_init(&ep->mtx);
         init_waitqueue_head(&ep->wq);
         init_waitqueue_head(&ep->poll_wait);
         INIT_LIST_HEAD(&ep->rdllist);
         ep->rbr = RB_ROOT;
+        ep->ovflist = EP_UNACTIVE_PTR;
 
         *pep = ep;
 
@@ -695,7 +711,21 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key)
          * until the next EPOLL_CTL_MOD will be issued.
          */
         if (!(epi->event.events & ~EP_PRIVATE_BITS))
-                goto is_disabled;
+                goto out_unlock;
+
+        /*
+         * If we are trasfering events to userspace, we can hold no locks
+         * (because we're accessing user memory, and because of linux f_op->poll()
+         * semantics). All the events that happens during that period of time are
+         * chained in ep->ovflist and requeued later on.
+         */
+        if (unlikely(ep->ovflist != EP_UNACTIVE_PTR)) {
+                if (epi->next == EP_UNACTIVE_PTR) {
+                        epi->next = ep->ovflist;
+                        ep->ovflist = epi;
+                }
+                goto out_unlock;
+        }
 
         /* If this file is already in the ready list we exit soon */
         if (ep_is_linked(&epi->rdllink))
@@ -714,7 +744,7 @@ is_linked:
         if (waitqueue_active(&ep->poll_wait))
                 pwake++;
 
-is_disabled:
+out_unlock:
         write_unlock_irqrestore(&ep->lock, flags);
 
         /* We have to call this outside the lock */
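
The `goto out_unlock` path above also covers one-shot items: after delivering an EPOLLONESHOT event, ep_send_events() masks epi->event.events down to EP_PRIVATE_BITS, so the next callback sees the item as disabled. A small demonstration of that masking, using local stand-in flag values (the real ones live in this era's eventpoll headers):

```c
/* Demonstrates the EP_PRIVATE_BITS disable check used by the callback:
 * once a one-shot event is delivered, only the private bits survive,
 * so !(events & ~EP_PRIVATE_BITS) becomes true and the item is skipped. */
#include <stdio.h>
#include <stdint.h>

#define XEPOLLIN        0x001u
#define XEPOLLET        (1u << 31)      /* edge-triggered (assumed value) */
#define XEPOLLONESHOT   (1u << 30)      /* one-shot (assumed value)       */
#define EP_PRIVATE_BITS (XEPOLLONESHOT | XEPOLLET)

int main(void)
{
        uint32_t events = XEPOLLIN | XEPOLLONESHOT;

        printf("disabled before delivery? %d\n",
               !(events & ~EP_PRIVATE_BITS));   /* 0: EPOLLIN still armed */

        events &= EP_PRIVATE_BITS;              /* what delivery does     */
        printf("disabled after delivery?  %d\n",
               !(events & ~EP_PRIVATE_BITS));   /* 1: item now disabled   */
        return 0;
}
```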
@@ -788,6 +818,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
         epi->event = *event;
         atomic_set(&epi->usecnt, 1);
         epi->nwait = 0;
+        epi->next = EP_UNACTIVE_PTR;
 
         /* Initialize the poll table using the queue callback */
         epq.epi = epi;
@@ -920,36 +951,50 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_event *event)
         return 0;
 }
 
-/*
- * This function is called without holding the "ep->lock" since the call to
- * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
- * because of the way poll() is traditionally implemented in Linux.
- */
-static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
-                          struct epoll_event __user *events, int maxevents)
+static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
+                          int maxevents)
 {
         int eventcnt, error = -EFAULT, pwake = 0;
         unsigned int revents;
         unsigned long flags;
-        struct epitem *epi;
-        struct list_head injlist;
+        struct epitem *epi, *nepi;
+        struct list_head txlist;
+
+        INIT_LIST_HEAD(&txlist);
 
-        INIT_LIST_HEAD(&injlist);
+        /*
+         * We need to lock this because we could be hit by
+         * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
+         */
+        mutex_lock(&ep->mtx);
+
+        /*
+         * Steal the ready list, and re-init the original one to the
+         * empty list. Also, set ep->ovflist to NULL so that events
+         * happening while looping w/out locks, are not lost. We cannot
+         * have the poll callback to queue directly on ep->rdllist,
+         * because we are doing it in the loop below, in a lockless way.
+         */
+        write_lock_irqsave(&ep->lock, flags);
+        list_splice(&ep->rdllist, &txlist);
+        INIT_LIST_HEAD(&ep->rdllist);
+        ep->ovflist = NULL;
+        write_unlock_irqrestore(&ep->lock, flags);
 
         /*
          * We can loop without lock because this is a task private list.
          * We just splice'd out the ep->rdllist in ep_collect_ready_items().
-         * Items cannot vanish during the loop because we are holding "sem" in
-         * read.
+         * Items cannot vanish during the loop because we are holding "mtx".
          */
-        for (eventcnt = 0; !list_empty(txlist) && eventcnt < maxevents;) {
-                epi = list_first_entry(txlist, struct epitem, rdllink);
-                prefetch(epi->rdllink.next);
+        for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) {
+                epi = list_first_entry(&txlist, struct epitem, rdllink);
+
+                list_del_init(&epi->rdllink);
 
                 /*
                  * Get the ready file event set. We can safely use the file
-                 * because we are holding the "sem" in read and this will
-                 * guarantee that both the file and the item will not vanish.
+                 * because we are holding the "mtx" and this will guarantee
+                 * that both the file and the item will not vanish.
                  */
                 revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
                 revents &= epi->event.events;
@@ -957,8 +1002,8 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
                 /*
                  * Is the event mask intersect the caller-requested one,
                  * deliver the event to userspace. Again, we are holding
-                 * "sem" in read, so no operations coming from userspace
-                 * can change the item.
+                 * "mtx", so no operations coming from userspace can change
+                 * the item.
                  */
                 if (revents) {
                         if (__put_user(revents,
@@ -970,49 +1015,47 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
                                 epi->event.events &= EP_PRIVATE_BITS;
                                 eventcnt++;
                         }
-
                 /*
-                 * This is tricky. We are holding the "sem" in read, and this
-                 * means that the operations that can change the "linked" status
-                 * of the epoll item (epi->rbn and epi->rdllink), cannot touch
-                 * them. Also, since we are "linked" from a epi->rdllink POV
-                 * (the item is linked to our transmission list we just
-                 * spliced), the ep_poll_callback() cannot touch us either,
-                 * because of the check present in there. Another parallel
-                 * epoll_wait() will not get the same result set, since we
-                 * spliced the ready list before. Note that list_del() still
-                 * shows the item as linked to the test in ep_poll_callback().
+                 * At this point, noone can insert into ep->rdllist besides
+                 * us. The epoll_ctl() callers are locked out by us holding
+                 * "mtx" and the poll callback will queue them in ep->ovflist.
                  */
-                list_del(&epi->rdllink);
                 if (!(epi->event.events & EPOLLET) &&
                     (revents & epi->event.events))
-                        list_add_tail(&epi->rdllink, &injlist);
-                else {
-                        /*
-                         * Be sure the item is totally detached before re-init
-                         * the list_head. After INIT_LIST_HEAD() is committed,
-                         * the ep_poll_callback() can requeue the item again,
-                         * but we don't care since we are already past it.
-                         */
-                        smp_mb();
-                        INIT_LIST_HEAD(&epi->rdllink);
-                }
+                        list_add_tail(&epi->rdllink, &ep->rdllist);
         }
         error = 0;
 
 errxit:
 
+        write_lock_irqsave(&ep->lock, flags);
+        /*
+         * During the time we spent in the loop above, some other events
+         * might have been queued by the poll callback. We re-insert them
+         * here (in case they are not already queued, or they're one-shot).
+         */
+        for (nepi = ep->ovflist; (epi = nepi) != NULL;
+             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+                if (!ep_is_linked(&epi->rdllink) &&
+                    (epi->event.events & ~EP_PRIVATE_BITS))
+                        list_add_tail(&epi->rdllink, &ep->rdllist);
+        }
         /*
-         * If the re-injection list or the txlist are not empty, re-splice
-         * them to the ready list and do proper wakeups.
+         * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
+         * releasing the lock, events will be queued in the normal way inside
+         * ep->rdllist.
          */
-        if (!list_empty(&injlist) || !list_empty(txlist)) {
-                write_lock_irqsave(&ep->lock, flags);
+        ep->ovflist = EP_UNACTIVE_PTR;
 
-                list_splice(txlist, &ep->rdllist);
-                list_splice(&injlist, &ep->rdllist);
+        /*
+         * In case of error in the event-send loop, we might still have items
+         * inside the "txlist". We need to splice them back inside ep->rdllist.
+         */
+        list_splice(&txlist, &ep->rdllist);
+
+        if (!list_empty(&ep->rdllist)) {
                 /*
-                 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
+                 * Wake up (if active) both the eventpoll wait list and the ->poll()
                  * wait list.
                  */
                 if (waitqueue_active(&ep->wq))
@@ -1020,9 +1063,10 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
                                              TASK_INTERRUPTIBLE);
                 if (waitqueue_active(&ep->poll_wait))
                         pwake++;
-
-                write_unlock_irqrestore(&ep->lock, flags);
         }
+        write_unlock_irqrestore(&ep->lock, flags);
+
+        mutex_unlock(&ep->mtx);
 
         /* We have to call this outside the lock */
         if (pwake)
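
One consequence of the loop above is how level-triggered and edge-triggered items part ways: a non-EPOLLET item whose requested events are still live goes straight back on ep->rdllist, while an EPOLLET item must wait for a fresh callback. The decision reduces to the following predicate (hypothetical helper name, stand-in flag value):

```c
/* The LT/ET re-queue predicate from ep_send_events(): re-arm only
 * level-triggered items that still have requested events pending. */
#include <stdbool.h>
#include <stdint.h>

#define XEPOLLET (1u << 31)     /* edge-triggered flag (assumed value) */

bool should_requeue(uint32_t requested, uint32_t revents)
{
        return !(requested & XEPOLLET) && (revents & requested);
}
```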
@@ -1031,41 +1075,6 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
         return eventcnt == 0 ? error: eventcnt;
 }
 
-/*
- * Perform the transfer of events to user space.
- */
-static int ep_events_transfer(struct eventpoll *ep,
-                              struct epoll_event __user *events, int maxevents)
-{
-        int eventcnt;
-        unsigned long flags;
-        struct list_head txlist;
-
-        INIT_LIST_HEAD(&txlist);
-
-        /*
-         * We need to lock this because we could be hit by
-         * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
-         */
-        down_read(&ep->sem);
-
-        /*
-         * Steal the ready list, and re-init the original one to the
-         * empty list.
-         */
-        write_lock_irqsave(&ep->lock, flags);
-        list_splice(&ep->rdllist, &txlist);
-        INIT_LIST_HEAD(&ep->rdllist);
-        write_unlock_irqrestore(&ep->lock, flags);
-
-        /* Build result set in userspace */
-        eventcnt = ep_send_events(ep, &txlist, events, maxevents);
-
-        up_read(&ep->sem);
-
-        return eventcnt;
-}
-
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
                    int maxevents, long timeout)
 {
@@ -1093,6 +1102,7 @@ retry:
          * ep_poll_callback() when events will become available.
          */
         init_waitqueue_entry(&wait, current);
+        wait.flags |= WQ_FLAG_EXCLUSIVE;
         __add_wait_queue(&ep->wq, &wait);
 
         for (;;) {
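
The one-line WQ_FLAG_EXCLUSIVE addition is the wait-exclusive behavior from the changelog. A user-space model of how the kernel's wake-up path treats such waiters (hypothetical structures): non-exclusive waiters are all woken, but the scan stops after waking the requested number of exclusive ones, so a single event no longer wakes every epoll_wait() sleeper.

```c
/* Model of exclusive wake-up: walk the queue in order, wake everyone,
 * but stop once nr_exclusive exclusive waiters have been woken. */
#include <stdbool.h>

struct waiter {
        bool exclusive;         /* models WQ_FLAG_EXCLUSIVE */
        bool woken;
        struct waiter *next;
};

void wake_up_model(struct waiter *head, int nr_exclusive)
{
        for (struct waiter *w = head; w; w = w->next) {
                w->woken = true;
                if (w->exclusive && --nr_exclusive == 0)
                        break;  /* the thundering herd stops here */
        }
}
```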
@@ -1129,7 +1139,7 @@ retry:
          * more luck.
          */
         if (!res && eavail &&
-            !(res = ep_events_transfer(ep, events, maxevents)) && jtimeout)
+            !(res = ep_send_events(ep, events, maxevents)) && jtimeout)
                 goto retry;
 
         return res;
@@ -1237,7 +1247,7 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
          */
         ep = file->private_data;
 
-        down_write(&ep->sem);
+        mutex_lock(&ep->mtx);
 
         /* Try to lookup the file inside our RB tree */
         epi = ep_find(ep, tfile, fd);
@@ -1272,7 +1282,7 @@ asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
          */
         if (epi)
                 ep_release_epitem(epi);
-        up_write(&ep->sem);
+        mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
         fput(tfile);
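
For completeness, a minimal user-space caller exercising the paths this patch touches; nothing here is new API, and watching stdin is purely illustrative:

```c
/* Minimal epoll usage. With this patch, if several threads block in
 * epoll_wait() on the same epoll fd, one ready event wakes only one
 * of them (exclusive wait) instead of the whole herd. */
#include <stdio.h>
#include <unistd.h>
#include <sys/epoll.h>

int main(void)
{
        int epfd = epoll_create(1);     /* size hint, historical */
        if (epfd < 0) {
                perror("epoll_create");
                return 1;
        }

        struct epoll_event ev = { .events = EPOLLIN, .data.fd = 0 };
        if (epoll_ctl(epfd, EPOLL_CTL_ADD, 0, &ev) < 0) {   /* watch stdin */
                perror("epoll_ctl");
                return 1;
        }

        struct epoll_event out[8];
        int n = epoll_wait(epfd, out, 8, 5000);             /* 5s timeout */
        printf("%d event(s) ready\n", n);
        close(epfd);
        return 0;
}
```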