author	Davide Libenzi <davidel@xmailserver.org>	2007-05-08 03:25:41 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-05-08 14:15:01 -0400
commit	6192bd536f96c6a0d969081bc71ae24f9319bfdc (patch)
tree	07056ed061df4070d22198b5b6692d102aeacc00
parent	44171df8e944f0bc8f7fa3f6d080f3e671431989 (diff)
epoll: optimizations and cleanups
Epoll is doing multiple passes over the ready set at the moment, because
of the constraints over the f_op->poll() call. Looking at the code again,
I noticed that we already hold the epoll semaphore in read, and this
(together with other locking conditions that hold while doing an
epoll_wait()) can lead to a smarter way [1] to "ship" events to userspace
(in a single pass).

This is a stress application that can be used to test the new code. It
spawns multiple threads and calls epoll_wait() and epoll_ctl() from many
threads. Stress tested on my dual Opteron 254 w/out any problems.

http://www.xmailserver.org/totalmess.c

This is not a benchmark, just something that tries to stress and exploit
possible problems with the new code. Also, I made a stupid micro-benchmark:

http://www.xmailserver.org/epwbench.c

[1] Considering that epoll must be thread-safe, there are five ways we can
    be hit during an epoll_wait() transfer loop (ep_send_events()):

    1) The epoll fd going away and calling ep_free
       This just can't happen, since we did an fget() in sys_epoll_wait

    2) An epoll_ctl(EPOLL_CTL_DEL)
       This can't happen because epoll_ctl() gets ep->sem in write, and
       we're holding it in read during ep_send_events()

    3) An fd stored inside the epoll fd going away
       This can't happen because in eventpoll_release_file() we get
       ep->sem in write, and we're holding it in read during
       ep_send_events()

    4) Another epoll_wait() happening on another thread
       They both can be inside ep_send_events() at the same time. We get
       (splice) the ready-list under the spinlock, so each one will get
       its own ready list (a userspace sketch of this splice-based single
       pass follows the message). Note that an fd cannot be at the same
       time inside more than one ready list, because ep_poll_callback()
       will not re-queue it if it sees it already linked:

           if (ep_is_linked(&epi->rdllink))
                   goto is_linked;

       Another case that can happen is two concurrent epoll_wait() calls
       coming in with a userspace event buffer of size, say, ten. Suppose
       there are 50 events ready in the list. The first epoll_wait() will
       "steal" the whole list, while the second, seeing no events, will
       go to sleep. But at the end of ep_send_events() in the first
       epoll_wait(), we will re-inject surplus ready fds, and we will
       trigger the proper wake_up to the second epoll_wait().

    5) ep_poll_callback() hitting us asynchronously
       This is the tricky part. As I said above, the ep_is_linked() test
       done inside ep_poll_callback() guarantees that, as long as the
       item looks linked to a list, ep_poll_callback() will not try to
       re-queue it again (read: write data on any of its members). When
       we do a list_del() in ep_send_events(), the item will still
       satisfy the ep_is_linked() test (whatever data is written in
       prev/next, it'll never be its own pointer), so ep_poll_callback()
       will still leave us alone. It's only after the eventual
       smp_mb()+INIT_LIST_HEAD(&epi->rdllink) that it'll become visible
       to ep_poll_callback(), but at that point we're already past it
       (the second sketch below demonstrates this property).

[akpm@osdl.org: 80 cols]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
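A minimal userspace sketch of the splice pattern described in point 4
above, assuming a simplified doubly-linked list and a pthread mutex
standing in for the kernel's IRQ-safe spinlock; the names here
(transfer_events, list_splice_init, and so on) are illustrative, not the
kernel's API:

#include <pthread.h>
#include <stdio.h>

struct node {
	struct node *next, *prev;
	int data;
};

static void list_init(struct node *h) { h->next = h->prev = h; }
static int list_empty(const struct node *h) { return h->next == h; }

static void list_add_tail(struct node *n, struct node *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

/* Move every entry of "from" onto the tail of "to", leaving "from" empty. */
static void list_splice_init(struct node *from, struct node *to)
{
	if (list_empty(from))
		return;
	from->next->prev = to->prev;
	to->prev->next = from->next;
	from->prev->next = to;
	to->prev = from->prev;
	list_init(from);
}

static struct node ready = { &ready, &ready, 0 };
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* Single pass, in the spirit of the new ep_events_transfer(): the lock is
 * held only long enough to steal the whole ready list; delivery then
 * happens lock-free on the now-private copy. */
static void transfer_events(void)
{
	struct node txlist, *n;

	list_init(&txlist);

	pthread_mutex_lock(&lock);
	list_splice_init(&ready, &txlist);
	pthread_mutex_unlock(&lock);

	for (n = txlist.next; n != &txlist; n = n->next)
		printf("event %d\n", n->data);
}

int main(void)
{
	struct node a = { 0, 0, 1 }, b = { 0, 0, 2 };

	pthread_mutex_lock(&lock);
	list_add_tail(&a, &ready);
	list_add_tail(&b, &ready);
	pthread_mutex_unlock(&lock);

	transfer_events();
	return 0;
}

Because each caller steals the entire list under the lock, two concurrent
waiters can never walk the same entries; surplus entries are later
re-spliced back, which is what the injlist/list_splice() code in the patch
below does.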
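And a sketch of the ep_is_linked() property that point 5 relies on: a
plain list_del() never stores the node's own address into its prev/next
pointers, so an ep_is_linked()-style test keeps reporting the node as
linked until INIT_LIST_HEAD() re-points it at itself. This is a userspace
approximation of the kernel list primitives, written for illustration only:

#include <assert.h>
#include <stdio.h>

struct list_head {
	struct list_head *next, *prev;
};

static void INIT_LIST_HEAD(struct list_head *p)
{
	p->next = p->prev = p;
}

/* Mirrors the kernel-side test: "linked" unless the node points at itself. */
static int ep_is_linked(const struct list_head *p)
{
	return p->next != p;
}

/* Unlink without re-initializing, like a plain kernel list_del(). */
static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
	/* entry->next/prev still point at the old neighbours, never at entry */
}

static void list_add(struct list_head *n, struct list_head *head)
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

int main(void)
{
	struct list_head head, node;

	INIT_LIST_HEAD(&head);
	list_add(&node, &head);
	assert(ep_is_linked(&node));

	list_del(&node);
	/* Still looks linked: ep_poll_callback() would leave it alone. */
	assert(ep_is_linked(&node));

	INIT_LIST_HEAD(&node);
	/* Only now does it test as unlinked and become re-queueable. */
	assert(!ep_is_linked(&node));

	printf("linked state flips only at INIT_LIST_HEAD()\n");
	return 0;
}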
 fs/eventpoll.c | 233 ++++++++++++++++++-----------------------------
 1 file changed, 86 insertions(+), 147 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3ae644e7e860..997711c5a732 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -185,7 +185,7 @@ struct eppoll_entry {
 
 /*
  * Each file descriptor added to the eventpoll interface will
- * have an entry of this type linked to the hash.
+ * have an entry of this type linked to the "rbr" RB tree.
  */
 struct epitem {
 	/* RB-Tree node used to link this structure to the eventpoll rb-tree */
@@ -217,15 +217,6 @@ struct epitem {
 
 	/* List header used to link this item to the "struct file" items list */
 	struct list_head fllink;
-
-	/* List header used to link the item to the transfer list */
-	struct list_head txlink;
-
-	/*
-	 * This is used during the collection/transfer of events to userspace
-	 * to pin items empty events set.
-	 */
-	unsigned int revents;
 };
 
 /* Wrapper struct used by poll queueing */
@@ -258,11 +249,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi);
 static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key);
 static int ep_eventpoll_close(struct inode *inode, struct file *file);
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
-static int ep_collect_ready_items(struct eventpoll *ep,
-				  struct list_head *txlist, int maxevents);
 static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
-			  struct epoll_event __user *events);
-static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist);
+			  struct epoll_event __user *events, int maxevents);
 static int ep_events_transfer(struct eventpoll *ep,
 			      struct epoll_event __user *events,
 			      int maxevents);
@@ -355,17 +343,6 @@ static inline int ep_rb_linked(struct rb_node *n)
 	return rb_parent(n) != n;
 }
 
-/*
- * Remove the item from the list and perform its initialization.
- * This is useful for us because we can test if the item is linked
- * using "ep_is_linked(p)".
- */
-static inline void ep_list_del(struct list_head *p)
-{
-	list_del(p);
-	INIT_LIST_HEAD(p);
-}
-
 /* Tells us if the item is currently linked */
 static inline int ep_is_linked(struct list_head *p)
 {
@@ -385,7 +362,7 @@ static inline struct epitem * ep_item_from_epqueue(poll_table *p)
 }
 
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
-static inline int ep_op_hash_event(int op)
+static inline int ep_op_has_event(int op)
 {
 	return op != EPOLL_CTL_DEL;
 }
@@ -480,7 +457,7 @@ void eventpoll_release_file(struct file *file)
 		epi = list_entry(lsthead->next, struct epitem, fllink);
 
 		ep = epi->ep;
-		ep_list_del(&epi->fllink);
+		list_del_init(&epi->fllink);
 		down_write(&ep->sem);
 		ep_remove(ep, epi);
 		up_write(&ep->sem);
@@ -557,7 +534,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 		     current, epfd, op, fd, event));
 
 	error = -EFAULT;
-	if (ep_op_hash_event(op) &&
+	if (ep_op_has_event(op) &&
 	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
 		goto eexit_1;
 
@@ -594,7 +571,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 
 	down_write(&ep->sem);
 
-	/* Try to lookup the file inside our hash table */
+	/* Try to lookup the file inside our RB tree */
 	epi = ep_find(ep, tfile, fd);
 
 	error = -EINVAL;
@@ -876,7 +853,7 @@ static void ep_free(struct eventpoll *ep)
 	}
 
 	/*
-	 * Walks through the whole hash by freeing each "struct epitem". At this
+	 * Walks through the whole tree by freeing each "struct epitem". At this
 	 * point we are sure no poll callbacks will be lingering around, and also by
 	 * write-holding "sem" we can be sure that no file cleanup code will hit
 	 * us during this operation. So we can avoid the lock on "ep->lock".
@@ -891,7 +868,7 @@ static void ep_free(struct eventpoll *ep)
 
 
 /*
- * Search the file inside the eventpoll hash. It add usage count to
+ * Search the file inside the eventpoll tree. It add usage count to
  * the returned item, so the caller must call ep_release_epitem()
  * after finished using the "struct epitem".
  */
@@ -1011,7 +988,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	ep_rb_initnode(&epi->rbn);
 	INIT_LIST_HEAD(&epi->rdllink);
 	INIT_LIST_HEAD(&epi->fllink);
-	INIT_LIST_HEAD(&epi->txlink);
 	INIT_LIST_HEAD(&epi->pwqlist);
 	epi->ep = ep;
 	ep_set_ffd(&epi->ffd, tfile, fd);
@@ -1080,7 +1056,7 @@ eexit_2:
 	 */
 	write_lock_irqsave(&ep->lock, flags);
 	if (ep_is_linked(&epi->rdllink))
-		ep_list_del(&epi->rdllink);
+		list_del_init(&epi->rdllink);
 	write_unlock_irqrestore(&ep->lock, flags);
 
 	kmem_cache_free(epi_cache, epi);
@@ -1119,7 +1095,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	epi->event.data = event->data;
 
 	/*
-	 * If the item is not linked to the hash it means that it's on its
+	 * If the item is not linked to the RB tree it means that it's on its
 	 * way toward the removal. Do nothing in this case.
 	 */
 	if (ep_rb_linked(&epi->rbn)) {
@@ -1170,7 +1146,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 	while (!list_empty(lsthead)) {
 		pwq = list_entry(lsthead->next, struct eppoll_entry, llink);
 
-		ep_list_del(&pwq->llink);
+		list_del_init(&pwq->llink);
 		remove_wait_queue(pwq->whead, &pwq->wait);
 		kmem_cache_free(pwq_cache, pwq);
 	}
@@ -1213,7 +1189,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
 	 * we want to remove it from this list to avoid stale events.
 	 */
 	if (ep_is_linked(&epi->rdllink))
-		ep_list_del(&epi->rdllink);
+		list_del_init(&epi->rdllink);
 
 	error = 0;
 eexit_1:
@@ -1226,7 +1202,7 @@ eexit_1:
 
 
 /*
- * Removes a "struct epitem" from the eventpoll hash and deallocates
+ * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources.
  */
 static int ep_remove(struct eventpoll *ep, struct epitem *epi)
@@ -1248,13 +1224,13 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_ep_lock);
 	if (ep_is_linked(&epi->fllink))
-		ep_list_del(&epi->fllink);
+		list_del_init(&epi->fllink);
 	spin_unlock(&file->f_ep_lock);
 
 	/* We need to acquire the write IRQ lock before calling ep_unlink() */
 	write_lock_irqsave(&ep->lock, flags);
 
-	/* Really unlink the item from the hash */
+	/* Really unlink the item from the RB tree */
 	error = ep_unlink(ep, epi);
 
 	write_unlock_irqrestore(&ep->lock, flags);
@@ -1362,71 +1338,30 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 
 
 /*
- * Since we have to release the lock during the __copy_to_user() operation and
- * during the f_op->poll() call, we try to collect the maximum number of items
- * by reducing the irqlock/irqunlock switching rate.
- */
-static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
-{
-	int nepi;
-	unsigned long flags;
-	struct list_head *lsthead = &ep->rdllist, *lnk;
-	struct epitem *epi;
-
-	write_lock_irqsave(&ep->lock, flags);
-
-	for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
-		epi = list_entry(lnk, struct epitem, rdllink);
-
-		lnk = lnk->next;
-
-		/* If this file is already in the ready list we exit soon */
-		if (!ep_is_linked(&epi->txlink)) {
-			/*
-			 * This is initialized in this way so that the default
-			 * behaviour of the reinjecting code will be to push back
-			 * the item inside the ready list.
-			 */
-			epi->revents = epi->event.events;
-
-			/* Link the ready item into the transfer list */
-			list_add(&epi->txlink, txlist);
-			nepi++;
-
-			/*
-			 * Unlink the item from the ready list.
-			 */
-			ep_list_del(&epi->rdllink);
-		}
-	}
-
-	write_unlock_irqrestore(&ep->lock, flags);
-
-	return nepi;
-}
-
-
-/*
  * This function is called without holding the "ep->lock" since the call to
  * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
  * because of the way poll() is traditionally implemented in Linux.
  */
 static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
-			  struct epoll_event __user *events)
+			  struct epoll_event __user *events, int maxevents)
 {
-	int eventcnt = 0;
+	int eventcnt, error = -EFAULT, pwake = 0;
 	unsigned int revents;
-	struct list_head *lnk;
+	unsigned long flags;
 	struct epitem *epi;
+	struct list_head injlist;
+
+	INIT_LIST_HEAD(&injlist);
 
 	/*
 	 * We can loop without lock because this is a task private list.
-	 * The test done during the collection loop will guarantee us that
-	 * another task will not try to collect this file. Also, items
-	 * cannot vanish during the loop because we are holding "sem".
+	 * We just splice'd out the ep->rdllist in ep_collect_ready_items().
+	 * Items cannot vanish during the loop because we are holding "sem" in
	 * read.
 	 */
-	list_for_each(lnk, txlist) {
-		epi = list_entry(lnk, struct epitem, txlink);
+	for (eventcnt = 0; !list_empty(txlist) && eventcnt < maxevents;) {
+		epi = list_entry(txlist->next, struct epitem, rdllink);
+		prefetch(epi->rdllink.next);
 
 		/*
 		 * Get the ready file event set. We can safely use the file
@@ -1434,64 +1369,65 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
 		 * guarantee that both the file and the item will not vanish.
 		 */
 		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+		revents &= epi->event.events;
 
 		/*
-		 * Set the return event set for the current file descriptor.
-		 * Note that only the task task was successfully able to link
-		 * the item to its "txlist" will write this field.
+		 * Is the event mask intersect the caller-requested one,
+		 * deliver the event to userspace. Again, we are holding
+		 * "sem" in read, so no operations coming from userspace
+		 * can change the item.
 		 */
-		epi->revents = revents & epi->event.events;
-
-		if (epi->revents) {
-			if (__put_user(epi->revents,
+		if (revents) {
+			if (__put_user(revents,
 				       &events[eventcnt].events) ||
 			    __put_user(epi->event.data,
 				       &events[eventcnt].data))
-				return -EFAULT;
+				goto errxit;
 			if (epi->event.events & EPOLLONESHOT)
 				epi->event.events &= EP_PRIVATE_BITS;
 			eventcnt++;
 		}
-	}
-	return eventcnt;
-}
-
-
-/*
- * Walk through the transfer list we collected with ep_collect_ready_items()
- * and, if 1) the item is still "alive" 2) its event set is not empty 3) it's
- * not already linked, links it to the ready list. Same as above, we are holding
- * "sem" so items cannot vanish underneath our nose.
- */
-static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
-{
-	int ricnt = 0, pwake = 0;
-	unsigned long flags;
-	struct epitem *epi;
-
-	write_lock_irqsave(&ep->lock, flags);
-
-	while (!list_empty(txlist)) {
-		epi = list_entry(txlist->next, struct epitem, txlink);
-
-		/* Unlink the current item from the transfer list */
-		ep_list_del(&epi->txlink);
 
 		/*
-		 * If the item is no more linked to the interest set, we don't
-		 * have to push it inside the ready list because the following
-		 * ep_release_epitem() is going to drop it. Also, if the current
-		 * item is set to have an Edge Triggered behaviour, we don't have
-		 * to push it back either.
+		 * This is tricky. We are holding the "sem" in read, and this
+		 * means that the operations that can change the "linked" status
+		 * of the epoll item (epi->rbn and epi->rdllink), cannot touch
+		 * them. Also, since we are "linked" from a epi->rdllink POV
+		 * (the item is linked to our transmission list we just
+		 * spliced), the ep_poll_callback() cannot touch us either,
+		 * because of the check present in there. Another parallel
+		 * epoll_wait() will not get the same result set, since we
+		 * spliced the ready list before. Note that list_del() still
+		 * shows the item as linked to the test in ep_poll_callback().
 		 */
-		if (ep_rb_linked(&epi->rbn) && !(epi->event.events & EPOLLET) &&
-		    (epi->revents & epi->event.events) && !ep_is_linked(&epi->rdllink)) {
-			list_add_tail(&epi->rdllink, &ep->rdllist);
-			ricnt++;
+		list_del(&epi->rdllink);
+		if (!(epi->event.events & EPOLLET) &&
+		    (revents & epi->event.events))
+			list_add_tail(&epi->rdllink, &injlist);
+		else {
+			/*
+			 * Be sure the item is totally detached before re-init
+			 * the list_head. After INIT_LIST_HEAD() is committed,
+			 * the ep_poll_callback() can requeue the item again,
+			 * but we don't care since we are already past it.
+			 */
+			smp_mb();
+			INIT_LIST_HEAD(&epi->rdllink);
 		}
 	}
+	error = 0;
 
-	if (ricnt) {
+errxit:
+
+	/*
+	 * If the re-injection list or the txlist are not empty, re-splice
+	 * them to the ready list and do proper wakeups.
	 */
+	if (!list_empty(&injlist) || !list_empty(txlist)) {
+		write_lock_irqsave(&ep->lock, flags);
+
+		list_splice(txlist, &ep->rdllist);
+		list_splice(&injlist, &ep->rdllist);
 		/*
 		 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 		 * wait list.
@@ -1501,13 +1437,15 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 				TASK_INTERRUPTIBLE);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
-	}
 
 	write_unlock_irqrestore(&ep->lock, flags);
+	}
 
 	/* We have to call this outside the lock */
 	if (pwake)
 		ep_poll_safewake(&psw, &ep->poll_wait);
+
+	return eventcnt == 0 ? error: eventcnt;
 }
 
 
@@ -1517,7 +1455,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 static int ep_events_transfer(struct eventpoll *ep,
 			      struct epoll_event __user *events, int maxevents)
 {
-	int eventcnt = 0;
+	int eventcnt;
+	unsigned long flags;
 	struct list_head txlist;
 
 	INIT_LIST_HEAD(&txlist);
@@ -1528,14 +1467,17 @@ static int ep_events_transfer(struct eventpoll *ep,
 	 */
 	down_read(&ep->sem);
 
-	/* Collect/extract ready items */
-	if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
-		/* Build result set in userspace */
-		eventcnt = ep_send_events(ep, &txlist, events);
+	/*
+	 * Steal the ready list, and re-init the original one to the
+	 * empty list.
+	 */
+	write_lock_irqsave(&ep->lock, flags);
+	list_splice(&ep->rdllist, &txlist);
+	INIT_LIST_HEAD(&ep->rdllist);
+	write_unlock_irqrestore(&ep->lock, flags);
 
-		/* Reinject ready items into the ready list */
-		ep_reinject_items(ep, &txlist);
-	}
+	/* Build result set in userspace */
+	eventcnt = ep_send_events(ep, &txlist, events, maxevents);
 
 	up_read(&ep->sem);
 
@@ -1612,14 +1554,12 @@ retry:
 	return res;
 }
 
-
 static int eventpollfs_delete_dentry(struct dentry *dentry)
 {
 
 	return 1;
 }
 
-
 static struct inode *ep_eventpoll_inode(void)
 {
 	int error = -ENOMEM;
@@ -1647,7 +1587,6 @@ eexit_1:
 	return ERR_PTR(error);
 }
 
-
 static int
 eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
 		   const char *dev_name, void *data, struct vfsmount *mnt)