 fs/eventpoll.c | 233 +++++++++++++++-----------------------
 1 file changed, 86 insertions(+), 147 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 3ae644e7e860..997711c5a732 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -185,7 +185,7 @@ struct eppoll_entry {
 
 /*
  * Each file descriptor added to the eventpoll interface will
- * have an entry of this type linked to the hash.
+ * have an entry of this type linked to the "rbr" RB tree.
  */
 struct epitem {
 	/* RB-Tree node used to link this structure to the eventpoll rb-tree */
@@ -217,15 +217,6 @@ struct epitem {
 
 	/* List header used to link this item to the "struct file" items list */
 	struct list_head fllink;
-
-	/* List header used to link the item to the transfer list */
-	struct list_head txlink;
-
-	/*
-	 * This is used during the collection/transfer of events to userspace
-	 * to pin items empty events set.
-	 */
-	unsigned int revents;
 };
 
 /* Wrapper struct used by poll queueing */
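With "txlink" and "revents" gone, the only list node left in "struct epitem" for readiness bookkeeping is "rdllink", which after this patch serves both the ready list and the spliced-out transfer list. A minimal userspace sketch (hypothetical "epitem_like" type and a trimmed-down list_entry(), not the kernel's own definitions) of how the reworked ep_send_events() later recovers the item from that single node:

	/* Illustrative sketch: one embedded node, recovered via
	 * list_entry()/container_of(), the way the patched
	 * ep_send_events() does with epi->rdllink. */
	#include <stddef.h>
	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };

	#define container_of(ptr, type, member) \
		((type *)((char *)(ptr) - offsetof(type, member)))
	#define list_entry(ptr, type, member) container_of(ptr, type, member)

	struct epitem_like {
		int fd;
		struct list_head rdllink; /* links into whichever list holds it now */
	};

	int main(void)
	{
		struct epitem_like it = { .fd = 42 };
		struct list_head txlist = { &it.rdllink, &it.rdllink };

		it.rdllink.next = it.rdllink.prev = &txlist;

		/* Same recovery step the new delivery loop performs: */
		struct epitem_like *epi =
			list_entry(txlist.next, struct epitem_like, rdllink);
		printf("fd = %d\n", epi->fd);
		return 0;
	}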
@@ -258,11 +249,8 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi);
 static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *key);
 static int ep_eventpoll_close(struct inode *inode, struct file *file);
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait);
-static int ep_collect_ready_items(struct eventpoll *ep,
-				  struct list_head *txlist, int maxevents);
 static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
-			  struct epoll_event __user *events);
-static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist);
+			  struct epoll_event __user *events, int maxevents);
 static int ep_events_transfer(struct eventpoll *ep,
 			      struct epoll_event __user *events,
 			      int maxevents);
@@ -355,17 +343,6 @@ static inline int ep_rb_linked(struct rb_node *n)
 	return rb_parent(n) != n;
 }
 
-/*
- * Remove the item from the list and perform its initialization.
- * This is useful for us because we can test if the item is linked
- * using "ep_is_linked(p)".
- */
-static inline void ep_list_del(struct list_head *p)
-{
-	list_del(p);
-	INIT_LIST_HEAD(p);
-}
-
 /* Tells us if the item is currently linked */
 static inline int ep_is_linked(struct list_head *p)
 {
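The deleted ep_list_del() helper duplicated what include/linux/list.h already provides as list_del_init(): unlink the node, then re-initialize it to point at itself, so a later ep_is_linked()/list_empty() style test sees it as unlinked. A small runnable sketch with a trimmed-down list (illustrative, not the kernel implementation):

	/* Why list_del_init() can replace ep_list_del(): after the call the
	 * node is self-linked, so an ep_is_linked()-style test is safe. */
	#include <assert.h>
	#include <stdio.h>

	struct list_head { struct list_head *next, *prev; };

	static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

	static void list_add(struct list_head *n, struct list_head *h)
	{
		n->next = h->next; n->prev = h;
		h->next->prev = n; h->next = n;
	}

	static void list_del_init(struct list_head *n)
	{
		n->prev->next = n->next;   /* unlink the node ... */
		n->next->prev = n->prev;
		INIT_LIST_HEAD(n);         /* ... and re-init it, as ep_list_del() did */
	}

	static int ep_is_linked(struct list_head *n) { return n->next != n; }

	int main(void)
	{
		struct list_head head, item;

		INIT_LIST_HEAD(&head);
		INIT_LIST_HEAD(&item);

		list_add(&item, &head);
		assert(ep_is_linked(&item));

		list_del_init(&item);
		assert(!ep_is_linked(&item)); /* safe to test after removal */
		printf("list_del_init leaves the node self-linked\n");
		return 0;
	}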
@@ -385,7 +362,7 @@ static inline struct epitem * ep_item_from_epqueue(poll_table *p)
 }
 
 /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
-static inline int ep_op_hash_event(int op)
+static inline int ep_op_has_event(int op)
 {
 	return op != EPOLL_CTL_DEL;
 }
@@ -480,7 +457,7 @@ void eventpoll_release_file(struct file *file)
 		epi = list_entry(lsthead->next, struct epitem, fllink);
 
 		ep = epi->ep;
-		ep_list_del(&epi->fllink);
+		list_del_init(&epi->fllink);
 		down_write(&ep->sem);
 		ep_remove(ep, epi);
 		up_write(&ep->sem);
@@ -557,7 +534,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 		     current, epfd, op, fd, event));
 
 	error = -EFAULT;
-	if (ep_op_hash_event(op) &&
+	if (ep_op_has_event(op) &&
 	    copy_from_user(&epds, event, sizeof(struct epoll_event)))
 		goto eexit_1;
 
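Since ep_op_has_event(EPOLL_CTL_DEL) is false, the event argument is never copied from userspace for deletions. The epoll_ctl(2) man page notes that kernels before 2.6.9 nevertheless rejected a NULL pointer there, so portable callers pass a dummy struct; an illustrative userspace sketch:

	/* EPOLL_CTL_DEL ignores the event argument (no copy_from_user
	 * happens), but pre-2.6.9 kernels required it to be non-NULL,
	 * so portable code passes a dummy. */
	#include <sys/epoll.h>

	static int watch_del(int epfd, int fd)
	{
		struct epoll_event dummy = { 0 }; /* contents never read */

		return epoll_ctl(epfd, EPOLL_CTL_DEL, fd, &dummy);
	}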
@@ -594,7 +571,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 
 	down_write(&ep->sem);
 
-	/* Try to lookup the file inside our hash table */
+	/* Try to lookup the file inside our RB tree */
 	epi = ep_find(ep, tfile, fd);
 
 	error = -EINVAL;
@@ -876,7 +853,7 @@ static void ep_free(struct eventpoll *ep)
 	}
 
 	/*
-	 * Walks through the whole hash by freeing each "struct epitem". At this
+	 * Walks through the whole tree by freeing each "struct epitem". At this
 	 * point we are sure no poll callbacks will be lingering around, and also by
 	 * write-holding "sem" we can be sure that no file cleanup code will hit
 	 * us during this operation. So we can avoid the lock on "ep->lock".
@@ -891,7 +868,7 @@ static void ep_free(struct eventpoll *ep)
 
 
 /*
- * Search the file inside the eventpoll hash. It add usage count to
+ * Search the file inside the eventpoll tree. It add usage count to
  * the returned item, so the caller must call ep_release_epitem()
  * after finished using the "struct epitem".
  */
@@ -1011,7 +988,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 	ep_rb_initnode(&epi->rbn);
 	INIT_LIST_HEAD(&epi->rdllink);
 	INIT_LIST_HEAD(&epi->fllink);
-	INIT_LIST_HEAD(&epi->txlink);
 	INIT_LIST_HEAD(&epi->pwqlist);
 	epi->ep = ep;
 	ep_set_ffd(&epi->ffd, tfile, fd);
@@ -1080,7 +1056,7 @@ eexit_2:
 	 */
 	write_lock_irqsave(&ep->lock, flags);
 	if (ep_is_linked(&epi->rdllink))
-		ep_list_del(&epi->rdllink);
+		list_del_init(&epi->rdllink);
 	write_unlock_irqrestore(&ep->lock, flags);
 
 	kmem_cache_free(epi_cache, epi);
@@ -1119,7 +1095,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 	epi->event.data = event->data;
 
 	/*
-	 * If the item is not linked to the hash it means that it's on its
+	 * If the item is not linked to the RB tree it means that it's on its
 	 * way toward the removal. Do nothing in this case.
 	 */
 	if (ep_rb_linked(&epi->rbn)) {
@@ -1170,7 +1146,7 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 	while (!list_empty(lsthead)) {
 		pwq = list_entry(lsthead->next, struct eppoll_entry, llink);
 
-		ep_list_del(&pwq->llink);
+		list_del_init(&pwq->llink);
 		remove_wait_queue(pwq->whead, &pwq->wait);
 		kmem_cache_free(pwq_cache, pwq);
 	}
@@ -1213,7 +1189,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi)
 	 * we want to remove it from this list to avoid stale events.
 	 */
 	if (ep_is_linked(&epi->rdllink))
-		ep_list_del(&epi->rdllink);
+		list_del_init(&epi->rdllink);
 
 	error = 0;
 eexit_1:
@@ -1226,7 +1202,7 @@ eexit_1:
 
 
 /*
- * Removes a "struct epitem" from the eventpoll hash and deallocates
+ * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources.
  */
 static int ep_remove(struct eventpoll *ep, struct epitem *epi)
@@ -1248,13 +1224,13 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 	/* Remove the current item from the list of epoll hooks */
 	spin_lock(&file->f_ep_lock);
 	if (ep_is_linked(&epi->fllink))
-		ep_list_del(&epi->fllink);
+		list_del_init(&epi->fllink);
 	spin_unlock(&file->f_ep_lock);
 
 	/* We need to acquire the write IRQ lock before calling ep_unlink() */
 	write_lock_irqsave(&ep->lock, flags);
 
-	/* Really unlink the item from the hash */
+	/* Really unlink the item from the RB tree */
 	error = ep_unlink(ep, epi);
 
 	write_unlock_irqrestore(&ep->lock, flags);
@@ -1362,71 +1338,30 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 
 
 /*
- * Since we have to release the lock during the __copy_to_user() operation and
- * during the f_op->poll() call, we try to collect the maximum number of items
- * by reducing the irqlock/irqunlock switching rate.
- */
-static int ep_collect_ready_items(struct eventpoll *ep, struct list_head *txlist, int maxevents)
-{
-	int nepi;
-	unsigned long flags;
-	struct list_head *lsthead = &ep->rdllist, *lnk;
-	struct epitem *epi;
-
-	write_lock_irqsave(&ep->lock, flags);
-
-	for (nepi = 0, lnk = lsthead->next; lnk != lsthead && nepi < maxevents;) {
-		epi = list_entry(lnk, struct epitem, rdllink);
-
-		lnk = lnk->next;
-
-		/* If this file is already in the ready list we exit soon */
-		if (!ep_is_linked(&epi->txlink)) {
-			/*
-			 * This is initialized in this way so that the default
-			 * behaviour of the reinjecting code will be to push back
-			 * the item inside the ready list.
-			 */
-			epi->revents = epi->event.events;
-
-			/* Link the ready item into the transfer list */
-			list_add(&epi->txlink, txlist);
-			nepi++;
-
-			/*
-			 * Unlink the item from the ready list.
-			 */
-			ep_list_del(&epi->rdllink);
-		}
-	}
-
-	write_unlock_irqrestore(&ep->lock, flags);
-
-	return nepi;
-}
-
-
-/*
  * This function is called without holding the "ep->lock" since the call to
  * __copy_to_user() might sleep, and also f_op->poll() might reenable the IRQ
  * because of the way poll() is traditionally implemented in Linux.
  */
 static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
-			  struct epoll_event __user *events)
+			  struct epoll_event __user *events, int maxevents)
 {
-	int eventcnt = 0;
+	int eventcnt, error = -EFAULT, pwake = 0;
 	unsigned int revents;
-	struct list_head *lnk;
+	unsigned long flags;
 	struct epitem *epi;
+	struct list_head injlist;
+
+	INIT_LIST_HEAD(&injlist);
 
 	/*
 	 * We can loop without lock because this is a task private list.
-	 * The test done during the collection loop will guarantee us that
-	 * another task will not try to collect this file. Also, items
-	 * cannot vanish during the loop because we are holding "sem".
+	 * We just splice'd out the ep->rdllist in ep_collect_ready_items().
+	 * Items cannot vanish during the loop because we are holding "sem" in
+	 * read.
 	 */
-	list_for_each(lnk, txlist) {
-		epi = list_entry(lnk, struct epitem, txlink);
+	for (eventcnt = 0; !list_empty(txlist) && eventcnt < maxevents;) {
+		epi = list_entry(txlist->next, struct epitem, rdllink);
+		prefetch(epi->rdllink.next);
 
 		/*
 		 * Get the ready file event set. We can safely use the file
@@ -1434,64 +1369,65 @@ static int ep_send_events(struct eventpoll *ep, struct list_head *txlist,
 		 * guarantee that both the file and the item will not vanish.
 		 */
 		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+		revents &= epi->event.events;
 
 		/*
-		 * Set the return event set for the current file descriptor.
-		 * Note that only the task task was successfully able to link
-		 * the item to its "txlist" will write this field.
+		 * Is the event mask intersect the caller-requested one,
+		 * deliver the event to userspace. Again, we are holding
+		 * "sem" in read, so no operations coming from userspace
+		 * can change the item.
 		 */
-		epi->revents = revents & epi->event.events;
-
-		if (epi->revents) {
-			if (__put_user(epi->revents,
+		if (revents) {
+			if (__put_user(revents,
 			    &events[eventcnt].events) ||
 			    __put_user(epi->event.data,
 			    &events[eventcnt].data))
-				return -EFAULT;
+				goto errxit;
 			if (epi->event.events & EPOLLONESHOT)
 				epi->event.events &= EP_PRIVATE_BITS;
 			eventcnt++;
 		}
-	}
-	return eventcnt;
-}
-
-
-/*
- * Walk through the transfer list we collected with ep_collect_ready_items()
- * and, if 1) the item is still "alive" 2) its event set is not empty 3) it's
- * not already linked, links it to the ready list. Same as above, we are holding
- * "sem" so items cannot vanish underneath our nose.
- */
-static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
-{
-	int ricnt = 0, pwake = 0;
-	unsigned long flags;
-	struct epitem *epi;
-
-	write_lock_irqsave(&ep->lock, flags);
-
-	while (!list_empty(txlist)) {
-		epi = list_entry(txlist->next, struct epitem, txlink);
-
-		/* Unlink the current item from the transfer list */
-		ep_list_del(&epi->txlink);
 
 		/*
-		 * If the item is no more linked to the interest set, we don't
-		 * have to push it inside the ready list because the following
-		 * ep_release_epitem() is going to drop it. Also, if the current
-		 * item is set to have an Edge Triggered behaviour, we don't have
-		 * to push it back either.
+		 * This is tricky. We are holding the "sem" in read, and this
+		 * means that the operations that can change the "linked" status
+		 * of the epoll item (epi->rbn and epi->rdllink), cannot touch
+		 * them. Also, since we are "linked" from a epi->rdllink POV
+		 * (the item is linked to our transmission list we just
+		 * spliced), the ep_poll_callback() cannot touch us either,
+		 * because of the check present in there. Another parallel
+		 * epoll_wait() will not get the same result set, since we
+		 * spliced the ready list before. Note that list_del() still
+		 * shows the item as linked to the test in ep_poll_callback().
 		 */
-		if (ep_rb_linked(&epi->rbn) && !(epi->event.events & EPOLLET) &&
-		    (epi->revents & epi->event.events) && !ep_is_linked(&epi->rdllink)) {
-			list_add_tail(&epi->rdllink, &ep->rdllist);
-			ricnt++;
+		list_del(&epi->rdllink);
+		if (!(epi->event.events & EPOLLET) &&
+		    (revents & epi->event.events))
+			list_add_tail(&epi->rdllink, &injlist);
+		else {
+			/*
+			 * Be sure the item is totally detached before re-init
+			 * the list_head. After INIT_LIST_HEAD() is committed,
+			 * the ep_poll_callback() can requeue the item again,
+			 * but we don't care since we are already past it.
+			 */
+			smp_mb();
+			INIT_LIST_HEAD(&epi->rdllink);
 		}
 	}
+	error = 0;
 
-	if (ricnt) {
+	errxit:
+
+	/*
+	 * If the re-injection list or the txlist are not empty, re-splice
+	 * them to the ready list and do proper wakeups.
+	 */
+	if (!list_empty(&injlist) || !list_empty(txlist)) {
+		write_lock_irqsave(&ep->lock, flags);
+
+		list_splice(txlist, &ep->rdllist);
+		list_splice(&injlist, &ep->rdllist);
 		/*
 		 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 		 * wait list.
@@ -1501,13 +1437,15 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 				 TASK_INTERRUPTIBLE);
 		if (waitqueue_active(&ep->poll_wait))
 			pwake++;
-	}
 
 	write_unlock_irqrestore(&ep->lock, flags);
+	}
 
 	/* We have to call this outside the lock */
 	if (pwake)
 		ep_poll_safewake(&psw, &ep->poll_wait);
+
+	return eventcnt == 0 ? error: eventcnt;
 }
 
 
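The delivery loop above is now also where the level-triggered versus edge-triggered split is decided: items without EPOLLET whose mask still matches are queued on the local "injlist" and re-spliced onto the ready list, while EPOLLET items stay detached until the next wakeup. A hedged userspace sketch of the user-visible contract this implements (pipe-based, error handling trimmed; the printed counts are the expected results, not guaranteed output on every kernel):

	/* Level-triggered fds are reported on every epoll_wait() while data
	 * is pending (re-injection); EPOLLET fds are reported once per
	 * readiness edge. EPOLL_CTL_MOD re-arms the item. */
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/epoll.h>

	int main(void)
	{
		int epfd = epoll_create(1); /* size hint, ignored by modern kernels */
		int pfd[2];
		struct epoll_event ev = { .events = EPOLLIN }, out;

		pipe(pfd);
		write(pfd[1], "x", 1);      /* make the read end ready */

		ev.data.fd = pfd[0];
		epoll_ctl(epfd, EPOLL_CTL_ADD, pfd[0], &ev);

		/* Level-triggered: reported again and again until drained. */
		printf("LT: %d\n", epoll_wait(epfd, &out, 1, 0)); /* expect 1 */
		printf("LT: %d\n", epoll_wait(epfd, &out, 1, 0)); /* expect 1 again */

		/* Edge-triggered: one report per readiness edge. */
		ev.events = EPOLLIN | EPOLLET;
		epoll_ctl(epfd, EPOLL_CTL_MOD, pfd[0], &ev);

		printf("ET: %d\n", epoll_wait(epfd, &out, 1, 0)); /* expect 1 (re-arm) */
		printf("ET: %d\n", epoll_wait(epfd, &out, 1, 0)); /* expect 0, no new edge */
		return 0;
	}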
@@ -1517,7 +1455,8 @@ static void ep_reinject_items(struct eventpoll *ep, struct list_head *txlist)
 static int ep_events_transfer(struct eventpoll *ep,
 			      struct epoll_event __user *events, int maxevents)
 {
-	int eventcnt = 0;
+	int eventcnt;
+	unsigned long flags;
 	struct list_head txlist;
 
 	INIT_LIST_HEAD(&txlist);
@@ -1528,14 +1467,17 @@ static int ep_events_transfer(struct eventpoll *ep,
 	 */
 	down_read(&ep->sem);
 
-	/* Collect/extract ready items */
-	if (ep_collect_ready_items(ep, &txlist, maxevents) > 0) {
-		/* Build result set in userspace */
-		eventcnt = ep_send_events(ep, &txlist, events);
+	/*
+	 * Steal the ready list, and re-init the original one to the
+	 * empty list.
+	 */
+	write_lock_irqsave(&ep->lock, flags);
+	list_splice(&ep->rdllist, &txlist);
+	INIT_LIST_HEAD(&ep->rdllist);
+	write_unlock_irqrestore(&ep->lock, flags);
 
-		/* Reinject ready items into the ready list */
-		ep_reinject_items(ep, &txlist);
-	}
+	/* Build result set in userspace */
+	eventcnt = ep_send_events(ep, &txlist, events, maxevents);
 
 	up_read(&ep->sem);
 
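The splice here replaces the old per-item collection loop: the IRQ lock is held only long enough to steal the entire ready list onto the stack-local "txlist", and the potentially sleeping per-item work happens with the lock dropped. A runnable userspace sketch of the same steal-then-process pattern (hypothetical producer/consumer names, a pthread mutex standing in for ep->lock):

	/* Steal the whole shared list under the lock, as the patch does
	 * with list_splice() + INIT_LIST_HEAD() under ep->lock, then
	 * process the stolen items lock-free. */
	#include <pthread.h>
	#include <stdio.h>

	struct node { struct node *next; int val; };

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static struct node *pending; /* shared, lock-protected */

	static void produce(struct node *n)
	{
		pthread_mutex_lock(&lock);
		n->next = pending;
		pending = n;
		pthread_mutex_unlock(&lock);
	}

	static void consume(void)
	{
		struct node *stolen;

		/* Brief critical section: take everything, re-init the head. */
		pthread_mutex_lock(&lock);
		stolen = pending;
		pending = NULL;
		pthread_mutex_unlock(&lock);

		/* Slow per-item work runs unlocked, the way ep_send_events()
		 * may sleep in __put_user(). */
		for (; stolen; stolen = stolen->next)
			printf("event %d\n", stolen->val);
	}

	int main(void)
	{
		struct node a = { .val = 1 }, b = { .val = 2 };

		produce(&a);
		produce(&b);
		consume();
		return 0;
	}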
@@ -1612,14 +1554,12 @@ retry:
 	return res;
 }
 
-
 static int eventpollfs_delete_dentry(struct dentry *dentry)
 {
 
 	return 1;
 }
 
-
 static struct inode *ep_eventpoll_inode(void)
 {
 	int error = -ENOMEM;
@@ -1647,7 +1587,6 @@ eexit_1:
 	return ERR_PTR(error);
 }
 
-
 static int
 eventpollfs_get_sb(struct file_system_type *fs_type, int flags,
 		   const char *dev_name, void *data, struct vfsmount *mnt)