diff options
-rw-r--r-- | fs/eventpoll.c | 234 | ||||
-rw-r--r-- | include/linux/eventpoll.h | 1 | ||||
-rw-r--r-- | include/linux/fs.h | 1 |
3 files changed, 211 insertions, 25 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a39cf4cfb9d..6879d0c5cb5 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c | |||
@@ -197,6 +197,12 @@ struct eventpoll { | |||
197 | 197 | ||
198 | /* The user that created the eventpoll descriptor */ | 198 | /* The user that created the eventpoll descriptor */ |
199 | struct user_struct *user; | 199 | struct user_struct *user; |
200 | |||
201 | struct file *file; | ||
202 | |||
203 | /* used to optimize loop detection check */ | ||
204 | int visited; | ||
205 | struct list_head visited_list_link; | ||
200 | }; | 206 | }; |
201 | 207 | ||
202 | /* Wait structure used by the poll hooks */ | 208 | /* Wait structure used by the poll hooks */ |
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly; | |||
255 | /* Slab cache used to allocate "struct eppoll_entry" */ | 261 | /* Slab cache used to allocate "struct eppoll_entry" */ |
256 | static struct kmem_cache *pwq_cache __read_mostly; | 262 | static struct kmem_cache *pwq_cache __read_mostly; |
257 | 263 | ||
264 | /* Visited nodes during ep_loop_check(), so we can unset them when we finish */ | ||
265 | static LIST_HEAD(visited_list); | ||
266 | |||
267 | /* | ||
268 | * List of files with newly added links, where we may need to limit the number | ||
269 | * of emanating paths. Protected by the epmutex. | ||
270 | */ | ||
271 | static LIST_HEAD(tfile_check_list); | ||
272 | |||
258 | #ifdef CONFIG_SYSCTL | 273 | #ifdef CONFIG_SYSCTL |
259 | 274 | ||
260 | #include <linux/sysctl.h> | 275 | #include <linux/sysctl.h> |
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = { | |||
276 | }; | 291 | }; |
277 | #endif /* CONFIG_SYSCTL */ | 292 | #endif /* CONFIG_SYSCTL */ |
278 | 293 | ||
294 | static const struct file_operations eventpoll_fops; | ||
295 | |||
/*
 * Fast test to see if the file is an eventpoll file.
 *
 * Forward-declared eventpoll_fops is defined later in this file; a file
 * created by epoll_create*() always carries exactly that f_op table.
 */
static inline int is_file_epoll(struct file *f)
{
	return f->f_op == &eventpoll_fops;
}
279 | 300 | ||
280 | /* Setup the structure that is used as key for the RB tree */ | 301 | /* Setup the structure that is used as key for the RB tree */ |
281 | static inline void ep_set_ffd(struct epoll_filefd *ffd, | 302 | static inline void ep_set_ffd(struct epoll_filefd *ffd, |
@@ -728,12 +749,6 @@ static const struct file_operations eventpoll_fops = { | |||
728 | .llseek = noop_llseek, | 749 | .llseek = noop_llseek, |
729 | }; | 750 | }; |
730 | 751 | ||
731 | /* Fast test to see if the file is an evenpoll file */ | ||
732 | static inline int is_file_epoll(struct file *f) | ||
733 | { | ||
734 | return f->f_op == &eventpoll_fops; | ||
735 | } | ||
736 | |||
737 | /* | 752 | /* |
738 | * This is called from eventpoll_release() to unlink files from the eventpoll | 753 | * This is called from eventpoll_release() to unlink files from the eventpoll |
739 | * interface. We need to have this facility to cleanup correctly files that are | 754 | * interface. We need to have this facility to cleanup correctly files that are |
@@ -954,6 +969,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) | |||
954 | rb_insert_color(&epi->rbn, &ep->rbr); | 969 | rb_insert_color(&epi->rbn, &ep->rbr); |
955 | } | 970 | } |
956 | 971 | ||
972 | |||
973 | |||
#define PATH_ARR_SIZE 5
/*
 * These are the number of paths of length 1 to 5, that we are allowing to
 * emanate from a single file of interest. For example, we allow 1000 paths
 * of length 1, to emanate from each file of interest. This essentially
 * represents the potential wakeup paths, which need to be limited in order
 * to avoid massive uncontrolled wakeup storms. The common use case should
 * be a single ep which is connected to n file sources. In this case each
 * file source has 1 path of length 1. Thus, the numbers below should be
 * more than sufficient. These path limits are enforced during an
 * EPOLL_CTL_ADD operation, since a modify and delete can't add additional
 * paths. Protected by the epmutex.
 */
static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 };
static int path_count[PATH_ARR_SIZE];

/*
 * Account one more path at the given nesting depth; returns -1 once the
 * per-depth limit is exceeded, 0 otherwise.
 */
static int path_count_inc(int nests)
{
	return (++path_count[nests] > path_limits[nests]) ? -1 : 0;
}

/* Reset all per-depth path counters before checking a new file. */
static void path_count_init(void)
{
	int depth;

	for (depth = 0; depth < PATH_ARR_SIZE; depth++)
		path_count[depth] = 0;
}
1003 | |||
/*
 * reverse_path_check_proc - walk one level of the reverse (wakeup) path
 * graph: @priv is a file of interest, and each epitem on its f_ep_links
 * list identifies an eventpoll that is watching it.  Paths ending at an
 * unwatched eventpoll are counted at the current depth via
 * path_count_inc(); watched eventpolls are recursed into through
 * ep_call_nested(), which bounds the depth at EP_MAX_NESTS.
 *
 * Returns 0 while the per-depth path limits hold, -1 (or the nested
 * error) once a limit is exceeded.
 */
static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
{
	int error = 0;
	struct file *file = priv;
	struct file *child_file;
	struct epitem *epi;

	list_for_each_entry(epi, &file->f_ep_links, fllink) {
		child_file = epi->ep->file;
		if (is_file_epoll(child_file)) {
			if (list_empty(&child_file->f_ep_links)) {
				/*
				 * Leaf: nothing watches this eventpoll, so a
				 * complete wakeup path ends here; account for
				 * it at the current nesting depth.
				 */
				if (path_count_inc(call_nests)) {
					error = -1;
					break;
				}
			} else {
				/* Interior node: follow the watchers upward. */
				error = ep_call_nested(&poll_loop_ncalls,
							EP_MAX_NESTS,
							reverse_path_check_proc,
							child_file, child_file,
							current);
			}
			if (error != 0)
				break;
		} else {
			/*
			 * epi->ep->file should always be an eventpoll file;
			 * anything else indicates inconsistent state, so log
			 * and skip it.
			 */
			printk(KERN_ERR "reverse_path_check_proc: "
			       "file is not an ep!\n");
		}
	}
	return error;
}
1035 | |||
/**
 * reverse_path_check - The tfile_check_list is list of file *, which have
 *                      links that are proposed to be newly added. We need to
 *                      make sure that those added links don't add too many
 *                      paths such that we will spend all our time waking up
 *                      eventpoll objects.
 *
 * Returns: Returns zero if the proposed links don't create too many paths,
 *          -1 otherwise.
 */
static int reverse_path_check(void)
{
	/* NOTE(review): 'length' is counted but never read — candidate for
	 * removal. */
	int length = 0;
	int error = 0;
	struct file *current_file;

	/* let's call this for all tfiles */
	list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) {
		length++;
		/* Path counters are per-file: reset before each traversal. */
		path_count_init();
		error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS,
					reverse_path_check_proc, current_file,
					current_file, current);
		if (error)
			break;
	}
	return error;
}
1064 | |||
957 | /* | 1065 | /* |
958 | * Must be called with "mtx" held. | 1066 | * Must be called with "mtx" held. |
959 | */ | 1067 | */ |
@@ -1015,6 +1123,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, | |||
1015 | */ | 1123 | */ |
1016 | ep_rbtree_insert(ep, epi); | 1124 | ep_rbtree_insert(ep, epi); |
1017 | 1125 | ||
1126 | /* now check if we've created too many backpaths */ | ||
1127 | error = -EINVAL; | ||
1128 | if (reverse_path_check()) | ||
1129 | goto error_remove_epi; | ||
1130 | |||
1018 | /* We have to drop the new item inside our item list to keep track of it */ | 1131 | /* We have to drop the new item inside our item list to keep track of it */ |
1019 | spin_lock_irqsave(&ep->lock, flags); | 1132 | spin_lock_irqsave(&ep->lock, flags); |
1020 | 1133 | ||
@@ -1039,6 +1152,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, | |||
1039 | 1152 | ||
1040 | return 0; | 1153 | return 0; |
1041 | 1154 | ||
1155 | error_remove_epi: | ||
1156 | spin_lock(&tfile->f_lock); | ||
1157 | if (ep_is_linked(&epi->fllink)) | ||
1158 | list_del_init(&epi->fllink); | ||
1159 | spin_unlock(&tfile->f_lock); | ||
1160 | |||
1161 | rb_erase(&epi->rbn, &ep->rbr); | ||
1162 | |||
1042 | error_unregister: | 1163 | error_unregister: |
1043 | ep_unregister_pollwait(ep, epi); | 1164 | ep_unregister_pollwait(ep, epi); |
1044 | 1165 | ||
@@ -1303,18 +1424,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) | |||
1303 | int error = 0; | 1424 | int error = 0; |
1304 | struct file *file = priv; | 1425 | struct file *file = priv; |
1305 | struct eventpoll *ep = file->private_data; | 1426 | struct eventpoll *ep = file->private_data; |
1427 | struct eventpoll *ep_tovisit; | ||
1306 | struct rb_node *rbp; | 1428 | struct rb_node *rbp; |
1307 | struct epitem *epi; | 1429 | struct epitem *epi; |
1308 | 1430 | ||
1309 | mutex_lock_nested(&ep->mtx, call_nests + 1); | 1431 | mutex_lock_nested(&ep->mtx, call_nests + 1); |
1432 | ep->visited = 1; | ||
1433 | list_add(&ep->visited_list_link, &visited_list); | ||
1310 | for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { | 1434 | for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { |
1311 | epi = rb_entry(rbp, struct epitem, rbn); | 1435 | epi = rb_entry(rbp, struct epitem, rbn); |
1312 | if (unlikely(is_file_epoll(epi->ffd.file))) { | 1436 | if (unlikely(is_file_epoll(epi->ffd.file))) { |
1437 | ep_tovisit = epi->ffd.file->private_data; | ||
1438 | if (ep_tovisit->visited) | ||
1439 | continue; | ||
1313 | error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | 1440 | error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, |
1314 | ep_loop_check_proc, epi->ffd.file, | 1441 | ep_loop_check_proc, epi->ffd.file, |
1315 | epi->ffd.file->private_data, current); | 1442 | ep_tovisit, current); |
1316 | if (error != 0) | 1443 | if (error != 0) |
1317 | break; | 1444 | break; |
1445 | } else { | ||
1446 | /* | ||
1447 | * If we've reached a file that is not associated with | ||
1448 | * an ep, then we need to check if the newly added | ||
1449 | * links are going to add too many wakeup paths. We do | ||
1450 | * this by adding it to the tfile_check_list, if it's | ||
1451 | * not already there, and calling reverse_path_check() | ||
1452 | * during ep_insert(). | ||
1453 | */ | ||
1454 | if (list_empty(&epi->ffd.file->f_tfile_llink)) | ||
1455 | list_add(&epi->ffd.file->f_tfile_llink, | ||
1456 | &tfile_check_list); | ||
1318 | } | 1457 | } |
1319 | } | 1458 | } |
1320 | mutex_unlock(&ep->mtx); | 1459 | mutex_unlock(&ep->mtx); |
@@ -1335,8 +1474,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) | |||
1335 | */ | 1474 | */ |
1336 | static int ep_loop_check(struct eventpoll *ep, struct file *file) | 1475 | static int ep_loop_check(struct eventpoll *ep, struct file *file) |
1337 | { | 1476 | { |
1338 | return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | 1477 | int ret; |
1478 | struct eventpoll *ep_cur, *ep_next; | ||
1479 | |||
1480 | ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | ||
1339 | ep_loop_check_proc, file, ep, current); | 1481 | ep_loop_check_proc, file, ep, current); |
1482 | /* clear visited list */ | ||
1483 | list_for_each_entry_safe(ep_cur, ep_next, &visited_list, | ||
1484 | visited_list_link) { | ||
1485 | ep_cur->visited = 0; | ||
1486 | list_del(&ep_cur->visited_list_link); | ||
1487 | } | ||
1488 | return ret; | ||
1489 | } | ||
1490 | |||
1491 | static void clear_tfile_check_list(void) | ||
1492 | { | ||
1493 | struct file *file; | ||
1494 | |||
1495 | /* first clear the tfile_check_list */ | ||
1496 | while (!list_empty(&tfile_check_list)) { | ||
1497 | file = list_first_entry(&tfile_check_list, struct file, | ||
1498 | f_tfile_llink); | ||
1499 | list_del_init(&file->f_tfile_llink); | ||
1500 | } | ||
1501 | INIT_LIST_HEAD(&tfile_check_list); | ||
1340 | } | 1502 | } |
1341 | 1503 | ||
1342 | /* | 1504 | /* |
@@ -1344,8 +1506,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) | |||
1344 | */ | 1506 | */ |
1345 | SYSCALL_DEFINE1(epoll_create1, int, flags) | 1507 | SYSCALL_DEFINE1(epoll_create1, int, flags) |
1346 | { | 1508 | { |
1347 | int error; | 1509 | int error, fd; |
1348 | struct eventpoll *ep = NULL; | 1510 | struct eventpoll *ep = NULL; |
1511 | struct file *file; | ||
1349 | 1512 | ||
1350 | /* Check the EPOLL_* constant for consistency. */ | 1513 | /* Check the EPOLL_* constant for consistency. */ |
1351 | BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); | 1514 | BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); |
@@ -1362,11 +1525,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) | |||
1362 | * Creates all the items needed to setup an eventpoll file. That is, | 1525 | * Creates all the items needed to setup an eventpoll file. That is, |
1363 | * a file structure and a free file descriptor. | 1526 | * a file structure and a free file descriptor. |
1364 | */ | 1527 | */ |
1365 | error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, | 1528 | fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); |
1529 | if (fd < 0) { | ||
1530 | error = fd; | ||
1531 | goto out_free_ep; | ||
1532 | } | ||
1533 | file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, | ||
1366 | O_RDWR | (flags & O_CLOEXEC)); | 1534 | O_RDWR | (flags & O_CLOEXEC)); |
1367 | if (error < 0) | 1535 | if (IS_ERR(file)) { |
1368 | ep_free(ep); | 1536 | error = PTR_ERR(file); |
1369 | 1537 | goto out_free_fd; | |
1538 | } | ||
1539 | fd_install(fd, file); | ||
1540 | ep->file = file; | ||
1541 | return fd; | ||
1542 | |||
1543 | out_free_fd: | ||
1544 | put_unused_fd(fd); | ||
1545 | out_free_ep: | ||
1546 | ep_free(ep); | ||
1370 | return error; | 1547 | return error; |
1371 | } | 1548 | } |
1372 | 1549 | ||
@@ -1432,21 +1609,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
1432 | /* | 1609 | /* |
1433 | * When we insert an epoll file descriptor, inside another epoll file | 1610 | * When we insert an epoll file descriptor, inside another epoll file |
1434 | * descriptor, there is the chance of creating closed loops, which are | 1612 | * checking for loops we also determine the list of files reachable |
1435 | * better be handled here, than in more critical paths. | 1612 | * better be handled here, than in more critical paths. While we are |
1613 | * checking for loops we also determine the list of files reachable | ||
1614 | * and hang them on the tfile_check_list, so we can check that we | ||
1615 | * haven't created too many possible wakeup paths. | ||
1436 | * | 1616 | * |
1437 | * We hold epmutex across the loop check and the insert in this case, in | 1617 | * We need to hold the epmutex across both ep_insert and ep_remove |
1438 | * order to prevent two separate inserts from racing and each doing the | 1618 | * b/c we want to make sure we are looking at a coherent view of |
1439 | * insert "at the same time" such that ep_loop_check passes on both | 1619 | * epoll network. |
1440 | * before either one does the insert, thereby creating a cycle. | ||
1441 | */ | 1620 | */ |
1442 | if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { | 1621 | if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { |
1443 | mutex_lock(&epmutex); | 1622 | mutex_lock(&epmutex); |
1444 | did_lock_epmutex = 1; | 1623 | did_lock_epmutex = 1; |
1445 | error = -ELOOP; | ||
1446 | if (ep_loop_check(ep, tfile) != 0) | ||
1447 | goto error_tgt_fput; | ||
1448 | } | 1624 | } |
1449 | 1625 | if (op == EPOLL_CTL_ADD) { | |
1626 | if (is_file_epoll(tfile)) { | ||
1627 | error = -ELOOP; | ||
1628 | if (ep_loop_check(ep, tfile) != 0) | ||
1629 | goto error_tgt_fput; | ||
1630 | } else | ||
1631 | list_add(&tfile->f_tfile_llink, &tfile_check_list); | ||
1632 | } | ||
1450 | 1633 | ||
1451 | mutex_lock_nested(&ep->mtx, 0); | 1634 | mutex_lock_nested(&ep->mtx, 0); |
1452 | 1635 | ||
@@ -1465,6 +1648,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
1465 | error = ep_insert(ep, &epds, tfile, fd); | 1648 | error = ep_insert(ep, &epds, tfile, fd); |
1466 | } else | 1649 | } else |
1467 | error = -EEXIST; | 1650 | error = -EEXIST; |
1651 | clear_tfile_check_list(); | ||
1468 | break; | 1652 | break; |
1469 | case EPOLL_CTL_DEL: | 1653 | case EPOLL_CTL_DEL: |
1470 | if (epi) | 1654 | if (epi) |
@@ -1483,7 +1667,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
1483 | mutex_unlock(&ep->mtx); | 1667 | mutex_unlock(&ep->mtx); |
1484 | 1668 | ||
1485 | error_tgt_fput: | 1669 | error_tgt_fput: |
1486 | if (unlikely(did_lock_epmutex)) | 1670 | if (did_lock_epmutex) |
1487 | mutex_unlock(&epmutex); | 1671 | mutex_unlock(&epmutex); |
1488 | 1672 | ||
1489 | fput(tfile); | 1673 | fput(tfile); |
diff --git a/include/linux/eventpoll.h b/include/linux/eventpoll.h index f362733186a..657ab55beda 100644 --- a/include/linux/eventpoll.h +++ b/include/linux/eventpoll.h | |||
@@ -61,6 +61,7 @@ struct file; | |||
/*
 * Initialize the epoll bookkeeping lists embedded in a newly created file:
 * f_ep_links (the epitems watching this file) and f_tfile_llink (temporary
 * membership on fs/eventpoll.c's tfile_check_list).
 */
static inline void eventpoll_init_file(struct file *file)
{
	INIT_LIST_HEAD(&file->f_ep_links);
	INIT_LIST_HEAD(&file->f_tfile_llink);
}
65 | 66 | ||
66 | 67 | ||
diff --git a/include/linux/fs.h b/include/linux/fs.h index 7b17db7c5a6..d8ecb015ff8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -969,6 +969,7 @@ struct file { | |||
969 | #ifdef CONFIG_EPOLL | 969 | #ifdef CONFIG_EPOLL |
970 | /* Used by fs/eventpoll.c to link all the hooks to this file */ | 970 | /* Used by fs/eventpoll.c to link all the hooks to this file */ |
971 | struct list_head f_ep_links; | 971 | struct list_head f_ep_links; |
972 | struct list_head f_tfile_llink; | ||
972 | #endif /* #ifdef CONFIG_EPOLL */ | 973 | #endif /* #ifdef CONFIG_EPOLL */ |
973 | struct address_space *f_mapping; | 974 | struct address_space *f_mapping; |
974 | #ifdef CONFIG_DEBUG_WRITECOUNT | 975 | #ifdef CONFIG_DEBUG_WRITECOUNT |