diff options
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r-- | fs/eventpoll.c | 234 |
1 files changed, 209 insertions, 25 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 828e750af23a..aabdfc38cf24 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c | |||
@@ -197,6 +197,12 @@ struct eventpoll { | |||
197 | 197 | ||
198 | /* The user that created the eventpoll descriptor */ | 198 | /* The user that created the eventpoll descriptor */ |
199 | struct user_struct *user; | 199 | struct user_struct *user; |
200 | |||
201 | struct file *file; | ||
202 | |||
203 | /* used to optimize loop detection check */ | ||
204 | int visited; | ||
205 | struct list_head visited_list_link; | ||
200 | }; | 206 | }; |
201 | 207 | ||
202 | /* Wait structure used by the poll hooks */ | 208 | /* Wait structure used by the poll hooks */ |
@@ -255,6 +261,15 @@ static struct kmem_cache *epi_cache __read_mostly; | |||
255 | /* Slab cache used to allocate "struct eppoll_entry" */ | 261 | /* Slab cache used to allocate "struct eppoll_entry" */ |
256 | static struct kmem_cache *pwq_cache __read_mostly; | 262 | static struct kmem_cache *pwq_cache __read_mostly; |
257 | 263 | ||
264 | /* Visited nodes during ep_loop_check(), so we can unset them when we finish */ | ||
265 | static LIST_HEAD(visited_list); | ||
266 | |||
267 | /* | ||
268 | * List of files with newly added links, where we may need to limit the number | ||
269 | * of emanating paths. Protected by the epmutex. | ||
270 | */ | ||
271 | static LIST_HEAD(tfile_check_list); | ||
272 | |||
258 | #ifdef CONFIG_SYSCTL | 273 | #ifdef CONFIG_SYSCTL |
259 | 274 | ||
260 | #include <linux/sysctl.h> | 275 | #include <linux/sysctl.h> |
@@ -276,6 +291,12 @@ ctl_table epoll_table[] = { | |||
276 | }; | 291 | }; |
277 | #endif /* CONFIG_SYSCTL */ | 292 | #endif /* CONFIG_SYSCTL */ |
278 | 293 | ||
294 | static const struct file_operations eventpoll_fops; | ||
295 | |||
296 | static inline int is_file_epoll(struct file *f) | ||
297 | { | ||
298 | return f->f_op == &eventpoll_fops; | ||
299 | } | ||
279 | 300 | ||
280 | /* Setup the structure that is used as key for the RB tree */ | 301 | /* Setup the structure that is used as key for the RB tree */ |
281 | static inline void ep_set_ffd(struct epoll_filefd *ffd, | 302 | static inline void ep_set_ffd(struct epoll_filefd *ffd, |
@@ -711,12 +732,6 @@ static const struct file_operations eventpoll_fops = { | |||
711 | .llseek = noop_llseek, | 732 | .llseek = noop_llseek, |
712 | }; | 733 | }; |
713 | 734 | ||
714 | /* Fast test to see if the file is an eventpoll file */ | ||
715 | static inline int is_file_epoll(struct file *f) | ||
716 | { | ||
717 | return f->f_op == &eventpoll_fops; | ||
718 | } | ||
719 | |||
720 | /* | 735 | /* |
721 | * This is called from eventpoll_release() to unlink files from the eventpoll | 736 | * This is called from eventpoll_release() to unlink files from the eventpoll |
722 | * interface. We need to have this facility to cleanup correctly files that are | 737 | * interface. We need to have this facility to cleanup correctly files that are |
@@ -926,6 +941,99 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) | |||
926 | rb_insert_color(&epi->rbn, &ep->rbr); | 941 | rb_insert_color(&epi->rbn, &ep->rbr); |
927 | } | 942 | } |
928 | 943 | ||
944 | |||
945 | |||
946 | #define PATH_ARR_SIZE 5 | ||
947 | /* | ||
948 | * These are the number paths of length 1 to 5, that we are allowing to emanate | ||
949 | * from a single file of interest. For example, we allow 1000 paths of length | ||
950 | * 1, to emanate from each file of interest. This essentially represents the | ||
951 | * potential wakeup paths, which need to be limited in order to avoid massive | ||
952 | * uncontrolled wakeup storms. The common use case should be a single ep which | ||
953 | * is connected to n file sources. In this case each file source has 1 path | ||
954 | * of length 1. Thus, the numbers below should be more than sufficient. These | ||
955 | * path limits are enforced during an EPOLL_CTL_ADD operation, since a modify | ||
956 | * and delete can't add additional paths. Protected by the epmutex. | ||
957 | */ | ||
958 | static const int path_limits[PATH_ARR_SIZE] = { 1000, 500, 100, 50, 10 }; | ||
959 | static int path_count[PATH_ARR_SIZE]; | ||
960 | |||
961 | static int path_count_inc(int nests) | ||
962 | { | ||
963 | if (++path_count[nests] > path_limits[nests]) | ||
964 | return -1; | ||
965 | return 0; | ||
966 | } | ||
967 | |||
968 | static void path_count_init(void) | ||
969 | { | ||
970 | int i; | ||
971 | |||
972 | for (i = 0; i < PATH_ARR_SIZE; i++) | ||
973 | path_count[i] = 0; | ||
974 | } | ||
975 | |||
976 | static int reverse_path_check_proc(void *priv, void *cookie, int call_nests) | ||
977 | { | ||
978 | int error = 0; | ||
979 | struct file *file = priv; | ||
980 | struct file *child_file; | ||
981 | struct epitem *epi; | ||
982 | |||
983 | list_for_each_entry(epi, &file->f_ep_links, fllink) { | ||
984 | child_file = epi->ep->file; | ||
985 | if (is_file_epoll(child_file)) { | ||
986 | if (list_empty(&child_file->f_ep_links)) { | ||
987 | if (path_count_inc(call_nests)) { | ||
988 | error = -1; | ||
989 | break; | ||
990 | } | ||
991 | } else { | ||
992 | error = ep_call_nested(&poll_loop_ncalls, | ||
993 | EP_MAX_NESTS, | ||
994 | reverse_path_check_proc, | ||
995 | child_file, child_file, | ||
996 | current); | ||
997 | } | ||
998 | if (error != 0) | ||
999 | break; | ||
1000 | } else { | ||
1001 | printk(KERN_ERR "reverse_path_check_proc: " | ||
1002 | "file is not an ep!\n"); | ||
1003 | } | ||
1004 | } | ||
1005 | return error; | ||
1006 | } | ||
1007 | |||
1008 | /** | ||
1009 | * reverse_path_check - The tfile_check_list is list of file *, which have | ||
1010 | * links that are proposed to be newly added. We need to | ||
1011 | * make sure that those added links don't add too many | ||
1012 | * paths such that we will spend all our time waking up | ||
1013 | * eventpoll objects. | ||
1014 | * | ||
1015 | * Returns: Returns zero if the proposed links don't create too many paths, | ||
1016 | * -1 otherwise. | ||
1017 | */ | ||
1018 | static int reverse_path_check(void) | ||
1019 | { | ||
1020 | int length = 0; | ||
1021 | int error = 0; | ||
1022 | struct file *current_file; | ||
1023 | |||
1024 | /* let's call this for all tfiles */ | ||
1025 | list_for_each_entry(current_file, &tfile_check_list, f_tfile_llink) { | ||
1026 | length++; | ||
1027 | path_count_init(); | ||
1028 | error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | ||
1029 | reverse_path_check_proc, current_file, | ||
1030 | current_file, current); | ||
1031 | if (error) | ||
1032 | break; | ||
1033 | } | ||
1034 | return error; | ||
1035 | } | ||
1036 | |||
929 | /* | 1037 | /* |
930 | * Must be called with "mtx" held. | 1038 | * Must be called with "mtx" held. |
931 | */ | 1039 | */ |
@@ -987,6 +1095,11 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, | |||
987 | */ | 1095 | */ |
988 | ep_rbtree_insert(ep, epi); | 1096 | ep_rbtree_insert(ep, epi); |
989 | 1097 | ||
1098 | /* now check if we've created too many backpaths */ | ||
1099 | error = -EINVAL; | ||
1100 | if (reverse_path_check()) | ||
1101 | goto error_remove_epi; | ||
1102 | |||
990 | /* We have to drop the new item inside our item list to keep track of it */ | 1103 | /* We have to drop the new item inside our item list to keep track of it */ |
991 | spin_lock_irqsave(&ep->lock, flags); | 1104 | spin_lock_irqsave(&ep->lock, flags); |
992 | 1105 | ||
@@ -1011,6 +1124,14 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, | |||
1011 | 1124 | ||
1012 | return 0; | 1125 | return 0; |
1013 | 1126 | ||
1127 | error_remove_epi: | ||
1128 | spin_lock(&tfile->f_lock); | ||
1129 | if (ep_is_linked(&epi->fllink)) | ||
1130 | list_del_init(&epi->fllink); | ||
1131 | spin_unlock(&tfile->f_lock); | ||
1132 | |||
1133 | rb_erase(&epi->rbn, &ep->rbr); | ||
1134 | |||
1014 | error_unregister: | 1135 | error_unregister: |
1015 | ep_unregister_pollwait(ep, epi); | 1136 | ep_unregister_pollwait(ep, epi); |
1016 | 1137 | ||
@@ -1275,18 +1396,36 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) | |||
1275 | int error = 0; | 1396 | int error = 0; |
1276 | struct file *file = priv; | 1397 | struct file *file = priv; |
1277 | struct eventpoll *ep = file->private_data; | 1398 | struct eventpoll *ep = file->private_data; |
1399 | struct eventpoll *ep_tovisit; | ||
1278 | struct rb_node *rbp; | 1400 | struct rb_node *rbp; |
1279 | struct epitem *epi; | 1401 | struct epitem *epi; |
1280 | 1402 | ||
1281 | mutex_lock_nested(&ep->mtx, call_nests + 1); | 1403 | mutex_lock_nested(&ep->mtx, call_nests + 1); |
1404 | ep->visited = 1; | ||
1405 | list_add(&ep->visited_list_link, &visited_list); | ||
1282 | for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { | 1406 | for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { |
1283 | epi = rb_entry(rbp, struct epitem, rbn); | 1407 | epi = rb_entry(rbp, struct epitem, rbn); |
1284 | if (unlikely(is_file_epoll(epi->ffd.file))) { | 1408 | if (unlikely(is_file_epoll(epi->ffd.file))) { |
1409 | ep_tovisit = epi->ffd.file->private_data; | ||
1410 | if (ep_tovisit->visited) | ||
1411 | continue; | ||
1285 | error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | 1412 | error = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, |
1286 | ep_loop_check_proc, epi->ffd.file, | 1413 | ep_loop_check_proc, epi->ffd.file, |
1287 | epi->ffd.file->private_data, current); | 1414 | ep_tovisit, current); |
1288 | if (error != 0) | 1415 | if (error != 0) |
1289 | break; | 1416 | break; |
1417 | } else { | ||
1418 | /* | ||
1419 | * If we've reached a file that is not associated with | ||
1420 | * an ep, then we need to check if the newly added | ||
1421 | * links are going to add too many wakeup paths. We do | ||
1422 | * this by adding it to the tfile_check_list, if it's | ||
1423 | * not already there, and calling reverse_path_check() | ||
1424 | * during ep_insert(). | ||
1425 | */ | ||
1426 | if (list_empty(&epi->ffd.file->f_tfile_llink)) | ||
1427 | list_add(&epi->ffd.file->f_tfile_llink, | ||
1428 | &tfile_check_list); | ||
1290 | } | 1429 | } |
1291 | } | 1430 | } |
1292 | mutex_unlock(&ep->mtx); | 1431 | mutex_unlock(&ep->mtx); |
@@ -1307,8 +1446,31 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests) | |||
1307 | */ | 1446 | */ |
1308 | static int ep_loop_check(struct eventpoll *ep, struct file *file) | 1447 | static int ep_loop_check(struct eventpoll *ep, struct file *file) |
1309 | { | 1448 | { |
1310 | return ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | 1449 | int ret; |
1450 | struct eventpoll *ep_cur, *ep_next; | ||
1451 | |||
1452 | ret = ep_call_nested(&poll_loop_ncalls, EP_MAX_NESTS, | ||
1311 | ep_loop_check_proc, file, ep, current); | 1453 | ep_loop_check_proc, file, ep, current); |
1454 | /* clear visited list */ | ||
1455 | list_for_each_entry_safe(ep_cur, ep_next, &visited_list, | ||
1456 | visited_list_link) { | ||
1457 | ep_cur->visited = 0; | ||
1458 | list_del(&ep_cur->visited_list_link); | ||
1459 | } | ||
1460 | return ret; | ||
1461 | } | ||
1462 | |||
1463 | static void clear_tfile_check_list(void) | ||
1464 | { | ||
1465 | struct file *file; | ||
1466 | |||
1467 | /* first clear the tfile_check_list */ | ||
1468 | while (!list_empty(&tfile_check_list)) { | ||
1469 | file = list_first_entry(&tfile_check_list, struct file, | ||
1470 | f_tfile_llink); | ||
1471 | list_del_init(&file->f_tfile_llink); | ||
1472 | } | ||
1473 | INIT_LIST_HEAD(&tfile_check_list); | ||
1312 | } | 1474 | } |
1313 | 1475 | ||
1314 | /* | 1476 | /* |
@@ -1316,8 +1478,9 @@ static int ep_loop_check(struct eventpoll *ep, struct file *file) | |||
1316 | */ | 1478 | */ |
1317 | SYSCALL_DEFINE1(epoll_create1, int, flags) | 1479 | SYSCALL_DEFINE1(epoll_create1, int, flags) |
1318 | { | 1480 | { |
1319 | int error; | 1481 | int error, fd; |
1320 | struct eventpoll *ep = NULL; | 1482 | struct eventpoll *ep = NULL; |
1483 | struct file *file; | ||
1321 | 1484 | ||
1322 | /* Check the EPOLL_* constant for consistency. */ | 1485 | /* Check the EPOLL_* constant for consistency. */ |
1323 | BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); | 1486 | BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); |
@@ -1334,11 +1497,25 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) | |||
1334 | * Creates all the items needed to setup an eventpoll file. That is, | 1497 | * Creates all the items needed to setup an eventpoll file. That is, |
1335 | * a file structure and a free file descriptor. | 1498 | * a file structure and a free file descriptor. |
1336 | */ | 1499 | */ |
1337 | error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, | 1500 | fd = get_unused_fd_flags(O_RDWR | (flags & O_CLOEXEC)); |
1501 | if (fd < 0) { | ||
1502 | error = fd; | ||
1503 | goto out_free_ep; | ||
1504 | } | ||
1505 | file = anon_inode_getfile("[eventpoll]", &eventpoll_fops, ep, | ||
1338 | O_RDWR | (flags & O_CLOEXEC)); | 1506 | O_RDWR | (flags & O_CLOEXEC)); |
1339 | if (error < 0) | 1507 | if (IS_ERR(file)) { |
1340 | ep_free(ep); | 1508 | error = PTR_ERR(file); |
1341 | 1509 | goto out_free_fd; | |
1510 | } | ||
1511 | fd_install(fd, file); | ||
1512 | ep->file = file; | ||
1513 | return fd; | ||
1514 | |||
1515 | out_free_fd: | ||
1516 | put_unused_fd(fd); | ||
1517 | out_free_ep: | ||
1518 | ep_free(ep); | ||
1342 | return error; | 1519 | return error; |
1343 | } | 1520 | } |
1344 | 1521 | ||
@@ -1404,21 +1581,27 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
1404 | /* | 1581 | /* |
1405 | * When we insert an epoll file descriptor, inside another epoll file | 1582 | * When we insert an epoll file descriptor, inside another epoll file |
1406 | * descriptor, there is the change of creating closed loops, which are | 1583 | * descriptor, there is the change of creating closed loops, which are |
1407 | * better be handled here, than in more critical paths. | 1584 | * better be handled here, than in more critical paths. While we are |
1585 | * checking for loops we also determine the list of files reachable | ||
1586 | * and hang them on the tfile_check_list, so we can check that we | ||
1587 | * haven't created too many possible wakeup paths. | ||
1408 | * | 1588 | * |
1409 | * We hold epmutex across the loop check and the insert in this case, in | 1589 | * We need to hold the epmutex across both ep_insert and ep_remove |
1410 | * order to prevent two separate inserts from racing and each doing the | 1590 | * b/c we want to make sure we are looking at a coherent view of |
1411 | * insert "at the same time" such that ep_loop_check passes on both | 1591 | * epoll network. |
1412 | * before either one does the insert, thereby creating a cycle. | ||
1413 | */ | 1592 | */ |
1414 | if (unlikely(is_file_epoll(tfile) && op == EPOLL_CTL_ADD)) { | 1593 | if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) { |
1415 | mutex_lock(&epmutex); | 1594 | mutex_lock(&epmutex); |
1416 | did_lock_epmutex = 1; | 1595 | did_lock_epmutex = 1; |
1417 | error = -ELOOP; | ||
1418 | if (ep_loop_check(ep, tfile) != 0) | ||
1419 | goto error_tgt_fput; | ||
1420 | } | 1596 | } |
1421 | 1597 | if (op == EPOLL_CTL_ADD) { | |
1598 | if (is_file_epoll(tfile)) { | ||
1599 | error = -ELOOP; | ||
1600 | if (ep_loop_check(ep, tfile) != 0) | ||
1601 | goto error_tgt_fput; | ||
1602 | } else | ||
1603 | list_add(&tfile->f_tfile_llink, &tfile_check_list); | ||
1604 | } | ||
1422 | 1605 | ||
1423 | mutex_lock_nested(&ep->mtx, 0); | 1606 | mutex_lock_nested(&ep->mtx, 0); |
1424 | 1607 | ||
@@ -1437,6 +1620,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
1437 | error = ep_insert(ep, &epds, tfile, fd); | 1620 | error = ep_insert(ep, &epds, tfile, fd); |
1438 | } else | 1621 | } else |
1439 | error = -EEXIST; | 1622 | error = -EEXIST; |
1623 | clear_tfile_check_list(); | ||
1440 | break; | 1624 | break; |
1441 | case EPOLL_CTL_DEL: | 1625 | case EPOLL_CTL_DEL: |
1442 | if (epi) | 1626 | if (epi) |
@@ -1455,7 +1639,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, | |||
1455 | mutex_unlock(&ep->mtx); | 1639 | mutex_unlock(&ep->mtx); |
1456 | 1640 | ||
1457 | error_tgt_fput: | 1641 | error_tgt_fput: |
1458 | if (unlikely(did_lock_epmutex)) | 1642 | if (did_lock_epmutex) |
1459 | mutex_unlock(&epmutex); | 1643 | mutex_unlock(&epmutex); |
1460 | 1644 | ||
1461 | fput(tfile); | 1645 | fput(tfile); |