aboutsummaryrefslogtreecommitdiffstats
path: root/fs/eventpoll.c
diff options
context:
space:
mode:
authorNelson Elhage <nelhage@nelhage.com>2011-10-31 20:13:14 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-10-31 20:30:57 -0400
commitd8805e633e054c816c47cb6e727c81f156d9253d (patch)
tree8f7151e635a38593d68eae40586449627d835965 /fs/eventpoll.c
parent15662b3e8644905032c2e26808401a487d4e90c1 (diff)
epoll: fix spurious lockdep warnings
epoll can acquire recursively acquire ep->mtx on multiple "struct eventpoll"s at once in the case where one epoll fd is monitoring another epoll fd. This is perfectly OK, since we're careful about the lock ordering, but it causes spurious lockdep warnings. Annotate the recursion using mutex_lock_nested, and add a comment explaining the nesting rules for good measure. Recent versions of systemd are triggering this, and it can also be demonstrated with the following trivial test program: --------------------8<-------------------- int main(void) { int e1, e2; struct epoll_event evt = { .events = EPOLLIN }; e1 = epoll_create1(0); e2 = epoll_create1(0); epoll_ctl(e1, EPOLL_CTL_ADD, e2, &evt); return 0; } --------------------8<-------------------- Reported-by: Paul Bolle <pebolle@tiscali.nl> Tested-by: Paul Bolle <pebolle@tiscali.nl> Signed-off-by: Nelson Elhage <nelhage@nelhage.com> Acked-by: Jason Baron <jbaron@redhat.com> Cc: Dave Jones <davej@redhat.com> Cc: Davide Libenzi <davidel@xmailserver.org> Cc: <stable@kernel.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/eventpoll.c')
-rw-r--r--fs/eventpoll.c25
1 files changed, 18 insertions, 7 deletions
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 9026fc91fe3b..828e750af23a 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -70,6 +70,15 @@
70 * simultaneous inserts (A into B and B into A) from racing and 70 * simultaneous inserts (A into B and B into A) from racing and
71 * constructing a cycle without either insert observing that it is 71 * constructing a cycle without either insert observing that it is
72 * going to. 72 * going to.
73 * It is necessary to acquire multiple "ep->mtx"es at once in the
74 * case when one epoll fd is added to another. In this case, we
75 * always acquire the locks in the order of nesting (i.e. after
76 * epoll_ctl(e1, EPOLL_CTL_ADD, e2), e1->mtx will always be acquired
77 * before e2->mtx). Since we disallow cycles of epoll file
78 * descriptors, this ensures that the mutexes are well-ordered. In
79 * order to communicate this nesting to lockdep, when walking a tree
80 * of epoll file descriptors, we use the current recursion depth as
81 * the lockdep subkey.
73 * It is possible to drop the "ep->mtx" and to use the global 82 * It is possible to drop the "ep->mtx" and to use the global
74 * mutex "epmutex" (together with "ep->lock") to have it working, 83 * mutex "epmutex" (together with "ep->lock") to have it working,
75 * but having "ep->mtx" will make the interface more scalable. 84 * but having "ep->mtx" will make the interface more scalable.
@@ -464,13 +473,15 @@ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
464 * @ep: Pointer to the epoll private data structure. 473 * @ep: Pointer to the epoll private data structure.
465 * @sproc: Pointer to the scan callback. 474 * @sproc: Pointer to the scan callback.
466 * @priv: Private opaque data passed to the @sproc callback. 475 * @priv: Private opaque data passed to the @sproc callback.
476 * @depth: The current depth of recursive f_op->poll calls.
467 * 477 *
468 * Returns: The same integer error code returned by the @sproc callback. 478 * Returns: The same integer error code returned by the @sproc callback.
469 */ 479 */
470static int ep_scan_ready_list(struct eventpoll *ep, 480static int ep_scan_ready_list(struct eventpoll *ep,
471 int (*sproc)(struct eventpoll *, 481 int (*sproc)(struct eventpoll *,
472 struct list_head *, void *), 482 struct list_head *, void *),
473 void *priv) 483 void *priv,
484 int depth)
474{ 485{
475 int error, pwake = 0; 486 int error, pwake = 0;
476 unsigned long flags; 487 unsigned long flags;
@@ -481,7 +492,7 @@ static int ep_scan_ready_list(struct eventpoll *ep,
481 * We need to lock this because we could be hit by 492 * We need to lock this because we could be hit by
482 * eventpoll_release_file() and epoll_ctl(). 493 * eventpoll_release_file() and epoll_ctl().
483 */ 494 */
484 mutex_lock(&ep->mtx); 495 mutex_lock_nested(&ep->mtx, depth);
485 496
486 /* 497 /*
487 * Steal the ready list, and re-init the original one to the 498 * Steal the ready list, and re-init the original one to the
@@ -670,7 +681,7 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
670 681
671static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests) 682static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
672{ 683{
673 return ep_scan_ready_list(priv, ep_read_events_proc, NULL); 684 return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
674} 685}
675 686
676static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) 687static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
@@ -737,7 +748,7 @@ void eventpoll_release_file(struct file *file)
737 748
738 ep = epi->ep; 749 ep = epi->ep;
739 list_del_init(&epi->fllink); 750 list_del_init(&epi->fllink);
740 mutex_lock(&ep->mtx); 751 mutex_lock_nested(&ep->mtx, 0);
741 ep_remove(ep, epi); 752 ep_remove(ep, epi);
742 mutex_unlock(&ep->mtx); 753 mutex_unlock(&ep->mtx);
743 } 754 }
@@ -1134,7 +1145,7 @@ static int ep_send_events(struct eventpoll *ep,
1134 esed.maxevents = maxevents; 1145 esed.maxevents = maxevents;
1135 esed.events = events; 1146 esed.events = events;
1136 1147
1137 return ep_scan_ready_list(ep, ep_send_events_proc, &esed); 1148 return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
1138} 1149}
1139 1150
1140static inline struct timespec ep_set_mstimeout(long ms) 1151static inline struct timespec ep_set_mstimeout(long ms)
@@ -1267,7 +1278,7 @@ static int ep_loop_check_proc(void *priv, void *cookie, int call_nests)
1267 struct rb_node *rbp; 1278 struct rb_node *rbp;
1268 struct epitem *epi; 1279 struct epitem *epi;
1269 1280
1270 mutex_lock(&ep->mtx); 1281 mutex_lock_nested(&ep->mtx, call_nests + 1);
1271 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) { 1282 for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
1272 epi = rb_entry(rbp, struct epitem, rbn); 1283 epi = rb_entry(rbp, struct epitem, rbn);
1273 if (unlikely(is_file_epoll(epi->ffd.file))) { 1284 if (unlikely(is_file_epoll(epi->ffd.file))) {
@@ -1409,7 +1420,7 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
1409 } 1420 }
1410 1421
1411 1422
1412 mutex_lock(&ep->mtx); 1423 mutex_lock_nested(&ep->mtx, 0);
1413 1424
1414 /* 1425 /*
1415 * Try to lookup the file inside our RB tree, Since we grabbed "mtx" 1426 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"