author    Jason Baron <jbaron@akamai.com>    2013-11-12 18:10:16 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2013-11-12 22:09:25 -0500
commit    ae10b2b4eb01bedc91d29d5c5bb9e416fd806c40 (patch)
tree      ac33bf227eac54951bc1372d1dea2f2d1168f110
parent    823b794ce176bcf135a062075737be71a78629dd (diff)
epoll: optimize EPOLL_CTL_DEL using rcu
Nathan Zimmer found that once we get over 10+ cpus, the scalability of
SPECjbb falls over due to the contention on the global 'epmutex', which is
taken on EPOLL_CTL_ADD and EPOLL_CTL_DEL operations.

Patch #1 removes the 'epmutex' lock completely from the EPOLL_CTL_DEL path
by using rcu to guard against any concurrent traversals.

Patch #2 removes the 'epmutex' lock from EPOLL_CTL_ADD operations for
simple topologies, i.e. when adding a link from an epoll file descriptor
to a wakeup source where the epoll file descriptor is not nested.

This patch (of 2):

Optimize EPOLL_CTL_DEL such that it does not require the 'epmutex' by
converting the file->f_ep_links list into an rcu one.  In this way, we can
traverse the epoll network on the add path in parallel with deletes.
Since deletes can't create loops or worse wakeup paths, this is safe.

This patch, in combination with the patch "epoll: Do not take global
'epmutex' for simple topologies", shows a dramatic performance improvement
in scalability for SPECjbb.

Signed-off-by: Jason Baron <jbaron@akamai.com>
Tested-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Eric Wong <normalperson@yhbt.net>
Cc: Nelson Elhage <nelhage@nelhage.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
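[Editor's note] To make the contended path concrete, here is a small illustrative userspace program (not part of the patch; the thread count, iteration count, and the use of eventfd are arbitrary choices) that hammers EPOLL_CTL_ADD/EPOLL_CTL_DEL from many threads, each on a private epoll instance. Before this series, every one of those operations serialized on the global 'epmutex':

/*
 * Illustrative only (not from the kernel tree): a workload of the shape
 * that motivated the series -- many threads doing nothing but
 * EPOLL_CTL_ADD/EPOLL_CTL_DEL, each on its own epoll instance.
 * NTHREADS, ITERS and the use of eventfd are arbitrary.
 *
 * Build: cc -O2 -pthread epoll_churn.c -o epoll_churn
 */
#include <pthread.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

#define NTHREADS 16
#define ITERS    100000

static void *churn(void *unused)
{
        int epfd = epoll_create1(0);
        int evfd = eventfd(0, 0);
        struct epoll_event ev = { .events = EPOLLIN, .data.fd = evfd };

        (void)unused;
        if (epfd < 0 || evfd < 0) {
                perror("setup");
                return NULL;
        }
        for (long i = 0; i < ITERS; i++) {
                /* Each pair of calls exercises the ADD and DEL paths in eventpoll.c */
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, evfd, &ev) < 0)
                        perror("EPOLL_CTL_ADD");
                if (epoll_ctl(epfd, EPOLL_CTL_DEL, evfd, &ev) < 0)
                        perror("EPOLL_CTL_DEL");
        }
        close(evfd);
        close(epfd);
        return NULL;
}

int main(void)
{
        pthread_t tid[NTHREADS];
        int i;

        for (i = 0; i < NTHREADS; i++)
                pthread_create(&tid[i], NULL, churn, NULL);
        for (i = 0; i < NTHREADS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}

With this patch, the EPOLL_CTL_DEL half of each iteration no longer takes 'epmutex'; the follow-up patch removes it from the simple EPOLL_CTL_ADD case as well.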
 fs/eventpoll.c | 56 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 810c28fb8c3c..584249454822 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -41,6 +41,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/compat.h>
+#include <linux/rculist.h>
 
 /*
  * LOCKING:
@@ -133,8 +134,12 @@ struct nested_calls {
  * of these on a server and we do not want this to take another cache line.
  */
 struct epitem {
-        /* RB tree node used to link this structure to the eventpoll RB tree */
-        struct rb_node rbn;
+        union {
+                /* RB tree node links this structure to the eventpoll RB tree */
+                struct rb_node rbn;
+                /* Used to free the struct epitem */
+                struct rcu_head rcu;
+        };
 
         /* List header used to link this structure to the eventpoll ready list */
         struct list_head rdllink;
@@ -671,6 +676,12 @@ static int ep_scan_ready_list(struct eventpoll *ep,
         return error;
 }
 
+static void epi_rcu_free(struct rcu_head *head)
+{
+        struct epitem *epi = container_of(head, struct epitem, rcu);
+        kmem_cache_free(epi_cache, epi);
+}
+
 /*
  * Removes a "struct epitem" from the eventpoll RB tree and deallocates
  * all the associated resources. Must be called with "mtx" held.
@@ -692,8 +703,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
 
         /* Remove the current item from the list of epoll hooks */
         spin_lock(&file->f_lock);
-        if (ep_is_linked(&epi->fllink))
-                list_del_init(&epi->fllink);
+        list_del_rcu(&epi->fllink);
         spin_unlock(&file->f_lock);
 
         rb_erase(&epi->rbn, &ep->rbr);
@@ -704,9 +714,14 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
         spin_unlock_irqrestore(&ep->lock, flags);
 
         wakeup_source_unregister(ep_wakeup_source(epi));
-
-        /* At this point it is safe to free the eventpoll item */
-        kmem_cache_free(epi_cache, epi);
+        /*
+         * At this point it is safe to free the eventpoll item. Use the union
+         * field epi->rcu, since we are trying to minimize the size of
+         * 'struct epitem'. The 'rbn' field is no longer in use. Protected by
+         * ep->mtx. The rcu read side, reverse_path_check_proc(), does not make
+         * use of the rbn field.
+         */
+        call_rcu(&epi->rcu, epi_rcu_free);
 
         atomic_long_dec(&ep->user->epoll_watches);
 
@@ -872,7 +887,6 @@ static const struct file_operations eventpoll_fops = {
  */
 void eventpoll_release_file(struct file *file)
 {
-        struct list_head *lsthead = &file->f_ep_links;
         struct eventpoll *ep;
         struct epitem *epi;
 
@@ -890,17 +904,12 @@ void eventpoll_release_file(struct file *file)
          * Besides, ep_remove() acquires the lock, so we can't hold it here.
          */
         mutex_lock(&epmutex);
-
-        while (!list_empty(lsthead)) {
-                epi = list_first_entry(lsthead, struct epitem, fllink);
-
+        list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
                 ep = epi->ep;
-                list_del_init(&epi->fllink);
                 mutex_lock_nested(&ep->mtx, 0);
                 ep_remove(ep, epi);
                 mutex_unlock(&ep->mtx);
         }
-
         mutex_unlock(&epmutex);
 }
 
@@ -1138,7 +1147,9 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
         struct file *child_file;
         struct epitem *epi;
 
-        list_for_each_entry(epi, &file->f_ep_links, fllink) {
+        /* CTL_DEL can remove links here, but that can't increase our count */
+        rcu_read_lock();
+        list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
                 child_file = epi->ep->file;
                 if (is_file_epoll(child_file)) {
                         if (list_empty(&child_file->f_ep_links)) {
@@ -1160,6 +1171,7 @@ static int reverse_path_check_proc(void *priv, void *cookie, int call_nests)
1160 "file is not an ep!\n"); 1171 "file is not an ep!\n");
1161 } 1172 }
1162 } 1173 }
1174 rcu_read_unlock();
1163 return error; 1175 return error;
1164} 1176}
1165 1177
@@ -1286,7 +1298,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
         /* Add the current item to the list of active epoll hook for this file */
         spin_lock(&tfile->f_lock);
-        list_add_tail(&epi->fllink, &tfile->f_ep_links);
+        list_add_tail_rcu(&epi->fllink, &tfile->f_ep_links);
         spin_unlock(&tfile->f_lock);
 
         /*
@@ -1327,8 +1339,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 error_remove_epi:
         spin_lock(&tfile->f_lock);
-        if (ep_is_linked(&epi->fllink))
-                list_del_init(&epi->fllink);
+        list_del_rcu(&epi->fllink);
         spin_unlock(&tfile->f_lock);
 
         rb_erase(&epi->rbn, &ep->rbr);
@@ -1844,15 +1855,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
          * and hang them on the tfile_check_list, so we can check that we
          * haven't created too many possible wakeup paths.
          *
-         * We need to hold the epmutex across both ep_insert and ep_remove
-         * b/c we want to make sure we are looking at a coherent view of
-         * epoll network.
+         * We need to hold the epmutex across ep_insert to prevent
+         * multple adds from creating loops in parallel.
          */
-        if (op == EPOLL_CTL_ADD || op == EPOLL_CTL_DEL) {
+        if (op == EPOLL_CTL_ADD) {
                 mutex_lock(&epmutex);
                 did_lock_epmutex = 1;
-        }
-        if (op == EPOLL_CTL_ADD) {
                 if (is_file_epoll(tf.file)) {
                         error = -ELOOP;
                         if (ep_loop_check(ep, tf.file) != 0) {
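
[Editor's note] A brief aside on the rbn/rcu union introduced in 'struct epitem' above. The sketch below is standalone C, not kernel code: the two stub types only assume layouts roughly like a 64-bit rb_node (three words) and rcu_head (two words). It shows why overlaying 'rcu' on 'rbn' keeps the struct from growing: by the time the item is handed to call_rcu(), it has already been rb_erase()'d, so 'rbn' is dead storage the rcu_head can safely reuse.

/*
 * Standalone sketch (not kernel code); stub types with assumed LP64 layouts.
 * Build: cc -std=c11 epitem_union.c -o epitem_union
 */
#include <stdio.h>

struct rb_node_stub  { unsigned long parent_color; void *right, *left; };
struct rcu_head_stub { void *next; void (*func)(void *head); };

/* rcu_head as an extra field: the item grows by sizeof(struct rcu_head). */
struct item_separate {
        struct rb_node_stub  rbn;
        struct rcu_head_stub rcu;
};

/* rcu_head overlaid on rbn, as in this patch: same size as before. */
struct item_union {
        union {
                struct rb_node_stub  rbn;
                struct rcu_head_stub rcu;
        };
};

int main(void)
{
        printf("separate field: %zu bytes\n", sizeof(struct item_separate));
        printf("union:          %zu bytes\n", sizeof(struct item_union));
        return 0;
}

With these stand-in types on a typical LP64 target this prints 40 bytes for the separate-field layout versus 24 bytes for the union, which is the "avoid increasing the size of this struct" concern the patch's comment refers to.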