author	Jason Baron <jbaron@akamai.com>	2013-11-12 18:10:18 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-11-12 22:09:25 -0500
commit	67347fe4e6326338ee217d7eb826bedf30b2e155 (patch)
tree	2183a35b00b2451c1fb2ac41be35cb25a1db6ac7 /fs
parent	ae10b2b4eb01bedc91d29d5c5bb9e416fd806c40 (diff)
epoll: do not take global 'epmutex' for simple topologies
When calling EPOLL_CTL_ADD for an epoll file descriptor that is attached directly to a wakeup source, we do not need to take the global 'epmutex', unless the epoll file descriptor is nested. The purpose of taking the 'epmutex' on add is to prevent complex topologies such as loops and deep wakeup paths from forming in parallel through multiple EPOLL_CTL_ADD operations. However, for the simple case of an epoll file descriptor attached directly to a wakeup source (with no nesting), we do not need to hold the 'epmutex'.

This patch, along with 'epoll: optimize EPOLL_CTL_DEL using rcu', improves scalability on larger systems. Quoting Nathan Zimmer's mail on SPECjbb performance:

  "On the 16 socket run the performance went from 35k jOPS to 125k jOPS. In addition the benchmark went from scaling well on 10 sockets to scaling well on just over 40 sockets.
  ...
  Currently the benchmark stops scaling at around 40-44 sockets but it seems like I found a second unrelated bottleneck."

[akpm@linux-foundation.org: use `bool' for boolean variables, remove unneeded/undesirable cast of void*, add missed ep_scan_ready_list() kerneldoc]
Signed-off-by: Jason Baron <jbaron@akamai.com>
Tested-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Eric Wong <normalperson@yhbt.net>
Cc: Nelson Elhage <nelhage@nelhage.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: "Paul E. McKenney" <paulmck@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
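To make the "simple" vs. "nested" distinction above concrete, here is a minimal userspace sketch (illustrative only, not part of this patch; the descriptor names are made up) of the two EPOLL_CTL_ADD shapes the changelog describes: an epoll descriptor watching a plain wakeup source, and one epoll descriptor nested inside another.

/*
 * Illustration only, not part of the commit: the two EPOLL_CTL_ADD
 * shapes discussed in the changelog.
 */
#include <stdio.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);	/* a plain wakeup source */
	int ep1 = epoll_create1(0);
	int ep2 = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN, .data.fd = efd };
	struct epoll_event nev = { .events = EPOLLIN, .data.fd = ep1 };

	if (efd < 0 || ep1 < 0 || ep2 < 0)
		return 1;

	/*
	 * Simple topology: an epoll fd watching an eventfd. After this
	 * patch, this add only takes ep->mtx, not the global 'epmutex'.
	 */
	if (epoll_ctl(ep1, EPOLL_CTL_ADD, efd, &ev) < 0)
		perror("simple add");

	/*
	 * Nested topology: one epoll fd watching another. This is the
	 * case that still goes through the 'epmutex' (full_check) path,
	 * since loops and deep wakeup paths become possible.
	 */
	if (epoll_ctl(ep2, EPOLL_CTL_ADD, ep1, &nev) < 0)
		perror("nested add");

	close(ep2);
	close(ep1);
	close(efd);
	return 0;
}

The first add is the simple case that, after this patch, only takes the per-instance ep->mtx; the second is the nested case that still serializes on the global 'epmutex'.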
Diffstat (limited to 'fs')
-rw-r--r--	fs/eventpoll.c	95
1 file changed, 69 insertions(+), 26 deletions(-)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 584249454822..f7fe7e3ce664 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -585,14 +585,14 @@ static inline void ep_pm_stay_awake_rcu(struct epitem *epi)
  * @sproc: Pointer to the scan callback.
  * @priv: Private opaque data passed to the @sproc callback.
  * @depth: The current depth of recursive f_op->poll calls.
+ * @ep_locked: caller already holds ep->mtx
  *
  * Returns: The same integer error code returned by the @sproc callback.
  */
 static int ep_scan_ready_list(struct eventpoll *ep,
			      int (*sproc)(struct eventpoll *,
					   struct list_head *, void *),
-			      void *priv,
-			      int depth)
+			      void *priv, int depth, bool ep_locked)
 {
 	int error, pwake = 0;
 	unsigned long flags;
@@ -603,7 +603,9 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	 * We need to lock this because we could be hit by
 	 * eventpoll_release_file() and epoll_ctl().
 	 */
-	mutex_lock_nested(&ep->mtx, depth);
+
+	if (!ep_locked)
+		mutex_lock_nested(&ep->mtx, depth);
 
 	/*
 	 * Steal the ready list, and re-init the original one to the
@@ -667,7 +669,8 @@ static int ep_scan_ready_list(struct eventpoll *ep,
 	}
 	spin_unlock_irqrestore(&ep->lock, flags);
 
-	mutex_unlock(&ep->mtx);
+	if (!ep_locked)
+		mutex_unlock(&ep->mtx);
 
 	/* We have to call this outside the lock */
 	if (pwake)
@@ -822,15 +825,34 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
 	return 0;
 }
 
+static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
+				 poll_table *pt);
+
+struct readyevents_arg {
+	struct eventpoll *ep;
+	bool locked;
+};
+
 static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
 {
-	return ep_scan_ready_list(priv, ep_read_events_proc, NULL, call_nests + 1);
+	struct readyevents_arg *arg = priv;
+
+	return ep_scan_ready_list(arg->ep, ep_read_events_proc, NULL,
+				  call_nests + 1, arg->locked);
 }
 
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
 	int pollflags;
 	struct eventpoll *ep = file->private_data;
+	struct readyevents_arg arg;
+
+	/*
+	 * During ep_insert() we already hold the ep->mtx for the tfile.
+	 * Prevent re-aquisition.
+	 */
+	arg.locked = wait && (wait->_qproc == ep_ptable_queue_proc);
+	arg.ep = ep;
 
 	/* Insert inside our poll wait queue */
 	poll_wait(file, &ep->poll_wait, wait);
@@ -842,7 +864,7 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 	 * could re-enter here.
 	 */
 	pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
-				   ep_poll_readyevents_proc, ep, ep, current);
+				   ep_poll_readyevents_proc, &arg, ep, current);
 
 	return pollflags != -1 ? pollflags : 0;
 }
@@ -1243,7 +1265,7 @@ static noinline void ep_destroy_wakeup_source(struct epitem *epi)
  * Must be called with "mtx" held.
  */
 static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
-		     struct file *tfile, int fd)
+		     struct file *tfile, int fd, int full_check)
 {
 	int error, revents, pwake = 0;
 	unsigned long flags;
@@ -1309,7 +1331,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
 
 	/* now check if we've created too many backpaths */
 	error = -EINVAL;
-	if (reverse_path_check())
+	if (full_check && reverse_path_check())
 		goto error_remove_epi;
 
 	/* We have to drop the new item inside our item list to keep track of it */
@@ -1532,7 +1554,7 @@ static int ep_send_events(struct eventpoll *ep,
 	esed.maxevents = maxevents;
 	esed.events = events;
 
-	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0);
+	return ep_scan_ready_list(ep, ep_send_events_proc, &esed, 0, false);
 }
 
 static inline struct timespec ep_set_mstimeout(long ms)
@@ -1802,11 +1824,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		struct epoll_event __user *, event)
 {
 	int error;
-	int did_lock_epmutex = 0;
+	int full_check = 0;
 	struct fd f, tf;
 	struct eventpoll *ep;
 	struct epitem *epi;
 	struct epoll_event epds;
+	struct eventpoll *tep = NULL;
 
 	error = -EFAULT;
 	if (ep_op_has_event(op) &&
@@ -1855,23 +1878,40 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * and hang them on the tfile_check_list, so we can check that we
 	 * haven't created too many possible wakeup paths.
 	 *
-	 * We need to hold the epmutex across ep_insert to prevent
-	 * multple adds from creating loops in parallel.
+	 * We do not need to take the global 'epumutex' on EPOLL_CTL_ADD when
+	 * the epoll file descriptor is attaching directly to a wakeup source,
+	 * unless the epoll file descriptor is nested. The purpose of taking the
+	 * 'epmutex' on add is to prevent complex toplogies such as loops and
+	 * deep wakeup paths from forming in parallel through multiple
+	 * EPOLL_CTL_ADD operations.
 	 */
+	mutex_lock_nested(&ep->mtx, 0);
 	if (op == EPOLL_CTL_ADD) {
-		mutex_lock(&epmutex);
-		did_lock_epmutex = 1;
-		if (is_file_epoll(tf.file)) {
-			error = -ELOOP;
-			if (ep_loop_check(ep, tf.file) != 0) {
-				clear_tfile_check_list();
-				goto error_tgt_fput;
+		if (!list_empty(&f.file->f_ep_links) ||
+						is_file_epoll(tf.file)) {
+			full_check = 1;
+			mutex_unlock(&ep->mtx);
+			mutex_lock(&epmutex);
+			if (is_file_epoll(tf.file)) {
+				error = -ELOOP;
+				if (ep_loop_check(ep, tf.file) != 0) {
+					clear_tfile_check_list();
+					goto error_tgt_fput;
+				}
+			} else
+				list_add(&tf.file->f_tfile_llink,
+							&tfile_check_list);
+			mutex_lock_nested(&ep->mtx, 0);
+			if (is_file_epoll(tf.file)) {
+				tep = tf.file->private_data;
+				mutex_lock_nested(&tep->mtx, 1);
 			}
-		} else
-			list_add(&tf.file->f_tfile_llink, &tfile_check_list);
+		}
+	}
+	if (op == EPOLL_CTL_DEL && is_file_epoll(tf.file)) {
+		tep = tf.file->private_data;
+		mutex_lock_nested(&tep->mtx, 1);
 	}
-
-	mutex_lock_nested(&ep->mtx, 0);
 
 	/*
 	 * Try to lookup the file inside our RB tree, Since we grabbed "mtx"
@@ -1885,10 +1925,11 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	case EPOLL_CTL_ADD:
 		if (!epi) {
 			epds.events |= POLLERR | POLLHUP;
-			error = ep_insert(ep, &epds, tf.file, fd);
+			error = ep_insert(ep, &epds, tf.file, fd, full_check);
 		} else
 			error = -EEXIST;
-		clear_tfile_check_list();
+		if (full_check)
+			clear_tfile_check_list();
 		break;
 	case EPOLL_CTL_DEL:
 		if (epi)
@@ -1904,10 +1945,12 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		error = -ENOENT;
 		break;
 	}
+	if (tep != NULL)
+		mutex_unlock(&tep->mtx);
 	mutex_unlock(&ep->mtx);
 
 error_tgt_fput:
-	if (did_lock_epmutex)
+	if (full_check)
 		mutex_unlock(&epmutex);
 
 	fdput(tf);
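As an aside, the loop detection that the full_check path above guards is observable from userspace: EPOLL_CTL_ADD fails with ELOOP when it would close a cycle of epoll instances. A small sketch (illustrative only, not part of this patch):

/*
 * Illustration only, not part of the commit: the cycle that
 * ep_loop_check() rejects on the full_check path.
 */
#include <errno.h>
#include <stdio.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int a = epoll_create1(0);
	int b = epoll_create1(0);
	struct epoll_event ev = { .events = EPOLLIN };

	/* a watches b: allowed, but it is a nested topology. */
	if (epoll_ctl(a, EPOLL_CTL_ADD, b, &ev) < 0)
		perror("a <- b");

	/* b watches a: this would close a cycle, so the add fails. */
	if (epoll_ctl(b, EPOLL_CTL_ADD, a, &ev) < 0 && errno == ELOOP)
		puts("cycle rejected with ELOOP, as expected");

	close(a);
	close(b);
	return 0;
}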