aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author	Vlastimil Babka <vbabka@suse.cz>	2015-06-24 19:58:48 -0400
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2015-07-21 13:10:04 -0400
commit	1021c97205005db4f101e7fa55095ec13118ac03 (patch)
tree	325814cc1bb9ea6f04f333dab3237f5b061a3039
parent	03445a4c2324f4adddd6b6c9b92879c1c754238a (diff)
mm, thp: respect MPOL_PREFERRED policy with non-local node
commit 0867a57c4f80a566dda1bac975b42fcd857cb489 upstream. Since commit 077fcf116c8c ("mm/thp: allocate transparent hugepages on local node"), we handle THP allocations on page fault in a special way - for non-interleave memory policies, the allocation is only attempted on the node local to the current CPU, if the policy's nodemask allows the node. This is motivated by the assumption that THP benefits cannot offset the cost of remote accesses, so it's better to fallback to base pages on the local node (which might still be available, while huge pages are not due to fragmentation) than to allocate huge pages on a remote node. The nodemask check prevents us from violating e.g. MPOL_BIND policies where the local node is not among the allowed nodes. However, the current implementation can still give surprising results for the MPOL_PREFERRED policy when the preferred node is different than the current CPU's local node. In such case we should honor the preferred node and not use the local node, which is what this patch does. If hugepage allocation on the preferred node fails, we fall back to base pages and don't try other nodes, with the same motivation as is done for the local node hugepage allocations. The patch also moves the MPOL_INTERLEAVE check around to simplify the hugepage specific test. The difference can be demonstrated using in-tree transhuge-stress test on the following 2-node machine where half memory on one node was occupied to show the difference. 
> numactl --hardware available: 2 nodes (0-1) node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 24 25 26 27 28 29 30 31 32 33 34 35 node 0 size: 7878 MB node 0 free: 3623 MB node 1 cpus: 12 13 14 15 16 17 18 19 20 21 22 23 36 37 38 39 40 41 42 43 44 45 46 47 node 1 size: 8045 MB node 1 free: 7818 MB node distances: node 0 1 0: 10 21 1: 21 10 Before the patch: > numactl -p0 -C0 ./transhuge-stress transhuge-stress: 2.197 s/loop, 0.276 ms/page, 7249.168 MiB/s 7962 succeed, 0 failed, 1786 different pages > numactl -p0 -C12 ./transhuge-stress transhuge-stress: 2.962 s/loop, 0.372 ms/page, 5376.172 MiB/s 7962 succeed, 0 failed, 3873 different pages Number of successful THP allocations corresponds to free memory on node 0 in the first case and node 1 in the second case, i.e. -p parameter is ignored and cpu binding "wins". After the patch: > numactl -p0 -C0 ./transhuge-stress transhuge-stress: 2.183 s/loop, 0.274 ms/page, 7295.516 MiB/s 7962 succeed, 0 failed, 1760 different pages > numactl -p0 -C12 ./transhuge-stress transhuge-stress: 2.878 s/loop, 0.361 ms/page, 5533.638 MiB/s 7962 succeed, 0 failed, 1750 different pages > numactl -p1 -C0 ./transhuge-stress transhuge-stress: 4.628 s/loop, 0.581 ms/page, 3440.893 MiB/s 7962 succeed, 0 failed, 3918 different pages The -p parameter is respected regardless of cpu binding. > numactl -C0 ./transhuge-stress transhuge-stress: 2.202 s/loop, 0.277 ms/page, 7230.003 MiB/s 7962 succeed, 0 failed, 1750 different pages > numactl -C12 ./transhuge-stress transhuge-stress: 3.020 s/loop, 0.379 ms/page, 5273.324 MiB/s 7962 succeed, 0 failed, 3916 different pages Without -p parameter, hugepage restriction to CPU-local node works as before. Fixes: 077fcf116c8c ("mm/thp: allocate transparent hugepages on local node") Signed-off-by: Vlastimil Babka <vbabka@suse.cz> Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> Acked-by: David Rientjes <rientjes@google.com> Cc: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Michal Hocko <mhocko@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-rw-r--r--	mm/mempolicy.c	| 38
1 file changed, 22 insertions(+), 16 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 747743237d9f..99d4c1d0b858 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1972,35 +1972,41 @@ retry_cpuset:
1972 pol = get_vma_policy(vma, addr); 1972 pol = get_vma_policy(vma, addr);
1973 cpuset_mems_cookie = read_mems_allowed_begin(); 1973 cpuset_mems_cookie = read_mems_allowed_begin();
1974 1974
1975 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && 1975 if (pol->mode == MPOL_INTERLEAVE) {
1976 pol->mode != MPOL_INTERLEAVE)) { 1976 unsigned nid;
1977
1978 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1979 mpol_cond_put(pol);
1980 page = alloc_page_interleave(gfp, order, nid);
1981 goto out;
1982 }
1983
1984 if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
1985 int hpage_node = node;
1986
1977 /* 1987 /*
1978 * For hugepage allocation and non-interleave policy which 1988 * For hugepage allocation and non-interleave policy which
1979 * allows the current node, we only try to allocate from the 1989 * allows the current node (or other explicitly preferred
1980 * current node and don't fall back to other nodes, as the 1990 * node) we only try to allocate from the current/preferred
1981 * cost of remote accesses would likely offset THP benefits. 1991 * node and don't fall back to other nodes, as the cost of
1992 * remote accesses would likely offset THP benefits.
1982 * 1993 *
1983 * If the policy is interleave, or does not allow the current 1994 * If the policy is interleave, or does not allow the current
1984 * node in its nodemask, we allocate the standard way. 1995 * node in its nodemask, we allocate the standard way.
1985 */ 1996 */
1997 if (pol->mode == MPOL_PREFERRED &&
1998 !(pol->flags & MPOL_F_LOCAL))
1999 hpage_node = pol->v.preferred_node;
2000
1986 nmask = policy_nodemask(gfp, pol); 2001 nmask = policy_nodemask(gfp, pol);
1987 if (!nmask || node_isset(node, *nmask)) { 2002 if (!nmask || node_isset(hpage_node, *nmask)) {
1988 mpol_cond_put(pol); 2003 mpol_cond_put(pol);
1989 page = alloc_pages_exact_node(node, 2004 page = alloc_pages_exact_node(hpage_node,
1990 gfp | __GFP_THISNODE, order); 2005 gfp | __GFP_THISNODE, order);
1991 goto out; 2006 goto out;
1992 } 2007 }
1993 } 2008 }
1994 2009
1995 if (pol->mode == MPOL_INTERLEAVE) {
1996 unsigned nid;
1997
1998 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
1999 mpol_cond_put(pol);
2000 page = alloc_page_interleave(gfp, order, nid);
2001 goto out;
2002 }
2003
2004 nmask = policy_nodemask(gfp, pol); 2010 nmask = policy_nodemask(gfp, pol);
2005 zl = policy_zonelist(gfp, pol, node); 2011 zl = policy_zonelist(gfp, pol, node);
2006 mpol_cond_put(pol); 2012 mpol_cond_put(pol);