author		Lee Schermerhorn <lee.schermerhorn@hp.com>	2009-12-14 20:58:21 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-15 11:53:12 -0500
commit		06808b0827e1cd14eedc96bac2655d5b37ac246c (patch)
tree		8f7b52a4af1532ed414631f68b99a059e299d83f	/mm/mempolicy.c
parent		c1e6c8d074ea3621106548654cc244d2edc12ead (diff)
hugetlb: derive huge pages nodes allowed from task mempolicy
This patch derives a "nodes_allowed" node mask from the numa mempolicy of the task modifying the number of persistent huge pages, to control the allocation, freeing and adjusting of surplus huge pages when the pool page count is modified via the new sysctl or sysfs attribute "nr_hugepages_mempolicy". The nodes_allowed mask is derived as follows:

* For "default" [NULL] task mempolicy, a NULL nodemask_t pointer is produced. This will cause the hugetlb subsystem to use node_online_map as the "nodes_allowed". This preserves the behavior before this patch.

* For "preferred" mempolicy, including explicit local allocation, a nodemask with the single preferred node will be produced. "local" policy will NOT track any internode migrations of the task adjusting nr_hugepages.

* For "bind" and "interleave" policy, the mempolicy's nodemask will be used.

* Other than to inform the construction of the nodes_allowed node mask, the actual mempolicy mode is ignored. That is, all modes behave like interleave over the resulting nodes_allowed mask with no "fallback".

See the updated documentation [next patch] for more information about the implications of this patch.

Examples:

Starting with:

	Node 0 HugePages_Total: 0
	Node 1 HugePages_Total: 0
	Node 2 HugePages_Total: 0
	Node 3 HugePages_Total: 0

Default behavior [with or without this patch] balances persistent hugepage allocation across nodes [with sufficient contiguous memory]:

	sysctl vm.nr_hugepages[_mempolicy]=32

yields:

	Node 0 HugePages_Total: 8
	Node 1 HugePages_Total: 8
	Node 2 HugePages_Total: 8
	Node 3 HugePages_Total: 8

Of course, we only have nr_hugepages_mempolicy with the patch, but with default mempolicy, nr_hugepages_mempolicy behaves the same as nr_hugepages.

Applying mempolicy--e.g., with numactl [using '-m' a.k.a. '--membind' because it allows multiple nodes to be specified and it's easy to type]--we can allocate huge pages on individual nodes or sets of nodes. So, starting from the condition above, with 8 huge pages per node, add 8 more to node 2 using:

	numactl -m 2 sysctl vm.nr_hugepages_mempolicy=40

This yields:

	Node 0 HugePages_Total: 8
	Node 1 HugePages_Total: 8
	Node 2 HugePages_Total: 16
	Node 3 HugePages_Total: 8

The incremental 8 huge pages were restricted to node 2 by the specified mempolicy.

Similarly, we can use mempolicy to free persistent huge pages from specified nodes:

	numactl -m 0,1 sysctl vm.nr_hugepages_mempolicy=32

yields:

	Node 0 HugePages_Total: 4
	Node 1 HugePages_Total: 4
	Node 2 HugePages_Total: 16
	Node 3 HugePages_Total: 8

The 8 huge pages freed were balanced over nodes 0 and 1.

[rientjes@google.com: accommodate reworked NODEMASK_ALLOC]
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Andi Kleen <andi@firstfloor.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
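For context, the following is a rough, illustrative sketch (not part of this patch, and not copied from it) of how a hugetlb pool-resize handler, such as the one backing nr_hugepages_mempolicy added elsewhere in this series, might consume the derived mask: allocate a scratch nodemask, let init_nodemask_of_mempolicy() fill it from the calling task's mempolicy, and fall back to all nodes with memory when the task runs with default policy. The function name resize_pool_obeying_mempolicy and the use of set_max_huge_pages() as the pool-resize helper are assumptions made for illustration only.

	/*
	 * Illustrative sketch only -- names and exact flow are assumed,
	 * not taken verbatim from the patch series.  Intended as if it
	 * lived in mm/hugetlb.c.
	 */
	#include <linux/hugetlb.h>
	#include <linux/mempolicy.h>
	#include <linux/nodemask.h>
	#include <linux/slab.h>

	static int resize_pool_obeying_mempolicy(struct hstate *h,
						 unsigned long count)
	{
		/* reworked NODEMASK_ALLOC: declares and kmalloc()s the mask */
		NODEMASK_ALLOC(nodemask_t, nodes_allowed,
			       GFP_KERNEL | __GFP_NORETRY);

		if (!init_nodemask_of_mempolicy(nodes_allowed)) {
			/*
			 * Default [NULL] task mempolicy (or allocation
			 * failure): free the scratch mask and spread the
			 * adjustment over all nodes with memory.
			 */
			NODEMASK_FREE(nodes_allowed);
			nodes_allowed = &node_states[N_HIGH_MEMORY];
		}

		/* assumed pool-resize helper; interleaves over nodes_allowed */
		h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);

		if (nodes_allowed != &node_states[N_HIGH_MEMORY])
			NODEMASK_FREE(nodes_allowed);
		return 0;
	}

Whatever the exact caller looks like, the commit message above fixes the contract: the mempolicy mode only shapes nodes_allowed, and the pool adjustment then behaves like interleave over that mask with no fallback.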
Diffstat (limited to 'mm/mempolicy.c')
-rw-r--r--	mm/mempolicy.c	47
1 file changed, 47 insertions, 0 deletions
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0f89eabbaf3e..f11fdad06204 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1568,6 +1568,53 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
 	}
 	return zl;
 }
+
+/*
+ * init_nodemask_of_mempolicy
+ *
+ * If the current task's mempolicy is "default" [NULL], return 'false'
+ * to indicate default policy. Otherwise, extract the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument nodemask, or
+ * initialize the argument nodemask to contain the single node for
+ * 'preferred' or 'local' policy and return 'true' to indicate presence
+ * of non-default mempolicy.
+ *
+ * We don't bother with reference counting the mempolicy [mpol_get/put]
+ * because the current task is examining its own mempolicy and a task's
+ * mempolicy is only ever changed by the task itself.
+ *
+ * N.B., it is the caller's responsibility to free a returned nodemask.
+ */
+bool init_nodemask_of_mempolicy(nodemask_t *mask)
+{
+	struct mempolicy *mempolicy;
+	int nid;
+
+	if (!(mask && current->mempolicy))
+		return false;
+
+	mempolicy = current->mempolicy;
+	switch (mempolicy->mode) {
+	case MPOL_PREFERRED:
+		if (mempolicy->flags & MPOL_F_LOCAL)
+			nid = numa_node_id();
+		else
+			nid = mempolicy->v.preferred_node;
+		init_nodemask_of_node(mask, nid);
+		break;
+
+	case MPOL_BIND:
+		/* Fall through */
+	case MPOL_INTERLEAVE:
+		*mask = mempolicy->v.nodes;
+		break;
+
+	default:
+		BUG();
+	}
+
+	return true;
+}
 #endif
 
 /* Allocate a page in interleaved policy.