author	Christoph Lameter <clameter@engr.sgi.com>	2006-01-06 03:10:46 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-01-06 11:33:23 -0500
commit	5da7ca86078964cbfe6c83efc1205904587706fe (patch)
tree	a64a7824e90b42d6fdd71e6cb652362beb8983a1 /mm
parent	96df9333c94d7d5aeceb21f6c5e7ae8ff34753cf (diff)
[PATCH] Add NUMA policy support for huge pages.
The huge_zonelist() function in the memory policy layer provides a list of
zones ordered by NUMA distance. The hugetlb layer will walk that list looking
for a zone that has available huge pages but is also in the nodeset of the
current cpuset.
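
For illustration, the walk that this zonelist feeds can be sketched as
below. The loop body lies outside the context lines shown in the hunks,
so this is a sketch assuming the cpuset_zone_allowed() check and the
per-node hugepage_freelists[] of the hugetlb code of this era, not
patch text:

	static struct page *dequeue_huge_page(struct vm_area_struct *vma,
					unsigned long address)
	{
		int nid = numa_node_id();
		struct page *page = NULL;
		/* Zonelist ordered by NUMA distance, from the policy layer. */
		struct zonelist *zonelist = huge_zonelist(vma, address);
		struct zone **z;

		/* Take the first zone allowed by the current cpuset whose
		   node still has free huge pages queued. */
		for (z = zonelist->zones; *z; z++) {
			nid = (*z)->zone_pgdat->node_id;
			if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
			    !list_empty(&hugepage_freelists[nid]))
				break;
		}
		if (*z) {
			page = list_entry(hugepage_freelists[nid].next,
					  struct page, lru);
			list_del(&page->lru);
			free_huge_pages--;
			free_huge_pages_node[nid]--;
		}
		return page;
	}
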
This patch does not contain the folding of find_or_alloc_huge_page() that was
controversial in the earlier discussion.
Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm')
-rw-r--r--	mm/hugetlb.c	24
-rw-r--r--	mm/mempolicy.c	39
2 files changed, 44 insertions(+), 19 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e93bd63462f0..eb405565949d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/mempolicy.h>
+
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
@@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page)
 	free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+				unsigned long address)
 {
 	int nid = numa_node_id();
 	struct page *page = NULL;
-	struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;
+	struct zonelist *zonelist = huge_zonelist(vma, address);
 	struct zone **z;
 
 	for (z = zonelist->zones; *z; z++) {
@@ -87,13 +90,13 @@ void free_huge_page(struct page *page)
 	spin_unlock(&hugetlb_lock);
 }
 
-struct page *alloc_huge_page(void)
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
 	int i;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page();
+	page = dequeue_huge_page(vma, addr);
 	if (!page) {
 		spin_unlock(&hugetlb_lock);
 		return NULL;
@@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
 	spin_lock(&hugetlb_lock);
 	try_to_free_low(count);
 	while (count < nr_huge_pages) {
-		struct page *page = dequeue_huge_page();
+		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
 		update_and_free_page(page);
@@ -365,8 +368,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 }
 
-static struct page *find_or_alloc_huge_page(struct address_space *mapping,
-			unsigned long idx, int shared)
+static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma,
+			unsigned long addr, struct address_space *mapping,
+			unsigned long idx, int shared)
 {
 	struct page *page;
 	int err;
@@ -378,7 +382,7 @@ retry:
 
 	if (hugetlb_get_quota(mapping))
 		goto out;
-	page = alloc_huge_page();
+	page = alloc_huge_page(vma, addr);
 	if (!page) {
 		hugetlb_put_quota(mapping);
 		goto out;
@@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	page_cache_get(old_page);
-	new_page = alloc_huge_page();
+	new_page = alloc_huge_page(vma, address);
 
 	if (!new_page) {
 		page_cache_release(old_page);
@@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Use page lock to guard against racing truncation
 	 * before we get page_table_lock.
 	 */
-	page = find_or_alloc_huge_page(mapping, idx,
+	page = find_or_alloc_huge_page(vma, address, mapping, idx,
 			vma->vm_flags & VM_SHARED);
 	if (!page)
 		goto out;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402cc9c9a..45c51ac63443 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
 	return nid;
 }
 
+/* Determine a node number for interleave */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+		struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+	if (vma) {
+		unsigned long off;
+
+		off = vma->vm_pgoff;
+		off += (addr - vma->vm_start) >> shift;
+		return offset_il_node(pol, vma, off);
+	} else
+		return interleave_nodes(pol);
+}
+
+/* Return a zonelist suitable for a huge page allocation. */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct mempolicy *pol = get_vma_policy(current, vma, addr);
+
+	if (pol->policy == MPOL_INTERLEAVE) {
+		unsigned nid;
+
+		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+		return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+	}
+	return zonelist_policy(GFP_HIGHUSER, pol);
+}
+
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 
 	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
 		unsigned nid;
-		if (vma) {
-			unsigned long off;
-			off = vma->vm_pgoff;
-			off += (addr - vma->vm_start) >> PAGE_SHIFT;
-			nid = offset_il_node(pol, vma, off);
-		} else {
-			/* fall back to process interleaving */
-			nid = interleave_nodes(pol);
-		}
+
+		nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
 		return alloc_page_interleave(gfp, 0, nid);
 	}
 	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
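
A quick worked example of the index computation that interleave_nid()
now centralizes. The names mirror the patch; the values are hypothetical,
chosen only for illustration:

	unsigned long vm_start = 0x40000000UL;	/* start of the mapping */
	unsigned long vm_pgoff = 0;		/* offset into the file */
	unsigned long addr = 0x40800000UL;	/* faulting address */
	int shift = 22;				/* HPAGE_SHIFT, e.g. 4MB huge pages */

	unsigned long off = vm_pgoff + ((addr - vm_start) >> shift);
	/* off == 2: the third huge page of the mapping. offset_il_node()
	   folds this index over the policy's node mask, so consecutive
	   huge pages land on consecutive allowed nodes. */

Since alloc_page_vma() now calls the same helper with PAGE_SHIFT, base
pages and huge pages share one interleave computation instead of two
open-coded copies.
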