[PATCH] Add NUMA policy support for huge pages.

The huge_zonelist() function in the memory policy layer provides a list of zones ordered by NUMA distance. The hugetlb layer will walk that list looking for a zone that has available huge pages but is also in the nodeset of the current cpuset.

This patch does not contain the folding of find_or_alloc_huge_page() that was controversial in the earlier discussion.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@muc.de>
Acked-by: William Lee Irwin III <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Christoph Lameter <clameter@engr.sgi.com> 2006-01-06 03:10:46 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-01-06 11:33:23 -0500
commit: 5da7ca86078964cbfe6c83efc1205904587706fe (patch)
tree: a64a7824e90b42d6fdd71e6cb652362beb8983a1
parent: 96df9333c94d7d5aeceb21f6c5e7ae8ff34753cf (diff)
4 files changed, 54 insertions, 21 deletions
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1056717ee501..68d82ad6b17c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -22,7 +22,7 @@ int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
 int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
-struct page *alloc_huge_page(void);
+struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
 void free_huge_page(struct page *);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, int write_access);
@@ -97,7 +97,7 @@ static inline unsigned long hugetlb_total_pages(void)
 #define is_hugepage_only_range(mm, addr, len)   0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
                                                do { } while (0)
-#define alloc_huge_page()                       ({ NULL; })
+#define alloc_huge_page(vma, addr)              ({ NULL; })
 #define free_huge_page(p)                       ({ (void)(p); BUG(); })
 #define hugetlb_fault(mm, vma, addr, write)     ({ BUG(); 0; })
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 8b67cf837ca9..817db6427113 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -156,6 +156,8 @@ extern void numa_default_policy(void);
 extern void numa_policy_init(void);
 extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
 extern struct mempolicy default_policy;
+extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
+                unsigned long addr);
 #else
@@ -232,6 +234,12 @@ static inline void numa_policy_rebind(const nodemask_t *old,
 {
 }
+static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
+                unsigned long addr)
+{
+        return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+}
 #endif /* CONFIG_NUMA */
 #endif /* __KERNEL__ */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e93bd63462f0..eb405565949d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
 #include <linux/highmem.h>
 #include <linux/nodemask.h>
 #include <linux/pagemap.h>
+#include <linux/mempolicy.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page)
        free_huge_pages_node[nid]++;
 }
-static struct page *dequeue_huge_page(void)
+static struct page *dequeue_huge_page(struct vm_area_struct *vma,
+                                unsigned long address)
 {
        int nid = numa_node_id();
        struct page *page = NULL;
-        struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;
+        struct zonelist *zonelist = huge_zonelist(vma, address);
        struct zone **z;
        for (z = zonelist->zones; *z; z++) {
@@ -87,13 +90,13 @@ void free_huge_page(struct page *page)
        spin_unlock(&hugetlb_lock);
 }
-struct page *alloc_huge_page(void)
+struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
        struct page *page;
        int i;
        spin_lock(&hugetlb_lock);
-        page = dequeue_huge_page();
+        page = dequeue_huge_page(vma, addr);
        if (!page) {
                spin_unlock(&hugetlb_lock);
                return NULL;
@@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
        spin_lock(&hugetlb_lock);
        try_to_free_low(count);
        while (count < nr_huge_pages) {
-                struct page *page = dequeue_huge_page();
+                struct page *page = dequeue_huge_page(NULL, 0);
                if (!page)
                        break;
                update_and_free_page(page);
@@ -365,8 +368,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
        flush_tlb_range(vma, start, end);
 }
-static struct page *find_or_alloc_huge_page(struct address_space *mapping,
+static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma,
-                                unsigned long idx, int shared)
+                        unsigned long addr, struct address_space *mapping,
+                        unsigned long idx, int shared)
 {
        struct page *page;
        int err;
@@ -378,7 +382,7 @@ retry:
        if (hugetlb_get_quota(mapping))
                goto out;
-        page = alloc_huge_page();
+        page = alloc_huge_page(vma, addr);
        if (!page) {
                hugetlb_put_quota(mapping);
                goto out;
@@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
        }
        page_cache_get(old_page);
-        new_page = alloc_huge_page();
+        new_page = alloc_huge_page(vma, address);
        if (!new_page) {
                page_cache_release(old_page);
@@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
         * Use page lock to guard against racing truncation
         * before we get page_table_lock.
         */
-        page = find_or_alloc_huge_page(mapping, idx,
+        page = find_or_alloc_huge_page(vma, address, mapping, idx,
                        vma->vm_flags & VM_SHARED);
        if (!page)
                goto out;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402cc9c9a..45c51ac63443 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
        return nid;
 }
+/* Determine a node number for interleave */
+static inline unsigned interleave_nid(struct mempolicy *pol,
+                 struct vm_area_struct *vma, unsigned long addr, int shift)
+{
+        if (vma) {
+                unsigned long off;
+                off = vma->vm_pgoff;
+                off += (addr - vma->vm_start) >> shift;
+                return offset_il_node(pol, vma, off);
+        } else
+                return interleave_nodes(pol);
+}
+/* Return a zonelist suitable for a huge page allocation. */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+{
+        struct mempolicy *pol = get_vma_policy(current, vma, addr);
+        if (pol->policy == MPOL_INTERLEAVE) {
+                unsigned nid;
+                nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
+                return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+        }
+        return zonelist_policy(GFP_HIGHUSER, pol);
+}
 /* Allocate a page in interleaved policy.
   Own path because it needs to do special accounting. */
 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
        if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
                unsigned nid;
-                if (vma) {
-                        unsigned long off;
+                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
-                        off = vma->vm_pgoff;
-                        off += (addr - vma->vm_start) >> PAGE_SHIFT;
-                        nid = offset_il_node(pol, vma, off);
-                } else {
-                        /* fall back to process interleaving */
-                        nid = interleave_nodes(pol);
-                }
                return alloc_page_interleave(gfp, 0, nid);
        }
        return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
author	Christoph Lameter <clameter@engr.sgi.com>	2006-01-06 03:10:46 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-01-06 11:33:23 -0500
commit	5da7ca86078964cbfe6c83efc1205904587706fe (patch)
tree	a64a7824e90b42d6fdd71e6cb652362beb8983a1
parent	96df9333c94d7d5aeceb21f6c5e7ae8ff34753cf (diff)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 1056717ee501..68d82ad6b17c 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -22,7 +22,7 @@ int hugetlb_report_meminfo(char *);
22	int hugetlb_report_node_meminfo(int, char *);	22	int hugetlb_report_node_meminfo(int, char *);
23	int is_hugepage_mem_enough(size_t);	23	int is_hugepage_mem_enough(size_t);
24	unsigned long hugetlb_total_pages(void);	24	unsigned long hugetlb_total_pages(void);
25	struct page *alloc_huge_page(void);	25	struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
26	void free_huge_page(struct page *);	26	void free_huge_page(struct page *);
27	int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,	27	int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
28	unsigned long address, int write_access);	28	unsigned long address, int write_access);
@@ -97,7 +97,7 @@ static inline unsigned long hugetlb_total_pages(void)
97	#define is_hugepage_only_range(mm, addr, len) 0	97	#define is_hugepage_only_range(mm, addr, len) 0
98	#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \	98	#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
99	do { } while (0)	99	do { } while (0)
100	#define alloc_huge_page() ({ NULL; })	100	#define alloc_huge_page(vma, addr) ({ NULL; })
101	#define free_huge_page(p) ({ (void)(p); BUG(); })	101	#define free_huge_page(p) ({ (void)(p); BUG(); })
102	#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; })	102	#define hugetlb_fault(mm, vma, addr, write) ({ BUG(); 0; })
103		103


diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 8b67cf837ca9..817db6427113 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -156,6 +156,8 @@ extern void numa_default_policy(void);
156	extern void numa_policy_init(void);	156	extern void numa_policy_init(void);
157	extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);	157	extern void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new);
158	extern struct mempolicy default_policy;	158	extern struct mempolicy default_policy;
		159	extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
		160	unsigned long addr);
159		161
160	#else	162	#else
161		163
@@ -232,6 +234,12 @@ static inline void numa_policy_rebind(const nodemask_t *old,
232	{	234	{
233	}	235	}
234		236
		237	static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
		238	unsigned long addr)
		239	{
		240	return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
		241	}
		242
235	#endif /* CONFIG_NUMA */	243	#endif /* CONFIG_NUMA */
236	#endif /* __KERNEL__ */	244	#endif /* __KERNEL__ */
237		245


diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e93bd63462f0..eb405565949d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -11,6 +11,8 @@
11	#include <linux/highmem.h>	11	#include <linux/highmem.h>
12	#include <linux/nodemask.h>	12	#include <linux/nodemask.h>
13	#include <linux/pagemap.h>	13	#include <linux/pagemap.h>
		14	#include <linux/mempolicy.h>
		15
14	#include <asm/page.h>	16	#include <asm/page.h>
15	#include <asm/pgtable.h>	17	#include <asm/pgtable.h>
16		18
@@ -36,11 +38,12 @@ static void enqueue_huge_page(struct page *page)
36	free_huge_pages_node[nid]++;	38	free_huge_pages_node[nid]++;
37	}	39	}
38		40
39	static struct page *dequeue_huge_page(void)	41	static struct page *dequeue_huge_page(struct vm_area_struct *vma,
		42	unsigned long address)
40	{	43	{
41	int nid = numa_node_id();	44	int nid = numa_node_id();
42	struct page *page = NULL;	45	struct page *page = NULL;
43	struct zonelist *zonelist = NODE_DATA(nid)->node_zonelists;	46	struct zonelist *zonelist = huge_zonelist(vma, address);
44	struct zone **z;	47	struct zone **z;
45		48
46	for (z = zonelist->zones; *z; z++) {	49	for (z = zonelist->zones; *z; z++) {
@@ -87,13 +90,13 @@ void free_huge_page(struct page *page)
87	spin_unlock(&hugetlb_lock);	90	spin_unlock(&hugetlb_lock);
88	}	91	}
89		92
90	struct page *alloc_huge_page(void)	93	struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
91	{	94	{
92	struct page *page;	95	struct page *page;
93	int i;	96	int i;
94		97
95	spin_lock(&hugetlb_lock);	98	spin_lock(&hugetlb_lock);
96	page = dequeue_huge_page();	99	page = dequeue_huge_page(vma, addr);
97	if (!page) {	100	if (!page) {
98	spin_unlock(&hugetlb_lock);	101	spin_unlock(&hugetlb_lock);
99	return NULL;	102	return NULL;
@@ -196,7 +199,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
196	spin_lock(&hugetlb_lock);	199	spin_lock(&hugetlb_lock);
197	try_to_free_low(count);	200	try_to_free_low(count);
198	while (count < nr_huge_pages) {	201	while (count < nr_huge_pages) {
199	struct page *page = dequeue_huge_page();	202	struct page *page = dequeue_huge_page(NULL, 0);
200	if (!page)	203	if (!page)
201	break;	204	break;
202	update_and_free_page(page);	205	update_and_free_page(page);
@@ -365,8 +368,9 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
365	flush_tlb_range(vma, start, end);	368	flush_tlb_range(vma, start, end);
366	}	369	}
367		370
368	static struct page *find_or_alloc_huge_page(struct address_space *mapping,	371	static struct page *find_or_alloc_huge_page(struct vm_area_struct *vma,
369	unsigned long idx, int shared)	372	unsigned long addr, struct address_space *mapping,
		373	unsigned long idx, int shared)
370	{	374	{
371	struct page *page;	375	struct page *page;
372	int err;	376	int err;
@@ -378,7 +382,7 @@ retry:
378		382
379	if (hugetlb_get_quota(mapping))	383	if (hugetlb_get_quota(mapping))
380	goto out;	384	goto out;
381	page = alloc_huge_page();	385	page = alloc_huge_page(vma, addr);
382	if (!page) {	386	if (!page) {
383	hugetlb_put_quota(mapping);	387	hugetlb_put_quota(mapping);
384	goto out;	388	goto out;
@@ -418,7 +422,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
418	}	422	}
419		423
420	page_cache_get(old_page);	424	page_cache_get(old_page);
421	new_page = alloc_huge_page();	425	new_page = alloc_huge_page(vma, address);
422		426
423	if (!new_page) {	427	if (!new_page) {
424	page_cache_release(old_page);	428	page_cache_release(old_page);
@@ -467,7 +471,7 @@ int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
467	* Use page lock to guard against racing truncation	471	* Use page lock to guard against racing truncation
468	* before we get page_table_lock.	472	* before we get page_table_lock.
469	*/	473	*/
470	page = find_or_alloc_huge_page(mapping, idx,	474	page = find_or_alloc_huge_page(vma, address, mapping, idx,
471	vma->vm_flags & VM_SHARED);	475	vma->vm_flags & VM_SHARED);
472	if (!page)	476	if (!page)
473	goto out;	477	goto out;


diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 72f402cc9c9a..45c51ac63443 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -785,6 +785,34 @@ static unsigned offset_il_node(struct mempolicy *pol,
785	return nid;	785	return nid;
786	}	786	}
787		787
		788	/* Determine a node number for interleave */
		789	static inline unsigned interleave_nid(struct mempolicy *pol,
		790	struct vm_area_struct *vma, unsigned long addr, int shift)
		791	{
		792	if (vma) {
		793	unsigned long off;
		794
		795	off = vma->vm_pgoff;
		796	off += (addr - vma->vm_start) >> shift;
		797	return offset_il_node(pol, vma, off);
		798	} else
		799	return interleave_nodes(pol);
		800	}
		801
		802	/* Return a zonelist suitable for a huge page allocation. */
		803	struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
		804	{
		805	struct mempolicy *pol = get_vma_policy(current, vma, addr);
		806
		807	if (pol->policy == MPOL_INTERLEAVE) {
		808	unsigned nid;
		809
		810	nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
		811	return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
		812	}
		813	return zonelist_policy(GFP_HIGHUSER, pol);
		814	}
		815
788	/* Allocate a page in interleaved policy.	816	/* Allocate a page in interleaved policy.
789	Own path because it needs to do special accounting. */	817	Own path because it needs to do special accounting. */
790	static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,	818	static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
@@ -833,15 +861,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
833		861
834	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {	862	if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
835	unsigned nid;	863	unsigned nid;
836	if (vma) {	864
837	unsigned long off;	865	nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
838	off = vma->vm_pgoff;
839	off += (addr - vma->vm_start) >> PAGE_SHIFT;
840	nid = offset_il_node(pol, vma, off);
841	} else {
842	/* fall back to process interleaving */
843	nid = interleave_nodes(pol);
844	}
845	return alloc_page_interleave(gfp, 0, nid);	866	return alloc_page_interleave(gfp, 0, nid);
846	}	867	}
847	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));	868	return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));