author     Mel Gorman <mel@csn.ul.ie>                        2008-04-28 05:12:18 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2008-04-28 11:58:19 -0400
commit     19770b32609b6bf97a3dece2529089494cbfc549
tree       3b5922d1b20aabdf929bde9309f323841717747a
parent     dd1a239f6f2d4d3eedd318583ec319aa145b324c
mm: filter based on a nodemask as well as a gfp_mask
The MPOL_BIND policy creates a zonelist that is used for allocations
controlled by that mempolicy.  As the per-node zonelist is already being
filtered based on a zone id, this patch adds a version of __alloc_pages()
that takes a nodemask for further filtering.  This eliminates the need
for MPOL_BIND to create a custom zonelist.

A positive benefit of this is that allocations using MPOL_BIND now use the
local node's distance-ordered zonelist instead of a custom node-id-ordered
zonelist.  I.e., pages will be allocated from the closest allowed node with
available memory.

[Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
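To make the new calling convention concrete, a minimal sketch follows (kernel
context, not a standalone program; the wrapper name alloc_page_on_allowed_nodes
is made up for illustration). It allocates a page restricted to a nodemask while
still walking the local node's distance-ordered zonelist, which is the mechanism
MPOL_BIND now relies on instead of building a custom zonelist:

    #include <linux/gfp.h>
    #include <linux/mmzone.h>
    #include <linux/nodemask.h>
    #include <linux/topology.h>

    /*
     * Allocate one page restricted to the nodes in 'allowed'.  The zonelist is
     * the local node's distance-ordered one; the nodemask does the filtering.
     */
    static struct page *alloc_page_on_allowed_nodes(gfp_t gfp, nodemask_t *allowed)
    {
            struct zonelist *zl = node_zonelist(numa_node_id(), gfp);

            return __alloc_pages_nodemask(gfp, 0, zl, allowed);
    }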
-rw-r--r--  Documentation/vm/numa_memory_policy.txt |  11
-rw-r--r--  fs/buffer.c                             |   9
-rw-r--r--  include/linux/cpuset.h                  |   4
-rw-r--r--  include/linux/gfp.h                     |   4
-rw-r--r--  include/linux/mempolicy.h               |  19
-rw-r--r--  include/linux/mmzone.h                  |  80
-rw-r--r--  kernel/cpuset.c                         |  18
-rw-r--r--  mm/hugetlb.c                            |   6
-rw-r--r--  mm/mempolicy.c                          | 184
-rw-r--r--  mm/mmzone.c                             |  30
-rw-r--r--  mm/page_alloc.c                         |  50
11 files changed, 224 insertions(+), 191 deletions(-)
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index dd4986497996..1278e685d650 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -182,14 +182,9 @@ Components of Memory Policies
182 The Default mode does not use the optional set of nodes. 182 The Default mode does not use the optional set of nodes.
183 183
184 MPOL_BIND: This mode specifies that memory must come from the 184 MPOL_BIND: This mode specifies that memory must come from the
185 set of nodes specified by the policy. 185 set of nodes specified by the policy. Memory will be allocated from
186 186 the node in the set with sufficient free memory that is closest to
187 The memory policy APIs do not specify an order in which the nodes 187 the node where the allocation takes place.
188 will be searched. However, unlike "local allocation", the Bind
189 policy does not consider the distance between the nodes. Rather,
190 allocations will fallback to the nodes specified by the policy in
191 order of numeric node id. Like everything in Linux, this is subject
192 to change.
193 188
194 MPOL_PREFERRED: This mode specifies that the allocation should be 189 MPOL_PREFERRED: This mode specifies that the allocation should be
195 attempted from the single node specified in the policy. If that 190 attempted from the single node specified in the policy. If that
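From userspace, the behaviour described above is reached through the
set_mempolicy() syscall. A small illustrative program follows (it assumes
libnuma's <numaif.h> wrapper, linking with -lnuma, and a NUMA machine on which
nodes 0 and 1 exist; the node numbers are only an example):

    #include <numaif.h>     /* set_mempolicy(), MPOL_BIND -- link with -lnuma */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            /* Bind all future allocations of this task to nodes 0 and 1. */
            unsigned long nodemask = (1UL << 0) | (1UL << 1);

            if (set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask) * 8) != 0) {
                    perror("set_mempolicy");
                    return 1;
            }

            /* Fault in some memory: pages now come from the closest bound node
             * with free memory, rather than in numeric node-id order. */
            size_t sz = 16UL << 20;
            char *buf = malloc(sz);
            if (buf)
                    memset(buf, 0, sz);

            free(buf);
            return 0;
    }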
diff --git a/fs/buffer.c b/fs/buffer.c
index ac84cd13075d..7d51e649b19a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -360,16 +360,17 @@ void invalidate_bdev(struct block_device *bdev)
360 */ 360 */
361static void free_more_memory(void) 361static void free_more_memory(void)
362{ 362{
363 struct zoneref *zrefs; 363 struct zone *zone;
364 int nid; 364 int nid;
365 365
366 wakeup_pdflush(1024); 366 wakeup_pdflush(1024);
367 yield(); 367 yield();
368 368
369 for_each_online_node(nid) { 369 for_each_online_node(nid) {
370 zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS), 370 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
371 gfp_zone(GFP_NOFS)); 371 gfp_zone(GFP_NOFS), NULL,
372 if (zrefs->zone) 372 &zone);
373 if (zone)
373 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, 374 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
374 GFP_NOFS); 375 GFP_NOFS);
375 } 376 }
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 726761e24003..038578362b47 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -26,7 +26,7 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
26#define cpuset_current_mems_allowed (current->mems_allowed) 26#define cpuset_current_mems_allowed (current->mems_allowed)
27void cpuset_init_current_mems_allowed(void); 27void cpuset_init_current_mems_allowed(void);
28void cpuset_update_task_memory_state(void); 28void cpuset_update_task_memory_state(void);
29int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl); 29int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
30 30
31extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask); 31extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
32extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask); 32extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
@@ -103,7 +103,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
103static inline void cpuset_init_current_mems_allowed(void) {} 103static inline void cpuset_init_current_mems_allowed(void) {}
104static inline void cpuset_update_task_memory_state(void) {} 104static inline void cpuset_update_task_memory_state(void) {}
105 105
106static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) 106static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
107{ 107{
108 return 1; 108 return 1;
109} 109}
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index e1c6064cb6c7..898aa9d5b6c2 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -182,6 +182,10 @@ static inline void arch_alloc_page(struct page *page, int order) { }
182 182
183extern struct page *__alloc_pages(gfp_t, unsigned int, struct zonelist *); 183extern struct page *__alloc_pages(gfp_t, unsigned int, struct zonelist *);
184 184
185extern struct page *
186__alloc_pages_nodemask(gfp_t, unsigned int,
187 struct zonelist *, nodemask_t *nodemask);
188
185static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, 189static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
186 unsigned int order) 190 unsigned int order)
187{ 191{
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 69160dc32d48..b8b3da7a3315 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -54,19 +54,20 @@ struct mm_struct;
54 * mmap_sem. 54 * mmap_sem.
55 * 55 *
56 * Freeing policy: 56 * Freeing policy:
57 * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd. 57 * Mempolicy objects are reference counted. A mempolicy will be freed when
58 * All other policies don't have any external state. mpol_free() handles this. 58 * mpol_free() decrements the reference count to zero.
59 * 59 *
60 * Copying policy objects: 60 * Copying policy objects:
61 * For MPOL_BIND the zonelist must be always duplicated. mpol_clone() does this. 61 * mpol_copy() allocates a new mempolicy and copies the specified mempolicy
62 * to the new storage. The reference count of the new object is initialized
63 * to 1, representing the caller of mpol_copy().
62 */ 64 */
63struct mempolicy { 65struct mempolicy {
64 atomic_t refcnt; 66 atomic_t refcnt;
65 short policy; /* See MPOL_* above */ 67 short policy; /* See MPOL_* above */
66 union { 68 union {
67 struct zonelist *zonelist; /* bind */
68 short preferred_node; /* preferred */ 69 short preferred_node; /* preferred */
69 nodemask_t nodes; /* interleave */ 70 nodemask_t nodes; /* interleave/bind */
70 /* undefined for default */ 71 /* undefined for default */
71 } v; 72 } v;
72 nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */ 73 nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
@@ -151,7 +152,8 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p);
151 152
152extern struct mempolicy default_policy; 153extern struct mempolicy default_policy;
153extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, 154extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
154 unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol); 155 unsigned long addr, gfp_t gfp_flags,
156 struct mempolicy **mpol, nodemask_t **nodemask);
155extern unsigned slab_node(struct mempolicy *policy); 157extern unsigned slab_node(struct mempolicy *policy);
156 158
157extern enum zone_type policy_zone; 159extern enum zone_type policy_zone;
@@ -239,8 +241,11 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p)
239} 241}
240 242
241static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, 243static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
242 unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol) 244 unsigned long addr, gfp_t gfp_flags,
245 struct mempolicy **mpol, nodemask_t **nodemask)
243{ 246{
247 *mpol = NULL;
248 *nodemask = NULL;
244 return node_zonelist(0, gfp_flags); 249 return node_zonelist(0, gfp_flags);
245} 250}
246 251
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d34b4c290017..498d6ceff2f4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -749,36 +749,60 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
749#endif /* CONFIG_NUMA */ 749#endif /* CONFIG_NUMA */
750} 750}
751 751
752static inline void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) 752/**
753{ 753 * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
754 zoneref->zone = zone; 754 * @z - The cursor used as a starting point for the search
755 zoneref->zone_idx = zone_idx(zone); 755 * @highest_zoneidx - The zone index of the highest zone to return
756} 756 * @nodes - An optional nodemask to filter the zonelist with
757 * @zone - The first suitable zone found is returned via this parameter
758 *
759 * This function returns the next zone at or below a given zone index that is
760 * within the allowed nodemask using a cursor as the starting point for the
761 * search. The zoneref returned is a cursor that is used as the next starting
762 * point for future calls to next_zones_zonelist().
763 */
764struct zoneref *next_zones_zonelist(struct zoneref *z,
765 enum zone_type highest_zoneidx,
766 nodemask_t *nodes,
767 struct zone **zone);
757 768
758/* Returns the first zone at or below highest_zoneidx in a zonelist */ 769/**
770 * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
771 * @zonelist - The zonelist to search for a suitable zone
772 * @highest_zoneidx - The zone index of the highest zone to return
773 * @nodes - An optional nodemask to filter the zonelist with
774 * @zone - The first suitable zone found is returned via this parameter
775 *
776 * This function returns the first zone at or below a given zone index that is
777 * within the allowed nodemask. The zoneref returned is a cursor that can be
778 * used to iterate the zonelist with next_zones_zonelist. The cursor should
779 * not be used by the caller as it does not match the value of the zone
780 * returned.
781 */
759static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, 782static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
760 enum zone_type highest_zoneidx) 783 enum zone_type highest_zoneidx,
784 nodemask_t *nodes,
785 struct zone **zone)
761{ 786{
762 struct zoneref *z; 787 return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
763 788 zone);
764 /* Find the first suitable zone to use for the allocation */
765 z = zonelist->_zonerefs;
766 while (zonelist_zone_idx(z) > highest_zoneidx)
767 z++;
768
769 return z;
770} 789}
771 790
772/* Returns the next zone at or below highest_zoneidx in a zonelist */ 791/**
773static inline struct zoneref *next_zones_zonelist(struct zoneref *z, 792 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
774 enum zone_type highest_zoneidx) 793 * @zone - The current zone in the iterator
775{ 794 * @z - The current pointer within zonelist->zones being iterated
776 /* Find the next suitable zone to use for the allocation */ 795 * @zlist - The zonelist being iterated
777 while (zonelist_zone_idx(z) > highest_zoneidx) 796 * @highidx - The zone index of the highest zone to return
778 z++; 797 * @nodemask - Nodemask allowed by the allocator
779 798 *
780 return z; 799 * This iterator iterates though all zones at or below a given zone index and
781} 800 * within a given nodemask
801 */
802#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
803 for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
804 zone; \
805 z = next_zones_zonelist(z, highidx, nodemask, &zone)) \
782 806
783/** 807/**
784 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index 808 * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
@@ -790,11 +814,7 @@ static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
790 * This iterator iterates though all zones at or below a given zone index. 814 * This iterator iterates though all zones at or below a given zone index.
791 */ 815 */
792#define for_each_zone_zonelist(zone, z, zlist, highidx) \ 816#define for_each_zone_zonelist(zone, z, zlist, highidx) \
793 for (z = first_zones_zonelist(zlist, highidx), \ 817 for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
794 zone = zonelist_zone(z++); \
795 zone; \
796 z = next_zones_zonelist(z, highidx), \
797 zone = zonelist_zone(z++))
798 818
799#ifdef CONFIG_SPARSEMEM 819#ifdef CONFIG_SPARSEMEM
800#include <asm/sparsemem.h> 820#include <asm/sparsemem.h>
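As a usage sketch for the iterator added above (kernel context; the function
count_allowed_zones and the idea of counting zones are purely illustrative), a
walk over the zones usable for a GFP_KERNEL allocation restricted to an allowed
nodemask might look like this:

    #include <linux/gfp.h>
    #include <linux/mmzone.h>
    #include <linux/nodemask.h>

    /* Count the zones usable for a GFP_KERNEL allocation limited to 'allowed'. */
    static int count_allowed_zones(struct zonelist *zonelist, nodemask_t *allowed)
    {
            struct zoneref *z;
            struct zone *zone;
            int nr = 0;

            /* 'zone' is set on each pass; a NULL zone terminates the loop. */
            for_each_zone_zonelist_nodemask(zone, z, zonelist,
                                            gfp_zone(GFP_KERNEL), allowed)
                    nr++;

            return nr;
    }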
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a220b13cbfaf..c9923e3c9a3b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1958,22 +1958,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
1958} 1958}
1959 1959
1960/** 1960/**
1961 * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed 1961 * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
1962 * @zl: the zonelist to be checked 1962 * @nodemask: the nodemask to be checked
1963 * 1963 *
1964 * Are any of the nodes on zonelist zl allowed in current->mems_allowed? 1964 * Are any of the nodes in the nodemask allowed in current->mems_allowed?
1965 */ 1965 */
1966int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) 1966int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
1967{ 1967{
1968 int i; 1968 return nodes_intersects(*nodemask, current->mems_allowed);
1969
1970 for (i = 0; zl->_zonerefs[i].zone; i++) {
1971 int nid = zonelist_node_idx(&zl->_zonerefs[i]);
1972
1973 if (node_isset(nid, current->mems_allowed))
1974 return 1;
1975 }
1976 return 0;
1977} 1969}
1978 1970
1979/* 1971/*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4bced0d705ca..3737d82f5225 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,12 +95,14 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
95 int nid; 95 int nid;
96 struct page *page = NULL; 96 struct page *page = NULL;
97 struct mempolicy *mpol; 97 struct mempolicy *mpol;
98 nodemask_t *nodemask;
98 struct zonelist *zonelist = huge_zonelist(vma, address, 99 struct zonelist *zonelist = huge_zonelist(vma, address,
99 htlb_alloc_mask, &mpol); 100 htlb_alloc_mask, &mpol, &nodemask);
100 struct zone *zone; 101 struct zone *zone;
101 struct zoneref *z; 102 struct zoneref *z;
102 103
103 for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) { 104 for_each_zone_zonelist_nodemask(zone, z, zonelist,
105 MAX_NR_ZONES - 1, nodemask) {
104 nid = zone_to_nid(zone); 106 nid = zone_to_nid(zone);
105 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && 107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
106 !list_empty(&hugepage_freelists[nid])) { 108 !list_empty(&hugepage_freelists[nid])) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 90193a2a915b..acb5ee3587c3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -163,42 +163,25 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
163 return 0; 163 return 0;
164} 164}
165 165
166/* Generate a custom zonelist for the BIND policy. */ 166/* Check that the nodemask contains at least one populated zone */
167static struct zonelist *bind_zonelist(nodemask_t *nodes) 167static int is_valid_nodemask(nodemask_t *nodemask)
168{ 168{
169 struct zonelist *zl; 169 int nd, k;
170 int num, max, nd;
171 enum zone_type k;
172 170
173 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); 171 /* Check that there is something useful in this mask */
174 max++; /* space for zlcache_ptr (see mmzone.h) */ 172 k = policy_zone;
175 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); 173
176 if (!zl) 174 for_each_node_mask(nd, *nodemask) {
177 return ERR_PTR(-ENOMEM); 175 struct zone *z;
178 zl->zlcache_ptr = NULL; 176
179 num = 0; 177 for (k = 0; k <= policy_zone; k++) {
180 /* First put in the highest zones from all nodes, then all the next 178 z = &NODE_DATA(nd)->node_zones[k];
181 lower zones etc. Avoid empty zones because the memory allocator 179 if (z->present_pages > 0)
182 doesn't like them. If you implement node hot removal you 180 return 1;
183 have to fix that. */
184 k = MAX_NR_ZONES - 1;
185 while (1) {
186 for_each_node_mask(nd, *nodes) {
187 struct zone *z = &NODE_DATA(nd)->node_zones[k];
188 if (z->present_pages > 0)
189 zoneref_set_zone(z, &zl->_zonerefs[num++]);
190 } 181 }
191 if (k == 0)
192 break;
193 k--;
194 }
195 if (num == 0) {
196 kfree(zl);
197 return ERR_PTR(-EINVAL);
198 } 182 }
199 zl->_zonerefs[num].zone = NULL; 183
200 zl->_zonerefs[num].zone_idx = 0; 184 return 0;
201 return zl;
202} 185}
203 186
204/* Create a new policy */ 187/* Create a new policy */
@@ -229,12 +212,11 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
229 policy->v.preferred_node = -1; 212 policy->v.preferred_node = -1;
230 break; 213 break;
231 case MPOL_BIND: 214 case MPOL_BIND:
232 policy->v.zonelist = bind_zonelist(nodes); 215 if (!is_valid_nodemask(nodes)) {
233 if (IS_ERR(policy->v.zonelist)) {
234 void *error_code = policy->v.zonelist;
235 kmem_cache_free(policy_cache, policy); 216 kmem_cache_free(policy_cache, policy);
236 return error_code; 217 return ERR_PTR(-EINVAL);
237 } 218 }
219 policy->v.nodes = *nodes;
238 break; 220 break;
239 } 221 }
240 policy->policy = mode; 222 policy->policy = mode;
@@ -500,19 +482,12 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes)
500/* Fill a zone bitmap for a policy */ 482/* Fill a zone bitmap for a policy */
501static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) 483static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
502{ 484{
503 int i;
504
505 nodes_clear(*nodes); 485 nodes_clear(*nodes);
506 switch (p->policy) { 486 switch (p->policy) {
507 case MPOL_BIND:
508 for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) {
509 struct zoneref *zref;
510 zref = &p->v.zonelist->_zonerefs[i];
511 node_set(zonelist_node_idx(zref), *nodes);
512 }
513 break;
514 case MPOL_DEFAULT: 487 case MPOL_DEFAULT:
515 break; 488 break;
489 case MPOL_BIND:
490 /* Fall through */
516 case MPOL_INTERLEAVE: 491 case MPOL_INTERLEAVE:
517 *nodes = p->v.nodes; 492 *nodes = p->v.nodes;
518 break; 493 break;
@@ -1160,6 +1135,18 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
1160 return pol; 1135 return pol;
1161} 1136}
1162 1137
1138/* Return a nodemask representing a mempolicy */
1139static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
1140{
1141 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1142 if (unlikely(policy->policy == MPOL_BIND) &&
1143 gfp_zone(gfp) >= policy_zone &&
1144 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1145 return &policy->v.nodes;
1146
1147 return NULL;
1148}
1149
1163/* Return a zonelist representing a mempolicy */ 1150/* Return a zonelist representing a mempolicy */
1164static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) 1151static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1165{ 1152{
@@ -1172,12 +1159,17 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1172 nd = numa_node_id(); 1159 nd = numa_node_id();
1173 break; 1160 break;
1174 case MPOL_BIND: 1161 case MPOL_BIND:
1175 /* Lower zones don't get a policy applied */ 1162 /*
1176 /* Careful: current->mems_allowed might have moved */ 1163 * Normally, MPOL_BIND allocations node-local are node-local
1177 if (gfp_zone(gfp) >= policy_zone) 1164 * within the allowed nodemask. However, if __GFP_THISNODE is
1178 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) 1165 * set and the current node is part of the mask, we use the
1179 return policy->v.zonelist; 1166 * the zonelist for the first node in the mask instead.
1180 /*FALL THROUGH*/ 1167 */
1168 nd = numa_node_id();
1169 if (unlikely(gfp & __GFP_THISNODE) &&
1170 unlikely(!node_isset(nd, policy->v.nodes)))
1171 nd = first_node(policy->v.nodes);
1172 break;
1181 case MPOL_INTERLEAVE: /* should not happen */ 1173 case MPOL_INTERLEAVE: /* should not happen */
1182 case MPOL_DEFAULT: 1174 case MPOL_DEFAULT:
1183 nd = numa_node_id(); 1175 nd = numa_node_id();
@@ -1220,7 +1212,14 @@ unsigned slab_node(struct mempolicy *policy)
1220 * Follow bind policy behavior and start allocation at the 1212 * Follow bind policy behavior and start allocation at the
1221 * first node. 1213 * first node.
1222 */ 1214 */
1223 return zonelist_node_idx(policy->v.zonelist->_zonerefs); 1215 struct zonelist *zonelist;
1216 struct zone *zone;
1217 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1218 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1219 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1220 &policy->v.nodes,
1221 &zone);
1222 return zone->node;
1224 } 1223 }
1225 1224
1226 case MPOL_PREFERRED: 1225 case MPOL_PREFERRED:
@@ -1278,25 +1277,31 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1278 * @vma = virtual memory area whose policy is sought 1277 * @vma = virtual memory area whose policy is sought
1279 * @addr = address in @vma for shared policy lookup and interleave policy 1278 * @addr = address in @vma for shared policy lookup and interleave policy
1280 * @gfp_flags = for requested zone 1279 * @gfp_flags = for requested zone
1281 * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy 1280 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1281 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1282 * 1282 *
1283 * Returns a zonelist suitable for a huge page allocation. 1283 * Returns a zonelist suitable for a huge page allocation.
1284 * If the effective policy is 'BIND, returns pointer to policy's zonelist. 1284 * If the effective policy is 'BIND, returns pointer to local node's zonelist,
1285 * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
1285 * If it is also a policy for which get_vma_policy() returns an extra 1286 * If it is also a policy for which get_vma_policy() returns an extra
1286 * reference, we must hold that reference until after allocation. 1287 * reference, we must hold that reference until after the allocation.
1287 * In that case, return policy via @mpol so hugetlb allocation can drop 1288 * In that case, return policy via @mpol so hugetlb allocation can drop
1288 * the reference. For non-'BIND referenced policies, we can/do drop the 1289 * the reference. For non-'BIND referenced policies, we can/do drop the
1289 * reference here, so the caller doesn't need to know about the special case 1290 * reference here, so the caller doesn't need to know about the special case
1290 * for default and current task policy. 1291 * for default and current task policy.
1291 */ 1292 */
1292struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 1293struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1293 gfp_t gfp_flags, struct mempolicy **mpol) 1294 gfp_t gfp_flags, struct mempolicy **mpol,
1295 nodemask_t **nodemask)
1294{ 1296{
1295 struct mempolicy *pol = get_vma_policy(current, vma, addr); 1297 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1296 struct zonelist *zl; 1298 struct zonelist *zl;
1297 1299
1298 *mpol = NULL; /* probably no unref needed */ 1300 *mpol = NULL; /* probably no unref needed */
1299 if (pol->policy == MPOL_INTERLEAVE) { 1301 *nodemask = NULL; /* assume !MPOL_BIND */
1302 if (pol->policy == MPOL_BIND) {
1303 *nodemask = &pol->v.nodes;
1304 } else if (pol->policy == MPOL_INTERLEAVE) {
1300 unsigned nid; 1305 unsigned nid;
1301 1306
1302 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); 1307 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
@@ -1376,14 +1381,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1376 /* 1381 /*
1377 * slow path: ref counted policy -- shared or vma 1382 * slow path: ref counted policy -- shared or vma
1378 */ 1383 */
1379 struct page *page = __alloc_pages(gfp, 0, zl); 1384 struct page *page = __alloc_pages_nodemask(gfp, 0,
1385 zl, nodemask_policy(gfp, pol));
1380 __mpol_free(pol); 1386 __mpol_free(pol);
1381 return page; 1387 return page;
1382 } 1388 }
1383 /* 1389 /*
1384 * fast path: default or task policy 1390 * fast path: default or task policy
1385 */ 1391 */
1386 return __alloc_pages(gfp, 0, zl); 1392 return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
1387} 1393}
1388 1394
1389/** 1395/**
@@ -1415,7 +1421,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1415 pol = &default_policy; 1421 pol = &default_policy;
1416 if (pol->policy == MPOL_INTERLEAVE) 1422 if (pol->policy == MPOL_INTERLEAVE)
1417 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1423 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1418 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); 1424 return __alloc_pages_nodemask(gfp, order,
1425 zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
1419} 1426}
1420EXPORT_SYMBOL(alloc_pages_current); 1427EXPORT_SYMBOL(alloc_pages_current);
1421 1428
@@ -1440,14 +1447,6 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
1440 } 1447 }
1441 *new = *old; 1448 *new = *old;
1442 atomic_set(&new->refcnt, 1); 1449 atomic_set(&new->refcnt, 1);
1443 if (new->policy == MPOL_BIND) {
1444 int sz = ksize(old->v.zonelist);
1445 new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1446 if (!new->v.zonelist) {
1447 kmem_cache_free(policy_cache, new);
1448 return ERR_PTR(-ENOMEM);
1449 }
1450 }
1451 return new; 1450 return new;
1452} 1451}
1453 1452
@@ -1461,21 +1460,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1461 switch (a->policy) { 1460 switch (a->policy) {
1462 case MPOL_DEFAULT: 1461 case MPOL_DEFAULT:
1463 return 1; 1462 return 1;
1463 case MPOL_BIND:
1464 /* Fall through */
1464 case MPOL_INTERLEAVE: 1465 case MPOL_INTERLEAVE:
1465 return nodes_equal(a->v.nodes, b->v.nodes); 1466 return nodes_equal(a->v.nodes, b->v.nodes);
1466 case MPOL_PREFERRED: 1467 case MPOL_PREFERRED:
1467 return a->v.preferred_node == b->v.preferred_node; 1468 return a->v.preferred_node == b->v.preferred_node;
1468 case MPOL_BIND: {
1469 int i;
1470 for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) {
1471 struct zone *za, *zb;
1472 za = zonelist_zone(&a->v.zonelist->_zonerefs[i]);
1473 zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]);
1474 if (za != zb)
1475 return 0;
1476 }
1477 return b->v.zonelist->_zonerefs[i].zone == NULL;
1478 }
1479 default: 1469 default:
1480 BUG(); 1470 BUG();
1481 return 0; 1471 return 0;
@@ -1487,8 +1477,6 @@ void __mpol_free(struct mempolicy *p)
1487{ 1477{
1488 if (!atomic_dec_and_test(&p->refcnt)) 1478 if (!atomic_dec_and_test(&p->refcnt))
1489 return; 1479 return;
1490 if (p->policy == MPOL_BIND)
1491 kfree(p->v.zonelist);
1492 p->policy = MPOL_DEFAULT; 1480 p->policy = MPOL_DEFAULT;
1493 kmem_cache_free(policy_cache, p); 1481 kmem_cache_free(policy_cache, p);
1494} 1482}
@@ -1779,6 +1767,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
1779 switch (pol->policy) { 1767 switch (pol->policy) {
1780 case MPOL_DEFAULT: 1768 case MPOL_DEFAULT:
1781 break; 1769 break;
1770 case MPOL_BIND:
1771 /* Fall through */
1782 case MPOL_INTERLEAVE: 1772 case MPOL_INTERLEAVE:
1783 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); 1773 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1784 pol->v.nodes = tmp; 1774 pol->v.nodes = tmp;
@@ -1791,32 +1781,6 @@ static void mpol_rebind_policy(struct mempolicy *pol,
1791 *mpolmask, *newmask); 1781 *mpolmask, *newmask);
1792 *mpolmask = *newmask; 1782 *mpolmask = *newmask;
1793 break; 1783 break;
1794 case MPOL_BIND: {
1795 nodemask_t nodes;
1796 struct zoneref *z;
1797 struct zonelist *zonelist;
1798
1799 nodes_clear(nodes);
1800 for (z = pol->v.zonelist->_zonerefs; z->zone; z++)
1801 node_set(zonelist_node_idx(z), nodes);
1802 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1803 nodes = tmp;
1804
1805 zonelist = bind_zonelist(&nodes);
1806
1807 /* If no mem, then zonelist is NULL and we keep old zonelist.
1808 * If that old zonelist has no remaining mems_allowed nodes,
1809 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1810 */
1811
1812 if (!IS_ERR(zonelist)) {
1813 /* Good - got mem - substitute new zonelist */
1814 kfree(pol->v.zonelist);
1815 pol->v.zonelist = zonelist;
1816 }
1817 *mpolmask = *newmask;
1818 break;
1819 }
1820 default: 1784 default:
1821 BUG(); 1785 BUG();
1822 break; 1786 break;
@@ -1879,9 +1843,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1879 break; 1843 break;
1880 1844
1881 case MPOL_BIND: 1845 case MPOL_BIND:
1882 get_zonemask(pol, &nodes); 1846 /* Fall through */
1883 break;
1884
1885 case MPOL_INTERLEAVE: 1847 case MPOL_INTERLEAVE:
1886 nodes = pol->v.nodes; 1848 nodes = pol->v.nodes;
1887 break; 1849 break;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index eb5838634f18..486ed595ee6f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone)
42 return zone; 42 return zone;
43} 43}
44 44
45static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
46{
47#ifdef CONFIG_NUMA
48 return node_isset(zonelist_node_idx(zref), *nodes);
49#else
50 return 1;
51#endif /* CONFIG_NUMA */
52}
53
54/* Returns the next zone at or below highest_zoneidx in a zonelist */
55struct zoneref *next_zones_zonelist(struct zoneref *z,
56 enum zone_type highest_zoneidx,
57 nodemask_t *nodes,
58 struct zone **zone)
59{
60 /*
61 * Find the next suitable zone to use for the allocation.
62 * Only filter based on nodemask if it's set
63 */
64 if (likely(nodes == NULL))
65 while (zonelist_zone_idx(z) > highest_zoneidx)
66 z++;
67 else
68 while (zonelist_zone_idx(z) > highest_zoneidx ||
69 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++;
71
72 *zone = zonelist_zone(z++);
73 return z;
74}
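This cursor/out-parameter convention is why callers such as free_more_memory()
and get_page_from_freelist() can write "(void)first_zones_zonelist(...)": the
zone they care about arrives through the last argument, while the returned
zoneref only seeds later next_zones_zonelist() calls. A minimal sketch (kernel
context; the helper name is made up for illustration):

    #include <linux/gfp.h>
    #include <linux/mmzone.h>
    #include <linux/nodemask.h>
    #include <linux/topology.h>

    /* First zone usable for 'gfp' on the local node, honouring 'allowed'. */
    static struct zone *first_allowed_zone(gfp_t gfp, nodemask_t *allowed)
    {
            struct zone *zone;

            /* The returned zoneref is only a cursor, so it can be ignored here. */
            (void)first_zones_zonelist(node_zonelist(numa_node_id(), gfp),
                                       gfp_zone(gfp), allowed, &zone);

            return zone;    /* NULL if no zone in 'allowed' is suitable */
    }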
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d94d04ea784..b4beb3eea8b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1377,7 +1377,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1377 * a page. 1377 * a page.
1378 */ 1378 */
1379static struct page * 1379static struct page *
1380get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1380get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1381 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1381 struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
1382{ 1382{
1383 struct zoneref *z; 1383 struct zoneref *z;
@@ -1388,16 +1388,17 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
1388 int zlc_active = 0; /* set if using zonelist_cache */ 1388 int zlc_active = 0; /* set if using zonelist_cache */
1389 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1389 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1390 1390
1391 z = first_zones_zonelist(zonelist, high_zoneidx); 1391 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1392 classzone_idx = zonelist_zone_idx(z); 1392 &preferred_zone);
1393 preferred_zone = zonelist_zone(z); 1393 classzone_idx = zone_idx(preferred_zone);
1394 1394
1395zonelist_scan: 1395zonelist_scan:
1396 /* 1396 /*
1397 * Scan zonelist, looking for a zone with enough free. 1397 * Scan zonelist, looking for a zone with enough free.
1398 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1398 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1399 */ 1399 */
1400 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1400 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1401 high_zoneidx, nodemask) {
1401 if (NUMA_BUILD && zlc_active && 1402 if (NUMA_BUILD && zlc_active &&
1402 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1403 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1403 continue; 1404 continue;
@@ -1447,9 +1448,9 @@ try_next_zone:
1447/* 1448/*
1448 * This is the 'heart' of the zoned buddy allocator. 1449 * This is the 'heart' of the zoned buddy allocator.
1449 */ 1450 */
1450struct page * 1451static struct page *
1451__alloc_pages(gfp_t gfp_mask, unsigned int order, 1452__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1452 struct zonelist *zonelist) 1453 struct zonelist *zonelist, nodemask_t *nodemask)
1453{ 1454{
1454 const gfp_t wait = gfp_mask & __GFP_WAIT; 1455 const gfp_t wait = gfp_mask & __GFP_WAIT;
1455 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 1456 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
@@ -1478,7 +1479,7 @@ restart:
1478 return NULL; 1479 return NULL;
1479 } 1480 }
1480 1481
1481 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1482 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1482 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1483 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1483 if (page) 1484 if (page)
1484 goto got_pg; 1485 goto got_pg;
@@ -1523,7 +1524,7 @@ restart:
1523 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1524 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1524 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1525 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1525 */ 1526 */
1526 page = get_page_from_freelist(gfp_mask, order, zonelist, 1527 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1527 high_zoneidx, alloc_flags); 1528 high_zoneidx, alloc_flags);
1528 if (page) 1529 if (page)
1529 goto got_pg; 1530 goto got_pg;
@@ -1536,7 +1537,7 @@ rebalance:
1536 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1537 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1537nofail_alloc: 1538nofail_alloc:
1538 /* go through the zonelist yet again, ignoring mins */ 1539 /* go through the zonelist yet again, ignoring mins */
1539 page = get_page_from_freelist(gfp_mask, order, 1540 page = get_page_from_freelist(gfp_mask, nodemask, order,
1540 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1541 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
1541 if (page) 1542 if (page)
1542 goto got_pg; 1543 goto got_pg;
@@ -1571,7 +1572,7 @@ nofail_alloc:
1571 drain_all_pages(); 1572 drain_all_pages();
1572 1573
1573 if (likely(did_some_progress)) { 1574 if (likely(did_some_progress)) {
1574 page = get_page_from_freelist(gfp_mask, order, 1575 page = get_page_from_freelist(gfp_mask, nodemask, order,
1575 zonelist, high_zoneidx, alloc_flags); 1576 zonelist, high_zoneidx, alloc_flags);
1576 if (page) 1577 if (page)
1577 goto got_pg; 1578 goto got_pg;
@@ -1587,8 +1588,9 @@ nofail_alloc:
1587 * a parallel oom killing, we must fail if we're still 1588 * a parallel oom killing, we must fail if we're still
1588 * under heavy pressure. 1589 * under heavy pressure.
1589 */ 1590 */
1590 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1591 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1591 zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1592 order, zonelist, high_zoneidx,
1593 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1592 if (page) { 1594 if (page) {
1593 clear_zonelist_oom(zonelist, gfp_mask); 1595 clear_zonelist_oom(zonelist, gfp_mask);
1594 goto got_pg; 1596 goto got_pg;
@@ -1637,6 +1639,20 @@ got_pg:
1637 return page; 1639 return page;
1638} 1640}
1639 1641
1642struct page *
1643__alloc_pages(gfp_t gfp_mask, unsigned int order,
1644 struct zonelist *zonelist)
1645{
1646 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1647}
1648
1649struct page *
1650__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1651 struct zonelist *zonelist, nodemask_t *nodemask)
1652{
1653 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1654}
1655
1640EXPORT_SYMBOL(__alloc_pages); 1656EXPORT_SYMBOL(__alloc_pages);
1641 1657
1642/* 1658/*
@@ -1880,6 +1896,12 @@ void show_free_areas(void)
1880 show_swap_cache_info(); 1896 show_swap_cache_info();
1881} 1897}
1882 1898
1899static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
1900{
1901 zoneref->zone = zone;
1902 zoneref->zone_idx = zone_idx(zone);
1903}
1904
1883/* 1905/*
1884 * Builds allocation fallback zone lists. 1906 * Builds allocation fallback zone lists.
1885 * 1907 *