author     Mel Gorman <mel@csn.ul.ie>                      2008-04-28 05:12:18 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-04-28 11:58:19 -0400
commit     19770b32609b6bf97a3dece2529089494cbfc549 (patch)
tree       3b5922d1b20aabdf929bde9309f323841717747a
parent     dd1a239f6f2d4d3eedd318583ec319aa145b324c (diff)
mm: filter based on a nodemask as well as a gfp_mask
The MPOL_BIND policy creates a zonelist that is used for allocations
controlled by that mempolicy. As the per-node zonelist is already being
filtered based on a zone id, this patch adds a version of __alloc_pages() that
takes a nodemask for further filtering. This eliminates the need for
MPOL_BIND to create a custom zonelist.
A positive benefit of this is that allocations using MPOL_BIND now use the
local node's distance-ordered zonelist instead of a custom node-id-ordered
zonelist. I.e., pages will be allocated from the closest allowed node with
available memory.
[Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
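
[Editor's note: the following user-space C sketch is not part of the patch. It models the
filtering rule the patch introduces: a single walk over the local node's distance-ordered
zonelist that skips entries above the requested zone index or outside an optional nodemask.
"struct zoneref", the unsigned-long node mask and next_zones() are simplified stand-ins for
the kernel's struct zoneref, nodemask_t and next_zones_zonelist(), chosen only so the sketch
compiles on its own.]

#include <stdio.h>
#include <stdbool.h>

struct zoneref { int node; int zone_idx; };	/* one zonelist entry */

static bool node_allowed(unsigned long nodemask, int node)
{
	return nodemask == 0 || (nodemask & (1UL << node));	/* 0 means "no mask" */
}

/* Skip entries above the zone-index limit or outside the nodemask. */
static const struct zoneref *next_zones(const struct zoneref *z,
					int highest_zoneidx,
					unsigned long nodemask)
{
	while (z->zone_idx >= 0 &&
	       (z->zone_idx > highest_zoneidx ||
		!node_allowed(nodemask, z->node)))
		z++;
	return z;
}

int main(void)
{
	/* Distance-ordered zonelist for node 0; zone_idx < 0 terminates. */
	const struct zoneref zonelist[] = {
		{ 0, 2 }, { 0, 1 }, { 1, 2 }, { 1, 1 }, { 2, 2 }, { -1, -1 }
	};
	unsigned long bind_mask = 1UL << 1;	/* MPOL_BIND to node 1 only */
	const struct zoneref *z;

	for (z = next_zones(zonelist, 2, bind_mask); z->zone_idx >= 0;
	     z = next_zones(z + 1, 2, bind_mask))
		printf("allocate from node %d, zone index %d\n",
		       z->node, z->zone_idx);
	return 0;
}

[With a bind mask of {1}, the walk visits node 1's zones in distance order from the local
node instead of consulting a separately built, node-id-ordered zonelist, which is the
behavioural change described above.]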
-rw-r--r--  Documentation/vm/numa_memory_policy.txt |  11
-rw-r--r--  fs/buffer.c                             |   9
-rw-r--r--  include/linux/cpuset.h                  |   4
-rw-r--r--  include/linux/gfp.h                     |   4
-rw-r--r--  include/linux/mempolicy.h               |  19
-rw-r--r--  include/linux/mmzone.h                  |  80
-rw-r--r--  kernel/cpuset.c                         |  18
-rw-r--r--  mm/hugetlb.c                            |   6
-rw-r--r--  mm/mempolicy.c                          | 184
-rw-r--r--  mm/mmzone.c                             |  30
-rw-r--r--  mm/page_alloc.c                         |  50
11 files changed, 224 insertions, 191 deletions
diff --git a/Documentation/vm/numa_memory_policy.txt b/Documentation/vm/numa_memory_policy.txt
index dd4986497996..1278e685d650 100644
--- a/Documentation/vm/numa_memory_policy.txt
+++ b/Documentation/vm/numa_memory_policy.txt
@@ -182,14 +182,9 @@ Components of Memory Policies
 The Default mode does not use the optional set of nodes.
 
 MPOL_BIND: This mode specifies that memory must come from the
-set of nodes specified by the policy.
-
-The memory policy APIs do not specify an order in which the nodes
-will be searched. However, unlike "local allocation", the Bind
-policy does not consider the distance between the nodes. Rather,
-allocations will fallback to the nodes specified by the policy in
-order of numeric node id. Like everything in Linux, this is subject
-to change.
+set of nodes specified by the policy. Memory will be allocated from
+the node in the set with sufficient free memory that is closest to
+the node where the allocation takes place.
 
 MPOL_PREFERRED: This mode specifies that the allocation should be
 attempted from the single node specified in the policy. If that
diff --git a/fs/buffer.c b/fs/buffer.c
index ac84cd13075d..7d51e649b19a 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -360,16 +360,17 @@ void invalidate_bdev(struct block_device *bdev)
  */
 static void free_more_memory(void)
 {
-	struct zoneref *zrefs;
+	struct zone *zone;
 	int nid;
 
 	wakeup_pdflush(1024);
 	yield();
 
 	for_each_online_node(nid) {
-		zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
-						gfp_zone(GFP_NOFS));
-		if (zrefs->zone)
+		(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
+						gfp_zone(GFP_NOFS), NULL,
+						&zone);
+		if (zone)
 			try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
 						GFP_NOFS);
 	}
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 726761e24003..038578362b47 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -26,7 +26,7 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 #define cpuset_current_mems_allowed (current->mems_allowed)
 void cpuset_init_current_mems_allowed(void);
 void cpuset_update_task_memory_state(void);
-int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
+int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
 extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
 extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
@@ -103,7 +103,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
 static inline void cpuset_init_current_mems_allowed(void) {}
 static inline void cpuset_update_task_memory_state(void) {}
 
-static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
+static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
 	return 1;
 }
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index e1c6064cb6c7..898aa9d5b6c2 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -182,6 +182,10 @@ static inline void arch_alloc_page(struct page *page, int order) { }
 
 extern struct page *__alloc_pages(gfp_t, unsigned int, struct zonelist *);
 
+extern struct page *
+__alloc_pages_nodemask(gfp_t, unsigned int,
+		struct zonelist *, nodemask_t *nodemask);
+
 static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
 						unsigned int order)
 {
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 69160dc32d48..b8b3da7a3315 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -54,19 +54,20 @@ struct mm_struct;
  * mmap_sem.
  *
  * Freeing policy:
- * When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
- * All other policies don't have any external state. mpol_free() handles this.
+ * Mempolicy objects are reference counted.  A mempolicy will be freed when
+ * mpol_free() decrements the reference count to zero.
  *
  * Copying policy objects:
- * For MPOL_BIND the zonelist must be always duplicated. mpol_clone() does this.
+ * mpol_copy() allocates a new mempolicy and copies the specified mempolicy
+ * to the new storage.  The reference count of the new object is initialized
+ * to 1, representing the caller of mpol_copy().
  */
 struct mempolicy {
 	atomic_t refcnt;
 	short policy;	/* See MPOL_* above */
 	union {
-		struct zonelist  *zonelist;	/* bind */
 		short		 preferred_node; /* preferred */
-		nodemask_t	 nodes;		/* interleave */
+		nodemask_t	 nodes;		/* interleave/bind */
 		/* undefined for default */
 	} v;
 	nodemask_t cpuset_mems_allowed;	/* mempolicy relative to these nodes */
@@ -151,7 +152,8 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p);
 
 extern struct mempolicy default_policy;
 extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
-		unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol);
+		unsigned long addr, gfp_t gfp_flags,
+		struct mempolicy **mpol, nodemask_t **nodemask);
 extern unsigned slab_node(struct mempolicy *policy);
 
 extern enum zone_type policy_zone;
@@ -239,8 +241,11 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p)
 }
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
-		unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
+		unsigned long addr, gfp_t gfp_flags,
+		struct mempolicy **mpol, nodemask_t **nodemask)
 {
+	*mpol = NULL;
+	*nodemask = NULL;
 	return node_zonelist(0, gfp_flags);
 }
 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d34b4c290017..498d6ceff2f4 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -749,36 +749,60 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
 #endif /* CONFIG_NUMA */
 }
 
-static inline void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
-{
-	zoneref->zone = zone;
-	zoneref->zone_idx = zone_idx(zone);
-}
+/**
+ * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
+ * @z - The cursor used as a starting point for the search
+ * @highest_zoneidx - The zone index of the highest zone to return
+ * @nodes - An optional nodemask to filter the zonelist with
+ * @zone - The first suitable zone found is returned via this parameter
+ *
+ * This function returns the next zone at or below a given zone index that is
+ * within the allowed nodemask using a cursor as the starting point for the
+ * search. The zoneref returned is a cursor that is used as the next starting
+ * point for future calls to next_zones_zonelist().
+ */
+struct zoneref *next_zones_zonelist(struct zoneref *z,
+					enum zone_type highest_zoneidx,
+					nodemask_t *nodes,
+					struct zone **zone);
 
-/* Returns the first zone at or below highest_zoneidx in a zonelist */
+/**
+ * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
+ * @zonelist - The zonelist to search for a suitable zone
+ * @highest_zoneidx - The zone index of the highest zone to return
+ * @nodes - An optional nodemask to filter the zonelist with
+ * @zone - The first suitable zone found is returned via this parameter
+ *
+ * This function returns the first zone at or below a given zone index that is
+ * within the allowed nodemask. The zoneref returned is a cursor that can be
+ * used to iterate the zonelist with next_zones_zonelist. The cursor should
+ * not be used by the caller as it does not match the value of the zone
+ * returned.
+ */
 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
-					enum zone_type highest_zoneidx)
+					enum zone_type highest_zoneidx,
+					nodemask_t *nodes,
+					struct zone **zone)
 {
-	struct zoneref *z;
-
-	/* Find the first suitable zone to use for the allocation */
-	z = zonelist->_zonerefs;
-	while (zonelist_zone_idx(z) > highest_zoneidx)
-		z++;
-
-	return z;
+	return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
+								zone);
 }
 
-/* Returns the next zone at or below highest_zoneidx in a zonelist */
-static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
-					enum zone_type highest_zoneidx)
-{
-	/* Find the next suitable zone to use for the allocation */
-	while (zonelist_zone_idx(z) > highest_zoneidx)
-		z++;
-
-	return z;
-}
+/**
+ * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
+ * @zone - The current zone in the iterator
+ * @z - The current pointer within zonelist->zones being iterated
+ * @zlist - The zonelist being iterated
+ * @highidx - The zone index of the highest zone to return
+ * @nodemask - Nodemask allowed by the allocator
+ *
+ * This iterator iterates though all zones at or below a given zone index and
+ * within a given nodemask
+ */
+#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
+	for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone);	\
+		zone;							\
+		z = next_zones_zonelist(z, highidx, nodemask, &zone))	\
 
 /**
  * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
@@ -790,11 +814,7 @@ static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
  * This iterator iterates though all zones at or below a given zone index.
  */
 #define for_each_zone_zonelist(zone, z, zlist, highidx) \
-	for (z = first_zones_zonelist(zlist, highidx), \
-		zone = zonelist_zone(z++); \
-		zone; \
-		z = next_zones_zonelist(z, highidx), \
-		zone = zonelist_zone(z++))
+	for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
 
 #ifdef CONFIG_SPARSEMEM
 #include <asm/sparsemem.h>
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a220b13cbfaf..c9923e3c9a3b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1958,22 +1958,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 }
 
 /**
- * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
- * @zl: the zonelist to be checked
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
+ * @nodemask: the nodemask to be checked
  *
- * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
+ * Are any of the nodes in the nodemask allowed in current->mems_allowed?
  */
-int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
+int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 {
-	int i;
-
-	for (i = 0; zl->_zonerefs[i].zone; i++) {
-		int nid = zonelist_node_idx(&zl->_zonerefs[i]);
-
-		if (node_isset(nid, current->mems_allowed))
-			return 1;
-	}
-	return 0;
+	return nodes_intersects(*nodemask, current->mems_allowed);
 }
 
 /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4bced0d705ca..3737d82f5225 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,12 +95,14 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 	int nid;
 	struct page *page = NULL;
 	struct mempolicy *mpol;
+	nodemask_t *nodemask;
 	struct zonelist *zonelist = huge_zonelist(vma, address,
-					htlb_alloc_mask, &mpol);
+					htlb_alloc_mask, &mpol, &nodemask);
 	struct zone *zone;
 	struct zoneref *z;
 
-	for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						MAX_NR_ZONES - 1, nodemask) {
 		nid = zone_to_nid(zone);
 		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
 		    !list_empty(&hugepage_freelists[nid])) {
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 90193a2a915b..acb5ee3587c3 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -163,42 +163,25 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
 	return 0;
 }
 
-/* Generate a custom zonelist for the BIND policy. */
-static struct zonelist *bind_zonelist(nodemask_t *nodes)
+/* Check that the nodemask contains at least one populated zone */
+static int is_valid_nodemask(nodemask_t *nodemask)
 {
-	struct zonelist *zl;
-	int num, max, nd;
-	enum zone_type k;
+	int nd, k;
 
-	max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
-	max++;			/* space for zlcache_ptr (see mmzone.h) */
-	zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
-	if (!zl)
-		return ERR_PTR(-ENOMEM);
-	zl->zlcache_ptr = NULL;
-	num = 0;
-	/* First put in the highest zones from all nodes, then all the next
-	   lower zones etc. Avoid empty zones because the memory allocator
-	   doesn't like them. If you implement node hot removal you
-	   have to fix that. */
-	k = MAX_NR_ZONES - 1;
-	while (1) {
-		for_each_node_mask(nd, *nodes) {
-			struct zone *z = &NODE_DATA(nd)->node_zones[k];
-			if (z->present_pages > 0)
-				zoneref_set_zone(z, &zl->_zonerefs[num++]);
+	/* Check that there is something useful in this mask */
+	k = policy_zone;
+
+	for_each_node_mask(nd, *nodemask) {
+		struct zone *z;
+
+		for (k = 0; k <= policy_zone; k++) {
+			z = &NODE_DATA(nd)->node_zones[k];
+			if (z->present_pages > 0)
+				return 1;
 		}
-		if (k == 0)
-			break;
-		k--;
-	}
-	if (num == 0) {
-		kfree(zl);
-		return ERR_PTR(-EINVAL);
 	}
-	zl->_zonerefs[num].zone = NULL;
-	zl->_zonerefs[num].zone_idx = 0;
-	return zl;
+
+	return 0;
 }
 
 /* Create a new policy */
@@ -229,12 +212,11 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 		policy->v.preferred_node = -1;
 		break;
 	case MPOL_BIND:
-		policy->v.zonelist = bind_zonelist(nodes);
-		if (IS_ERR(policy->v.zonelist)) {
-			void *error_code = policy->v.zonelist;
+		if (!is_valid_nodemask(nodes)) {
 			kmem_cache_free(policy_cache, policy);
-			return error_code;
+			return ERR_PTR(-EINVAL);
 		}
+		policy->v.nodes = *nodes;
 		break;
 	}
 	policy->policy = mode;
@@ -500,19 +482,12 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes)
 /* Fill a zone bitmap for a policy */
 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 {
-	int i;
-
 	nodes_clear(*nodes);
 	switch (p->policy) {
-	case MPOL_BIND:
-		for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) {
-			struct zoneref *zref;
-			zref = &p->v.zonelist->_zonerefs[i];
-			node_set(zonelist_node_idx(zref), *nodes);
-		}
-		break;
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		*nodes = p->v.nodes;
 		break;
@@ -1160,6 +1135,18 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
 	return pol;
 }
 
+/* Return a nodemask representing a mempolicy */
+static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
+{
+	/* Lower zones don't get a nodemask applied for MPOL_BIND */
+	if (unlikely(policy->policy == MPOL_BIND) &&
+			gfp_zone(gfp) >= policy_zone &&
+			cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
+		return &policy->v.nodes;
+
+	return NULL;
+}
+
 /* Return a zonelist representing a mempolicy */
 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 {
@@ -1172,12 +1159,17 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 		nd = numa_node_id();
 		break;
 	case MPOL_BIND:
-		/* Lower zones don't get a policy applied */
-		/* Careful: current->mems_allowed might have moved */
-		if (gfp_zone(gfp) >= policy_zone)
-			if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
-				return policy->v.zonelist;
-		/*FALL THROUGH*/
+		/*
+		 * Normally, MPOL_BIND allocations are node-local within the
+		 * allowed nodemask.  However, if __GFP_THISNODE is set and
+		 * the current node isn't part of the mask, we use the
+		 * zonelist for the first node in the mask instead.
+		 */
+		nd = numa_node_id();
+		if (unlikely(gfp & __GFP_THISNODE) &&
+				unlikely(!node_isset(nd, policy->v.nodes)))
+			nd = first_node(policy->v.nodes);
+		break;
 	case MPOL_INTERLEAVE:	/* should not happen */
 	case MPOL_DEFAULT:
 		nd = numa_node_id();
@@ -1220,7 +1212,14 @@ unsigned slab_node(struct mempolicy *policy)
 		 * Follow bind policy behavior and start allocation at the
 		 * first node.
 		 */
-		return zonelist_node_idx(policy->v.zonelist->_zonerefs);
+		struct zonelist *zonelist;
+		struct zone *zone;
+		enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
+		zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
+		(void)first_zones_zonelist(zonelist, highest_zoneidx,
+							&policy->v.nodes,
+							&zone);
+		return zone->node;
 	}
 
 	case MPOL_PREFERRED:
@@ -1278,25 +1277,31 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  * @vma = virtual memory area whose policy is sought
  * @addr = address in @vma for shared policy lookup and interleave policy
  * @gfp_flags = for requested zone
- * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ * @mpol = pointer to mempolicy pointer for reference counted mempolicy
+ * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
  *
  * Returns a zonelist suitable for a huge page allocation.
- * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If the effective policy is 'BIND, returns pointer to local node's zonelist,
+ * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
  * If it is also a policy for which get_vma_policy() returns an extra
- * reference, we must hold that reference until after allocation.
+ * reference, we must hold that reference until after the allocation.
 * In that case, return policy via @mpol so hugetlb allocation can drop
 * the reference. For non-'BIND referenced policies, we can/do drop the
 * reference here, so the caller doesn't need to know about the special case
 * for default and current task policy.
 */
 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
-		gfp_t gfp_flags, struct mempolicy **mpol)
+		gfp_t gfp_flags, struct mempolicy **mpol,
+		nodemask_t **nodemask)
 {
 	struct mempolicy *pol = get_vma_policy(current, vma, addr);
 	struct zonelist *zl;
 
 	*mpol = NULL;		/* probably no unref needed */
-	if (pol->policy == MPOL_INTERLEAVE) {
+	*nodemask = NULL;	/* assume !MPOL_BIND */
+	if (pol->policy == MPOL_BIND) {
+		*nodemask = &pol->v.nodes;
+	} else if (pol->policy == MPOL_INTERLEAVE) {
 		unsigned nid;
 
 		nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
@@ -1376,14 +1381,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
 		/*
 		 * slow path: ref counted policy -- shared or vma
 		 */
-		struct page *page = __alloc_pages(gfp, 0, zl);
+		struct page *page = __alloc_pages_nodemask(gfp, 0,
+						zl, nodemask_policy(gfp, pol));
 		__mpol_free(pol);
 		return page;
 	}
 	/*
 	 * fast path: default or task policy
 	 */
-	return __alloc_pages(gfp, 0, zl);
+	return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
 }
 
 /**
@@ -1415,7 +1421,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 		pol = &default_policy;
 	if (pol->policy == MPOL_INTERLEAVE)
 		return alloc_page_interleave(gfp, order, interleave_nodes(pol));
-	return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
+	return __alloc_pages_nodemask(gfp, order,
+			zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
 }
 EXPORT_SYMBOL(alloc_pages_current);
 
@@ -1440,14 +1447,6 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
 	}
 	*new = *old;
 	atomic_set(&new->refcnt, 1);
-	if (new->policy == MPOL_BIND) {
-		int sz = ksize(old->v.zonelist);
-		new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
-		if (!new->v.zonelist) {
-			kmem_cache_free(policy_cache, new);
-			return ERR_PTR(-ENOMEM);
-		}
-	}
 	return new;
 }
 
@@ -1461,21 +1460,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 	switch (a->policy) {
 	case MPOL_DEFAULT:
 		return 1;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		return nodes_equal(a->v.nodes, b->v.nodes);
 	case MPOL_PREFERRED:
 		return a->v.preferred_node == b->v.preferred_node;
-	case MPOL_BIND: {
-		int i;
-		for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) {
-			struct zone *za, *zb;
-			za = zonelist_zone(&a->v.zonelist->_zonerefs[i]);
-			zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]);
-			if (za != zb)
-				return 0;
-		}
-		return b->v.zonelist->_zonerefs[i].zone == NULL;
-	}
 	default:
 		BUG();
 		return 0;
@@ -1487,8 +1477,6 @@ void __mpol_free(struct mempolicy *p)
 {
 	if (!atomic_dec_and_test(&p->refcnt))
 		return;
-	if (p->policy == MPOL_BIND)
-		kfree(p->v.zonelist);
 	p->policy = MPOL_DEFAULT;
 	kmem_cache_free(policy_cache, p);
 }
@@ -1779,6 +1767,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 	switch (pol->policy) {
 	case MPOL_DEFAULT:
 		break;
+	case MPOL_BIND:
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
 		pol->v.nodes = tmp;
@@ -1791,32 +1781,6 @@ static void mpol_rebind_policy(struct mempolicy *pol,
 						*mpolmask, *newmask);
 		*mpolmask = *newmask;
 		break;
-	case MPOL_BIND: {
-		nodemask_t nodes;
-		struct zoneref *z;
-		struct zonelist *zonelist;
-
-		nodes_clear(nodes);
-		for (z = pol->v.zonelist->_zonerefs; z->zone; z++)
-			node_set(zonelist_node_idx(z), nodes);
-		nodes_remap(tmp, nodes, *mpolmask, *newmask);
-		nodes = tmp;
-
-		zonelist = bind_zonelist(&nodes);
-
-		/* If no mem, then zonelist is NULL and we keep old zonelist.
-		 * If that old zonelist has no remaining mems_allowed nodes,
-		 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
-		 */
-
-		if (!IS_ERR(zonelist)) {
-			/* Good - got mem - substitute new zonelist */
-			kfree(pol->v.zonelist);
-			pol->v.zonelist = zonelist;
-		}
-		*mpolmask = *newmask;
-		break;
-	}
 	default:
 		BUG();
 		break;
@@ -1879,9 +1843,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
 		break;
 
 	case MPOL_BIND:
-		get_zonemask(pol, &nodes);
-		break;
-
+		/* Fall through */
 	case MPOL_INTERLEAVE:
 		nodes = pol->v.nodes;
 		break;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index eb5838634f18..486ed595ee6f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone)
 	return zone;
 }
 
+static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
+{
+#ifdef CONFIG_NUMA
+	return node_isset(zonelist_node_idx(zref), *nodes);
+#else
+	return 1;
+#endif /* CONFIG_NUMA */
+}
+
+/* Returns the next zone at or below highest_zoneidx in a zonelist */
+struct zoneref *next_zones_zonelist(struct zoneref *z,
+					enum zone_type highest_zoneidx,
+					nodemask_t *nodes,
+					struct zone **zone)
+{
+	/*
+	 * Find the next suitable zone to use for the allocation.
+	 * Only filter based on nodemask if it's set
+	 */
+	if (likely(nodes == NULL))
+		while (zonelist_zone_idx(z) > highest_zoneidx)
+			z++;
+	else
+		while (zonelist_zone_idx(z) > highest_zoneidx ||
+				(z->zone && !zref_in_nodemask(z, nodes)))
+			z++;
+
+	*zone = zonelist_zone(z++);
+	return z;
+}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6d94d04ea784..b4beb3eea8b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1377,7 +1377,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
  * a page.
  */
 static struct page *
-get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
+get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 		struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
 {
 	struct zoneref *z;
@@ -1388,16 +1388,17 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 	int zlc_active = 0;		/* set if using zonelist_cache */
 	int did_zlc_setup = 0;		/* just call zlc_setup() one time */
 
-	z = first_zones_zonelist(zonelist, high_zoneidx);
-	classzone_idx = zonelist_zone_idx(z);
-	preferred_zone = zonelist_zone(z);
+	(void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
+							&preferred_zone);
+	classzone_idx = zone_idx(preferred_zone);
 
 zonelist_scan:
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+						high_zoneidx, nodemask) {
 		if (NUMA_BUILD && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
@@ -1447,9 +1448,9 @@ try_next_zone:
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
-struct page *
-__alloc_pages(gfp_t gfp_mask, unsigned int order,
-		struct zonelist *zonelist)
+static struct page *
+__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask)
 {
 	const gfp_t wait = gfp_mask & __GFP_WAIT;
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
@@ -1478,7 +1479,7 @@ restart:
 		return NULL;
 	}
 
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
 	if (page)
 		goto got_pg;
@@ -1523,7 +1524,7 @@ restart:
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
 	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
-	page = get_page_from_freelist(gfp_mask, order, zonelist,
+	page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
 						high_zoneidx, alloc_flags);
 	if (page)
 		goto got_pg;
@@ -1536,7 +1537,7 @@ rebalance:
 	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 nofail_alloc:
 		/* go through the zonelist yet again, ignoring mins */
-		page = get_page_from_freelist(gfp_mask, order,
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
 			zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
 		if (page)
 			goto got_pg;
@@ -1571,7 +1572,7 @@ nofail_alloc:
 	drain_all_pages();
 
 	if (likely(did_some_progress)) {
-		page = get_page_from_freelist(gfp_mask, order,
+		page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx, alloc_flags);
 		if (page)
 			goto got_pg;
@@ -1587,8 +1588,9 @@ nofail_alloc:
 		 * a parallel oom killing, we must fail if we're still
 		 * under heavy pressure.
 		 */
-		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-			zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
+			order, zonelist, high_zoneidx,
+			ALLOC_WMARK_HIGH|ALLOC_CPUSET);
 		if (page) {
 			clear_zonelist_oom(zonelist, gfp_mask);
 			goto got_pg;
@@ -1637,6 +1639,20 @@ got_pg:
 	return page;
 }
 
+struct page *
+__alloc_pages(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
+}
+
+struct page *
+__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
+		struct zonelist *zonelist, nodemask_t *nodemask)
+{
+	return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
+}
+
 EXPORT_SYMBOL(__alloc_pages);
 
 /*
@@ -1880,6 +1896,12 @@ void show_free_areas(void)
 	show_swap_cache_info();
 }
 
+static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
+{
+	zoneref->zone = zone;
+	zoneref->zone_idx = zone_idx(zone);
+}
+
 /*
  * Builds allocation fallback zone lists.
  *