Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig              12
-rw-r--r--  mm/bootmem.c           196
-rw-r--r--  mm/dmapool.c            12
-rw-r--r--  mm/fadvise.c             2
-rw-r--r--  mm/filemap.c            10
-rw-r--r--  mm/filemap_xip.c       200
-rw-r--r--  mm/hugetlb.c            78
-rw-r--r--  mm/internal.h            3
-rw-r--r--  mm/madvise.c             2
-rw-r--r--  mm/memory.c            228
-rw-r--r--  mm/memory_hotplug.c    186
-rw-r--r--  mm/mempolicy.c        1051
-rw-r--r--  mm/mincore.c             2
-rw-r--r--  mm/mmap.c               33
-rw-r--r--  mm/mmzone.c             30
-rw-r--r--  mm/nommu.c               6
-rw-r--r--  mm/oom_kill.c           58
-rw-r--r--  mm/page_alloc.c        274
-rw-r--r--  mm/pagewalk.c            8
-rw-r--r--  mm/rmap.c                8
-rw-r--r--  mm/shmem.c             144
-rw-r--r--  mm/slab.c               17
-rw-r--r--  mm/slub.c               18
-rw-r--r--  mm/sparse.c            145
-rw-r--r--  mm/swap.c               37
-rw-r--r--  mm/swapfile.c            8
-rw-r--r--  mm/truncate.c           11
-rw-r--r--  mm/vmalloc.c           141
-rw-r--r--  mm/vmscan.c             46
-rw-r--r--  mm/vmstat.c             11
30 files changed, 1919 insertions, 1058 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0016ebd4dcba..3aa819d628c1 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -143,6 +143,18 @@ config MEMORY_HOTREMOVE
143 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE 143 depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
144 depends on MIGRATION 144 depends on MIGRATION
145 145
146#
147# If we have space for more page flags then we can enable additional
148# optimizations and functionality.
149#
150# Regular Sparsemem takes page flag bits for the sectionid if it does not
151# use a virtual memmap. Disable extended page flags for 32 bit platforms
152# that require the use of a sectionid in the page flags.
153#
154config PAGEFLAGS_EXTENDED
155 def_bool y
156 depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM
157
146# Heavily threaded applications may benefit from splitting the mm-wide 158# Heavily threaded applications may benefit from splitting the mm-wide
147# page_table_lock, so that faults on different parts of the user address 159# page_table_lock, so that faults on different parts of the user address
148# space can be handled with less contention: split it at this NR_CPUS. 160# space can be handled with less contention: split it at this NR_CPUS.
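The PAGEFLAGS_EXTENDED option added above is just a boolean over existing config symbols: extra page-flag optimizations are assumed available unless a 32-bit NUMA SPARSEMEM build without a virtual memmap needs page->flags bits for the section id. A minimal user-space sketch of that def_bool expression (the function name and the sample configurations are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the PAGEFLAGS_EXTENDED "depends on" line:
 * 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM */
static bool pageflags_extended(bool is_64bit, bool sparsemem_vmemmap,
                               bool numa, bool sparsemem)
{
    return is_64bit || sparsemem_vmemmap || !numa || !sparsemem;
}

int main(void)
{
    /* 32-bit NUMA sparsemem without vmemmap: the section id lives in
     * page->flags, so the extended flags stay disabled. */
    printf("32-bit NUMA sparsemem, no vmemmap: %d\n",
           pageflags_extended(false, false, true, true));
    /* 64-bit always qualifies. */
    printf("64-bit: %d\n", pageflags_extended(true, false, true, true));
    return 0;
}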
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 2ccea700968f..e8fb927392b9 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -111,44 +111,74 @@ static unsigned long __init init_bootmem_core(pg_data_t *pgdat,
111 * might be used for boot-time allocations - or it might get added 111 * might be used for boot-time allocations - or it might get added
112 * to the free page pool later on. 112 * to the free page pool later on.
113 */ 113 */
114static int __init reserve_bootmem_core(bootmem_data_t *bdata, 114static int __init can_reserve_bootmem_core(bootmem_data_t *bdata,
115 unsigned long addr, unsigned long size, int flags) 115 unsigned long addr, unsigned long size, int flags)
116{ 116{
117 unsigned long sidx, eidx; 117 unsigned long sidx, eidx;
118 unsigned long i; 118 unsigned long i;
119 int ret; 119
120 BUG_ON(!size);
121
122 /* out of range, don't hold other */
123 if (addr + size < bdata->node_boot_start ||
124 PFN_DOWN(addr) > bdata->node_low_pfn)
125 return 0;
120 126
121 /* 127 /*
122 * round up, partially reserved pages are considered 128 * Round up to index to the range.
123 * fully reserved.
124 */ 129 */
130 if (addr > bdata->node_boot_start)
131 sidx= PFN_DOWN(addr - bdata->node_boot_start);
132 else
133 sidx = 0;
134
135 eidx = PFN_UP(addr + size - bdata->node_boot_start);
136 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
137 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
138
139 for (i = sidx; i < eidx; i++) {
140 if (test_bit(i, bdata->node_bootmem_map)) {
141 if (flags & BOOTMEM_EXCLUSIVE)
142 return -EBUSY;
143 }
144 }
145
146 return 0;
147
148}
149
150static void __init reserve_bootmem_core(bootmem_data_t *bdata,
151 unsigned long addr, unsigned long size, int flags)
152{
153 unsigned long sidx, eidx;
154 unsigned long i;
155
125 BUG_ON(!size); 156 BUG_ON(!size);
126 BUG_ON(PFN_DOWN(addr) >= bdata->node_low_pfn);
127 BUG_ON(PFN_UP(addr + size) > bdata->node_low_pfn);
128 BUG_ON(addr < bdata->node_boot_start);
129 157
130 sidx = PFN_DOWN(addr - bdata->node_boot_start); 158 /* out of range */
159 if (addr + size < bdata->node_boot_start ||
160 PFN_DOWN(addr) > bdata->node_low_pfn)
161 return;
162
163 /*
164 * Round up to index to the range.
165 */
166 if (addr > bdata->node_boot_start)
167 sidx= PFN_DOWN(addr - bdata->node_boot_start);
168 else
169 sidx = 0;
170
131 eidx = PFN_UP(addr + size - bdata->node_boot_start); 171 eidx = PFN_UP(addr + size - bdata->node_boot_start);
172 if (eidx > bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start))
173 eidx = bdata->node_low_pfn - PFN_DOWN(bdata->node_boot_start);
132 174
133 for (i = sidx; i < eidx; i++) 175 for (i = sidx; i < eidx; i++) {
134 if (test_and_set_bit(i, bdata->node_bootmem_map)) { 176 if (test_and_set_bit(i, bdata->node_bootmem_map)) {
135#ifdef CONFIG_DEBUG_BOOTMEM 177#ifdef CONFIG_DEBUG_BOOTMEM
136 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE); 178 printk("hm, page %08lx reserved twice.\n", i*PAGE_SIZE);
137#endif 179#endif
138 if (flags & BOOTMEM_EXCLUSIVE) {
139 ret = -EBUSY;
140 goto err;
141 }
142 } 180 }
143 181 }
144 return 0;
145
146err:
147 /* unreserve memory we accidentally reserved */
148 for (i--; i >= sidx; i--)
149 clear_bit(i, bdata->node_bootmem_map);
150
151 return ret;
152} 182}
153 183
154static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr, 184static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
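Both helpers above translate a physical range into bitmap indexes with the same rounding: the start is rounded down and the end rounded up, so partially covered pages count as fully reserved, and the end index is clamped to the node's last pfn. A small stand-alone sketch of that arithmetic (PFN_DOWN/PFN_UP are reimplemented here for the example, with a 4 KiB page size and made-up addresses):

#include <stdio.h>

/* Illustration of the index math in can_reserve_bootmem_core() /
 * reserve_bootmem_core(). PAGE_SHIFT is assumed to be 12 here. */
#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PFN_DOWN(x) ((x) >> PAGE_SHIFT)
#define PFN_UP(x)   (((x) + PAGE_SIZE - 1) >> PAGE_SHIFT)

int main(void)
{
    unsigned long node_boot_start = 0x100000;   /* example values */
    unsigned long node_low_pfn    = 0x1000;
    unsigned long addr = 0x101800, size = 0x2400;
    unsigned long sidx, eidx;

    sidx = addr > node_boot_start ? PFN_DOWN(addr - node_boot_start) : 0;
    eidx = PFN_UP(addr + size - node_boot_start);
    if (eidx > node_low_pfn - PFN_DOWN(node_boot_start))
        eidx = node_low_pfn - PFN_DOWN(node_boot_start);

    /* bits [sidx, eidx) of node_bootmem_map would be set */
    printf("reserve bitmap indexes [%lu, %lu)\n", sidx, eidx);
    return 0;
}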
@@ -206,9 +236,11 @@ void * __init
206__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size, 236__alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
207 unsigned long align, unsigned long goal, unsigned long limit) 237 unsigned long align, unsigned long goal, unsigned long limit)
208{ 238{
209 unsigned long offset, remaining_size, areasize, preferred; 239 unsigned long areasize, preferred;
210 unsigned long i, start = 0, incr, eidx, end_pfn; 240 unsigned long i, start = 0, incr, eidx, end_pfn;
211 void *ret; 241 void *ret;
242 unsigned long node_boot_start;
243 void *node_bootmem_map;
212 244
213 if (!size) { 245 if (!size) {
214 printk("__alloc_bootmem_core(): zero-sized request\n"); 246 printk("__alloc_bootmem_core(): zero-sized request\n");
@@ -216,70 +248,83 @@ __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
216 } 248 }
217 BUG_ON(align & (align-1)); 249 BUG_ON(align & (align-1));
218 250
219 if (limit && bdata->node_boot_start >= limit)
220 return NULL;
221
222 /* on nodes without memory - bootmem_map is NULL */ 251 /* on nodes without memory - bootmem_map is NULL */
223 if (!bdata->node_bootmem_map) 252 if (!bdata->node_bootmem_map)
224 return NULL; 253 return NULL;
225 254
255 /* bdata->node_boot_start is supposed to be (12+6)bits alignment on x86_64 ? */
256 node_boot_start = bdata->node_boot_start;
257 node_bootmem_map = bdata->node_bootmem_map;
258 if (align) {
259 node_boot_start = ALIGN(bdata->node_boot_start, align);
260 if (node_boot_start > bdata->node_boot_start)
261 node_bootmem_map = (unsigned long *)bdata->node_bootmem_map +
262 PFN_DOWN(node_boot_start - bdata->node_boot_start)/BITS_PER_LONG;
263 }
264
265 if (limit && node_boot_start >= limit)
266 return NULL;
267
226 end_pfn = bdata->node_low_pfn; 268 end_pfn = bdata->node_low_pfn;
227 limit = PFN_DOWN(limit); 269 limit = PFN_DOWN(limit);
228 if (limit && end_pfn > limit) 270 if (limit && end_pfn > limit)
229 end_pfn = limit; 271 end_pfn = limit;
230 272
231 eidx = end_pfn - PFN_DOWN(bdata->node_boot_start); 273 eidx = end_pfn - PFN_DOWN(node_boot_start);
232 offset = 0;
233 if (align && (bdata->node_boot_start & (align - 1UL)) != 0)
234 offset = align - (bdata->node_boot_start & (align - 1UL));
235 offset = PFN_DOWN(offset);
236 274
237 /* 275 /*
238 * We try to allocate bootmem pages above 'goal' 276 * We try to allocate bootmem pages above 'goal'
239 * first, then we try to allocate lower pages. 277 * first, then we try to allocate lower pages.
240 */ 278 */
241 if (goal && goal >= bdata->node_boot_start && PFN_DOWN(goal) < end_pfn) { 279 preferred = 0;
242 preferred = goal - bdata->node_boot_start; 280 if (goal && PFN_DOWN(goal) < end_pfn) {
281 if (goal > node_boot_start)
282 preferred = goal - node_boot_start;
243 283
244 if (bdata->last_success >= preferred) 284 if (bdata->last_success > node_boot_start &&
285 bdata->last_success - node_boot_start >= preferred)
245 if (!limit || (limit && limit > bdata->last_success)) 286 if (!limit || (limit && limit > bdata->last_success))
246 preferred = bdata->last_success; 287 preferred = bdata->last_success - node_boot_start;
247 } else 288 }
248 preferred = 0;
249 289
250 preferred = PFN_DOWN(ALIGN(preferred, align)) + offset; 290 preferred = PFN_DOWN(ALIGN(preferred, align));
251 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE; 291 areasize = (size + PAGE_SIZE-1) / PAGE_SIZE;
252 incr = align >> PAGE_SHIFT ? : 1; 292 incr = align >> PAGE_SHIFT ? : 1;
253 293
254restart_scan: 294restart_scan:
255 for (i = preferred; i < eidx; i += incr) { 295 for (i = preferred; i < eidx;) {
256 unsigned long j; 296 unsigned long j;
257 i = find_next_zero_bit(bdata->node_bootmem_map, eidx, i); 297
298 i = find_next_zero_bit(node_bootmem_map, eidx, i);
258 i = ALIGN(i, incr); 299 i = ALIGN(i, incr);
259 if (i >= eidx) 300 if (i >= eidx)
260 break; 301 break;
261 if (test_bit(i, bdata->node_bootmem_map)) 302 if (test_bit(i, node_bootmem_map)) {
303 i += incr;
262 continue; 304 continue;
305 }
263 for (j = i + 1; j < i + areasize; ++j) { 306 for (j = i + 1; j < i + areasize; ++j) {
264 if (j >= eidx) 307 if (j >= eidx)
265 goto fail_block; 308 goto fail_block;
266 if (test_bit(j, bdata->node_bootmem_map)) 309 if (test_bit(j, node_bootmem_map))
267 goto fail_block; 310 goto fail_block;
268 } 311 }
269 start = i; 312 start = i;
270 goto found; 313 goto found;
271 fail_block: 314 fail_block:
272 i = ALIGN(j, incr); 315 i = ALIGN(j, incr);
316 if (i == j)
317 i += incr;
273 } 318 }
274 319
275 if (preferred > offset) { 320 if (preferred > 0) {
276 preferred = offset; 321 preferred = 0;
277 goto restart_scan; 322 goto restart_scan;
278 } 323 }
279 return NULL; 324 return NULL;
280 325
281found: 326found:
282 bdata->last_success = PFN_PHYS(start); 327 bdata->last_success = PFN_PHYS(start) + node_boot_start;
283 BUG_ON(start >= eidx); 328 BUG_ON(start >= eidx);
284 329
285 /* 330 /*
@@ -289,6 +334,7 @@ found:
289 */ 334 */
290 if (align < PAGE_SIZE && 335 if (align < PAGE_SIZE &&
291 bdata->last_offset && bdata->last_pos+1 == start) { 336 bdata->last_offset && bdata->last_pos+1 == start) {
337 unsigned long offset, remaining_size;
292 offset = ALIGN(bdata->last_offset, align); 338 offset = ALIGN(bdata->last_offset, align);
293 BUG_ON(offset > PAGE_SIZE); 339 BUG_ON(offset > PAGE_SIZE);
294 remaining_size = PAGE_SIZE - offset; 340 remaining_size = PAGE_SIZE - offset;
@@ -297,14 +343,12 @@ found:
297 /* last_pos unchanged */ 343 /* last_pos unchanged */
298 bdata->last_offset = offset + size; 344 bdata->last_offset = offset + size;
299 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + 345 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
300 offset + 346 offset + node_boot_start);
301 bdata->node_boot_start);
302 } else { 347 } else {
303 remaining_size = size - remaining_size; 348 remaining_size = size - remaining_size;
304 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE; 349 areasize = (remaining_size + PAGE_SIZE-1) / PAGE_SIZE;
305 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE + 350 ret = phys_to_virt(bdata->last_pos * PAGE_SIZE +
306 offset + 351 offset + node_boot_start);
307 bdata->node_boot_start);
308 bdata->last_pos = start + areasize - 1; 352 bdata->last_pos = start + areasize - 1;
309 bdata->last_offset = remaining_size; 353 bdata->last_offset = remaining_size;
310 } 354 }
@@ -312,14 +356,14 @@ found:
312 } else { 356 } else {
313 bdata->last_pos = start + areasize - 1; 357 bdata->last_pos = start + areasize - 1;
314 bdata->last_offset = size & ~PAGE_MASK; 358 bdata->last_offset = size & ~PAGE_MASK;
315 ret = phys_to_virt(start * PAGE_SIZE + bdata->node_boot_start); 359 ret = phys_to_virt(start * PAGE_SIZE + node_boot_start);
316 } 360 }
317 361
318 /* 362 /*
319 * Reserve the area now: 363 * Reserve the area now:
320 */ 364 */
321 for (i = start; i < start + areasize; i++) 365 for (i = start; i < start + areasize; i++)
322 if (unlikely(test_and_set_bit(i, bdata->node_bootmem_map))) 366 if (unlikely(test_and_set_bit(i, node_bootmem_map)))
323 BUG(); 367 BUG();
324 memset(ret, 0, size); 368 memset(ret, 0, size);
325 return ret; 369 return ret;
@@ -401,6 +445,11 @@ unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
401void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, 445void __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
402 unsigned long size, int flags) 446 unsigned long size, int flags)
403{ 447{
448 int ret;
449
450 ret = can_reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
451 if (ret < 0)
452 return;
404 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags); 453 reserve_bootmem_core(pgdat->bdata, physaddr, size, flags);
405} 454}
406 455
@@ -412,6 +461,7 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
412 461
413unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) 462unsigned long __init free_all_bootmem_node(pg_data_t *pgdat)
414{ 463{
464 register_page_bootmem_info_node(pgdat);
415 return free_all_bootmem_core(pgdat); 465 return free_all_bootmem_core(pgdat);
416} 466}
417 467
@@ -426,7 +476,18 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
426int __init reserve_bootmem(unsigned long addr, unsigned long size, 476int __init reserve_bootmem(unsigned long addr, unsigned long size,
427 int flags) 477 int flags)
428{ 478{
429 return reserve_bootmem_core(NODE_DATA(0)->bdata, addr, size, flags); 479 bootmem_data_t *bdata;
480 int ret;
481
482 list_for_each_entry(bdata, &bdata_list, list) {
483 ret = can_reserve_bootmem_core(bdata, addr, size, flags);
484 if (ret < 0)
485 return ret;
486 }
487 list_for_each_entry(bdata, &bdata_list, list)
488 reserve_bootmem_core(bdata, addr, size, flags);
489
490 return 0;
430} 491}
431#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */ 492#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
432 493
@@ -484,6 +545,37 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
484 return __alloc_bootmem(size, align, goal); 545 return __alloc_bootmem(size, align, goal);
485} 546}
486 547
548#ifdef CONFIG_SPARSEMEM
549void * __init alloc_bootmem_section(unsigned long size,
550 unsigned long section_nr)
551{
552 void *ptr;
553 unsigned long limit, goal, start_nr, end_nr, pfn;
554 struct pglist_data *pgdat;
555
556 pfn = section_nr_to_pfn(section_nr);
557 goal = PFN_PHYS(pfn);
558 limit = PFN_PHYS(section_nr_to_pfn(section_nr + 1)) - 1;
559 pgdat = NODE_DATA(early_pfn_to_nid(pfn));
560 ptr = __alloc_bootmem_core(pgdat->bdata, size, SMP_CACHE_BYTES, goal,
561 limit);
562
563 if (!ptr)
564 return NULL;
565
566 start_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr)));
567 end_nr = pfn_to_section_nr(PFN_DOWN(__pa(ptr) + size));
568 if (start_nr != section_nr || end_nr != section_nr) {
569 printk(KERN_WARNING "alloc_bootmem failed on section %ld.\n",
570 section_nr);
571 free_bootmem_core(pgdat->bdata, __pa(ptr), size);
572 ptr = NULL;
573 }
574
575 return ptr;
576}
577#endif
578
487#ifndef ARCH_LOW_ADDRESS_LIMIT 579#ifndef ARCH_LOW_ADDRESS_LIMIT
488#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL 580#define ARCH_LOW_ADDRESS_LIMIT 0xffffffffUL
489#endif 581#endif
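The reworked reserve_bootmem() above walks bdata_list twice: can_reserve_bootmem_core() checks every node's bitmap first, and only then does reserve_bootmem_core() commit the bits, which is why the old "unreserve memory we accidentally reserved" rollback could be dropped. A generic sketch of that validate-then-commit pattern (plain arrays stand in for the per-node bitmaps; all names and sizes here are invented):

#include <stdbool.h>
#include <stdio.h>

#define NODES 3
#define BITS  16

static bool used[NODES][BITS];

/* pass 1 helper: report a clash without touching anything */
static int can_reserve(int node, int start, int len, bool exclusive)
{
    for (int i = start; i < start + len; i++)
        if (used[node][i] && exclusive)
            return -1;
    return 0;
}

/* pass 2 helper: actually mark the range */
static void do_reserve(int node, int start, int len)
{
    for (int i = start; i < start + len; i++)
        used[node][i] = true;
}

int main(void)
{
    int start = 4, len = 3;

    for (int n = 0; n < NODES; n++)          /* check everywhere first */
        if (can_reserve(n, start, len, true) < 0)
            return 1;
    for (int n = 0; n < NODES; n++)          /* then commit everywhere */
        do_reserve(n, start, len);

    printf("reserved [%d, %d) on all nodes\n", start, start + len);
    return 0;
}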
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 34aaac451a96..b1f0885dda22 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -37,6 +37,10 @@
37#include <linux/types.h> 37#include <linux/types.h>
38#include <linux/wait.h> 38#include <linux/wait.h>
39 39
40#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
41#define DMAPOOL_DEBUG 1
42#endif
43
40struct dma_pool { /* the pool */ 44struct dma_pool { /* the pool */
41 struct list_head page_list; 45 struct list_head page_list;
42 spinlock_t lock; 46 spinlock_t lock;
@@ -216,7 +220,7 @@ static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
216 page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation, 220 page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
217 &page->dma, mem_flags); 221 &page->dma, mem_flags);
218 if (page->vaddr) { 222 if (page->vaddr) {
219#ifdef CONFIG_DEBUG_SLAB 223#ifdef DMAPOOL_DEBUG
220 memset(page->vaddr, POOL_POISON_FREED, pool->allocation); 224 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
221#endif 225#endif
222 pool_initialise_page(pool, page); 226 pool_initialise_page(pool, page);
@@ -239,7 +243,7 @@ static void pool_free_page(struct dma_pool *pool, struct dma_page *page)
239{ 243{
240 dma_addr_t dma = page->dma; 244 dma_addr_t dma = page->dma;
241 245
242#ifdef CONFIG_DEBUG_SLAB 246#ifdef DMAPOOL_DEBUG
243 memset(page->vaddr, POOL_POISON_FREED, pool->allocation); 247 memset(page->vaddr, POOL_POISON_FREED, pool->allocation);
244#endif 248#endif
245 dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma); 249 dma_free_coherent(pool->dev, pool->allocation, page->vaddr, dma);
@@ -336,7 +340,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
336 page->offset = *(int *)(page->vaddr + offset); 340 page->offset = *(int *)(page->vaddr + offset);
337 retval = offset + page->vaddr; 341 retval = offset + page->vaddr;
338 *handle = offset + page->dma; 342 *handle = offset + page->dma;
339#ifdef CONFIG_DEBUG_SLAB 343#ifdef DMAPOOL_DEBUG
340 memset(retval, POOL_POISON_ALLOCATED, pool->size); 344 memset(retval, POOL_POISON_ALLOCATED, pool->size);
341#endif 345#endif
342 done: 346 done:
@@ -391,7 +395,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma)
391 } 395 }
392 396
393 offset = vaddr - page->vaddr; 397 offset = vaddr - page->vaddr;
394#ifdef CONFIG_DEBUG_SLAB 398#ifdef DMAPOOL_DEBUG
395 if ((dma - page->dma) != offset) { 399 if ((dma - page->dma) != offset) {
396 if (pool->dev) 400 if (pool->dev)
397 dev_err(pool->dev, 401 dev_err(pool->dev,
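The DMAPOOL_DEBUG blocks above poison a block with one byte pattern while it sits free in the pool and another once it is handed out, so reads of stale memory are easy to spot; the only change here is that the poisoning now also triggers under SLUB debugging. A stand-alone sketch of that poisoning idea (the byte values and the single-block "pool" are invented for this example; the kernel uses its own POOL_POISON_* constants):

#include <stdio.h>
#include <string.h>

#define POISON_FREED     0xa5   /* arbitrary values for illustration */
#define POISON_ALLOCATED 0x5a
#define BLOCK_SIZE       32

static unsigned char block[BLOCK_SIZE];

static void *toy_alloc(void)
{
    memset(block, POISON_ALLOCATED, BLOCK_SIZE);
    return block;
}

static void toy_free(void *p)
{
    memset(p, POISON_FREED, BLOCK_SIZE);
}

int main(void)
{
    unsigned char *p = toy_alloc();
    toy_free(p);
    /* A later read of p now shows the "freed" pattern instead of old
     * data, flagging a use-after-free. */
    printf("first byte after free: 0x%02x\n", p[0]);
    return 0;
}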
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 3c0f1e99f5e4..343cfdfebd9e 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -49,7 +49,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
49 goto out; 49 goto out;
50 } 50 }
51 51
52 if (mapping->a_ops->get_xip_page) { 52 if (mapping->a_ops->get_xip_mem) {
53 switch (advice) { 53 switch (advice) {
54 case POSIX_FADV_NORMAL: 54 case POSIX_FADV_NORMAL:
55 case POSIX_FADV_RANDOM: 55 case POSIX_FADV_RANDOM:
diff --git a/mm/filemap.c b/mm/filemap.c
index 07e9d9258b48..239d36163bbe 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -576,10 +576,12 @@ EXPORT_SYMBOL(unlock_page);
576 */ 576 */
577void end_page_writeback(struct page *page) 577void end_page_writeback(struct page *page)
578{ 578{
579 if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) { 579 if (TestClearPageReclaim(page))
580 if (!test_clear_page_writeback(page)) 580 rotate_reclaimable_page(page);
581 BUG(); 581
582 } 582 if (!test_clear_page_writeback(page))
583 BUG();
584
583 smp_mb__after_clear_bit(); 585 smp_mb__after_clear_bit();
584 wake_up_page(page, PG_writeback); 586 wake_up_page(page, PG_writeback);
585} 587}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 5e598c42afd7..3e744abcce9d 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -15,6 +15,7 @@
15#include <linux/rmap.h> 15#include <linux/rmap.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <asm/tlbflush.h> 17#include <asm/tlbflush.h>
18#include <asm/io.h>
18 19
19/* 20/*
20 * We do use our own empty page to avoid interference with other users 21 * We do use our own empty page to avoid interference with other users
@@ -42,37 +43,41 @@ static struct page *xip_sparse_page(void)
42 43
43/* 44/*
44 * This is a file read routine for execute in place files, and uses 45 * This is a file read routine for execute in place files, and uses
45 * the mapping->a_ops->get_xip_page() function for the actual low-level 46 * the mapping->a_ops->get_xip_mem() function for the actual low-level
46 * stuff. 47 * stuff.
47 * 48 *
48 * Note the struct file* is not used at all. It may be NULL. 49 * Note the struct file* is not used at all. It may be NULL.
49 */ 50 */
50static void 51static ssize_t
51do_xip_mapping_read(struct address_space *mapping, 52do_xip_mapping_read(struct address_space *mapping,
52 struct file_ra_state *_ra, 53 struct file_ra_state *_ra,
53 struct file *filp, 54 struct file *filp,
54 loff_t *ppos, 55 char __user *buf,
55 read_descriptor_t *desc, 56 size_t len,
56 read_actor_t actor) 57 loff_t *ppos)
57{ 58{
58 struct inode *inode = mapping->host; 59 struct inode *inode = mapping->host;
59 pgoff_t index, end_index; 60 pgoff_t index, end_index;
60 unsigned long offset; 61 unsigned long offset;
61 loff_t isize; 62 loff_t isize, pos;
63 size_t copied = 0, error = 0;
62 64
63 BUG_ON(!mapping->a_ops->get_xip_page); 65 BUG_ON(!mapping->a_ops->get_xip_mem);
64 66
65 index = *ppos >> PAGE_CACHE_SHIFT; 67 pos = *ppos;
66 offset = *ppos & ~PAGE_CACHE_MASK; 68 index = pos >> PAGE_CACHE_SHIFT;
69 offset = pos & ~PAGE_CACHE_MASK;
67 70
68 isize = i_size_read(inode); 71 isize = i_size_read(inode);
69 if (!isize) 72 if (!isize)
70 goto out; 73 goto out;
71 74
72 end_index = (isize - 1) >> PAGE_CACHE_SHIFT; 75 end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
73 for (;;) { 76 do {
74 struct page *page; 77 unsigned long nr, left;
75 unsigned long nr, ret; 78 void *xip_mem;
79 unsigned long xip_pfn;
80 int zero = 0;
76 81
77 /* nr is the maximum number of bytes to copy from this page */ 82 /* nr is the maximum number of bytes to copy from this page */
78 nr = PAGE_CACHE_SIZE; 83 nr = PAGE_CACHE_SIZE;
@@ -85,19 +90,17 @@ do_xip_mapping_read(struct address_space *mapping,
85 } 90 }
86 } 91 }
87 nr = nr - offset; 92 nr = nr - offset;
93 if (nr > len)
94 nr = len;
88 95
89 page = mapping->a_ops->get_xip_page(mapping, 96 error = mapping->a_ops->get_xip_mem(mapping, index, 0,
90 index*(PAGE_SIZE/512), 0); 97 &xip_mem, &xip_pfn);
91 if (!page) 98 if (unlikely(error)) {
92 goto no_xip_page; 99 if (error == -ENODATA) {
93 if (unlikely(IS_ERR(page))) {
94 if (PTR_ERR(page) == -ENODATA) {
95 /* sparse */ 100 /* sparse */
96 page = ZERO_PAGE(0); 101 zero = 1;
97 } else { 102 } else
98 desc->error = PTR_ERR(page);
99 goto out; 103 goto out;
100 }
101 } 104 }
102 105
103 /* If users can be writing to this page using arbitrary 106 /* If users can be writing to this page using arbitrary
@@ -105,10 +108,10 @@ do_xip_mapping_read(struct address_space *mapping,
105 * before reading the page on the kernel side. 108 * before reading the page on the kernel side.
106 */ 109 */
107 if (mapping_writably_mapped(mapping)) 110 if (mapping_writably_mapped(mapping))
108 flush_dcache_page(page); 111 /* address based flush */ ;
109 112
110 /* 113 /*
111 * Ok, we have the page, so now we can copy it to user space... 114 * Ok, we have the mem, so now we can copy it to user space...
112 * 115 *
113 * The actor routine returns how many bytes were actually used.. 116 * The actor routine returns how many bytes were actually used..
114 * NOTE! This may not be the same as how much of a user buffer 117 * NOTE! This may not be the same as how much of a user buffer
@@ -116,47 +119,38 @@ do_xip_mapping_read(struct address_space *mapping,
116 * "pos" here (the actor routine has to update the user buffer 119 * "pos" here (the actor routine has to update the user buffer
117 * pointers and the remaining count). 120 * pointers and the remaining count).
118 */ 121 */
119 ret = actor(desc, page, offset, nr); 122 if (!zero)
120 offset += ret; 123 left = __copy_to_user(buf+copied, xip_mem+offset, nr);
121 index += offset >> PAGE_CACHE_SHIFT; 124 else
122 offset &= ~PAGE_CACHE_MASK; 125 left = __clear_user(buf + copied, nr);
123 126
124 if (ret == nr && desc->count) 127 if (left) {
125 continue; 128 error = -EFAULT;
126 goto out; 129 goto out;
130 }
127 131
128no_xip_page: 132 copied += (nr - left);
129 /* Did not get the page. Report it */ 133 offset += (nr - left);
130 desc->error = -EIO; 134 index += offset >> PAGE_CACHE_SHIFT;
131 goto out; 135 offset &= ~PAGE_CACHE_MASK;
132 } 136 } while (copied < len);
133 137
134out: 138out:
135 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset; 139 *ppos = pos + copied;
136 if (filp) 140 if (filp)
137 file_accessed(filp); 141 file_accessed(filp);
142
143 return (copied ? copied : error);
138} 144}
139 145
140ssize_t 146ssize_t
141xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) 147xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
142{ 148{
143 read_descriptor_t desc;
144
145 if (!access_ok(VERIFY_WRITE, buf, len)) 149 if (!access_ok(VERIFY_WRITE, buf, len))
146 return -EFAULT; 150 return -EFAULT;
147 151
148 desc.written = 0; 152 return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
149 desc.arg.buf = buf; 153 buf, len, ppos);
150 desc.count = len;
151 desc.error = 0;
152
153 do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
154 ppos, &desc, file_read_actor);
155
156 if (desc.written)
157 return desc.written;
158 else
159 return desc.error;
160} 154}
161EXPORT_SYMBOL_GPL(xip_file_read); 155EXPORT_SYMBOL_GPL(xip_file_read);
162 156
@@ -211,13 +205,16 @@ __xip_unmap (struct address_space * mapping,
211 * 205 *
212 * This function is derived from filemap_fault, but used for execute in place 206 * This function is derived from filemap_fault, but used for execute in place
213 */ 207 */
214static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf) 208static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
215{ 209{
216 struct file *file = area->vm_file; 210 struct file *file = vma->vm_file;
217 struct address_space *mapping = file->f_mapping; 211 struct address_space *mapping = file->f_mapping;
218 struct inode *inode = mapping->host; 212 struct inode *inode = mapping->host;
219 struct page *page;
220 pgoff_t size; 213 pgoff_t size;
214 void *xip_mem;
215 unsigned long xip_pfn;
216 struct page *page;
217 int error;
221 218
222 /* XXX: are VM_FAULT_ codes OK? */ 219 /* XXX: are VM_FAULT_ codes OK? */
223 220
@@ -225,35 +222,44 @@ static int xip_file_fault(struct vm_area_struct *area, struct vm_fault *vmf)
225 if (vmf->pgoff >= size) 222 if (vmf->pgoff >= size)
226 return VM_FAULT_SIGBUS; 223 return VM_FAULT_SIGBUS;
227 224
228 page = mapping->a_ops->get_xip_page(mapping, 225 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
229 vmf->pgoff*(PAGE_SIZE/512), 0); 226 &xip_mem, &xip_pfn);
230 if (!IS_ERR(page)) 227 if (likely(!error))
231 goto out; 228 goto found;
232 if (PTR_ERR(page) != -ENODATA) 229 if (error != -ENODATA)
233 return VM_FAULT_OOM; 230 return VM_FAULT_OOM;
234 231
235 /* sparse block */ 232 /* sparse block */
236 if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) && 233 if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
237 (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) && 234 (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
238 (!(mapping->host->i_sb->s_flags & MS_RDONLY))) { 235 (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
236 int err;
237
239 /* maybe shared writable, allocate new block */ 238 /* maybe shared writable, allocate new block */
240 page = mapping->a_ops->get_xip_page(mapping, 239 error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
241 vmf->pgoff*(PAGE_SIZE/512), 1); 240 &xip_mem, &xip_pfn);
242 if (IS_ERR(page)) 241 if (error)
243 return VM_FAULT_SIGBUS; 242 return VM_FAULT_SIGBUS;
244 /* unmap page at pgoff from all other vmas */ 243 /* unmap sparse mappings at pgoff from all other vmas */
245 __xip_unmap(mapping, vmf->pgoff); 244 __xip_unmap(mapping, vmf->pgoff);
245
246found:
247 err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
248 xip_pfn);
249 if (err == -ENOMEM)
250 return VM_FAULT_OOM;
251 BUG_ON(err);
252 return VM_FAULT_NOPAGE;
246 } else { 253 } else {
247 /* not shared and writable, use xip_sparse_page() */ 254 /* not shared and writable, use xip_sparse_page() */
248 page = xip_sparse_page(); 255 page = xip_sparse_page();
249 if (!page) 256 if (!page)
250 return VM_FAULT_OOM; 257 return VM_FAULT_OOM;
251 }
252 258
253out: 259 page_cache_get(page);
254 page_cache_get(page); 260 vmf->page = page;
255 vmf->page = page; 261 return 0;
256 return 0; 262 }
257} 263}
258 264
259static struct vm_operations_struct xip_file_vm_ops = { 265static struct vm_operations_struct xip_file_vm_ops = {
@@ -262,11 +268,11 @@ static struct vm_operations_struct xip_file_vm_ops = {
262 268
263int xip_file_mmap(struct file * file, struct vm_area_struct * vma) 269int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
264{ 270{
265 BUG_ON(!file->f_mapping->a_ops->get_xip_page); 271 BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
266 272
267 file_accessed(file); 273 file_accessed(file);
268 vma->vm_ops = &xip_file_vm_ops; 274 vma->vm_ops = &xip_file_vm_ops;
269 vma->vm_flags |= VM_CAN_NONLINEAR; 275 vma->vm_flags |= VM_CAN_NONLINEAR | VM_MIXEDMAP;
270 return 0; 276 return 0;
271} 277}
272EXPORT_SYMBOL_GPL(xip_file_mmap); 278EXPORT_SYMBOL_GPL(xip_file_mmap);
@@ -279,17 +285,17 @@ __xip_file_write(struct file *filp, const char __user *buf,
279 const struct address_space_operations *a_ops = mapping->a_ops; 285 const struct address_space_operations *a_ops = mapping->a_ops;
280 struct inode *inode = mapping->host; 286 struct inode *inode = mapping->host;
281 long status = 0; 287 long status = 0;
282 struct page *page;
283 size_t bytes; 288 size_t bytes;
284 ssize_t written = 0; 289 ssize_t written = 0;
285 290
286 BUG_ON(!mapping->a_ops->get_xip_page); 291 BUG_ON(!mapping->a_ops->get_xip_mem);
287 292
288 do { 293 do {
289 unsigned long index; 294 unsigned long index;
290 unsigned long offset; 295 unsigned long offset;
291 size_t copied; 296 size_t copied;
292 char *kaddr; 297 void *xip_mem;
298 unsigned long xip_pfn;
293 299
294 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */ 300 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
295 index = pos >> PAGE_CACHE_SHIFT; 301 index = pos >> PAGE_CACHE_SHIFT;
@@ -297,28 +303,22 @@ __xip_file_write(struct file *filp, const char __user *buf,
297 if (bytes > count) 303 if (bytes > count)
298 bytes = count; 304 bytes = count;
299 305
300 page = a_ops->get_xip_page(mapping, 306 status = a_ops->get_xip_mem(mapping, index, 0,
301 index*(PAGE_SIZE/512), 0); 307 &xip_mem, &xip_pfn);
302 if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) { 308 if (status == -ENODATA) {
303 /* we allocate a new page unmap it */ 309 /* we allocate a new page unmap it */
304 page = a_ops->get_xip_page(mapping, 310 status = a_ops->get_xip_mem(mapping, index, 1,
305 index*(PAGE_SIZE/512), 1); 311 &xip_mem, &xip_pfn);
306 if (!IS_ERR(page)) 312 if (!status)
307 /* unmap page at pgoff from all other vmas */ 313 /* unmap page at pgoff from all other vmas */
308 __xip_unmap(mapping, index); 314 __xip_unmap(mapping, index);
309 } 315 }
310 316
311 if (IS_ERR(page)) { 317 if (status)
312 status = PTR_ERR(page);
313 break; 318 break;
314 }
315 319
316 fault_in_pages_readable(buf, bytes);
317 kaddr = kmap_atomic(page, KM_USER0);
318 copied = bytes - 320 copied = bytes -
319 __copy_from_user_inatomic_nocache(kaddr + offset, buf, bytes); 321 __copy_from_user_nocache(xip_mem + offset, buf, bytes);
320 kunmap_atomic(kaddr, KM_USER0);
321 flush_dcache_page(page);
322 322
323 if (likely(copied > 0)) { 323 if (likely(copied > 0)) {
324 status = copied; 324 status = copied;
@@ -398,7 +398,7 @@ EXPORT_SYMBOL_GPL(xip_file_write);
398 398
399/* 399/*
400 * truncate a page used for execute in place 400 * truncate a page used for execute in place
401 * functionality is analog to block_truncate_page but does use get_xip_page 401 * functionality is analog to block_truncate_page but does use get_xip_mem
402 * to get the page instead of page cache 402 * to get the page instead of page cache
403 */ 403 */
404int 404int
@@ -408,9 +408,11 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
408 unsigned offset = from & (PAGE_CACHE_SIZE-1); 408 unsigned offset = from & (PAGE_CACHE_SIZE-1);
409 unsigned blocksize; 409 unsigned blocksize;
410 unsigned length; 410 unsigned length;
411 struct page *page; 411 void *xip_mem;
412 unsigned long xip_pfn;
413 int err;
412 414
413 BUG_ON(!mapping->a_ops->get_xip_page); 415 BUG_ON(!mapping->a_ops->get_xip_mem);
414 416
415 blocksize = 1 << mapping->host->i_blkbits; 417 blocksize = 1 << mapping->host->i_blkbits;
416 length = offset & (blocksize - 1); 418 length = offset & (blocksize - 1);
@@ -421,18 +423,16 @@ xip_truncate_page(struct address_space *mapping, loff_t from)
421 423
422 length = blocksize - length; 424 length = blocksize - length;
423 425
424 page = mapping->a_ops->get_xip_page(mapping, 426 err = mapping->a_ops->get_xip_mem(mapping, index, 0,
425 index*(PAGE_SIZE/512), 0); 427 &xip_mem, &xip_pfn);
426 if (!page) 428 if (unlikely(err)) {
427 return -ENOMEM; 429 if (err == -ENODATA)
428 if (unlikely(IS_ERR(page))) {
429 if (PTR_ERR(page) == -ENODATA)
430 /* Hole? No need to truncate */ 430 /* Hole? No need to truncate */
431 return 0; 431 return 0;
432 else 432 else
433 return PTR_ERR(page); 433 return err;
434 } 434 }
435 zero_user(page, offset, length); 435 memset(xip_mem + offset, 0, length);
436 return 0; 436 return 0;
437} 437}
438EXPORT_SYMBOL_GPL(xip_truncate_page); 438EXPORT_SYMBOL_GPL(xip_truncate_page);
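With get_xip_mem() returning a kernel address instead of a struct page, do_xip_mapping_read() above copies straight out of the backing memory, walking the file page by page and carrying an intra-page offset. A simplified user-space model of that copy loop (a flat buffer plays the role of the XIP memory, memcpy stands in for __copy_to_user, and the sizes are made up):

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL   /* assumed page size for the example */

static size_t xip_read(const char *xip_mem, size_t isize,
                       char *buf, size_t len, size_t pos)
{
    size_t index  = pos / PAGE_SIZE;
    size_t offset = pos % PAGE_SIZE;
    size_t copied = 0;

    while (copied < len && index * PAGE_SIZE + offset < isize) {
        size_t nr = PAGE_SIZE - offset;          /* bytes left in this page */
        if (index * PAGE_SIZE + offset + nr > isize)
            nr = isize - (index * PAGE_SIZE + offset);
        if (nr > len - copied)
            nr = len - copied;

        memcpy(buf + copied, xip_mem + index * PAGE_SIZE + offset, nr);

        copied += nr;
        offset += nr;
        index  += offset / PAGE_SIZE;            /* advance to the next page */
        offset %= PAGE_SIZE;
    }
    return copied;
}

int main(void)
{
    static char mem[2 * PAGE_SIZE];
    char buf[64];
    memset(mem, 'x', sizeof(mem));
    size_t n = xip_read(mem, sizeof(mem), buf, sizeof(buf), PAGE_SIZE - 10);
    printf("copied %zu bytes across a page boundary\n", n);
    return 0;
}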
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 51c9e2c01640..df28c1773fb2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -95,13 +95,16 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
95 int nid; 95 int nid;
96 struct page *page = NULL; 96 struct page *page = NULL;
97 struct mempolicy *mpol; 97 struct mempolicy *mpol;
98 nodemask_t *nodemask;
98 struct zonelist *zonelist = huge_zonelist(vma, address, 99 struct zonelist *zonelist = huge_zonelist(vma, address,
99 htlb_alloc_mask, &mpol); 100 htlb_alloc_mask, &mpol, &nodemask);
100 struct zone **z; 101 struct zone *zone;
101 102 struct zoneref *z;
102 for (z = zonelist->zones; *z; z++) { 103
103 nid = zone_to_nid(*z); 104 for_each_zone_zonelist_nodemask(zone, z, zonelist,
104 if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && 105 MAX_NR_ZONES - 1, nodemask) {
106 nid = zone_to_nid(zone);
107 if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
105 !list_empty(&hugepage_freelists[nid])) { 108 !list_empty(&hugepage_freelists[nid])) {
106 page = list_entry(hugepage_freelists[nid].next, 109 page = list_entry(hugepage_freelists[nid].next,
107 struct page, lru); 110 struct page, lru);
@@ -113,7 +116,7 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
113 break; 116 break;
114 } 117 }
115 } 118 }
116 mpol_free(mpol); /* unref if mpol !NULL */ 119 mpol_cond_put(mpol);
117 return page; 120 return page;
118} 121}
119 122
@@ -129,6 +132,7 @@ static void update_and_free_page(struct page *page)
129 } 132 }
130 set_compound_page_dtor(page, NULL); 133 set_compound_page_dtor(page, NULL);
131 set_page_refcounted(page); 134 set_page_refcounted(page);
135 arch_release_hugepage(page);
132 __free_pages(page, HUGETLB_PAGE_ORDER); 136 __free_pages(page, HUGETLB_PAGE_ORDER);
133} 137}
134 138
@@ -198,6 +202,10 @@ static struct page *alloc_fresh_huge_page_node(int nid)
198 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN, 202 htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
199 HUGETLB_PAGE_ORDER); 203 HUGETLB_PAGE_ORDER);
200 if (page) { 204 if (page) {
205 if (arch_prepare_hugepage(page)) {
206 __free_pages(page, HUGETLB_PAGE_ORDER);
207 return 0;
208 }
201 set_compound_page_dtor(page, free_huge_page); 209 set_compound_page_dtor(page, free_huge_page);
202 spin_lock(&hugetlb_lock); 210 spin_lock(&hugetlb_lock);
203 nr_huge_pages++; 211 nr_huge_pages++;
@@ -239,6 +247,11 @@ static int alloc_fresh_huge_page(void)
239 hugetlb_next_nid = next_nid; 247 hugetlb_next_nid = next_nid;
240 } while (!page && hugetlb_next_nid != start_nid); 248 } while (!page && hugetlb_next_nid != start_nid);
241 249
250 if (ret)
251 count_vm_event(HTLB_BUDDY_PGALLOC);
252 else
253 count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
254
242 return ret; 255 return ret;
243} 256}
244 257
@@ -299,9 +312,11 @@ static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
299 */ 312 */
300 nr_huge_pages_node[nid]++; 313 nr_huge_pages_node[nid]++;
301 surplus_huge_pages_node[nid]++; 314 surplus_huge_pages_node[nid]++;
315 __count_vm_event(HTLB_BUDDY_PGALLOC);
302 } else { 316 } else {
303 nr_huge_pages--; 317 nr_huge_pages--;
304 surplus_huge_pages--; 318 surplus_huge_pages--;
319 __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
305 } 320 }
306 spin_unlock(&hugetlb_lock); 321 spin_unlock(&hugetlb_lock);
307 322
@@ -369,11 +384,19 @@ retry:
369 resv_huge_pages += delta; 384 resv_huge_pages += delta;
370 ret = 0; 385 ret = 0;
371free: 386free:
387 /* Free the needed pages to the hugetlb pool */
372 list_for_each_entry_safe(page, tmp, &surplus_list, lru) { 388 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
389 if ((--needed) < 0)
390 break;
373 list_del(&page->lru); 391 list_del(&page->lru);
374 if ((--needed) >= 0) 392 enqueue_huge_page(page);
375 enqueue_huge_page(page); 393 }
376 else { 394
395 /* Free unnecessary surplus pages to the buddy allocator */
396 if (!list_empty(&surplus_list)) {
397 spin_unlock(&hugetlb_lock);
398 list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
399 list_del(&page->lru);
377 /* 400 /*
378 * The page has a reference count of zero already, so 401 * The page has a reference count of zero already, so
379 * call free_huge_page directly instead of using 402 * call free_huge_page directly instead of using
@@ -381,10 +404,9 @@ free:
381 * unlocked which is safe because free_huge_page takes 404 * unlocked which is safe because free_huge_page takes
382 * hugetlb_lock before deciding how to free the page. 405 * hugetlb_lock before deciding how to free the page.
383 */ 406 */
384 spin_unlock(&hugetlb_lock);
385 free_huge_page(page); 407 free_huge_page(page);
386 spin_lock(&hugetlb_lock);
387 } 408 }
409 spin_lock(&hugetlb_lock);
388 } 410 }
389 411
390 return ret; 412 return ret;
@@ -718,7 +740,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
718 entry = 740 entry =
719 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); 741 pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot)));
720 } else { 742 } else {
721 entry = pte_wrprotect(mk_pte(page, vma->vm_page_prot)); 743 entry = huge_pte_wrprotect(mk_pte(page, vma->vm_page_prot));
722 } 744 }
723 entry = pte_mkyoung(entry); 745 entry = pte_mkyoung(entry);
724 entry = pte_mkhuge(entry); 746 entry = pte_mkhuge(entry);
@@ -731,8 +753,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
731{ 753{
732 pte_t entry; 754 pte_t entry;
733 755
734 entry = pte_mkwrite(pte_mkdirty(*ptep)); 756 entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep)));
735 if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { 757 if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) {
736 update_mmu_cache(vma, address, entry); 758 update_mmu_cache(vma, address, entry);
737 } 759 }
738} 760}
@@ -762,10 +784,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
762 784
763 spin_lock(&dst->page_table_lock); 785 spin_lock(&dst->page_table_lock);
764 spin_lock(&src->page_table_lock); 786 spin_lock(&src->page_table_lock);
765 if (!pte_none(*src_pte)) { 787 if (!huge_pte_none(huge_ptep_get(src_pte))) {
766 if (cow) 788 if (cow)
767 ptep_set_wrprotect(src, addr, src_pte); 789 huge_ptep_set_wrprotect(src, addr, src_pte);
768 entry = *src_pte; 790 entry = huge_ptep_get(src_pte);
769 ptepage = pte_page(entry); 791 ptepage = pte_page(entry);
770 get_page(ptepage); 792 get_page(ptepage);
771 set_huge_pte_at(dst, addr, dst_pte, entry); 793 set_huge_pte_at(dst, addr, dst_pte, entry);
@@ -809,7 +831,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
809 continue; 831 continue;
810 832
811 pte = huge_ptep_get_and_clear(mm, address, ptep); 833 pte = huge_ptep_get_and_clear(mm, address, ptep);
812 if (pte_none(pte)) 834 if (huge_pte_none(pte))
813 continue; 835 continue;
814 836
815 page = pte_page(pte); 837 page = pte_page(pte);
@@ -873,8 +895,9 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
873 spin_lock(&mm->page_table_lock); 895 spin_lock(&mm->page_table_lock);
874 896
875 ptep = huge_pte_offset(mm, address & HPAGE_MASK); 897 ptep = huge_pte_offset(mm, address & HPAGE_MASK);
876 if (likely(pte_same(*ptep, pte))) { 898 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
877 /* Break COW */ 899 /* Break COW */
900 huge_ptep_clear_flush(vma, address, ptep);
878 set_huge_pte_at(mm, address, ptep, 901 set_huge_pte_at(mm, address, ptep,
879 make_huge_pte(vma, new_page, 1)); 902 make_huge_pte(vma, new_page, 1));
880 /* Make the old page be freed below */ 903 /* Make the old page be freed below */
@@ -942,7 +965,7 @@ retry:
942 goto backout; 965 goto backout;
943 966
944 ret = 0; 967 ret = 0;
945 if (!pte_none(*ptep)) 968 if (!huge_pte_none(huge_ptep_get(ptep)))
946 goto backout; 969 goto backout;
947 970
948 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) 971 new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
@@ -984,8 +1007,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
984 * the same page in the page cache. 1007 * the same page in the page cache.
985 */ 1008 */
986 mutex_lock(&hugetlb_instantiation_mutex); 1009 mutex_lock(&hugetlb_instantiation_mutex);
987 entry = *ptep; 1010 entry = huge_ptep_get(ptep);
988 if (pte_none(entry)) { 1011 if (huge_pte_none(entry)) {
989 ret = hugetlb_no_page(mm, vma, address, ptep, write_access); 1012 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
990 mutex_unlock(&hugetlb_instantiation_mutex); 1013 mutex_unlock(&hugetlb_instantiation_mutex);
991 return ret; 1014 return ret;
@@ -995,7 +1018,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
995 1018
996 spin_lock(&mm->page_table_lock); 1019 spin_lock(&mm->page_table_lock);
997 /* Check for a racing update before calling hugetlb_cow */ 1020 /* Check for a racing update before calling hugetlb_cow */
998 if (likely(pte_same(entry, *ptep))) 1021 if (likely(pte_same(entry, huge_ptep_get(ptep))))
999 if (write_access && !pte_write(entry)) 1022 if (write_access && !pte_write(entry))
1000 ret = hugetlb_cow(mm, vma, address, ptep, entry); 1023 ret = hugetlb_cow(mm, vma, address, ptep, entry);
1001 spin_unlock(&mm->page_table_lock); 1024 spin_unlock(&mm->page_table_lock);
@@ -1025,7 +1048,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1025 */ 1048 */
1026 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK); 1049 pte = huge_pte_offset(mm, vaddr & HPAGE_MASK);
1027 1050
1028 if (!pte || pte_none(*pte) || (write && !pte_write(*pte))) { 1051 if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
1052 (write && !pte_write(huge_ptep_get(pte)))) {
1029 int ret; 1053 int ret;
1030 1054
1031 spin_unlock(&mm->page_table_lock); 1055 spin_unlock(&mm->page_table_lock);
@@ -1041,7 +1065,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
1041 } 1065 }
1042 1066
1043 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT; 1067 pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
1044 page = pte_page(*pte); 1068 page = pte_page(huge_ptep_get(pte));
1045same_page: 1069same_page:
1046 if (pages) { 1070 if (pages) {
1047 get_page(page); 1071 get_page(page);
@@ -1090,7 +1114,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
1090 continue; 1114 continue;
1091 if (huge_pmd_unshare(mm, &address, ptep)) 1115 if (huge_pmd_unshare(mm, &address, ptep))
1092 continue; 1116 continue;
1093 if (!pte_none(*ptep)) { 1117 if (!huge_pte_none(huge_ptep_get(ptep))) {
1094 pte = huge_ptep_get_and_clear(mm, address, ptep); 1118 pte = huge_ptep_get_and_clear(mm, address, ptep);
1095 pte = pte_mkhuge(pte_modify(pte, newprot)); 1119 pte = pte_mkhuge(pte_modify(pte, newprot));
1096 set_huge_pte_at(mm, address, ptep, pte); 1120 set_huge_pte_at(mm, address, ptep, pte);
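The first hunk above switches dequeue_huge_page_vma() to the zoneref walk: zones are visited in preference order, nodes excluded by the mempolicy nodemask are skipped, and the first node with a free huge page wins. A toy version of that selection loop (zones, node ids, free counts and the bitmask are all invented for the example):

#include <stdio.h>

struct zone { int nid; int free_huge; };

int main(void)
{
    struct zone zonelist[] = { {0, 0}, {1, 0}, {2, 3}, {3, 1} };
    unsigned long nodemask = (1UL << 1) | (1UL << 2);   /* only nodes 1 and 2 allowed */
    int chosen = -1;

    for (unsigned i = 0; i < sizeof(zonelist) / sizeof(zonelist[0]); i++) {
        int nid = zonelist[i].nid;
        if (!(nodemask & (1UL << nid)))     /* cf. for_each_zone_zonelist_nodemask */
            continue;
        if (zonelist[i].free_huge > 0) {    /* cf. !list_empty(&hugepage_freelists[nid]) */
            zonelist[i].free_huge--;
            chosen = nid;
            break;
        }
    }
    printf("dequeued a huge page from node %d\n", chosen);
    return 0;
}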
diff --git a/mm/internal.h b/mm/internal.h
index 789727309f4d..0034e947e4bc 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -34,8 +34,7 @@ static inline void __put_page(struct page *page)
34 atomic_dec(&page->_count); 34 atomic_dec(&page->_count);
35} 35}
36 36
37extern void __init __free_pages_bootmem(struct page *page, 37extern void __free_pages_bootmem(struct page *page, unsigned int order);
38 unsigned int order);
39 38
40/* 39/*
41 * function for dealing with page's order in buddy system. 40 * function for dealing with page's order in buddy system.
diff --git a/mm/madvise.c b/mm/madvise.c
index 93ee375b38e7..23a0ec3e0ea0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -112,7 +112,7 @@ static long madvise_willneed(struct vm_area_struct * vma,
112 if (!file) 112 if (!file)
113 return -EBADF; 113 return -EBADF;
114 114
115 if (file->f_mapping->a_ops->get_xip_page) { 115 if (file->f_mapping->a_ops->get_xip_mem) {
116 /* no bad return value, but ignore advice */ 116 /* no bad return value, but ignore advice */
117 return 0; 117 return 0;
118 } 118 }
diff --git a/mm/memory.c b/mm/memory.c
index 0d14d1e58a5f..bbab1e37055e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -371,57 +371,93 @@ static inline int is_cow_mapping(unsigned int flags)
371} 371}
372 372
373/* 373/*
374 * This function gets the "struct page" associated with a pte. 374 * vm_normal_page -- This function gets the "struct page" associated with a pte.
375 * 375 *
376 * NOTE! Some mappings do not have "struct pages". A raw PFN mapping 376 * "Special" mappings do not wish to be associated with a "struct page" (either
377 * will have each page table entry just pointing to a raw page frame 377 * it doesn't exist, or it exists but they don't want to touch it). In this
378 * number, and as far as the VM layer is concerned, those do not have 378 * case, NULL is returned here. "Normal" mappings do have a struct page.
379 * pages associated with them - even if the PFN might point to memory
380 * that otherwise is perfectly fine and has a "struct page".
381 * 379 *
382 * The way we recognize those mappings is through the rules set up 380 * There are 2 broad cases. Firstly, an architecture may define a pte_special()
383 * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, 381 * pte bit, in which case this function is trivial. Secondly, an architecture
384 * and the vm_pgoff will point to the first PFN mapped: thus every 382 * may not have a spare pte bit, which requires a more complicated scheme,
385 * page that is a raw mapping will always honor the rule 383 * described below.
384 *
385 * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a
386 * special mapping (even if there are underlying and valid "struct pages").
387 * COWed pages of a VM_PFNMAP are always normal.
388 *
389 * The way we recognize COWed pages within VM_PFNMAP mappings is through the
390 * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit
391 * set, and the vm_pgoff will point to the first PFN mapped: thus every special
392 * mapping will always honor the rule
386 * 393 *
387 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) 394 * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
388 * 395 *
389 * and if that isn't true, the page has been COW'ed (in which case it 396 * And for normal mappings this is false.
390 * _does_ have a "struct page" associated with it even if it is in a 397 *
391 * VM_PFNMAP range). 398 * This restricts such mappings to be a linear translation from virtual address
399 * to pfn. To get around this restriction, we allow arbitrary mappings so long
400 * as the vma is not a COW mapping; in that case, we know that all ptes are
401 * special (because none can have been COWed).
402 *
403 *
404 * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP.
405 *
406 * VM_MIXEDMAP mappings can likewise contain memory with or without "struct
407 * page" backing, however the difference is that _all_ pages with a struct
408 * page (that is, those where pfn_valid is true) are refcounted and considered
409 * normal pages by the VM. The disadvantage is that pages are refcounted
410 * (which can be slower and simply not an option for some PFNMAP users). The
411 * advantage is that we don't have to follow the strict linearity rule of
412 * PFNMAP mappings in order to support COWable mappings.
413 *
392 */ 414 */
393struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) 415#ifdef __HAVE_ARCH_PTE_SPECIAL
416# define HAVE_PTE_SPECIAL 1
417#else
418# define HAVE_PTE_SPECIAL 0
419#endif
420struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
421 pte_t pte)
394{ 422{
395 unsigned long pfn = pte_pfn(pte); 423 unsigned long pfn;
396 424
397 if (unlikely(vma->vm_flags & VM_PFNMAP)) { 425 if (HAVE_PTE_SPECIAL) {
398 unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; 426 if (likely(!pte_special(pte))) {
399 if (pfn == vma->vm_pgoff + off) 427 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
400 return NULL; 428 return pte_page(pte);
401 if (!is_cow_mapping(vma->vm_flags)) 429 }
402 return NULL; 430 VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
431 return NULL;
403 } 432 }
404 433
405#ifdef CONFIG_DEBUG_VM 434 /* !HAVE_PTE_SPECIAL case follows: */
406 /* 435
407 * Add some anal sanity checks for now. Eventually, 436 pfn = pte_pfn(pte);
408 * we should just do "return pfn_to_page(pfn)", but 437
409 * in the meantime we check that we get a valid pfn, 438 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
410 * and that the resulting page looks ok. 439 if (vma->vm_flags & VM_MIXEDMAP) {
411 */ 440 if (!pfn_valid(pfn))
412 if (unlikely(!pfn_valid(pfn))) { 441 return NULL;
413 print_bad_pte(vma, pte, addr); 442 goto out;
414 return NULL; 443 } else {
444 unsigned long off;
445 off = (addr - vma->vm_start) >> PAGE_SHIFT;
446 if (pfn == vma->vm_pgoff + off)
447 return NULL;
448 if (!is_cow_mapping(vma->vm_flags))
449 return NULL;
450 }
415 } 451 }
416#endif 452
453 VM_BUG_ON(!pfn_valid(pfn));
417 454
418 /* 455 /*
419 * NOTE! We still have PageReserved() pages in the page 456 * NOTE! We still have PageReserved() pages in the page tables.
420 * tables.
421 * 457 *
422 * The PAGE_ZERO() pages and various VDSO mappings can 458 * eg. VDSO mappings can cause them to exist.
423 * cause them to exist.
424 */ 459 */
460out:
425 return pfn_to_page(pfn); 461 return pfn_to_page(pfn);
426} 462}
427 463
@@ -1057,8 +1093,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1057 if (pages) 1093 if (pages)
1058 foll_flags |= FOLL_GET; 1094 foll_flags |= FOLL_GET;
1059 if (!write && !(vma->vm_flags & VM_LOCKED) && 1095 if (!write && !(vma->vm_flags & VM_LOCKED) &&
1060 (!vma->vm_ops || (!vma->vm_ops->nopage && 1096 (!vma->vm_ops || !vma->vm_ops->fault))
1061 !vma->vm_ops->fault)))
1062 foll_flags |= FOLL_ANON; 1097 foll_flags |= FOLL_ANON;
1063 1098
1064 do { 1099 do {
@@ -1141,8 +1176,10 @@ pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
1141 * old drivers should use this, and they needed to mark their 1176 * old drivers should use this, and they needed to mark their
1142 * pages reserved for the old functions anyway. 1177 * pages reserved for the old functions anyway.
1143 */ 1178 */
1144static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot) 1179static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1180 struct page *page, pgprot_t prot)
1145{ 1181{
1182 struct mm_struct *mm = vma->vm_mm;
1146 int retval; 1183 int retval;
1147 pte_t *pte; 1184 pte_t *pte;
1148 spinlock_t *ptl; 1185 spinlock_t *ptl;
@@ -1202,40 +1239,26 @@ out:
1202 * 1239 *
1203 * The page does not need to be reserved. 1240 * The page does not need to be reserved.
1204 */ 1241 */
1205int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) 1242int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
1243 struct page *page)
1206{ 1244{
1207 if (addr < vma->vm_start || addr >= vma->vm_end) 1245 if (addr < vma->vm_start || addr >= vma->vm_end)
1208 return -EFAULT; 1246 return -EFAULT;
1209 if (!page_count(page)) 1247 if (!page_count(page))
1210 return -EINVAL; 1248 return -EINVAL;
1211 vma->vm_flags |= VM_INSERTPAGE; 1249 vma->vm_flags |= VM_INSERTPAGE;
1212 return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot); 1250 return insert_page(vma, addr, page, vma->vm_page_prot);
1213} 1251}
1214EXPORT_SYMBOL(vm_insert_page); 1252EXPORT_SYMBOL(vm_insert_page);
1215 1253
1216/** 1254static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1217 * vm_insert_pfn - insert single pfn into user vma 1255 unsigned long pfn, pgprot_t prot)
1218 * @vma: user vma to map to
1219 * @addr: target user address of this page
1220 * @pfn: source kernel pfn
1221 *
1222 * Similar to vm_inert_page, this allows drivers to insert individual pages
1223 * they've allocated into a user vma. Same comments apply.
1224 *
1225 * This function should only be called from a vm_ops->fault handler, and
1226 * in that case the handler should return NULL.
1227 */
1228int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1229 unsigned long pfn)
1230{ 1256{
1231 struct mm_struct *mm = vma->vm_mm; 1257 struct mm_struct *mm = vma->vm_mm;
1232 int retval; 1258 int retval;
1233 pte_t *pte, entry; 1259 pte_t *pte, entry;
1234 spinlock_t *ptl; 1260 spinlock_t *ptl;
1235 1261
1236 BUG_ON(!(vma->vm_flags & VM_PFNMAP));
1237 BUG_ON(is_cow_mapping(vma->vm_flags));
1238
1239 retval = -ENOMEM; 1262 retval = -ENOMEM;
1240 pte = get_locked_pte(mm, addr, &ptl); 1263 pte = get_locked_pte(mm, addr, &ptl);
1241 if (!pte) 1264 if (!pte)
@@ -1245,19 +1268,74 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1245 goto out_unlock; 1268 goto out_unlock;
1246 1269
1247 /* Ok, finally just insert the thing.. */ 1270 /* Ok, finally just insert the thing.. */
1248 entry = pfn_pte(pfn, vma->vm_page_prot); 1271 entry = pte_mkspecial(pfn_pte(pfn, prot));
1249 set_pte_at(mm, addr, pte, entry); 1272 set_pte_at(mm, addr, pte, entry);
1250 update_mmu_cache(vma, addr, entry); 1273 update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */
1251 1274
1252 retval = 0; 1275 retval = 0;
1253out_unlock: 1276out_unlock:
1254 pte_unmap_unlock(pte, ptl); 1277 pte_unmap_unlock(pte, ptl);
1255
1256out: 1278out:
1257 return retval; 1279 return retval;
1258} 1280}
1281
1282/**
1283 * vm_insert_pfn - insert single pfn into user vma
1284 * @vma: user vma to map to
1285 * @addr: target user address of this page
1286 * @pfn: source kernel pfn
1287 *
1288 * Similar to vm_inert_page, this allows drivers to insert individual pages
1289 * they've allocated into a user vma. Same comments apply.
1290 *
1291 * This function should only be called from a vm_ops->fault handler, and
1292 * in that case the handler should return NULL.
1293 */
1294int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1295 unsigned long pfn)
1296{
1297 /*
1298 * Technically, architectures with pte_special can avoid all these
1299 * restrictions (same for remap_pfn_range). However we would like
1300 * consistency in testing and feature parity among all, so we should
1301 * try to keep these invariants in place for everybody.
1302 */
1303 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
1304 BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) ==
1305 (VM_PFNMAP|VM_MIXEDMAP));
1306 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
1307 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
1308
1309 if (addr < vma->vm_start || addr >= vma->vm_end)
1310 return -EFAULT;
1311 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1312}
1259EXPORT_SYMBOL(vm_insert_pfn); 1313EXPORT_SYMBOL(vm_insert_pfn);
1260 1314
1315int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
1316 unsigned long pfn)
1317{
1318 BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
1319
1320 if (addr < vma->vm_start || addr >= vma->vm_end)
1321 return -EFAULT;
1322
1323 /*
1324 * If we don't have pte special, then we have to use the pfn_valid()
1325 * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must*
1326 * refcount the page if pfn_valid is true (hence insert_page rather
1327 * than insert_pfn).
1328 */
1329 if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
1330 struct page *page;
1331
1332 page = pfn_to_page(pfn);
1333 return insert_page(vma, addr, page, vma->vm_page_prot);
1334 }
1335 return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
1336}
1337EXPORT_SYMBOL(vm_insert_mixed);
1338
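For contrast, a sketch of the VM_MIXEDMAP setup that vm_insert_mixed() expects; again illustrative only, with mixed_vm_ops standing in for an assumed vm_operations_struct whose .fault handler calls vm_insert_mixed() for each offset, whether or not the pfn has a struct page behind it.

static int mixed_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * VM_MIXEDMAP (not VM_PFNMAP) tells vm_normal_page() that this vma
	 * mixes refcounted pages and raw pfns; vm_insert_mixed() picks the
	 * right insertion path per pfn, as the comment above explains.
	 */
	vma->vm_flags |= VM_MIXEDMAP;
	vma->vm_ops = &mixed_vm_ops;	/* assumed ops with a .fault hook */
	return 0;
}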
1261/* 1339/*
1262 * maps a range of physical memory into the requested pages. the old 1340 * maps a range of physical memory into the requested pages. the old
1263 * mappings are removed. any references to nonexistent pages results 1341 * mappings are removed. any references to nonexistent pages results
@@ -1276,7 +1354,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
1276 arch_enter_lazy_mmu_mode(); 1354 arch_enter_lazy_mmu_mode();
1277 do { 1355 do {
1278 BUG_ON(!pte_none(*pte)); 1356 BUG_ON(!pte_none(*pte));
1279 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot)); 1357 set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot)));
1280 pfn++; 1358 pfn++;
1281 } while (pte++, addr += PAGE_SIZE, addr != end); 1359 } while (pte++, addr += PAGE_SIZE, addr != end);
1282 arch_leave_lazy_mmu_mode(); 1360 arch_leave_lazy_mmu_mode();
@@ -2199,20 +2277,9 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2199 2277
2200 BUG_ON(vma->vm_flags & VM_PFNMAP); 2278 BUG_ON(vma->vm_flags & VM_PFNMAP);
2201 2279
2202 if (likely(vma->vm_ops->fault)) { 2280 ret = vma->vm_ops->fault(vma, &vmf);
2203 ret = vma->vm_ops->fault(vma, &vmf); 2281 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
2204 if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) 2282 return ret;
2205 return ret;
2206 } else {
2207 /* Legacy ->nopage path */
2208 ret = 0;
2209 vmf.page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
2210 /* no page was available -- either SIGBUS or OOM */
2211 if (unlikely(vmf.page == NOPAGE_SIGBUS))
2212 return VM_FAULT_SIGBUS;
2213 else if (unlikely(vmf.page == NOPAGE_OOM))
2214 return VM_FAULT_OOM;
2215 }
2216 2283
2217 /* 2284 /*
2218 * For consistency in subsequent calls, make the faulted page always 2285 * For consistency in subsequent calls, make the faulted page always
@@ -2377,10 +2444,13 @@ static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
2377 unsigned long pfn; 2444 unsigned long pfn;
2378 2445
2379 pte_unmap(page_table); 2446 pte_unmap(page_table);
2380 BUG_ON(!(vma->vm_flags & VM_PFNMAP)); 2447 BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)));
2381 BUG_ON(is_cow_mapping(vma->vm_flags)); 2448 BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags));
2382 2449
2383 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK); 2450 pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
2451
2452 BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
2453
2384 if (unlikely(pfn == NOPFN_OOM)) 2454 if (unlikely(pfn == NOPFN_OOM))
2385 return VM_FAULT_OOM; 2455 return VM_FAULT_OOM;
2386 else if (unlikely(pfn == NOPFN_SIGBUS)) 2456 else if (unlikely(pfn == NOPFN_SIGBUS))
@@ -2458,7 +2528,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
2458 if (!pte_present(entry)) { 2528 if (!pte_present(entry)) {
2459 if (pte_none(entry)) { 2529 if (pte_none(entry)) {
2460 if (vma->vm_ops) { 2530 if (vma->vm_ops) {
2461 if (vma->vm_ops->fault || vma->vm_ops->nopage) 2531 if (likely(vma->vm_ops->fault))
2462 return do_linear_fault(mm, vma, address, 2532 return do_linear_fault(mm, vma, address,
2463 pte, pmd, write_access, entry); 2533 pte, pmd, write_access, entry);
2464 if (unlikely(vma->vm_ops->nopfn)) 2534 if (unlikely(vma->vm_ops->nopfn))
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0fb330271271..b17dca7249f8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -29,6 +29,8 @@
29 29
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
32#include "internal.h"
33
32/* add this memory to iomem resource */ 34/* add this memory to iomem resource */
33static struct resource *register_memory_resource(u64 start, u64 size) 35static struct resource *register_memory_resource(u64 start, u64 size)
34{ 36{
@@ -58,8 +60,105 @@ static void release_memory_resource(struct resource *res)
58 return; 60 return;
59} 61}
60 62
61
62#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 63#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
64#ifndef CONFIG_SPARSEMEM_VMEMMAP
65static void get_page_bootmem(unsigned long info, struct page *page, int magic)
66{
67 atomic_set(&page->_mapcount, magic);
68 SetPagePrivate(page);
69 set_page_private(page, info);
70 atomic_inc(&page->_count);
71}
72
73void put_page_bootmem(struct page *page)
74{
75 int magic;
76
77 magic = atomic_read(&page->_mapcount);
78 BUG_ON(magic >= -1);
79
80 if (atomic_dec_return(&page->_count) == 1) {
81 ClearPagePrivate(page);
82 set_page_private(page, 0);
83 reset_page_mapcount(page);
84 __free_pages_bootmem(page, 0);
85 }
86
87}
88
89void register_page_bootmem_info_section(unsigned long start_pfn)
90{
91 unsigned long *usemap, mapsize, section_nr, i;
92 struct mem_section *ms;
93 struct page *page, *memmap;
94
95 if (!pfn_valid(start_pfn))
96 return;
97
98 section_nr = pfn_to_section_nr(start_pfn);
99 ms = __nr_to_section(section_nr);
100
101 /* Get section's memmap address */
102 memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
103
104 /*
105 * Get page for the memmap's phys address
106 * XXX: need more consideration for sparse_vmemmap...
107 */
108 page = virt_to_page(memmap);
109 mapsize = sizeof(struct page) * PAGES_PER_SECTION;
110 mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
111
112 /* remember memmap's page */
113 for (i = 0; i < mapsize; i++, page++)
114 get_page_bootmem(section_nr, page, SECTION_INFO);
115
116 usemap = __nr_to_section(section_nr)->pageblock_flags;
117 page = virt_to_page(usemap);
118
119 mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
120
121 for (i = 0; i < mapsize; i++, page++)
122 get_page_bootmem(section_nr, page, MIX_INFO);
123
124}
125
126void register_page_bootmem_info_node(struct pglist_data *pgdat)
127{
128 unsigned long i, pfn, end_pfn, nr_pages;
129 int node = pgdat->node_id;
130 struct page *page;
131 struct zone *zone;
132
133 nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
134 page = virt_to_page(pgdat);
135
136 for (i = 0; i < nr_pages; i++, page++)
137 get_page_bootmem(node, page, NODE_INFO);
138
139 zone = &pgdat->node_zones[0];
140 for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
141 if (zone->wait_table) {
142 nr_pages = zone->wait_table_hash_nr_entries
143 * sizeof(wait_queue_head_t);
144 nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
145 page = virt_to_page(zone->wait_table);
146
147 for (i = 0; i < nr_pages; i++, page++)
148 get_page_bootmem(node, page, NODE_INFO);
149 }
150 }
151
152 pfn = pgdat->node_start_pfn;
153 end_pfn = pfn + pgdat->node_spanned_pages;
154
155 /* register_section info */
156 for (; pfn < end_pfn; pfn += PAGES_PER_SECTION)
157 register_page_bootmem_info_section(pfn);
158
159}
160#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
161
63static int __add_zone(struct zone *zone, unsigned long phys_start_pfn) 162static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
64{ 163{
65 struct pglist_data *pgdat = zone->zone_pgdat; 164 struct pglist_data *pgdat = zone->zone_pgdat;
@@ -101,6 +200,36 @@ static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
101 return register_new_memory(__pfn_to_section(phys_start_pfn)); 200 return register_new_memory(__pfn_to_section(phys_start_pfn));
102} 201}
103 202
203#ifdef CONFIG_SPARSEMEM_VMEMMAP
204static int __remove_section(struct zone *zone, struct mem_section *ms)
205{
206 /*
207 * XXX: Freeing memmap with vmemmap is not implemented yet.
208 * This should be removed later.
209 */
210 return -EBUSY;
211}
212#else
213static int __remove_section(struct zone *zone, struct mem_section *ms)
214{
215 unsigned long flags;
216 struct pglist_data *pgdat = zone->zone_pgdat;
217 int ret = -EINVAL;
218
219 if (!valid_section(ms))
220 return ret;
221
222 ret = unregister_memory_section(ms);
223 if (ret)
224 return ret;
225
226 pgdat_resize_lock(pgdat, &flags);
227 sparse_remove_one_section(zone, ms);
228 pgdat_resize_unlock(pgdat, &flags);
229 return 0;
230}
231#endif
232
104/* 233/*
105 * Reasonably generic function for adding memory. It is 234 * Reasonably generic function for adding memory. It is
106 * expected that archs that support memory hotplug will 235 * expected that archs that support memory hotplug will
@@ -134,6 +263,42 @@ int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
134} 263}
135EXPORT_SYMBOL_GPL(__add_pages); 264EXPORT_SYMBOL_GPL(__add_pages);
136 265
266/**
267 * __remove_pages() - remove sections of pages from a zone
268 * @zone: zone from which pages need to be removed
269 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
270 * @nr_pages: number of pages to remove (must be multiple of section size)
271 *
272 * Generic helper function to remove section mappings and sysfs entries
273 * for the section of the memory we are removing. Caller needs to make
274 * sure that pages are marked reserved and zones are adjust properly by
275 * calling offline_pages().
276 */
277int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
278 unsigned long nr_pages)
279{
280 unsigned long i, ret = 0;
281 int sections_to_remove;
282
283 /*
284 * We can only remove entire sections
285 */
286 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
287 BUG_ON(nr_pages % PAGES_PER_SECTION);
288
289 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
290
291 sections_to_remove = nr_pages / PAGES_PER_SECTION;
292 for (i = 0; i < sections_to_remove; i++) {
293 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
294 ret = __remove_section(zone, __pfn_to_section(pfn));
295 if (ret)
296 break;
297 }
298 return ret;
299}
300EXPORT_SYMBOL_GPL(__remove_pages);
301
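A rough sketch of the calling convention the kernel-doc above describes, not taken from this patch: an architecture's hot-remove path offlines the range first (offline_pages()), then passes the section-aligned pfn range down.

static int example_remove(u64 start, u64 size)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;	/* multiple of PAGES_PER_SECTION */
	struct zone *zone = page_zone(pfn_to_page(start_pfn));

	/* range is assumed to be offline and marked reserved already */
	return __remove_pages(zone, start_pfn, nr_pages);
}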
137static void grow_zone_span(struct zone *zone, 302static void grow_zone_span(struct zone *zone,
138 unsigned long start_pfn, unsigned long end_pfn) 303 unsigned long start_pfn, unsigned long end_pfn)
139{ 304{
@@ -164,6 +329,25 @@ static void grow_pgdat_span(struct pglist_data *pgdat,
164 pgdat->node_start_pfn; 329 pgdat->node_start_pfn;
165} 330}
166 331
332void online_page(struct page *page)
333{
334 totalram_pages++;
335 num_physpages++;
336
337#ifdef CONFIG_HIGHMEM
338 if (PageHighMem(page))
339 totalhigh_pages++;
340#endif
341
342#ifdef CONFIG_FLATMEM
343 max_mapnr = max(page_to_pfn(page), max_mapnr);
344#endif
345
346 ClearPageReserved(page);
347 init_page_count(page);
348 __free_page(page);
349}
350
167static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, 351static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
168 void *arg) 352 void *arg)
169{ 353{
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3c3601121509..a37a5034f63d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -63,7 +63,6 @@
63 grows down? 63 grows down?
64 make bind policy root only? It can trigger oom much faster and the 64 make bind policy root only? It can trigger oom much faster and the
65 kernel is not always grateful with that. 65 kernel is not always grateful with that.
66 could replace all the switch()es with a mempolicy_ops structure.
67*/ 66*/
68 67
69#include <linux/mempolicy.h> 68#include <linux/mempolicy.h>
@@ -89,6 +88,7 @@
89#include <linux/rmap.h> 88#include <linux/rmap.h>
90#include <linux/security.h> 89#include <linux/security.h>
91#include <linux/syscalls.h> 90#include <linux/syscalls.h>
91#include <linux/ctype.h>
92 92
93#include <asm/tlbflush.h> 93#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 94#include <asm/uaccess.h>
@@ -105,142 +105,264 @@ static struct kmem_cache *sn_cache;
105 policied. */ 105 policied. */
106enum zone_type policy_zone = 0; 106enum zone_type policy_zone = 0;
107 107
108/*
109 * run-time system-wide default policy => local allocation
110 */
108struct mempolicy default_policy = { 111struct mempolicy default_policy = {
109 .refcnt = ATOMIC_INIT(1), /* never free it */ 112 .refcnt = ATOMIC_INIT(1), /* never free it */
110 .policy = MPOL_DEFAULT, 113 .mode = MPOL_PREFERRED,
114 .flags = MPOL_F_LOCAL,
111}; 115};
112 116
113static void mpol_rebind_policy(struct mempolicy *pol, 117static const struct mempolicy_operations {
114 const nodemask_t *newmask); 118 int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
119 void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
120} mpol_ops[MPOL_MAX];
115 121
116/* Do sanity checking on a policy */ 122/* Check that the nodemask contains at least one populated zone */
117static int mpol_check_policy(int mode, nodemask_t *nodes) 123static int is_valid_nodemask(const nodemask_t *nodemask)
118{ 124{
119 int was_empty, is_empty; 125 int nd, k;
120 126
121 if (!nodes) 127 /* Check that there is something useful in this mask */
122 return 0; 128 k = policy_zone;
123 129
124 /* 130 for_each_node_mask(nd, *nodemask) {
125 * "Contextualize" the in-coming nodemast for cpusets: 131 struct zone *z;
126 * Remember whether in-coming nodemask was empty, If not,
127 * restrict the nodes to the allowed nodes in the cpuset.
128 * This is guaranteed to be a subset of nodes with memory.
129 */
130 cpuset_update_task_memory_state();
131 is_empty = was_empty = nodes_empty(*nodes);
132 if (!was_empty) {
133 nodes_and(*nodes, *nodes, cpuset_current_mems_allowed);
134 is_empty = nodes_empty(*nodes); /* after "contextualization" */
135 }
136 132
137 switch (mode) { 133 for (k = 0; k <= policy_zone; k++) {
138 case MPOL_DEFAULT: 134 z = &NODE_DATA(nd)->node_zones[k];
139 /* 135 if (z->present_pages > 0)
140 * require caller to specify an empty nodemask 136 return 1;
141 * before "contextualization" 137 }
142 */
143 if (!was_empty)
144 return -EINVAL;
145 break;
146 case MPOL_BIND:
147 case MPOL_INTERLEAVE:
148 /*
149 * require at least 1 valid node after "contextualization"
150 */
151 if (is_empty)
152 return -EINVAL;
153 break;
154 case MPOL_PREFERRED:
155 /*
156 * Did caller specify invalid nodes?
157 * Don't silently accept this as "local allocation".
158 */
159 if (!was_empty && is_empty)
160 return -EINVAL;
161 break;
162 } 138 }
139
163 return 0; 140 return 0;
164} 141}
165 142
166/* Generate a custom zonelist for the BIND policy. */ 143static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
167static struct zonelist *bind_zonelist(nodemask_t *nodes)
168{ 144{
169 struct zonelist *zl; 145 return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
170 int num, max, nd; 146}
171 enum zone_type k;
172 147
173 max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); 148static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
174 max++; /* space for zlcache_ptr (see mmzone.h) */ 149 const nodemask_t *rel)
175 zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); 150{
176 if (!zl) 151 nodemask_t tmp;
177 return ERR_PTR(-ENOMEM); 152 nodes_fold(tmp, *orig, nodes_weight(*rel));
178 zl->zlcache_ptr = NULL; 153 nodes_onto(*ret, tmp, *rel);
179 num = 0; 154}
180 /* First put in the highest zones from all nodes, then all the next 155
181 lower zones etc. Avoid empty zones because the memory allocator 156static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
182 doesn't like them. If you implement node hot removal you 157{
183 have to fix that. */ 158 if (nodes_empty(*nodes))
184 k = MAX_NR_ZONES - 1; 159 return -EINVAL;
185 while (1) { 160 pol->v.nodes = *nodes;
186 for_each_node_mask(nd, *nodes) { 161 return 0;
187 struct zone *z = &NODE_DATA(nd)->node_zones[k]; 162}
188 if (z->present_pages > 0) 163
189 zl->zones[num++] = z; 164static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
190 } 165{
191 if (k == 0) 166 if (!nodes)
192 break; 167 pol->flags |= MPOL_F_LOCAL; /* local allocation */
193 k--; 168 else if (nodes_empty(*nodes))
194 } 169 return -EINVAL; /* no allowed nodes */
195 if (num == 0) { 170 else
196 kfree(zl); 171 pol->v.preferred_node = first_node(*nodes);
197 return ERR_PTR(-EINVAL); 172 return 0;
198 } 173}
199 zl->zones[num] = NULL; 174
200 return zl; 175static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
176{
177 if (!is_valid_nodemask(nodes))
178 return -EINVAL;
179 pol->v.nodes = *nodes;
180 return 0;
201} 181}
202 182
203/* Create a new policy */ 183/* Create a new policy */
204static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) 184static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
185 nodemask_t *nodes)
205{ 186{
206 struct mempolicy *policy; 187 struct mempolicy *policy;
188 nodemask_t cpuset_context_nmask;
189 int ret;
207 190
208 pr_debug("setting mode %d nodes[0] %lx\n", 191 pr_debug("setting mode %d flags %d nodes[0] %lx\n",
209 mode, nodes ? nodes_addr(*nodes)[0] : -1); 192 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
210 193
211 if (mode == MPOL_DEFAULT) 194 if (mode == MPOL_DEFAULT) {
212 return NULL; 195 if (nodes && !nodes_empty(*nodes))
196 return ERR_PTR(-EINVAL);
197 return NULL; /* simply delete any existing policy */
198 }
199 VM_BUG_ON(!nodes);
200
201 /*
202 * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
203 * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
204 * All other modes require a valid pointer to a non-empty nodemask.
205 */
206 if (mode == MPOL_PREFERRED) {
207 if (nodes_empty(*nodes)) {
208 if (((flags & MPOL_F_STATIC_NODES) ||
209 (flags & MPOL_F_RELATIVE_NODES)))
210 return ERR_PTR(-EINVAL);
211 nodes = NULL; /* flag local alloc */
212 }
213 } else if (nodes_empty(*nodes))
214 return ERR_PTR(-EINVAL);
213 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); 215 policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
214 if (!policy) 216 if (!policy)
215 return ERR_PTR(-ENOMEM); 217 return ERR_PTR(-ENOMEM);
216 atomic_set(&policy->refcnt, 1); 218 atomic_set(&policy->refcnt, 1);
217 switch (mode) { 219 policy->mode = mode;
218 case MPOL_INTERLEAVE: 220 policy->flags = flags;
219 policy->v.nodes = *nodes; 221
220 if (nodes_weight(policy->v.nodes) == 0) { 222 if (nodes) {
221 kmem_cache_free(policy_cache, policy); 223 /*
222 return ERR_PTR(-EINVAL); 224 * cpuset related setup doesn't apply to local allocation
223 } 225 */
224 break; 226 cpuset_update_task_memory_state();
225 case MPOL_PREFERRED: 227 if (flags & MPOL_F_RELATIVE_NODES)
226 policy->v.preferred_node = first_node(*nodes); 228 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
227 if (policy->v.preferred_node >= MAX_NUMNODES) 229 &cpuset_current_mems_allowed);
228 policy->v.preferred_node = -1; 230 else
229 break; 231 nodes_and(cpuset_context_nmask, *nodes,
230 case MPOL_BIND: 232 cpuset_current_mems_allowed);
231 policy->v.zonelist = bind_zonelist(nodes); 233 if (mpol_store_user_nodemask(policy))
232 if (IS_ERR(policy->v.zonelist)) { 234 policy->w.user_nodemask = *nodes;
233 void *error_code = policy->v.zonelist; 235 else
234 kmem_cache_free(policy_cache, policy); 236 policy->w.cpuset_mems_allowed =
235 return error_code; 237 cpuset_mems_allowed(current);
236 } 238 }
237 break; 239
240 ret = mpol_ops[mode].create(policy,
241 nodes ? &cpuset_context_nmask : NULL);
242 if (ret < 0) {
243 kmem_cache_free(policy_cache, policy);
244 return ERR_PTR(ret);
238 } 245 }
239 policy->policy = mode;
240 policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
241 return policy; 246 return policy;
242} 247}
243 248
249/* Slow path of a mpol destructor. */
250void __mpol_put(struct mempolicy *p)
251{
252 if (!atomic_dec_and_test(&p->refcnt))
253 return;
254 kmem_cache_free(policy_cache, p);
255}
256
257static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
258{
259}
260
261static void mpol_rebind_nodemask(struct mempolicy *pol,
262 const nodemask_t *nodes)
263{
264 nodemask_t tmp;
265
266 if (pol->flags & MPOL_F_STATIC_NODES)
267 nodes_and(tmp, pol->w.user_nodemask, *nodes);
268 else if (pol->flags & MPOL_F_RELATIVE_NODES)
269 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
270 else {
271 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
272 *nodes);
273 pol->w.cpuset_mems_allowed = *nodes;
274 }
275
276 pol->v.nodes = tmp;
277 if (!node_isset(current->il_next, tmp)) {
278 current->il_next = next_node(current->il_next, tmp);
279 if (current->il_next >= MAX_NUMNODES)
280 current->il_next = first_node(tmp);
281 if (current->il_next >= MAX_NUMNODES)
282 current->il_next = numa_node_id();
283 }
284}
285
286static void mpol_rebind_preferred(struct mempolicy *pol,
287 const nodemask_t *nodes)
288{
289 nodemask_t tmp;
290
291 if (pol->flags & MPOL_F_STATIC_NODES) {
292 int node = first_node(pol->w.user_nodemask);
293
294 if (node_isset(node, *nodes)) {
295 pol->v.preferred_node = node;
296 pol->flags &= ~MPOL_F_LOCAL;
297 } else
298 pol->flags |= MPOL_F_LOCAL;
299 } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
300 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
301 pol->v.preferred_node = first_node(tmp);
302 } else if (!(pol->flags & MPOL_F_LOCAL)) {
303 pol->v.preferred_node = node_remap(pol->v.preferred_node,
304 pol->w.cpuset_mems_allowed,
305 *nodes);
306 pol->w.cpuset_mems_allowed = *nodes;
307 }
308}
309
310/* Migrate a policy to a different set of nodes */
311static void mpol_rebind_policy(struct mempolicy *pol,
312 const nodemask_t *newmask)
313{
314 if (!pol)
315 return;
316 if (!mpol_store_user_nodemask(pol) &&
317 nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
318 return;
319 mpol_ops[pol->mode].rebind(pol, newmask);
320}
321
322/*
323 * Wrapper for mpol_rebind_policy() that just requires task
324 * pointer, and updates task mempolicy.
325 */
326
327void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
328{
329 mpol_rebind_policy(tsk->mempolicy, new);
330}
331
332/*
333 * Rebind each vma in mm to new nodemask.
334 *
335 * Call holding a reference to mm. Takes mm->mmap_sem during call.
336 */
337
338void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
339{
340 struct vm_area_struct *vma;
341
342 down_write(&mm->mmap_sem);
343 for (vma = mm->mmap; vma; vma = vma->vm_next)
344 mpol_rebind_policy(vma->vm_policy, new);
345 up_write(&mm->mmap_sem);
346}
347
348static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
349 [MPOL_DEFAULT] = {
350 .rebind = mpol_rebind_default,
351 },
352 [MPOL_INTERLEAVE] = {
353 .create = mpol_new_interleave,
354 .rebind = mpol_rebind_nodemask,
355 },
356 [MPOL_PREFERRED] = {
357 .create = mpol_new_preferred,
358 .rebind = mpol_rebind_preferred,
359 },
360 [MPOL_BIND] = {
361 .create = mpol_new_bind,
362 .rebind = mpol_rebind_nodemask,
363 },
364};
365
244static void gather_stats(struct page *, void *, int pte_dirty); 366static void gather_stats(struct page *, void *, int pte_dirty);
245static void migrate_page_add(struct page *page, struct list_head *pagelist, 367static void migrate_page_add(struct page *page, struct list_head *pagelist,
246 unsigned long flags); 368 unsigned long flags);
@@ -421,7 +543,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
421 if (!err) { 543 if (!err) {
422 mpol_get(new); 544 mpol_get(new);
423 vma->vm_policy = new; 545 vma->vm_policy = new;
424 mpol_free(old); 546 mpol_put(old);
425 } 547 }
426 return err; 548 return err;
427} 549}
@@ -479,46 +601,55 @@ static void mpol_set_task_struct_flag(void)
479} 601}
480 602
481/* Set the process memory policy */ 603/* Set the process memory policy */
482static long do_set_mempolicy(int mode, nodemask_t *nodes) 604static long do_set_mempolicy(unsigned short mode, unsigned short flags,
605 nodemask_t *nodes)
483{ 606{
484 struct mempolicy *new; 607 struct mempolicy *new;
608 struct mm_struct *mm = current->mm;
485 609
486 if (mpol_check_policy(mode, nodes)) 610 new = mpol_new(mode, flags, nodes);
487 return -EINVAL;
488 new = mpol_new(mode, nodes);
489 if (IS_ERR(new)) 611 if (IS_ERR(new))
490 return PTR_ERR(new); 612 return PTR_ERR(new);
491 mpol_free(current->mempolicy); 613
614 /*
615 * prevent changing our mempolicy while show_numa_maps()
616 * is using it.
617 * Note: do_set_mempolicy() can be called at init time
618 * with no 'mm'.
619 */
620 if (mm)
621 down_write(&mm->mmap_sem);
622 mpol_put(current->mempolicy);
492 current->mempolicy = new; 623 current->mempolicy = new;
493 mpol_set_task_struct_flag(); 624 mpol_set_task_struct_flag();
494 if (new && new->policy == MPOL_INTERLEAVE) 625 if (new && new->mode == MPOL_INTERLEAVE &&
626 nodes_weight(new->v.nodes))
495 current->il_next = first_node(new->v.nodes); 627 current->il_next = first_node(new->v.nodes);
628 if (mm)
629 up_write(&mm->mmap_sem);
630
496 return 0; 631 return 0;
497} 632}
498 633
499/* Fill a zone bitmap for a policy */ 634/*
500static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) 635 * Return nodemask for policy for get_mempolicy() query
636 */
637static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
501{ 638{
502 int i;
503
504 nodes_clear(*nodes); 639 nodes_clear(*nodes);
505 switch (p->policy) { 640 if (p == &default_policy)
641 return;
642
643 switch (p->mode) {
506 case MPOL_BIND: 644 case MPOL_BIND:
507 for (i = 0; p->v.zonelist->zones[i]; i++) 645 /* Fall through */
508 node_set(zone_to_nid(p->v.zonelist->zones[i]),
509 *nodes);
510 break;
511 case MPOL_DEFAULT:
512 break;
513 case MPOL_INTERLEAVE: 646 case MPOL_INTERLEAVE:
514 *nodes = p->v.nodes; 647 *nodes = p->v.nodes;
515 break; 648 break;
516 case MPOL_PREFERRED: 649 case MPOL_PREFERRED:
517 /* or use current node instead of memory_map? */ 650 if (!(p->flags & MPOL_F_LOCAL))
518 if (p->v.preferred_node < 0)
519 *nodes = node_states[N_HIGH_MEMORY];
520 else
521 node_set(p->v.preferred_node, *nodes); 651 node_set(p->v.preferred_node, *nodes);
652 /* else return empty node mask for local allocation */
522 break; 653 break;
523 default: 654 default:
524 BUG(); 655 BUG();
@@ -561,6 +692,11 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
561 } 692 }
562 693
563 if (flags & MPOL_F_ADDR) { 694 if (flags & MPOL_F_ADDR) {
695 /*
696 * Do NOT fall back to task policy if the
697 * vma/shared policy at addr is NULL. We
698 * want to return MPOL_DEFAULT in this case.
699 */
564 down_read(&mm->mmap_sem); 700 down_read(&mm->mmap_sem);
565 vma = find_vma_intersection(mm, addr, addr+1); 701 vma = find_vma_intersection(mm, addr, addr+1);
566 if (!vma) { 702 if (!vma) {
@@ -575,7 +711,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
575 return -EINVAL; 711 return -EINVAL;
576 712
577 if (!pol) 713 if (!pol)
578 pol = &default_policy; 714 pol = &default_policy; /* indicates default behavior */
579 715
580 if (flags & MPOL_F_NODE) { 716 if (flags & MPOL_F_NODE) {
581 if (flags & MPOL_F_ADDR) { 717 if (flags & MPOL_F_ADDR) {
@@ -584,14 +720,17 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
584 goto out; 720 goto out;
585 *policy = err; 721 *policy = err;
586 } else if (pol == current->mempolicy && 722 } else if (pol == current->mempolicy &&
587 pol->policy == MPOL_INTERLEAVE) { 723 pol->mode == MPOL_INTERLEAVE) {
588 *policy = current->il_next; 724 *policy = current->il_next;
589 } else { 725 } else {
590 err = -EINVAL; 726 err = -EINVAL;
591 goto out; 727 goto out;
592 } 728 }
593 } else 729 } else {
594 *policy = pol->policy; 730 *policy = pol == &default_policy ? MPOL_DEFAULT :
731 pol->mode;
732 *policy |= pol->flags;
733 }
595 734
596 if (vma) { 735 if (vma) {
597 up_read(&current->mm->mmap_sem); 736 up_read(&current->mm->mmap_sem);
@@ -600,9 +739,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
600 739
601 err = 0; 740 err = 0;
602 if (nmask) 741 if (nmask)
603 get_zonemask(pol, nmask); 742 get_policy_nodemask(pol, nmask);
604 743
605 out: 744 out:
745 mpol_cond_put(pol);
606 if (vma) 746 if (vma)
607 up_read(&current->mm->mmap_sem); 747 up_read(&current->mm->mmap_sem);
608 return err; 748 return err;
@@ -664,7 +804,7 @@ int do_migrate_pages(struct mm_struct *mm,
664 int err = 0; 804 int err = 0;
665 nodemask_t tmp; 805 nodemask_t tmp;
666 806
667 down_read(&mm->mmap_sem); 807 down_read(&mm->mmap_sem);
668 808
669 err = migrate_vmas(mm, from_nodes, to_nodes, flags); 809 err = migrate_vmas(mm, from_nodes, to_nodes, flags);
670 if (err) 810 if (err)
@@ -781,8 +921,8 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
781#endif 921#endif
782 922
783static long do_mbind(unsigned long start, unsigned long len, 923static long do_mbind(unsigned long start, unsigned long len,
784 unsigned long mode, nodemask_t *nmask, 924 unsigned short mode, unsigned short mode_flags,
785 unsigned long flags) 925 nodemask_t *nmask, unsigned long flags)
786{ 926{
787 struct vm_area_struct *vma; 927 struct vm_area_struct *vma;
788 struct mm_struct *mm = current->mm; 928 struct mm_struct *mm = current->mm;
@@ -791,9 +931,8 @@ static long do_mbind(unsigned long start, unsigned long len,
791 int err; 931 int err;
792 LIST_HEAD(pagelist); 932 LIST_HEAD(pagelist);
793 933
794 if ((flags & ~(unsigned long)(MPOL_MF_STRICT | 934 if (flags & ~(unsigned long)(MPOL_MF_STRICT |
795 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) 935 MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
796 || mode > MPOL_MAX)
797 return -EINVAL; 936 return -EINVAL;
798 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) 937 if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
799 return -EPERM; 938 return -EPERM;
@@ -812,10 +951,7 @@ static long do_mbind(unsigned long start, unsigned long len,
812 if (end == start) 951 if (end == start)
813 return 0; 952 return 0;
814 953
815 if (mpol_check_policy(mode, nmask)) 954 new = mpol_new(mode, mode_flags, nmask);
816 return -EINVAL;
817
818 new = mpol_new(mode, nmask);
819 if (IS_ERR(new)) 955 if (IS_ERR(new))
820 return PTR_ERR(new); 956 return PTR_ERR(new);
821 957
@@ -826,8 +962,9 @@ static long do_mbind(unsigned long start, unsigned long len,
826 if (!new) 962 if (!new)
827 flags |= MPOL_MF_DISCONTIG_OK; 963 flags |= MPOL_MF_DISCONTIG_OK;
828 964
829 pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, 965 pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
830 mode, nmask ? nodes_addr(*nmask)[0] : -1); 966 start, start + len, mode, mode_flags,
967 nmask ? nodes_addr(*nmask)[0] : -1);
831 968
832 down_write(&mm->mmap_sem); 969 down_write(&mm->mmap_sem);
833 vma = check_range(mm, start, end, nmask, 970 vma = check_range(mm, start, end, nmask,
@@ -848,7 +985,7 @@ static long do_mbind(unsigned long start, unsigned long len,
848 } 985 }
849 986
850 up_write(&mm->mmap_sem); 987 up_write(&mm->mmap_sem);
851 mpol_free(new); 988 mpol_put(new);
852 return err; 989 return err;
853} 990}
854 991
@@ -926,11 +1063,19 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
926{ 1063{
927 nodemask_t nodes; 1064 nodemask_t nodes;
928 int err; 1065 int err;
1066 unsigned short mode_flags;
929 1067
1068 mode_flags = mode & MPOL_MODE_FLAGS;
1069 mode &= ~MPOL_MODE_FLAGS;
1070 if (mode >= MPOL_MAX)
1071 return -EINVAL;
1072 if ((mode_flags & MPOL_F_STATIC_NODES) &&
1073 (mode_flags & MPOL_F_RELATIVE_NODES))
1074 return -EINVAL;
930 err = get_nodes(&nodes, nmask, maxnode); 1075 err = get_nodes(&nodes, nmask, maxnode);
931 if (err) 1076 if (err)
932 return err; 1077 return err;
933 return do_mbind(start, len, mode, &nodes, flags); 1078 return do_mbind(start, len, mode, mode_flags, &nodes, flags);
934} 1079}
935 1080
936/* Set the process memory policy */ 1081/* Set the process memory policy */
@@ -939,13 +1084,18 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
939{ 1084{
940 int err; 1085 int err;
941 nodemask_t nodes; 1086 nodemask_t nodes;
1087 unsigned short flags;
942 1088
943 if (mode < 0 || mode > MPOL_MAX) 1089 flags = mode & MPOL_MODE_FLAGS;
1090 mode &= ~MPOL_MODE_FLAGS;
1091 if ((unsigned int)mode >= MPOL_MAX)
1092 return -EINVAL;
1093 if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
944 return -EINVAL; 1094 return -EINVAL;
945 err = get_nodes(&nodes, nmask, maxnode); 1095 err = get_nodes(&nodes, nmask, maxnode);
946 if (err) 1096 if (err)
947 return err; 1097 return err;
948 return do_set_mempolicy(mode, &nodes); 1098 return do_set_mempolicy(mode, flags, &nodes);
949} 1099}
950 1100
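From userspace the new encoding is just an OR of one optional flag into the mode argument. A hedged illustration, assuming the MPOL_F_* constants are visible to the application (for example through an updated <numaif.h>):

#include <numaif.h>	/* assumed to provide set_mempolicy() and the new MPOL_F_* flags */

/* interleave over nodes 0-1, keeping the mask fixed across cpuset migrations */
static int set_static_interleave(void)
{
	unsigned long nodemask = (1UL << 0) | (1UL << 1);

	return set_mempolicy(MPOL_INTERLEAVE | MPOL_F_STATIC_NODES,
			     &nodemask, sizeof(nodemask) * 8);
}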
951asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, 1101asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
@@ -1131,59 +1281,75 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1131 * 1281 *
1132 * Returns effective policy for a VMA at specified address. 1282 * Returns effective policy for a VMA at specified address.
1133 * Falls back to @task or system default policy, as necessary. 1283 * Falls back to @task or system default policy, as necessary.
1134 * Returned policy has extra reference count if shared, vma, 1284 * Current or other task's task mempolicy and non-shared vma policies
1135 * or some other task's policy [show_numa_maps() can pass 1285 * are protected by the task's mmap_sem, which must be held for read by
1136 * @task != current]. It is the caller's responsibility to 1286 * the caller.
1137 * free the reference in these cases. 1287 * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1288 * count--added by the get_policy() vm_op, as appropriate--to protect against
1289 * freeing by another task. It is the caller's responsibility to free the
1290 * extra reference for shared policies.
1138 */ 1291 */
1139static struct mempolicy * get_vma_policy(struct task_struct *task, 1292static struct mempolicy *get_vma_policy(struct task_struct *task,
1140 struct vm_area_struct *vma, unsigned long addr) 1293 struct vm_area_struct *vma, unsigned long addr)
1141{ 1294{
1142 struct mempolicy *pol = task->mempolicy; 1295 struct mempolicy *pol = task->mempolicy;
1143 int shared_pol = 0;
1144 1296
1145 if (vma) { 1297 if (vma) {
1146 if (vma->vm_ops && vma->vm_ops->get_policy) { 1298 if (vma->vm_ops && vma->vm_ops->get_policy) {
1147 pol = vma->vm_ops->get_policy(vma, addr); 1299 struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1148 shared_pol = 1; /* if pol non-NULL, add ref below */ 1300 addr);
1149 } else if (vma->vm_policy && 1301 if (vpol)
1150 vma->vm_policy->policy != MPOL_DEFAULT) 1302 pol = vpol;
1303 } else if (vma->vm_policy)
1151 pol = vma->vm_policy; 1304 pol = vma->vm_policy;
1152 } 1305 }
1153 if (!pol) 1306 if (!pol)
1154 pol = &default_policy; 1307 pol = &default_policy;
1155 else if (!shared_pol && pol != current->mempolicy)
1156 mpol_get(pol); /* vma or other task's policy */
1157 return pol; 1308 return pol;
1158} 1309}
1159 1310
1160/* Return a zonelist representing a mempolicy */ 1311/*
1161static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy) 1312 * Return a nodemask representing a mempolicy for filtering nodes for
1313 * page allocation
1314 */
1315static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1162{ 1316{
1163 int nd; 1317 /* Lower zones don't get a nodemask applied for MPOL_BIND */
1318 if (unlikely(policy->mode == MPOL_BIND) &&
1319 gfp_zone(gfp) >= policy_zone &&
1320 cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1321 return &policy->v.nodes;
1164 1322
1165 switch (policy->policy) { 1323 return NULL;
1324}
1325
1326/* Return a zonelist indicated by gfp for node representing a mempolicy */
1327static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1328{
1329 int nd = numa_node_id();
1330
1331 switch (policy->mode) {
1166 case MPOL_PREFERRED: 1332 case MPOL_PREFERRED:
1167 nd = policy->v.preferred_node; 1333 if (!(policy->flags & MPOL_F_LOCAL))
1168 if (nd < 0) 1334 nd = policy->v.preferred_node;
1169 nd = numa_node_id();
1170 break; 1335 break;
1171 case MPOL_BIND: 1336 case MPOL_BIND:
1172 /* Lower zones don't get a policy applied */ 1337 /*
1173 /* Careful: current->mems_allowed might have moved */ 1338 * Normally, MPOL_BIND allocations are node-local within the
1174 if (gfp_zone(gfp) >= policy_zone) 1339 * allowed nodemask. However, if __GFP_THISNODE is set and the
1175 if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist)) 1340 * current node is part of the mask, we use the zonelist for
1176 return policy->v.zonelist; 1341 * the first node in the mask instead.
1177 /*FALL THROUGH*/ 1342 */
1343 if (unlikely(gfp & __GFP_THISNODE) &&
1344 unlikely(!node_isset(nd, policy->v.nodes)))
1345 nd = first_node(policy->v.nodes);
1346 break;
1178 case MPOL_INTERLEAVE: /* should not happen */ 1347 case MPOL_INTERLEAVE: /* should not happen */
1179 case MPOL_DEFAULT:
1180 nd = numa_node_id();
1181 break; 1348 break;
1182 default: 1349 default:
1183 nd = 0;
1184 BUG(); 1350 BUG();
1185 } 1351 }
1186 return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp); 1352 return node_zonelist(nd, gfp);
1187} 1353}
1188 1354
1189/* Do dynamic interleaving for a process */ 1355/* Do dynamic interleaving for a process */
@@ -1196,36 +1362,51 @@ static unsigned interleave_nodes(struct mempolicy *policy)
1196 next = next_node(nid, policy->v.nodes); 1362 next = next_node(nid, policy->v.nodes);
1197 if (next >= MAX_NUMNODES) 1363 if (next >= MAX_NUMNODES)
1198 next = first_node(policy->v.nodes); 1364 next = first_node(policy->v.nodes);
1199 me->il_next = next; 1365 if (next < MAX_NUMNODES)
1366 me->il_next = next;
1200 return nid; 1367 return nid;
1201} 1368}
1202 1369
1203/* 1370/*
1204 * Depending on the memory policy provide a node from which to allocate the 1371 * Depending on the memory policy provide a node from which to allocate the
1205 * next slab entry. 1372 * next slab entry.
1373 * @policy must be protected against freeing by the caller. If @policy is
1374 * the current task's mempolicy, this protection is implicit, as only the
1375 * task can change its policy. The system default policy requires no
1376 * such protection.
1206 */ 1377 */
1207unsigned slab_node(struct mempolicy *policy) 1378unsigned slab_node(struct mempolicy *policy)
1208{ 1379{
1209 int pol = policy ? policy->policy : MPOL_DEFAULT; 1380 if (!policy || policy->flags & MPOL_F_LOCAL)
1381 return numa_node_id();
1382
1383 switch (policy->mode) {
1384 case MPOL_PREFERRED:
1385 /*
1386 * handled MPOL_F_LOCAL above
1387 */
1388 return policy->v.preferred_node;
1210 1389
1211 switch (pol) {
1212 case MPOL_INTERLEAVE: 1390 case MPOL_INTERLEAVE:
1213 return interleave_nodes(policy); 1391 return interleave_nodes(policy);
1214 1392
1215 case MPOL_BIND: 1393 case MPOL_BIND: {
1216 /* 1394 /*
1217 * Follow bind policy behavior and start allocation at the 1395 * Follow bind policy behavior and start allocation at the
1218 * first node. 1396 * first node.
1219 */ 1397 */
1220 return zone_to_nid(policy->v.zonelist->zones[0]); 1398 struct zonelist *zonelist;
1221 1399 struct zone *zone;
1222 case MPOL_PREFERRED: 1400 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1223 if (policy->v.preferred_node >= 0) 1401 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1224 return policy->v.preferred_node; 1402 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1225 /* Fall through */ 1403 &policy->v.nodes,
1404 &zone);
1405 return zone->node;
1406 }
1226 1407
1227 default: 1408 default:
1228 return numa_node_id(); 1409 BUG();
1229 } 1410 }
1230} 1411}
1231 1412
@@ -1234,10 +1415,13 @@ static unsigned offset_il_node(struct mempolicy *pol,
1234 struct vm_area_struct *vma, unsigned long off) 1415 struct vm_area_struct *vma, unsigned long off)
1235{ 1416{
1236 unsigned nnodes = nodes_weight(pol->v.nodes); 1417 unsigned nnodes = nodes_weight(pol->v.nodes);
1237 unsigned target = (unsigned)off % nnodes; 1418 unsigned target;
1238 int c; 1419 int c;
1239 int nid = -1; 1420 int nid = -1;
1240 1421
1422 if (!nnodes)
1423 return numa_node_id();
1424 target = (unsigned int)off % nnodes;
1241 c = 0; 1425 c = 0;
1242 do { 1426 do {
1243 nid = next_node(nid, pol->v.nodes); 1427 nid = next_node(nid, pol->v.nodes);
@@ -1274,40 +1458,30 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
1274 * @vma = virtual memory area whose policy is sought 1458 * @vma = virtual memory area whose policy is sought
1275 * @addr = address in @vma for shared policy lookup and interleave policy 1459 * @addr = address in @vma for shared policy lookup and interleave policy
1276 * @gfp_flags = for requested zone 1460 * @gfp_flags = for requested zone
1277 * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy 1461 * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1462 * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1278 * 1463 *
1279 * Returns a zonelist suitable for a huge page allocation. 1464 * Returns a zonelist suitable for a huge page allocation and a pointer
1280 * If the effective policy is 'BIND, returns pointer to policy's zonelist. 1465 * to the struct mempolicy for conditional unref after allocation.
1281 * If it is also a policy for which get_vma_policy() returns an extra 1466 * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1282 * reference, we must hold that reference until after allocation. 1467 * @nodemask for filtering the zonelist.
1283 * In that case, return policy via @mpol so hugetlb allocation can drop
1284 * the reference. For non-'BIND referenced policies, we can/do drop the
1285 * reference here, so the caller doesn't need to know about the special case
1286 * for default and current task policy.
1287 */ 1468 */
1288struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, 1469struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1289 gfp_t gfp_flags, struct mempolicy **mpol) 1470 gfp_t gfp_flags, struct mempolicy **mpol,
1471 nodemask_t **nodemask)
1290{ 1472{
1291 struct mempolicy *pol = get_vma_policy(current, vma, addr);
1292 struct zonelist *zl; 1473 struct zonelist *zl;
1293 1474
1294 *mpol = NULL; /* probably no unref needed */ 1475 *mpol = get_vma_policy(current, vma, addr);
1295 if (pol->policy == MPOL_INTERLEAVE) { 1476 *nodemask = NULL; /* assume !MPOL_BIND */
1296 unsigned nid;
1297
1298 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1299 if (unlikely(pol != &default_policy &&
1300 pol != current->mempolicy))
1301 __mpol_free(pol); /* finished with pol */
1302 return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
1303 }
1304 1477
1305 zl = zonelist_policy(GFP_HIGHUSER, pol); 1478 if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1306 if (unlikely(pol != &default_policy && pol != current->mempolicy)) { 1479 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1307 if (pol->policy != MPOL_BIND) 1480 HPAGE_SHIFT), gfp_flags);
1308 __mpol_free(pol); /* finished with pol */ 1481 } else {
1309 else 1482 zl = policy_zonelist(gfp_flags, *mpol);
1310 *mpol = pol; /* unref needed after allocation */ 1483 if ((*mpol)->mode == MPOL_BIND)
1484 *nodemask = &(*mpol)->v.nodes;
1311 } 1485 }
1312 return zl; 1486 return zl;
1313} 1487}
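A condensed sketch of a caller following the convention described in the comment above. The real consumer is mm/hugetlb.c, which differs in detail (it dequeues from the hugetlb pool rather than calling the page allocator), so treat this purely as an illustration of the lookup/unref pattern.

static struct page *example_huge_alloc(struct vm_area_struct *vma,
				       unsigned long addr, gfp_t gfp)
{
	struct mempolicy *mpol;
	nodemask_t *nodemask;	/* non-NULL only for MPOL_BIND */
	struct zonelist *zl;
	struct page *page;

	zl = huge_zonelist(vma, addr, gfp, &mpol, &nodemask);
	page = __alloc_pages_nodemask(gfp, HUGETLB_PAGE_ORDER, zl, nodemask);
	mpol_cond_put(mpol);	/* drops the extra ref only for shared policies */
	return page;
}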
@@ -1321,9 +1495,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1321 struct zonelist *zl; 1495 struct zonelist *zl;
1322 struct page *page; 1496 struct page *page;
1323 1497
1324 zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); 1498 zl = node_zonelist(nid, gfp);
1325 page = __alloc_pages(gfp, order, zl); 1499 page = __alloc_pages(gfp, order, zl);
1326 if (page && page_zone(page) == zl->zones[0]) 1500 if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1327 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); 1501 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1328 return page; 1502 return page;
1329} 1503}
@@ -1358,28 +1532,27 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1358 1532
1359 cpuset_update_task_memory_state(); 1533 cpuset_update_task_memory_state();
1360 1534
1361 if (unlikely(pol->policy == MPOL_INTERLEAVE)) { 1535 if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1362 unsigned nid; 1536 unsigned nid;
1363 1537
1364 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); 1538 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1365 if (unlikely(pol != &default_policy && 1539 mpol_cond_put(pol);
1366 pol != current->mempolicy))
1367 __mpol_free(pol); /* finished with pol */
1368 return alloc_page_interleave(gfp, 0, nid); 1540 return alloc_page_interleave(gfp, 0, nid);
1369 } 1541 }
1370 zl = zonelist_policy(gfp, pol); 1542 zl = policy_zonelist(gfp, pol);
1371 if (pol != &default_policy && pol != current->mempolicy) { 1543 if (unlikely(mpol_needs_cond_ref(pol))) {
1372 /* 1544 /*
1373 * slow path: ref counted policy -- shared or vma 1545 * slow path: ref counted shared policy
1374 */ 1546 */
1375 struct page *page = __alloc_pages(gfp, 0, zl); 1547 struct page *page = __alloc_pages_nodemask(gfp, 0,
1376 __mpol_free(pol); 1548 zl, policy_nodemask(gfp, pol));
1549 __mpol_put(pol);
1377 return page; 1550 return page;
1378 } 1551 }
1379 /* 1552 /*
1380 * fast path: default or task policy 1553 * fast path: default or task policy
1381 */ 1554 */
1382 return __alloc_pages(gfp, 0, zl); 1555 return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1383} 1556}
1384 1557
1385/** 1558/**
@@ -1409,22 +1582,28 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1409 cpuset_update_task_memory_state(); 1582 cpuset_update_task_memory_state();
1410 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) 1583 if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1411 pol = &default_policy; 1584 pol = &default_policy;
1412 if (pol->policy == MPOL_INTERLEAVE) 1585
1586 /*
1587 * No reference counting needed for current->mempolicy
1588 * nor system default_policy
1589 */
1590 if (pol->mode == MPOL_INTERLEAVE)
1413 return alloc_page_interleave(gfp, order, interleave_nodes(pol)); 1591 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1414 return __alloc_pages(gfp, order, zonelist_policy(gfp, pol)); 1592 return __alloc_pages_nodemask(gfp, order,
1593 policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1415} 1594}
1416EXPORT_SYMBOL(alloc_pages_current); 1595EXPORT_SYMBOL(alloc_pages_current);
1417 1596
1418/* 1597/*
1419 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it 1598 * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1420 * rebinds the mempolicy it's copying by calling mpol_rebind_policy() 1599 * rebinds the mempolicy it's copying by calling mpol_rebind_policy()
1421 * with the mems_allowed returned by cpuset_mems_allowed(). This 1600 * with the mems_allowed returned by cpuset_mems_allowed(). This
1422 * keeps mempolicies cpuset relative after its cpuset moves. See 1601 * keeps mempolicies cpuset relative after its cpuset moves. See
1423 * further kernel/cpuset.c update_nodemask(). 1602 * further kernel/cpuset.c update_nodemask().
1424 */ 1603 */
1425 1604
1426/* Slow path of a mempolicy copy */ 1605/* Slow path of a mempolicy duplicate */
1427struct mempolicy *__mpol_copy(struct mempolicy *old) 1606struct mempolicy *__mpol_dup(struct mempolicy *old)
1428{ 1607{
1429 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); 1608 struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1430 1609
@@ -1436,55 +1615,64 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
1436 } 1615 }
1437 *new = *old; 1616 *new = *old;
1438 atomic_set(&new->refcnt, 1); 1617 atomic_set(&new->refcnt, 1);
1439 if (new->policy == MPOL_BIND) {
1440 int sz = ksize(old->v.zonelist);
1441 new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
1442 if (!new->v.zonelist) {
1443 kmem_cache_free(policy_cache, new);
1444 return ERR_PTR(-ENOMEM);
1445 }
1446 }
1447 return new; 1618 return new;
1448} 1619}
1449 1620
1621/*
1622 * If *frompol needs [has] an extra ref, copy *frompol to *tompol,
1623 * eliminate the MPOL_F_* flags that require conditional ref and
1624 * [NOTE!!!] drop the extra ref. Not safe to reference *frompol directly
1625 * after return. Use the returned value.
1626 *
1627 * Allows use of a mempolicy for, e.g., multiple allocations with a single
1628 * policy lookup, even if the policy needs/has extra ref on lookup.
1629 * shmem_readahead needs this.
1630 */
1631struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1632 struct mempolicy *frompol)
1633{
1634 if (!mpol_needs_cond_ref(frompol))
1635 return frompol;
1636
1637 *tompol = *frompol;
1638 tompol->flags &= ~MPOL_F_SHARED; /* copy doesn't need unref */
1639 __mpol_put(frompol);
1640 return tompol;
1641}
1642
1643static int mpol_match_intent(const struct mempolicy *a,
1644 const struct mempolicy *b)
1645{
1646 if (a->flags != b->flags)
1647 return 0;
1648 if (!mpol_store_user_nodemask(a))
1649 return 1;
1650 return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1651}
1652
1450/* Slow path of a mempolicy comparison */ 1653/* Slow path of a mempolicy comparison */
1451int __mpol_equal(struct mempolicy *a, struct mempolicy *b) 1654int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1452{ 1655{
1453 if (!a || !b) 1656 if (!a || !b)
1454 return 0; 1657 return 0;
1455 if (a->policy != b->policy) 1658 if (a->mode != b->mode)
1456 return 0; 1659 return 0;
1457 switch (a->policy) { 1660 if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1458 case MPOL_DEFAULT: 1661 return 0;
1459 return 1; 1662 switch (a->mode) {
1663 case MPOL_BIND:
1664 /* Fall through */
1460 case MPOL_INTERLEAVE: 1665 case MPOL_INTERLEAVE:
1461 return nodes_equal(a->v.nodes, b->v.nodes); 1666 return nodes_equal(a->v.nodes, b->v.nodes);
1462 case MPOL_PREFERRED: 1667 case MPOL_PREFERRED:
1463 return a->v.preferred_node == b->v.preferred_node; 1668 return a->v.preferred_node == b->v.preferred_node &&
1464 case MPOL_BIND: { 1669 a->flags == b->flags;
1465 int i;
1466 for (i = 0; a->v.zonelist->zones[i]; i++)
1467 if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1468 return 0;
1469 return b->v.zonelist->zones[i] == NULL;
1470 }
1471 default: 1670 default:
1472 BUG(); 1671 BUG();
1473 return 0; 1672 return 0;
1474 } 1673 }
1475} 1674}
1476 1675
1477/* Slow path of a mpol destructor. */
1478void __mpol_free(struct mempolicy *p)
1479{
1480 if (!atomic_dec_and_test(&p->refcnt))
1481 return;
1482 if (p->policy == MPOL_BIND)
1483 kfree(p->v.zonelist);
1484 p->policy = MPOL_DEFAULT;
1485 kmem_cache_free(policy_cache, p);
1486}
1487
1488/* 1676/*
1489 * Shared memory backing store policy support. 1677 * Shared memory backing store policy support.
1490 * 1678 *
@@ -1547,7 +1735,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1547 rb_link_node(&new->nd, parent, p); 1735 rb_link_node(&new->nd, parent, p);
1548 rb_insert_color(&new->nd, &sp->root); 1736 rb_insert_color(&new->nd, &sp->root);
1549 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, 1737 pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1550 new->policy ? new->policy->policy : 0); 1738 new->policy ? new->policy->mode : 0);
1551} 1739}
1552 1740
1553/* Find shared policy intersecting idx */ 1741/* Find shared policy intersecting idx */
@@ -1573,7 +1761,7 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1573{ 1761{
1574 pr_debug("deleting %lx-l%lx\n", n->start, n->end); 1762 pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1575 rb_erase(&n->nd, &sp->root); 1763 rb_erase(&n->nd, &sp->root);
1576 mpol_free(n->policy); 1764 mpol_put(n->policy);
1577 kmem_cache_free(sn_cache, n); 1765 kmem_cache_free(sn_cache, n);
1578} 1766}
1579 1767
@@ -1587,6 +1775,7 @@ static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1587 n->start = start; 1775 n->start = start;
1588 n->end = end; 1776 n->end = end;
1589 mpol_get(pol); 1777 mpol_get(pol);
1778 pol->flags |= MPOL_F_SHARED; /* for unref */
1590 n->policy = pol; 1779 n->policy = pol;
1591 return n; 1780 return n;
1592} 1781}
@@ -1633,33 +1822,41 @@ restart:
1633 sp_insert(sp, new); 1822 sp_insert(sp, new);
1634 spin_unlock(&sp->lock); 1823 spin_unlock(&sp->lock);
1635 if (new2) { 1824 if (new2) {
1636 mpol_free(new2->policy); 1825 mpol_put(new2->policy);
1637 kmem_cache_free(sn_cache, new2); 1826 kmem_cache_free(sn_cache, new2);
1638 } 1827 }
1639 return 0; 1828 return 0;
1640} 1829}
1641 1830
1642void mpol_shared_policy_init(struct shared_policy *info, int policy, 1831/**
1643 nodemask_t *policy_nodes) 1832 * mpol_shared_policy_init - initialize shared policy for inode
1644{ 1833 * @sp: pointer to inode shared policy
1645 info->root = RB_ROOT; 1834 * @mpol: struct mempolicy to install
1646 spin_lock_init(&info->lock); 1835 *
1647 1836 * Install non-NULL @mpol in inode's shared policy rb-tree.
1648 if (policy != MPOL_DEFAULT) { 1837 * On entry, the current task has a reference on a non-NULL @mpol.
1649 struct mempolicy *newpol; 1838 * This must be released on exit.
1650 1839 */
1651 /* Falls back to MPOL_DEFAULT on any error */ 1840void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1652 newpol = mpol_new(policy, policy_nodes); 1841{
1653 if (!IS_ERR(newpol)) { 1842 sp->root = RB_ROOT; /* empty tree == default mempolicy */
1654 /* Create pseudo-vma that contains just the policy */ 1843 spin_lock_init(&sp->lock);
1655 struct vm_area_struct pvma; 1844
1656 1845 if (mpol) {
1657 memset(&pvma, 0, sizeof(struct vm_area_struct)); 1846 struct vm_area_struct pvma;
1658 /* Policy covers entire file */ 1847 struct mempolicy *new;
1659 pvma.vm_end = TASK_SIZE; 1848
1660 mpol_set_shared_policy(info, &pvma, newpol); 1849 /* contextualize the tmpfs mount point mempolicy */
1661 mpol_free(newpol); 1850 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1662 } 1851 mpol_put(mpol); /* drop our ref on sb mpol */
1852 if (IS_ERR(new))
1853 return; /* no valid nodemask intersection */
1854
1855 /* Create pseudo-vma that contains just the policy */
1856 memset(&pvma, 0, sizeof(struct vm_area_struct));
1857 pvma.vm_end = TASK_SIZE; /* policy covers entire file */
1858 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
1859 mpol_put(new); /* drop initial ref */
1663 } 1860 }
1664} 1861}
1665 1862
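On the caller side, roughly what a tmpfs inode-creation path would do; this is a hedged sketch and the exact shmem hook is not shown in this patch. mpol_shared_policy_init() consumes the reference it is handed, so the caller takes one first.

static void example_inode_policy_init(struct shared_policy *sp,
				      struct mempolicy *mnt_mpol)
{
	if (mnt_mpol)
		mpol_get(mnt_mpol);	/* init consumes this reference */
	mpol_shared_policy_init(sp, mnt_mpol);	/* NULL means default policy */
}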
@@ -1670,9 +1867,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
1670 struct sp_node *new = NULL; 1867 struct sp_node *new = NULL;
1671 unsigned long sz = vma_pages(vma); 1868 unsigned long sz = vma_pages(vma);
1672 1869
1673 pr_debug("set_shared_policy %lx sz %lu %d %lx\n", 1870 pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1674 vma->vm_pgoff, 1871 vma->vm_pgoff,
1675 sz, npol? npol->policy : -1, 1872 sz, npol ? npol->mode : -1,
1873 npol ? npol->flags : -1,
1676 npol ? nodes_addr(npol->v.nodes)[0] : -1); 1874 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1677 1875
1678 if (npol) { 1876 if (npol) {
@@ -1700,7 +1898,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
1700 n = rb_entry(next, struct sp_node, nd); 1898 n = rb_entry(next, struct sp_node, nd);
1701 next = rb_next(&n->nd); 1899 next = rb_next(&n->nd);
1702 rb_erase(&n->nd, &p->root); 1900 rb_erase(&n->nd, &p->root);
1703 mpol_free(n->policy); 1901 mpol_put(n->policy);
1704 kmem_cache_free(sn_cache, n); 1902 kmem_cache_free(sn_cache, n);
1705 } 1903 }
1706 spin_unlock(&p->lock); 1904 spin_unlock(&p->lock);
@@ -1745,120 +1943,177 @@ void __init numa_policy_init(void)
1745 if (unlikely(nodes_empty(interleave_nodes))) 1943 if (unlikely(nodes_empty(interleave_nodes)))
1746 node_set(prefer, interleave_nodes); 1944 node_set(prefer, interleave_nodes);
1747 1945
1748 if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes)) 1946 if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1749 printk("numa_policy_init: interleaving failed\n"); 1947 printk("numa_policy_init: interleaving failed\n");
1750} 1948}
1751 1949
1752/* Reset policy of current process to default */ 1950/* Reset policy of current process to default */
1753void numa_default_policy(void) 1951void numa_default_policy(void)
1754{ 1952{
1755 do_set_mempolicy(MPOL_DEFAULT, NULL); 1953 do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1756} 1954}
1757 1955
1758/* Migrate a policy to a different set of nodes */ 1956/*
1759static void mpol_rebind_policy(struct mempolicy *pol, 1957 * Parse and format mempolicy from/to strings
1760 const nodemask_t *newmask) 1958 */
1761{
1762 nodemask_t *mpolmask;
1763 nodemask_t tmp;
1764 1959
1765 if (!pol) 1960/*
1766 return; 1961 * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
1767 mpolmask = &pol->cpuset_mems_allowed; 1962 * Used only for mpol_parse_str() and mpol_to_str()
1768 if (nodes_equal(*mpolmask, *newmask)) 1963 */
1769 return; 1964#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
1965static const char * const policy_types[] =
1966 { "default", "prefer", "bind", "interleave", "local" };
1770 1967
1771 switch (pol->policy) {
1772 case MPOL_DEFAULT:
1773 break;
1774 case MPOL_INTERLEAVE:
1775 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1776 pol->v.nodes = tmp;
1777 *mpolmask = *newmask;
1778 current->il_next = node_remap(current->il_next,
1779 *mpolmask, *newmask);
1780 break;
1781 case MPOL_PREFERRED:
1782 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1783 *mpolmask, *newmask);
1784 *mpolmask = *newmask;
1785 break;
1786 case MPOL_BIND: {
1787 nodemask_t nodes;
1788 struct zone **z;
1789 struct zonelist *zonelist;
1790 1968
1969#ifdef CONFIG_TMPFS
1970/**
1971 * mpol_parse_str - parse string to mempolicy
1972 * @str: string containing mempolicy to parse
1973 * @mpol: pointer to struct mempolicy pointer, returned on success.
1974 * @no_context: flag whether to "contextualize" the mempolicy
1975 *
1976 * Format of input:
1977 * <mode>[=<flags>][:<nodelist>]
1978 *
1979 * if @no_context is true, save the input nodemask in w.user_nodemask in
1980 * the returned mempolicy. This will be used to "clone" the mempolicy in
1981 * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
1982 * mount option. Note that if 'static' or 'relative' mode flags were
1983 * specified, the input nodemask will already have been saved. Saving
1984 * it again is redundant, but safe.
1985 *
1986 * On success, returns 0, else 1
1987 */
1988int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
1989{
1990 struct mempolicy *new = NULL;
1991 unsigned short uninitialized_var(mode);
1992 unsigned short uninitialized_var(mode_flags);
1993 nodemask_t nodes;
1994 char *nodelist = strchr(str, ':');
1995 char *flags = strchr(str, '=');
1996 int i;
1997 int err = 1;
1998
1999 if (nodelist) {
2000 /* NUL-terminate mode or flags string */
2001 *nodelist++ = '\0';
2002 if (nodelist_parse(nodelist, nodes))
2003 goto out;
2004 if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
2005 goto out;
2006 } else
1791 nodes_clear(nodes); 2007 nodes_clear(nodes);
1792 for (z = pol->v.zonelist->zones; *z; z++)
1793 node_set(zone_to_nid(*z), nodes);
1794 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1795 nodes = tmp;
1796 2008
1797 zonelist = bind_zonelist(&nodes); 2009 if (flags)
2010 *flags++ = '\0'; /* terminate mode string */
1798 2011
1799 /* If no mem, then zonelist is NULL and we keep old zonelist. 2012 for (i = 0; i <= MPOL_LOCAL; i++) {
1800 * If that old zonelist has no remaining mems_allowed nodes, 2013 if (!strcmp(str, policy_types[i])) {
1801 * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. 2014 mode = i;
1802 */ 2015 break;
2016 }
2017 }
2018 if (i > MPOL_LOCAL)
2019 goto out;
1803 2020
1804 if (!IS_ERR(zonelist)) { 2021 switch (mode) {
1805 /* Good - got mem - substitute new zonelist */ 2022 case MPOL_PREFERRED:
1806 kfree(pol->v.zonelist); 2023 /*
1807 pol->v.zonelist = zonelist; 2024 * Insist on a nodelist of one node only
2025 */
2026 if (nodelist) {
2027 char *rest = nodelist;
2028 while (isdigit(*rest))
2029 rest++;
2030 if (!*rest)
2031 err = 0;
1808 } 2032 }
1809 *mpolmask = *newmask;
1810 break; 2033 break;
1811 } 2034 case MPOL_INTERLEAVE:
1812 default: 2035 /*
1813 BUG(); 2036 * Default to online nodes with memory if no nodelist
2037 */
2038 if (!nodelist)
2039 nodes = node_states[N_HIGH_MEMORY];
2040 err = 0;
2041 break;
2042 case MPOL_LOCAL:
2043 /*
2044 * Don't allow a nodelist; mpol_new() checks flags
2045 */
2046 if (nodelist)
2047 goto out;
2048 mode = MPOL_PREFERRED;
1814 break; 2049 break;
1815 }
1816}
1817
1818/*
1819 * Wrapper for mpol_rebind_policy() that just requires task
1820 * pointer, and updates task mempolicy.
1821 */
1822 2050
1823void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) 2051 /*
1824{ 2052 * case MPOL_BIND: mpol_new() enforces non-empty nodemask.
1825 mpol_rebind_policy(tsk->mempolicy, new); 2053 * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
1826} 2054 */
2055 }
1827 2056
1828/* 2057 mode_flags = 0;
1829 * Rebind each vma in mm to new nodemask. 2058 if (flags) {
1830 * 2059 /*
1831 * Call holding a reference to mm. Takes mm->mmap_sem during call. 2060 * Currently, we only support two mutually exclusive
1832 */ 2061 * mode flags.
2062 */
2063 if (!strcmp(flags, "static"))
2064 mode_flags |= MPOL_F_STATIC_NODES;
2065 else if (!strcmp(flags, "relative"))
2066 mode_flags |= MPOL_F_RELATIVE_NODES;
2067 else
2068 err = 1;
2069 }
1833 2070
1834void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) 2071 new = mpol_new(mode, mode_flags, &nodes);
1835{ 2072 if (IS_ERR(new))
1836 struct vm_area_struct *vma; 2073 err = 1;
2074 else if (no_context)
2075 new->w.user_nodemask = nodes; /* save for contextualization */
1837 2076
1838 down_write(&mm->mmap_sem); 2077out:
1839 for (vma = mm->mmap; vma; vma = vma->vm_next) 2078 /* Restore string for error message */
1840 mpol_rebind_policy(vma->vm_policy, new); 2079 if (nodelist)
1841 up_write(&mm->mmap_sem); 2080 *--nodelist = ':';
2081 if (flags)
2082 *--flags = '=';
2083 if (!err)
2084 *mpol = new;
2085 return err;
1842} 2086}
2087#endif /* CONFIG_TMPFS */
1843 2088
1844/* 2089/**
1845 * Display pages allocated per node and memory policy via /proc. 2090 * mpol_to_str - format a mempolicy structure for printing
1846 */ 2091 * @buffer: to contain formatted mempolicy string
1847 2092 * @maxlen: length of @buffer
1848static const char * const policy_types[] = 2093 * @pol: pointer to mempolicy to be formatted
1849 { "default", "prefer", "bind", "interleave" }; 2094 * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
1850 2095 *
1851/*
1852 * Convert a mempolicy into a string. 2096 * Convert a mempolicy into a string.
1853 * Returns the number of characters in buffer (if positive) 2097 * Returns the number of characters in buffer (if positive)
1854 * or an error (negative) 2098 * or an error (negative)
1855 */ 2099 */
1856static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) 2100int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
1857{ 2101{
1858 char *p = buffer; 2102 char *p = buffer;
1859 int l; 2103 int l;
1860 nodemask_t nodes; 2104 nodemask_t nodes;
1861 int mode = pol ? pol->policy : MPOL_DEFAULT; 2105 unsigned short mode;
2106 unsigned short flags = pol ? pol->flags : 0;
2107
2108 /*
2109 * Sanity check: room for longest mode, flag and some nodes
2110 */
2111 VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2112
2113 if (!pol || pol == &default_policy)
2114 mode = MPOL_DEFAULT;
2115 else
2116 mode = pol->mode;
1862 2117
1863 switch (mode) { 2118 switch (mode) {
1864 case MPOL_DEFAULT: 2119 case MPOL_DEFAULT:
@@ -1867,33 +2122,50 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1867 2122
1868 case MPOL_PREFERRED: 2123 case MPOL_PREFERRED:
1869 nodes_clear(nodes); 2124 nodes_clear(nodes);
1870 node_set(pol->v.preferred_node, nodes); 2125 if (flags & MPOL_F_LOCAL)
2126 mode = MPOL_LOCAL; /* pseudo-policy */
2127 else
2128 node_set(pol->v.preferred_node, nodes);
1871 break; 2129 break;
1872 2130
1873 case MPOL_BIND: 2131 case MPOL_BIND:
1874 get_zonemask(pol, &nodes); 2132 /* Fall through */
1875 break;
1876
1877 case MPOL_INTERLEAVE: 2133 case MPOL_INTERLEAVE:
1878 nodes = pol->v.nodes; 2134 if (no_context)
2135 nodes = pol->w.user_nodemask;
2136 else
2137 nodes = pol->v.nodes;
1879 break; 2138 break;
1880 2139
1881 default: 2140 default:
1882 BUG(); 2141 BUG();
1883 return -EFAULT;
1884 } 2142 }
1885 2143
1886 l = strlen(policy_types[mode]); 2144 l = strlen(policy_types[mode]);
1887 if (buffer + maxlen < p + l + 1) 2145 if (buffer + maxlen < p + l + 1)
1888 return -ENOSPC; 2146 return -ENOSPC;
1889 2147
1890 strcpy(p, policy_types[mode]); 2148 strcpy(p, policy_types[mode]);
1891 p += l; 2149 p += l;
1892 2150
1893 if (!nodes_empty(nodes)) { 2151 if (flags & MPOL_MODE_FLAGS) {
1894 if (buffer + maxlen < p + 2) 2152 if (buffer + maxlen < p + 2)
1895 return -ENOSPC; 2153 return -ENOSPC;
1896 *p++ = '='; 2154 *p++ = '=';
2155
2156 /*
2157 * Currently, the only defined flags are mutually exclusive
2158 */
2159 if (flags & MPOL_F_STATIC_NODES)
2160 p += snprintf(p, buffer + maxlen - p, "static");
2161 else if (flags & MPOL_F_RELATIVE_NODES)
2162 p += snprintf(p, buffer + maxlen - p, "relative");
2163 }
2164
2165 if (!nodes_empty(nodes)) {
2166 if (buffer + maxlen < p + 2)
2167 return -ENOSPC;
2168 *p++ = ':';
1897 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); 2169 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1898 } 2170 }
1899 return p - buffer; 2171 return p - buffer;
@@ -1971,6 +2243,9 @@ static inline void check_huge_range(struct vm_area_struct *vma,
1971} 2243}
1972#endif 2244#endif
1973 2245
2246/*
2247 * Display pages allocated per node and memory policy via /proc.
2248 */
1974int show_numa_map(struct seq_file *m, void *v) 2249int show_numa_map(struct seq_file *m, void *v)
1975{ 2250{
1976 struct proc_maps_private *priv = m->private; 2251 struct proc_maps_private *priv = m->private;
@@ -1990,12 +2265,8 @@ int show_numa_map(struct seq_file *m, void *v)
1990 return 0; 2265 return 0;
1991 2266
1992 pol = get_vma_policy(priv->task, vma, vma->vm_start); 2267 pol = get_vma_policy(priv->task, vma, vma->vm_start);
1993 mpol_to_str(buffer, sizeof(buffer), pol); 2268 mpol_to_str(buffer, sizeof(buffer), pol, 0);
1994 /* 2269 mpol_cond_put(pol);
1995 * unref shared or other task's mempolicy
1996 */
1997 if (pol != &default_policy && pol != current->mempolicy)
1998 __mpol_free(pol);
1999 2270
2000 seq_printf(m, "%08lx %s", vma->vm_start, buffer); 2271 seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2001 2272
diff --git a/mm/mincore.c b/mm/mincore.c
index 5efe0ded69b1..5178800bc129 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -33,7 +33,7 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
33 * When tmpfs swaps out a page from a file, any process mapping that 33 * When tmpfs swaps out a page from a file, any process mapping that
34 * file will not get a swp_entry_t in its pte, but rather it is like 34 * file will not get a swp_entry_t in its pte, but rather it is like
35 * any other file mapping (ie. marked !present and faulted in with 35 * any other file mapping (ie. marked !present and faulted in with
36 * tmpfs's .nopage). So swapped out tmpfs mappings are tested here. 36 * tmpfs's .fault). So swapped out tmpfs mappings are tested here.
37 * 37 *
38 * However when tmpfs moves the page from pagecache and into swapcache, 38 * However when tmpfs moves the page from pagecache and into swapcache,
39 * it is still in core, but the find_get_page below won't find it. 39 * it is still in core, but the find_get_page below won't find it.
diff --git a/mm/mmap.c b/mm/mmap.c
index a32d28ce31cd..677d184b0d42 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -232,7 +232,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
232 vma->vm_ops->close(vma); 232 vma->vm_ops->close(vma);
233 if (vma->vm_file) 233 if (vma->vm_file)
234 fput(vma->vm_file); 234 fput(vma->vm_file);
235 mpol_free(vma_policy(vma)); 235 mpol_put(vma_policy(vma));
236 kmem_cache_free(vm_area_cachep, vma); 236 kmem_cache_free(vm_area_cachep, vma);
237 return next; 237 return next;
238} 238}
@@ -626,7 +626,7 @@ again: remove_next = 1 + (end > next->vm_end);
626 if (file) 626 if (file)
627 fput(file); 627 fput(file);
628 mm->map_count--; 628 mm->map_count--;
629 mpol_free(vma_policy(next)); 629 mpol_put(vma_policy(next));
630 kmem_cache_free(vm_area_cachep, next); 630 kmem_cache_free(vm_area_cachep, next);
631 /* 631 /*
632 * In mprotect's case 6 (see comments on vma_merge), 632 * In mprotect's case 6 (see comments on vma_merge),
@@ -1068,7 +1068,6 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1068 mapping_cap_account_dirty(vma->vm_file->f_mapping); 1068 mapping_cap_account_dirty(vma->vm_file->f_mapping);
1069} 1069}
1070 1070
1071
1072unsigned long mmap_region(struct file *file, unsigned long addr, 1071unsigned long mmap_region(struct file *file, unsigned long addr,
1073 unsigned long len, unsigned long flags, 1072 unsigned long len, unsigned long flags,
1074 unsigned int vm_flags, unsigned long pgoff, 1073 unsigned int vm_flags, unsigned long pgoff,
@@ -1181,22 +1180,20 @@ munmap_back:
1181 if (vma_wants_writenotify(vma)) 1180 if (vma_wants_writenotify(vma))
1182 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1181 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1183 1182
1184 if (!file || !vma_merge(mm, prev, addr, vma->vm_end, 1183 if (file && vma_merge(mm, prev, addr, vma->vm_end,
1185 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { 1184 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) {
1186 file = vma->vm_file; 1185 mpol_put(vma_policy(vma));
1187 vma_link(mm, vma, prev, rb_link, rb_parent);
1188 if (correct_wcount)
1189 atomic_inc(&inode->i_writecount);
1190 } else {
1191 if (file) {
1192 if (correct_wcount)
1193 atomic_inc(&inode->i_writecount);
1194 fput(file);
1195 }
1196 mpol_free(vma_policy(vma));
1197 kmem_cache_free(vm_area_cachep, vma); 1186 kmem_cache_free(vm_area_cachep, vma);
1187 fput(file);
1188 } else {
1189 vma_link(mm, vma, prev, rb_link, rb_parent);
1190 file = vma->vm_file;
1198 } 1191 }
1199out: 1192
1193 /* Once vma denies write, undo our temporary denial count */
1194 if (correct_wcount)
1195 atomic_inc(&inode->i_writecount);
1196out:
1200 mm->total_vm += len >> PAGE_SHIFT; 1197 mm->total_vm += len >> PAGE_SHIFT;
1201 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1198 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1202 if (vm_flags & VM_LOCKED) { 1199 if (vm_flags & VM_LOCKED) {
@@ -1813,7 +1810,7 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1813 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); 1810 new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
1814 } 1811 }
1815 1812
1816 pol = mpol_copy(vma_policy(vma)); 1813 pol = mpol_dup(vma_policy(vma));
1817 if (IS_ERR(pol)) { 1814 if (IS_ERR(pol)) {
1818 kmem_cache_free(vm_area_cachep, new); 1815 kmem_cache_free(vm_area_cachep, new);
1819 return PTR_ERR(pol); 1816 return PTR_ERR(pol);
@@ -2129,7 +2126,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2129 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); 2126 new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
2130 if (new_vma) { 2127 if (new_vma) {
2131 *new_vma = *vma; 2128 *new_vma = *vma;
2132 pol = mpol_copy(vma_policy(vma)); 2129 pol = mpol_dup(vma_policy(vma));
2133 if (IS_ERR(pol)) { 2130 if (IS_ERR(pol)) {
2134 kmem_cache_free(vm_area_cachep, new_vma); 2131 kmem_cache_free(vm_area_cachep, new_vma);
2135 return NULL; 2132 return NULL;
diff --git a/mm/mmzone.c b/mm/mmzone.c
index eb5838634f18..486ed595ee6f 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone)
42 return zone; 42 return zone;
43} 43}
44 44
45static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
46{
47#ifdef CONFIG_NUMA
48 return node_isset(zonelist_node_idx(zref), *nodes);
49#else
50 return 1;
51#endif /* CONFIG_NUMA */
52}
53
54/* Returns the next zone at or below highest_zoneidx in a zonelist */
55struct zoneref *next_zones_zonelist(struct zoneref *z,
56 enum zone_type highest_zoneidx,
57 nodemask_t *nodes,
58 struct zone **zone)
59{
60 /*
61 * Find the next suitable zone to use for the allocation.
62 * Only filter based on nodemask if it's set
63 */
64 if (likely(nodes == NULL))
65 while (zonelist_zone_idx(z) > highest_zoneidx)
66 z++;
67 else
68 while (zonelist_zone_idx(z) > highest_zoneidx ||
69 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++;
71
72 *zone = zonelist_zone(z++);
73 return z;
74}
diff --git a/mm/nommu.c b/mm/nommu.c
index 5d8ae086f74e..1d32fe89d57b 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -105,7 +105,11 @@ unsigned int kobjsize(const void *objp)
105{ 105{
106 struct page *page; 106 struct page *page;
107 107
108 if (!objp || !((page = virt_to_page(objp)))) 108 /*
109 * If the object we have should not have ksize performed on it,
110 * return size of 0
111 */
112 if (!objp || (unsigned long)objp >= memory_end || !((page = virt_to_page(objp))))
109 return 0; 113 return 0;
110 114
111 if (PageSlab(page)) 115 if (PageSlab(page))
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index beb592fe9389..8a5467ee6265 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -53,8 +53,7 @@ static DEFINE_SPINLOCK(zone_scan_mutex);
53 * of least surprise ... (be careful when you change it) 53 * of least surprise ... (be careful when you change it)
54 */ 54 */
55 55
56unsigned long badness(struct task_struct *p, unsigned long uptime, 56unsigned long badness(struct task_struct *p, unsigned long uptime)
57 struct mem_cgroup *mem)
58{ 57{
59 unsigned long points, cpu_time, run_time, s; 58 unsigned long points, cpu_time, run_time, s;
60 struct mm_struct *mm; 59 struct mm_struct *mm;
@@ -175,12 +174,14 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
175 gfp_t gfp_mask) 174 gfp_t gfp_mask)
176{ 175{
177#ifdef CONFIG_NUMA 176#ifdef CONFIG_NUMA
178 struct zone **z; 177 struct zone *zone;
178 struct zoneref *z;
179 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
179 nodemask_t nodes = node_states[N_HIGH_MEMORY]; 180 nodemask_t nodes = node_states[N_HIGH_MEMORY];
180 181
181 for (z = zonelist->zones; *z; z++) 182 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
182 if (cpuset_zone_allowed_softwall(*z, gfp_mask)) 183 if (cpuset_zone_allowed_softwall(zone, gfp_mask))
183 node_clear(zone_to_nid(*z), nodes); 184 node_clear(zone_to_nid(zone), nodes);
184 else 185 else
185 return CONSTRAINT_CPUSET; 186 return CONSTRAINT_CPUSET;
186 187
@@ -254,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
254 if (p->oomkilladj == OOM_DISABLE) 255 if (p->oomkilladj == OOM_DISABLE)
255 continue; 256 continue;
256 257
257 points = badness(p, uptime.tv_sec, mem); 258 points = badness(p, uptime.tv_sec);
258 if (points > *ppoints || !chosen) { 259 if (points > *ppoints || !chosen) {
259 chosen = p; 260 chosen = p;
260 *ppoints = points; 261 *ppoints = points;
@@ -460,29 +461,29 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
460 * if a parallel OOM killing is already taking place that includes a zone in 461 * if a parallel OOM killing is already taking place that includes a zone in
461 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. 462 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
462 */ 463 */
463int try_set_zone_oom(struct zonelist *zonelist) 464int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
464{ 465{
465 struct zone **z; 466 struct zoneref *z;
467 struct zone *zone;
466 int ret = 1; 468 int ret = 1;
467 469
468 z = zonelist->zones;
469
470 spin_lock(&zone_scan_mutex); 470 spin_lock(&zone_scan_mutex);
471 do { 471 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
472 if (zone_is_oom_locked(*z)) { 472 if (zone_is_oom_locked(zone)) {
473 ret = 0; 473 ret = 0;
474 goto out; 474 goto out;
475 } 475 }
476 } while (*(++z) != NULL); 476 }
477
478 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
479 /*
480 * Lock each zone in the zonelist under zone_scan_mutex so a
481 * parallel invocation of try_set_zone_oom() doesn't succeed
482 * when it shouldn't.
483 */
484 zone_set_flag(zone, ZONE_OOM_LOCKED);
485 }
477 486
478 /*
479 * Lock each zone in the zonelist under zone_scan_mutex so a parallel
480 * invocation of try_set_zone_oom() doesn't succeed when it shouldn't.
481 */
482 z = zonelist->zones;
483 do {
484 zone_set_flag(*z, ZONE_OOM_LOCKED);
485 } while (*(++z) != NULL);
486out: 487out:
487 spin_unlock(&zone_scan_mutex); 488 spin_unlock(&zone_scan_mutex);
488 return ret; 489 return ret;
@@ -493,16 +494,15 @@ out:
493 * allocation attempts with zonelists containing them may now recall the OOM 494 * allocation attempts with zonelists containing them may now recall the OOM
494 * killer, if necessary. 495 * killer, if necessary.
495 */ 496 */
496void clear_zonelist_oom(struct zonelist *zonelist) 497void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
497{ 498{
498 struct zone **z; 499 struct zoneref *z;
499 500 struct zone *zone;
500 z = zonelist->zones;
501 501
502 spin_lock(&zone_scan_mutex); 502 spin_lock(&zone_scan_mutex);
503 do { 503 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
504 zone_clear_flag(*z, ZONE_OOM_LOCKED); 504 zone_clear_flag(zone, ZONE_OOM_LOCKED);
505 } while (*(++z) != NULL); 505 }
506 spin_unlock(&zone_scan_mutex); 506 spin_unlock(&zone_scan_mutex);
507} 507}
508 508
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 32e796af12a1..d1cf4f05dcda 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -546,7 +546,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
546/* 546/*
547 * permit the bootmem allocator to evade page validation on high-order frees 547 * permit the bootmem allocator to evade page validation on high-order frees
548 */ 548 */
549void __init __free_pages_bootmem(struct page *page, unsigned int order) 549void __free_pages_bootmem(struct page *page, unsigned int order)
550{ 550{
551 if (order == 0) { 551 if (order == 0) {
552 __ClearPageReserved(page); 552 __ClearPageReserved(page);
@@ -632,7 +632,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
632 if (PageReserved(page)) 632 if (PageReserved(page))
633 return 1; 633 return 1;
634 634
635 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_readahead | 635 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
636 1 << PG_referenced | 1 << PG_arch_1 | 636 1 << PG_referenced | 1 << PG_arch_1 |
637 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 637 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk);
638 set_page_private(page, 0); 638 set_page_private(page, 0);
@@ -1050,7 +1050,7 @@ void split_page(struct page *page, unsigned int order)
1050 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1050 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1051 * or two. 1051 * or two.
1052 */ 1052 */
1053static struct page *buffered_rmqueue(struct zonelist *zonelist, 1053static struct page *buffered_rmqueue(struct zone *preferred_zone,
1054 struct zone *zone, int order, gfp_t gfp_flags) 1054 struct zone *zone, int order, gfp_t gfp_flags)
1055{ 1055{
1056 unsigned long flags; 1056 unsigned long flags;
@@ -1102,7 +1102,7 @@ again:
1102 } 1102 }
1103 1103
1104 __count_zone_vm_events(PGALLOC, zone, 1 << order); 1104 __count_zone_vm_events(PGALLOC, zone, 1 << order);
1105 zone_statistics(zonelist, zone); 1105 zone_statistics(preferred_zone, zone);
1106 local_irq_restore(flags); 1106 local_irq_restore(flags);
1107 put_cpu(); 1107 put_cpu();
1108 1108
@@ -1284,7 +1284,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1284 if (!zlc) 1284 if (!zlc)
1285 return NULL; 1285 return NULL;
1286 1286
1287 if (time_after(jiffies, zlc->last_full_zap + HZ)) { 1287 if (time_after(jiffies, zlc->last_full_zap + HZ)) {
1288 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 1288 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
1289 zlc->last_full_zap = jiffies; 1289 zlc->last_full_zap = jiffies;
1290 } 1290 }
@@ -1317,7 +1317,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1317 * We are low on memory in the second scan, and should leave no stone 1317 * We are low on memory in the second scan, and should leave no stone
1318 * unturned looking for a free page. 1318 * unturned looking for a free page.
1319 */ 1319 */
1320static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1320static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1321 nodemask_t *allowednodes) 1321 nodemask_t *allowednodes)
1322{ 1322{
1323 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1323 struct zonelist_cache *zlc; /* cached zonelist speedup info */
@@ -1328,7 +1328,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1328 if (!zlc) 1328 if (!zlc)
1329 return 1; 1329 return 1;
1330 1330
1331 i = z - zonelist->zones; 1331 i = z - zonelist->_zonerefs;
1332 n = zlc->z_to_n[i]; 1332 n = zlc->z_to_n[i];
1333 1333
1334 /* This zone is worth trying if it is allowed but not full */ 1334 /* This zone is worth trying if it is allowed but not full */
@@ -1340,7 +1340,7 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
1340 * zlc->fullzones, so that subsequent attempts to allocate a page 1340 * zlc->fullzones, so that subsequent attempts to allocate a page
1341 * from that zone don't waste time re-examining it. 1341 * from that zone don't waste time re-examining it.
1342 */ 1342 */
1343static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1343static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1344{ 1344{
1345 struct zonelist_cache *zlc; /* cached zonelist speedup info */ 1345 struct zonelist_cache *zlc; /* cached zonelist speedup info */
1346 int i; /* index of *z in zonelist zones */ 1346 int i; /* index of *z in zonelist zones */
@@ -1349,7 +1349,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1349 if (!zlc) 1349 if (!zlc)
1350 return; 1350 return;
1351 1351
1352 i = z - zonelist->zones; 1352 i = z - zonelist->_zonerefs;
1353 1353
1354 set_bit(i, zlc->fullzones); 1354 set_bit(i, zlc->fullzones);
1355} 1355}
@@ -1361,13 +1361,13 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
1361 return NULL; 1361 return NULL;
1362} 1362}
1363 1363
1364static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z, 1364static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z,
1365 nodemask_t *allowednodes) 1365 nodemask_t *allowednodes)
1366{ 1366{
1367 return 1; 1367 return 1;
1368} 1368}
1369 1369
1370static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z) 1370static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1371{ 1371{
1372} 1372}
1373#endif /* CONFIG_NUMA */ 1373#endif /* CONFIG_NUMA */
@@ -1377,42 +1377,31 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
1377 * a page. 1377 * a page.
1378 */ 1378 */
1379static struct page * 1379static struct page *
1380get_page_from_freelist(gfp_t gfp_mask, unsigned int order, 1380get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1381 struct zonelist *zonelist, int alloc_flags) 1381 struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
1382{ 1382{
1383 struct zone **z; 1383 struct zoneref *z;
1384 struct page *page = NULL; 1384 struct page *page = NULL;
1385 int classzone_idx = zone_idx(zonelist->zones[0]); 1385 int classzone_idx;
1386 struct zone *zone; 1386 struct zone *zone, *preferred_zone;
1387 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1387 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1388 int zlc_active = 0; /* set if using zonelist_cache */ 1388 int zlc_active = 0; /* set if using zonelist_cache */
1389 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1389 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1390 enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */ 1390
1391 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1392 &preferred_zone);
1393 classzone_idx = zone_idx(preferred_zone);
1391 1394
1392zonelist_scan: 1395zonelist_scan:
1393 /* 1396 /*
1394 * Scan zonelist, looking for a zone with enough free. 1397 * Scan zonelist, looking for a zone with enough free.
1395 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1398 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1396 */ 1399 */
1397 z = zonelist->zones; 1400 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1398 1401 high_zoneidx, nodemask) {
1399 do {
1400 /*
1401 * In NUMA, this could be a policy zonelist which contains
1402 * zones that may not be allowed by the current gfp_mask.
1403 * Check the zone is allowed by the current flags
1404 */
1405 if (unlikely(alloc_should_filter_zonelist(zonelist))) {
1406 if (highest_zoneidx == -1)
1407 highest_zoneidx = gfp_zone(gfp_mask);
1408 if (zone_idx(*z) > highest_zoneidx)
1409 continue;
1410 }
1411
1412 if (NUMA_BUILD && zlc_active && 1402 if (NUMA_BUILD && zlc_active &&
1413 !zlc_zone_worth_trying(zonelist, z, allowednodes)) 1403 !zlc_zone_worth_trying(zonelist, z, allowednodes))
1414 continue; 1404 continue;
1415 zone = *z;
1416 if ((alloc_flags & ALLOC_CPUSET) && 1405 if ((alloc_flags & ALLOC_CPUSET) &&
1417 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1406 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1418 goto try_next_zone; 1407 goto try_next_zone;
@@ -1433,7 +1422,7 @@ zonelist_scan:
1433 } 1422 }
1434 } 1423 }
1435 1424
1436 page = buffered_rmqueue(zonelist, zone, order, gfp_mask); 1425 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
1437 if (page) 1426 if (page)
1438 break; 1427 break;
1439this_zone_full: 1428this_zone_full:
@@ -1446,7 +1435,7 @@ try_next_zone:
1446 zlc_active = 1; 1435 zlc_active = 1;
1447 did_zlc_setup = 1; 1436 did_zlc_setup = 1;
1448 } 1437 }
1449 } while (*(++z) != NULL); 1438 }
1450 1439
1451 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { 1440 if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
1452 /* Disable zlc cache for second zonelist scan */ 1441 /* Disable zlc cache for second zonelist scan */
@@ -1459,12 +1448,14 @@ try_next_zone:
1459/* 1448/*
1460 * This is the 'heart' of the zoned buddy allocator. 1449 * This is the 'heart' of the zoned buddy allocator.
1461 */ 1450 */
1462struct page * 1451static struct page *
1463__alloc_pages(gfp_t gfp_mask, unsigned int order, 1452__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
1464 struct zonelist *zonelist) 1453 struct zonelist *zonelist, nodemask_t *nodemask)
1465{ 1454{
1466 const gfp_t wait = gfp_mask & __GFP_WAIT; 1455 const gfp_t wait = gfp_mask & __GFP_WAIT;
1467 struct zone **z; 1456 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1457 struct zoneref *z;
1458 struct zone *zone;
1468 struct page *page; 1459 struct page *page;
1469 struct reclaim_state reclaim_state; 1460 struct reclaim_state reclaim_state;
1470 struct task_struct *p = current; 1461 struct task_struct *p = current;
@@ -1478,9 +1469,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
1478 return NULL; 1469 return NULL;
1479 1470
1480restart: 1471restart:
1481 z = zonelist->zones; /* the list of zones suitable for gfp_mask */ 1472 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */
1482 1473
1483 if (unlikely(*z == NULL)) { 1474 if (unlikely(!z->zone)) {
1484 /* 1475 /*
1485 * Happens if we have an empty zonelist as a result of 1476 * Happens if we have an empty zonelist as a result of
1486 * GFP_THISNODE being used on a memoryless node 1477 * GFP_THISNODE being used on a memoryless node
@@ -1488,8 +1479,8 @@ restart:
1488 return NULL; 1479 return NULL;
1489 } 1480 }
1490 1481
1491 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1482 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1492 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1483 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
1493 if (page) 1484 if (page)
1494 goto got_pg; 1485 goto got_pg;
1495 1486
@@ -1504,8 +1495,8 @@ restart:
1504 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1495 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1505 goto nopage; 1496 goto nopage;
1506 1497
1507 for (z = zonelist->zones; *z; z++) 1498 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1508 wakeup_kswapd(*z, order); 1499 wakeup_kswapd(zone, order);
1509 1500
1510 /* 1501 /*
1511 * OK, we're below the kswapd watermark and have kicked background 1502 * OK, we're below the kswapd watermark and have kicked background
@@ -1533,7 +1524,8 @@ restart:
1533 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 1524 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1534 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 1525 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1535 */ 1526 */
1536 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags); 1527 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1528 high_zoneidx, alloc_flags);
1537 if (page) 1529 if (page)
1538 goto got_pg; 1530 goto got_pg;
1539 1531
@@ -1545,8 +1537,8 @@ rebalance:
1545 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1537 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
1546nofail_alloc: 1538nofail_alloc:
1547 /* go through the zonelist yet again, ignoring mins */ 1539 /* go through the zonelist yet again, ignoring mins */
1548 page = get_page_from_freelist(gfp_mask, order, 1540 page = get_page_from_freelist(gfp_mask, nodemask, order,
1549 zonelist, ALLOC_NO_WATERMARKS); 1541 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
1550 if (page) 1542 if (page)
1551 goto got_pg; 1543 goto got_pg;
1552 if (gfp_mask & __GFP_NOFAIL) { 1544 if (gfp_mask & __GFP_NOFAIL) {
@@ -1569,7 +1561,7 @@ nofail_alloc:
1569 reclaim_state.reclaimed_slab = 0; 1561 reclaim_state.reclaimed_slab = 0;
1570 p->reclaim_state = &reclaim_state; 1562 p->reclaim_state = &reclaim_state;
1571 1563
1572 did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask); 1564 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask);
1573 1565
1574 p->reclaim_state = NULL; 1566 p->reclaim_state = NULL;
1575 p->flags &= ~PF_MEMALLOC; 1567 p->flags &= ~PF_MEMALLOC;
@@ -1580,12 +1572,12 @@ nofail_alloc:
1580 drain_all_pages(); 1572 drain_all_pages();
1581 1573
1582 if (likely(did_some_progress)) { 1574 if (likely(did_some_progress)) {
1583 page = get_page_from_freelist(gfp_mask, order, 1575 page = get_page_from_freelist(gfp_mask, nodemask, order,
1584 zonelist, alloc_flags); 1576 zonelist, high_zoneidx, alloc_flags);
1585 if (page) 1577 if (page)
1586 goto got_pg; 1578 goto got_pg;
1587 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1579 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1588 if (!try_set_zone_oom(zonelist)) { 1580 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1589 schedule_timeout_uninterruptible(1); 1581 schedule_timeout_uninterruptible(1);
1590 goto restart; 1582 goto restart;
1591 } 1583 }
@@ -1596,21 +1588,22 @@ nofail_alloc:
1596 * a parallel oom killing, we must fail if we're still 1588 * a parallel oom killing, we must fail if we're still
1597 * under heavy pressure. 1589 * under heavy pressure.
1598 */ 1590 */
1599 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, 1591 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1600 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); 1592 order, zonelist, high_zoneidx,
1593 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1601 if (page) { 1594 if (page) {
1602 clear_zonelist_oom(zonelist); 1595 clear_zonelist_oom(zonelist, gfp_mask);
1603 goto got_pg; 1596 goto got_pg;
1604 } 1597 }
1605 1598
1606 /* The OOM killer will not help higher order allocs so fail */ 1599 /* The OOM killer will not help higher order allocs so fail */
1607 if (order > PAGE_ALLOC_COSTLY_ORDER) { 1600 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1608 clear_zonelist_oom(zonelist); 1601 clear_zonelist_oom(zonelist, gfp_mask);
1609 goto nopage; 1602 goto nopage;
1610 } 1603 }
1611 1604
1612 out_of_memory(zonelist, gfp_mask, order); 1605 out_of_memory(zonelist, gfp_mask, order);
1613 clear_zonelist_oom(zonelist); 1606 clear_zonelist_oom(zonelist, gfp_mask);
1614 goto restart; 1607 goto restart;
1615 } 1608 }
1616 1609
@@ -1646,6 +1639,20 @@ got_pg:
1646 return page; 1639 return page;
1647} 1640}
1648 1641
1642struct page *
1643__alloc_pages(gfp_t gfp_mask, unsigned int order,
1644 struct zonelist *zonelist)
1645{
1646 return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
1647}
1648
1649struct page *
1650__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1651 struct zonelist *zonelist, nodemask_t *nodemask)
1652{
1653 return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
1654}
1655
1649EXPORT_SYMBOL(__alloc_pages); 1656EXPORT_SYMBOL(__alloc_pages);
1650 1657
1651/* 1658/*
@@ -1712,15 +1719,15 @@ EXPORT_SYMBOL(free_pages);
1712 1719
1713static unsigned int nr_free_zone_pages(int offset) 1720static unsigned int nr_free_zone_pages(int offset)
1714{ 1721{
1722 struct zoneref *z;
1723 struct zone *zone;
1724
1715 /* Just pick one node, since fallback list is circular */ 1725 /* Just pick one node, since fallback list is circular */
1716 pg_data_t *pgdat = NODE_DATA(numa_node_id());
1717 unsigned int sum = 0; 1726 unsigned int sum = 0;
1718 1727
1719 struct zonelist *zonelist = pgdat->node_zonelists + offset; 1728 struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
1720 struct zone **zonep = zonelist->zones;
1721 struct zone *zone;
1722 1729
1723 for (zone = *zonep++; zone; zone = *zonep++) { 1730 for_each_zone_zonelist(zone, z, zonelist, offset) {
1724 unsigned long size = zone->present_pages; 1731 unsigned long size = zone->present_pages;
1725 unsigned long high = zone->pages_high; 1732 unsigned long high = zone->pages_high;
1726 if (size > high) 1733 if (size > high)
@@ -1889,6 +1896,12 @@ void show_free_areas(void)
1889 show_swap_cache_info(); 1896 show_swap_cache_info();
1890} 1897}
1891 1898
1899static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
1900{
1901 zoneref->zone = zone;
1902 zoneref->zone_idx = zone_idx(zone);
1903}
1904
1892/* 1905/*
1893 * Builds allocation fallback zone lists. 1906 * Builds allocation fallback zone lists.
1894 * 1907 *
@@ -1906,7 +1919,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
1906 zone_type--; 1919 zone_type--;
1907 zone = pgdat->node_zones + zone_type; 1920 zone = pgdat->node_zones + zone_type;
1908 if (populated_zone(zone)) { 1921 if (populated_zone(zone)) {
1909 zonelist->zones[nr_zones++] = zone; 1922 zoneref_set_zone(zone,
1923 &zonelist->_zonerefs[nr_zones++]);
1910 check_highest_zone(zone_type); 1924 check_highest_zone(zone_type);
1911 } 1925 }
1912 1926
@@ -2078,17 +2092,16 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2078 */ 2092 */
2079static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) 2093static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2080{ 2094{
2081 enum zone_type i;
2082 int j; 2095 int j;
2083 struct zonelist *zonelist; 2096 struct zonelist *zonelist;
2084 2097
2085 for (i = 0; i < MAX_NR_ZONES; i++) { 2098 zonelist = &pgdat->node_zonelists[0];
2086 zonelist = pgdat->node_zonelists + i; 2099 for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
2087 for (j = 0; zonelist->zones[j] != NULL; j++) 2100 ;
2088 ; 2101 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2089 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2102 MAX_NR_ZONES - 1);
2090 zonelist->zones[j] = NULL; 2103 zonelist->_zonerefs[j].zone = NULL;
2091 } 2104 zonelist->_zonerefs[j].zone_idx = 0;
2092} 2105}
2093 2106
2094/* 2107/*
@@ -2096,15 +2109,13 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
2096 */ 2109 */
2097static void build_thisnode_zonelists(pg_data_t *pgdat) 2110static void build_thisnode_zonelists(pg_data_t *pgdat)
2098{ 2111{
2099 enum zone_type i;
2100 int j; 2112 int j;
2101 struct zonelist *zonelist; 2113 struct zonelist *zonelist;
2102 2114
2103 for (i = 0; i < MAX_NR_ZONES; i++) { 2115 zonelist = &pgdat->node_zonelists[1];
2104 zonelist = pgdat->node_zonelists + MAX_NR_ZONES + i; 2116 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2105 j = build_zonelists_node(pgdat, zonelist, 0, i); 2117 zonelist->_zonerefs[j].zone = NULL;
2106 zonelist->zones[j] = NULL; 2118 zonelist->_zonerefs[j].zone_idx = 0;
2107 }
2108} 2119}
2109 2120
2110/* 2121/*
@@ -2117,27 +2128,26 @@ static int node_order[MAX_NUMNODES];
2117 2128
2118static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) 2129static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
2119{ 2130{
2120 enum zone_type i;
2121 int pos, j, node; 2131 int pos, j, node;
2122 int zone_type; /* needs to be signed */ 2132 int zone_type; /* needs to be signed */
2123 struct zone *z; 2133 struct zone *z;
2124 struct zonelist *zonelist; 2134 struct zonelist *zonelist;
2125 2135
2126 for (i = 0; i < MAX_NR_ZONES; i++) { 2136 zonelist = &pgdat->node_zonelists[0];
2127 zonelist = pgdat->node_zonelists + i; 2137 pos = 0;
2128 pos = 0; 2138 for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
2129 for (zone_type = i; zone_type >= 0; zone_type--) { 2139 for (j = 0; j < nr_nodes; j++) {
2130 for (j = 0; j < nr_nodes; j++) { 2140 node = node_order[j];
2131 node = node_order[j]; 2141 z = &NODE_DATA(node)->node_zones[zone_type];
2132 z = &NODE_DATA(node)->node_zones[zone_type]; 2142 if (populated_zone(z)) {
2133 if (populated_zone(z)) { 2143 zoneref_set_zone(z,
2134 zonelist->zones[pos++] = z; 2144 &zonelist->_zonerefs[pos++]);
2135 check_highest_zone(zone_type); 2145 check_highest_zone(zone_type);
2136 }
2137 } 2146 }
2138 } 2147 }
2139 zonelist->zones[pos] = NULL;
2140 } 2148 }
2149 zonelist->_zonerefs[pos].zone = NULL;
2150 zonelist->_zonerefs[pos].zone_idx = 0;
2141} 2151}
2142 2152
2143static int default_zonelist_order(void) 2153static int default_zonelist_order(void)
@@ -2214,7 +2224,8 @@ static void build_zonelists(pg_data_t *pgdat)
2214 /* initialize zonelists */ 2224 /* initialize zonelists */
2215 for (i = 0; i < MAX_ZONELISTS; i++) { 2225 for (i = 0; i < MAX_ZONELISTS; i++) {
2216 zonelist = pgdat->node_zonelists + i; 2226 zonelist = pgdat->node_zonelists + i;
2217 zonelist->zones[0] = NULL; 2227 zonelist->_zonerefs[0].zone = NULL;
2228 zonelist->_zonerefs[0].zone_idx = 0;
2218 } 2229 }
2219 2230
2220 /* NUMA-aware ordering of nodes */ 2231 /* NUMA-aware ordering of nodes */
@@ -2264,19 +2275,15 @@ static void build_zonelists(pg_data_t *pgdat)
2264/* Construct the zonelist performance cache - see further mmzone.h */ 2275/* Construct the zonelist performance cache - see further mmzone.h */
2265static void build_zonelist_cache(pg_data_t *pgdat) 2276static void build_zonelist_cache(pg_data_t *pgdat)
2266{ 2277{
2267 int i; 2278 struct zonelist *zonelist;
2268 2279 struct zonelist_cache *zlc;
2269 for (i = 0; i < MAX_NR_ZONES; i++) { 2280 struct zoneref *z;
2270 struct zonelist *zonelist;
2271 struct zonelist_cache *zlc;
2272 struct zone **z;
2273 2281
2274 zonelist = pgdat->node_zonelists + i; 2282 zonelist = &pgdat->node_zonelists[0];
2275 zonelist->zlcache_ptr = zlc = &zonelist->zlcache; 2283 zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
2276 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); 2284 bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
2277 for (z = zonelist->zones; *z; z++) 2285 for (z = zonelist->_zonerefs; z->zone; z++)
2278 zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z); 2286 zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z);
2279 }
2280} 2287}
2281 2288
2282 2289
@@ -2290,45 +2297,44 @@ static void set_zonelist_order(void)
2290static void build_zonelists(pg_data_t *pgdat) 2297static void build_zonelists(pg_data_t *pgdat)
2291{ 2298{
2292 int node, local_node; 2299 int node, local_node;
2293 enum zone_type i,j; 2300 enum zone_type j;
2301 struct zonelist *zonelist;
2294 2302
2295 local_node = pgdat->node_id; 2303 local_node = pgdat->node_id;
2296 for (i = 0; i < MAX_NR_ZONES; i++) {
2297 struct zonelist *zonelist;
2298 2304
2299 zonelist = pgdat->node_zonelists + i; 2305 zonelist = &pgdat->node_zonelists[0];
2306 j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
2300 2307
2301 j = build_zonelists_node(pgdat, zonelist, 0, i); 2308 /*
2302 /* 2309 * Now we build the zonelist so that it contains the zones
2303 * Now we build the zonelist so that it contains the zones 2310 * of all the other nodes.
2304 * of all the other nodes. 2311 * We don't want to pressure a particular node, so when
2305 * We don't want to pressure a particular node, so when 2312 * building the zones for node N, we make sure that the
2306 * building the zones for node N, we make sure that the 2313 * zones coming right after the local ones are those from
2307 * zones coming right after the local ones are those from 2314 * node N+1 (modulo N)
2308 * node N+1 (modulo N) 2315 */
2309 */ 2316 for (node = local_node + 1; node < MAX_NUMNODES; node++) {
2310 for (node = local_node + 1; node < MAX_NUMNODES; node++) { 2317 if (!node_online(node))
2311 if (!node_online(node)) 2318 continue;
2312 continue; 2319 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2313 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2320 MAX_NR_ZONES - 1);
2314 } 2321 }
2315 for (node = 0; node < local_node; node++) { 2322 for (node = 0; node < local_node; node++) {
2316 if (!node_online(node)) 2323 if (!node_online(node))
2317 continue; 2324 continue;
2318 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 2325 j = build_zonelists_node(NODE_DATA(node), zonelist, j,
2319 } 2326 MAX_NR_ZONES - 1);
2320
2321 zonelist->zones[j] = NULL;
2322 } 2327 }
2328
2329 zonelist->_zonerefs[j].zone = NULL;
2330 zonelist->_zonerefs[j].zone_idx = 0;
2323} 2331}
2324 2332
2325/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 2333/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
2326static void build_zonelist_cache(pg_data_t *pgdat) 2334static void build_zonelist_cache(pg_data_t *pgdat)
2327{ 2335{
2328 int i; 2336 pgdat->node_zonelists[0].zlcache_ptr = NULL;
2329 2337 pgdat->node_zonelists[1].zlcache_ptr = NULL;
2330 for (i = 0; i < MAX_NR_ZONES; i++)
2331 pgdat->node_zonelists[i].zlcache_ptr = NULL;
2332} 2338}
2333 2339
2334#endif /* CONFIG_NUMA */ 2340#endif /* CONFIG_NUMA */
@@ -4339,9 +4345,7 @@ void *__init alloc_large_system_hash(const char *tablename,
4339 else if (hashdist) 4345 else if (hashdist)
4340 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4346 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4341 else { 4347 else {
4342 unsigned long order; 4348 unsigned long order = get_order(size);
4343 for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
4344 ;
4345 table = (void*) __get_free_pages(GFP_ATOMIC, order); 4349 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4346 /* 4350 /*
4347 * If bucketsize is not a power-of-two, we may free 4351 * If bucketsize is not a power-of-two, we may free
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 1cf1417ef8b7..0afd2387e507 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -9,11 +9,15 @@ static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
9 int err = 0; 9 int err = 0;
10 10
11 pte = pte_offset_map(pmd, addr); 11 pte = pte_offset_map(pmd, addr);
12 do { 12 for (;;) {
13 err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private); 13 err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, private);
14 if (err) 14 if (err)
15 break; 15 break;
16 } while (pte++, addr += PAGE_SIZE, addr != end); 16 addr += PAGE_SIZE;
17 if (addr == end)
18 break;
19 pte++;
20 }
17 21
18 pte_unmap(pte); 22 pte_unmap(pte);
19 return err; 23 return err;
diff --git a/mm/rmap.c b/mm/rmap.c
index 997f06907b6d..bf0a5b7cfb8e 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -413,9 +413,6 @@ int page_referenced(struct page *page, int is_locked,
413{ 413{
414 int referenced = 0; 414 int referenced = 0;
415 415
416 if (page_test_and_clear_young(page))
417 referenced++;
418
419 if (TestClearPageReferenced(page)) 416 if (TestClearPageReferenced(page))
420 referenced++; 417 referenced++;
421 418
@@ -433,6 +430,10 @@ int page_referenced(struct page *page, int is_locked,
433 unlock_page(page); 430 unlock_page(page);
434 } 431 }
435 } 432 }
433
434 if (page_test_and_clear_young(page))
435 referenced++;
436
436 return referenced; 437 return referenced;
437} 438}
438 439
@@ -661,7 +662,6 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
661 printk (KERN_EMERG " page->mapping = %p\n", page->mapping); 662 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
662 print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops); 663 print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
663 if (vma->vm_ops) { 664 if (vma->vm_ops) {
664 print_symbol (KERN_EMERG " vma->vm_ops->nopage = %s\n", (unsigned long)vma->vm_ops->nopage);
665 print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault); 665 print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
666 } 666 }
667 if (vma->vm_file && vma->vm_file->f_op) 667 if (vma->vm_file && vma->vm_file->f_op)
diff --git a/mm/shmem.c b/mm/shmem.c
index f514dd392cd9..e6d9298aa22a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1079,104 +1079,47 @@ redirty:
1079 1079
1080#ifdef CONFIG_NUMA 1080#ifdef CONFIG_NUMA
1081#ifdef CONFIG_TMPFS 1081#ifdef CONFIG_TMPFS
1082static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes) 1082static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
1083{ 1083{
1084 char *nodelist = strchr(value, ':'); 1084 char buffer[64];
1085 int err = 1;
1086 1085
1087 if (nodelist) { 1086 if (!mpol || mpol->mode == MPOL_DEFAULT)
1088 /* NUL-terminate policy string */ 1087 return; /* show nothing */
1089 *nodelist++ = '\0';
1090 if (nodelist_parse(nodelist, *policy_nodes))
1091 goto out;
1092 if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
1093 goto out;
1094 }
1095 if (!strcmp(value, "default")) {
1096 *policy = MPOL_DEFAULT;
1097 /* Don't allow a nodelist */
1098 if (!nodelist)
1099 err = 0;
1100 } else if (!strcmp(value, "prefer")) {
1101 *policy = MPOL_PREFERRED;
1102 /* Insist on a nodelist of one node only */
1103 if (nodelist) {
1104 char *rest = nodelist;
1105 while (isdigit(*rest))
1106 rest++;
1107 if (!*rest)
1108 err = 0;
1109 }
1110 } else if (!strcmp(value, "bind")) {
1111 *policy = MPOL_BIND;
1112 /* Insist on a nodelist */
1113 if (nodelist)
1114 err = 0;
1115 } else if (!strcmp(value, "interleave")) {
1116 *policy = MPOL_INTERLEAVE;
1117 /*
1118 * Default to online nodes with memory if no nodelist
1119 */
1120 if (!nodelist)
1121 *policy_nodes = node_states[N_HIGH_MEMORY];
1122 err = 0;
1123 }
1124out:
1125 /* Restore string for error message */
1126 if (nodelist)
1127 *--nodelist = ':';
1128 return err;
1129}
1130
1131static void shmem_show_mpol(struct seq_file *seq, int policy,
1132 const nodemask_t policy_nodes)
1133{
1134 char *policy_string;
1135 1088
1136 switch (policy) { 1089 mpol_to_str(buffer, sizeof(buffer), mpol, 1);
1137 case MPOL_PREFERRED:
1138 policy_string = "prefer";
1139 break;
1140 case MPOL_BIND:
1141 policy_string = "bind";
1142 break;
1143 case MPOL_INTERLEAVE:
1144 policy_string = "interleave";
1145 break;
1146 default:
1147 /* MPOL_DEFAULT */
1148 return;
1149 }
1150 1090
1151 seq_printf(seq, ",mpol=%s", policy_string); 1091 seq_printf(seq, ",mpol=%s", buffer);
1152 1092}
1153 if (policy != MPOL_INTERLEAVE ||
1154 !nodes_equal(policy_nodes, node_states[N_HIGH_MEMORY])) {
1155 char buffer[64];
1156 int len;
1157 1093
1158 len = nodelist_scnprintf(buffer, sizeof(buffer), policy_nodes); 1094static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1159 if (len < sizeof(buffer)) 1095{
1160 seq_printf(seq, ":%s", buffer); 1096 struct mempolicy *mpol = NULL;
1161 else 1097 if (sbinfo->mpol) {
1162 seq_printf(seq, ":?"); 1098 spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
1099 mpol = sbinfo->mpol;
1100 mpol_get(mpol);
1101 spin_unlock(&sbinfo->stat_lock);
1163 } 1102 }
1103 return mpol;
1164} 1104}
1165#endif /* CONFIG_TMPFS */ 1105#endif /* CONFIG_TMPFS */
1166 1106
1167static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp, 1107static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
1168 struct shmem_inode_info *info, unsigned long idx) 1108 struct shmem_inode_info *info, unsigned long idx)
1169{ 1109{
1110 struct mempolicy mpol, *spol;
1170 struct vm_area_struct pvma; 1111 struct vm_area_struct pvma;
1171 struct page *page; 1112 struct page *page;
1172 1113
1114 spol = mpol_cond_copy(&mpol,
1115 mpol_shared_policy_lookup(&info->policy, idx));
1116
1173 /* Create a pseudo vma that just contains the policy */ 1117 /* Create a pseudo vma that just contains the policy */
1174 pvma.vm_start = 0; 1118 pvma.vm_start = 0;
1175 pvma.vm_pgoff = idx; 1119 pvma.vm_pgoff = idx;
1176 pvma.vm_ops = NULL; 1120 pvma.vm_ops = NULL;
1177 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 1121 pvma.vm_policy = spol;
1178 page = swapin_readahead(entry, gfp, &pvma, 0); 1122 page = swapin_readahead(entry, gfp, &pvma, 0);
1179 mpol_free(pvma.vm_policy);
1180 return page; 1123 return page;
1181} 1124}
1182 1125
@@ -1184,27 +1127,21 @@ static struct page *shmem_alloc_page(gfp_t gfp,
1184 struct shmem_inode_info *info, unsigned long idx) 1127 struct shmem_inode_info *info, unsigned long idx)
1185{ 1128{
1186 struct vm_area_struct pvma; 1129 struct vm_area_struct pvma;
1187 struct page *page;
1188 1130
1189 /* Create a pseudo vma that just contains the policy */ 1131 /* Create a pseudo vma that just contains the policy */
1190 pvma.vm_start = 0; 1132 pvma.vm_start = 0;
1191 pvma.vm_pgoff = idx; 1133 pvma.vm_pgoff = idx;
1192 pvma.vm_ops = NULL; 1134 pvma.vm_ops = NULL;
1193 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx); 1135 pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
1194 page = alloc_page_vma(gfp, &pvma, 0); 1136
1195 mpol_free(pvma.vm_policy); 1137 /*
1196 return page; 1138 * alloc_page_vma() will drop the shared policy reference
1139 */
1140 return alloc_page_vma(gfp, &pvma, 0);
1197} 1141}
1198#else /* !CONFIG_NUMA */ 1142#else /* !CONFIG_NUMA */
1199#ifdef CONFIG_TMPFS 1143#ifdef CONFIG_TMPFS
1200static inline int shmem_parse_mpol(char *value, int *policy, 1144static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
1201 nodemask_t *policy_nodes)
1202{
1203 return 1;
1204}
1205
1206static inline void shmem_show_mpol(struct seq_file *seq, int policy,
1207 const nodemask_t policy_nodes)
1208{ 1145{
1209} 1146}
1210#endif /* CONFIG_TMPFS */ 1147#endif /* CONFIG_TMPFS */
@@ -1222,6 +1159,13 @@ static inline struct page *shmem_alloc_page(gfp_t gfp,
1222} 1159}
1223#endif /* CONFIG_NUMA */ 1160#endif /* CONFIG_NUMA */
1224 1161
1162#if !defined(CONFIG_NUMA) || !defined(CONFIG_TMPFS)
1163static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
1164{
1165 return NULL;
1166}
1167#endif
1168
1225/* 1169/*
1226 * shmem_getpage - either get the page from swap or allocate a new one 1170 * shmem_getpage - either get the page from swap or allocate a new one
1227 * 1171 *
@@ -1576,8 +1520,8 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1576 case S_IFREG: 1520 case S_IFREG:
1577 inode->i_op = &shmem_inode_operations; 1521 inode->i_op = &shmem_inode_operations;
1578 inode->i_fop = &shmem_file_operations; 1522 inode->i_fop = &shmem_file_operations;
1579 mpol_shared_policy_init(&info->policy, sbinfo->policy, 1523 mpol_shared_policy_init(&info->policy,
1580 &sbinfo->policy_nodes); 1524 shmem_get_sbmpol(sbinfo));
1581 break; 1525 break;
1582 case S_IFDIR: 1526 case S_IFDIR:
1583 inc_nlink(inode); 1527 inc_nlink(inode);
@@ -1591,8 +1535,7 @@ shmem_get_inode(struct super_block *sb, int mode, dev_t dev)
1591 * Must not load anything in the rbtree, 1535 * Must not load anything in the rbtree,
1592 * mpol_free_shared_policy will not be called. 1536 * mpol_free_shared_policy will not be called.
1593 */ 1537 */
1594 mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, 1538 mpol_shared_policy_init(&info->policy, NULL);
1595 NULL);
1596 break; 1539 break;
1597 } 1540 }
1598 } else 1541 } else
@@ -2207,8 +2150,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
2207 if (*rest) 2150 if (*rest)
2208 goto bad_val; 2151 goto bad_val;
2209 } else if (!strcmp(this_char,"mpol")) { 2152 } else if (!strcmp(this_char,"mpol")) {
2210 if (shmem_parse_mpol(value, &sbinfo->policy, 2153 if (mpol_parse_str(value, &sbinfo->mpol, 1))
2211 &sbinfo->policy_nodes))
2212 goto bad_val; 2154 goto bad_val;
2213 } else { 2155 } else {
2214 printk(KERN_ERR "tmpfs: Bad mount option %s\n", 2156 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
@@ -2259,8 +2201,9 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2259 sbinfo->free_blocks = config.max_blocks - blocks; 2201 sbinfo->free_blocks = config.max_blocks - blocks;
2260 sbinfo->max_inodes = config.max_inodes; 2202 sbinfo->max_inodes = config.max_inodes;
2261 sbinfo->free_inodes = config.max_inodes - inodes; 2203 sbinfo->free_inodes = config.max_inodes - inodes;
2262 sbinfo->policy = config.policy; 2204
2263 sbinfo->policy_nodes = config.policy_nodes; 2205 mpol_put(sbinfo->mpol);
2206 sbinfo->mpol = config.mpol; /* transfers initial ref */
2264out: 2207out:
2265 spin_unlock(&sbinfo->stat_lock); 2208 spin_unlock(&sbinfo->stat_lock);
2266 return error; 2209 return error;
@@ -2281,7 +2224,7 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
2281 seq_printf(seq, ",uid=%u", sbinfo->uid); 2224 seq_printf(seq, ",uid=%u", sbinfo->uid);
2282 if (sbinfo->gid != 0) 2225 if (sbinfo->gid != 0)
2283 seq_printf(seq, ",gid=%u", sbinfo->gid); 2226 seq_printf(seq, ",gid=%u", sbinfo->gid);
2284 shmem_show_mpol(seq, sbinfo->policy, sbinfo->policy_nodes); 2227 shmem_show_mpol(seq, sbinfo->mpol);
2285 return 0; 2228 return 0;
2286} 2229}
2287#endif /* CONFIG_TMPFS */ 2230#endif /* CONFIG_TMPFS */
@@ -2311,8 +2254,7 @@ static int shmem_fill_super(struct super_block *sb,
2311 sbinfo->mode = S_IRWXUGO | S_ISVTX; 2254 sbinfo->mode = S_IRWXUGO | S_ISVTX;
2312 sbinfo->uid = current->fsuid; 2255 sbinfo->uid = current->fsuid;
2313 sbinfo->gid = current->fsgid; 2256 sbinfo->gid = current->fsgid;
2314 sbinfo->policy = MPOL_DEFAULT; 2257 sbinfo->mpol = NULL;
2315 sbinfo->policy_nodes = node_states[N_HIGH_MEMORY];
2316 sb->s_fs_info = sbinfo; 2258 sb->s_fs_info = sbinfo;
2317 2259
2318#ifdef CONFIG_TMPFS 2260#ifdef CONFIG_TMPFS
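
The shmem hunks above replace tmpfs's (policy, policy_nodes) pair with a
single reference-counted struct mempolicy hung off the superblock. A minimal
sketch of the resulting parse-and-transfer flow, using only the calls visible
in the hunks (mpol_parse_str, mpol_put, the sbinfo->mpol field); the helper
names themselves are hypothetical:

	/* Parse the "mpol=" mount option into a temporary config. */
	static int parse_mpol_option(char *value, struct shmem_sb_info *config)
	{
		/* Non-zero return means the string did not parse; on success
		 * the parsed policy (with its initial reference) is stored
		 * in config->mpol. */
		if (mpol_parse_str(value, &config->mpol, 1))
			return -EINVAL;
		return 0;
	}

	/* On remount, drop the superblock's old policy (mpol_put() is
	 * presumably NULL-safe here, since shmem_fill_super() starts with
	 * sbinfo->mpol = NULL) and take over the freshly parsed one. */
	static void apply_remount_mpol(struct shmem_sb_info *sbinfo,
				       struct shmem_sb_info *config)
	{
		mpol_put(sbinfo->mpol);
		sbinfo->mpol = config->mpol;	/* transfers the initial ref */
	}
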
diff --git a/mm/slab.c b/mm/slab.c
index 03927cb5ec9e..39d20f8a0791 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -139,10 +139,6 @@
139#define BYTES_PER_WORD sizeof(void *) 139#define BYTES_PER_WORD sizeof(void *)
140#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) 140#define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long))
141 141
142#ifndef cache_line_size
143#define cache_line_size() L1_CACHE_BYTES
144#endif
145
146#ifndef ARCH_KMALLOC_MINALIGN 142#ifndef ARCH_KMALLOC_MINALIGN
147/* 143/*
148 * Enforce a minimum alignment for the kmalloc caches. 144 * Enforce a minimum alignment for the kmalloc caches.
@@ -3242,15 +3238,16 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
3242{ 3238{
3243 struct zonelist *zonelist; 3239 struct zonelist *zonelist;
3244 gfp_t local_flags; 3240 gfp_t local_flags;
3245 struct zone **z; 3241 struct zoneref *z;
3242 struct zone *zone;
3243 enum zone_type high_zoneidx = gfp_zone(flags);
3246 void *obj = NULL; 3244 void *obj = NULL;
3247 int nid; 3245 int nid;
3248 3246
3249 if (flags & __GFP_THISNODE) 3247 if (flags & __GFP_THISNODE)
3250 return NULL; 3248 return NULL;
3251 3249
3252 zonelist = &NODE_DATA(slab_node(current->mempolicy)) 3250 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
3253 ->node_zonelists[gfp_zone(flags)];
3254 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); 3251 local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
3255 3252
3256retry: 3253retry:
@@ -3258,10 +3255,10 @@ retry:
3258 * Look through allowed nodes for objects available 3255 * Look through allowed nodes for objects available
3259 * from existing per node queues. 3256 * from existing per node queues.
3260 */ 3257 */
3261 for (z = zonelist->zones; *z && !obj; z++) { 3258 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
3262 nid = zone_to_nid(*z); 3259 nid = zone_to_nid(zone);
3263 3260
3264 if (cpuset_zone_allowed_hardwall(*z, flags) && 3261 if (cpuset_zone_allowed_hardwall(zone, flags) &&
3265 cache->nodelists[nid] && 3262 cache->nodelists[nid] &&
3266 cache->nodelists[nid]->free_objects) 3263 cache->nodelists[nid]->free_objects)
3267 obj = ____cache_alloc_node(cache, 3264 obj = ____cache_alloc_node(cache,
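
This hunk, and the slub and vmscan hunks below, all switch from open-coded
walks of zonelist->zones[] to the for_each_zone_zonelist() iterator filtered
by gfp_zone(). A sketch of the pattern in isolation, assuming the per-node
zonelist layout this series moves to; the helper itself is hypothetical and
mirrors the walk fallback_alloc() now does:

	/* Find the first node on this node's zonelist that still has free
	 * objects for @cache. Returns a node id, or -1 if none. */
	static int find_node_with_free_objects(struct kmem_cache *cache,
					       gfp_t flags)
	{
		struct zonelist *zonelist;
		struct zoneref *z;
		struct zone *zone;
		enum zone_type high_zoneidx = gfp_zone(flags);

		/* node_zonelist() picks this node's zonelist for @flags. */
		zonelist = node_zonelist(numa_node_id(), flags);

		/* Visit only zones at or below the highest zone type the
		 * gfp flags allow. */
		for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
			int nid = zone_to_nid(zone);

			if (cache->nodelists[nid] &&
			    cache->nodelists[nid]->free_objects)
				return nid;
		}
		return -1;
	}
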
diff --git a/mm/slub.c b/mm/slub.c
index d821ce6fff39..992ecd4f0d39 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -186,11 +186,6 @@ static inline void ClearSlabDebug(struct page *page)
186#define __OBJECT_POISON 0x80000000 /* Poison object */ 186#define __OBJECT_POISON 0x80000000 /* Poison object */
187#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ 187#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */
188 188
189/* Not all arches define cache_line_size */
190#ifndef cache_line_size
191#define cache_line_size() L1_CACHE_BYTES
192#endif
193
194static int kmem_size = sizeof(struct kmem_cache); 189static int kmem_size = sizeof(struct kmem_cache);
195 190
196#ifdef CONFIG_SMP 191#ifdef CONFIG_SMP
@@ -1330,7 +1325,9 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1330{ 1325{
1331#ifdef CONFIG_NUMA 1326#ifdef CONFIG_NUMA
1332 struct zonelist *zonelist; 1327 struct zonelist *zonelist;
1333 struct zone **z; 1328 struct zoneref *z;
1329 struct zone *zone;
1330 enum zone_type high_zoneidx = gfp_zone(flags);
1334 struct page *page; 1331 struct page *page;
1335 1332
1336 /* 1333 /*
@@ -1355,14 +1352,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
1355 get_cycles() % 1024 > s->remote_node_defrag_ratio) 1352 get_cycles() % 1024 > s->remote_node_defrag_ratio)
1356 return NULL; 1353 return NULL;
1357 1354
1358 zonelist = &NODE_DATA( 1355 zonelist = node_zonelist(slab_node(current->mempolicy), flags);
1359 slab_node(current->mempolicy))->node_zonelists[gfp_zone(flags)]; 1356 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1360 for (z = zonelist->zones; *z; z++) {
1361 struct kmem_cache_node *n; 1357 struct kmem_cache_node *n;
1362 1358
1363 n = get_node(s, zone_to_nid(*z)); 1359 n = get_node(s, zone_to_nid(zone));
1364 1360
1365 if (n && cpuset_zone_allowed_hardwall(*z, flags) && 1361 if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
1366 n->nr_partial > MIN_PARTIAL) { 1362 n->nr_partial > MIN_PARTIAL) {
1367 page = get_partial_node(n); 1363 page = get_partial_node(n);
1368 if (page) 1364 if (page)
diff --git a/mm/sparse.c b/mm/sparse.c
index 98d6b39c3472..dff71f173ae9 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/spinlock.h> 9#include <linux/spinlock.h>
10#include <linux/vmalloc.h> 10#include <linux/vmalloc.h>
11#include "internal.h"
11#include <asm/dma.h> 12#include <asm/dma.h>
12#include <asm/pgalloc.h> 13#include <asm/pgalloc.h>
13#include <asm/pgtable.h> 14#include <asm/pgtable.h>
@@ -208,12 +209,12 @@ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long p
208} 209}
209 210
210/* 211/*
211 * We need this if we ever free the mem_maps. While not implemented yet, 212 * Decode mem_map from the coded memmap
212 * this function is included for parity with its sibling.
213 */ 213 */
214static __attribute((unused))
215struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum) 214struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum)
216{ 215{
216 /* mask off the extra low bits of information */
217 coded_mem_map &= SECTION_MAP_MASK;
217 return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum); 218 return ((struct page *)coded_mem_map) + section_nr_to_pfn(pnum);
218} 219}
219 220
@@ -232,7 +233,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms,
232 return 1; 233 return 1;
233} 234}
234 235
235static unsigned long usemap_size(void) 236unsigned long usemap_size(void)
236{ 237{
237 unsigned long size_bytes; 238 unsigned long size_bytes;
238 size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; 239 size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8;
@@ -249,11 +250,22 @@ static unsigned long *__kmalloc_section_usemap(void)
249 250
250static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) 251static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum)
251{ 252{
252 unsigned long *usemap; 253 unsigned long *usemap, section_nr;
253 struct mem_section *ms = __nr_to_section(pnum); 254 struct mem_section *ms = __nr_to_section(pnum);
254 int nid = sparse_early_nid(ms); 255 int nid = sparse_early_nid(ms);
256 struct pglist_data *pgdat = NODE_DATA(nid);
255 257
256 usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); 258 /*
259 * A usemap page can't be freed until every section that uses it has
260 * been freed, and the same is true of the page that holds the pgdat.
261 * If section A holds the pgdat and section B holds usemaps for other
262 * sections (including A), neither section can be removed, because
263 * each depends on the other.
264 * To avoid that circular dependency, collect all usemaps on the
265 * section that holds the pgdat.
266 */
267 section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
268 usemap = alloc_bootmem_section(usemap_size(), section_nr);
257 if (usemap) 269 if (usemap)
258 return usemap; 270 return usemap;
259 271
@@ -273,8 +285,8 @@ struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid)
273 if (map) 285 if (map)
274 return map; 286 return map;
275 287
276 map = alloc_bootmem_node(NODE_DATA(nid), 288 map = alloc_bootmem_pages_node(NODE_DATA(nid),
277 sizeof(struct page) * PAGES_PER_SECTION); 289 PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION));
278 return map; 290 return map;
279} 291}
280#endif /* !CONFIG_SPARSEMEM_VMEMMAP */ 292#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
@@ -295,6 +307,9 @@ struct page __init *sparse_early_mem_map_alloc(unsigned long pnum)
295 return NULL; 307 return NULL;
296} 308}
297 309
310void __attribute__((weak)) __meminit vmemmap_populate_print_last(void)
311{
312}
298/* 313/*
299 * Allocate the accumulated non-linear sections, allocate a mem_map 314 * Allocate the accumulated non-linear sections, allocate a mem_map
300 * for each and record the physical to section mapping. 315 * for each and record the physical to section mapping.
@@ -304,22 +319,50 @@ void __init sparse_init(void)
304 unsigned long pnum; 319 unsigned long pnum;
305 struct page *map; 320 struct page *map;
306 unsigned long *usemap; 321 unsigned long *usemap;
322 unsigned long **usemap_map;
323 int size;
324
325 /*
326 * The mem_map is allocated as a big page (2M on 64-bit x86), while a
327 * usemap is far smaller than a page (about 24 bytes). Allocating 2M
328 * (2M-aligned) and 24 bytes alternately pushes every following 2M
329 * allocation to the next 2M boundary, so on a big system memory ends
330 * up full of holes. Instead, try to allocate the 2M mem_map pages
331 * contiguously by doing all the usemap allocations first.
332 *
333 * powerpc needs to call sparse_init_one_section right after each
334 * sparse_early_mem_map_alloc, so allocate usemap_map first.
335 */
336 size = sizeof(unsigned long *) * NR_MEM_SECTIONS;
337 usemap_map = alloc_bootmem(size);
338 if (!usemap_map)
339 panic("can not allocate usemap_map\n");
307 340
308 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { 341 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
309 if (!present_section_nr(pnum)) 342 if (!present_section_nr(pnum))
310 continue; 343 continue;
344 usemap_map[pnum] = sparse_early_usemap_alloc(pnum);
345 }
311 346
312 map = sparse_early_mem_map_alloc(pnum); 347 for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) {
313 if (!map) 348 if (!present_section_nr(pnum))
314 continue; 349 continue;
315 350
316 usemap = sparse_early_usemap_alloc(pnum); 351 usemap = usemap_map[pnum];
317 if (!usemap) 352 if (!usemap)
318 continue; 353 continue;
319 354
355 map = sparse_early_mem_map_alloc(pnum);
356 if (!map)
357 continue;
358
320 sparse_init_one_section(__nr_to_section(pnum), pnum, map, 359 sparse_init_one_section(__nr_to_section(pnum), pnum, map,
321 usemap); 360 usemap);
322 } 361 }
362
363 vmemmap_populate_print_last();
364
365 free_bootmem(__pa(usemap_map), size);
323} 366}
324 367
325#ifdef CONFIG_MEMORY_HOTPLUG 368#ifdef CONFIG_MEMORY_HOTPLUG
@@ -334,6 +377,9 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
334{ 377{
335 return; /* XXX: Not implemented yet */ 378 return; /* XXX: Not implemented yet */
336} 379}
380static void free_map_bootmem(struct page *page, unsigned long nr_pages)
381{
382}
337#else 383#else
338static struct page *__kmalloc_section_memmap(unsigned long nr_pages) 384static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
339{ 385{
@@ -371,8 +417,69 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
371 free_pages((unsigned long)memmap, 417 free_pages((unsigned long)memmap,
372 get_order(sizeof(struct page) * nr_pages)); 418 get_order(sizeof(struct page) * nr_pages));
373} 419}
420
421static void free_map_bootmem(struct page *page, unsigned long nr_pages)
422{
423 unsigned long maps_section_nr, removing_section_nr, i;
424 int magic;
425
426 for (i = 0; i < nr_pages; i++, page++) {
427 magic = atomic_read(&page->_mapcount);
428
429 BUG_ON(magic == NODE_INFO);
430
431 maps_section_nr = pfn_to_section_nr(page_to_pfn(page));
432 removing_section_nr = page->private;
433
434 /*
435 * When this function is called, the section being removed has
436 * already been logically offlined, so all of its pages are
437 * isolated from the page allocator. If that section's memmap is
438 * placed on the section itself, it must not be freed here:
439 * otherwise the page allocator could hand the memmap out again
440 * just before the memory is physically removed.
441 */
442 if (maps_section_nr != removing_section_nr)
443 put_page_bootmem(page);
444 }
445}
374#endif /* CONFIG_SPARSEMEM_VMEMMAP */ 446#endif /* CONFIG_SPARSEMEM_VMEMMAP */
375 447
448static void free_section_usemap(struct page *memmap, unsigned long *usemap)
449{
450 struct page *usemap_page;
451 unsigned long nr_pages;
452
453 if (!usemap)
454 return;
455
456 usemap_page = virt_to_page(usemap);
457 /*
458 * Check to see if allocation came from hot-plug-add
459 */
460 if (PageSlab(usemap_page)) {
461 kfree(usemap);
462 if (memmap)
463 __kfree_section_memmap(memmap, PAGES_PER_SECTION);
464 return;
465 }
466
467 /*
468 * The usemap came from bootmem. At boot it was packed with the other
469 * usemaps on the section that holds the pgdat, so just leave it as is.
470 */
471
472 if (memmap) {
473 struct page *memmap_page;
474 memmap_page = virt_to_page(memmap);
475
476 nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
477 >> PAGE_SHIFT;
478
479 free_map_bootmem(memmap_page, nr_pages);
480 }
481}
482
376/* 483/*
377 * returns the number of sections whose mem_maps were properly 484 * returns the number of sections whose mem_maps were properly
378 * set. If this is <=0, then that means that the passed-in 485 * set. If this is <=0, then that means that the passed-in
@@ -425,4 +532,20 @@ out:
425 } 532 }
426 return ret; 533 return ret;
427} 534}
535
536void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
537{
538 struct page *memmap = NULL;
539 unsigned long *usemap = NULL;
540
541 if (ms->section_mem_map) {
542 usemap = ms->pageblock_flags;
543 memmap = sparse_decode_mem_map(ms->section_mem_map,
544 __section_nr(ms));
545 ms->section_mem_map = 0;
546 ms->pageblock_flags = NULL;
547 }
548
549 free_section_usemap(memmap, usemap);
550}
428#endif 551#endif
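
sparse_remove_one_section() above unhooks a section and frees its mem_map
and usemap (bootmem-backed usemaps are deliberately left in place). The
expected caller is the hot-remove path in mm/memory_hotplug.c, which is also
part of this diff but not shown here; a simplified, hypothetical caller
would look roughly like this:

	/* Hypothetical: tear down one memory section during hot-remove.
	 * The real path also unregisters the section from sysfs and
	 * shrinks the zone/node spans, which is omitted here. */
	static int remove_one_section(struct zone *zone, unsigned long section_nr)
	{
		struct mem_section *ms = __nr_to_section(section_nr);

		if (!valid_section(ms))
			return -EINVAL;

		sparse_remove_one_section(zone, ms);
		return 0;
	}
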
diff --git a/mm/swap.c b/mm/swap.c
index aa1139ccf3a7..91e194445a5e 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -132,34 +132,21 @@ static void pagevec_move_tail(struct pagevec *pvec)
132 * Writeback is about to end against a page which has been marked for immediate 132 * Writeback is about to end against a page which has been marked for immediate
133 * reclaim. If it still appears to be reclaimable, move it to the tail of the 133 * reclaim. If it still appears to be reclaimable, move it to the tail of the
134 * inactive list. 134 * inactive list.
135 *
136 * Returns zero if it cleared PG_writeback.
137 */ 135 */
138int rotate_reclaimable_page(struct page *page) 136void rotate_reclaimable_page(struct page *page)
139{ 137{
140 struct pagevec *pvec; 138 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
141 unsigned long flags; 139 PageLRU(page)) {
142 140 struct pagevec *pvec;
143 if (PageLocked(page)) 141 unsigned long flags;
144 return 1;
145 if (PageDirty(page))
146 return 1;
147 if (PageActive(page))
148 return 1;
149 if (!PageLRU(page))
150 return 1;
151
152 page_cache_get(page);
153 local_irq_save(flags);
154 pvec = &__get_cpu_var(lru_rotate_pvecs);
155 if (!pagevec_add(pvec, page))
156 pagevec_move_tail(pvec);
157 local_irq_restore(flags);
158
159 if (!test_clear_page_writeback(page))
160 BUG();
161 142
162 return 0; 143 page_cache_get(page);
144 local_irq_save(flags);
145 pvec = &__get_cpu_var(lru_rotate_pvecs);
146 if (!pagevec_add(pvec, page))
147 pagevec_move_tail(pvec);
148 local_irq_restore(flags);
149 }
163} 150}
164 151
165/* 152/*
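
With the change above, rotate_reclaimable_page() no longer reports success
and no longer clears PG_writeback itself, so the writeback completion path
has to do both the rotation and the test_clear_page_writeback() call. A
sketch of what that caller could look like; end_page_writeback() lives in
mm/filemap.c, which this diff also touches, and its exact body there may
differ:

	void end_page_writeback(struct page *page)
	{
		/* Pages marked for immediate reclaim get moved to the tail
		 * of the inactive list before PG_writeback is cleared. */
		if (TestClearPageReclaim(page))
			rotate_reclaimable_page(page);

		if (!test_clear_page_writeback(page))
			BUG();

		smp_mb__after_clear_bit();
		wake_up_page(page, PG_writeback);
	}
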
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2da149cfc9ac..67051be7083a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1582,6 +1582,14 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1582 error = -EINVAL; 1582 error = -EINVAL;
1583 goto bad_swap; 1583 goto bad_swap;
1584 case 2: 1584 case 2:
1585 /* swap partition endianness hack... */
1586 if (swab32(swap_header->info.version) == 1) {
1587 swab32s(&swap_header->info.version);
1588 swab32s(&swap_header->info.last_page);
1589 swab32s(&swap_header->info.nr_badpages);
1590 for (i = 0; i < swap_header->info.nr_badpages; i++)
1591 swab32s(&swap_header->info.badpages[i]);
1592 }
1585 /* Check the swap header's sub-version and the size of 1593 /* Check the swap header's sub-version and the size of
1586 the swap file and bad block lists */ 1594 the swap file and bad block lists */
1587 if (swap_header->info.version != 1) { 1595 if (swap_header->info.version != 1) {
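
The swab block above handles a v1 swap header created on a machine of the
opposite endianness: if the version field only reads as 1 after byte
swapping, every 32-bit field of the header (including the bad-page list) is
swapped in place before the normal checks run. A standalone userspace
illustration of the detection idea (the struct is only a stand-in for the
kernel's swap_header info fields):

	#include <stdint.h>
	#include <stdio.h>

	struct swap_info_v1 {		/* stand-in, not the kernel struct */
		uint32_t version;
		uint32_t last_page;
		uint32_t nr_badpages;
	};

	static uint32_t swab32(uint32_t x)
	{
		return (x >> 24) | ((x >> 8) & 0x0000ff00) |
		       ((x << 8) & 0x00ff0000) | (x << 24);
	}

	int main(void)
	{
		/* Pretend the header was written by an opposite-endian host. */
		struct swap_info_v1 hdr = { swab32(1), swab32(1024), swab32(0) };

		if (swab32(hdr.version) == 1) {	/* only true when byte-swapped */
			hdr.version = swab32(hdr.version);
			hdr.last_page = swab32(hdr.last_page);
			hdr.nr_badpages = swab32(hdr.nr_badpages);
		}
		printf("version=%u last_page=%u nr_badpages=%u\n",
		       hdr.version, hdr.last_page, hdr.nr_badpages);
		return 0;
	}
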
diff --git a/mm/truncate.c b/mm/truncate.c
index 7d20ce41ecf5..b8961cb63414 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -391,6 +391,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
391 pgoff_t next; 391 pgoff_t next;
392 int i; 392 int i;
393 int ret = 0; 393 int ret = 0;
394 int ret2 = 0;
394 int did_range_unmap = 0; 395 int did_range_unmap = 0;
395 int wrapped = 0; 396 int wrapped = 0;
396 397
@@ -438,9 +439,13 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
438 } 439 }
439 } 440 }
440 BUG_ON(page_mapped(page)); 441 BUG_ON(page_mapped(page));
441 ret = do_launder_page(mapping, page); 442 ret2 = do_launder_page(mapping, page);
442 if (ret == 0 && !invalidate_complete_page2(mapping, page)) 443 if (ret2 == 0) {
443 ret = -EIO; 444 if (!invalidate_complete_page2(mapping, page))
445 ret2 = -EIO;
446 }
447 if (ret2 < 0)
448 ret = ret2;
444 unlock_page(page); 449 unlock_page(page);
445 } 450 }
446 pagevec_release(&pvec); 451 pagevec_release(&pvec);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ecf91f8034bf..e33e0ae69ad1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -14,8 +14,9 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17 17#include <linux/seq_file.h>
18#include <linux/vmalloc.h> 18#include <linux/vmalloc.h>
19#include <linux/kallsyms.h>
19 20
20#include <asm/uaccess.h> 21#include <asm/uaccess.h>
21#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
@@ -25,7 +26,7 @@ DEFINE_RWLOCK(vmlist_lock);
25struct vm_struct *vmlist; 26struct vm_struct *vmlist;
26 27
27static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 28static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
28 int node); 29 int node, void *caller);
29 30
30static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 31static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
31{ 32{
@@ -204,9 +205,9 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
204} 205}
205EXPORT_SYMBOL(vmalloc_to_pfn); 206EXPORT_SYMBOL(vmalloc_to_pfn);
206 207
207static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long flags, 208static struct vm_struct *
208 unsigned long start, unsigned long end, 209__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
209 int node, gfp_t gfp_mask) 210 unsigned long end, int node, gfp_t gfp_mask, void *caller)
210{ 211{
211 struct vm_struct **p, *tmp, *area; 212 struct vm_struct **p, *tmp, *area;
212 unsigned long align = 1; 213 unsigned long align = 1;
@@ -269,6 +270,7 @@ found:
269 area->pages = NULL; 270 area->pages = NULL;
270 area->nr_pages = 0; 271 area->nr_pages = 0;
271 area->phys_addr = 0; 272 area->phys_addr = 0;
273 area->caller = caller;
272 write_unlock(&vmlist_lock); 274 write_unlock(&vmlist_lock);
273 275
274 return area; 276 return area;
@@ -284,7 +286,8 @@ out:
284struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 286struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
285 unsigned long start, unsigned long end) 287 unsigned long start, unsigned long end)
286{ 288{
287 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL); 289 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL,
290 __builtin_return_address(0));
288} 291}
289EXPORT_SYMBOL_GPL(__get_vm_area); 292EXPORT_SYMBOL_GPL(__get_vm_area);
290 293
@@ -299,14 +302,22 @@ EXPORT_SYMBOL_GPL(__get_vm_area);
299 */ 302 */
300struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 303struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
301{ 304{
302 return __get_vm_area(size, flags, VMALLOC_START, VMALLOC_END); 305 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
306 -1, GFP_KERNEL, __builtin_return_address(0));
307}
308
309struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
310 void *caller)
311{
312 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END,
313 -1, GFP_KERNEL, caller);
303} 314}
304 315
305struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, 316struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
306 int node, gfp_t gfp_mask) 317 int node, gfp_t gfp_mask)
307{ 318{
308 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, 319 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node,
309 gfp_mask); 320 gfp_mask, __builtin_return_address(0));
310} 321}
311 322
312/* Caller must hold vmlist_lock */ 323/* Caller must hold vmlist_lock */
@@ -455,9 +466,11 @@ void *vmap(struct page **pages, unsigned int count,
455 if (count > num_physpages) 466 if (count > num_physpages)
456 return NULL; 467 return NULL;
457 468
458 area = get_vm_area((count << PAGE_SHIFT), flags); 469 area = get_vm_area_caller((count << PAGE_SHIFT), flags,
470 __builtin_return_address(0));
459 if (!area) 471 if (!area)
460 return NULL; 472 return NULL;
473
461 if (map_vm_area(area, prot, &pages)) { 474 if (map_vm_area(area, prot, &pages)) {
462 vunmap(area->addr); 475 vunmap(area->addr);
463 return NULL; 476 return NULL;
@@ -468,7 +481,7 @@ void *vmap(struct page **pages, unsigned int count,
468EXPORT_SYMBOL(vmap); 481EXPORT_SYMBOL(vmap);
469 482
470static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 483static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
471 pgprot_t prot, int node) 484 pgprot_t prot, int node, void *caller)
472{ 485{
473 struct page **pages; 486 struct page **pages;
474 unsigned int nr_pages, array_size, i; 487 unsigned int nr_pages, array_size, i;
@@ -480,7 +493,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
480 /* Please note that the recursion is strictly bounded. */ 493 /* Please note that the recursion is strictly bounded. */
481 if (array_size > PAGE_SIZE) { 494 if (array_size > PAGE_SIZE) {
482 pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, 495 pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO,
483 PAGE_KERNEL, node); 496 PAGE_KERNEL, node, caller);
484 area->flags |= VM_VPAGES; 497 area->flags |= VM_VPAGES;
485 } else { 498 } else {
486 pages = kmalloc_node(array_size, 499 pages = kmalloc_node(array_size,
@@ -488,6 +501,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
488 node); 501 node);
489 } 502 }
490 area->pages = pages; 503 area->pages = pages;
504 area->caller = caller;
491 if (!area->pages) { 505 if (!area->pages) {
492 remove_vm_area(area->addr); 506 remove_vm_area(area->addr);
493 kfree(area); 507 kfree(area);
@@ -521,7 +535,8 @@ fail:
521 535
522void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) 536void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
523{ 537{
524 return __vmalloc_area_node(area, gfp_mask, prot, -1); 538 return __vmalloc_area_node(area, gfp_mask, prot, -1,
539 __builtin_return_address(0));
525} 540}
526 541
527/** 542/**
@@ -536,7 +551,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
536 * kernel virtual space, using a pagetable protection of @prot. 551 * kernel virtual space, using a pagetable protection of @prot.
537 */ 552 */
538static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 553static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
539 int node) 554 int node, void *caller)
540{ 555{
541 struct vm_struct *area; 556 struct vm_struct *area;
542 557
@@ -544,16 +559,19 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
544 if (!size || (size >> PAGE_SHIFT) > num_physpages) 559 if (!size || (size >> PAGE_SHIFT) > num_physpages)
545 return NULL; 560 return NULL;
546 561
547 area = get_vm_area_node(size, VM_ALLOC, node, gfp_mask); 562 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END,
563 node, gfp_mask, caller);
564
548 if (!area) 565 if (!area)
549 return NULL; 566 return NULL;
550 567
551 return __vmalloc_area_node(area, gfp_mask, prot, node); 568 return __vmalloc_area_node(area, gfp_mask, prot, node, caller);
552} 569}
553 570
554void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 571void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
555{ 572{
556 return __vmalloc_node(size, gfp_mask, prot, -1); 573 return __vmalloc_node(size, gfp_mask, prot, -1,
574 __builtin_return_address(0));
557} 575}
558EXPORT_SYMBOL(__vmalloc); 576EXPORT_SYMBOL(__vmalloc);
559 577
@@ -568,7 +586,8 @@ EXPORT_SYMBOL(__vmalloc);
568 */ 586 */
569void *vmalloc(unsigned long size) 587void *vmalloc(unsigned long size)
570{ 588{
571 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); 589 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
590 -1, __builtin_return_address(0));
572} 591}
573EXPORT_SYMBOL(vmalloc); 592EXPORT_SYMBOL(vmalloc);
574 593
@@ -608,7 +627,8 @@ EXPORT_SYMBOL(vmalloc_user);
608 */ 627 */
609void *vmalloc_node(unsigned long size, int node) 628void *vmalloc_node(unsigned long size, int node)
610{ 629{
611 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node); 630 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
631 node, __builtin_return_address(0));
612} 632}
613EXPORT_SYMBOL(vmalloc_node); 633EXPORT_SYMBOL(vmalloc_node);
614 634
@@ -843,7 +863,8 @@ struct vm_struct *alloc_vm_area(size_t size)
843{ 863{
844 struct vm_struct *area; 864 struct vm_struct *area;
845 865
846 area = get_vm_area(size, VM_IOREMAP); 866 area = get_vm_area_caller(size, VM_IOREMAP,
867 __builtin_return_address(0));
847 if (area == NULL) 868 if (area == NULL)
848 return NULL; 869 return NULL;
849 870
@@ -873,3 +894,85 @@ void free_vm_area(struct vm_struct *area)
873 kfree(area); 894 kfree(area);
874} 895}
875EXPORT_SYMBOL_GPL(free_vm_area); 896EXPORT_SYMBOL_GPL(free_vm_area);
897
898
899#ifdef CONFIG_PROC_FS
900static void *s_start(struct seq_file *m, loff_t *pos)
901{
902 loff_t n = *pos;
903 struct vm_struct *v;
904
905 read_lock(&vmlist_lock);
906 v = vmlist;
907 while (n > 0 && v) {
908 n--;
909 v = v->next;
910 }
911 if (!n)
912 return v;
913
914 return NULL;
915
916}
917
918static void *s_next(struct seq_file *m, void *p, loff_t *pos)
919{
920 struct vm_struct *v = p;
921
922 ++*pos;
923 return v->next;
924}
925
926static void s_stop(struct seq_file *m, void *p)
927{
928 read_unlock(&vmlist_lock);
929}
930
931static int s_show(struct seq_file *m, void *p)
932{
933 struct vm_struct *v = p;
934
935 seq_printf(m, "0x%p-0x%p %7ld",
936 v->addr, v->addr + v->size, v->size);
937
938 if (v->caller) {
939 char buff[2 * KSYM_NAME_LEN];
940
941 seq_putc(m, ' ');
942 sprint_symbol(buff, (unsigned long)v->caller);
943 seq_puts(m, buff);
944 }
945
946 if (v->nr_pages)
947 seq_printf(m, " pages=%d", v->nr_pages);
948
949 if (v->phys_addr)
950 seq_printf(m, " phys=%lx", v->phys_addr);
951
952 if (v->flags & VM_IOREMAP)
953 seq_printf(m, " ioremap");
954
955 if (v->flags & VM_ALLOC)
956 seq_printf(m, " vmalloc");
957
958 if (v->flags & VM_MAP)
959 seq_printf(m, " vmap");
960
961 if (v->flags & VM_USERMAP)
962 seq_printf(m, " user");
963
964 if (v->flags & VM_VPAGES)
965 seq_printf(m, " vpages");
966
967 seq_putc(m, '\n');
968 return 0;
969}
970
971const struct seq_operations vmalloc_op = {
972 .start = s_start,
973 .next = s_next,
974 .stop = s_stop,
975 .show = s_show,
976};
977#endif
978
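
The vmalloc changes thread a caller address through every allocation path
and record it in the vm_struct, and the seq_file operations above print it
next to each mapping (along with size, page count and flags), so vmalloc
usage can be attributed per call site once the seq_file is wired up to a
procfs entry (the file name is not part of this hunk). A wrapper that wants
its own caller, rather than itself, to show up would pass the address down
explicitly; the wrapper below is hypothetical:

	/* Record the wrapper's caller, not the wrapper, as the owner of
	 * the new vm area. */
	static struct vm_struct *my_map_region(unsigned long size)
	{
		return get_vm_area_caller(size, VM_IOREMAP,
					  __builtin_return_address(0));
	}
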
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f80a5b7c057f..eceac9f9032f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1246,17 +1246,16 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1246 * If a zone is deemed to be full of pinned pages then just give it a light 1246 * If a zone is deemed to be full of pinned pages then just give it a light
1247 * scan then give up on it. 1247 * scan then give up on it.
1248 */ 1248 */
1249static unsigned long shrink_zones(int priority, struct zone **zones, 1249static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1250 struct scan_control *sc) 1250 struct scan_control *sc)
1251{ 1251{
1252 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1252 unsigned long nr_reclaimed = 0; 1253 unsigned long nr_reclaimed = 0;
1253 int i; 1254 struct zoneref *z;
1254 1255 struct zone *zone;
1255 1256
1256 sc->all_unreclaimable = 1; 1257 sc->all_unreclaimable = 1;
1257 for (i = 0; zones[i] != NULL; i++) { 1258 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1258 struct zone *zone = zones[i];
1259
1260 if (!populated_zone(zone)) 1259 if (!populated_zone(zone))
1261 continue; 1260 continue;
1262 /* 1261 /*
@@ -1301,8 +1300,8 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
1301 * holds filesystem locks which prevent writeout this might not work, and the 1300 * holds filesystem locks which prevent writeout this might not work, and the
1302 * allocation attempt will fail. 1301 * allocation attempt will fail.
1303 */ 1302 */
1304static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, 1303static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1305 struct scan_control *sc) 1304 struct scan_control *sc)
1306{ 1305{
1307 int priority; 1306 int priority;
1308 int ret = 0; 1307 int ret = 0;
@@ -1310,7 +1309,9 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
1310 unsigned long nr_reclaimed = 0; 1309 unsigned long nr_reclaimed = 0;
1311 struct reclaim_state *reclaim_state = current->reclaim_state; 1310 struct reclaim_state *reclaim_state = current->reclaim_state;
1312 unsigned long lru_pages = 0; 1311 unsigned long lru_pages = 0;
1313 int i; 1312 struct zoneref *z;
1313 struct zone *zone;
1314 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1314 1315
1315 if (scan_global_lru(sc)) 1316 if (scan_global_lru(sc))
1316 count_vm_event(ALLOCSTALL); 1317 count_vm_event(ALLOCSTALL);
@@ -1318,8 +1319,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
1318 * mem_cgroup will not do shrink_slab. 1319 * mem_cgroup will not do shrink_slab.
1319 */ 1320 */
1320 if (scan_global_lru(sc)) { 1321 if (scan_global_lru(sc)) {
1321 for (i = 0; zones[i] != NULL; i++) { 1322 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1322 struct zone *zone = zones[i];
1323 1323
1324 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1324 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1325 continue; 1325 continue;
@@ -1333,13 +1333,13 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask,
1333 sc->nr_scanned = 0; 1333 sc->nr_scanned = 0;
1334 if (!priority) 1334 if (!priority)
1335 disable_swap_token(); 1335 disable_swap_token();
1336 nr_reclaimed += shrink_zones(priority, zones, sc); 1336 nr_reclaimed += shrink_zones(priority, zonelist, sc);
1337 /* 1337 /*
1338 * Don't shrink slabs when reclaiming memory from 1338 * Don't shrink slabs when reclaiming memory from
1339 * over limit cgroups 1339 * over limit cgroups
1340 */ 1340 */
1341 if (scan_global_lru(sc)) { 1341 if (scan_global_lru(sc)) {
1342 shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); 1342 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1343 if (reclaim_state) { 1343 if (reclaim_state) {
1344 nr_reclaimed += reclaim_state->reclaimed_slab; 1344 nr_reclaimed += reclaim_state->reclaimed_slab;
1345 reclaim_state->reclaimed_slab = 0; 1345 reclaim_state->reclaimed_slab = 0;
@@ -1383,8 +1383,7 @@ out:
1383 priority = 0; 1383 priority = 0;
1384 1384
1385 if (scan_global_lru(sc)) { 1385 if (scan_global_lru(sc)) {
1386 for (i = 0; zones[i] != NULL; i++) { 1386 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1387 struct zone *zone = zones[i];
1388 1387
1389 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1388 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1390 continue; 1389 continue;
@@ -1397,7 +1396,8 @@ out:
1397 return ret; 1396 return ret;
1398} 1397}
1399 1398
1400unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask) 1399unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1400 gfp_t gfp_mask)
1401{ 1401{
1402 struct scan_control sc = { 1402 struct scan_control sc = {
1403 .gfp_mask = gfp_mask, 1403 .gfp_mask = gfp_mask,
@@ -1410,7 +1410,7 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
1410 .isolate_pages = isolate_pages_global, 1410 .isolate_pages = isolate_pages_global,
1411 }; 1411 };
1412 1412
1413 return do_try_to_free_pages(zones, gfp_mask, &sc); 1413 return do_try_to_free_pages(zonelist, &sc);
1414} 1414}
1415 1415
1416#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1416#ifdef CONFIG_CGROUP_MEM_RES_CTLR
@@ -1419,7 +1419,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1419 gfp_t gfp_mask) 1419 gfp_t gfp_mask)
1420{ 1420{
1421 struct scan_control sc = { 1421 struct scan_control sc = {
1422 .gfp_mask = gfp_mask,
1423 .may_writepage = !laptop_mode, 1422 .may_writepage = !laptop_mode,
1424 .may_swap = 1, 1423 .may_swap = 1,
1425 .swap_cluster_max = SWAP_CLUSTER_MAX, 1424 .swap_cluster_max = SWAP_CLUSTER_MAX,
@@ -1428,13 +1427,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1428 .mem_cgroup = mem_cont, 1427 .mem_cgroup = mem_cont,
1429 .isolate_pages = mem_cgroup_isolate_pages, 1428 .isolate_pages = mem_cgroup_isolate_pages,
1430 }; 1429 };
1431 struct zone **zones; 1430 struct zonelist *zonelist;
1432 int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE);
1433 1431
1434 zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; 1432 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1435 if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) 1433 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1436 return 1; 1434 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1437 return 0; 1435 return do_try_to_free_pages(zonelist, &sc);
1438} 1436}
1439#endif 1437#endif
1440 1438
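
try_to_free_pages() now takes the zonelist itself instead of a bare zone
array, and do_try_to_free_pages() reads gfp_mask from the scan_control. A
minimal sketch of a direct-reclaim call under the new signature (the helper
is hypothetical; the real caller sits in the page allocator's slow path):

	static unsigned long reclaim_for_allocation(gfp_t gfp_mask, int order)
	{
		struct zonelist *zonelist;

		/* Reclaim against the local node's zonelist for these flags. */
		zonelist = node_zonelist(numa_node_id(), gfp_mask);
		return try_to_free_pages(zonelist, order, gfp_mask);
	}
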
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7c7286e9506d..ec6035eda933 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -322,6 +322,7 @@ void refresh_cpu_vm_stats(int cpu)
322 p->expire = 3; 322 p->expire = 3;
323#endif 323#endif
324 } 324 }
325 cond_resched();
325#ifdef CONFIG_NUMA 326#ifdef CONFIG_NUMA
326 /* 327 /*
327 * Deal with draining the remote pageset of this 328 * Deal with draining the remote pageset of this
@@ -364,13 +365,13 @@ void refresh_cpu_vm_stats(int cpu)
364 * 365 *
365 * Must be called with interrupts disabled. 366 * Must be called with interrupts disabled.
366 */ 367 */
367void zone_statistics(struct zonelist *zonelist, struct zone *z) 368void zone_statistics(struct zone *preferred_zone, struct zone *z)
368{ 369{
369 if (z->zone_pgdat == zonelist->zones[0]->zone_pgdat) { 370 if (z->zone_pgdat == preferred_zone->zone_pgdat) {
370 __inc_zone_state(z, NUMA_HIT); 371 __inc_zone_state(z, NUMA_HIT);
371 } else { 372 } else {
372 __inc_zone_state(z, NUMA_MISS); 373 __inc_zone_state(z, NUMA_MISS);
373 __inc_zone_state(zonelist->zones[0], NUMA_FOREIGN); 374 __inc_zone_state(preferred_zone, NUMA_FOREIGN);
374 } 375 }
375 if (z->node == numa_node_id()) 376 if (z->node == numa_node_id())
376 __inc_zone_state(z, NUMA_LOCAL); 377 __inc_zone_state(z, NUMA_LOCAL);
@@ -645,6 +646,10 @@ static const char * const vmstat_text[] = {
645 "allocstall", 646 "allocstall",
646 647
647 "pgrotated", 648 "pgrotated",
649#ifdef CONFIG_HUGETLB_PAGE
650 "htlb_buddy_alloc_success",
651 "htlb_buddy_alloc_fail",
652#endif
648#endif 653#endif
649}; 654};
650 655