author    Dave Kleikamp <shaggy@austin.ibm.com>  2006-01-24 15:34:47 -0500
committer Dave Kleikamp <shaggy@austin.ibm.com>  2006-01-24 15:34:47 -0500
commit    0a0fc0ddbe732779366ab6b1b879f62195e65967 (patch)
tree      7b42490a676cf39ae0691b6859ecf7fd410f229b /mm/page_alloc.c
parent    4d5dbd0945d9e0833dd7964a3d6ee33157f7cc7a (diff)
parent    3ee68c4af3fd7228c1be63254b9f884614f9ebb2 (diff)
Merge with /home/shaggy/git/linus-clean/
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 788
1 file changed, 450 insertions(+), 338 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 987225bdd661..df54e2fc8ee0 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
36#include <linux/memory_hotplug.h> 36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
@@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly;
52unsigned long totalram_pages __read_mostly; 53unsigned long totalram_pages __read_mostly;
53unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
54long nr_swap_pages; 55long nr_swap_pages;
56int percpu_pagelist_fraction;
57
58static void fastcall free_hot_cold_page(struct page *page, int cold);
55 59
56/* 60/*
57 * results with 256, 32 in the lowmem_reserve sysctl: 61 * results with 256, 32 in the lowmem_reserve sysctl:
@@ -60,8 +64,11 @@ long nr_swap_pages;
60 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 64 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
61 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 65 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
62 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 66 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
67 *
68 * TBD: should special case ZONE_DMA32 machines here - in those we normally
69 * don't need any ZONE_NORMAL reservation
63 */ 70 */
64int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 }; 71int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
65 72
66EXPORT_SYMBOL(totalram_pages); 73EXPORT_SYMBOL(totalram_pages);
67 74
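
The new { 256, 256, 32 } ratios add an entry for ZONE_DMA32. As a rough illustration of what these ratios mean, the userspace sketch below (not kernel code; zone sizes taken from the 1G example in the comment above, expressed in MB for readability, and without a DMA32 zone) reproduces the reserve figures the comment quotes:

/*
 * Userspace sketch: how lowmem_reserve ratios translate into reserved pages,
 * using the 1G example from the comment (16M DMA, 784M Normal, 224M HighMem).
 * Sizes are in MB purely for readability; the kernel works in pages.
 */
#include <stdio.h>

int main(void)
{
	const char *zone_names[] = { "DMA", "Normal", "HighMem" };
	unsigned long zone_mb[]  = { 16, 784, 224 };
	/* one ratio per lower zone, as in sysctl_lowmem_reserve_ratio */
	unsigned long ratio[]    = { 256, 32 };
	int nr_zones = 3;

	for (int lower = 0; lower < nr_zones - 1; lower++) {
		unsigned long upper_sum = 0;
		for (int upper = lower + 1; upper < nr_zones; upper++) {
			upper_sum += zone_mb[upper];
			printf("%s allocation reserves ~%luM in ZONE_%s (%luM/%lu)\n",
			       zone_names[upper], upper_sum / ratio[lower],
			       zone_names[lower], upper_sum, ratio[lower]);
		}
	}
	return 0;
}

Built with any C99 compiler, it prints the ~3M reserve in ZONE_DMA and the 7M reserve in ZONE_NORMAL that the comment mentions.
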
@@ -72,12 +79,13 @@ EXPORT_SYMBOL(totalram_pages);
72struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly; 79struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
73EXPORT_SYMBOL(zone_table); 80EXPORT_SYMBOL(zone_table);
74 81
75static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; 82static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
76int min_free_kbytes = 1024; 83int min_free_kbytes = 1024;
77 84
78unsigned long __initdata nr_kernel_pages; 85unsigned long __initdata nr_kernel_pages;
79unsigned long __initdata nr_all_pages; 86unsigned long __initdata nr_all_pages;
80 87
88#ifdef CONFIG_DEBUG_VM
81static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 89static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
82{ 90{
83 int ret = 0; 91 int ret = 0;
@@ -119,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page)
119 return 0; 127 return 0;
120} 128}
121 129
122static void bad_page(const char *function, struct page *page) 130#else
131static inline int bad_range(struct zone *zone, struct page *page)
132{
133 return 0;
134}
135#endif
136
137static void bad_page(struct page *page)
123{ 138{
124 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 139 printk(KERN_EMERG "Bad page state in process '%s'\n"
125 function, current->comm, page); 140 KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
126 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 141 KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
127 (int)(2*sizeof(page_flags_t)), (unsigned long)page->flags, 142 KERN_EMERG "Backtrace:\n",
128 page->mapping, page_mapcount(page), page_count(page)); 143 current->comm, page, (int)(2*sizeof(unsigned long)),
129 printk(KERN_EMERG "Backtrace:\n"); 144 (unsigned long)page->flags, page->mapping,
145 page_mapcount(page), page_count(page));
130 dump_stack(); 146 dump_stack();
131 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
132 page->flags &= ~(1 << PG_lru | 147 page->flags &= ~(1 << PG_lru |
133 1 << PG_private | 148 1 << PG_private |
134 1 << PG_locked | 149 1 << PG_locked |
@@ -137,18 +152,13 @@ static void bad_page(const char *function, struct page *page)
137 1 << PG_reclaim | 152 1 << PG_reclaim |
138 1 << PG_slab | 153 1 << PG_slab |
139 1 << PG_swapcache | 154 1 << PG_swapcache |
140 1 << PG_writeback | 155 1 << PG_writeback );
141 1 << PG_reserved );
142 set_page_count(page, 0); 156 set_page_count(page, 0);
143 reset_page_mapcount(page); 157 reset_page_mapcount(page);
144 page->mapping = NULL; 158 page->mapping = NULL;
145 add_taint(TAINT_BAD_PAGE); 159 add_taint(TAINT_BAD_PAGE);
146} 160}
147 161
148#ifndef CONFIG_HUGETLB_PAGE
149#define prep_compound_page(page, order) do { } while (0)
150#define destroy_compound_page(page, order) do { } while (0)
151#else
152/* 162/*
153 * Higher-order pages are called "compound pages". They are structured thusly: 163 * Higher-order pages are called "compound pages". They are structured thusly:
154 * 164 *
@@ -186,23 +196,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
186 int i; 196 int i;
187 int nr_pages = 1 << order; 197 int nr_pages = 1 << order;
188 198
189 if (!PageCompound(page)) 199 if (unlikely(page[1].index != order))
190 return; 200 bad_page(page);
191
192 if (page[1].index != order)
193 bad_page(__FUNCTION__, page);
194 201
195 for (i = 0; i < nr_pages; i++) { 202 for (i = 0; i < nr_pages; i++) {
196 struct page *p = page + i; 203 struct page *p = page + i;
197 204
198 if (!PageCompound(p)) 205 if (unlikely(!PageCompound(p) |
199 bad_page(__FUNCTION__, page); 206 (page_private(p) != (unsigned long)page)))
200 if (page_private(p) != (unsigned long)page) 207 bad_page(page);
201 bad_page(__FUNCTION__, page);
202 ClearPageCompound(p); 208 ClearPageCompound(p);
203 } 209 }
204} 210}
205#endif /* CONFIG_HUGETLB_PAGE */
206 211
207/* 212/*
208 * function for dealing with page's order in buddy system. 213 * function for dealing with page's order in buddy system.
@@ -258,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
258/* 263/*
259 * This function checks whether a page is free && is the buddy 264 * This function checks whether a page is free && is the buddy
260 * we can do coalesce a page and its buddy if 265 * we can do coalesce a page and its buddy if
261 * (a) the buddy is free && 266 * (a) the buddy is not in a hole &&
262 * (b) the buddy is on the buddy system && 267 * (b) the buddy is free &&
263 * (c) a page and its buddy have the same order. 268 * (c) the buddy is on the buddy system &&
269 * (d) a page and its buddy have the same order.
264 * for recording page's order, we use page_private(page) and PG_private. 270 * for recording page's order, we use page_private(page) and PG_private.
265 * 271 *
266 */ 272 */
267static inline int page_is_buddy(struct page *page, int order) 273static inline int page_is_buddy(struct page *page, int order)
268{ 274{
275#ifdef CONFIG_HOLES_IN_ZONE
276 if (!pfn_valid(page_to_pfn(page)))
277 return 0;
278#endif
279
269 if (PagePrivate(page) && 280 if (PagePrivate(page) &&
270 (page_order(page) == order) && 281 (page_order(page) == order) &&
271 page_count(page) == 0) 282 page_count(page) == 0)
@@ -297,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order)
297 * -- wli 308 * -- wli
298 */ 309 */
299 310
300static inline void __free_pages_bulk (struct page *page, 311static inline void __free_one_page(struct page *page,
301 struct zone *zone, unsigned int order) 312 struct zone *zone, unsigned int order)
302{ 313{
303 unsigned long page_idx; 314 unsigned long page_idx;
304 int order_size = 1 << order; 315 int order_size = 1 << order;
305 316
306 if (unlikely(order)) 317 if (unlikely(PageCompound(page)))
307 destroy_compound_page(page, order); 318 destroy_compound_page(page, order);
308 319
309 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 320 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -317,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page,
317 struct free_area *area; 328 struct free_area *area;
318 struct page *buddy; 329 struct page *buddy;
319 330
320 combined_idx = __find_combined_index(page_idx, order);
321 buddy = __page_find_buddy(page, page_idx, order); 331 buddy = __page_find_buddy(page, page_idx, order);
322
323 if (bad_range(zone, buddy))
324 break;
325 if (!page_is_buddy(buddy, order)) 332 if (!page_is_buddy(buddy, order))
326 break; /* Move the buddy up one level. */ 333 break; /* Move the buddy up one level. */
334
327 list_del(&buddy->lru); 335 list_del(&buddy->lru);
328 area = zone->free_area + order; 336 area = zone->free_area + order;
329 area->nr_free--; 337 area->nr_free--;
330 rmv_page_order(buddy); 338 rmv_page_order(buddy);
339 combined_idx = __find_combined_index(page_idx, order);
331 page = page + (combined_idx - page_idx); 340 page = page + (combined_idx - page_idx);
332 page_idx = combined_idx; 341 page_idx = combined_idx;
333 order++; 342 order++;
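
The coalescing loop above now computes combined_idx only after the buddy has passed page_is_buddy(), and the old bad_range() check is replaced by the pfn_valid() test inside page_is_buddy(). For reference, here is a standalone userspace sketch of the index arithmetic that __page_find_buddy() and __find_combined_index() rely on, assuming the usual buddy scheme (flip bit 'order' of the page index to find the buddy, clear it to find the start of the merged block):

#include <stdio.h>

static unsigned long buddy_index(unsigned long page_idx, unsigned int order)
{
	return page_idx ^ (1UL << order);	/* flip bit 'order' */
}

static unsigned long combined_index(unsigned long page_idx, unsigned int order)
{
	return page_idx & ~(1UL << order);	/* clear bit 'order' */
}

int main(void)
{
	unsigned long idx = 12;	/* block of 4 pages starting at page index 12 */
	unsigned int order = 2;

	printf("buddy of %lu at order %u is %lu, merged block starts at %lu\n",
	       idx, order, buddy_index(idx, order), combined_index(idx, order));
	/* prints: buddy of 12 at order 2 is 8, merged block starts at 8 */
	return 0;
}
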
@@ -337,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page,
337 zone->free_area[order].nr_free++; 346 zone->free_area[order].nr_free++;
338} 347}
339 348
340static inline void free_pages_check(const char *function, struct page *page) 349static inline int free_pages_check(struct page *page)
341{ 350{
342 if ( page_mapcount(page) || 351 if (unlikely(page_mapcount(page) |
343 page->mapping != NULL || 352 (page->mapping != NULL) |
344 page_count(page) != 0 || 353 (page_count(page) != 0) |
345 (page->flags & ( 354 (page->flags & (
346 1 << PG_lru | 355 1 << PG_lru |
347 1 << PG_private | 356 1 << PG_private |
@@ -351,10 +360,16 @@ static inline void free_pages_check(const char *function, struct page *page)
351 1 << PG_slab | 360 1 << PG_slab |
352 1 << PG_swapcache | 361 1 << PG_swapcache |
353 1 << PG_writeback | 362 1 << PG_writeback |
354 1 << PG_reserved ))) 363 1 << PG_reserved ))))
355 bad_page(function, page); 364 bad_page(page);
356 if (PageDirty(page)) 365 if (PageDirty(page))
357 __ClearPageDirty(page); 366 __ClearPageDirty(page);
367 /*
368 * For now, we report if PG_reserved was found set, but do not
369 * clear it, and do not free the page. But we shall soon need
370 * to do more, for when the ZERO_PAGE count wraps negative.
371 */
372 return PageReserved(page);
358} 373}
359 374
360/* 375/*
@@ -368,48 +383,90 @@ static inline void free_pages_check(const char *function, struct page *page)
368 * And clear the zone's pages_scanned counter, to hold off the "all pages are 383 * And clear the zone's pages_scanned counter, to hold off the "all pages are
369 * pinned" detection logic. 384 * pinned" detection logic.
370 */ 385 */
371static int 386static void free_pages_bulk(struct zone *zone, int count,
372free_pages_bulk(struct zone *zone, int count, 387 struct list_head *list, int order)
373 struct list_head *list, unsigned int order)
374{ 388{
375 unsigned long flags; 389 spin_lock(&zone->lock);
376 struct page *page = NULL;
377 int ret = 0;
378
379 spin_lock_irqsave(&zone->lock, flags);
380 zone->all_unreclaimable = 0; 390 zone->all_unreclaimable = 0;
381 zone->pages_scanned = 0; 391 zone->pages_scanned = 0;
382 while (!list_empty(list) && count--) { 392 while (count--) {
393 struct page *page;
394
395 BUG_ON(list_empty(list));
383 page = list_entry(list->prev, struct page, lru); 396 page = list_entry(list->prev, struct page, lru);
384 /* have to delete it as __free_pages_bulk list manipulates */ 397 /* have to delete it as __free_one_page list manipulates */
385 list_del(&page->lru); 398 list_del(&page->lru);
386 __free_pages_bulk(page, zone, order); 399 __free_one_page(page, zone, order);
387 ret++;
388 } 400 }
389 spin_unlock_irqrestore(&zone->lock, flags); 401 spin_unlock(&zone->lock);
390 return ret;
391} 402}
392 403
393void __free_pages_ok(struct page *page, unsigned int order) 404static void free_one_page(struct zone *zone, struct page *page, int order)
394{ 405{
395 LIST_HEAD(list); 406 LIST_HEAD(list);
407 list_add(&page->lru, &list);
408 free_pages_bulk(zone, 1, &list, order);
409}
410
411static void __free_pages_ok(struct page *page, unsigned int order)
412{
413 unsigned long flags;
396 int i; 414 int i;
415 int reserved = 0;
397 416
398 arch_free_page(page, order); 417 arch_free_page(page, order);
399 418 if (!PageHighMem(page))
400 mod_page_state(pgfree, 1 << order); 419 mutex_debug_check_no_locks_freed(page_address(page),
420 PAGE_SIZE<<order);
401 421
402#ifndef CONFIG_MMU 422#ifndef CONFIG_MMU
403 if (order > 0) 423 for (i = 1 ; i < (1 << order) ; ++i)
404 for (i = 1 ; i < (1 << order) ; ++i) 424 __put_page(page + i);
405 __put_page(page + i);
406#endif 425#endif
407 426
408 for (i = 0 ; i < (1 << order) ; ++i) 427 for (i = 0 ; i < (1 << order) ; ++i)
409 free_pages_check(__FUNCTION__, page + i); 428 reserved += free_pages_check(page + i);
410 list_add(&page->lru, &list); 429 if (reserved)
411 kernel_map_pages(page, 1<<order, 0); 430 return;
412 free_pages_bulk(page_zone(page), 1, &list, order); 431
432 kernel_map_pages(page, 1 << order, 0);
433 local_irq_save(flags);
434 __mod_page_state(pgfree, 1 << order);
435 free_one_page(page_zone(page), page, order);
436 local_irq_restore(flags);
437}
438
439/*
440 * permit the bootmem allocator to evade page validation on high-order frees
441 */
442void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
443{
444 if (order == 0) {
445 __ClearPageReserved(page);
446 set_page_count(page, 0);
447
448 free_hot_cold_page(page, 0);
449 } else {
450 LIST_HEAD(list);
451 int loop;
452
453 for (loop = 0; loop < BITS_PER_LONG; loop++) {
454 struct page *p = &page[loop];
455
456 if (loop + 16 < BITS_PER_LONG)
457 prefetchw(p + 16);
458 __ClearPageReserved(p);
459 set_page_count(p, 0);
460 }
461
462 arch_free_page(page, order);
463
464 mod_page_state(pgfree, 1 << order);
465
466 list_add(&page->lru, &list);
467 kernel_map_pages(page, 1 << order, 0);
468 free_pages_bulk(page_zone(page), 1, &list, order);
469 }
413} 470}
414 471
415 472
@@ -427,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
427 * 484 *
428 * -- wli 485 * -- wli
429 */ 486 */
430static inline struct page * 487static inline void expand(struct zone *zone, struct page *page,
431expand(struct zone *zone, struct page *page,
432 int low, int high, struct free_area *area) 488 int low, int high, struct free_area *area)
433{ 489{
434 unsigned long size = 1 << high; 490 unsigned long size = 1 << high;
@@ -442,34 +498,16 @@ expand(struct zone *zone, struct page *page,
442 area->nr_free++; 498 area->nr_free++;
443 set_page_order(&page[size], high); 499 set_page_order(&page[size], high);
444 } 500 }
445 return page;
446}
447
448void set_page_refs(struct page *page, int order)
449{
450#ifdef CONFIG_MMU
451 set_page_count(page, 1);
452#else
453 int i;
454
455 /*
456 * We need to reference all the pages for this order, otherwise if
457 * anyone accesses one of the pages with (get/put) it will be freed.
458 * - eg: access_process_vm()
459 */
460 for (i = 0; i < (1 << order); i++)
461 set_page_count(page + i, 1);
462#endif /* CONFIG_MMU */
463} 501}
464 502
465/* 503/*
466 * This page is about to be returned from the page allocator 504 * This page is about to be returned from the page allocator
467 */ 505 */
468static void prep_new_page(struct page *page, int order) 506static int prep_new_page(struct page *page, int order)
469{ 507{
470 if ( page_mapcount(page) || 508 if (unlikely(page_mapcount(page) |
471 page->mapping != NULL || 509 (page->mapping != NULL) |
472 page_count(page) != 0 || 510 (page_count(page) != 0) |
473 (page->flags & ( 511 (page->flags & (
474 1 << PG_lru | 512 1 << PG_lru |
475 1 << PG_private | 513 1 << PG_private |
@@ -480,8 +518,15 @@ static void prep_new_page(struct page *page, int order)
480 1 << PG_slab | 518 1 << PG_slab |
481 1 << PG_swapcache | 519 1 << PG_swapcache |
482 1 << PG_writeback | 520 1 << PG_writeback |
483 1 << PG_reserved ))) 521 1 << PG_reserved ))))
484 bad_page(__FUNCTION__, page); 522 bad_page(page);
523
524 /*
525 * For now, we report if PG_reserved was found set, but do not
526 * clear it, and do not allocate the page: as a safety net.
527 */
528 if (PageReserved(page))
529 return 1;
485 530
486 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 531 page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
487 1 << PG_referenced | 1 << PG_arch_1 | 532 1 << PG_referenced | 1 << PG_arch_1 |
@@ -489,6 +534,7 @@ static void prep_new_page(struct page *page, int order)
489 set_page_private(page, 0); 534 set_page_private(page, 0);
490 set_page_refs(page, order); 535 set_page_refs(page, order);
491 kernel_map_pages(page, 1 << order, 1); 536 kernel_map_pages(page, 1 << order, 1);
537 return 0;
492} 538}
493 539
494/* 540/*
@@ -511,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
511 rmv_page_order(page); 557 rmv_page_order(page);
512 area->nr_free--; 558 area->nr_free--;
513 zone->free_pages -= 1UL << order; 559 zone->free_pages -= 1UL << order;
514 return expand(zone, page, order, current_order, area); 560 expand(zone, page, order, current_order, area);
561 return page;
515 } 562 }
516 563
517 return NULL; 564 return NULL;
@@ -525,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
525static int rmqueue_bulk(struct zone *zone, unsigned int order, 572static int rmqueue_bulk(struct zone *zone, unsigned int order,
526 unsigned long count, struct list_head *list) 573 unsigned long count, struct list_head *list)
527{ 574{
528 unsigned long flags;
529 int i; 575 int i;
530 int allocated = 0;
531 struct page *page;
532 576
533 spin_lock_irqsave(&zone->lock, flags); 577 spin_lock(&zone->lock);
534 for (i = 0; i < count; ++i) { 578 for (i = 0; i < count; ++i) {
535 page = __rmqueue(zone, order); 579 struct page *page = __rmqueue(zone, order);
536 if (page == NULL) 580 if (unlikely(page == NULL))
537 break; 581 break;
538 allocated++;
539 list_add_tail(&page->lru, list); 582 list_add_tail(&page->lru, list);
540 } 583 }
541 spin_unlock_irqrestore(&zone->lock, flags); 584 spin_unlock(&zone->lock);
542 return allocated; 585 return i;
543} 586}
544 587
545#ifdef CONFIG_NUMA 588#ifdef CONFIG_NUMA
@@ -558,14 +601,13 @@ void drain_remote_pages(void)
558 if (zone->zone_pgdat->node_id == numa_node_id()) 601 if (zone->zone_pgdat->node_id == numa_node_id())
559 continue; 602 continue;
560 603
561 pset = zone->pageset[smp_processor_id()]; 604 pset = zone_pcp(zone, smp_processor_id());
562 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { 605 for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
563 struct per_cpu_pages *pcp; 606 struct per_cpu_pages *pcp;
564 607
565 pcp = &pset->pcp[i]; 608 pcp = &pset->pcp[i];
566 if (pcp->count) 609 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
567 pcp->count -= free_pages_bulk(zone, pcp->count, 610 pcp->count = 0;
568 &pcp->list, 0);
569 } 611 }
570 } 612 }
571 local_irq_restore(flags); 613 local_irq_restore(flags);
@@ -575,6 +617,7 @@ void drain_remote_pages(void)
575#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 617#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
576static void __drain_pages(unsigned int cpu) 618static void __drain_pages(unsigned int cpu)
577{ 619{
620 unsigned long flags;
578 struct zone *zone; 621 struct zone *zone;
579 int i; 622 int i;
580 623
@@ -586,8 +629,10 @@ static void __drain_pages(unsigned int cpu)
586 struct per_cpu_pages *pcp; 629 struct per_cpu_pages *pcp;
587 630
588 pcp = &pset->pcp[i]; 631 pcp = &pset->pcp[i];
589 pcp->count -= free_pages_bulk(zone, pcp->count, 632 local_irq_save(flags);
590 &pcp->list, 0); 633 free_pages_bulk(zone, pcp->count, &pcp->list, 0);
634 pcp->count = 0;
635 local_irq_restore(flags);
591 } 636 }
592 } 637 }
593} 638}
@@ -633,18 +678,14 @@ void drain_local_pages(void)
633} 678}
634#endif /* CONFIG_PM */ 679#endif /* CONFIG_PM */
635 680
636static void zone_statistics(struct zonelist *zonelist, struct zone *z) 681static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
637{ 682{
638#ifdef CONFIG_NUMA 683#ifdef CONFIG_NUMA
639 unsigned long flags;
640 int cpu;
641 pg_data_t *pg = z->zone_pgdat; 684 pg_data_t *pg = z->zone_pgdat;
642 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 685 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
643 struct per_cpu_pageset *p; 686 struct per_cpu_pageset *p;
644 687
645 local_irq_save(flags); 688 p = zone_pcp(z, cpu);
646 cpu = smp_processor_id();
647 p = zone_pcp(z,cpu);
648 if (pg == orig) { 689 if (pg == orig) {
649 p->numa_hit++; 690 p->numa_hit++;
650 } else { 691 } else {
@@ -655,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
655 p->local_node++; 696 p->local_node++;
656 else 697 else
657 p->other_node++; 698 p->other_node++;
658 local_irq_restore(flags);
659#endif 699#endif
660} 700}
661 701
662/* 702/*
663 * Free a 0-order page 703 * Free a 0-order page
664 */ 704 */
665static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
666static void fastcall free_hot_cold_page(struct page *page, int cold) 705static void fastcall free_hot_cold_page(struct page *page, int cold)
667{ 706{
668 struct zone *zone = page_zone(page); 707 struct zone *zone = page_zone(page);
@@ -671,17 +710,22 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
671 710
672 arch_free_page(page, 0); 711 arch_free_page(page, 0);
673 712
674 kernel_map_pages(page, 1, 0);
675 inc_page_state(pgfree);
676 if (PageAnon(page)) 713 if (PageAnon(page))
677 page->mapping = NULL; 714 page->mapping = NULL;
678 free_pages_check(__FUNCTION__, page); 715 if (free_pages_check(page))
716 return;
717
718 kernel_map_pages(page, 1, 0);
719
679 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 720 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
680 local_irq_save(flags); 721 local_irq_save(flags);
722 __inc_page_state(pgfree);
681 list_add(&page->lru, &pcp->list); 723 list_add(&page->lru, &pcp->list);
682 pcp->count++; 724 pcp->count++;
683 if (pcp->count >= pcp->high) 725 if (pcp->count >= pcp->high) {
684 pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 726 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
727 pcp->count -= pcp->batch;
728 }
685 local_irq_restore(flags); 729 local_irq_restore(flags);
686 put_cpu(); 730 put_cpu();
687} 731}
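
With the pcp 'low' watermark gone, the hot and cold lists are only trimmed once they reach 'high', and then by exactly one 'batch'. A userspace sketch of that behaviour, using the high = 6 * batch default that setup_pageset() applies further down; the batch value itself is made up:

#include <stdio.h>

int main(void)
{
	int batch = 16, high = 6 * batch;
	int count = 0;

	for (int freed = 1; freed <= 200; freed++) {
		count++;			/* free_hot_cold_page: list_add + count++ */
		if (count >= high) {
			count -= batch;		/* free_pages_bulk(zone, batch, ...) */
			printf("after %3d frees: flushed %d pages, %d left on pcp\n",
			       freed, batch, count);
		}
	}
	return 0;
}
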
@@ -710,64 +754,82 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
710 * we cheat by calling it from here, in the order > 0 path. Saves a branch 754 * we cheat by calling it from here, in the order > 0 path. Saves a branch
711 * or two. 755 * or two.
712 */ 756 */
713static struct page * 757static struct page *buffered_rmqueue(struct zonelist *zonelist,
714buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 758 struct zone *zone, int order, gfp_t gfp_flags)
715{ 759{
716 unsigned long flags; 760 unsigned long flags;
717 struct page *page = NULL; 761 struct page *page;
718 int cold = !!(gfp_flags & __GFP_COLD); 762 int cold = !!(gfp_flags & __GFP_COLD);
763 int cpu;
719 764
720 if (order == 0) { 765again:
766 cpu = get_cpu();
767 if (likely(order == 0)) {
721 struct per_cpu_pages *pcp; 768 struct per_cpu_pages *pcp;
722 769
723 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 770 pcp = &zone_pcp(zone, cpu)->pcp[cold];
724 local_irq_save(flags); 771 local_irq_save(flags);
725 if (pcp->count <= pcp->low) 772 if (!pcp->count) {
726 pcp->count += rmqueue_bulk(zone, 0, 773 pcp->count += rmqueue_bulk(zone, 0,
727 pcp->batch, &pcp->list); 774 pcp->batch, &pcp->list);
728 if (pcp->count) { 775 if (unlikely(!pcp->count))
729 page = list_entry(pcp->list.next, struct page, lru); 776 goto failed;
730 list_del(&page->lru);
731 pcp->count--;
732 } 777 }
733 local_irq_restore(flags); 778 page = list_entry(pcp->list.next, struct page, lru);
734 put_cpu(); 779 list_del(&page->lru);
735 } 780 pcp->count--;
736 781 } else {
737 if (page == NULL) {
738 spin_lock_irqsave(&zone->lock, flags); 782 spin_lock_irqsave(&zone->lock, flags);
739 page = __rmqueue(zone, order); 783 page = __rmqueue(zone, order);
740 spin_unlock_irqrestore(&zone->lock, flags); 784 spin_unlock(&zone->lock);
785 if (!page)
786 goto failed;
741 } 787 }
742 788
743 if (page != NULL) { 789 __mod_page_state_zone(zone, pgalloc, 1 << order);
744 BUG_ON(bad_range(zone, page)); 790 zone_statistics(zonelist, zone, cpu);
745 mod_page_state_zone(zone, pgalloc, 1 << order); 791 local_irq_restore(flags);
746 prep_new_page(page, order); 792 put_cpu();
793
794 BUG_ON(bad_range(zone, page));
795 if (prep_new_page(page, order))
796 goto again;
747 797
748 if (gfp_flags & __GFP_ZERO) 798 if (gfp_flags & __GFP_ZERO)
749 prep_zero_page(page, order, gfp_flags); 799 prep_zero_page(page, order, gfp_flags);
750 800
751 if (order && (gfp_flags & __GFP_COMP)) 801 if (order && (gfp_flags & __GFP_COMP))
752 prep_compound_page(page, order); 802 prep_compound_page(page, order);
753 }
754 return page; 803 return page;
804
805failed:
806 local_irq_restore(flags);
807 put_cpu();
808 return NULL;
755} 809}
756 810
811#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
812#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */
813#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */
814#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */
815#define ALLOC_HARDER 0x10 /* try to alloc harder */
816#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
817#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
818
757/* 819/*
758 * Return 1 if free pages are above 'mark'. This takes into account the order 820 * Return 1 if free pages are above 'mark'. This takes into account the order
759 * of the allocation. 821 * of the allocation.
760 */ 822 */
761int zone_watermark_ok(struct zone *z, int order, unsigned long mark, 823int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
762 int classzone_idx, int can_try_harder, gfp_t gfp_high) 824 int classzone_idx, int alloc_flags)
763{ 825{
764 /* free_pages my go negative - that's OK */ 826 /* free_pages my go negative - that's OK */
765 long min = mark, free_pages = z->free_pages - (1 << order) + 1; 827 long min = mark, free_pages = z->free_pages - (1 << order) + 1;
766 int o; 828 int o;
767 829
768 if (gfp_high) 830 if (alloc_flags & ALLOC_HIGH)
769 min -= min / 2; 831 min -= min / 2;
770 if (can_try_harder) 832 if (alloc_flags & ALLOC_HARDER)
771 min -= min / 4; 833 min -= min / 4;
772 834
773 if (free_pages <= min + z->lowmem_reserve[classzone_idx]) 835 if (free_pages <= min + z->lowmem_reserve[classzone_idx])
@@ -785,14 +847,48 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
785 return 1; 847 return 1;
786} 848}
787 849
788static inline int 850/*
789should_reclaim_zone(struct zone *z, gfp_t gfp_mask) 851 * get_page_from_freeliest goes through the zonelist trying to allocate
852 * a page.
853 */
854static struct page *
855get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
856 struct zonelist *zonelist, int alloc_flags)
790{ 857{
791 if (!z->reclaim_pages) 858 struct zone **z = zonelist->zones;
792 return 0; 859 struct page *page = NULL;
793 if (gfp_mask & __GFP_NORECLAIM) 860 int classzone_idx = zone_idx(*z);
794 return 0; 861
795 return 1; 862 /*
863 * Go through the zonelist once, looking for a zone with enough free.
864 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
865 */
866 do {
867 if ((alloc_flags & ALLOC_CPUSET) &&
868 !cpuset_zone_allowed(*z, gfp_mask))
869 continue;
870
871 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
872 unsigned long mark;
873 if (alloc_flags & ALLOC_WMARK_MIN)
874 mark = (*z)->pages_min;
875 else if (alloc_flags & ALLOC_WMARK_LOW)
876 mark = (*z)->pages_low;
877 else
878 mark = (*z)->pages_high;
879 if (!zone_watermark_ok(*z, order, mark,
880 classzone_idx, alloc_flags))
881 if (!zone_reclaim_mode ||
882 !zone_reclaim(*z, gfp_mask, order))
883 continue;
884 }
885
886 page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
887 if (page) {
888 break;
889 }
890 } while (*(++z) != NULL);
891 return page;
796} 892}
797 893
798/* 894/*
@@ -803,105 +899,76 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
803 struct zonelist *zonelist) 899 struct zonelist *zonelist)
804{ 900{
805 const gfp_t wait = gfp_mask & __GFP_WAIT; 901 const gfp_t wait = gfp_mask & __GFP_WAIT;
806 struct zone **zones, *z; 902 struct zone **z;
807 struct page *page; 903 struct page *page;
808 struct reclaim_state reclaim_state; 904 struct reclaim_state reclaim_state;
809 struct task_struct *p = current; 905 struct task_struct *p = current;
810 int i;
811 int classzone_idx;
812 int do_retry; 906 int do_retry;
813 int can_try_harder; 907 int alloc_flags;
814 int did_some_progress; 908 int did_some_progress;
815 909
816 might_sleep_if(wait); 910 might_sleep_if(wait);
817 911
818 /* 912restart:
819 * The caller may dip into page reserves a bit more if the caller 913 z = zonelist->zones; /* the list of zones suitable for gfp_mask */
820 * cannot run direct reclaim, or is the caller has realtime scheduling
821 * policy
822 */
823 can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) || !wait;
824
825 zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
826 914
827 if (unlikely(zones[0] == NULL)) { 915 if (unlikely(*z == NULL)) {
828 /* Should this ever happen?? */ 916 /* Should this ever happen?? */
829 return NULL; 917 return NULL;
830 } 918 }
831 919
832 classzone_idx = zone_idx(zones[0]); 920 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
921 zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
922 if (page)
923 goto got_pg;
924
925 do {
926 wakeup_kswapd(*z, order);
927 } while (*(++z));
833 928
834restart:
835 /* 929 /*
836 * Go through the zonelist once, looking for a zone with enough free. 930 * OK, we're below the kswapd watermark and have kicked background
837 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 931 * reclaim. Now things get more complex, so set up alloc_flags according
932 * to how we want to proceed.
933 *
934 * The caller may dip into page reserves a bit more if the caller
935 * cannot run direct reclaim, or if the caller has realtime scheduling
936 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
937 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
838 */ 938 */
839 for (i = 0; (z = zones[i]) != NULL; i++) { 939 alloc_flags = ALLOC_WMARK_MIN;
840 int do_reclaim = should_reclaim_zone(z, gfp_mask); 940 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
841 941 alloc_flags |= ALLOC_HARDER;
842 if (!cpuset_zone_allowed(z, __GFP_HARDWALL)) 942 if (gfp_mask & __GFP_HIGH)
843 continue; 943 alloc_flags |= ALLOC_HIGH;
844 944 alloc_flags |= ALLOC_CPUSET;
845 /*
846 * If the zone is to attempt early page reclaim then this loop
847 * will try to reclaim pages and check the watermark a second
848 * time before giving up and falling back to the next zone.
849 */
850zone_reclaim_retry:
851 if (!zone_watermark_ok(z, order, z->pages_low,
852 classzone_idx, 0, 0)) {
853 if (!do_reclaim)
854 continue;
855 else {
856 zone_reclaim(z, gfp_mask, order);
857 /* Only try reclaim once */
858 do_reclaim = 0;
859 goto zone_reclaim_retry;
860 }
861 }
862
863 page = buffered_rmqueue(z, order, gfp_mask);
864 if (page)
865 goto got_pg;
866 }
867
868 for (i = 0; (z = zones[i]) != NULL; i++)
869 wakeup_kswapd(z, order);
870 945
871 /* 946 /*
872 * Go through the zonelist again. Let __GFP_HIGH and allocations 947 * Go through the zonelist again. Let __GFP_HIGH and allocations
873 * coming from realtime tasks to go deeper into reserves 948 * coming from realtime tasks go deeper into reserves.
874 * 949 *
875 * This is the last chance, in general, before the goto nopage. 950 * This is the last chance, in general, before the goto nopage.
876 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. 951 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
877 * See also cpuset_zone_allowed() comment in kernel/cpuset.c. 952 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
878 */ 953 */
879 for (i = 0; (z = zones[i]) != NULL; i++) { 954 page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
880 if (!zone_watermark_ok(z, order, z->pages_min, 955 if (page)
881 classzone_idx, can_try_harder, 956 goto got_pg;
882 gfp_mask & __GFP_HIGH))
883 continue;
884
885 if (wait && !cpuset_zone_allowed(z, gfp_mask))
886 continue;
887
888 page = buffered_rmqueue(z, order, gfp_mask);
889 if (page)
890 goto got_pg;
891 }
892 957
893 /* This allocation should allow future memory freeing. */ 958 /* This allocation should allow future memory freeing. */
894 959
895 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 960 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
896 && !in_interrupt()) { 961 && !in_interrupt()) {
897 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 962 if (!(gfp_mask & __GFP_NOMEMALLOC)) {
963nofail_alloc:
898 /* go through the zonelist yet again, ignoring mins */ 964 /* go through the zonelist yet again, ignoring mins */
899 for (i = 0; (z = zones[i]) != NULL; i++) { 965 page = get_page_from_freelist(gfp_mask, order,
900 if (!cpuset_zone_allowed(z, gfp_mask)) 966 zonelist, ALLOC_NO_WATERMARKS);
901 continue; 967 if (page)
902 page = buffered_rmqueue(z, order, gfp_mask); 968 goto got_pg;
903 if (page) 969 if (gfp_mask & __GFP_NOFAIL) {
904 goto got_pg; 970 blk_congestion_wait(WRITE, HZ/50);
971 goto nofail_alloc;
905 } 972 }
906 } 973 }
907 goto nopage; 974 goto nopage;
@@ -915,11 +982,12 @@ rebalance:
915 cond_resched(); 982 cond_resched();
916 983
917 /* We now go into synchronous reclaim */ 984 /* We now go into synchronous reclaim */
985 cpuset_memory_pressure_bump();
918 p->flags |= PF_MEMALLOC; 986 p->flags |= PF_MEMALLOC;
919 reclaim_state.reclaimed_slab = 0; 987 reclaim_state.reclaimed_slab = 0;
920 p->reclaim_state = &reclaim_state; 988 p->reclaim_state = &reclaim_state;
921 989
922 did_some_progress = try_to_free_pages(zones, gfp_mask); 990 did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
923 991
924 p->reclaim_state = NULL; 992 p->reclaim_state = NULL;
925 p->flags &= ~PF_MEMALLOC; 993 p->flags &= ~PF_MEMALLOC;
@@ -927,19 +995,10 @@ rebalance:
927 cond_resched(); 995 cond_resched();
928 996
929 if (likely(did_some_progress)) { 997 if (likely(did_some_progress)) {
930 for (i = 0; (z = zones[i]) != NULL; i++) { 998 page = get_page_from_freelist(gfp_mask, order,
931 if (!zone_watermark_ok(z, order, z->pages_min, 999 zonelist, alloc_flags);
932 classzone_idx, can_try_harder, 1000 if (page)
933 gfp_mask & __GFP_HIGH)) 1001 goto got_pg;
934 continue;
935
936 if (!cpuset_zone_allowed(z, gfp_mask))
937 continue;
938
939 page = buffered_rmqueue(z, order, gfp_mask);
940 if (page)
941 goto got_pg;
942 }
943 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { 1002 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
944 /* 1003 /*
945 * Go through the zonelist yet one more time, keep 1004 * Go through the zonelist yet one more time, keep
@@ -947,18 +1006,10 @@ rebalance:
947 * a parallel oom killing, we must fail if we're still 1006 * a parallel oom killing, we must fail if we're still
948 * under heavy pressure. 1007 * under heavy pressure.
949 */ 1008 */
950 for (i = 0; (z = zones[i]) != NULL; i++) { 1009 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
951 if (!zone_watermark_ok(z, order, z->pages_high, 1010 zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
952 classzone_idx, 0, 0)) 1011 if (page)
953 continue; 1012 goto got_pg;
954
955 if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
956 continue;
957
958 page = buffered_rmqueue(z, order, gfp_mask);
959 if (page)
960 goto got_pg;
961 }
962 1013
963 out_of_memory(gfp_mask, order); 1014 out_of_memory(gfp_mask, order);
964 goto restart; 1015 goto restart;
@@ -991,9 +1042,7 @@ nopage:
991 dump_stack(); 1042 dump_stack();
992 show_mem(); 1043 show_mem();
993 } 1044 }
994 return NULL;
995got_pg: 1045got_pg:
996 zone_statistics(zonelist, z);
997 return page; 1046 return page;
998} 1047}
999 1048
@@ -1160,7 +1209,7 @@ EXPORT_SYMBOL(nr_pagecache);
1160DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1209DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1161#endif 1210#endif
1162 1211
1163void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1212static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1164{ 1213{
1165 int cpu = 0; 1214 int cpu = 0;
1166 1215
@@ -1213,7 +1262,7 @@ void get_full_page_state(struct page_state *ret)
1213 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1262 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1214} 1263}
1215 1264
1216unsigned long __read_page_state(unsigned long offset) 1265unsigned long read_page_state_offset(unsigned long offset)
1217{ 1266{
1218 unsigned long ret = 0; 1267 unsigned long ret = 0;
1219 int cpu; 1268 int cpu;
@@ -1227,18 +1276,26 @@ unsigned long __read_page_state(unsigned long offset)
1227 return ret; 1276 return ret;
1228} 1277}
1229 1278
1230void __mod_page_state(unsigned long offset, unsigned long delta) 1279void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1280{
1281 void *ptr;
1282
1283 ptr = &__get_cpu_var(page_states);
1284 *(unsigned long *)(ptr + offset) += delta;
1285}
1286EXPORT_SYMBOL(__mod_page_state_offset);
1287
1288void mod_page_state_offset(unsigned long offset, unsigned long delta)
1231{ 1289{
1232 unsigned long flags; 1290 unsigned long flags;
1233 void* ptr; 1291 void *ptr;
1234 1292
1235 local_irq_save(flags); 1293 local_irq_save(flags);
1236 ptr = &__get_cpu_var(page_states); 1294 ptr = &__get_cpu_var(page_states);
1237 *(unsigned long*)(ptr + offset) += delta; 1295 *(unsigned long *)(ptr + offset) += delta;
1238 local_irq_restore(flags); 1296 local_irq_restore(flags);
1239} 1297}
1240 1298EXPORT_SYMBOL(mod_page_state_offset);
1241EXPORT_SYMBOL(__mod_page_state);
1242 1299
1243void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1300void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1244 unsigned long *free, struct pglist_data *pgdat) 1301 unsigned long *free, struct pglist_data *pgdat)
@@ -1324,7 +1381,7 @@ void show_free_areas(void)
1324 show_node(zone); 1381 show_node(zone);
1325 printk("%s per-cpu:", zone->name); 1382 printk("%s per-cpu:", zone->name);
1326 1383
1327 if (!zone->present_pages) { 1384 if (!populated_zone(zone)) {
1328 printk(" empty\n"); 1385 printk(" empty\n");
1329 continue; 1386 continue;
1330 } else 1387 } else
@@ -1336,10 +1393,9 @@ void show_free_areas(void)
1336 pageset = zone_pcp(zone, cpu); 1393 pageset = zone_pcp(zone, cpu);
1337 1394
1338 for (temperature = 0; temperature < 2; temperature++) 1395 for (temperature = 0; temperature < 2; temperature++)
1339 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1396 printk("cpu %d %s: high %d, batch %d used:%d\n",
1340 cpu, 1397 cpu,
1341 temperature ? "cold" : "hot", 1398 temperature ? "cold" : "hot",
1342 pageset->pcp[temperature].low,
1343 pageset->pcp[temperature].high, 1399 pageset->pcp[temperature].high,
1344 pageset->pcp[temperature].batch, 1400 pageset->pcp[temperature].batch,
1345 pageset->pcp[temperature].count); 1401 pageset->pcp[temperature].count);
@@ -1402,7 +1458,7 @@ void show_free_areas(void)
1402 1458
1403 show_node(zone); 1459 show_node(zone);
1404 printk("%s: ", zone->name); 1460 printk("%s: ", zone->name);
1405 if (!zone->present_pages) { 1461 if (!populated_zone(zone)) {
1406 printk("empty\n"); 1462 printk("empty\n");
1407 continue; 1463 continue;
1408 } 1464 }
@@ -1422,32 +1478,29 @@ void show_free_areas(void)
1422 1478
1423/* 1479/*
1424 * Builds allocation fallback zone lists. 1480 * Builds allocation fallback zone lists.
1481 *
1482 * Add all populated zones of a node to the zonelist.
1425 */ 1483 */
1426static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1484static int __init build_zonelists_node(pg_data_t *pgdat,
1427{ 1485 struct zonelist *zonelist, int nr_zones, int zone_type)
1428 switch (k) { 1486{
1429 struct zone *zone; 1487 struct zone *zone;
1430 default: 1488
1431 BUG(); 1489 BUG_ON(zone_type > ZONE_HIGHMEM);
1432 case ZONE_HIGHMEM: 1490
1433 zone = pgdat->node_zones + ZONE_HIGHMEM; 1491 do {
1434 if (zone->present_pages) { 1492 zone = pgdat->node_zones + zone_type;
1493 if (populated_zone(zone)) {
1435#ifndef CONFIG_HIGHMEM 1494#ifndef CONFIG_HIGHMEM
1436 BUG(); 1495 BUG_ON(zone_type > ZONE_NORMAL);
1437#endif 1496#endif
1438 zonelist->zones[j++] = zone; 1497 zonelist->zones[nr_zones++] = zone;
1498 check_highest_zone(zone_type);
1439 } 1499 }
1440 case ZONE_NORMAL: 1500 zone_type--;
1441 zone = pgdat->node_zones + ZONE_NORMAL;
1442 if (zone->present_pages)
1443 zonelist->zones[j++] = zone;
1444 case ZONE_DMA:
1445 zone = pgdat->node_zones + ZONE_DMA;
1446 if (zone->present_pages)
1447 zonelist->zones[j++] = zone;
1448 }
1449 1501
1450 return j; 1502 } while (zone_type >= 0);
1503 return nr_zones;
1451} 1504}
1452 1505
1453static inline int highest_zone(int zone_bits) 1506static inline int highest_zone(int zone_bits)
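
build_zonelists_node() replaces the switch-with-fallthrough by a simple downward walk from the requested zone type, appending every populated zone. A userspace sketch of the resulting fallback order; the zone names follow the new zone_names[] table, while the populated flags are made up for the example:

#include <stdio.h>

enum { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM };

int main(void)
{
	const char *names[] = { "DMA", "DMA32", "Normal", "HighMem" };
	int populated[]     = { 1, 0, 1, 1 };	/* e.g. a 32-bit box: no DMA32 */
	int zone_type = ZONE_HIGHMEM;		/* a __GFP_HIGHMEM allocation */

	printf("fallback order:");
	do {
		if (populated[zone_type])
			printf(" %s", names[zone_type]);
	} while (--zone_type >= 0);
	printf("\n");	/* fallback order: HighMem Normal DMA */
	return 0;
}
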
@@ -1455,6 +1508,8 @@ static inline int highest_zone(int zone_bits)
1455 int res = ZONE_NORMAL; 1508 int res = ZONE_NORMAL;
1456 if (zone_bits & (__force int)__GFP_HIGHMEM) 1509 if (zone_bits & (__force int)__GFP_HIGHMEM)
1457 res = ZONE_HIGHMEM; 1510 res = ZONE_HIGHMEM;
1511 if (zone_bits & (__force int)__GFP_DMA32)
1512 res = ZONE_DMA32;
1458 if (zone_bits & (__force int)__GFP_DMA) 1513 if (zone_bits & (__force int)__GFP_DMA)
1459 res = ZONE_DMA; 1514 res = ZONE_DMA;
1460 return res; 1515 return res;
@@ -1542,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat)
1542 prev_node = local_node; 1597 prev_node = local_node;
1543 nodes_clear(used_mask); 1598 nodes_clear(used_mask);
1544 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1599 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1600 int distance = node_distance(local_node, node);
1601
1602 /*
1603 * If another node is sufficiently far away then it is better
1604 * to reclaim pages in a zone before going off node.
1605 */
1606 if (distance > RECLAIM_DISTANCE)
1607 zone_reclaim_mode = 1;
1608
1545 /* 1609 /*
1546 * We don't want to pressure a particular node. 1610 * We don't want to pressure a particular node.
1547 * So adding penalty to the first node in same 1611 * So adding penalty to the first node in same
1548 * distance group to make it round-robin. 1612 * distance group to make it round-robin.
1549 */ 1613 */
1550 if (node_distance(local_node, node) != 1614
1551 node_distance(local_node, prev_node)) 1615 if (distance != node_distance(local_node, prev_node))
1552 node_load[node] += load; 1616 node_load[node] += load;
1553 prev_node = node; 1617 prev_node = node;
1554 load--; 1618 load--;
@@ -1682,18 +1746,16 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
1682 * up by free_all_bootmem() once the early boot process is 1746 * up by free_all_bootmem() once the early boot process is
1683 * done. Non-atomic initialization, single-pass. 1747 * done. Non-atomic initialization, single-pass.
1684 */ 1748 */
1685void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, 1749void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1686 unsigned long start_pfn) 1750 unsigned long start_pfn)
1687{ 1751{
1688 struct page *page; 1752 struct page *page;
1689 unsigned long end_pfn = start_pfn + size; 1753 unsigned long end_pfn = start_pfn + size;
1690 unsigned long pfn; 1754 unsigned long pfn;
1691 1755
1692 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1756 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
1693 if (!early_pfn_valid(pfn)) 1757 if (!early_pfn_valid(pfn))
1694 continue; 1758 continue;
1695 if (!early_pfn_in_nid(pfn, nid))
1696 continue;
1697 page = pfn_to_page(pfn); 1759 page = pfn_to_page(pfn);
1698 set_page_links(page, zone, nid, pfn); 1760 set_page_links(page, zone, nid, pfn);
1699 set_page_count(page, 1); 1761 set_page_count(page, 1);
@@ -1737,7 +1799,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
1737 memmap_init_zone((size), (nid), (zone), (start_pfn)) 1799 memmap_init_zone((size), (nid), (zone), (start_pfn))
1738#endif 1800#endif
1739 1801
1740static int __devinit zone_batchsize(struct zone *zone) 1802static int __meminit zone_batchsize(struct zone *zone)
1741{ 1803{
1742 int batch; 1804 int batch;
1743 1805
@@ -1755,16 +1817,16 @@ static int __devinit zone_batchsize(struct zone *zone)
1755 batch = 1; 1817 batch = 1;
1756 1818
1757 /* 1819 /*
1758 * We will be trying to allcoate bigger chunks of contiguous 1820 * Clamp the batch to a 2^n - 1 value. Having a power
1759 * memory of the order of fls(batch). This should result in 1821 * of 2 value was found to be more likely to have
1760 * better cache coloring. 1822 * suboptimal cache aliasing properties in some cases.
1761 * 1823 *
1762 * A sanity check also to ensure that batch is still in limits. 1824 * For example if 2 tasks are alternately allocating
1825 * batches of pages, one task can end up with a lot
1826 * of pages of one half of the possible page colors
1827 * and the other with pages of the other colors.
1763 */ 1828 */
1764 batch = (1 << fls(batch + batch/2)); 1829 batch = (1 << (fls(batch + batch/2)-1)) - 1;
1765
1766 if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
1767 batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
1768 1830
1769 return batch; 1831 return batch;
1770} 1832}
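
A quick userspace check of the new clamp, batch = (1 << (fls(batch + batch/2) - 1)) - 1, confirming it always lands on a 2^n - 1 value as the rewritten comment describes. fls() here is a plain reimplementation of the kernel helper (position of the highest set bit, 1-based), and the raw batch values are arbitrary:

#include <stdio.h>

static int fls(unsigned int x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	/* a few plausible raw batch values before clamping */
	unsigned int raw[] = { 3, 7, 16, 32, 57 };

	for (int i = 0; i < (int)(sizeof(raw) / sizeof(raw[0])); i++) {
		unsigned int b = raw[i];
		unsigned int clamped = (1 << (fls(b + b / 2) - 1)) - 1;

		printf("raw batch %2u -> clamped %2u\n", b, clamped);
	}
	return 0;	/* prints 3, 7, 15, 31, 63: always 2^n - 1 */
}
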
@@ -1777,19 +1839,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1777 1839
1778 pcp = &p->pcp[0]; /* hot */ 1840 pcp = &p->pcp[0]; /* hot */
1779 pcp->count = 0; 1841 pcp->count = 0;
1780 pcp->low = 0;
1781 pcp->high = 6 * batch; 1842 pcp->high = 6 * batch;
1782 pcp->batch = max(1UL, 1 * batch); 1843 pcp->batch = max(1UL, 1 * batch);
1783 INIT_LIST_HEAD(&pcp->list); 1844 INIT_LIST_HEAD(&pcp->list);
1784 1845
1785 pcp = &p->pcp[1]; /* cold*/ 1846 pcp = &p->pcp[1]; /* cold*/
1786 pcp->count = 0; 1847 pcp->count = 0;
1787 pcp->low = 0;
1788 pcp->high = 2 * batch; 1848 pcp->high = 2 * batch;
1789 pcp->batch = max(1UL, batch/2); 1849 pcp->batch = max(1UL, batch/2);
1790 INIT_LIST_HEAD(&pcp->list); 1850 INIT_LIST_HEAD(&pcp->list);
1791} 1851}
1792 1852
1853/*
1854 * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
1855 * to the value high for the pageset p.
1856 */
1857
1858static void setup_pagelist_highmark(struct per_cpu_pageset *p,
1859 unsigned long high)
1860{
1861 struct per_cpu_pages *pcp;
1862
1863 pcp = &p->pcp[0]; /* hot list */
1864 pcp->high = high;
1865 pcp->batch = max(1UL, high/4);
1866 if ((high/4) > (PAGE_SHIFT * 8))
1867 pcp->batch = PAGE_SHIFT * 8;
1868}
1869
1870
1793#ifdef CONFIG_NUMA 1871#ifdef CONFIG_NUMA
1794/* 1872/*
1795 * Boot pageset table. One per cpu which is going to be used for all 1873 * Boot pageset table. One per cpu which is going to be used for all
@@ -1815,18 +1893,22 @@ static struct per_cpu_pageset
1815 * Dynamically allocate memory for the 1893 * Dynamically allocate memory for the
1816 * per cpu pageset array in struct zone. 1894 * per cpu pageset array in struct zone.
1817 */ 1895 */
1818static int __devinit process_zones(int cpu) 1896static int __meminit process_zones(int cpu)
1819{ 1897{
1820 struct zone *zone, *dzone; 1898 struct zone *zone, *dzone;
1821 1899
1822 for_each_zone(zone) { 1900 for_each_zone(zone) {
1823 1901
1824 zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), 1902 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
1825 GFP_KERNEL, cpu_to_node(cpu)); 1903 GFP_KERNEL, cpu_to_node(cpu));
1826 if (!zone->pageset[cpu]) 1904 if (!zone_pcp(zone, cpu))
1827 goto bad; 1905 goto bad;
1828 1906
1829 setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); 1907 setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
1908
1909 if (percpu_pagelist_fraction)
1910 setup_pagelist_highmark(zone_pcp(zone, cpu),
1911 (zone->present_pages / percpu_pagelist_fraction));
1830 } 1912 }
1831 1913
1832 return 0; 1914 return 0;
@@ -1834,15 +1916,14 @@ bad:
1834 for_each_zone(dzone) { 1916 for_each_zone(dzone) {
1835 if (dzone == zone) 1917 if (dzone == zone)
1836 break; 1918 break;
1837 kfree(dzone->pageset[cpu]); 1919 kfree(zone_pcp(dzone, cpu));
1838 dzone->pageset[cpu] = NULL; 1920 zone_pcp(dzone, cpu) = NULL;
1839 } 1921 }
1840 return -ENOMEM; 1922 return -ENOMEM;
1841} 1923}
1842 1924
1843static inline void free_zone_pagesets(int cpu) 1925static inline void free_zone_pagesets(int cpu)
1844{ 1926{
1845#ifdef CONFIG_NUMA
1846 struct zone *zone; 1927 struct zone *zone;
1847 1928
1848 for_each_zone(zone) { 1929 for_each_zone(zone) {
@@ -1851,10 +1932,9 @@ static inline void free_zone_pagesets(int cpu)
1851 zone_pcp(zone, cpu) = NULL; 1932 zone_pcp(zone, cpu) = NULL;
1852 kfree(pset); 1933 kfree(pset);
1853 } 1934 }
1854#endif
1855} 1935}
1856 1936
1857static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, 1937static int __meminit pageset_cpuup_callback(struct notifier_block *nfb,
1858 unsigned long action, 1938 unsigned long action,
1859 void *hcpu) 1939 void *hcpu)
1860{ 1940{
@@ -1866,11 +1946,10 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1866 if (process_zones(cpu)) 1946 if (process_zones(cpu))
1867 ret = NOTIFY_BAD; 1947 ret = NOTIFY_BAD;
1868 break; 1948 break;
1869#ifdef CONFIG_HOTPLUG_CPU 1949 case CPU_UP_CANCELED:
1870 case CPU_DEAD: 1950 case CPU_DEAD:
1871 free_zone_pagesets(cpu); 1951 free_zone_pagesets(cpu);
1872 break; 1952 break;
1873#endif
1874 default: 1953 default:
1875 break; 1954 break;
1876 } 1955 }
@@ -1880,7 +1959,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
1880static struct notifier_block pageset_notifier = 1959static struct notifier_block pageset_notifier =
1881 { &pageset_cpuup_callback, NULL, 0 }; 1960 { &pageset_cpuup_callback, NULL, 0 };
1882 1961
1883void __init setup_per_cpu_pageset() 1962void __init setup_per_cpu_pageset(void)
1884{ 1963{
1885 int err; 1964 int err;
1886 1965
@@ -1895,7 +1974,7 @@ void __init setup_per_cpu_pageset()
1895 1974
1896#endif 1975#endif
1897 1976
1898static __devinit 1977static __meminit
1899void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) 1978void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1900{ 1979{
1901 int i; 1980 int i;
@@ -1915,7 +1994,7 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
1915 init_waitqueue_head(zone->wait_table + i); 1994 init_waitqueue_head(zone->wait_table + i);
1916} 1995}
1917 1996
1918static __devinit void zone_pcp_init(struct zone *zone) 1997static __meminit void zone_pcp_init(struct zone *zone)
1919{ 1998{
1920 int cpu; 1999 int cpu;
1921 unsigned long batch = zone_batchsize(zone); 2000 unsigned long batch = zone_batchsize(zone);
@@ -1923,7 +2002,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
1923 for (cpu = 0; cpu < NR_CPUS; cpu++) { 2002 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1924#ifdef CONFIG_NUMA 2003#ifdef CONFIG_NUMA
1925 /* Early boot. Slab allocator not functional yet */ 2004 /* Early boot. Slab allocator not functional yet */
1926 zone->pageset[cpu] = &boot_pageset[cpu]; 2005 zone_pcp(zone, cpu) = &boot_pageset[cpu];
1927 setup_pageset(&boot_pageset[cpu],0); 2006 setup_pageset(&boot_pageset[cpu],0);
1928#else 2007#else
1929 setup_pageset(zone_pcp(zone,cpu), batch); 2008 setup_pageset(zone_pcp(zone,cpu), batch);
@@ -1933,7 +2012,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
1933 zone->name, zone->present_pages, batch); 2012 zone->name, zone->present_pages, batch);
1934} 2013}
1935 2014
1936static __devinit void init_currently_empty_zone(struct zone *zone, 2015static __meminit void init_currently_empty_zone(struct zone *zone,
1937 unsigned long zone_start_pfn, unsigned long size) 2016 unsigned long zone_start_pfn, unsigned long size)
1938{ 2017{
1939 struct pglist_data *pgdat = zone->zone_pgdat; 2018 struct pglist_data *pgdat = zone->zone_pgdat;
@@ -1975,7 +2054,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat,
1975 if (zholes_size) 2054 if (zholes_size)
1976 realsize -= zholes_size[j]; 2055 realsize -= zholes_size[j];
1977 2056
1978 if (j == ZONE_DMA || j == ZONE_NORMAL) 2057 if (j < ZONE_HIGHMEM)
1979 nr_kernel_pages += realsize; 2058 nr_kernel_pages += realsize;
1980 nr_all_pages += realsize; 2059 nr_all_pages += realsize;
1981 2060
@@ -2100,7 +2179,7 @@ static int frag_show(struct seq_file *m, void *arg)
2100 int order; 2179 int order;
2101 2180
2102 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2181 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2103 if (!zone->present_pages) 2182 if (!populated_zone(zone))
2104 continue; 2183 continue;
2105 2184
2106 spin_lock_irqsave(&zone->lock, flags); 2185 spin_lock_irqsave(&zone->lock, flags);
@@ -2133,7 +2212,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2133 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2212 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2134 int i; 2213 int i;
2135 2214
2136 if (!zone->present_pages) 2215 if (!populated_zone(zone))
2137 continue; 2216 continue;
2138 2217
2139 spin_lock_irqsave(&zone->lock, flags); 2218 spin_lock_irqsave(&zone->lock, flags);
@@ -2166,7 +2245,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2166 seq_printf(m, 2245 seq_printf(m,
2167 ")" 2246 ")"
2168 "\n pagesets"); 2247 "\n pagesets");
2169 for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { 2248 for_each_online_cpu(i) {
2170 struct per_cpu_pageset *pageset; 2249 struct per_cpu_pageset *pageset;
2171 int j; 2250 int j;
2172 2251
@@ -2181,12 +2260,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2181 seq_printf(m, 2260 seq_printf(m,
2182 "\n cpu: %i pcp: %i" 2261 "\n cpu: %i pcp: %i"
2183 "\n count: %i" 2262 "\n count: %i"
2184 "\n low: %i"
2185 "\n high: %i" 2263 "\n high: %i"
2186 "\n batch: %i", 2264 "\n batch: %i",
2187 i, j, 2265 i, j,
2188 pageset->pcp[j].count, 2266 pageset->pcp[j].count,
2189 pageset->pcp[j].low,
2190 pageset->pcp[j].high, 2267 pageset->pcp[j].high,
2191 pageset->pcp[j].batch); 2268 pageset->pcp[j].batch);
2192 } 2269 }
@@ -2241,32 +2318,40 @@ static char *vmstat_text[] = {
2241 "pgpgout", 2318 "pgpgout",
2242 "pswpin", 2319 "pswpin",
2243 "pswpout", 2320 "pswpout",
2244 "pgalloc_high",
2245 2321
2322 "pgalloc_high",
2246 "pgalloc_normal", 2323 "pgalloc_normal",
2324 "pgalloc_dma32",
2247 "pgalloc_dma", 2325 "pgalloc_dma",
2326
2248 "pgfree", 2327 "pgfree",
2249 "pgactivate", 2328 "pgactivate",
2250 "pgdeactivate", 2329 "pgdeactivate",
2251 2330
2252 "pgfault", 2331 "pgfault",
2253 "pgmajfault", 2332 "pgmajfault",
2333
2254 "pgrefill_high", 2334 "pgrefill_high",
2255 "pgrefill_normal", 2335 "pgrefill_normal",
2336 "pgrefill_dma32",
2256 "pgrefill_dma", 2337 "pgrefill_dma",
2257 2338
2258 "pgsteal_high", 2339 "pgsteal_high",
2259 "pgsteal_normal", 2340 "pgsteal_normal",
2341 "pgsteal_dma32",
2260 "pgsteal_dma", 2342 "pgsteal_dma",
2343
2261 "pgscan_kswapd_high", 2344 "pgscan_kswapd_high",
2262 "pgscan_kswapd_normal", 2345 "pgscan_kswapd_normal",
2263 2346 "pgscan_kswapd_dma32",
2264 "pgscan_kswapd_dma", 2347 "pgscan_kswapd_dma",
2348
2265 "pgscan_direct_high", 2349 "pgscan_direct_high",
2266 "pgscan_direct_normal", 2350 "pgscan_direct_normal",
2351 "pgscan_direct_dma32",
2267 "pgscan_direct_dma", 2352 "pgscan_direct_dma",
2268 "pginodesteal",
2269 2353
2354 "pginodesteal",
2270 "slabs_scanned", 2355 "slabs_scanned",
2271 "kswapd_steal", 2356 "kswapd_steal",
2272 "kswapd_inodesteal", 2357 "kswapd_inodesteal",
@@ -2417,13 +2502,18 @@ void setup_per_zone_pages_min(void)
2417 } 2502 }
2418 2503
2419 for_each_zone(zone) { 2504 for_each_zone(zone) {
2505 unsigned long tmp;
2420 spin_lock_irqsave(&zone->lru_lock, flags); 2506 spin_lock_irqsave(&zone->lru_lock, flags);
2507 tmp = (pages_min * zone->present_pages) / lowmem_pages;
2421 if (is_highmem(zone)) { 2508 if (is_highmem(zone)) {
2422 /* 2509 /*
2423 * Often, highmem doesn't need to reserve any pages. 2510 * __GFP_HIGH and PF_MEMALLOC allocations usually don't
2424 * But the pages_min/low/high values are also used for 2511 * need highmem pages, so cap pages_min to a small
2425 * batching up page reclaim activity so we need a 2512 * value here.
2426 * decent value here. 2513 *
2514 * The (pages_high-pages_low) and (pages_low-pages_min)
2515 * deltas controls asynch page reclaim, and so should
2516 * not be capped for highmem.
2427 */ 2517 */
2428 int min_pages; 2518 int min_pages;
2429 2519
@@ -2434,19 +2524,15 @@ void setup_per_zone_pages_min(void)
2434 min_pages = 128; 2524 min_pages = 128;
2435 zone->pages_min = min_pages; 2525 zone->pages_min = min_pages;
2436 } else { 2526 } else {
2437 /* if it's a lowmem zone, reserve a number of pages 2527 /*
2528 * If it's a lowmem zone, reserve a number of pages
2438 * proportionate to the zone's size. 2529 * proportionate to the zone's size.
2439 */ 2530 */
2440 zone->pages_min = (pages_min * zone->present_pages) / 2531 zone->pages_min = tmp;
2441 lowmem_pages;
2442 } 2532 }
2443 2533
2444 /* 2534 zone->pages_low = zone->pages_min + tmp / 4;
2445 * When interpreting these watermarks, just keep in mind that: 2535 zone->pages_high = zone->pages_min + tmp / 2;
2446 * zone->pages_min == (zone->pages_min * 4) / 4;
2447 */
2448 zone->pages_low = (zone->pages_min * 5) / 4;
2449 zone->pages_high = (zone->pages_min * 6) / 4;
2450 spin_unlock_irqrestore(&zone->lru_lock, flags); 2536 spin_unlock_irqrestore(&zone->lru_lock, flags);
2451 } 2537 }
2452} 2538}
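
For a lowmem zone, the watermarks now all derive from a single tmp value instead of the old fixed 5/4 and 6/4 multipliers of pages_min. A sketch with made-up zone sizes, assuming 4K pages so that the global pages_min works out to min_free_kbytes / 4, and using the min_free_kbytes = 1024 default from earlier in this file:

#include <stdio.h>

int main(void)
{
	unsigned long min_free_kbytes = 1024;		/* default in this file */
	unsigned long page_size_kb = 4;			/* assume 4K pages */
	unsigned long lowmem_pages = 224UL * 1024;	/* ~896MB of lowmem, made up */
	unsigned long zone_present = 200UL * 1024;	/* this zone's share, made up */

	unsigned long pages_min = min_free_kbytes / page_size_kb;
	unsigned long tmp = (pages_min * zone_present) / lowmem_pages;

	printf("zone pages_min  = %lu\n", tmp);
	printf("zone pages_low  = %lu\n", tmp + tmp / 4);
	printf("zone pages_high = %lu\n", tmp + tmp / 2);
	return 0;
}
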
@@ -2522,6 +2608,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
2522 return 0; 2608 return 0;
2523} 2609}
2524 2610
2611/*
2612 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
2613 * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
2614 * can have before it gets flushed back to buddy allocator.
2615 */
2616
2617int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
2618 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
2619{
2620 struct zone *zone;
2621 unsigned int cpu;
2622 int ret;
2623
2624 ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
2625 if (!write || (ret == -EINVAL))
2626 return ret;
2627 for_each_zone(zone) {
2628 for_each_online_cpu(cpu) {
2629 unsigned long high;
2630 high = zone->present_pages / percpu_pagelist_fraction;
2631 setup_pagelist_highmark(zone_pcp(zone, cpu), high);
2632 }
2633 }
2634 return 0;
2635}
2636
2525__initdata int hashdist = HASHDIST_DEFAULT; 2637__initdata int hashdist = HASHDIST_DEFAULT;
2526 2638
2527#ifdef CONFIG_NUMA 2639#ifdef CONFIG_NUMA
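
To see what a given vm.percpu_pagelist_fraction setting does to a zone's hot per-cpu list, the sketch below mirrors the arithmetic of setup_pagelist_highmark() above for a made-up 64MB zone; with 4K pages the batch cap works out to PAGE_SHIFT * 8 = 96:

#include <stdio.h>

int main(void)
{
	unsigned long present_pages = 16384;	/* made-up 64MB zone, 4K pages */
	unsigned long page_shift = 12;

	for (unsigned long fraction = 8; fraction <= 512; fraction *= 4) {
		unsigned long high = present_pages / fraction;
		unsigned long batch = high / 4;

		if (batch < 1)
			batch = 1;
		if (high / 4 > page_shift * 8)
			batch = page_shift * 8;	/* cap, as in setup_pagelist_highmark() */

		printf("fraction %3lu: pcp high %5lu, batch %3lu\n",
		       fraction, high, batch);
	}
	return 0;
}
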