aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c343
1 files changed, 186 insertions, 157 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c87fc2..fd47494cb989 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
36#include <linux/memory_hotplug.h> 36#include <linux/memory_hotplug.h>
37#include <linux/nodemask.h> 37#include <linux/nodemask.h>
38#include <linux/vmalloc.h> 38#include <linux/vmalloc.h>
39#include <linux/mempolicy.h>
39 40
40#include <asm/tlbflush.h> 41#include <asm/tlbflush.h>
41#include "internal.h" 42#include "internal.h"
@@ -53,6 +54,8 @@ unsigned long totalram_pages __read_mostly;
53unsigned long totalhigh_pages __read_mostly; 54unsigned long totalhigh_pages __read_mostly;
54long nr_swap_pages; 55long nr_swap_pages;
55 56
57static void fastcall free_hot_cold_page(struct page *page, int cold);
58
56/* 59/*
57 * results with 256, 32 in the lowmem_reserve sysctl: 60 * results with 256, 32 in the lowmem_reserve sysctl:
58 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high) 61 * 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
@@ -81,6 +84,7 @@ int min_free_kbytes = 1024;
81unsigned long __initdata nr_kernel_pages; 84unsigned long __initdata nr_kernel_pages;
82unsigned long __initdata nr_all_pages; 85unsigned long __initdata nr_all_pages;
83 86
87#ifdef CONFIG_DEBUG_VM
84static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 88static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
85{ 89{
86 int ret = 0; 90 int ret = 0;
@@ -122,16 +126,23 @@ static int bad_range(struct zone *zone, struct page *page)
122 return 0; 126 return 0;
123} 127}
124 128
125static void bad_page(const char *function, struct page *page) 129#else
130static inline int bad_range(struct zone *zone, struct page *page)
131{
132 return 0;
133}
134#endif
135
136static void bad_page(struct page *page)
126{ 137{
127 printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", 138 printk(KERN_EMERG "Bad page state in process '%s'\n"
128 function, current->comm, page); 139 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
129 printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 140 "Trying to fix it up, but a reboot is needed\n"
130 (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, 141 "Backtrace:\n",
131 page->mapping, page_mapcount(page), page_count(page)); 142 current->comm, page, (int)(2*sizeof(unsigned long)),
132 printk(KERN_EMERG "Backtrace:\n"); 143 (unsigned long)page->flags, page->mapping,
144 page_mapcount(page), page_count(page));
133 dump_stack(); 145 dump_stack();
134 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
135 page->flags &= ~(1 << PG_lru | 146 page->flags &= ~(1 << PG_lru |
136 1 << PG_private | 147 1 << PG_private |
137 1 << PG_locked | 148 1 << PG_locked |
@@ -184,19 +195,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
184 int i; 195 int i;
185 int nr_pages = 1 << order; 196 int nr_pages = 1 << order;
186 197
187 if (!PageCompound(page)) 198 if (unlikely(page[1].index != order))
188 return; 199 bad_page(page);
189
190 if (page[1].index != order)
191 bad_page(__FUNCTION__, page);
192 200
193 for (i = 0; i < nr_pages; i++) { 201 for (i = 0; i < nr_pages; i++) {
194 struct page *p = page + i; 202 struct page *p = page + i;
195 203
196 if (!PageCompound(p)) 204 if (unlikely(!PageCompound(p) |
197 bad_page(__FUNCTION__, page); 205 (page_private(p) != (unsigned long)page)))
198 if (page_private(p) != (unsigned long)page) 206 bad_page(page);
199 bad_page(__FUNCTION__, page);
200 ClearPageCompound(p); 207 ClearPageCompound(p);
201 } 208 }
202} 209}
@@ -255,14 +262,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
255/* 262/*
256 * This function checks whether a page is free && is the buddy 263 * This function checks whether a page is free && is the buddy
257 * we can do coalesce a page and its buddy if 264 * we can do coalesce a page and its buddy if
258 * (a) the buddy is free && 265 * (a) the buddy is not in a hole &&
259 * (b) the buddy is on the buddy system && 266 * (b) the buddy is free &&
260 * (c) a page and its buddy have the same order. 267 * (c) the buddy is on the buddy system &&
268 * (d) a page and its buddy have the same order.
261 * for recording page's order, we use page_private(page) and PG_private. 269 * for recording page's order, we use page_private(page) and PG_private.
262 * 270 *
263 */ 271 */
264static inline int page_is_buddy(struct page *page, int order) 272static inline int page_is_buddy(struct page *page, int order)
265{ 273{
274#ifdef CONFIG_HOLES_IN_ZONE
275 if (!pfn_valid(page_to_pfn(page)))
276 return 0;
277#endif
278
266 if (PagePrivate(page) && 279 if (PagePrivate(page) &&
267 (page_order(page) == order) && 280 (page_order(page) == order) &&
268 page_count(page) == 0) 281 page_count(page) == 0)
@@ -300,7 +313,7 @@ static inline void __free_pages_bulk (struct page *page,
300 unsigned long page_idx; 313 unsigned long page_idx;
301 int order_size = 1 << order; 314 int order_size = 1 << order;
302 315
303 if (unlikely(order)) 316 if (unlikely(PageCompound(page)))
304 destroy_compound_page(page, order); 317 destroy_compound_page(page, order);
305 318
306 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 319 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -314,17 +327,15 @@ static inline void __free_pages_bulk (struct page *page,
314 struct free_area *area; 327 struct free_area *area;
315 struct page *buddy; 328 struct page *buddy;
316 329
317 combined_idx = __find_combined_index(page_idx, order);
318 buddy = __page_find_buddy(page, page_idx, order); 330 buddy = __page_find_buddy(page, page_idx, order);
319
320 if (bad_range(zone, buddy))
321 break;
322 if (!page_is_buddy(buddy, order)) 331 if (!page_is_buddy(buddy, order))
323 break; /* Move the buddy up one level. */ 332 break; /* Move the buddy up one level. */
333
324 list_del(&buddy->lru); 334 list_del(&buddy->lru);
325 area = zone->free_area + order; 335 area = zone->free_area + order;
326 area->nr_free--; 336 area->nr_free--;
327 rmv_page_order(buddy); 337 rmv_page_order(buddy);
338 combined_idx = __find_combined_index(page_idx, order);
328 page = page + (combined_idx - page_idx); 339 page = page + (combined_idx - page_idx);
329 page_idx = combined_idx; 340 page_idx = combined_idx;
330 order++; 341 order++;
@@ -334,11 +345,11 @@ static inline void __free_pages_bulk (struct page *page,
334 zone->free_area[order].nr_free++; 345 zone->free_area[order].nr_free++;
335} 346}
336 347
337static inline int free_pages_check(const char *function, struct page *page) 348static inline int free_pages_check(struct page *page)
338{ 349{
339 if ( page_mapcount(page) || 350 if (unlikely(page_mapcount(page) |
340 page->mapping != NULL || 351 (page->mapping != NULL) |
341 page_count(page) != 0 || 352 (page_count(page) != 0) |
342 (page->flags & ( 353 (page->flags & (
343 1 << PG_lru | 354 1 << PG_lru |
344 1 << PG_private | 355 1 << PG_private |
@@ -348,8 +359,8 @@ static inline int free_pages_check(const char *function, struct page *page)
348 1 << PG_slab | 359 1 << PG_slab |
349 1 << PG_swapcache | 360 1 << PG_swapcache |
350 1 << PG_writeback | 361 1 << PG_writeback |
351 1 << PG_reserved ))) 362 1 << PG_reserved ))))
352 bad_page(function, page); 363 bad_page(page);
353 if (PageDirty(page)) 364 if (PageDirty(page))
354 __ClearPageDirty(page); 365 __ClearPageDirty(page);
355 /* 366 /*
@@ -375,11 +386,10 @@ static int
375free_pages_bulk(struct zone *zone, int count, 386free_pages_bulk(struct zone *zone, int count,
376 struct list_head *list, unsigned int order) 387 struct list_head *list, unsigned int order)
377{ 388{
378 unsigned long flags;
379 struct page *page = NULL; 389 struct page *page = NULL;
380 int ret = 0; 390 int ret = 0;
381 391
382 spin_lock_irqsave(&zone->lock, flags); 392 spin_lock(&zone->lock);
383 zone->all_unreclaimable = 0; 393 zone->all_unreclaimable = 0;
384 zone->pages_scanned = 0; 394 zone->pages_scanned = 0;
385 while (!list_empty(list) && count--) { 395 while (!list_empty(list) && count--) {
@@ -389,12 +399,13 @@ free_pages_bulk(struct zone *zone, int count,
389 __free_pages_bulk(page, zone, order); 399 __free_pages_bulk(page, zone, order);
390 ret++; 400 ret++;
391 } 401 }
392 spin_unlock_irqrestore(&zone->lock, flags); 402 spin_unlock(&zone->lock);
393 return ret; 403 return ret;
394} 404}
395 405
396void __free_pages_ok(struct page *page, unsigned int order) 406void __free_pages_ok(struct page *page, unsigned int order)
397{ 407{
408 unsigned long flags;
398 LIST_HEAD(list); 409 LIST_HEAD(list);
399 int i; 410 int i;
400 int reserved = 0; 411 int reserved = 0;
@@ -408,14 +419,49 @@ void __free_pages_ok(struct page *page, unsigned int order)
408#endif 419#endif
409 420
410 for (i = 0 ; i < (1 << order) ; ++i) 421 for (i = 0 ; i < (1 << order) ; ++i)
411 reserved += free_pages_check(__FUNCTION__, page + i); 422 reserved += free_pages_check(page + i);
412 if (reserved) 423 if (reserved)
413 return; 424 return;
414 425
415 list_add(&page->lru, &list); 426 list_add(&page->lru, &list);
416 mod_page_state(pgfree, 1 << order);
417 kernel_map_pages(page, 1<<order, 0); 427 kernel_map_pages(page, 1<<order, 0);
428 local_irq_save(flags);
429 __mod_page_state(pgfree, 1 << order);
418 free_pages_bulk(page_zone(page), 1, &list, order); 430 free_pages_bulk(page_zone(page), 1, &list, order);
431 local_irq_restore(flags);
432}
433
434/*
435 * permit the bootmem allocator to evade page validation on high-order frees
436 */
437void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
438{
439 if (order == 0) {
440 __ClearPageReserved(page);
441 set_page_count(page, 0);
442
443 free_hot_cold_page(page, 0);
444 } else {
445 LIST_HEAD(list);
446 int loop;
447
448 for (loop = 0; loop < BITS_PER_LONG; loop++) {
449 struct page *p = &page[loop];
450
451 if (loop + 16 < BITS_PER_LONG)
452 prefetchw(p + 16);
453 __ClearPageReserved(p);
454 set_page_count(p, 0);
455 }
456
457 arch_free_page(page, order);
458
459 mod_page_state(pgfree, 1 << order);
460
461 list_add(&page->lru, &list);
462 kernel_map_pages(page, 1 << order, 0);
463 free_pages_bulk(page_zone(page), 1, &list, order);
464 }
419} 465}
420 466
421 467
@@ -433,8 +479,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
433 * 479 *
434 * -- wli 480 * -- wli
435 */ 481 */
436static inline struct page * 482static inline void expand(struct zone *zone, struct page *page,
437expand(struct zone *zone, struct page *page,
438 int low, int high, struct free_area *area) 483 int low, int high, struct free_area *area)
439{ 484{
440 unsigned long size = 1 << high; 485 unsigned long size = 1 << high;
@@ -448,24 +493,6 @@ expand(struct zone *zone, struct page *page,
448 area->nr_free++; 493 area->nr_free++;
449 set_page_order(&page[size], high); 494 set_page_order(&page[size], high);
450 } 495 }
451 return page;
452}
453
454void set_page_refs(struct page *page, int order)
455{
456#ifdef CONFIG_MMU
457 set_page_count(page, 1);
458#else
459 int i;
460
461 /*
462 * We need to reference all the pages for this order, otherwise if
463 * anyone accesses one of the pages with (get/put) it will be freed.
464 * - eg: access_process_vm()
465 */
466 for (i = 0; i < (1 << order); i++)
467 set_page_count(page + i, 1);
468#endif /* CONFIG_MMU */
469} 496}
470 497
471/* 498/*
@@ -473,9 +500,9 @@ void set_page_refs(struct page *page, int order)
473 */ 500 */
474static int prep_new_page(struct page *page, int order) 501static int prep_new_page(struct page *page, int order)
475{ 502{
476 if ( page_mapcount(page) || 503 if (unlikely(page_mapcount(page) |
477 page->mapping != NULL || 504 (page->mapping != NULL) |
478 page_count(page) != 0 || 505 (page_count(page) != 0) |
479 (page->flags & ( 506 (page->flags & (
480 1 << PG_lru | 507 1 << PG_lru |
481 1 << PG_private | 508 1 << PG_private |
@@ -486,8 +513,8 @@ static int prep_new_page(struct page *page, int order)
486 1 << PG_slab | 513 1 << PG_slab |
487 1 << PG_swapcache | 514 1 << PG_swapcache |
488 1 << PG_writeback | 515 1 << PG_writeback |
489 1 << PG_reserved ))) 516 1 << PG_reserved ))))
490 bad_page(__FUNCTION__, page); 517 bad_page(page);
491 518
492 /* 519 /*
493 * For now, we report if PG_reserved was found set, but do not 520 * For now, we report if PG_reserved was found set, but do not
@@ -525,7 +552,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
525 rmv_page_order(page); 552 rmv_page_order(page);
526 area->nr_free--; 553 area->nr_free--;
527 zone->free_pages -= 1UL << order; 554 zone->free_pages -= 1UL << order;
528 return expand(zone, page, order, current_order, area); 555 expand(zone, page, order, current_order, area);
556 return page;
529 } 557 }
530 558
531 return NULL; 559 return NULL;
@@ -539,21 +567,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
539static int rmqueue_bulk(struct zone *zone, unsigned int order, 567static int rmqueue_bulk(struct zone *zone, unsigned int order,
540 unsigned long count, struct list_head *list) 568 unsigned long count, struct list_head *list)
541{ 569{
542 unsigned long flags;
543 int i; 570 int i;
544 int allocated = 0;
545 struct page *page;
546 571
547 spin_lock_irqsave(&zone->lock, flags); 572 spin_lock(&zone->lock);
548 for (i = 0; i < count; ++i) { 573 for (i = 0; i < count; ++i) {
549 page = __rmqueue(zone, order); 574 struct page *page = __rmqueue(zone, order);
550 if (page == NULL) 575 if (unlikely(page == NULL))
551 break; 576 break;
552 allocated++;
553 list_add_tail(&page->lru, list); 577 list_add_tail(&page->lru, list);
554 } 578 }
555 spin_unlock_irqrestore(&zone->lock, flags); 579 spin_unlock(&zone->lock);
556 return allocated; 580 return i;
557} 581}
558 582
559#ifdef CONFIG_NUMA 583#ifdef CONFIG_NUMA
@@ -589,6 +613,7 @@ void drain_remote_pages(void)
589#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) 613#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
590static void __drain_pages(unsigned int cpu) 614static void __drain_pages(unsigned int cpu)
591{ 615{
616 unsigned long flags;
592 struct zone *zone; 617 struct zone *zone;
593 int i; 618 int i;
594 619
@@ -600,8 +625,10 @@ static void __drain_pages(unsigned int cpu)
600 struct per_cpu_pages *pcp; 625 struct per_cpu_pages *pcp;
601 626
602 pcp = &pset->pcp[i]; 627 pcp = &pset->pcp[i];
628 local_irq_save(flags);
603 pcp->count -= free_pages_bulk(zone, pcp->count, 629 pcp->count -= free_pages_bulk(zone, pcp->count,
604 &pcp->list, 0); 630 &pcp->list, 0);
631 local_irq_restore(flags);
605 } 632 }
606 } 633 }
607} 634}
@@ -647,18 +674,14 @@ void drain_local_pages(void)
647} 674}
648#endif /* CONFIG_PM */ 675#endif /* CONFIG_PM */
649 676
650static void zone_statistics(struct zonelist *zonelist, struct zone *z) 677static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
651{ 678{
652#ifdef CONFIG_NUMA 679#ifdef CONFIG_NUMA
653 unsigned long flags;
654 int cpu;
655 pg_data_t *pg = z->zone_pgdat; 680 pg_data_t *pg = z->zone_pgdat;
656 pg_data_t *orig = zonelist->zones[0]->zone_pgdat; 681 pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
657 struct per_cpu_pageset *p; 682 struct per_cpu_pageset *p;
658 683
659 local_irq_save(flags); 684 p = zone_pcp(z, cpu);
660 cpu = smp_processor_id();
661 p = zone_pcp(z,cpu);
662 if (pg == orig) { 685 if (pg == orig) {
663 p->numa_hit++; 686 p->numa_hit++;
664 } else { 687 } else {
@@ -669,14 +692,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
669 p->local_node++; 692 p->local_node++;
670 else 693 else
671 p->other_node++; 694 p->other_node++;
672 local_irq_restore(flags);
673#endif 695#endif
674} 696}
675 697
676/* 698/*
677 * Free a 0-order page 699 * Free a 0-order page
678 */ 700 */
679static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
680static void fastcall free_hot_cold_page(struct page *page, int cold) 701static void fastcall free_hot_cold_page(struct page *page, int cold)
681{ 702{
682 struct zone *zone = page_zone(page); 703 struct zone *zone = page_zone(page);
@@ -687,14 +708,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
687 708
688 if (PageAnon(page)) 709 if (PageAnon(page))
689 page->mapping = NULL; 710 page->mapping = NULL;
690 if (free_pages_check(__FUNCTION__, page)) 711 if (free_pages_check(page))
691 return; 712 return;
692 713
693 inc_page_state(pgfree);
694 kernel_map_pages(page, 1, 0); 714 kernel_map_pages(page, 1, 0);
695 715
696 pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; 716 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
697 local_irq_save(flags); 717 local_irq_save(flags);
718 __inc_page_state(pgfree);
698 list_add(&page->lru, &pcp->list); 719 list_add(&page->lru, &pcp->list);
699 pcp->count++; 720 pcp->count++;
700 if (pcp->count >= pcp->high) 721 if (pcp->count >= pcp->high)
@@ -727,49 +748,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
727 * we cheat by calling it from here, in the order > 0 path. Saves a branch 748 * we cheat by calling it from here, in the order > 0 path. Saves a branch
728 * or two. 749 * or two.
729 */ 750 */
730static struct page * 751static struct page *buffered_rmqueue(struct zonelist *zonelist,
731buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) 752 struct zone *zone, int order, gfp_t gfp_flags)
732{ 753{
733 unsigned long flags; 754 unsigned long flags;
734 struct page *page; 755 struct page *page;
735 int cold = !!(gfp_flags & __GFP_COLD); 756 int cold = !!(gfp_flags & __GFP_COLD);
757 int cpu;
736 758
737again: 759again:
760 cpu = get_cpu();
738 if (order == 0) { 761 if (order == 0) {
739 struct per_cpu_pages *pcp; 762 struct per_cpu_pages *pcp;
740 763
741 page = NULL; 764 pcp = &zone_pcp(zone, cpu)->pcp[cold];
742 pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
743 local_irq_save(flags); 765 local_irq_save(flags);
744 if (pcp->count <= pcp->low) 766 if (!pcp->count) {
745 pcp->count += rmqueue_bulk(zone, 0, 767 pcp->count += rmqueue_bulk(zone, 0,
746 pcp->batch, &pcp->list); 768 pcp->batch, &pcp->list);
747 if (pcp->count) { 769 if (unlikely(!pcp->count))
748 page = list_entry(pcp->list.next, struct page, lru); 770 goto failed;
749 list_del(&page->lru);
750 pcp->count--;
751 } 771 }
752 local_irq_restore(flags); 772 page = list_entry(pcp->list.next, struct page, lru);
753 put_cpu(); 773 list_del(&page->lru);
774 pcp->count--;
754 } else { 775 } else {
755 spin_lock_irqsave(&zone->lock, flags); 776 spin_lock_irqsave(&zone->lock, flags);
756 page = __rmqueue(zone, order); 777 page = __rmqueue(zone, order);
757 spin_unlock_irqrestore(&zone->lock, flags); 778 spin_unlock(&zone->lock);
779 if (!page)
780 goto failed;
758 } 781 }
759 782
760 if (page != NULL) { 783 __mod_page_state_zone(zone, pgalloc, 1 << order);
761 BUG_ON(bad_range(zone, page)); 784 zone_statistics(zonelist, zone, cpu);
762 mod_page_state_zone(zone, pgalloc, 1 << order); 785 local_irq_restore(flags);
763 if (prep_new_page(page, order)) 786 put_cpu();
764 goto again; 787
788 BUG_ON(bad_range(zone, page));
789 if (prep_new_page(page, order))
790 goto again;
765 791
766 if (gfp_flags & __GFP_ZERO) 792 if (gfp_flags & __GFP_ZERO)
767 prep_zero_page(page, order, gfp_flags); 793 prep_zero_page(page, order, gfp_flags);
768 794
769 if (order && (gfp_flags & __GFP_COMP)) 795 if (order && (gfp_flags & __GFP_COMP))
770 prep_compound_page(page, order); 796 prep_compound_page(page, order);
771 }
772 return page; 797 return page;
798
799failed:
800 local_irq_restore(flags);
801 put_cpu();
802 return NULL;
773} 803}
774 804
775#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 805#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
@@ -845,9 +875,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
845 continue; 875 continue;
846 } 876 }
847 877
848 page = buffered_rmqueue(*z, order, gfp_mask); 878 page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
849 if (page) { 879 if (page) {
850 zone_statistics(zonelist, *z);
851 break; 880 break;
852 } 881 }
853 } while (*(++z) != NULL); 882 } while (*(++z) != NULL);
@@ -903,8 +932,7 @@ restart:
903 alloc_flags |= ALLOC_HARDER; 932 alloc_flags |= ALLOC_HARDER;
904 if (gfp_mask & __GFP_HIGH) 933 if (gfp_mask & __GFP_HIGH)
905 alloc_flags |= ALLOC_HIGH; 934 alloc_flags |= ALLOC_HIGH;
906 if (wait) 935 alloc_flags |= ALLOC_CPUSET;
907 alloc_flags |= ALLOC_CPUSET;
908 936
909 /* 937 /*
910 * Go through the zonelist again. Let __GFP_HIGH and allocations 938 * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +954,7 @@ restart:
926nofail_alloc: 954nofail_alloc:
927 /* go through the zonelist yet again, ignoring mins */ 955 /* go through the zonelist yet again, ignoring mins */
928 page = get_page_from_freelist(gfp_mask, order, 956 page = get_page_from_freelist(gfp_mask, order,
929 zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); 957 zonelist, ALLOC_NO_WATERMARKS);
930 if (page) 958 if (page)
931 goto got_pg; 959 goto got_pg;
932 if (gfp_mask & __GFP_NOFAIL) { 960 if (gfp_mask & __GFP_NOFAIL) {
@@ -1171,12 +1199,11 @@ EXPORT_SYMBOL(nr_pagecache);
1171DEFINE_PER_CPU(long, nr_pagecache_local) = 0; 1199DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
1172#endif 1200#endif
1173 1201
1174void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) 1202static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
1175{ 1203{
1176 int cpu = 0; 1204 int cpu = 0;
1177 1205
1178 memset(ret, 0, sizeof(*ret)); 1206 memset(ret, 0, sizeof(*ret));
1179 cpus_and(*cpumask, *cpumask, cpu_online_map);
1180 1207
1181 cpu = first_cpu(*cpumask); 1208 cpu = first_cpu(*cpumask);
1182 while (cpu < NR_CPUS) { 1209 while (cpu < NR_CPUS) {
@@ -1224,12 +1251,12 @@ void get_full_page_state(struct page_state *ret)
1224 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); 1251 __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
1225} 1252}
1226 1253
1227unsigned long __read_page_state(unsigned long offset) 1254unsigned long read_page_state_offset(unsigned long offset)
1228{ 1255{
1229 unsigned long ret = 0; 1256 unsigned long ret = 0;
1230 int cpu; 1257 int cpu;
1231 1258
1232 for_each_online_cpu(cpu) { 1259 for_each_cpu(cpu) {
1233 unsigned long in; 1260 unsigned long in;
1234 1261
1235 in = (unsigned long)&per_cpu(page_states, cpu) + offset; 1262 in = (unsigned long)&per_cpu(page_states, cpu) + offset;
@@ -1238,18 +1265,26 @@ unsigned long __read_page_state(unsigned long offset)
1238 return ret; 1265 return ret;
1239} 1266}
1240 1267
1241void __mod_page_state(unsigned long offset, unsigned long delta) 1268void __mod_page_state_offset(unsigned long offset, unsigned long delta)
1269{
1270 void *ptr;
1271
1272 ptr = &__get_cpu_var(page_states);
1273 *(unsigned long *)(ptr + offset) += delta;
1274}
1275EXPORT_SYMBOL(__mod_page_state_offset);
1276
1277void mod_page_state_offset(unsigned long offset, unsigned long delta)
1242{ 1278{
1243 unsigned long flags; 1279 unsigned long flags;
1244 void* ptr; 1280 void *ptr;
1245 1281
1246 local_irq_save(flags); 1282 local_irq_save(flags);
1247 ptr = &__get_cpu_var(page_states); 1283 ptr = &__get_cpu_var(page_states);
1248 *(unsigned long*)(ptr + offset) += delta; 1284 *(unsigned long *)(ptr + offset) += delta;
1249 local_irq_restore(flags); 1285 local_irq_restore(flags);
1250} 1286}
1251 1287EXPORT_SYMBOL(mod_page_state_offset);
1252EXPORT_SYMBOL(__mod_page_state);
1253 1288
1254void __get_zone_counts(unsigned long *active, unsigned long *inactive, 1289void __get_zone_counts(unsigned long *active, unsigned long *inactive,
1255 unsigned long *free, struct pglist_data *pgdat) 1290 unsigned long *free, struct pglist_data *pgdat)
@@ -1335,7 +1370,7 @@ void show_free_areas(void)
1335 show_node(zone); 1370 show_node(zone);
1336 printk("%s per-cpu:", zone->name); 1371 printk("%s per-cpu:", zone->name);
1337 1372
1338 if (!zone->present_pages) { 1373 if (!populated_zone(zone)) {
1339 printk(" empty\n"); 1374 printk(" empty\n");
1340 continue; 1375 continue;
1341 } else 1376 } else
@@ -1347,10 +1382,9 @@ void show_free_areas(void)
1347 pageset = zone_pcp(zone, cpu); 1382 pageset = zone_pcp(zone, cpu);
1348 1383
1349 for (temperature = 0; temperature < 2; temperature++) 1384 for (temperature = 0; temperature < 2; temperature++)
1350 printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", 1385 printk("cpu %d %s: high %d, batch %d used:%d\n",
1351 cpu, 1386 cpu,
1352 temperature ? "cold" : "hot", 1387 temperature ? "cold" : "hot",
1353 pageset->pcp[temperature].low,
1354 pageset->pcp[temperature].high, 1388 pageset->pcp[temperature].high,
1355 pageset->pcp[temperature].batch, 1389 pageset->pcp[temperature].batch,
1356 pageset->pcp[temperature].count); 1390 pageset->pcp[temperature].count);
@@ -1413,7 +1447,7 @@ void show_free_areas(void)
1413 1447
1414 show_node(zone); 1448 show_node(zone);
1415 printk("%s: ", zone->name); 1449 printk("%s: ", zone->name);
1416 if (!zone->present_pages) { 1450 if (!populated_zone(zone)) {
1417 printk("empty\n"); 1451 printk("empty\n");
1418 continue; 1452 continue;
1419 } 1453 }
@@ -1433,36 +1467,29 @@ void show_free_areas(void)
1433 1467
1434/* 1468/*
1435 * Builds allocation fallback zone lists. 1469 * Builds allocation fallback zone lists.
1470 *
1471 * Add all populated zones of a node to the zonelist.
1436 */ 1472 */
1437static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) 1473static int __init build_zonelists_node(pg_data_t *pgdat,
1438{ 1474 struct zonelist *zonelist, int nr_zones, int zone_type)
1439 switch (k) { 1475{
1440 struct zone *zone; 1476 struct zone *zone;
1441 default: 1477
1442 BUG(); 1478 BUG_ON(zone_type > ZONE_HIGHMEM);
1443 case ZONE_HIGHMEM: 1479
1444 zone = pgdat->node_zones + ZONE_HIGHMEM; 1480 do {
1445 if (zone->present_pages) { 1481 zone = pgdat->node_zones + zone_type;
1482 if (populated_zone(zone)) {
1446#ifndef CONFIG_HIGHMEM 1483#ifndef CONFIG_HIGHMEM
1447 BUG(); 1484 BUG_ON(zone_type > ZONE_NORMAL);
1448#endif 1485#endif
1449 zonelist->zones[j++] = zone; 1486 zonelist->zones[nr_zones++] = zone;
1487 check_highest_zone(zone_type);
1450 } 1488 }
1451 case ZONE_NORMAL: 1489 zone_type--;
1452 zone = pgdat->node_zones + ZONE_NORMAL;
1453 if (zone->present_pages)
1454 zonelist->zones[j++] = zone;
1455 case ZONE_DMA32:
1456 zone = pgdat->node_zones + ZONE_DMA32;
1457 if (zone->present_pages)
1458 zonelist->zones[j++] = zone;
1459 case ZONE_DMA:
1460 zone = pgdat->node_zones + ZONE_DMA;
1461 if (zone->present_pages)
1462 zonelist->zones[j++] = zone;
1463 }
1464 1490
1465 return j; 1491 } while (zone_type >= 0);
1492 return nr_zones;
1466} 1493}
1467 1494
1468static inline int highest_zone(int zone_bits) 1495static inline int highest_zone(int zone_bits)
@@ -1709,8 +1736,6 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
1709 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { 1736 for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
1710 if (!early_pfn_valid(pfn)) 1737 if (!early_pfn_valid(pfn))
1711 continue; 1738 continue;
1712 if (!early_pfn_in_nid(pfn, nid))
1713 continue;
1714 page = pfn_to_page(pfn); 1739 page = pfn_to_page(pfn);
1715 set_page_links(page, zone, nid, pfn); 1740 set_page_links(page, zone, nid, pfn);
1716 set_page_count(page, 1); 1741 set_page_count(page, 1);
@@ -1794,14 +1819,12 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
1794 1819
1795 pcp = &p->pcp[0]; /* hot */ 1820 pcp = &p->pcp[0]; /* hot */
1796 pcp->count = 0; 1821 pcp->count = 0;
1797 pcp->low = 0;
1798 pcp->high = 6 * batch; 1822 pcp->high = 6 * batch;
1799 pcp->batch = max(1UL, 1 * batch); 1823 pcp->batch = max(1UL, 1 * batch);
1800 INIT_LIST_HEAD(&pcp->list); 1824 INIT_LIST_HEAD(&pcp->list);
1801 1825
1802 pcp = &p->pcp[1]; /* cold*/ 1826 pcp = &p->pcp[1]; /* cold*/
1803 pcp->count = 0; 1827 pcp->count = 0;
1804 pcp->low = 0;
1805 pcp->high = 2 * batch; 1828 pcp->high = 2 * batch;
1806 pcp->batch = max(1UL, batch/2); 1829 pcp->batch = max(1UL, batch/2);
1807 INIT_LIST_HEAD(&pcp->list); 1830 INIT_LIST_HEAD(&pcp->list);
@@ -2116,7 +2139,7 @@ static int frag_show(struct seq_file *m, void *arg)
2116 int order; 2139 int order;
2117 2140
2118 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { 2141 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2119 if (!zone->present_pages) 2142 if (!populated_zone(zone))
2120 continue; 2143 continue;
2121 2144
2122 spin_lock_irqsave(&zone->lock, flags); 2145 spin_lock_irqsave(&zone->lock, flags);
@@ -2149,7 +2172,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2149 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { 2172 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
2150 int i; 2173 int i;
2151 2174
2152 if (!zone->present_pages) 2175 if (!populated_zone(zone))
2153 continue; 2176 continue;
2154 2177
2155 spin_lock_irqsave(&zone->lock, flags); 2178 spin_lock_irqsave(&zone->lock, flags);
@@ -2197,12 +2220,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
2197 seq_printf(m, 2220 seq_printf(m,
2198 "\n cpu: %i pcp: %i" 2221 "\n cpu: %i pcp: %i"
2199 "\n count: %i" 2222 "\n count: %i"
2200 "\n low: %i"
2201 "\n high: %i" 2223 "\n high: %i"
2202 "\n batch: %i", 2224 "\n batch: %i",
2203 i, j, 2225 i, j,
2204 pageset->pcp[j].count, 2226 pageset->pcp[j].count,
2205 pageset->pcp[j].low,
2206 pageset->pcp[j].high, 2227 pageset->pcp[j].high,
2207 pageset->pcp[j].batch); 2228 pageset->pcp[j].batch);
2208 } 2229 }
@@ -2257,32 +2278,40 @@ static char *vmstat_text[] = {
2257 "pgpgout", 2278 "pgpgout",
2258 "pswpin", 2279 "pswpin",
2259 "pswpout", 2280 "pswpout",
2260 "pgalloc_high",
2261 2281
2282 "pgalloc_high",
2262 "pgalloc_normal", 2283 "pgalloc_normal",
2284 "pgalloc_dma32",
2263 "pgalloc_dma", 2285 "pgalloc_dma",
2286
2264 "pgfree", 2287 "pgfree",
2265 "pgactivate", 2288 "pgactivate",
2266 "pgdeactivate", 2289 "pgdeactivate",
2267 2290
2268 "pgfault", 2291 "pgfault",
2269 "pgmajfault", 2292 "pgmajfault",
2293
2270 "pgrefill_high", 2294 "pgrefill_high",
2271 "pgrefill_normal", 2295 "pgrefill_normal",
2296 "pgrefill_dma32",
2272 "pgrefill_dma", 2297 "pgrefill_dma",
2273 2298
2274 "pgsteal_high", 2299 "pgsteal_high",
2275 "pgsteal_normal", 2300 "pgsteal_normal",
2301 "pgsteal_dma32",
2276 "pgsteal_dma", 2302 "pgsteal_dma",
2303
2277 "pgscan_kswapd_high", 2304 "pgscan_kswapd_high",
2278 "pgscan_kswapd_normal", 2305 "pgscan_kswapd_normal",
2279 2306 "pgscan_kswapd_dma32",
2280 "pgscan_kswapd_dma", 2307 "pgscan_kswapd_dma",
2308
2281 "pgscan_direct_high", 2309 "pgscan_direct_high",
2282 "pgscan_direct_normal", 2310 "pgscan_direct_normal",
2311 "pgscan_direct_dma32",
2283 "pgscan_direct_dma", 2312 "pgscan_direct_dma",
2284 "pginodesteal",
2285 2313
2314 "pginodesteal",
2286 "slabs_scanned", 2315 "slabs_scanned",
2287 "kswapd_steal", 2316 "kswapd_steal",
2288 "kswapd_inodesteal", 2317 "kswapd_inodesteal",