Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 472
 1 file changed, 278 insertions(+), 194 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe14a8c87fc2..8c960b469593 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -36,6 +36,7 @@
 #include <linux/memory_hotplug.h>
 #include <linux/nodemask.h>
 #include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
 
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly;
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
+int percpu_pagelist_fraction;
+
+static void fastcall free_hot_cold_page(struct page *page, int cold);
 
 /*
  * results with 256, 32 in the lowmem_reserve sysctl:
@@ -81,6 +85,7 @@ int min_free_kbytes = 1024;
 unsigned long __initdata nr_kernel_pages;
 unsigned long __initdata nr_all_pages;
 
+#ifdef CONFIG_DEBUG_VM
 static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 {
 	int ret = 0;
@@ -122,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page)
 	return 0;
 }
 
-static void bad_page(const char *function, struct page *page)
+#else
+static inline int bad_range(struct zone *zone, struct page *page)
 {
-	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
-		function, current->comm, page);
-	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
-		(int)(2*sizeof(unsigned long)), (unsigned long)page->flags,
-		page->mapping, page_mapcount(page), page_count(page));
-	printk(KERN_EMERG "Backtrace:\n");
+	return 0;
+}
+#endif
+
+static void bad_page(struct page *page)
+{
+	printk(KERN_EMERG "Bad page state in process '%s'\n"
+		KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
+		KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
+		KERN_EMERG "Backtrace:\n",
+		current->comm, page, (int)(2*sizeof(unsigned long)),
+		(unsigned long)page->flags, page->mapping,
+		page_mapcount(page), page_count(page));
 	dump_stack();
-	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
 	page->flags &= ~(1 << PG_lru |
 			1 << PG_private |
 			1 << PG_locked |
@@ -184,19 +196,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	int i;
 	int nr_pages = 1 << order;
 
-	if (!PageCompound(page))
-		return;
-
-	if (page[1].index != order)
-		bad_page(__FUNCTION__, page);
+	if (unlikely(page[1].index != order))
+		bad_page(page);
 
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
-		if (!PageCompound(p))
-			bad_page(__FUNCTION__, page);
-		if (page_private(p) != (unsigned long)page)
-			bad_page(__FUNCTION__, page);
+		if (unlikely(!PageCompound(p) |
+				(page_private(p) != (unsigned long)page)))
+			bad_page(page);
 		ClearPageCompound(p);
 	}
 }
@@ -255,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
 /*
  * This function checks whether a page is free && is the buddy
  * we can do coalesce a page and its buddy if
- * (a) the buddy is free &&
- * (b) the buddy is on the buddy system &&
- * (c) a page and its buddy have the same order.
+ * (a) the buddy is not in a hole &&
+ * (b) the buddy is free &&
+ * (c) the buddy is on the buddy system &&
+ * (d) a page and its buddy have the same order.
  * for recording page's order, we use page_private(page) and PG_private.
  *
  */
 static inline int page_is_buddy(struct page *page, int order)
 {
+#ifdef CONFIG_HOLES_IN_ZONE
+	if (!pfn_valid(page_to_pfn(page)))
+		return 0;
+#endif
+
 	if (PagePrivate(page) &&
 	    (page_order(page) == order) &&
 	    page_count(page) == 0)
@@ -294,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order)
  * -- wli
  */
 
-static inline void __free_pages_bulk (struct page *page,
+static inline void __free_one_page(struct page *page,
 		struct zone *zone, unsigned int order)
 {
 	unsigned long page_idx;
 	int order_size = 1 << order;
 
-	if (unlikely(order))
+	if (unlikely(PageCompound(page)))
 		destroy_compound_page(page, order);
 
 	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
@@ -314,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page,
 		struct free_area *area;
 		struct page *buddy;
 
-		combined_idx = __find_combined_index(page_idx, order);
 		buddy = __page_find_buddy(page, page_idx, order);
-
-		if (bad_range(zone, buddy))
-			break;
 		if (!page_is_buddy(buddy, order))
 			break;	/* Move the buddy up one level. */
+
 		list_del(&buddy->lru);
 		area = zone->free_area + order;
 		area->nr_free--;
 		rmv_page_order(buddy);
+		combined_idx = __find_combined_index(page_idx, order);
 		page = page + (combined_idx - page_idx);
 		page_idx = combined_idx;
 		order++;
@@ -334,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page,
 	zone->free_area[order].nr_free++;
 }
 
-static inline int free_pages_check(const char *function, struct page *page)
+static inline int free_pages_check(struct page *page)
 {
-	if (	page_mapcount(page) ||
-		page->mapping != NULL ||
-		page_count(page) != 0 ||
+	if (unlikely(page_mapcount(page) |
+		(page->mapping != NULL) |
+		(page_count(page) != 0) |
 		(page->flags & (
 			1 << PG_lru |
 			1 << PG_private |
@@ -348,8 +360,8 @@ static inline int free_pages_check(const char *function, struct page *page)
 			1 << PG_slab |
 			1 << PG_swapcache |
 			1 << PG_writeback |
-			1 << PG_reserved )))
-		bad_page(function, page);
+			1 << PG_reserved ))))
+		bad_page(page);
 	if (PageDirty(page))
 		__ClearPageDirty(page);
 	/*
@@ -371,51 +383,90 @@ static inline int free_pages_check(const char *function, struct page *page)
 * And clear the zone's pages_scanned counter, to hold off the "all pages are
 * pinned" detection logic.
 */
-static int
-free_pages_bulk(struct zone *zone, int count,
-		struct list_head *list, unsigned int order)
+static void free_pages_bulk(struct zone *zone, int count,
+					struct list_head *list, int order)
 {
-	unsigned long flags;
-	struct page *page = NULL;
-	int ret = 0;
-
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
-	while (!list_empty(list) && count--) {
+	while (count--) {
+		struct page *page;
+
+		BUG_ON(list_empty(list));
 		page = list_entry(list->prev, struct page, lru);
-		/* have to delete it as __free_pages_bulk list manipulates */
+		/* have to delete it as __free_one_page list manipulates */
 		list_del(&page->lru);
-		__free_pages_bulk(page, zone, order);
-		ret++;
+		__free_one_page(page, zone, order);
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
-	return ret;
+	spin_unlock(&zone->lock);
 }
 
-void __free_pages_ok(struct page *page, unsigned int order)
+static void free_one_page(struct zone *zone, struct page *page, int order)
 {
 	LIST_HEAD(list);
+	list_add(&page->lru, &list);
+	free_pages_bulk(zone, 1, &list, order);
+}
+
+static void __free_pages_ok(struct page *page, unsigned int order)
+{
+	unsigned long flags;
 	int i;
 	int reserved = 0;
 
 	arch_free_page(page, order);
+	if (!PageHighMem(page))
+		mutex_debug_check_no_locks_freed(page_address(page),
+						PAGE_SIZE<<order);
 
 #ifndef CONFIG_MMU
-	if (order > 0)
-		for (i = 1 ; i < (1 << order) ; ++i)
-			__put_page(page + i);
+	for (i = 1 ; i < (1 << order) ; ++i)
+		__put_page(page + i);
 #endif
 
 	for (i = 0 ; i < (1 << order) ; ++i)
-		reserved += free_pages_check(__FUNCTION__, page + i);
+		reserved += free_pages_check(page + i);
 	if (reserved)
 		return;
 
-	list_add(&page->lru, &list);
-	mod_page_state(pgfree, 1 << order);
-	kernel_map_pages(page, 1<<order, 0);
-	free_pages_bulk(page_zone(page), 1, &list, order);
+	kernel_map_pages(page, 1 << order, 0);
+	local_irq_save(flags);
+	__mod_page_state(pgfree, 1 << order);
+	free_one_page(page_zone(page), page, order);
+	local_irq_restore(flags);
+}
+
+/*
+ * permit the bootmem allocator to evade page validation on high-order frees
+ */
+void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
+{
+	if (order == 0) {
+		__ClearPageReserved(page);
+		set_page_count(page, 0);
+
+		free_hot_cold_page(page, 0);
+	} else {
+		LIST_HEAD(list);
+		int loop;
+
+		for (loop = 0; loop < BITS_PER_LONG; loop++) {
+			struct page *p = &page[loop];
+
+			if (loop + 16 < BITS_PER_LONG)
+				prefetchw(p + 16);
+			__ClearPageReserved(p);
+			set_page_count(p, 0);
+		}
+
+		arch_free_page(page, order);
+
+		mod_page_state(pgfree, 1 << order);
+
+		list_add(&page->lru, &list);
+		kernel_map_pages(page, 1 << order, 0);
+		free_pages_bulk(page_zone(page), 1, &list, order);
+	}
 }
 
 
@@ -433,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order)
 *
 * -- wli
 */
-static inline struct page *
-expand(struct zone *zone, struct page *page,
+static inline void expand(struct zone *zone, struct page *page,
 	int low, int high, struct free_area *area)
 {
 	unsigned long size = 1 << high;
@@ -448,24 +498,6 @@ expand(struct zone *zone, struct page *page,
 		area->nr_free++;
 		set_page_order(&page[size], high);
 	}
-	return page;
-}
-
-void set_page_refs(struct page *page, int order)
-{
-#ifdef CONFIG_MMU
-	set_page_count(page, 1);
-#else
-	int i;
-
-	/*
-	 * We need to reference all the pages for this order, otherwise if
-	 * anyone accesses one of the pages with (get/put) it will be freed.
-	 * - eg: access_process_vm()
-	 */
-	for (i = 0; i < (1 << order); i++)
-		set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
 }
 
 /*
@@ -473,9 +505,9 @@ void set_page_refs(struct page *page, int order)
 */
 static int prep_new_page(struct page *page, int order)
 {
-	if (	page_mapcount(page) ||
-		page->mapping != NULL ||
-		page_count(page) != 0 ||
+	if (unlikely(page_mapcount(page) |
+		(page->mapping != NULL) |
+		(page_count(page) != 0) |
 		(page->flags & (
 			1 << PG_lru |
 			1 << PG_private |
@@ -486,8 +518,8 @@ static int prep_new_page(struct page *page, int order)
 			1 << PG_slab |
 			1 << PG_swapcache |
 			1 << PG_writeback |
-			1 << PG_reserved )))
-		bad_page(__FUNCTION__, page);
+			1 << PG_reserved ))))
+		bad_page(page);
 
 	/*
 	 * For now, we report if PG_reserved was found set, but do not
@@ -525,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
 		rmv_page_order(page);
 		area->nr_free--;
 		zone->free_pages -= 1UL << order;
-		return expand(zone, page, order, current_order, area);
+		expand(zone, page, order, current_order, area);
+		return page;
 	}
 
 	return NULL;
@@ -539,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order)
 static int rmqueue_bulk(struct zone *zone, unsigned int order,
 			unsigned long count, struct list_head *list)
 {
-	unsigned long flags;
 	int i;
-	int allocated = 0;
-	struct page *page;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	for (i = 0; i < count; ++i) {
-		page = __rmqueue(zone, order);
-		if (page == NULL)
+		struct page *page = __rmqueue(zone, order);
+		if (unlikely(page == NULL))
 			break;
-		allocated++;
 		list_add_tail(&page->lru, list);
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
-	return allocated;
+	spin_unlock(&zone->lock);
+	return i;
 }
 
 #ifdef CONFIG_NUMA
@@ -572,14 +601,13 @@ void drain_remote_pages(void)
 		if (zone->zone_pgdat->node_id == numa_node_id())
 			continue;
 
-		pset = zone->pageset[smp_processor_id()];
+		pset = zone_pcp(zone, smp_processor_id());
 		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
 			struct per_cpu_pages *pcp;
 
 			pcp = &pset->pcp[i];
-			if (pcp->count)
-				pcp->count -= free_pages_bulk(zone, pcp->count,
-						&pcp->list, 0);
+			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+			pcp->count = 0;
 		}
 	}
 	local_irq_restore(flags);
@@ -589,6 +617,7 @@ void drain_remote_pages(void)
 #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
 static void __drain_pages(unsigned int cpu)
 {
+	unsigned long flags;
 	struct zone *zone;
 	int i;
 
@@ -600,8 +629,10 @@ static void __drain_pages(unsigned int cpu)
 			struct per_cpu_pages *pcp;
 
 			pcp = &pset->pcp[i];
-			pcp->count -= free_pages_bulk(zone, pcp->count,
-						&pcp->list, 0);
+			local_irq_save(flags);
+			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+			pcp->count = 0;
+			local_irq_restore(flags);
 		}
 	}
 }
@@ -647,18 +678,14 @@ void drain_local_pages(void)
 }
 #endif /* CONFIG_PM */
 
-static void zone_statistics(struct zonelist *zonelist, struct zone *z)
+static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
 {
 #ifdef CONFIG_NUMA
-	unsigned long flags;
-	int cpu;
 	pg_data_t *pg = z->zone_pgdat;
 	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
 	struct per_cpu_pageset *p;
 
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	p = zone_pcp(z,cpu);
+	p = zone_pcp(z, cpu);
 	if (pg == orig) {
 		p->numa_hit++;
 	} else {
@@ -669,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z)
 		p->local_node++;
 	else
 		p->other_node++;
-	local_irq_restore(flags);
 #endif
 }
 
 /*
 * Free a 0-order page
 */
-static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
 static void fastcall free_hot_cold_page(struct page *page, int cold)
 {
 	struct zone *zone = page_zone(page);
@@ -687,18 +712,20 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
 
 	if (PageAnon(page))
 		page->mapping = NULL;
-	if (free_pages_check(__FUNCTION__, page))
+	if (free_pages_check(page))
 		return;
 
-	inc_page_state(pgfree);
 	kernel_map_pages(page, 1, 0);
 
 	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
 	local_irq_save(flags);
+	__inc_page_state(pgfree);
 	list_add(&page->lru, &pcp->list);
 	pcp->count++;
-	if (pcp->count >= pcp->high)
-		pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+	if (pcp->count >= pcp->high) {
+		free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+		pcp->count -= pcp->batch;
+	}
 	local_irq_restore(flags);
 	put_cpu();
 }
@@ -727,49 +754,58 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 * we cheat by calling it from here, in the order > 0 path. Saves a branch
 * or two.
 */
-static struct page *
-buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
+static struct page *buffered_rmqueue(struct zonelist *zonelist,
+			struct zone *zone, int order, gfp_t gfp_flags)
 {
 	unsigned long flags;
 	struct page *page;
 	int cold = !!(gfp_flags & __GFP_COLD);
+	int cpu;
 
 again:
-	if (order == 0) {
+	cpu = get_cpu();
+	if (likely(order == 0)) {
 		struct per_cpu_pages *pcp;
 
-		page = NULL;
-		pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
+		pcp = &zone_pcp(zone, cpu)->pcp[cold];
 		local_irq_save(flags);
-		if (pcp->count <= pcp->low)
+		if (!pcp->count) {
 			pcp->count += rmqueue_bulk(zone, 0,
 						pcp->batch, &pcp->list);
-		if (pcp->count) {
-			page = list_entry(pcp->list.next, struct page, lru);
-			list_del(&page->lru);
-			pcp->count--;
+			if (unlikely(!pcp->count))
+				goto failed;
 		}
-		local_irq_restore(flags);
-		put_cpu();
+		page = list_entry(pcp->list.next, struct page, lru);
+		list_del(&page->lru);
+		pcp->count--;
 	} else {
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order);
-		spin_unlock_irqrestore(&zone->lock, flags);
+		spin_unlock(&zone->lock);
+		if (!page)
+			goto failed;
 	}
 
-	if (page != NULL) {
-		BUG_ON(bad_range(zone, page));
-		mod_page_state_zone(zone, pgalloc, 1 << order);
-		if (prep_new_page(page, order))
-			goto again;
+	__mod_page_state_zone(zone, pgalloc, 1 << order);
+	zone_statistics(zonelist, zone, cpu);
+	local_irq_restore(flags);
+	put_cpu();
 
-		if (gfp_flags & __GFP_ZERO)
-			prep_zero_page(page, order, gfp_flags);
+	BUG_ON(bad_range(zone, page));
+	if (prep_new_page(page, order))
+		goto again;
 
-		if (order && (gfp_flags & __GFP_COMP))
-			prep_compound_page(page, order);
-	}
+	if (gfp_flags & __GFP_ZERO)
+		prep_zero_page(page, order, gfp_flags);
+
+	if (order && (gfp_flags & __GFP_COMP))
+		prep_compound_page(page, order);
 	return page;
+
+failed:
+	local_irq_restore(flags);
+	put_cpu();
+	return NULL;
 }
 
 #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */
@@ -845,9 +881,8 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
 			continue;
 		}
 
-		page = buffered_rmqueue(*z, order, gfp_mask);
+		page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
 		if (page) {
-			zone_statistics(zonelist, *z);
 			break;
 		}
 	} while (*(++z) != NULL);
@@ -896,15 +931,15 @@ restart:
 	 *
 	 * The caller may dip into page reserves a bit more if the caller
 	 * cannot run direct reclaim, or if the caller has realtime scheduling
-	 * policy.
+	 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
+	 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
 	 */
 	alloc_flags = ALLOC_WMARK_MIN;
 	if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
 		alloc_flags |= ALLOC_HARDER;
 	if (gfp_mask & __GFP_HIGH)
 		alloc_flags |= ALLOC_HIGH;
-	if (wait)
-		alloc_flags |= ALLOC_CPUSET;
+	alloc_flags |= ALLOC_CPUSET;
 
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
@@ -926,7 +961,7 @@ restart:
 nofail_alloc:
 			/* go through the zonelist yet again, ignoring mins */
 			page = get_page_from_freelist(gfp_mask, order,
-				zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET);
+				zonelist, ALLOC_NO_WATERMARKS);
 			if (page)
 				goto got_pg;
 			if (gfp_mask & __GFP_NOFAIL) {
@@ -945,6 +980,7 @@ rebalance:
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
@@ -1171,7 +1207,7 @@ EXPORT_SYMBOL(nr_pagecache);
 DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 #endif
 
-void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
+static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
 	int cpu = 0;
 
@@ -1224,7 +1260,7 @@ void get_full_page_state(struct page_state *ret)
 	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
 }
 
-unsigned long __read_page_state(unsigned long offset)
+unsigned long read_page_state_offset(unsigned long offset)
 {
 	unsigned long ret = 0;
 	int cpu;
@@ -1238,18 +1274,26 @@ unsigned long __read_page_state(unsigned long offset)
 	return ret;
 }
 
-void __mod_page_state(unsigned long offset, unsigned long delta)
+void __mod_page_state_offset(unsigned long offset, unsigned long delta)
+{
+	void *ptr;
+
+	ptr = &__get_cpu_var(page_states);
+	*(unsigned long *)(ptr + offset) += delta;
+}
+EXPORT_SYMBOL(__mod_page_state_offset);
+
+void mod_page_state_offset(unsigned long offset, unsigned long delta)
 {
 	unsigned long flags;
-	void* ptr;
+	void *ptr;
 
 	local_irq_save(flags);
 	ptr = &__get_cpu_var(page_states);
-	*(unsigned long*)(ptr + offset) += delta;
+	*(unsigned long *)(ptr + offset) += delta;
 	local_irq_restore(flags);
 }
-
-EXPORT_SYMBOL(__mod_page_state);
+EXPORT_SYMBOL(mod_page_state_offset);
 
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat)
@@ -1335,7 +1379,7 @@ void show_free_areas(void)
 		show_node(zone);
 		printk("%s per-cpu:", zone->name);
 
-		if (!zone->present_pages) {
+		if (!populated_zone(zone)) {
 			printk(" empty\n");
 			continue;
 		} else
@@ -1347,10 +1391,9 @@ void show_free_areas(void)
 		pageset = zone_pcp(zone, cpu);
 
 		for (temperature = 0; temperature < 2; temperature++)
-			printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
+			printk("cpu %d %s: high %d, batch %d used:%d\n",
 				cpu,
 				temperature ? "cold" : "hot",
-				pageset->pcp[temperature].low,
 				pageset->pcp[temperature].high,
 				pageset->pcp[temperature].batch,
 				pageset->pcp[temperature].count);
@@ -1413,7 +1456,7 @@ void show_free_areas(void)
 
 		show_node(zone);
 		printk("%s: ", zone->name);
-		if (!zone->present_pages) {
+		if (!populated_zone(zone)) {
 			printk("empty\n");
 			continue;
 		}
@@ -1433,36 +1476,29 @@ void show_free_areas(void)
 
 /*
 * Builds allocation fallback zone lists.
+ *
+ * Add all populated zones of a node to the zonelist.
 */
-static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
-{
-	switch (k) {
-		struct zone *zone;
-	default:
-		BUG();
-	case ZONE_HIGHMEM:
-		zone = pgdat->node_zones + ZONE_HIGHMEM;
-		if (zone->present_pages) {
+static int __init build_zonelists_node(pg_data_t *pgdat,
+			struct zonelist *zonelist, int nr_zones, int zone_type)
+{
+	struct zone *zone;
+
+	BUG_ON(zone_type > ZONE_HIGHMEM);
+
+	do {
+		zone = pgdat->node_zones + zone_type;
+		if (populated_zone(zone)) {
 #ifndef CONFIG_HIGHMEM
-			BUG();
+			BUG_ON(zone_type > ZONE_NORMAL);
 #endif
-			zonelist->zones[j++] = zone;
+			zonelist->zones[nr_zones++] = zone;
+			check_highest_zone(zone_type);
 		}
-	case ZONE_NORMAL:
-		zone = pgdat->node_zones + ZONE_NORMAL;
-		if (zone->present_pages)
-			zonelist->zones[j++] = zone;
-	case ZONE_DMA32:
-		zone = pgdat->node_zones + ZONE_DMA32;
-		if (zone->present_pages)
-			zonelist->zones[j++] = zone;
-	case ZONE_DMA:
-		zone = pgdat->node_zones + ZONE_DMA;
-		if (zone->present_pages)
-			zonelist->zones[j++] = zone;
-	}
+		zone_type--;
 
-	return j;
+	} while (zone_type >= 0);
+	return nr_zones;
 }
 
 static inline int highest_zone(int zone_bits)
@@ -1706,11 +1742,9 @@ void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 	unsigned long end_pfn = start_pfn + size;
 	unsigned long pfn;
 
-	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
+	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 		if (!early_pfn_valid(pfn))
 			continue;
-		if (!early_pfn_in_nid(pfn, nid))
-			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
 		set_page_count(page, 1);
@@ -1794,19 +1828,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
 
 	pcp = &p->pcp[0];	/* hot */
 	pcp->count = 0;
-	pcp->low = 0;
 	pcp->high = 6 * batch;
 	pcp->batch = max(1UL, 1 * batch);
 	INIT_LIST_HEAD(&pcp->list);
 
 	pcp = &p->pcp[1];	/* cold*/
 	pcp->count = 0;
-	pcp->low = 0;
 	pcp->high = 2 * batch;
 	pcp->batch = max(1UL, batch/2);
 	INIT_LIST_HEAD(&pcp->list);
 }
 
+/*
+ * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
+ * to the value high for the pageset p.
+ */
+
+static void setup_pagelist_highmark(struct per_cpu_pageset *p,
+				unsigned long high)
+{
+	struct per_cpu_pages *pcp;
+
+	pcp = &p->pcp[0]; /* hot list */
+	pcp->high = high;
+	pcp->batch = max(1UL, high/4);
+	if ((high/4) > (PAGE_SHIFT * 8))
+		pcp->batch = PAGE_SHIFT * 8;
+}
+
+
 #ifdef CONFIG_NUMA
 /*
 * Boot pageset table. One per cpu which is going to be used for all
@@ -1838,12 +1888,16 @@ static int __devinit process_zones(int cpu)
 
 	for_each_zone(zone) {
 
-		zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
+		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
					 GFP_KERNEL, cpu_to_node(cpu));
-		if (!zone->pageset[cpu])
+		if (!zone_pcp(zone, cpu))
 			goto bad;
 
-		setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
+		setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
+
+		if (percpu_pagelist_fraction)
+			setup_pagelist_highmark(zone_pcp(zone, cpu),
+			    (zone->present_pages / percpu_pagelist_fraction));
 	}
 
 	return 0;
@@ -1851,15 +1905,14 @@ bad:
 	for_each_zone(dzone) {
 		if (dzone == zone)
 			break;
-		kfree(dzone->pageset[cpu]);
-		dzone->pageset[cpu] = NULL;
+		kfree(zone_pcp(dzone, cpu));
+		zone_pcp(dzone, cpu) = NULL;
 	}
 	return -ENOMEM;
 }
 
 static inline void free_zone_pagesets(int cpu)
 {
-#ifdef CONFIG_NUMA
 	struct zone *zone;
 
 	for_each_zone(zone) {
@@ -1868,7 +1921,6 @@ static inline void free_zone_pagesets(int cpu)
 		zone_pcp(zone, cpu) = NULL;
 		kfree(pset);
 	}
-#endif
 }
 
 static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
@@ -1939,7 +1991,7 @@ static __devinit void zone_pcp_init(struct zone *zone)
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
 #ifdef CONFIG_NUMA
 		/* Early boot. Slab allocator not functional yet */
-		zone->pageset[cpu] = &boot_pageset[cpu];
+		zone_pcp(zone, cpu) = &boot_pageset[cpu];
 		setup_pageset(&boot_pageset[cpu],0);
 #else
 		setup_pageset(zone_pcp(zone,cpu), batch);
@@ -2116,7 +2168,7 @@ static int frag_show(struct seq_file *m, void *arg)
 	int order;
 
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
-		if (!zone->present_pages)
+		if (!populated_zone(zone))
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
@@ -2149,7 +2201,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
 		int i;
 
-		if (!zone->present_pages)
+		if (!populated_zone(zone))
 			continue;
 
 		spin_lock_irqsave(&zone->lock, flags);
@@ -2182,7 +2234,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 		seq_printf(m,
			   ")"
			   "\n pagesets");
-		for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
+		for_each_online_cpu(i) {
 			struct per_cpu_pageset *pageset;
 			int j;
 
@@ -2197,12 +2249,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
 			seq_printf(m,
				   "\n cpu: %i pcp: %i"
				   "\n              count: %i"
-				   "\n              low:   %i"
				   "\n              high:  %i"
				   "\n              batch: %i",
				   i, j,
				   pageset->pcp[j].count,
-				   pageset->pcp[j].low,
				   pageset->pcp[j].high,
				   pageset->pcp[j].batch);
 		}
@@ -2257,32 +2307,40 @@ static char *vmstat_text[] = {
 	"pgpgout",
 	"pswpin",
 	"pswpout",
-	"pgalloc_high",
 
+	"pgalloc_high",
 	"pgalloc_normal",
+	"pgalloc_dma32",
 	"pgalloc_dma",
+
 	"pgfree",
 	"pgactivate",
 	"pgdeactivate",
 
 	"pgfault",
 	"pgmajfault",
+
 	"pgrefill_high",
 	"pgrefill_normal",
+	"pgrefill_dma32",
 	"pgrefill_dma",
 
 	"pgsteal_high",
 	"pgsteal_normal",
+	"pgsteal_dma32",
 	"pgsteal_dma",
+
 	"pgscan_kswapd_high",
 	"pgscan_kswapd_normal",
-
+	"pgscan_kswapd_dma32",
 	"pgscan_kswapd_dma",
+
 	"pgscan_direct_high",
 	"pgscan_direct_normal",
+	"pgscan_direct_dma32",
 	"pgscan_direct_dma",
-	"pginodesteal",
 
+	"pginodesteal",
 	"slabs_scanned",
 	"kswapd_steal",
 	"kswapd_inodesteal",
@@ -2539,6 +2597,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
 	return 0;
 }
 
+/*
+ * percpu_pagelist_fraction - changes the pcp->high for each zone on each
+ * cpu.  It is the fraction of total pages in each zone that a hot per cpu pagelist
+ * can have before it gets flushed back to buddy allocator.
+ */
+
+int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	struct zone *zone;
+	unsigned int cpu;
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+	if (!write || (ret == -EINVAL))
+		return ret;
+	for_each_zone(zone) {
+		for_each_online_cpu(cpu) {
+			unsigned long high;
+			high = zone->present_pages / percpu_pagelist_fraction;
+			setup_pagelist_highmark(zone_pcp(zone, cpu), high);
+		}
+	}
+	return 0;
+}
+
 __initdata int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA