Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 772
1 file changed, 493 insertions(+), 279 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 17d5f539a9aa..a5f3c278c573 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -23,6 +23,7 @@
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/compiler.h> 24#include <linux/compiler.h>
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/kmemcheck.h>
26#include <linux/module.h> 27#include <linux/module.h>
27#include <linux/suspend.h> 28#include <linux/suspend.h>
28#include <linux/pagevec.h> 29#include <linux/pagevec.h>
@@ -161,17 +162,25 @@ static unsigned long __meminitdata dma_reserve;
161 162
162#if MAX_NUMNODES > 1 163#if MAX_NUMNODES > 1
163int nr_node_ids __read_mostly = MAX_NUMNODES; 164int nr_node_ids __read_mostly = MAX_NUMNODES;
165int nr_online_nodes __read_mostly = 1;
164EXPORT_SYMBOL(nr_node_ids); 166EXPORT_SYMBOL(nr_node_ids);
167EXPORT_SYMBOL(nr_online_nodes);
165#endif 168#endif
166 169
167int page_group_by_mobility_disabled __read_mostly; 170int page_group_by_mobility_disabled __read_mostly;
168 171
169static void set_pageblock_migratetype(struct page *page, int migratetype) 172static void set_pageblock_migratetype(struct page *page, int migratetype)
170{ 173{
174
175 if (unlikely(page_group_by_mobility_disabled))
176 migratetype = MIGRATE_UNMOVABLE;
177
171 set_pageblock_flags_group(page, (unsigned long)migratetype, 178 set_pageblock_flags_group(page, (unsigned long)migratetype,
172 PB_migrate, PB_migrate_end); 179 PB_migrate, PB_migrate_end);
173} 180}
174 181
182bool oom_killer_disabled __read_mostly;
183
175#ifdef CONFIG_DEBUG_VM 184#ifdef CONFIG_DEBUG_VM
176static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 185static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
177{ 186{
@@ -294,23 +303,6 @@ void prep_compound_page(struct page *page, unsigned long order)
294 } 303 }
295} 304}
296 305
297#ifdef CONFIG_HUGETLBFS
298void prep_compound_gigantic_page(struct page *page, unsigned long order)
299{
300 int i;
301 int nr_pages = 1 << order;
302 struct page *p = page + 1;
303
304 set_compound_page_dtor(page, free_compound_page);
305 set_compound_order(page, order);
306 __SetPageHead(page);
307 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
308 __SetPageTail(p);
309 p->first_page = page;
310 }
311}
312#endif
313
314static int destroy_compound_page(struct page *page, unsigned long order) 306static int destroy_compound_page(struct page *page, unsigned long order)
315{ 307{
316 int i; 308 int i;
@@ -417,7 +409,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
417 return 0; 409 return 0;
418 410
419 if (PageBuddy(buddy) && page_order(buddy) == order) { 411 if (PageBuddy(buddy) && page_order(buddy) == order) {
420 BUG_ON(page_count(buddy) != 0); 412 VM_BUG_ON(page_count(buddy) != 0);
421 return 1; 413 return 1;
422 } 414 }
423 return 0; 415 return 0;
@@ -448,22 +440,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
448 */ 440 */
449 441
450static inline void __free_one_page(struct page *page, 442static inline void __free_one_page(struct page *page,
451 struct zone *zone, unsigned int order) 443 struct zone *zone, unsigned int order,
444 int migratetype)
452{ 445{
453 unsigned long page_idx; 446 unsigned long page_idx;
454 int order_size = 1 << order;
455 int migratetype = get_pageblock_migratetype(page);
456 447
457 if (unlikely(PageCompound(page))) 448 if (unlikely(PageCompound(page)))
458 if (unlikely(destroy_compound_page(page, order))) 449 if (unlikely(destroy_compound_page(page, order)))
459 return; 450 return;
460 451
452 VM_BUG_ON(migratetype == -1);
453
461 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 454 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
462 455
463 VM_BUG_ON(page_idx & (order_size - 1)); 456 VM_BUG_ON(page_idx & ((1 << order) - 1));
464 VM_BUG_ON(bad_range(zone, page)); 457 VM_BUG_ON(bad_range(zone, page));
465 458
466 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
467 while (order < MAX_ORDER-1) { 459 while (order < MAX_ORDER-1) {
468 unsigned long combined_idx; 460 unsigned long combined_idx;
469 struct page *buddy; 461 struct page *buddy;
@@ -487,12 +479,27 @@ static inline void __free_one_page(struct page *page,
487 zone->free_area[order].nr_free++; 479 zone->free_area[order].nr_free++;
488} 480}
489 481
482#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
483/*
484 * free_page_mlock() -- clean up attempts to free and mlocked() page.
485 * Page should not be on lru, so no need to fix that up.
486 * free_pages_check() will verify...
487 */
488static inline void free_page_mlock(struct page *page)
489{
490 __ClearPageMlocked(page);
491 __dec_zone_page_state(page, NR_MLOCK);
492 __count_vm_event(UNEVICTABLE_MLOCKFREED);
493}
494#else
495static void free_page_mlock(struct page *page) { }
496#endif
497
490static inline int free_pages_check(struct page *page) 498static inline int free_pages_check(struct page *page)
491{ 499{
492 free_page_mlock(page);
493 if (unlikely(page_mapcount(page) | 500 if (unlikely(page_mapcount(page) |
494 (page->mapping != NULL) | 501 (page->mapping != NULL) |
495 (page_count(page) != 0) | 502 (atomic_read(&page->_count) != 0) |
496 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 503 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
497 bad_page(page); 504 bad_page(page);
498 return 1; 505 return 1;
@@ -519,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count,
519 spin_lock(&zone->lock); 526 spin_lock(&zone->lock);
520 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
521 zone->pages_scanned = 0; 528 zone->pages_scanned = 0;
529
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
522 while (count--) { 531 while (count--) {
523 struct page *page; 532 struct page *page;
524 533
@@ -526,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count,
526 page = list_entry(list->prev, struct page, lru); 535 page = list_entry(list->prev, struct page, lru);
527 /* have to delete it as __free_one_page list manipulates */ 536 /* have to delete it as __free_one_page list manipulates */
528 list_del(&page->lru); 537 list_del(&page->lru);
529 __free_one_page(page, zone, order); 538 __free_one_page(page, zone, order, page_private(page));
530 } 539 }
531 spin_unlock(&zone->lock); 540 spin_unlock(&zone->lock);
532} 541}
533 542
534static void free_one_page(struct zone *zone, struct page *page, int order) 543static void free_one_page(struct zone *zone, struct page *page, int order,
544 int migratetype)
535{ 545{
536 spin_lock(&zone->lock); 546 spin_lock(&zone->lock);
537 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 547 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
538 zone->pages_scanned = 0; 548 zone->pages_scanned = 0;
539 __free_one_page(page, zone, order); 549
550 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
551 __free_one_page(page, zone, order, migratetype);
540 spin_unlock(&zone->lock); 552 spin_unlock(&zone->lock);
541} 553}
542 554
@@ -545,6 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order)
545 unsigned long flags; 557 unsigned long flags;
546 int i; 558 int i;
547 int bad = 0; 559 int bad = 0;
560 int clearMlocked = PageMlocked(page);
561
562 kmemcheck_free_shadow(page, order);
548 563
549 for (i = 0 ; i < (1 << order) ; ++i) 564 for (i = 0 ; i < (1 << order) ; ++i)
550 bad += free_pages_check(page + i); 565 bad += free_pages_check(page + i);
@@ -560,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
560 kernel_map_pages(page, 1 << order, 0); 575 kernel_map_pages(page, 1 << order, 0);
561 576
562 local_irq_save(flags); 577 local_irq_save(flags);
578 if (unlikely(clearMlocked))
579 free_page_mlock(page);
563 __count_vm_events(PGFREE, 1 << order); 580 __count_vm_events(PGFREE, 1 << order);
564 free_one_page(page_zone(page), page, order); 581 free_one_page(page_zone(page), page, order,
582 get_pageblock_migratetype(page));
565 local_irq_restore(flags); 583 local_irq_restore(flags);
566} 584}
567 585
@@ -632,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
632{ 650{
633 if (unlikely(page_mapcount(page) | 651 if (unlikely(page_mapcount(page) |
634 (page->mapping != NULL) | 652 (page->mapping != NULL) |
635 (page_count(page) != 0) | 653 (atomic_read(&page->_count) != 0) |
636 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 654 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
637 bad_page(page); 655 bad_page(page);
638 return 1; 656 return 1;
@@ -657,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
657 * Go through the free lists for the given migratetype and remove 675 * Go through the free lists for the given migratetype and remove
658 * the smallest available page from the freelists 676 * the smallest available page from the freelists
659 */ 677 */
660static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 678static inline
679struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
661 int migratetype) 680 int migratetype)
662{ 681{
663 unsigned int current_order; 682 unsigned int current_order;
@@ -675,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
675 list_del(&page->lru); 694 list_del(&page->lru);
676 rmv_page_order(page); 695 rmv_page_order(page);
677 area->nr_free--; 696 area->nr_free--;
678 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
679 expand(zone, page, order, current_order, area, migratetype); 697 expand(zone, page, order, current_order, area, migratetype);
680 return page; 698 return page;
681 } 699 }
@@ -766,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page,
766} 784}
767 785
768/* Remove an element from the buddy allocator from the fallback list */ 786/* Remove an element from the buddy allocator from the fallback list */
769static struct page *__rmqueue_fallback(struct zone *zone, int order, 787static inline struct page *
770 int start_migratetype) 788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
771{ 789{
772 struct free_area * area; 790 struct free_area * area;
773 int current_order; 791 int current_order;
@@ -815,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
815 /* Remove the page from the freelists */ 833 /* Remove the page from the freelists */
816 list_del(&page->lru); 834 list_del(&page->lru);
817 rmv_page_order(page); 835 rmv_page_order(page);
818 __mod_zone_page_state(zone, NR_FREE_PAGES,
819 -(1UL << order));
820 836
821 if (current_order == pageblock_order) 837 if (current_order == pageblock_order)
822 set_pageblock_migratetype(page, 838 set_pageblock_migratetype(page,
@@ -827,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
827 } 843 }
828 } 844 }
829 845
830 /* Use MIGRATE_RESERVE rather than fail an allocation */ 846 return NULL;
831 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
832} 847}
833 848
834/* 849/*
@@ -840,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
840{ 855{
841 struct page *page; 856 struct page *page;
842 857
858retry_reserve:
843 page = __rmqueue_smallest(zone, order, migratetype); 859 page = __rmqueue_smallest(zone, order, migratetype);
844 860
845 if (unlikely(!page)) 861 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
846 page = __rmqueue_fallback(zone, order, migratetype); 862 page = __rmqueue_fallback(zone, order, migratetype);
847 863
864 /*
865 * Use MIGRATE_RESERVE rather than fail an allocation. goto
866 * is used because __rmqueue_smallest is an inline function
867 * and we want just one call site
868 */
869 if (!page) {
870 migratetype = MIGRATE_RESERVE;
871 goto retry_reserve;
872 }
873 }
874
848 return page; 875 return page;
849} 876}
850 877
@@ -878,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
878 set_page_private(page, migratetype); 905 set_page_private(page, migratetype);
879 list = &page->lru; 906 list = &page->lru;
880 } 907 }
908 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
881 spin_unlock(&zone->lock); 909 spin_unlock(&zone->lock);
882 return i; 910 return i;
883} 911}
@@ -993,6 +1021,9 @@ static void free_hot_cold_page(struct page *page, int cold)
993 struct zone *zone = page_zone(page); 1021 struct zone *zone = page_zone(page);
994 struct per_cpu_pages *pcp; 1022 struct per_cpu_pages *pcp;
995 unsigned long flags; 1023 unsigned long flags;
1024 int clearMlocked = PageMlocked(page);
1025
1026 kmemcheck_free_shadow(page, 0);
996 1027
997 if (PageAnon(page)) 1028 if (PageAnon(page))
998 page->mapping = NULL; 1029 page->mapping = NULL;
@@ -1007,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold)
1007 kernel_map_pages(page, 1, 0); 1038 kernel_map_pages(page, 1, 0);
1008 1039
1009 pcp = &zone_pcp(zone, get_cpu())->pcp; 1040 pcp = &zone_pcp(zone, get_cpu())->pcp;
1041 set_page_private(page, get_pageblock_migratetype(page));
1010 local_irq_save(flags); 1042 local_irq_save(flags);
1043 if (unlikely(clearMlocked))
1044 free_page_mlock(page);
1011 __count_vm_event(PGFREE); 1045 __count_vm_event(PGFREE);
1046
1012 if (cold) 1047 if (cold)
1013 list_add_tail(&page->lru, &pcp->list); 1048 list_add_tail(&page->lru, &pcp->list);
1014 else 1049 else
1015 list_add(&page->lru, &pcp->list); 1050 list_add(&page->lru, &pcp->list);
1016 set_page_private(page, get_pageblock_migratetype(page));
1017 pcp->count++; 1051 pcp->count++;
1018 if (pcp->count >= pcp->high) { 1052 if (pcp->count >= pcp->high) {
1019 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1053 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -1047,6 +1081,16 @@ void split_page(struct page *page, unsigned int order)
1047 1081
1048 VM_BUG_ON(PageCompound(page)); 1082 VM_BUG_ON(PageCompound(page));
1049 VM_BUG_ON(!page_count(page)); 1083 VM_BUG_ON(!page_count(page));
1084
1085#ifdef CONFIG_KMEMCHECK
1086 /*
1087 * Split shadow pages too, because free(page[0]) would
1088 * otherwise free the whole shadow.
1089 */
1090 if (kmemcheck_page_is_tracked(page))
1091 split_page(virt_to_page(page[0].shadow), order);
1092#endif
1093
1050 for (i = 1; i < (1 << order); i++) 1094 for (i = 1; i < (1 << order); i++)
1051 set_page_refcounted(page + i); 1095 set_page_refcounted(page + i);
1052} 1096}
@@ -1056,14 +1100,15 @@ void split_page(struct page *page, unsigned int order)
1056 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1100 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1057 * or two. 1101 * or two.
1058 */ 1102 */
1059static struct page *buffered_rmqueue(struct zone *preferred_zone, 1103static inline
1060 struct zone *zone, int order, gfp_t gfp_flags) 1104struct page *buffered_rmqueue(struct zone *preferred_zone,
1105 struct zone *zone, int order, gfp_t gfp_flags,
1106 int migratetype)
1061{ 1107{
1062 unsigned long flags; 1108 unsigned long flags;
1063 struct page *page; 1109 struct page *page;
1064 int cold = !!(gfp_flags & __GFP_COLD); 1110 int cold = !!(gfp_flags & __GFP_COLD);
1065 int cpu; 1111 int cpu;
1066 int migratetype = allocflags_to_migratetype(gfp_flags);
1067 1112
1068again: 1113again:
1069 cpu = get_cpu(); 1114 cpu = get_cpu();
@@ -1100,8 +1145,22 @@ again:
1100 list_del(&page->lru); 1145 list_del(&page->lru);
1101 pcp->count--; 1146 pcp->count--;
1102 } else { 1147 } else {
1148 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1149 /*
1150 * __GFP_NOFAIL is not to be used in new code.
1151 *
1152 * All __GFP_NOFAIL callers should be fixed so that they
1153 * properly detect and handle allocation failures.
1154 *
1155 * We most definitely don't want callers attempting to
1156 * allocate greater than single-page units with
1157 * __GFP_NOFAIL.
1158 */
1159 WARN_ON_ONCE(order > 0);
1160 }
1103 spin_lock_irqsave(&zone->lock, flags); 1161 spin_lock_irqsave(&zone->lock, flags);
1104 page = __rmqueue(zone, order, migratetype); 1162 page = __rmqueue(zone, order, migratetype);
1163 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1105 spin_unlock(&zone->lock); 1164 spin_unlock(&zone->lock);
1106 if (!page) 1165 if (!page)
1107 goto failed; 1166 goto failed;
@@ -1123,10 +1182,15 @@ failed:
1123 return NULL; 1182 return NULL;
1124} 1183}
1125 1184
1126#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1185/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1127#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1186#define ALLOC_WMARK_MIN WMARK_MIN
1128#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1187#define ALLOC_WMARK_LOW WMARK_LOW
1129#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1188#define ALLOC_WMARK_HIGH WMARK_HIGH
1189#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1190
1191/* Mask to get the watermark bits */
1192#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1193
1130#define ALLOC_HARDER 0x10 /* try to alloc harder */ 1194#define ALLOC_HARDER 0x10 /* try to alloc harder */
1131#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1195#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1132#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1196#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
@@ -1384,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1384 */ 1448 */
1385static struct page * 1449static struct page *
1386get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1450get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1387 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1451 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1452 struct zone *preferred_zone, int migratetype)
1388{ 1453{
1389 struct zoneref *z; 1454 struct zoneref *z;
1390 struct page *page = NULL; 1455 struct page *page = NULL;
1391 int classzone_idx; 1456 int classzone_idx;
1392 struct zone *zone, *preferred_zone; 1457 struct zone *zone;
1393 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1458 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1394 int zlc_active = 0; /* set if using zonelist_cache */ 1459 int zlc_active = 0; /* set if using zonelist_cache */
1395 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1460 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1396 1461
1397 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1398 &preferred_zone);
1399 if (!preferred_zone)
1400 return NULL;
1401
1402 classzone_idx = zone_idx(preferred_zone); 1462 classzone_idx = zone_idx(preferred_zone);
1403
1404zonelist_scan: 1463zonelist_scan:
1405 /* 1464 /*
1406 * Scan zonelist, looking for a zone with enough free. 1465 * Scan zonelist, looking for a zone with enough free.
@@ -1415,31 +1474,49 @@ zonelist_scan:
1415 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1474 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1416 goto try_next_zone; 1475 goto try_next_zone;
1417 1476
1477 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1418 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1478 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1419 unsigned long mark; 1479 unsigned long mark;
1420 if (alloc_flags & ALLOC_WMARK_MIN) 1480 int ret;
1421 mark = zone->pages_min; 1481
1422 else if (alloc_flags & ALLOC_WMARK_LOW) 1482 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1423 mark = zone->pages_low; 1483 if (zone_watermark_ok(zone, order, mark,
1424 else 1484 classzone_idx, alloc_flags))
1425 mark = zone->pages_high; 1485 goto try_this_zone;
1426 if (!zone_watermark_ok(zone, order, mark, 1486
1427 classzone_idx, alloc_flags)) { 1487 if (zone_reclaim_mode == 0)
1428 if (!zone_reclaim_mode || 1488 goto this_zone_full;
1429 !zone_reclaim(zone, gfp_mask, order)) 1489
1490 ret = zone_reclaim(zone, gfp_mask, order);
1491 switch (ret) {
1492 case ZONE_RECLAIM_NOSCAN:
1493 /* did not scan */
1494 goto try_next_zone;
1495 case ZONE_RECLAIM_FULL:
1496 /* scanned but unreclaimable */
1497 goto this_zone_full;
1498 default:
1499 /* did we reclaim enough */
1500 if (!zone_watermark_ok(zone, order, mark,
1501 classzone_idx, alloc_flags))
1430 goto this_zone_full; 1502 goto this_zone_full;
1431 } 1503 }
1432 } 1504 }
1433 1505
1434 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1506try_this_zone:
1507 page = buffered_rmqueue(preferred_zone, zone, order,
1508 gfp_mask, migratetype);
1435 if (page) 1509 if (page)
1436 break; 1510 break;
1437this_zone_full: 1511this_zone_full:
1438 if (NUMA_BUILD) 1512 if (NUMA_BUILD)
1439 zlc_mark_zone_full(zonelist, z); 1513 zlc_mark_zone_full(zonelist, z);
1440try_next_zone: 1514try_next_zone:
1441 if (NUMA_BUILD && !did_zlc_setup) { 1515 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1442 /* we do zlc_setup after the first zone is tried */ 1516 /*
1517 * we do zlc_setup after the first zone is tried but only
1518 * if there are multiple nodes make it worthwhile
1519 */
1443 allowednodes = zlc_setup(zonelist, alloc_flags); 1520 allowednodes = zlc_setup(zonelist, alloc_flags);
1444 zlc_active = 1; 1521 zlc_active = 1;
1445 did_zlc_setup = 1; 1522 did_zlc_setup = 1;
@@ -1454,47 +1531,217 @@ try_next_zone:
1454 return page; 1531 return page;
1455} 1532}
1456 1533
1534static inline int
1535should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1536 unsigned long pages_reclaimed)
1537{
1538 /* Do not loop if specifically requested */
1539 if (gfp_mask & __GFP_NORETRY)
1540 return 0;
1541
1542 /*
1543 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1544 * means __GFP_NOFAIL, but that may not be true in other
1545 * implementations.
1546 */
1547 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1548 return 1;
1549
1550 /*
1551 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1552 * specified, then we retry until we no longer reclaim any pages
1553 * (above), or we've reclaimed an order of pages at least as
1554 * large as the allocation's order. In both cases, if the
1555 * allocation still fails, we stop retrying.
1556 */
1557 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1558 return 1;
1559
1560 /*
1561 * Don't let big-order allocations loop unless the caller
1562 * explicitly requests that.
1563 */
1564 if (gfp_mask & __GFP_NOFAIL)
1565 return 1;
1566
1567 return 0;
1568}
1569
1570static inline struct page *
1571__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1572 struct zonelist *zonelist, enum zone_type high_zoneidx,
1573 nodemask_t *nodemask, struct zone *preferred_zone,
1574 int migratetype)
1575{
1576 struct page *page;
1577
1578 /* Acquire the OOM killer lock for the zones in zonelist */
1579 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1580 schedule_timeout_uninterruptible(1);
1581 return NULL;
1582 }
1583
1584 /*
1585 * Go through the zonelist yet one more time, keep very high watermark
1586 * here, this is only to catch a parallel oom killing, we must fail if
1587 * we're still under heavy pressure.
1588 */
1589 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1590 order, zonelist, high_zoneidx,
1591 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1592 preferred_zone, migratetype);
1593 if (page)
1594 goto out;
1595
1596 /* The OOM killer will not help higher order allocs */
1597 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
1598 goto out;
1599
1600 /* Exhausted what can be done so it's blamo time */
1601 out_of_memory(zonelist, gfp_mask, order);
1602
1603out:
1604 clear_zonelist_oom(zonelist, gfp_mask);
1605 return page;
1606}
1607
1608/* The really slow allocator path where we enter direct reclaim */
1609static inline struct page *
1610__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1611 struct zonelist *zonelist, enum zone_type high_zoneidx,
1612 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1613 int migratetype, unsigned long *did_some_progress)
1614{
1615 struct page *page = NULL;
1616 struct reclaim_state reclaim_state;
1617 struct task_struct *p = current;
1618
1619 cond_resched();
1620
1621 /* We now go into synchronous reclaim */
1622 cpuset_memory_pressure_bump();
1623
1624 /*
1625 * The task's cpuset might have expanded its set of allowable nodes
1626 */
1627 p->flags |= PF_MEMALLOC;
1628 lockdep_set_current_reclaim_state(gfp_mask);
1629 reclaim_state.reclaimed_slab = 0;
1630 p->reclaim_state = &reclaim_state;
1631
1632 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1633
1634 p->reclaim_state = NULL;
1635 lockdep_clear_current_reclaim_state();
1636 p->flags &= ~PF_MEMALLOC;
1637
1638 cond_resched();
1639
1640 if (order != 0)
1641 drain_all_pages();
1642
1643 if (likely(*did_some_progress))
1644 page = get_page_from_freelist(gfp_mask, nodemask, order,
1645 zonelist, high_zoneidx,
1646 alloc_flags, preferred_zone,
1647 migratetype);
1648 return page;
1649}
1650
1457/* 1651/*
1458 * This is the 'heart' of the zoned buddy allocator. 1652 * This is called in the allocator slow-path if the allocation request is of
1653 * sufficient urgency to ignore watermarks and take other desperate measures
1459 */ 1654 */
1460struct page * 1655static inline struct page *
1461__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1656__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1462 struct zonelist *zonelist, nodemask_t *nodemask) 1657 struct zonelist *zonelist, enum zone_type high_zoneidx,
1658 nodemask_t *nodemask, struct zone *preferred_zone,
1659 int migratetype)
1660{
1661 struct page *page;
1662
1663 do {
1664 page = get_page_from_freelist(gfp_mask, nodemask, order,
1665 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1666 preferred_zone, migratetype);
1667
1668 if (!page && gfp_mask & __GFP_NOFAIL)
1669 congestion_wait(WRITE, HZ/50);
1670 } while (!page && (gfp_mask & __GFP_NOFAIL));
1671
1672 return page;
1673}
1674
1675static inline
1676void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1677 enum zone_type high_zoneidx)
1463{ 1678{
1464 const gfp_t wait = gfp_mask & __GFP_WAIT;
1465 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1466 struct zoneref *z; 1679 struct zoneref *z;
1467 struct zone *zone; 1680 struct zone *zone;
1468 struct page *page;
1469 struct reclaim_state reclaim_state;
1470 struct task_struct *p = current;
1471 int do_retry;
1472 int alloc_flags;
1473 unsigned long did_some_progress;
1474 unsigned long pages_reclaimed = 0;
1475 1681
1476 lockdep_trace_alloc(gfp_mask); 1682 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1683 wakeup_kswapd(zone, order);
1684}
1477 1685
1478 might_sleep_if(wait); 1686static inline int
1687gfp_to_alloc_flags(gfp_t gfp_mask)
1688{
1689 struct task_struct *p = current;
1690 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1691 const gfp_t wait = gfp_mask & __GFP_WAIT;
1479 1692
1480 if (should_fail_alloc_page(gfp_mask, order)) 1693 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1481 return NULL; 1694 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
1482 1695
1483restart: 1696 /*
1484 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1697 * The caller may dip into page reserves a bit more if the caller
1698 * cannot run direct reclaim, or if the caller has realtime scheduling
1699 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1700 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1701 */
1702 alloc_flags |= (gfp_mask & __GFP_HIGH);
1485 1703
1486 if (unlikely(!z->zone)) { 1704 if (!wait) {
1705 alloc_flags |= ALLOC_HARDER;
1487 /* 1706 /*
1488 * Happens if we have an empty zonelist as a result of 1707 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1489 * GFP_THISNODE being used on a memoryless node 1708 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1490 */ 1709 */
1491 return NULL; 1710 alloc_flags &= ~ALLOC_CPUSET;
1711 } else if (unlikely(rt_task(p)))
1712 alloc_flags |= ALLOC_HARDER;
1713
1714 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1715 if (!in_interrupt() &&
1716 ((p->flags & PF_MEMALLOC) ||
1717 unlikely(test_thread_flag(TIF_MEMDIE))))
1718 alloc_flags |= ALLOC_NO_WATERMARKS;
1492 } 1719 }
1493 1720
1494 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1721 return alloc_flags;
1495 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1722}
1496 if (page) 1723
1497 goto got_pg; 1724static inline struct page *
1725__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1726 struct zonelist *zonelist, enum zone_type high_zoneidx,
1727 nodemask_t *nodemask, struct zone *preferred_zone,
1728 int migratetype)
1729{
1730 const gfp_t wait = gfp_mask & __GFP_WAIT;
1731 struct page *page = NULL;
1732 int alloc_flags;
1733 unsigned long pages_reclaimed = 0;
1734 unsigned long did_some_progress;
1735 struct task_struct *p = current;
1736
1737 /*
1738 * In the slowpath, we sanity check order to avoid ever trying to
1739 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
1740 * be using allocators in order of preference for an area that is
1741 * too large.
1742 */
1743 if (WARN_ON_ONCE(order >= MAX_ORDER))
1744 return NULL;
1498 1745
1499 /* 1746 /*
1500 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1747 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1507,154 +1754,83 @@ restart:
1507 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1754 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1508 goto nopage; 1755 goto nopage;
1509 1756
1510 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1757 wake_all_kswapd(order, zonelist, high_zoneidx);
1511 wakeup_kswapd(zone, order);
1512 1758
1513 /* 1759 /*
1514 * OK, we're below the kswapd watermark and have kicked background 1760 * OK, we're below the kswapd watermark and have kicked background
1515 * reclaim. Now things get more complex, so set up alloc_flags according 1761 * reclaim. Now things get more complex, so set up alloc_flags according
1516 * to how we want to proceed. 1762 * to how we want to proceed.
1517 *
1518 * The caller may dip into page reserves a bit more if the caller
1519 * cannot run direct reclaim, or if the caller has realtime scheduling
1520 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1521 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1522 */ 1763 */
1523 alloc_flags = ALLOC_WMARK_MIN; 1764 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1524 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1525 alloc_flags |= ALLOC_HARDER;
1526 if (gfp_mask & __GFP_HIGH)
1527 alloc_flags |= ALLOC_HIGH;
1528 if (wait)
1529 alloc_flags |= ALLOC_CPUSET;
1530 1765
1531 /* 1766restart:
1532 * Go through the zonelist again. Let __GFP_HIGH and allocations 1767 /* This is the last chance, in general, before the goto nopage. */
1533 * coming from realtime tasks go deeper into reserves.
1534 *
1535 * This is the last chance, in general, before the goto nopage.
1536 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1537 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1538 */
1539 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1768 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1540 high_zoneidx, alloc_flags); 1769 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
1770 preferred_zone, migratetype);
1541 if (page) 1771 if (page)
1542 goto got_pg; 1772 goto got_pg;
1543 1773
1544 /* This allocation should allow future memory freeing. */
1545
1546rebalance: 1774rebalance:
1547 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1775 /* Allocate without watermarks if the context allows */
1548 && !in_interrupt()) { 1776 if (alloc_flags & ALLOC_NO_WATERMARKS) {
1549 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1777 page = __alloc_pages_high_priority(gfp_mask, order,
1550nofail_alloc: 1778 zonelist, high_zoneidx, nodemask,
1551 /* go through the zonelist yet again, ignoring mins */ 1779 preferred_zone, migratetype);
1552 page = get_page_from_freelist(gfp_mask, nodemask, order, 1780 if (page)
1553 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1781 goto got_pg;
1554 if (page)
1555 goto got_pg;
1556 if (gfp_mask & __GFP_NOFAIL) {
1557 congestion_wait(WRITE, HZ/50);
1558 goto nofail_alloc;
1559 }
1560 }
1561 goto nopage;
1562 } 1782 }
1563 1783
1564 /* Atomic allocations - we can't balance anything */ 1784 /* Atomic allocations - we can't balance anything */
1565 if (!wait) 1785 if (!wait)
1566 goto nopage; 1786 goto nopage;
1567 1787
1568 cond_resched(); 1788 /* Avoid recursion of direct reclaim */
1789 if (p->flags & PF_MEMALLOC)
1790 goto nopage;
1791
1792 /* Try direct reclaim and then allocating */
1793 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1794 zonelist, high_zoneidx,
1795 nodemask,
1796 alloc_flags, preferred_zone,
1797 migratetype, &did_some_progress);
1798 if (page)
1799 goto got_pg;
1569 1800
1570 /* We now go into synchronous reclaim */
1571 cpuset_memory_pressure_bump();
1572 /* 1801 /*
1573 * The task's cpuset might have expanded its set of allowable nodes 1802 * If we failed to make any progress reclaiming, then we are
1803 * running out of options and have to consider going OOM
1574 */ 1804 */
1575 cpuset_update_task_memory_state(); 1805 if (!did_some_progress) {
1576 p->flags |= PF_MEMALLOC; 1806 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1577 1807 if (oom_killer_disabled)
1578 lockdep_set_current_reclaim_state(gfp_mask); 1808 goto nopage;
1579 reclaim_state.reclaimed_slab = 0; 1809 page = __alloc_pages_may_oom(gfp_mask, order,
1580 p->reclaim_state = &reclaim_state; 1810 zonelist, high_zoneidx,
1581 1811 nodemask, preferred_zone,
1582 did_some_progress = try_to_free_pages(zonelist, order, 1812 migratetype);
1583 gfp_mask, nodemask); 1813 if (page)
1584 1814 goto got_pg;
1585 p->reclaim_state = NULL;
1586 lockdep_clear_current_reclaim_state();
1587 p->flags &= ~PF_MEMALLOC;
1588
1589 cond_resched();
1590 1815
1591 if (order != 0) 1816 /*
1592 drain_all_pages(); 1817 * The OOM killer does not trigger for high-order
1818 * ~__GFP_NOFAIL allocations so if no progress is being
1819 * made, there are no other options and retrying is
1820 * unlikely to help.
1821 */
1822 if (order > PAGE_ALLOC_COSTLY_ORDER &&
1823 !(gfp_mask & __GFP_NOFAIL))
1824 goto nopage;
1593 1825
1594 if (likely(did_some_progress)) {
1595 page = get_page_from_freelist(gfp_mask, nodemask, order,
1596 zonelist, high_zoneidx, alloc_flags);
1597 if (page)
1598 goto got_pg;
1599 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1600 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1601 schedule_timeout_uninterruptible(1);
1602 goto restart; 1826 goto restart;
1603 } 1827 }
1604
1605 /*
1606 * Go through the zonelist yet one more time, keep
1607 * very high watermark here, this is only to catch
1608 * a parallel oom killing, we must fail if we're still
1609 * under heavy pressure.
1610 */
1611 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1612 order, zonelist, high_zoneidx,
1613 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1614 if (page) {
1615 clear_zonelist_oom(zonelist, gfp_mask);
1616 goto got_pg;
1617 }
1618
1619 /* The OOM killer will not help higher order allocs so fail */
1620 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1621 clear_zonelist_oom(zonelist, gfp_mask);
1622 goto nopage;
1623 }
1624
1625 out_of_memory(zonelist, gfp_mask, order);
1626 clear_zonelist_oom(zonelist, gfp_mask);
1627 goto restart;
1628 } 1828 }
1629 1829
1630 /* 1830 /* Check if we should retry the allocation */
1631 * Don't let big-order allocations loop unless the caller explicitly
1632 * requests that. Wait for some write requests to complete then retry.
1633 *
1634 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1635 * means __GFP_NOFAIL, but that may not be true in other
1636 * implementations.
1637 *
1638 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1639 * specified, then we retry until we no longer reclaim any pages
1640 * (above), or we've reclaimed an order of pages at least as
1641 * large as the allocation's order. In both cases, if the
1642 * allocation still fails, we stop retrying.
1643 */
1644 pages_reclaimed += did_some_progress; 1831 pages_reclaimed += did_some_progress;
1645 do_retry = 0; 1832 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1646 if (!(gfp_mask & __GFP_NORETRY)) { 1833 /* Wait for some write requests to complete then retry */
1647 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
1648 do_retry = 1;
1649 } else {
1650 if (gfp_mask & __GFP_REPEAT &&
1651 pages_reclaimed < (1 << order))
1652 do_retry = 1;
1653 }
1654 if (gfp_mask & __GFP_NOFAIL)
1655 do_retry = 1;
1656 }
1657 if (do_retry) {
1658 congestion_wait(WRITE, HZ/50); 1834 congestion_wait(WRITE, HZ/50);
1659 goto rebalance; 1835 goto rebalance;
1660 } 1836 }
@@ -1667,10 +1843,58 @@ nopage:
1667 dump_stack(); 1843 dump_stack();
1668 show_mem(); 1844 show_mem();
1669 } 1845 }
1846 return page;
1670got_pg: 1847got_pg:
1848 if (kmemcheck_enabled)
1849 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1850 return page;
1851
1852}
1853
1854/*
1855 * This is the 'heart' of the zoned buddy allocator.
1856 */
1857struct page *
1858__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1859 struct zonelist *zonelist, nodemask_t *nodemask)
1860{
1861 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1862 struct zone *preferred_zone;
1863 struct page *page;
1864 int migratetype = allocflags_to_migratetype(gfp_mask);
1865
1866 lockdep_trace_alloc(gfp_mask);
1867
1868 might_sleep_if(gfp_mask & __GFP_WAIT);
1869
1870 if (should_fail_alloc_page(gfp_mask, order))
1871 return NULL;
1872
1873 /*
1874 * Check the zones suitable for the gfp_mask contain at least one
1875 * valid zone. It's possible to have an empty zonelist as a result
1876 * of GFP_THISNODE and a memoryless node
1877 */
1878 if (unlikely(!zonelist->_zonerefs->zone))
1879 return NULL;
1880
1881 /* The preferred zone is used for statistics later */
1882 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1883 if (!preferred_zone)
1884 return NULL;
1885
1886 /* First allocation attempt */
1887 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1888 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1889 preferred_zone, migratetype);
1890 if (unlikely(!page))
1891 page = __alloc_pages_slowpath(gfp_mask, order,
1892 zonelist, high_zoneidx, nodemask,
1893 preferred_zone, migratetype);
1894
1671 return page; 1895 return page;
1672} 1896}
1673EXPORT_SYMBOL(__alloc_pages_internal); 1897EXPORT_SYMBOL(__alloc_pages_nodemask);
1674 1898
1675/* 1899/*
1676 * Common helper functions. 1900 * Common helper functions.
@@ -1799,7 +2023,7 @@ static unsigned int nr_free_zone_pages(int offset)
1799 2023
1800 for_each_zone_zonelist(zone, z, zonelist, offset) { 2024 for_each_zone_zonelist(zone, z, zonelist, offset) {
1801 unsigned long size = zone->present_pages; 2025 unsigned long size = zone->present_pages;
1802 unsigned long high = zone->pages_high; 2026 unsigned long high = high_wmark_pages(zone);
1803 if (size > high) 2027 if (size > high)
1804 sum += size - high; 2028 sum += size - high;
1805 } 2029 }
@@ -1891,19 +2115,14 @@ void show_free_areas(void)
1891 2115
1892 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2116 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1893 " inactive_file:%lu" 2117 " inactive_file:%lu"
1894//TODO: check/adjust line lengths
1895#ifdef CONFIG_UNEVICTABLE_LRU
1896 " unevictable:%lu" 2118 " unevictable:%lu"
1897#endif
1898 " dirty:%lu writeback:%lu unstable:%lu\n" 2119 " dirty:%lu writeback:%lu unstable:%lu\n"
1899 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2120 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1900 global_page_state(NR_ACTIVE_ANON), 2121 global_page_state(NR_ACTIVE_ANON),
1901 global_page_state(NR_ACTIVE_FILE), 2122 global_page_state(NR_ACTIVE_FILE),
1902 global_page_state(NR_INACTIVE_ANON), 2123 global_page_state(NR_INACTIVE_ANON),
1903 global_page_state(NR_INACTIVE_FILE), 2124 global_page_state(NR_INACTIVE_FILE),
1904#ifdef CONFIG_UNEVICTABLE_LRU
1905 global_page_state(NR_UNEVICTABLE), 2125 global_page_state(NR_UNEVICTABLE),
1906#endif
1907 global_page_state(NR_FILE_DIRTY), 2126 global_page_state(NR_FILE_DIRTY),
1908 global_page_state(NR_WRITEBACK), 2127 global_page_state(NR_WRITEBACK),
1909 global_page_state(NR_UNSTABLE_NFS), 2128 global_page_state(NR_UNSTABLE_NFS),
@@ -1927,25 +2146,21 @@ void show_free_areas(void)
1927 " inactive_anon:%lukB" 2146 " inactive_anon:%lukB"
1928 " active_file:%lukB" 2147 " active_file:%lukB"
1929 " inactive_file:%lukB" 2148 " inactive_file:%lukB"
1930#ifdef CONFIG_UNEVICTABLE_LRU
1931 " unevictable:%lukB" 2149 " unevictable:%lukB"
1932#endif
1933 " present:%lukB" 2150 " present:%lukB"
1934 " pages_scanned:%lu" 2151 " pages_scanned:%lu"
1935 " all_unreclaimable? %s" 2152 " all_unreclaimable? %s"
1936 "\n", 2153 "\n",
1937 zone->name, 2154 zone->name,
1938 K(zone_page_state(zone, NR_FREE_PAGES)), 2155 K(zone_page_state(zone, NR_FREE_PAGES)),
1939 K(zone->pages_min), 2156 K(min_wmark_pages(zone)),
1940 K(zone->pages_low), 2157 K(low_wmark_pages(zone)),
1941 K(zone->pages_high), 2158 K(high_wmark_pages(zone)),
1942 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2159 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1943 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2160 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1944 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2161 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1945 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2162 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1946#ifdef CONFIG_UNEVICTABLE_LRU
1947 K(zone_page_state(zone, NR_UNEVICTABLE)), 2163 K(zone_page_state(zone, NR_UNEVICTABLE)),
1948#endif
1949 K(zone->present_pages), 2164 K(zone->present_pages),
1950 zone->pages_scanned, 2165 zone->pages_scanned,
1951 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2166 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -2103,7 +2318,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2103} 2318}
2104 2319
2105 2320
2106#define MAX_NODE_LOAD (num_online_nodes()) 2321#define MAX_NODE_LOAD (nr_online_nodes)
2107static int node_load[MAX_NUMNODES]; 2322static int node_load[MAX_NUMNODES];
2108 2323
2109/** 2324/**
@@ -2312,7 +2527,7 @@ static void build_zonelists(pg_data_t *pgdat)
2312 2527
2313 /* NUMA-aware ordering of nodes */ 2528 /* NUMA-aware ordering of nodes */
2314 local_node = pgdat->node_id; 2529 local_node = pgdat->node_id;
2315 load = num_online_nodes(); 2530 load = nr_online_nodes;
2316 prev_node = local_node; 2531 prev_node = local_node;
2317 nodes_clear(used_mask); 2532 nodes_clear(used_mask);
2318 2533
@@ -2463,7 +2678,7 @@ void build_all_zonelists(void)
2463 2678
2464 printk("Built %i zonelists in %s order, mobility grouping %s. " 2679 printk("Built %i zonelists in %s order, mobility grouping %s. "
2465 "Total pages: %ld\n", 2680 "Total pages: %ld\n",
2466 num_online_nodes(), 2681 nr_online_nodes,
2467 zonelist_order_name[current_zonelist_order], 2682 zonelist_order_name[current_zonelist_order],
2468 page_group_by_mobility_disabled ? "off" : "on", 2683 page_group_by_mobility_disabled ? "off" : "on",
2469 vm_total_pages); 2684 vm_total_pages);
@@ -2542,8 +2757,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
2542 2757
2543/* 2758/*
2544 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2759 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2545 * of blocks reserved is based on zone->pages_min. The memory within the 2760 * of blocks reserved is based on min_wmark_pages(zone). The memory within
2546 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2761 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2547 * higher will lead to a bigger reserve which will get freed as contiguous 2762 * higher will lead to a bigger reserve which will get freed as contiguous
2548 * blocks as reclaim kicks in 2763 * blocks as reclaim kicks in
2549 */ 2764 */
@@ -2556,7 +2771,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2556 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2771 /* Get the start pfn, end pfn and the number of blocks to reserve */
2557 start_pfn = zone->zone_start_pfn; 2772 start_pfn = zone->zone_start_pfn;
2558 end_pfn = start_pfn + zone->spanned_pages; 2773 end_pfn = start_pfn + zone->spanned_pages;
2559 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2774 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2560 pageblock_order; 2775 pageblock_order;
2561 2776
2562 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2777 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
@@ -3488,7 +3703,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3488 zone_pcp_init(zone); 3703 zone_pcp_init(zone);
3489 for_each_lru(l) { 3704 for_each_lru(l) {
3490 INIT_LIST_HEAD(&zone->lru[l].list); 3705 INIT_LIST_HEAD(&zone->lru[l].list);
3491 zone->lru[l].nr_scan = 0; 3706 zone->lru[l].nr_saved_scan = 0;
3492 } 3707 }
3493 zone->reclaim_stat.recent_rotated[0] = 0; 3708 zone->reclaim_stat.recent_rotated[0] = 0;
3494 zone->reclaim_stat.recent_rotated[1] = 0; 3709 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -4025,6 +4240,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4025 early_node_map[i].start_pfn, 4240 early_node_map[i].start_pfn,
4026 early_node_map[i].end_pfn); 4241 early_node_map[i].end_pfn);
4027 4242
4243 /*
4244 * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init
4245 * that node_mask, clear it at first
4246 */
4247 nodes_clear(node_states[N_HIGH_MEMORY]);
4028 /* Initialise every node */ 4248 /* Initialise every node */
4029 mminit_verify_pageflags_layout(); 4249 mminit_verify_pageflags_layout();
4030 setup_nr_node_ids(); 4250 setup_nr_node_ids();
@@ -4159,8 +4379,8 @@ static void calculate_totalreserve_pages(void)
4159 max = zone->lowmem_reserve[j]; 4379 max = zone->lowmem_reserve[j];
4160 } 4380 }
4161 4381
4162 /* we treat pages_high as reserved pages. */ 4382 /* we treat the high watermark as reserved pages. */
4163 max += zone->pages_high; 4383 max += high_wmark_pages(zone);
4164 4384
4165 if (max > zone->present_pages) 4385 if (max > zone->present_pages)
4166 max = zone->present_pages; 4386 max = zone->present_pages;
@@ -4210,12 +4430,13 @@ static void setup_per_zone_lowmem_reserve(void)
4210} 4430}
4211 4431
4212/** 4432/**
4213 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4433 * setup_per_zone_wmarks - called when min_free_kbytes changes
4434 * or when memory is hot-{added|removed}
4214 * 4435 *
4215 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4436 * Ensures that the watermark[min,low,high] values for each zone are set
4216 * with respect to min_free_kbytes. 4437 * correctly with respect to min_free_kbytes.
4217 */ 4438 */
4218void setup_per_zone_pages_min(void) 4439void setup_per_zone_wmarks(void)
4219{ 4440{
4220 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4441 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4221 unsigned long lowmem_pages = 0; 4442 unsigned long lowmem_pages = 0;
@@ -4240,7 +4461,7 @@ void setup_per_zone_pages_min(void)
4240 * need highmem pages, so cap pages_min to a small 4461 * need highmem pages, so cap pages_min to a small
4241 * value here. 4462 * value here.
4242 * 4463 *
4243 * The (pages_high-pages_low) and (pages_low-pages_min) 4464 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
4244 * deltas controls asynch page reclaim, and so should 4465 * deltas controls asynch page reclaim, and so should
4245 * not be capped for highmem. 4466 * not be capped for highmem.
4246 */ 4467 */
@@ -4251,17 +4472,17 @@ void setup_per_zone_pages_min(void)
4251 min_pages = SWAP_CLUSTER_MAX; 4472 min_pages = SWAP_CLUSTER_MAX;
4252 if (min_pages > 128) 4473 if (min_pages > 128)
4253 min_pages = 128; 4474 min_pages = 128;
4254 zone->pages_min = min_pages; 4475 zone->watermark[WMARK_MIN] = min_pages;
4255 } else { 4476 } else {
4256 /* 4477 /*
4257 * If it's a lowmem zone, reserve a number of pages 4478 * If it's a lowmem zone, reserve a number of pages
4258 * proportionate to the zone's size. 4479 * proportionate to the zone's size.
4259 */ 4480 */
4260 zone->pages_min = tmp; 4481 zone->watermark[WMARK_MIN] = tmp;
4261 } 4482 }
4262 4483
4263 zone->pages_low = zone->pages_min + (tmp >> 2); 4484 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
4264 zone->pages_high = zone->pages_min + (tmp >> 1); 4485 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4265 setup_zone_migrate_reserve(zone); 4486 setup_zone_migrate_reserve(zone);
4266 spin_unlock_irqrestore(&zone->lock, flags); 4487 spin_unlock_irqrestore(&zone->lock, flags);
4267 } 4488 }
@@ -4271,8 +4492,6 @@ void setup_per_zone_pages_min(void)
4271} 4492}
4272 4493
4273/** 4494/**
4274 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4275 *
4276 * The inactive anon list should be small enough that the VM never has to 4495 * The inactive anon list should be small enough that the VM never has to
4277 * do too much work, but large enough that each inactive page has a chance 4496 * do too much work, but large enough that each inactive page has a chance
4278 * to be referenced again before it is swapped out. 4497 * to be referenced again before it is swapped out.
@@ -4293,21 +4512,26 @@ void setup_per_zone_pages_min(void)
4293 * 1TB 101 10GB 4512 * 1TB 101 10GB
4294 * 10TB 320 32GB 4513 * 10TB 320 32GB
4295 */ 4514 */
4296static void setup_per_zone_inactive_ratio(void) 4515void calculate_zone_inactive_ratio(struct zone *zone)
4297{ 4516{
4298 struct zone *zone; 4517 unsigned int gb, ratio;
4299
4300 for_each_zone(zone) {
4301 unsigned int gb, ratio;
4302 4518
4303 /* Zone size in gigabytes */ 4519 /* Zone size in gigabytes */
4304 gb = zone->present_pages >> (30 - PAGE_SHIFT); 4520 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4521 if (gb)
4305 ratio = int_sqrt(10 * gb); 4522 ratio = int_sqrt(10 * gb);
4306 if (!ratio) 4523 else
4307 ratio = 1; 4524 ratio = 1;
4308 4525
4309 zone->inactive_ratio = ratio; 4526 zone->inactive_ratio = ratio;
4310 } 4527}
4528
4529static void __init setup_per_zone_inactive_ratio(void)
4530{
4531 struct zone *zone;
4532
4533 for_each_zone(zone)
4534 calculate_zone_inactive_ratio(zone);
4311} 4535}
4312 4536
4313/* 4537/*
@@ -4334,7 +4558,7 @@ static void setup_per_zone_inactive_ratio(void)
4334 * 8192MB: 11584k 4558 * 8192MB: 11584k
4335 * 16384MB: 16384k 4559 * 16384MB: 16384k
4336 */ 4560 */
4337static int __init init_per_zone_pages_min(void) 4561static int __init init_per_zone_wmark_min(void)
4338{ 4562{
4339 unsigned long lowmem_kbytes; 4563 unsigned long lowmem_kbytes;
4340 4564
@@ -4345,12 +4569,12 @@ static int __init init_per_zone_pages_min(void)
4345 min_free_kbytes = 128; 4569 min_free_kbytes = 128;
4346 if (min_free_kbytes > 65536) 4570 if (min_free_kbytes > 65536)
4347 min_free_kbytes = 65536; 4571 min_free_kbytes = 65536;
4348 setup_per_zone_pages_min(); 4572 setup_per_zone_wmarks();
4349 setup_per_zone_lowmem_reserve(); 4573 setup_per_zone_lowmem_reserve();
4350 setup_per_zone_inactive_ratio(); 4574 setup_per_zone_inactive_ratio();
4351 return 0; 4575 return 0;
4352} 4576}
4353module_init(init_per_zone_pages_min) 4577module_init(init_per_zone_wmark_min)
4354 4578
4355/* 4579/*
4356 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4580 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -4362,7 +4586,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4362{ 4586{
4363 proc_dointvec(table, write, file, buffer, length, ppos); 4587 proc_dointvec(table, write, file, buffer, length, ppos);
4364 if (write) 4588 if (write)
4365 setup_per_zone_pages_min(); 4589 setup_per_zone_wmarks();
4366 return 0; 4590 return 0;
4367} 4591}
4368 4592
@@ -4406,7 +4630,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4406 * whenever sysctl_lowmem_reserve_ratio changes. 4630 * whenever sysctl_lowmem_reserve_ratio changes.
4407 * 4631 *
4408 * The reserve ratio obviously has absolutely no relation with the 4632 * The reserve ratio obviously has absolutely no relation with the
4409 * pages_min watermarks. The lowmem reserve ratio can only make sense 4633 * minimum watermarks. The lowmem reserve ratio can only make sense
4410 * if in function of the boot time zone sizes. 4634 * if in function of the boot time zone sizes.
4411 */ 4635 */
4412int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4636int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
@@ -4513,23 +4737,13 @@ void *__init alloc_large_system_hash(const char *tablename,
4513 else if (hashdist) 4737 else if (hashdist)
4514 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4738 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4515 else { 4739 else {
4516 unsigned long order = get_order(size);
4517 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4518 /* 4740 /*
4519 * If bucketsize is not a power-of-two, we may free 4741 * If bucketsize is not a power-of-two, we may free
4520 * some pages at the end of hash table. 4742 * some pages at the end of hash table which
4743 * alloc_pages_exact() automatically does
4521 */ 4744 */
4522 if (table) { 4745 if (get_order(size) < MAX_ORDER)
4523 unsigned long alloc_end = (unsigned long)table + 4746 table = alloc_pages_exact(size, GFP_ATOMIC);
4524 (PAGE_SIZE << order);
4525 unsigned long used = (unsigned long)table +
4526 PAGE_ALIGN(size);
4527 split_page(virt_to_page(table), order);
4528 while (used < alloc_end) {
4529 free_page(used);
4530 used += PAGE_SIZE;
4531 }
4532 }
4533 } 4747 }
4534 } while (!table && size > PAGE_SIZE && --log2qty); 4748 } while (!table && size > PAGE_SIZE && --log2qty);
4535 4749