path: root/mm/page_alloc.c
author    Linus Torvalds <torvalds@linux-foundation.org>    2009-06-16 22:50:13 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2009-06-16 22:50:13 -0400
commit    517d08699b250021303f9a7cf0d758b6dc0748ed (patch)
tree      5e5b0134c3fffb78fe9d8b1641a64ff28fdd7bbc /mm/page_alloc.c
parent    8eeee4e2f04fc551f50c9d9847da2d73d7d33728 (diff)
parent    a34601c5d84134055782ee031d58d82f5440e918 (diff)
Merge branch 'akpm'
* akpm: (182 commits)
    fbdev: bf54x-lq043fb: use kzalloc over kmalloc/memset
    fbdev: *bfin*: fix __dev{init,exit} markings
    fbdev: *bfin*: drop unnecessary calls to memset
    fbdev: bfin-t350mcqb-fb: drop unused local variables
    fbdev: blackfin has __raw I/O accessors, so use them in fb.h
    fbdev: s1d13xxxfb: add accelerated bitblt functions
    tcx: use standard fields for framebuffer physical address and length
    fbdev: add support for handoff from firmware to hw framebuffers
    intelfb: fix a bug when changing video timing
    fbdev: use framebuffer_release() for freeing fb_info structures
    radeon: P2G2CLK_ALWAYS_ONb tested twice, should 2nd be P2G2CLK_DAC_ALWAYS_ONb?
    s3c-fb: CPUFREQ frequency scaling support
    s3c-fb: fix resource releasing on error during probing
    carminefb: fix possible access beyond end of carmine_modedb[]
    acornfb: remove fb_mmap function
    mb862xxfb: use CONFIG_OF instead of CONFIG_PPC_OF
    mb862xxfb: restrict compliation of platform driver to PPC
    Samsung SoC Framebuffer driver: add Alpha Channel support
    atmel-lcdc: fix pixclock upper bound detection
    offb: use framebuffer_alloc() to allocate fb_info struct
    ...

Manually fix up conflicts due to kmemcheck in mm/slab.c
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c  754
1 file changed, 475 insertions(+), 279 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0727896a88ac..a5f3c278c573 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -162,17 +162,25 @@ static unsigned long __meminitdata dma_reserve;
162 162
163#if MAX_NUMNODES > 1 163#if MAX_NUMNODES > 1
164int nr_node_ids __read_mostly = MAX_NUMNODES; 164int nr_node_ids __read_mostly = MAX_NUMNODES;
165int nr_online_nodes __read_mostly = 1;
165EXPORT_SYMBOL(nr_node_ids); 166EXPORT_SYMBOL(nr_node_ids);
167EXPORT_SYMBOL(nr_online_nodes);
166#endif 168#endif
167 169
168int page_group_by_mobility_disabled __read_mostly; 170int page_group_by_mobility_disabled __read_mostly;
169 171
170static void set_pageblock_migratetype(struct page *page, int migratetype) 172static void set_pageblock_migratetype(struct page *page, int migratetype)
171{ 173{
174
175 if (unlikely(page_group_by_mobility_disabled))
176 migratetype = MIGRATE_UNMOVABLE;
177
172 set_pageblock_flags_group(page, (unsigned long)migratetype, 178 set_pageblock_flags_group(page, (unsigned long)migratetype,
173 PB_migrate, PB_migrate_end); 179 PB_migrate, PB_migrate_end);
174} 180}
175 181
182bool oom_killer_disabled __read_mostly;
183
176#ifdef CONFIG_DEBUG_VM 184#ifdef CONFIG_DEBUG_VM
177static int page_outside_zone_boundaries(struct zone *zone, struct page *page) 185static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
178{ 186{
@@ -295,23 +303,6 @@ void prep_compound_page(struct page *page, unsigned long order)
295 } 303 }
296} 304}
297 305
298#ifdef CONFIG_HUGETLBFS
299void prep_compound_gigantic_page(struct page *page, unsigned long order)
300{
301 int i;
302 int nr_pages = 1 << order;
303 struct page *p = page + 1;
304
305 set_compound_page_dtor(page, free_compound_page);
306 set_compound_order(page, order);
307 __SetPageHead(page);
308 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
309 __SetPageTail(p);
310 p->first_page = page;
311 }
312}
313#endif
314
315static int destroy_compound_page(struct page *page, unsigned long order) 306static int destroy_compound_page(struct page *page, unsigned long order)
316{ 307{
317 int i; 308 int i;
@@ -418,7 +409,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
418 return 0; 409 return 0;
419 410
420 if (PageBuddy(buddy) && page_order(buddy) == order) { 411 if (PageBuddy(buddy) && page_order(buddy) == order) {
421 BUG_ON(page_count(buddy) != 0); 412 VM_BUG_ON(page_count(buddy) != 0);
422 return 1; 413 return 1;
423 } 414 }
424 return 0; 415 return 0;
@@ -449,22 +440,22 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
449 */ 440 */
450 441
451static inline void __free_one_page(struct page *page, 442static inline void __free_one_page(struct page *page,
452 struct zone *zone, unsigned int order) 443 struct zone *zone, unsigned int order,
444 int migratetype)
453{ 445{
454 unsigned long page_idx; 446 unsigned long page_idx;
455 int order_size = 1 << order;
456 int migratetype = get_pageblock_migratetype(page);
457 447
458 if (unlikely(PageCompound(page))) 448 if (unlikely(PageCompound(page)))
459 if (unlikely(destroy_compound_page(page, order))) 449 if (unlikely(destroy_compound_page(page, order)))
460 return; 450 return;
461 451
452 VM_BUG_ON(migratetype == -1);
453
462 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 454 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
463 455
464 VM_BUG_ON(page_idx & (order_size - 1)); 456 VM_BUG_ON(page_idx & ((1 << order) - 1));
465 VM_BUG_ON(bad_range(zone, page)); 457 VM_BUG_ON(bad_range(zone, page));
466 458
467 __mod_zone_page_state(zone, NR_FREE_PAGES, order_size);
468 while (order < MAX_ORDER-1) { 459 while (order < MAX_ORDER-1) {
469 unsigned long combined_idx; 460 unsigned long combined_idx;
470 struct page *buddy; 461 struct page *buddy;
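
The reworked __free_one_page() keeps the usual buddy arithmetic: a free block of a given order starts on a 2^order page boundary (the VM_BUG_ON above), its buddy differs from it only in bit `order` of the page index, and the merged block starts at the lower of the two indices. A minimal userspace sketch of that index math, plain C rather than kernel code, with illustrative helper names:

#include <assert.h>
#include <stdio.h>

/* Index of the buddy of the block starting at page_idx, for a given order. */
static unsigned long buddy_idx(unsigned long page_idx, unsigned int order)
{
    return page_idx ^ (1UL << order);
}

/* Start of the merged (order + 1) block containing page_idx and its buddy. */
static unsigned long combined_idx(unsigned long page_idx, unsigned int order)
{
    return page_idx & ~(1UL << order);
}

int main(void)
{
    unsigned long idx = 8;      /* an order-2 block covering pages 8..11 */
    unsigned int order = 2;

    assert((idx & ((1UL << order) - 1)) == 0);  /* the VM_BUG_ON condition */
    printf("buddy of %lu at order %u: %lu\n", idx, order, buddy_idx(idx, order));
    printf("merged block starts at: %lu\n", combined_idx(idx, order));
    return 0;
}
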
@@ -488,12 +479,27 @@ static inline void __free_one_page(struct page *page,
488 zone->free_area[order].nr_free++; 479 zone->free_area[order].nr_free++;
489} 480}
490 481
482#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
483/*
484 * free_page_mlock() -- clean up attempts to free and mlocked() page.
485 * Page should not be on lru, so no need to fix that up.
486 * free_pages_check() will verify...
487 */
488static inline void free_page_mlock(struct page *page)
489{
490 __ClearPageMlocked(page);
491 __dec_zone_page_state(page, NR_MLOCK);
492 __count_vm_event(UNEVICTABLE_MLOCKFREED);
493}
494#else
495static void free_page_mlock(struct page *page) { }
496#endif
497
491static inline int free_pages_check(struct page *page) 498static inline int free_pages_check(struct page *page)
492{ 499{
493 free_page_mlock(page);
494 if (unlikely(page_mapcount(page) | 500 if (unlikely(page_mapcount(page) |
495 (page->mapping != NULL) | 501 (page->mapping != NULL) |
496 (page_count(page) != 0) | 502 (atomic_read(&page->_count) != 0) |
497 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { 503 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
498 bad_page(page); 504 bad_page(page);
499 return 1; 505 return 1;
@@ -520,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count,
520 spin_lock(&zone->lock); 526 spin_lock(&zone->lock);
521 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 527 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
522 zone->pages_scanned = 0; 528 zone->pages_scanned = 0;
529
530 __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
523 while (count--) { 531 while (count--) {
524 struct page *page; 532 struct page *page;
525 533
@@ -527,17 +535,20 @@ static void free_pages_bulk(struct zone *zone, int count,
527 page = list_entry(list->prev, struct page, lru); 535 page = list_entry(list->prev, struct page, lru);
528 /* have to delete it as __free_one_page list manipulates */ 536 /* have to delete it as __free_one_page list manipulates */
529 list_del(&page->lru); 537 list_del(&page->lru);
530 __free_one_page(page, zone, order); 538 __free_one_page(page, zone, order, page_private(page));
531 } 539 }
532 spin_unlock(&zone->lock); 540 spin_unlock(&zone->lock);
533} 541}
534 542
535static void free_one_page(struct zone *zone, struct page *page, int order) 543static void free_one_page(struct zone *zone, struct page *page, int order,
544 int migratetype)
536{ 545{
537 spin_lock(&zone->lock); 546 spin_lock(&zone->lock);
538 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); 547 zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
539 zone->pages_scanned = 0; 548 zone->pages_scanned = 0;
540 __free_one_page(page, zone, order); 549
550 __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
551 __free_one_page(page, zone, order, migratetype);
541 spin_unlock(&zone->lock); 552 spin_unlock(&zone->lock);
542} 553}
543 554
@@ -546,6 +557,7 @@ static void __free_pages_ok(struct page *page, unsigned int order)
546 unsigned long flags; 557 unsigned long flags;
547 int i; 558 int i;
548 int bad = 0; 559 int bad = 0;
560 int clearMlocked = PageMlocked(page);
549 561
550 kmemcheck_free_shadow(page, order); 562 kmemcheck_free_shadow(page, order);
551 563
@@ -563,8 +575,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
563 kernel_map_pages(page, 1 << order, 0); 575 kernel_map_pages(page, 1 << order, 0);
564 576
565 local_irq_save(flags); 577 local_irq_save(flags);
578 if (unlikely(clearMlocked))
579 free_page_mlock(page);
566 __count_vm_events(PGFREE, 1 << order); 580 __count_vm_events(PGFREE, 1 << order);
567 free_one_page(page_zone(page), page, order); 581 free_one_page(page_zone(page), page, order,
582 get_pageblock_migratetype(page));
568 local_irq_restore(flags); 583 local_irq_restore(flags);
569} 584}
570 585
@@ -635,7 +650,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
635{ 650{
636 if (unlikely(page_mapcount(page) | 651 if (unlikely(page_mapcount(page) |
637 (page->mapping != NULL) | 652 (page->mapping != NULL) |
638 (page_count(page) != 0) | 653 (atomic_read(&page->_count) != 0) |
639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { 654 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
640 bad_page(page); 655 bad_page(page);
641 return 1; 656 return 1;
@@ -660,7 +675,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
660 * Go through the free lists for the given migratetype and remove 675 * Go through the free lists for the given migratetype and remove
661 * the smallest available page from the freelists 676 * the smallest available page from the freelists
662 */ 677 */
663static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, 678static inline
679struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
664 int migratetype) 680 int migratetype)
665{ 681{
666 unsigned int current_order; 682 unsigned int current_order;
@@ -678,7 +694,6 @@ static struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
678 list_del(&page->lru); 694 list_del(&page->lru);
679 rmv_page_order(page); 695 rmv_page_order(page);
680 area->nr_free--; 696 area->nr_free--;
681 __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order));
682 expand(zone, page, order, current_order, area, migratetype); 697 expand(zone, page, order, current_order, area, migratetype);
683 return page; 698 return page;
684 } 699 }
@@ -769,8 +784,8 @@ static int move_freepages_block(struct zone *zone, struct page *page,
769} 784}
770 785
771/* Remove an element from the buddy allocator from the fallback list */ 786/* Remove an element from the buddy allocator from the fallback list */
772static struct page *__rmqueue_fallback(struct zone *zone, int order, 787static inline struct page *
773 int start_migratetype) 788__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
774{ 789{
775 struct free_area * area; 790 struct free_area * area;
776 int current_order; 791 int current_order;
@@ -818,8 +833,6 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
818 /* Remove the page from the freelists */ 833 /* Remove the page from the freelists */
819 list_del(&page->lru); 834 list_del(&page->lru);
820 rmv_page_order(page); 835 rmv_page_order(page);
821 __mod_zone_page_state(zone, NR_FREE_PAGES,
822 -(1UL << order));
823 836
824 if (current_order == pageblock_order) 837 if (current_order == pageblock_order)
825 set_pageblock_migratetype(page, 838 set_pageblock_migratetype(page,
@@ -830,8 +843,7 @@ static struct page *__rmqueue_fallback(struct zone *zone, int order,
830 } 843 }
831 } 844 }
832 845
833 /* Use MIGRATE_RESERVE rather than fail an allocation */ 846 return NULL;
834 return __rmqueue_smallest(zone, order, MIGRATE_RESERVE);
835} 847}
836 848
837/* 849/*
@@ -843,11 +855,23 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order,
843{ 855{
844 struct page *page; 856 struct page *page;
845 857
858retry_reserve:
846 page = __rmqueue_smallest(zone, order, migratetype); 859 page = __rmqueue_smallest(zone, order, migratetype);
847 860
848 if (unlikely(!page)) 861 if (unlikely(!page) && migratetype != MIGRATE_RESERVE) {
849 page = __rmqueue_fallback(zone, order, migratetype); 862 page = __rmqueue_fallback(zone, order, migratetype);
850 863
864 /*
865 * Use MIGRATE_RESERVE rather than fail an allocation. goto
866 * is used because __rmqueue_smallest is an inline function
867 * and we want just one call site
868 */
869 if (!page) {
870 migratetype = MIGRATE_RESERVE;
871 goto retry_reserve;
872 }
873 }
874
851 return page; 875 return page;
852} 876}
853 877
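
With the retry_reserve label, __rmqueue() now tries the requested migratetype smallest-first, then the fallback types, and only dips into MIGRATE_RESERVE when both fail, instead of __rmqueue_fallback() performing the reserve allocation itself. A simplified standalone model of that control flow; the free counts are toy values, the fallback search is a stand-in for the kernel's fallback table, and MIGRATE_ISOLATE is omitted:

#include <stdio.h>

enum {
    MIGRATE_UNMOVABLE,
    MIGRATE_RECLAIMABLE,
    MIGRATE_MOVABLE,
    MIGRATE_RESERVE,
    MIGRATE_TYPES
};

/* Toy free-page counts per migratetype; only the reserve has pages left. */
static unsigned long nr_free[MIGRATE_TYPES] = { 0, 0, 0, 4 };

static int rmqueue_smallest(int migratetype)
{
    if (nr_free[migratetype] == 0)
        return 0;
    nr_free[migratetype]--;
    return 1;
}

/* Stand-in for __rmqueue_fallback(); after this patch it may return nothing. */
static int rmqueue_fallback(int start_migratetype)
{
    for (int mt = 0; mt < MIGRATE_RESERVE; mt++)
        if (mt != start_migratetype && rmqueue_smallest(mt))
            return 1;
    return 0;
}

static int rmqueue(int migratetype)
{
retry_reserve:
    if (rmqueue_smallest(migratetype))
        return 1;
    if (migratetype != MIGRATE_RESERVE) {
        if (rmqueue_fallback(migratetype))
            return 1;
        /* Last resort: rerun the smallest-first path on the reserve. */
        migratetype = MIGRATE_RESERVE;
        goto retry_reserve;
    }
    return 0;
}

int main(void)
{
    printf("movable request satisfied from the reserve: %s\n",
           rmqueue(MIGRATE_MOVABLE) ? "yes" : "no");
    return 0;
}
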
@@ -881,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
881 set_page_private(page, migratetype); 905 set_page_private(page, migratetype);
882 list = &page->lru; 906 list = &page->lru;
883 } 907 }
908 __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
884 spin_unlock(&zone->lock); 909 spin_unlock(&zone->lock);
885 return i; 910 return i;
886} 911}
@@ -996,6 +1021,7 @@ static void free_hot_cold_page(struct page *page, int cold)
996 struct zone *zone = page_zone(page); 1021 struct zone *zone = page_zone(page);
997 struct per_cpu_pages *pcp; 1022 struct per_cpu_pages *pcp;
998 unsigned long flags; 1023 unsigned long flags;
1024 int clearMlocked = PageMlocked(page);
999 1025
1000 kmemcheck_free_shadow(page, 0); 1026 kmemcheck_free_shadow(page, 0);
1001 1027
@@ -1012,13 +1038,16 @@ static void free_hot_cold_page(struct page *page, int cold)
1012 kernel_map_pages(page, 1, 0); 1038 kernel_map_pages(page, 1, 0);
1013 1039
1014 pcp = &zone_pcp(zone, get_cpu())->pcp; 1040 pcp = &zone_pcp(zone, get_cpu())->pcp;
1041 set_page_private(page, get_pageblock_migratetype(page));
1015 local_irq_save(flags); 1042 local_irq_save(flags);
1043 if (unlikely(clearMlocked))
1044 free_page_mlock(page);
1016 __count_vm_event(PGFREE); 1045 __count_vm_event(PGFREE);
1046
1017 if (cold) 1047 if (cold)
1018 list_add_tail(&page->lru, &pcp->list); 1048 list_add_tail(&page->lru, &pcp->list);
1019 else 1049 else
1020 list_add(&page->lru, &pcp->list); 1050 list_add(&page->lru, &pcp->list);
1021 set_page_private(page, get_pageblock_migratetype(page));
1022 pcp->count++; 1051 pcp->count++;
1023 if (pcp->count >= pcp->high) { 1052 if (pcp->count >= pcp->high) {
1024 free_pages_bulk(zone, pcp->batch, &pcp->list, 0); 1053 free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
@@ -1071,14 +1100,15 @@ void split_page(struct page *page, unsigned int order)
1071 * we cheat by calling it from here, in the order > 0 path. Saves a branch 1100 * we cheat by calling it from here, in the order > 0 path. Saves a branch
1072 * or two. 1101 * or two.
1073 */ 1102 */
1074static struct page *buffered_rmqueue(struct zone *preferred_zone, 1103static inline
1075 struct zone *zone, int order, gfp_t gfp_flags) 1104struct page *buffered_rmqueue(struct zone *preferred_zone,
1105 struct zone *zone, int order, gfp_t gfp_flags,
1106 int migratetype)
1076{ 1107{
1077 unsigned long flags; 1108 unsigned long flags;
1078 struct page *page; 1109 struct page *page;
1079 int cold = !!(gfp_flags & __GFP_COLD); 1110 int cold = !!(gfp_flags & __GFP_COLD);
1080 int cpu; 1111 int cpu;
1081 int migratetype = allocflags_to_migratetype(gfp_flags);
1082 1112
1083again: 1113again:
1084 cpu = get_cpu(); 1114 cpu = get_cpu();
@@ -1115,8 +1145,22 @@ again:
1115 list_del(&page->lru); 1145 list_del(&page->lru);
1116 pcp->count--; 1146 pcp->count--;
1117 } else { 1147 } else {
1148 if (unlikely(gfp_flags & __GFP_NOFAIL)) {
1149 /*
1150 * __GFP_NOFAIL is not to be used in new code.
1151 *
1152 * All __GFP_NOFAIL callers should be fixed so that they
1153 * properly detect and handle allocation failures.
1154 *
1155 * We most definitely don't want callers attempting to
1156 * allocate greater than single-page units with
1157 * __GFP_NOFAIL.
1158 */
1159 WARN_ON_ONCE(order > 0);
1160 }
1118 spin_lock_irqsave(&zone->lock, flags); 1161 spin_lock_irqsave(&zone->lock, flags);
1119 page = __rmqueue(zone, order, migratetype); 1162 page = __rmqueue(zone, order, migratetype);
1163 __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
1120 spin_unlock(&zone->lock); 1164 spin_unlock(&zone->lock);
1121 if (!page) 1165 if (!page)
1122 goto failed; 1166 goto failed;
@@ -1138,10 +1182,15 @@ failed:
1138 return NULL; 1182 return NULL;
1139} 1183}
1140 1184
1141#define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ 1185/* The ALLOC_WMARK bits are used as an index to zone->watermark */
1142#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ 1186#define ALLOC_WMARK_MIN WMARK_MIN
1143#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ 1187#define ALLOC_WMARK_LOW WMARK_LOW
1144#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ 1188#define ALLOC_WMARK_HIGH WMARK_HIGH
1189#define ALLOC_NO_WATERMARKS 0x04 /* don't check watermarks at all */
1190
1191/* Mask to get the watermark bits */
1192#define ALLOC_WMARK_MASK (ALLOC_NO_WATERMARKS-1)
1193
1145#define ALLOC_HARDER 0x10 /* try to alloc harder */ 1194#define ALLOC_HARDER 0x10 /* try to alloc harder */
1146#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ 1195#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
1147#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ 1196#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
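
After this hunk the low bits of alloc_flags are no longer independent watermark flags but an index into the new zone->watermark[] array: ALLOC_WMARK_MASK is ALLOC_NO_WATERMARKS - 1, i.e. 0x03, so a single array lookup replaces the old if/else chain over pages_min, pages_low and pages_high. A small sketch of the lookup with toy watermark values:

#include <stdio.h>

/* Watermark indices, mirroring the WMARK_* enum this series introduces. */
enum { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK };

#define ALLOC_WMARK_MIN     WMARK_MIN
#define ALLOC_WMARK_LOW     WMARK_LOW
#define ALLOC_WMARK_HIGH    WMARK_HIGH
#define ALLOC_NO_WATERMARKS 0x04
#define ALLOC_WMARK_MASK    (ALLOC_NO_WATERMARKS - 1)   /* 0x03 */
#define ALLOC_CPUSET        0x40

int main(void)
{
    /* A toy zone with min/low/high watermarks in pages. */
    unsigned long watermark[NR_WMARK] = { 128, 160, 192 };
    int alloc_flags = ALLOC_WMARK_LOW | ALLOC_CPUSET;

    /* One lookup replaces the old flag-by-flag selection. */
    unsigned long mark = watermark[alloc_flags & ALLOC_WMARK_MASK];

    printf("selected watermark: %lu pages\n", mark);    /* prints 160 */
    return 0;
}
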
@@ -1399,23 +1448,18 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
1399 */ 1448 */
1400static struct page * 1449static struct page *
1401get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, 1450get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
1402 struct zonelist *zonelist, int high_zoneidx, int alloc_flags) 1451 struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
1452 struct zone *preferred_zone, int migratetype)
1403{ 1453{
1404 struct zoneref *z; 1454 struct zoneref *z;
1405 struct page *page = NULL; 1455 struct page *page = NULL;
1406 int classzone_idx; 1456 int classzone_idx;
1407 struct zone *zone, *preferred_zone; 1457 struct zone *zone;
1408 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ 1458 nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
1409 int zlc_active = 0; /* set if using zonelist_cache */ 1459 int zlc_active = 0; /* set if using zonelist_cache */
1410 int did_zlc_setup = 0; /* just call zlc_setup() one time */ 1460 int did_zlc_setup = 0; /* just call zlc_setup() one time */
1411 1461
1412 (void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
1413 &preferred_zone);
1414 if (!preferred_zone)
1415 return NULL;
1416
1417 classzone_idx = zone_idx(preferred_zone); 1462 classzone_idx = zone_idx(preferred_zone);
1418
1419zonelist_scan: 1463zonelist_scan:
1420 /* 1464 /*
1421 * Scan zonelist, looking for a zone with enough free. 1465 * Scan zonelist, looking for a zone with enough free.
@@ -1430,31 +1474,49 @@ zonelist_scan:
1430 !cpuset_zone_allowed_softwall(zone, gfp_mask)) 1474 !cpuset_zone_allowed_softwall(zone, gfp_mask))
1431 goto try_next_zone; 1475 goto try_next_zone;
1432 1476
1477 BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
1433 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { 1478 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
1434 unsigned long mark; 1479 unsigned long mark;
1435 if (alloc_flags & ALLOC_WMARK_MIN) 1480 int ret;
1436 mark = zone->pages_min; 1481
1437 else if (alloc_flags & ALLOC_WMARK_LOW) 1482 mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1438 mark = zone->pages_low; 1483 if (zone_watermark_ok(zone, order, mark,
1439 else 1484 classzone_idx, alloc_flags))
1440 mark = zone->pages_high; 1485 goto try_this_zone;
1441 if (!zone_watermark_ok(zone, order, mark, 1486
1442 classzone_idx, alloc_flags)) { 1487 if (zone_reclaim_mode == 0)
1443 if (!zone_reclaim_mode || 1488 goto this_zone_full;
1444 !zone_reclaim(zone, gfp_mask, order)) 1489
1490 ret = zone_reclaim(zone, gfp_mask, order);
1491 switch (ret) {
1492 case ZONE_RECLAIM_NOSCAN:
1493 /* did not scan */
1494 goto try_next_zone;
1495 case ZONE_RECLAIM_FULL:
1496 /* scanned but unreclaimable */
1497 goto this_zone_full;
1498 default:
1499 /* did we reclaim enough */
1500 if (!zone_watermark_ok(zone, order, mark,
1501 classzone_idx, alloc_flags))
1445 goto this_zone_full; 1502 goto this_zone_full;
1446 } 1503 }
1447 } 1504 }
1448 1505
1449 page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask); 1506try_this_zone:
1507 page = buffered_rmqueue(preferred_zone, zone, order,
1508 gfp_mask, migratetype);
1450 if (page) 1509 if (page)
1451 break; 1510 break;
1452this_zone_full: 1511this_zone_full:
1453 if (NUMA_BUILD) 1512 if (NUMA_BUILD)
1454 zlc_mark_zone_full(zonelist, z); 1513 zlc_mark_zone_full(zonelist, z);
1455try_next_zone: 1514try_next_zone:
1456 if (NUMA_BUILD && !did_zlc_setup) { 1515 if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
1457 /* we do zlc_setup after the first zone is tried */ 1516 /*
1517 * we do zlc_setup after the first zone is tried but only
1518 * if there are multiple nodes make it worthwhile
1519 */
1458 allowednodes = zlc_setup(zonelist, alloc_flags); 1520 allowednodes = zlc_setup(zonelist, alloc_flags);
1459 zlc_active = 1; 1521 zlc_active = 1;
1460 did_zlc_setup = 1; 1522 did_zlc_setup = 1;
@@ -1469,47 +1531,217 @@ try_next_zone:
1469 return page; 1531 return page;
1470} 1532}
1471 1533
1534static inline int
1535should_alloc_retry(gfp_t gfp_mask, unsigned int order,
1536 unsigned long pages_reclaimed)
1537{
1538 /* Do not loop if specifically requested */
1539 if (gfp_mask & __GFP_NORETRY)
1540 return 0;
1541
1542 /*
1543 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1544 * means __GFP_NOFAIL, but that may not be true in other
1545 * implementations.
1546 */
1547 if (order <= PAGE_ALLOC_COSTLY_ORDER)
1548 return 1;
1549
1550 /*
1551 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1552 * specified, then we retry until we no longer reclaim any pages
1553 * (above), or we've reclaimed an order of pages at least as
1554 * large as the allocation's order. In both cases, if the
1555 * allocation still fails, we stop retrying.
1556 */
1557 if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order))
1558 return 1;
1559
1560 /*
1561 * Don't let big-order allocations loop unless the caller
1562 * explicitly requests that.
1563 */
1564 if (gfp_mask & __GFP_NOFAIL)
1565 return 1;
1566
1567 return 0;
1568}
1569
1570static inline struct page *
1571__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1572 struct zonelist *zonelist, enum zone_type high_zoneidx,
1573 nodemask_t *nodemask, struct zone *preferred_zone,
1574 int migratetype)
1575{
1576 struct page *page;
1577
1578 /* Acquire the OOM killer lock for the zones in zonelist */
1579 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1580 schedule_timeout_uninterruptible(1);
1581 return NULL;
1582 }
1583
1584 /*
1585 * Go through the zonelist yet one more time, keep very high watermark
1586 * here, this is only to catch a parallel oom killing, we must fail if
1587 * we're still under heavy pressure.
1588 */
1589 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1590 order, zonelist, high_zoneidx,
1591 ALLOC_WMARK_HIGH|ALLOC_CPUSET,
1592 preferred_zone, migratetype);
1593 if (page)
1594 goto out;
1595
1596 /* The OOM killer will not help higher order allocs */
1597 if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
1598 goto out;
1599
1600 /* Exhausted what can be done so it's blamo time */
1601 out_of_memory(zonelist, gfp_mask, order);
1602
1603out:
1604 clear_zonelist_oom(zonelist, gfp_mask);
1605 return page;
1606}
1607
1608/* The really slow allocator path where we enter direct reclaim */
1609static inline struct page *
1610__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
1611 struct zonelist *zonelist, enum zone_type high_zoneidx,
1612 nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
1613 int migratetype, unsigned long *did_some_progress)
1614{
1615 struct page *page = NULL;
1616 struct reclaim_state reclaim_state;
1617 struct task_struct *p = current;
1618
1619 cond_resched();
1620
1621 /* We now go into synchronous reclaim */
1622 cpuset_memory_pressure_bump();
1623
1624 /*
1625 * The task's cpuset might have expanded its set of allowable nodes
1626 */
1627 p->flags |= PF_MEMALLOC;
1628 lockdep_set_current_reclaim_state(gfp_mask);
1629 reclaim_state.reclaimed_slab = 0;
1630 p->reclaim_state = &reclaim_state;
1631
1632 *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
1633
1634 p->reclaim_state = NULL;
1635 lockdep_clear_current_reclaim_state();
1636 p->flags &= ~PF_MEMALLOC;
1637
1638 cond_resched();
1639
1640 if (order != 0)
1641 drain_all_pages();
1642
1643 if (likely(*did_some_progress))
1644 page = get_page_from_freelist(gfp_mask, nodemask, order,
1645 zonelist, high_zoneidx,
1646 alloc_flags, preferred_zone,
1647 migratetype);
1648 return page;
1649}
1650
1472/* 1651/*
1473 * This is the 'heart' of the zoned buddy allocator. 1652 * This is called in the allocator slow-path if the allocation request is of
1653 * sufficient urgency to ignore watermarks and take other desperate measures
1474 */ 1654 */
1475struct page * 1655static inline struct page *
1476__alloc_pages_internal(gfp_t gfp_mask, unsigned int order, 1656__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
1477 struct zonelist *zonelist, nodemask_t *nodemask) 1657 struct zonelist *zonelist, enum zone_type high_zoneidx,
1658 nodemask_t *nodemask, struct zone *preferred_zone,
1659 int migratetype)
1660{
1661 struct page *page;
1662
1663 do {
1664 page = get_page_from_freelist(gfp_mask, nodemask, order,
1665 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
1666 preferred_zone, migratetype);
1667
1668 if (!page && gfp_mask & __GFP_NOFAIL)
1669 congestion_wait(WRITE, HZ/50);
1670 } while (!page && (gfp_mask & __GFP_NOFAIL));
1671
1672 return page;
1673}
1674
1675static inline
1676void wake_all_kswapd(unsigned int order, struct zonelist *zonelist,
1677 enum zone_type high_zoneidx)
1478{ 1678{
1479 const gfp_t wait = gfp_mask & __GFP_WAIT;
1480 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1481 struct zoneref *z; 1679 struct zoneref *z;
1482 struct zone *zone; 1680 struct zone *zone;
1483 struct page *page;
1484 struct reclaim_state reclaim_state;
1485 struct task_struct *p = current;
1486 int do_retry;
1487 int alloc_flags;
1488 unsigned long did_some_progress;
1489 unsigned long pages_reclaimed = 0;
1490 1681
1491 lockdep_trace_alloc(gfp_mask); 1682 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
1683 wakeup_kswapd(zone, order);
1684}
1492 1685
1493 might_sleep_if(wait); 1686static inline int
1687gfp_to_alloc_flags(gfp_t gfp_mask)
1688{
1689 struct task_struct *p = current;
1690 int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
1691 const gfp_t wait = gfp_mask & __GFP_WAIT;
1494 1692
1495 if (should_fail_alloc_page(gfp_mask, order)) 1693 /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
1496 return NULL; 1694 BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH);
1497 1695
1498restart: 1696 /*
1499 z = zonelist->_zonerefs; /* the list of zones suitable for gfp_mask */ 1697 * The caller may dip into page reserves a bit more if the caller
1698 * cannot run direct reclaim, or if the caller has realtime scheduling
1699 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1700 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1701 */
1702 alloc_flags |= (gfp_mask & __GFP_HIGH);
1500 1703
1501 if (unlikely(!z->zone)) { 1704 if (!wait) {
1705 alloc_flags |= ALLOC_HARDER;
1502 /* 1706 /*
1503 * Happens if we have an empty zonelist as a result of 1707 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1504 * GFP_THISNODE being used on a memoryless node 1708 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1505 */ 1709 */
1506 return NULL; 1710 alloc_flags &= ~ALLOC_CPUSET;
1711 } else if (unlikely(rt_task(p)))
1712 alloc_flags |= ALLOC_HARDER;
1713
1714 if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
1715 if (!in_interrupt() &&
1716 ((p->flags & PF_MEMALLOC) ||
1717 unlikely(test_thread_flag(TIF_MEMDIE))))
1718 alloc_flags |= ALLOC_NO_WATERMARKS;
1507 } 1719 }
1508 1720
1509 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, 1721 return alloc_flags;
1510 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET); 1722}
1511 if (page) 1723
1512 goto got_pg; 1724static inline struct page *
1725__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
1726 struct zonelist *zonelist, enum zone_type high_zoneidx,
1727 nodemask_t *nodemask, struct zone *preferred_zone,
1728 int migratetype)
1729{
1730 const gfp_t wait = gfp_mask & __GFP_WAIT;
1731 struct page *page = NULL;
1732 int alloc_flags;
1733 unsigned long pages_reclaimed = 0;
1734 unsigned long did_some_progress;
1735 struct task_struct *p = current;
1736
1737 /*
1738 * In the slowpath, we sanity check order to avoid ever trying to
1739 * reclaim >= MAX_ORDER areas which will never succeed. Callers may
1740 * be using allocators in order of preference for an area that is
1741 * too large.
1742 */
1743 if (WARN_ON_ONCE(order >= MAX_ORDER))
1744 return NULL;
1513 1745
1514 /* 1746 /*
1515 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and 1747 * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
@@ -1522,154 +1754,83 @@ restart:
1522 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) 1754 if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
1523 goto nopage; 1755 goto nopage;
1524 1756
1525 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) 1757 wake_all_kswapd(order, zonelist, high_zoneidx);
1526 wakeup_kswapd(zone, order);
1527 1758
1528 /* 1759 /*
1529 * OK, we're below the kswapd watermark and have kicked background 1760 * OK, we're below the kswapd watermark and have kicked background
1530 * reclaim. Now things get more complex, so set up alloc_flags according 1761 * reclaim. Now things get more complex, so set up alloc_flags according
1531 * to how we want to proceed. 1762 * to how we want to proceed.
1532 *
1533 * The caller may dip into page reserves a bit more if the caller
1534 * cannot run direct reclaim, or if the caller has realtime scheduling
1535 * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
1536 * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
1537 */ 1763 */
1538 alloc_flags = ALLOC_WMARK_MIN; 1764 alloc_flags = gfp_to_alloc_flags(gfp_mask);
1539 if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
1540 alloc_flags |= ALLOC_HARDER;
1541 if (gfp_mask & __GFP_HIGH)
1542 alloc_flags |= ALLOC_HIGH;
1543 if (wait)
1544 alloc_flags |= ALLOC_CPUSET;
1545 1765
1546 /* 1766restart:
1547 * Go through the zonelist again. Let __GFP_HIGH and allocations 1767 /* This is the last chance, in general, before the goto nopage. */
1548 * coming from realtime tasks go deeper into reserves.
1549 *
1550 * This is the last chance, in general, before the goto nopage.
1551 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
1552 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
1553 */
1554 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, 1768 page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
1555 high_zoneidx, alloc_flags); 1769 high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
1770 preferred_zone, migratetype);
1556 if (page) 1771 if (page)
1557 goto got_pg; 1772 goto got_pg;
1558 1773
1559 /* This allocation should allow future memory freeing. */
1560
1561rebalance: 1774rebalance:
1562 if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) 1775 /* Allocate without watermarks if the context allows */
1563 && !in_interrupt()) { 1776 if (alloc_flags & ALLOC_NO_WATERMARKS) {
1564 if (!(gfp_mask & __GFP_NOMEMALLOC)) { 1777 page = __alloc_pages_high_priority(gfp_mask, order,
1565nofail_alloc: 1778 zonelist, high_zoneidx, nodemask,
1566 /* go through the zonelist yet again, ignoring mins */ 1779 preferred_zone, migratetype);
1567 page = get_page_from_freelist(gfp_mask, nodemask, order, 1780 if (page)
1568 zonelist, high_zoneidx, ALLOC_NO_WATERMARKS); 1781 goto got_pg;
1569 if (page)
1570 goto got_pg;
1571 if (gfp_mask & __GFP_NOFAIL) {
1572 congestion_wait(WRITE, HZ/50);
1573 goto nofail_alloc;
1574 }
1575 }
1576 goto nopage;
1577 } 1782 }
1578 1783
1579 /* Atomic allocations - we can't balance anything */ 1784 /* Atomic allocations - we can't balance anything */
1580 if (!wait) 1785 if (!wait)
1581 goto nopage; 1786 goto nopage;
1582 1787
1583 cond_resched(); 1788 /* Avoid recursion of direct reclaim */
1789 if (p->flags & PF_MEMALLOC)
1790 goto nopage;
1791
1792 /* Try direct reclaim and then allocating */
1793 page = __alloc_pages_direct_reclaim(gfp_mask, order,
1794 zonelist, high_zoneidx,
1795 nodemask,
1796 alloc_flags, preferred_zone,
1797 migratetype, &did_some_progress);
1798 if (page)
1799 goto got_pg;
1584 1800
1585 /* We now go into synchronous reclaim */
1586 cpuset_memory_pressure_bump();
1587 /* 1801 /*
1588 * The task's cpuset might have expanded its set of allowable nodes 1802 * If we failed to make any progress reclaiming, then we are
1803 * running out of options and have to consider going OOM
1589 */ 1804 */
1590 cpuset_update_task_memory_state(); 1805 if (!did_some_progress) {
1591 p->flags |= PF_MEMALLOC; 1806 if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1592 1807 if (oom_killer_disabled)
1593 lockdep_set_current_reclaim_state(gfp_mask); 1808 goto nopage;
1594 reclaim_state.reclaimed_slab = 0; 1809 page = __alloc_pages_may_oom(gfp_mask, order,
1595 p->reclaim_state = &reclaim_state; 1810 zonelist, high_zoneidx,
1596 1811 nodemask, preferred_zone,
1597 did_some_progress = try_to_free_pages(zonelist, order, 1812 migratetype);
1598 gfp_mask, nodemask); 1813 if (page)
1599 1814 goto got_pg;
1600 p->reclaim_state = NULL;
1601 lockdep_clear_current_reclaim_state();
1602 p->flags &= ~PF_MEMALLOC;
1603
1604 cond_resched();
1605 1815
1606 if (order != 0) 1816 /*
1607 drain_all_pages(); 1817 * The OOM killer does not trigger for high-order
1818 * ~__GFP_NOFAIL allocations so if no progress is being
1819 * made, there are no other options and retrying is
1820 * unlikely to help.
1821 */
1822 if (order > PAGE_ALLOC_COSTLY_ORDER &&
1823 !(gfp_mask & __GFP_NOFAIL))
1824 goto nopage;
1608 1825
1609 if (likely(did_some_progress)) {
1610 page = get_page_from_freelist(gfp_mask, nodemask, order,
1611 zonelist, high_zoneidx, alloc_flags);
1612 if (page)
1613 goto got_pg;
1614 } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
1615 if (!try_set_zone_oom(zonelist, gfp_mask)) {
1616 schedule_timeout_uninterruptible(1);
1617 goto restart; 1826 goto restart;
1618 } 1827 }
1619
1620 /*
1621 * Go through the zonelist yet one more time, keep
1622 * very high watermark here, this is only to catch
1623 * a parallel oom killing, we must fail if we're still
1624 * under heavy pressure.
1625 */
1626 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
1627 order, zonelist, high_zoneidx,
1628 ALLOC_WMARK_HIGH|ALLOC_CPUSET);
1629 if (page) {
1630 clear_zonelist_oom(zonelist, gfp_mask);
1631 goto got_pg;
1632 }
1633
1634 /* The OOM killer will not help higher order allocs so fail */
1635 if (order > PAGE_ALLOC_COSTLY_ORDER) {
1636 clear_zonelist_oom(zonelist, gfp_mask);
1637 goto nopage;
1638 }
1639
1640 out_of_memory(zonelist, gfp_mask, order);
1641 clear_zonelist_oom(zonelist, gfp_mask);
1642 goto restart;
1643 } 1828 }
1644 1829
1645 /* 1830 /* Check if we should retry the allocation */
1646 * Don't let big-order allocations loop unless the caller explicitly
1647 * requests that. Wait for some write requests to complete then retry.
1648 *
1649 * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER
1650 * means __GFP_NOFAIL, but that may not be true in other
1651 * implementations.
1652 *
1653 * For order > PAGE_ALLOC_COSTLY_ORDER, if __GFP_REPEAT is
1654 * specified, then we retry until we no longer reclaim any pages
1655 * (above), or we've reclaimed an order of pages at least as
1656 * large as the allocation's order. In both cases, if the
1657 * allocation still fails, we stop retrying.
1658 */
1659 pages_reclaimed += did_some_progress; 1831 pages_reclaimed += did_some_progress;
1660 do_retry = 0; 1832 if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
1661 if (!(gfp_mask & __GFP_NORETRY)) { 1833 /* Wait for some write requests to complete then retry */
1662 if (order <= PAGE_ALLOC_COSTLY_ORDER) {
1663 do_retry = 1;
1664 } else {
1665 if (gfp_mask & __GFP_REPEAT &&
1666 pages_reclaimed < (1 << order))
1667 do_retry = 1;
1668 }
1669 if (gfp_mask & __GFP_NOFAIL)
1670 do_retry = 1;
1671 }
1672 if (do_retry) {
1673 congestion_wait(WRITE, HZ/50); 1834 congestion_wait(WRITE, HZ/50);
1674 goto rebalance; 1835 goto rebalance;
1675 } 1836 }
@@ -1687,8 +1848,53 @@ got_pg:
1687 if (kmemcheck_enabled) 1848 if (kmemcheck_enabled)
1688 kmemcheck_pagealloc_alloc(page, order, gfp_mask); 1849 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
1689 return page; 1850 return page;
1851
1852}
1853
1854/*
1855 * This is the 'heart' of the zoned buddy allocator.
1856 */
1857struct page *
1858__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
1859 struct zonelist *zonelist, nodemask_t *nodemask)
1860{
1861 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
1862 struct zone *preferred_zone;
1863 struct page *page;
1864 int migratetype = allocflags_to_migratetype(gfp_mask);
1865
1866 lockdep_trace_alloc(gfp_mask);
1867
1868 might_sleep_if(gfp_mask & __GFP_WAIT);
1869
1870 if (should_fail_alloc_page(gfp_mask, order))
1871 return NULL;
1872
1873 /*
1874 * Check the zones suitable for the gfp_mask contain at least one
1875 * valid zone. It's possible to have an empty zonelist as a result
1876 * of GFP_THISNODE and a memoryless node
1877 */
1878 if (unlikely(!zonelist->_zonerefs->zone))
1879 return NULL;
1880
1881 /* The preferred zone is used for statistics later */
1882 first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
1883 if (!preferred_zone)
1884 return NULL;
1885
1886 /* First allocation attempt */
1887 page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
1888 zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
1889 preferred_zone, migratetype);
1890 if (unlikely(!page))
1891 page = __alloc_pages_slowpath(gfp_mask, order,
1892 zonelist, high_zoneidx, nodemask,
1893 preferred_zone, migratetype);
1894
1895 return page;
1690} 1896}
1691EXPORT_SYMBOL(__alloc_pages_internal); 1897EXPORT_SYMBOL(__alloc_pages_nodemask);
1692 1898
1693/* 1899/*
1694 * Common helper functions. 1900 * Common helper functions.
@@ -1817,7 +2023,7 @@ static unsigned int nr_free_zone_pages(int offset)
1817 2023
1818 for_each_zone_zonelist(zone, z, zonelist, offset) { 2024 for_each_zone_zonelist(zone, z, zonelist, offset) {
1819 unsigned long size = zone->present_pages; 2025 unsigned long size = zone->present_pages;
1820 unsigned long high = zone->pages_high; 2026 unsigned long high = high_wmark_pages(zone);
1821 if (size > high) 2027 if (size > high)
1822 sum += size - high; 2028 sum += size - high;
1823 } 2029 }
@@ -1909,19 +2115,14 @@ void show_free_areas(void)
1909 2115
1910 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" 2116 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1911 " inactive_file:%lu" 2117 " inactive_file:%lu"
1912//TODO: check/adjust line lengths
1913#ifdef CONFIG_UNEVICTABLE_LRU
1914 " unevictable:%lu" 2118 " unevictable:%lu"
1915#endif
1916 " dirty:%lu writeback:%lu unstable:%lu\n" 2119 " dirty:%lu writeback:%lu unstable:%lu\n"
1917 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 2120 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1918 global_page_state(NR_ACTIVE_ANON), 2121 global_page_state(NR_ACTIVE_ANON),
1919 global_page_state(NR_ACTIVE_FILE), 2122 global_page_state(NR_ACTIVE_FILE),
1920 global_page_state(NR_INACTIVE_ANON), 2123 global_page_state(NR_INACTIVE_ANON),
1921 global_page_state(NR_INACTIVE_FILE), 2124 global_page_state(NR_INACTIVE_FILE),
1922#ifdef CONFIG_UNEVICTABLE_LRU
1923 global_page_state(NR_UNEVICTABLE), 2125 global_page_state(NR_UNEVICTABLE),
1924#endif
1925 global_page_state(NR_FILE_DIRTY), 2126 global_page_state(NR_FILE_DIRTY),
1926 global_page_state(NR_WRITEBACK), 2127 global_page_state(NR_WRITEBACK),
1927 global_page_state(NR_UNSTABLE_NFS), 2128 global_page_state(NR_UNSTABLE_NFS),
@@ -1945,25 +2146,21 @@ void show_free_areas(void)
1945 " inactive_anon:%lukB" 2146 " inactive_anon:%lukB"
1946 " active_file:%lukB" 2147 " active_file:%lukB"
1947 " inactive_file:%lukB" 2148 " inactive_file:%lukB"
1948#ifdef CONFIG_UNEVICTABLE_LRU
1949 " unevictable:%lukB" 2149 " unevictable:%lukB"
1950#endif
1951 " present:%lukB" 2150 " present:%lukB"
1952 " pages_scanned:%lu" 2151 " pages_scanned:%lu"
1953 " all_unreclaimable? %s" 2152 " all_unreclaimable? %s"
1954 "\n", 2153 "\n",
1955 zone->name, 2154 zone->name,
1956 K(zone_page_state(zone, NR_FREE_PAGES)), 2155 K(zone_page_state(zone, NR_FREE_PAGES)),
1957 K(zone->pages_min), 2156 K(min_wmark_pages(zone)),
1958 K(zone->pages_low), 2157 K(low_wmark_pages(zone)),
1959 K(zone->pages_high), 2158 K(high_wmark_pages(zone)),
1960 K(zone_page_state(zone, NR_ACTIVE_ANON)), 2159 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1961 K(zone_page_state(zone, NR_INACTIVE_ANON)), 2160 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1962 K(zone_page_state(zone, NR_ACTIVE_FILE)), 2161 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1963 K(zone_page_state(zone, NR_INACTIVE_FILE)), 2162 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1964#ifdef CONFIG_UNEVICTABLE_LRU
1965 K(zone_page_state(zone, NR_UNEVICTABLE)), 2163 K(zone_page_state(zone, NR_UNEVICTABLE)),
1966#endif
1967 K(zone->present_pages), 2164 K(zone->present_pages),
1968 zone->pages_scanned, 2165 zone->pages_scanned,
1969 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 2166 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -2121,7 +2318,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
2121} 2318}
2122 2319
2123 2320
2124#define MAX_NODE_LOAD (num_online_nodes()) 2321#define MAX_NODE_LOAD (nr_online_nodes)
2125static int node_load[MAX_NUMNODES]; 2322static int node_load[MAX_NUMNODES];
2126 2323
2127/** 2324/**
@@ -2330,7 +2527,7 @@ static void build_zonelists(pg_data_t *pgdat)
2330 2527
2331 /* NUMA-aware ordering of nodes */ 2528 /* NUMA-aware ordering of nodes */
2332 local_node = pgdat->node_id; 2529 local_node = pgdat->node_id;
2333 load = num_online_nodes(); 2530 load = nr_online_nodes;
2334 prev_node = local_node; 2531 prev_node = local_node;
2335 nodes_clear(used_mask); 2532 nodes_clear(used_mask);
2336 2533
@@ -2481,7 +2678,7 @@ void build_all_zonelists(void)
2481 2678
2482 printk("Built %i zonelists in %s order, mobility grouping %s. " 2679 printk("Built %i zonelists in %s order, mobility grouping %s. "
2483 "Total pages: %ld\n", 2680 "Total pages: %ld\n",
2484 num_online_nodes(), 2681 nr_online_nodes,
2485 zonelist_order_name[current_zonelist_order], 2682 zonelist_order_name[current_zonelist_order],
2486 page_group_by_mobility_disabled ? "off" : "on", 2683 page_group_by_mobility_disabled ? "off" : "on",
2487 vm_total_pages); 2684 vm_total_pages);
@@ -2560,8 +2757,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
2560 2757
2561/* 2758/*
2562 * Mark a number of pageblocks as MIGRATE_RESERVE. The number 2759 * Mark a number of pageblocks as MIGRATE_RESERVE. The number
2563 * of blocks reserved is based on zone->pages_min. The memory within the 2760 * of blocks reserved is based on min_wmark_pages(zone). The memory within
2564 * reserve will tend to store contiguous free pages. Setting min_free_kbytes 2761 * the reserve will tend to store contiguous free pages. Setting min_free_kbytes
2565 * higher will lead to a bigger reserve which will get freed as contiguous 2762 * higher will lead to a bigger reserve which will get freed as contiguous
2566 * blocks as reclaim kicks in 2763 * blocks as reclaim kicks in
2567 */ 2764 */
@@ -2574,7 +2771,7 @@ static void setup_zone_migrate_reserve(struct zone *zone)
2574 /* Get the start pfn, end pfn and the number of blocks to reserve */ 2771 /* Get the start pfn, end pfn and the number of blocks to reserve */
2575 start_pfn = zone->zone_start_pfn; 2772 start_pfn = zone->zone_start_pfn;
2576 end_pfn = start_pfn + zone->spanned_pages; 2773 end_pfn = start_pfn + zone->spanned_pages;
2577 reserve = roundup(zone->pages_min, pageblock_nr_pages) >> 2774 reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
2578 pageblock_order; 2775 pageblock_order;
2579 2776
2580 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { 2777 for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
@@ -3506,7 +3703,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3506 zone_pcp_init(zone); 3703 zone_pcp_init(zone);
3507 for_each_lru(l) { 3704 for_each_lru(l) {
3508 INIT_LIST_HEAD(&zone->lru[l].list); 3705 INIT_LIST_HEAD(&zone->lru[l].list);
3509 zone->lru[l].nr_scan = 0; 3706 zone->lru[l].nr_saved_scan = 0;
3510 } 3707 }
3511 zone->reclaim_stat.recent_rotated[0] = 0; 3708 zone->reclaim_stat.recent_rotated[0] = 0;
3512 zone->reclaim_stat.recent_rotated[1] = 0; 3709 zone->reclaim_stat.recent_rotated[1] = 0;
@@ -4043,6 +4240,11 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
4043 early_node_map[i].start_pfn, 4240 early_node_map[i].start_pfn,
4044 early_node_map[i].end_pfn); 4241 early_node_map[i].end_pfn);
4045 4242
4243 /*
4244 * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init
4245 * that node_mask, clear it at first
4246 */
4247 nodes_clear(node_states[N_HIGH_MEMORY]);
4046 /* Initialise every node */ 4248 /* Initialise every node */
4047 mminit_verify_pageflags_layout(); 4249 mminit_verify_pageflags_layout();
4048 setup_nr_node_ids(); 4250 setup_nr_node_ids();
@@ -4177,8 +4379,8 @@ static void calculate_totalreserve_pages(void)
4177 max = zone->lowmem_reserve[j]; 4379 max = zone->lowmem_reserve[j];
4178 } 4380 }
4179 4381
4180 /* we treat pages_high as reserved pages. */ 4382 /* we treat the high watermark as reserved pages. */
4181 max += zone->pages_high; 4383 max += high_wmark_pages(zone);
4182 4384
4183 if (max > zone->present_pages) 4385 if (max > zone->present_pages)
4184 max = zone->present_pages; 4386 max = zone->present_pages;
@@ -4228,12 +4430,13 @@ static void setup_per_zone_lowmem_reserve(void)
4228} 4430}
4229 4431
4230/** 4432/**
4231 * setup_per_zone_pages_min - called when min_free_kbytes changes. 4433 * setup_per_zone_wmarks - called when min_free_kbytes changes
4434 * or when memory is hot-{added|removed}
4232 * 4435 *
4233 * Ensures that the pages_{min,low,high} values for each zone are set correctly 4436 * Ensures that the watermark[min,low,high] values for each zone are set
4234 * with respect to min_free_kbytes. 4437 * correctly with respect to min_free_kbytes.
4235 */ 4438 */
4236void setup_per_zone_pages_min(void) 4439void setup_per_zone_wmarks(void)
4237{ 4440{
4238 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); 4441 unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
4239 unsigned long lowmem_pages = 0; 4442 unsigned long lowmem_pages = 0;
@@ -4258,7 +4461,7 @@ void setup_per_zone_pages_min(void)
4258 * need highmem pages, so cap pages_min to a small 4461 * need highmem pages, so cap pages_min to a small
4259 * value here. 4462 * value here.
4260 * 4463 *
4261 * The (pages_high-pages_low) and (pages_low-pages_min) 4464 * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
4262 * deltas controls asynch page reclaim, and so should 4465 * deltas controls asynch page reclaim, and so should
4263 * not be capped for highmem. 4466 * not be capped for highmem.
4264 */ 4467 */
@@ -4269,17 +4472,17 @@ void setup_per_zone_pages_min(void)
4269 min_pages = SWAP_CLUSTER_MAX; 4472 min_pages = SWAP_CLUSTER_MAX;
4270 if (min_pages > 128) 4473 if (min_pages > 128)
4271 min_pages = 128; 4474 min_pages = 128;
4272 zone->pages_min = min_pages; 4475 zone->watermark[WMARK_MIN] = min_pages;
4273 } else { 4476 } else {
4274 /* 4477 /*
4275 * If it's a lowmem zone, reserve a number of pages 4478 * If it's a lowmem zone, reserve a number of pages
4276 * proportionate to the zone's size. 4479 * proportionate to the zone's size.
4277 */ 4480 */
4278 zone->pages_min = tmp; 4481 zone->watermark[WMARK_MIN] = tmp;
4279 } 4482 }
4280 4483
4281 zone->pages_low = zone->pages_min + (tmp >> 2); 4484 zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
4282 zone->pages_high = zone->pages_min + (tmp >> 1); 4485 zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
4283 setup_zone_migrate_reserve(zone); 4486 setup_zone_migrate_reserve(zone);
4284 spin_unlock_irqrestore(&zone->lock, flags); 4487 spin_unlock_irqrestore(&zone->lock, flags);
4285 } 4488 }
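
setup_per_zone_wmarks() spaces the three watermarks off tmp, the zone's proportional share of the global minimum (the split itself is computed in lines elided from this hunk): low is min plus a quarter of that share and high is min plus half of it. A worked example with invented numbers:

#include <stdio.h>

int main(void)
{
    /* Invented figures: global minimum and a single lowmem zone. */
    unsigned long pages_min    = 4096;    /* global minimum, in pages */
    unsigned long zone_pages   = 262144;  /* this zone's lowmem pages */
    unsigned long lowmem_pages = 262144;  /* lowmem pages in all zones */

    /* tmp: the zone's proportional share of pages_min. */
    unsigned long tmp = pages_min * zone_pages / lowmem_pages;

    unsigned long wmark_min  = tmp;
    unsigned long wmark_low  = wmark_min + (tmp >> 2);  /* min + 25% */
    unsigned long wmark_high = wmark_min + (tmp >> 1);  /* min + 50% */

    printf("min=%lu low=%lu high=%lu\n", wmark_min, wmark_low, wmark_high);
    return 0;
}
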
@@ -4289,8 +4492,6 @@ void setup_per_zone_pages_min(void)
4289} 4492}
4290 4493
4291/** 4494/**
4292 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4293 *
4294 * The inactive anon list should be small enough that the VM never has to 4495 * The inactive anon list should be small enough that the VM never has to
4295 * do too much work, but large enough that each inactive page has a chance 4496 * do too much work, but large enough that each inactive page has a chance
4296 * to be referenced again before it is swapped out. 4497 * to be referenced again before it is swapped out.
@@ -4311,21 +4512,26 @@ void setup_per_zone_pages_min(void)
4311 * 1TB 101 10GB 4512 * 1TB 101 10GB
4312 * 10TB 320 32GB 4513 * 10TB 320 32GB
4313 */ 4514 */
4314static void setup_per_zone_inactive_ratio(void) 4515void calculate_zone_inactive_ratio(struct zone *zone)
4315{ 4516{
4316 struct zone *zone; 4517 unsigned int gb, ratio;
4317
4318 for_each_zone(zone) {
4319 unsigned int gb, ratio;
4320 4518
4321 /* Zone size in gigabytes */ 4519 /* Zone size in gigabytes */
4322 gb = zone->present_pages >> (30 - PAGE_SHIFT); 4520 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4521 if (gb)
4323 ratio = int_sqrt(10 * gb); 4522 ratio = int_sqrt(10 * gb);
4324 if (!ratio) 4523 else
4325 ratio = 1; 4524 ratio = 1;
4326 4525
4327 zone->inactive_ratio = ratio; 4526 zone->inactive_ratio = ratio;
4328 } 4527}
4528
4529static void __init setup_per_zone_inactive_ratio(void)
4530{
4531 struct zone *zone;
4532
4533 for_each_zone(zone)
4534 calculate_zone_inactive_ratio(zone);
4329} 4535}
4330 4536
4331/* 4537/*
@@ -4352,7 +4558,7 @@ static void setup_per_zone_inactive_ratio(void)
4352 * 8192MB: 11584k 4558 * 8192MB: 11584k
4353 * 16384MB: 16384k 4559 * 16384MB: 16384k
4354 */ 4560 */
4355static int __init init_per_zone_pages_min(void) 4561static int __init init_per_zone_wmark_min(void)
4356{ 4562{
4357 unsigned long lowmem_kbytes; 4563 unsigned long lowmem_kbytes;
4358 4564
@@ -4363,12 +4569,12 @@ static int __init init_per_zone_pages_min(void)
4363 min_free_kbytes = 128; 4569 min_free_kbytes = 128;
4364 if (min_free_kbytes > 65536) 4570 if (min_free_kbytes > 65536)
4365 min_free_kbytes = 65536; 4571 min_free_kbytes = 65536;
4366 setup_per_zone_pages_min(); 4572 setup_per_zone_wmarks();
4367 setup_per_zone_lowmem_reserve(); 4573 setup_per_zone_lowmem_reserve();
4368 setup_per_zone_inactive_ratio(); 4574 setup_per_zone_inactive_ratio();
4369 return 0; 4575 return 0;
4370} 4576}
4371module_init(init_per_zone_pages_min) 4577module_init(init_per_zone_wmark_min)
4372 4578
4373/* 4579/*
4374 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so 4580 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -4380,7 +4586,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
4380{ 4586{
4381 proc_dointvec(table, write, file, buffer, length, ppos); 4587 proc_dointvec(table, write, file, buffer, length, ppos);
4382 if (write) 4588 if (write)
4383 setup_per_zone_pages_min(); 4589 setup_per_zone_wmarks();
4384 return 0; 4590 return 0;
4385} 4591}
4386 4592
@@ -4424,7 +4630,7 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
4424 * whenever sysctl_lowmem_reserve_ratio changes. 4630 * whenever sysctl_lowmem_reserve_ratio changes.
4425 * 4631 *
4426 * The reserve ratio obviously has absolutely no relation with the 4632 * The reserve ratio obviously has absolutely no relation with the
4427 * pages_min watermarks. The lowmem reserve ratio can only make sense 4633 * minimum watermarks. The lowmem reserve ratio can only make sense
4428 * if in function of the boot time zone sizes. 4634 * if in function of the boot time zone sizes.
4429 */ 4635 */
4430int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, 4636int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
@@ -4531,23 +4737,13 @@ void *__init alloc_large_system_hash(const char *tablename,
4531 else if (hashdist) 4737 else if (hashdist)
4532 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); 4738 table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
4533 else { 4739 else {
4534 unsigned long order = get_order(size);
4535 table = (void*) __get_free_pages(GFP_ATOMIC, order);
4536 /* 4740 /*
4537 * If bucketsize is not a power-of-two, we may free 4741 * If bucketsize is not a power-of-two, we may free
4538 * some pages at the end of hash table. 4742 * some pages at the end of hash table which
4743 * alloc_pages_exact() automatically does
4539 */ 4744 */
4540 if (table) { 4745 if (get_order(size) < MAX_ORDER)
4541 unsigned long alloc_end = (unsigned long)table + 4746 table = alloc_pages_exact(size, GFP_ATOMIC);
4542 (PAGE_SIZE << order);
4543 unsigned long used = (unsigned long)table +
4544 PAGE_ALIGN(size);
4545 split_page(virt_to_page(table), order);
4546 while (used < alloc_end) {
4547 free_page(used);
4548 used += PAGE_SIZE;
4549 }
4550 }
4551 } 4747 }
4552 } while (!table && size > PAGE_SIZE && --log2qty); 4748 } while (!table && size > PAGE_SIZE && --log2qty);
4553 4749
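
The alloc_large_system_hash() hunk above swaps the open-coded __get_free_pages() plus split_page() plus tail-freeing loop for alloc_pages_exact(), which allocates the covering 2^order block and returns the unused tail pages itself; the get_order(size) < MAX_ORDER test keeps it from asking for an impossible order. A standalone model of the page arithmetic involved, with an illustrative PAGE_SIZE and table size:

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* Smallest order such that (PAGE_SIZE << order) covers size. */
static unsigned int get_order_model(unsigned long size)
{
    unsigned int order = 0;

    while ((PAGE_SIZE << order) < size)
        order++;
    return order;
}

int main(void)
{
    unsigned long size = 3 * 1024 * 1024 + 12345;   /* an awkward hash-table size */
    unsigned int order = get_order_model(size);
    unsigned long used = PAGE_ALIGN(size) / PAGE_SIZE;
    unsigned long allocated = 1UL << order;

    /*
     * alloc_pages_exact() grabs the 2^order block, splits it and hands the
     * unused tail pages back to the buddy allocator, the bookkeeping the
     * removed split_page()/free_page() loop used to do by hand.
     */
    printf("order %u block = %lu pages, used %lu, returned %lu\n",
           order, allocated, used, allocated - used);
    return 0;
}
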