Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 354
 1 file changed, 225 insertions(+), 129 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e208f0ad68c..4ba5e37127fc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 #ifdef CONFIG_HIGHMEM
 	[N_HIGH_MEMORY] = { { [0] = 1UL } },
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+	[N_MEMORY] = { { [0] = 1UL } },
+#endif
 	[N_CPU] = { { [0] = 1UL } },
 #endif /* NUMA */
 };
@@ -368,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
 	int nr_pages = 1 << order;
 	int bad = 0;
 
-	if (unlikely(compound_order(page) != order) ||
-	    unlikely(!PageHead(page))) {
+	if (unlikely(compound_order(page) != order)) {
 		bad_page(page);
 		bad++;
 	}
@@ -523,7 +525,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * If a block is freed, and its buddy is also free, then this
  * triggers coalescing into a block of larger size.
  *
- * -- wli
+ * -- nyc
  */
 
 static inline void __free_one_page(struct page *page,
@@ -608,6 +610,7 @@ static inline int free_pages_check(struct page *page)
 		bad_page(page);
 		return 1;
 	}
+	reset_page_last_nid(page);
 	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
 		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return 0;
@@ -667,11 +670,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
 			__free_one_page(page, zone, 0, mt);
 			trace_mm_page_pcpu_drain(page, 0, mt);
-			if (is_migrate_cma(mt))
-				__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+			if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
+				__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+				if (is_migrate_cma(mt))
+					__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+			}
 		} while (--to_free && --batch_free && !list_empty(list));
 	}
-	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
 	spin_unlock(&zone->lock);
 }
 
@@ -730,6 +735,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
+/*
+ * Read access to zone->managed_pages is safe because it's unsigned long,
+ * but we still need to serialize writers. Currently all callers of
+ * __free_pages_bootmem() except put_page_bootmem() should only be used
+ * at boot time. So for shorter boot time, we shift the burden to
+ * put_page_bootmem() to serialize writers.
+ */
 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
@@ -745,6 +757,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 		set_page_count(p, 0);
 	}
 
+	page_zone(page)->managed_pages += 1 << order;
 	set_page_refcounted(page);
 	__free_pages(page, order);
 }
@@ -780,7 +793,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
  * large block of memory acted on by a series of small allocations.
  * This behavior is a critical factor in sglist merging's success.
  *
- * -- wli
+ * -- nyc
  */
 static inline void expand(struct zone *zone, struct page *page,
 	int low, int high, struct free_area *area,
@@ -1392,21 +1405,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 
 	zone = page_zone(page);
 	order = page_order(page);
+	mt = get_pageblock_migratetype(page);
 
-	/* Obey watermarks as if the page was being allocated */
-	watermark = low_wmark_pages(zone) + (1 << order);
-	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-		return 0;
+	if (mt != MIGRATE_ISOLATE) {
+		/* Obey watermarks as if the page was being allocated */
+		watermark = low_wmark_pages(zone) + (1 << order);
+		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+			return 0;
+
+		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+	}
 
 	/* Remove page from free list */
 	list_del(&page->lru);
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	mt = get_pageblock_migratetype(page);
-	if (unlikely(mt != MIGRATE_ISOLATE))
-		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
-
 	if (alloc_order != order)
 		expand(zone, page, alloc_order, order,
 			&zone->free_area[order], migratetype);
@@ -1692,7 +1706,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
  *
  * If the zonelist cache is present in the passed in zonelist, then
  * returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
+ * tasks mems_allowed, or node_states[N_MEMORY].)
  *
  * If the zonelist cache is not available for this zonelist, does
  * nothing and returns NULL.
@@ -1721,7 +1735,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
 
 	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
 					&cpuset_current_mems_allowed :
-					&node_states[N_HIGH_MEMORY];
+					&node_states[N_MEMORY];
 	return allowednodes;
 }
 
@@ -1871,7 +1885,7 @@ zonelist_scan:
 	 */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						high_zoneidx, nodemask) {
-		if (NUMA_BUILD && zlc_active &&
+		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 				continue;
 		if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1931,8 @@ zonelist_scan:
 				classzone_idx, alloc_flags))
 			goto try_this_zone;
 
-		if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+		if (IS_ENABLED(CONFIG_NUMA) &&
+			!did_zlc_setup && nr_online_nodes > 1) {
 			/*
 			 * we do zlc_setup if there are multiple nodes
 			 * and before considering the first zone allowed
@@ -1936,7 +1951,7 @@ zonelist_scan:
 		 * As we may have just activated ZLC, check if the first
 		 * eligible zone has failed zone_reclaim recently.
 		 */
-		if (NUMA_BUILD && zlc_active &&
+		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
 			!zlc_zone_worth_trying(zonelist, z, allowednodes))
 			continue;
 
@@ -1962,11 +1977,11 @@ try_this_zone:
 		if (page)
 			break;
 this_zone_full:
-		if (NUMA_BUILD)
+		if (IS_ENABLED(CONFIG_NUMA))
 			zlc_mark_zone_full(zonelist, z);
 	}
 
-	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
 		/* Disable zlc cache for second zonelist scan */
 		zlc_active = 0;
 		goto zonelist_scan;
@@ -2266,7 +2281,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 		return NULL;
 
 	/* After successful reclaim, reconsider all zones for allocation */
-	if (NUMA_BUILD)
+	if (IS_ENABLED(CONFIG_NUMA))
 		zlc_clear_zones_full(zonelist);
 
 retry:
@@ -2412,7 +2427,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * allowed per node queues are empty and that nodes are
 	 * over allocated.
 	 */
-	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+	if (IS_ENABLED(CONFIG_NUMA) &&
+		(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
 restart:
@@ -2596,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
 	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET;
+	struct mem_cgroup *memcg = NULL;
 
 	gfp_mask &= gfp_allowed_mask;
 
@@ -2614,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
+	/*
+	 * Will only have any effect when __GFP_KMEMCG is set. This is
+	 * verified in the (always inline) callee
+	 */
+	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
+		return NULL;
+
 retry_cpuset:
 	cpuset_mems_cookie = get_mems_allowed();
 
@@ -2649,6 +2673,8 @@ out:
 	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
 		goto retry_cpuset;
 
+	memcg_kmem_commit_charge(page, memcg, order);
+
 	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2701,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order)
 
 EXPORT_SYMBOL(free_pages);
 
+/*
+ * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
+ * pages allocated with __GFP_KMEMCG.
+ *
+ * Those pages are accounted to a particular memcg, embedded in the
+ * corresponding page_cgroup. To avoid adding a hit in the allocator to search
+ * for that information only to find out that it is NULL for users who have no
+ * interest in that whatsoever, we provide these functions.
+ *
+ * The caller knows better which flags it relies on.
+ */
+void __free_memcg_kmem_pages(struct page *page, unsigned int order)
+{
+	memcg_kmem_uncharge_pages(page, order);
+	__free_pages(page, order);
+}
+
+void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
+{
+	if (addr != 0) {
+		VM_BUG_ON(!virt_addr_valid((void *)addr));
+		__free_memcg_kmem_pages(virt_to_page((void *)addr), order);
+	}
+}
+
 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
 {
 	if (addr) {
@@ -2819,7 +2870,7 @@ unsigned int nr_free_pagecache_pages(void)
 
 static inline void show_node(struct zone *zone)
 {
-	if (NUMA_BUILD)
+	if (IS_ENABLED(CONFIG_NUMA))
 		printk("Node %d ", zone_to_nid(zone));
 }
 
@@ -2877,6 +2928,31 @@ out:
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
+static void show_migration_types(unsigned char type)
+{
+	static const char types[MIGRATE_TYPES] = {
+		[MIGRATE_UNMOVABLE]	= 'U',
+		[MIGRATE_RECLAIMABLE]	= 'E',
+		[MIGRATE_MOVABLE]	= 'M',
+		[MIGRATE_RESERVE]	= 'R',
+#ifdef CONFIG_CMA
+		[MIGRATE_CMA]		= 'C',
+#endif
+		[MIGRATE_ISOLATE]	= 'I',
+	};
+	char tmp[MIGRATE_TYPES + 1];
+	char *p = tmp;
+	int i;
+
+	for (i = 0; i < MIGRATE_TYPES; i++) {
+		if (type & (1 << i))
+			*p++ = types[i];
+	}
+
+	*p = '\0';
+	printk("(%s) ", tmp);
+}
+
 /*
  * Show free area list (used inside shift_scroll-lock stuff)
  * We also calculate the percentage fragmentation. We do this by counting the
@@ -2951,6 +3027,7 @@ void show_free_areas(unsigned int filter)
 			" isolated(anon):%lukB"
 			" isolated(file):%lukB"
 			" present:%lukB"
+			" managed:%lukB"
 			" mlocked:%lukB"
 			" dirty:%lukB"
 			" writeback:%lukB"
@@ -2980,6 +3057,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_ISOLATED_ANON)),
 			K(zone_page_state(zone, NR_ISOLATED_FILE)),
 			K(zone->present_pages),
+			K(zone->managed_pages),
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_FILE_DIRTY)),
 			K(zone_page_state(zone, NR_WRITEBACK)),
@@ -3005,6 +3083,7 @@ void show_free_areas(unsigned int filter)
 
 	for_each_populated_zone(zone) {
 		unsigned long nr[MAX_ORDER], flags, order, total = 0;
+		unsigned char types[MAX_ORDER];
 
 		if (skip_free_areas_node(filter, zone_to_nid(zone)))
 			continue;
@@ -3013,12 +3092,24 @@ void show_free_areas(unsigned int filter)
 
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order = 0; order < MAX_ORDER; order++) {
-			nr[order] = zone->free_area[order].nr_free;
+			struct free_area *area = &zone->free_area[order];
+			int type;
+
+			nr[order] = area->nr_free;
 			total += nr[order] << order;
+
+			types[order] = 0;
+			for (type = 0; type < MIGRATE_TYPES; type++) {
+				if (!list_empty(&area->free_list[type]))
+					types[order] |= 1 << type;
+			}
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
-		for (order = 0; order < MAX_ORDER; order++)
+		for (order = 0; order < MAX_ORDER; order++) {
 			printk("%lu*%lukB ", nr[order], K(1UL) << order);
+			if (nr[order])
+				show_migration_types(types[order]);
+		}
 		printk("= %lukB\n", K(total));
 	}
 
@@ -3195,7 +3286,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 		return node;
 	}
 
-	for_each_node_state(n, N_HIGH_MEMORY) {
+	for_each_node_state(n, N_MEMORY) {
 
 		/* Don't want a node to appear more than once */
 		if (node_isset(n, *used_node_mask))
@@ -3337,7 +3428,7 @@ static int default_zonelist_order(void)
 	 * local memory, NODE_ORDER may be suitable.
 	 */
 	average_size = total_size /
-			(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
+			(nodes_weight(node_states[N_MEMORY]) + 1);
 	for_each_online_node(nid) {
 		low_kmem_size = 0;
 		total_size = 0;
@@ -3827,6 +3918,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		mminit_verify_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		reset_page_mapcount(page);
+		reset_page_last_nid(page);
 		SetPageReserved(page);
 		/*
 		 * Mark the block movable so that blocks are reserved for
@@ -4433,6 +4525,26 @@ void __init set_pageblock_order(void)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
+static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
+						unsigned long present_pages)
+{
+	unsigned long pages = spanned_pages;
+
+	/*
+	 * Provide a more accurate estimation if there are holes within
+	 * the zone and SPARSEMEM is in use. If there are holes within the
+	 * zone, each populated memory region may cost us one or two extra
+	 * memmap pages due to alignment because memmap pages for each
+	 * populated regions may not naturally algined on page boundary.
+	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
+	 */
+	if (spanned_pages > present_pages + (present_pages >> 4) &&
+	    IS_ENABLED(CONFIG_SPARSEMEM))
+		pages = present_pages;
+
+	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
+}
+
 /*
  * Set up the zone data structures:
  *   - mark all pages reserved
@@ -4450,54 +4562,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	int ret;
 
 	pgdat_resize_init(pgdat);
+#ifdef CONFIG_NUMA_BALANCING
+	spin_lock_init(&pgdat->numabalancing_migrate_lock);
+	pgdat->numabalancing_migrate_nr_pages = 0;
+	pgdat->numabalancing_migrate_next_window = jiffies;
+#endif
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat_page_cgroup_init(pgdat);
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, realsize, memmap_pages;
+		unsigned long size, realsize, freesize, memmap_pages;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
-		realsize = size - zone_absent_pages_in_node(nid, j,
+		realsize = freesize = size - zone_absent_pages_in_node(nid, j,
 								zholes_size);
 
 		/*
-		 * Adjust realsize so that it accounts for how much memory
+		 * Adjust freesize so that it accounts for how much memory
 		 * is used by this zone for memmap. This affects the watermark
 		 * and per-cpu initialisations
 		 */
-		memmap_pages =
-			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
-		if (realsize >= memmap_pages) {
-			realsize -= memmap_pages;
+		memmap_pages = calc_memmap_size(size, realsize);
+		if (freesize >= memmap_pages) {
+			freesize -= memmap_pages;
 			if (memmap_pages)
 				printk(KERN_DEBUG
 				       "  %s zone: %lu pages used for memmap\n",
 				       zone_names[j], memmap_pages);
 		} else
 			printk(KERN_WARNING
-				"  %s zone: %lu pages exceeds realsize %lu\n",
-				zone_names[j], memmap_pages, realsize);
+				"  %s zone: %lu pages exceeds freesize %lu\n",
+				zone_names[j], memmap_pages, freesize);
 
 		/* Account for reserved pages */
-		if (j == 0 && realsize > dma_reserve) {
-			realsize -= dma_reserve;
+		if (j == 0 && freesize > dma_reserve) {
+			freesize -= dma_reserve;
 			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
 					zone_names[0], dma_reserve);
 		}
 
 		if (!is_highmem_idx(j))
-			nr_kernel_pages += realsize;
-		nr_all_pages += realsize;
+			nr_kernel_pages += freesize;
+		/* Charge for highmem memmap if there are enough kernel pages */
+		else if (nr_kernel_pages > memmap_pages * 2)
+			nr_kernel_pages -= memmap_pages;
+		nr_all_pages += freesize;
 
 		zone->spanned_pages = size;
-		zone->present_pages = realsize;
+		zone->present_pages = freesize;
+		/*
+		 * Set an approximate value for lowmem here, it will be adjusted
+		 * when the bootmem allocator frees pages into the buddy system.
+		 * And all highmem pages will be managed by the buddy system.
+		 */
+		zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
 #ifdef CONFIG_NUMA
 		zone->node = nid;
-		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
+		zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
 						/ 100;
-		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
+		zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
@@ -4688,7 +4813,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
 /*
  * early_calculate_totalpages()
  * Sum pages in active regions for movable zone.
- * Populate N_HIGH_MEMORY for calculating usable_nodes.
+ * Populate N_MEMORY for calculating usable_nodes.
  */
 static unsigned long __init early_calculate_totalpages(void)
 {
@@ -4701,7 +4826,7 @@ static unsigned long __init early_calculate_totalpages(void)
 
 		totalpages += pages;
 		if (pages)
-			node_set_state(nid, N_HIGH_MEMORY);
+			node_set_state(nid, N_MEMORY);
 	}
 	return totalpages;
 }
@@ -4718,9 +4843,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	unsigned long usable_startpfn;
 	unsigned long kernelcore_node, kernelcore_remaining;
 	/* save the state before borrow the nodemask */
-	nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
+	nodemask_t saved_node_state = node_states[N_MEMORY];
 	unsigned long totalpages = early_calculate_totalpages();
-	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
 
 	/*
 	 * If movablecore was specified, calculate what size of
@@ -4755,7 +4880,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 restart:
 	/* Spread kernelcore memory as evenly as possible throughout nodes */
 	kernelcore_node = required_kernelcore / usable_nodes;
-	for_each_node_state(nid, N_HIGH_MEMORY) {
+	for_each_node_state(nid, N_MEMORY) {
 		unsigned long start_pfn, end_pfn;
 
 		/*
@@ -4847,23 +4972,27 @@ restart:
 
 out:
 	/* restore the node_state */
-	node_states[N_HIGH_MEMORY] = saved_node_state;
+	node_states[N_MEMORY] = saved_node_state;
 }
 
-/* Any regular memory on that node ? */
-static void __init check_for_regular_memory(pg_data_t *pgdat)
+/* Any regular or high memory on that node ? */
+static void check_for_memory(pg_data_t *pgdat, int nid)
 {
-#ifdef CONFIG_HIGHMEM
 	enum zone_type zone_type;
 
-	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
+	if (N_MEMORY == N_NORMAL_MEMORY)
+		return;
+
+	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
 		struct zone *zone = &pgdat->node_zones[zone_type];
 		if (zone->present_pages) {
-			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+			node_set_state(nid, N_HIGH_MEMORY);
+			if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
+			    zone_type <= ZONE_NORMAL)
+				node_set_state(nid, N_NORMAL_MEMORY);
 			break;
 		}
 	}
-#endif
 }
 
 /**
@@ -4946,8 +5075,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
 		/* Any memory on that node */
 		if (pgdat->node_present_pages)
-			node_set_state(nid, N_HIGH_MEMORY);
-		check_for_regular_memory(pgdat);
+			node_set_state(nid, N_MEMORY);
+		check_for_memory(pgdat, nid);
 	}
 }
 
@@ -5175,10 +5304,6 @@ static void __setup_per_zone_wmarks(void)
 		zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + (tmp >> 2);
 		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
 
-		zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
-		zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
-		zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
-
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
@@ -5576,7 +5701,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
  * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
  * expect this function should be exact.
  */
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+			 bool skip_hwpoisoned_pages)
 {
 	unsigned long pfn, iter, found;
 	int mt;
@@ -5611,6 +5737,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
 			continue;
 		}
 
+		/*
+		 * The HWPoisoned page may be not in buddy system, and
+		 * page_count() is not 0.
+		 */
+		if (skip_hwpoisoned_pages && PageHWPoison(page))
+			continue;
+
 		if (!PageLRU(page))
 			found++;
 		/*
@@ -5653,7 +5786,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 	    zone->zone_start_pfn + zone->spanned_pages <= pfn)
 		return false;
 
-	return !has_unmovable_pages(zone, page, 0);
+	return !has_unmovable_pages(zone, page, 0, true);
 }
 
 #ifdef CONFIG_CMA
@@ -5680,7 +5813,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 	unsigned int tries = 0;
 	int ret = 0;
 
-	migrate_prep_local();
+	migrate_prep();
 
 	while (pfn < end || !list_empty(&cc->migratepages)) {
 		if (fatal_signal_pending(current)) {
@@ -5708,61 +5841,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 
 		ret = migrate_pages(&cc->migratepages,
 				    alloc_migrate_target,
-				    0, false, MIGRATE_SYNC);
+				    0, false, MIGRATE_SYNC,
+				    MR_CMA);
 	}
 
-	putback_lru_pages(&cc->migratepages);
+	putback_movable_pages(&cc->migratepages);
 	return ret > 0 ? 0 : ret;
 }
 
-/*
- * Update zone's cma pages counter used for watermark level calculation.
- */
-static inline void __update_cma_watermarks(struct zone *zone, int count)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&zone->lock, flags);
-	zone->min_cma_pages += count;
-	spin_unlock_irqrestore(&zone->lock, flags);
-	setup_per_zone_wmarks();
-}
-
-/*
- * Trigger memory pressure bump to reclaim some pages in order to be able to
- * allocate 'count' pages in single page units. Does similar work as
- *__alloc_pages_slowpath() function.
- */
-static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
-{
-	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	struct zonelist *zonelist = node_zonelist(0, gfp_mask);
-	int did_some_progress = 0;
-	int order = 1;
-
-	/*
-	 * Increase level of watermarks to force kswapd do his job
-	 * to stabilise at new watermark level.
-	 */
-	__update_cma_watermarks(zone, count);
-
-	/* Obey watermarks as if the page was being allocated */
-	while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
-		wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
-
-		did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
-						      NULL);
-		if (!did_some_progress) {
-			/* Exhausted what can be done so it's blamo time */
-			out_of_memory(zonelist, gfp_mask, order, NULL, false);
-		}
-	}
-
-	/* Restore original watermark levels. */
-	__update_cma_watermarks(zone, -count);
-
-	return count;
-}
-
 /**
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:	start PFN to allocate
@@ -5786,7 +5872,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
 int alloc_contig_range(unsigned long start, unsigned long end,
 		       unsigned migratetype)
 {
-	struct zone *zone = page_zone(pfn_to_page(start));
 	unsigned long outer_start, outer_end;
 	int ret = 0, order;
 
@@ -5824,7 +5909,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	 */
 
 	ret = start_isolate_page_range(pfn_max_align_down(start),
-				       pfn_max_align_up(end), migratetype);
+				       pfn_max_align_up(end), migratetype,
+				       false);
 	if (ret)
 		return ret;
 
@@ -5863,18 +5949,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	}
 
 	/* Make sure the range is really isolated. */
-	if (test_pages_isolated(outer_start, end)) {
+	if (test_pages_isolated(outer_start, end, false)) {
 		pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
 		       outer_start, end);
 		ret = -EBUSY;
 		goto done;
 	}
 
-	/*
-	 * Reclaim enough pages to make sure that contiguous allocation
-	 * will not starve the system.
-	 */
-	__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
 
 	/* Grab isolated pages from freelists. */
 	outer_end = isolate_freepages_range(&cc, outer_start, end);
@@ -5897,8 +5978,15 @@ done:
 
 void free_contig_range(unsigned long pfn, unsigned nr_pages)
 {
-	for (; nr_pages--; ++pfn)
-		__free_page(pfn_to_page(pfn));
+	unsigned int count = 0;
+
+	for (; nr_pages--; pfn++) {
+		struct page *page = pfn_to_page(pfn);
+
+		count += page_count(page) != 1;
+		__free_page(page);
+	}
+	WARN(count != 0, "%d pages are still in use!\n", count);
 }
 #endif
 
@@ -5932,7 +6020,6 @@ void __meminit zone_pcp_update(struct zone *zone)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void zone_pcp_reset(struct zone *zone)
 {
 	unsigned long flags;
@@ -5952,6 +6039,7 @@ void zone_pcp_reset(struct zone *zone)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be isolated before calling this.
  */
@@ -5978,6 +6066,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 			continue;
 		}
 		page = pfn_to_page(pfn);
+		/*
+		 * The HWPoisoned page may be not in buddy system, and
+		 * page_count() is not 0.
+		 */
+		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
+			pfn++;
+			SetPageReserved(page);
+			continue;
+		}
+
 		BUG_ON(page_count(page));
 		BUG_ON(!PageBuddy(page));
 		order = page_order(page);
@@ -5988,8 +6086,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		list_del(&page->lru);
 		rmv_page_order(page);
 		zone->free_area[order].nr_free--;
-		__mod_zone_page_state(zone, NR_FREE_PAGES,
-			      - (1UL << order));
 		for (i = 0; i < (1 << order); i++)
 			SetPageReserved((page+i));
 		pfn += (1 << order);