Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	305
1 file changed, 180 insertions, 125 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7e208f0ad68c..d037c8bc1512 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 #ifdef CONFIG_HIGHMEM
 	[N_HIGH_MEMORY] = { { [0] = 1UL } },
 #endif
+#ifdef CONFIG_MOVABLE_NODE
+	[N_MEMORY] = { { [0] = 1UL } },
+#endif
 	[N_CPU] = { { [0] = 1UL } },
 #endif	/* NUMA */
 };
@@ -523,7 +526,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
  * If a block is freed, and its buddy is also free, then this
  * triggers coalescing into a block of larger size.
  *
- * -- wli
+ * -- nyc
  */
 
 static inline void __free_one_page(struct page *page,
@@ -608,6 +611,7 @@ static inline int free_pages_check(struct page *page)
 		bad_page(page);
 		return 1;
 	}
+	reset_page_last_nid(page);
 	if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
 		page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
 	return 0;
@@ -667,11 +671,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
 			__free_one_page(page, zone, 0, mt);
 			trace_mm_page_pcpu_drain(page, 0, mt);
-			if (is_migrate_cma(mt))
-				__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+			if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) {
+				__mod_zone_page_state(zone, NR_FREE_PAGES, 1);
+				if (is_migrate_cma(mt))
+					__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1);
+			}
 		} while (--to_free && --batch_free && !list_empty(list));
 	}
-	__mod_zone_page_state(zone, NR_FREE_PAGES, count);
 	spin_unlock(&zone->lock);
 }
 
@@ -730,6 +736,13 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 	local_irq_restore(flags);
 }
 
+/*
+ * Read access to zone->managed_pages is safe because it's unsigned long,
+ * but we still need to serialize writers. Currently all callers of
+ * __free_pages_bootmem() except put_page_bootmem() should only be used
+ * at boot time. So for shorter boot time, we shift the burden to
+ * put_page_bootmem() to serialize writers.
+ */
 void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
@@ -745,6 +758,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order)
 		set_page_count(p, 0);
 	}
 
+	page_zone(page)->managed_pages += 1 << order;
 	set_page_refcounted(page);
 	__free_pages(page, order);
 }
@@ -780,7 +794,7 @@ void __init init_cma_reserved_pageblock(struct page *page)
  * large block of memory acted on by a series of small allocations.
  * This behavior is a critical factor in sglist merging's success.
  *
- * -- wli
+ * -- nyc
  */
 static inline void expand(struct zone *zone, struct page *page,
 	int low, int high, struct free_area *area,
@@ -1392,21 +1406,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype)
 
 	zone = page_zone(page);
 	order = page_order(page);
+	mt = get_pageblock_migratetype(page);
 
-	/* Obey watermarks as if the page was being allocated */
-	watermark = low_wmark_pages(zone) + (1 << order);
-	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
-		return 0;
+	if (mt != MIGRATE_ISOLATE) {
+		/* Obey watermarks as if the page was being allocated */
+		watermark = low_wmark_pages(zone) + (1 << order);
+		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+			return 0;
+
+		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
+	}
 
 	/* Remove page from free list */
 	list_del(&page->lru);
 	zone->free_area[order].nr_free--;
 	rmv_page_order(page);
 
-	mt = get_pageblock_migratetype(page);
-	if (unlikely(mt != MIGRATE_ISOLATE))
-		__mod_zone_freepage_state(zone, -(1UL << alloc_order), mt);
-
 	if (alloc_order != order)
 		expand(zone, page, alloc_order, order,
 			&zone->free_area[order], migratetype);
@@ -1692,7 +1707,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
  *
  * If the zonelist cache is present in the passed in zonelist, then
  * returns a pointer to the allowed node mask (either the current
- * tasks mems_allowed, or node_states[N_HIGH_MEMORY].)
+ * tasks mems_allowed, or node_states[N_MEMORY].)
  *
  * If the zonelist cache is not available for this zonelist, does
  * nothing and returns NULL.
@@ -1721,7 +1736,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
 
 	allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
 					&cpuset_current_mems_allowed :
-					&node_states[N_HIGH_MEMORY];
+					&node_states[N_MEMORY];
 	return allowednodes;
 }
 
@@ -1871,7 +1886,7 @@ zonelist_scan:
 	 */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
						high_zoneidx, nodemask) {
-		if (NUMA_BUILD && zlc_active &&
+		if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
			!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;
 		if ((alloc_flags & ALLOC_CPUSET) &&
@@ -1917,7 +1932,8 @@ zonelist_scan:
				    classzone_idx, alloc_flags))
				goto try_this_zone;
 
-			if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
+			if (IS_ENABLED(CONFIG_NUMA) &&
+					!did_zlc_setup && nr_online_nodes > 1) {
 				/*
 				 * we do zlc_setup if there are multiple nodes
 				 * and before considering the first zone allowed
@@ -1936,7 +1952,7 @@ zonelist_scan:
 			 * As we may have just activated ZLC, check if the first
 			 * eligible zone has failed zone_reclaim recently.
 			 */
-			if (NUMA_BUILD && zlc_active &&
+			if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
				!zlc_zone_worth_trying(zonelist, z, allowednodes))
				continue;
 
@@ -1962,11 +1978,11 @@ try_this_zone:
 		if (page)
 			break;
 this_zone_full:
-		if (NUMA_BUILD)
+		if (IS_ENABLED(CONFIG_NUMA))
 			zlc_mark_zone_full(zonelist, z);
 	}
 
-	if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+	if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) {
 		/* Disable zlc cache for second zonelist scan */
 		zlc_active = 0;
 		goto zonelist_scan;
@@ -2266,7 +2282,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 		return NULL;
 
 	/* After successful reclaim, reconsider all zones for allocation */
-	if (NUMA_BUILD)
+	if (IS_ENABLED(CONFIG_NUMA))
 		zlc_clear_zones_full(zonelist);
 
 retry:
@@ -2412,7 +2428,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	 * allowed per node queues are empty and that nodes are
 	 * over allocated.
 	 */
-	if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+	if (IS_ENABLED(CONFIG_NUMA) &&
+		(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
 		goto nopage;
 
 restart:
@@ -2819,7 +2836,7 @@ unsigned int nr_free_pagecache_pages(void)
 
 static inline void show_node(struct zone *zone)
 {
-	if (NUMA_BUILD)
+	if (IS_ENABLED(CONFIG_NUMA))
 		printk("Node %d ", zone_to_nid(zone));
 }
 
@@ -2877,6 +2894,31 @@ out:
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
 
+static void show_migration_types(unsigned char type)
+{
+	static const char types[MIGRATE_TYPES] = {
+		[MIGRATE_UNMOVABLE]	= 'U',
+		[MIGRATE_RECLAIMABLE]	= 'E',
+		[MIGRATE_MOVABLE]	= 'M',
+		[MIGRATE_RESERVE]	= 'R',
+#ifdef CONFIG_CMA
+		[MIGRATE_CMA]		= 'C',
+#endif
+		[MIGRATE_ISOLATE]	= 'I',
+	};
+	char tmp[MIGRATE_TYPES + 1];
+	char *p = tmp;
+	int i;
+
+	for (i = 0; i < MIGRATE_TYPES; i++) {
+		if (type & (1 << i))
+			*p++ = types[i];
+	}
+
+	*p = '\0';
+	printk("(%s) ", tmp);
+}
+
 /*
  * Show free area list (used inside shift_scroll-lock stuff)
  * We also calculate the percentage fragmentation. We do this by counting the
@@ -2951,6 +2993,7 @@ void show_free_areas(unsigned int filter)
 			" isolated(anon):%lukB"
 			" isolated(file):%lukB"
 			" present:%lukB"
+			" managed:%lukB"
 			" mlocked:%lukB"
 			" dirty:%lukB"
 			" writeback:%lukB"
@@ -2980,6 +3023,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_ISOLATED_ANON)),
 			K(zone_page_state(zone, NR_ISOLATED_FILE)),
 			K(zone->present_pages),
+			K(zone->managed_pages),
 			K(zone_page_state(zone, NR_MLOCK)),
 			K(zone_page_state(zone, NR_FILE_DIRTY)),
 			K(zone_page_state(zone, NR_WRITEBACK)),
@@ -3005,6 +3049,7 @@ void show_free_areas(unsigned int filter)
 
 	for_each_populated_zone(zone) {
 		unsigned long nr[MAX_ORDER], flags, order, total = 0;
+		unsigned char types[MAX_ORDER];
 
 		if (skip_free_areas_node(filter, zone_to_nid(zone)))
 			continue;
@@ -3013,12 +3058,24 @@ void show_free_areas(unsigned int filter)
 
 		spin_lock_irqsave(&zone->lock, flags);
 		for (order = 0; order < MAX_ORDER; order++) {
-			nr[order] = zone->free_area[order].nr_free;
+			struct free_area *area = &zone->free_area[order];
+			int type;
+
+			nr[order] = area->nr_free;
 			total += nr[order] << order;
+
+			types[order] = 0;
+			for (type = 0; type < MIGRATE_TYPES; type++) {
+				if (!list_empty(&area->free_list[type]))
+					types[order] |= 1 << type;
+			}
 		}
 		spin_unlock_irqrestore(&zone->lock, flags);
-		for (order = 0; order < MAX_ORDER; order++)
+		for (order = 0; order < MAX_ORDER; order++) {
 			printk("%lu*%lukB ", nr[order], K(1UL) << order);
+			if (nr[order])
+				show_migration_types(types[order]);
+		}
 		printk("= %lukB\n", K(total));
 	}
 
@@ -3195,7 +3252,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
 		return node;
 	}
 
-	for_each_node_state(n, N_HIGH_MEMORY) {
+	for_each_node_state(n, N_MEMORY) {
 
 		/* Don't want a node to appear more than once */
 		if (node_isset(n, *used_node_mask))
@@ -3337,7 +3394,7 @@ static int default_zonelist_order(void)
 	 * local memory, NODE_ORDER may be suitable.
 	 */
 	average_size = total_size /
-				(nodes_weight(node_states[N_HIGH_MEMORY]) + 1);
+				(nodes_weight(node_states[N_MEMORY]) + 1);
 	for_each_online_node(nid) {
 		low_kmem_size = 0;
 		total_size = 0;
@@ -3827,6 +3884,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		mminit_verify_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		reset_page_mapcount(page);
+		reset_page_last_nid(page);
 		SetPageReserved(page);
 		/*
 		 * Mark the block movable so that blocks are reserved for
@@ -4433,6 +4491,26 @@ void __init set_pageblock_order(void)
 
 #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
 
+static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
+						unsigned long present_pages)
+{
+	unsigned long pages = spanned_pages;
+
+	/*
+	 * Provide a more accurate estimation if there are holes within
+	 * the zone and SPARSEMEM is in use. If there are holes within the
+	 * zone, each populated memory region may cost us one or two extra
+	 * memmap pages due to alignment because memmap pages for each
+	 * populated regions may not naturally algined on page boundary.
+	 * So the (present_pages >> 4) heuristic is a tradeoff for that.
+	 */
+	if (spanned_pages > present_pages + (present_pages >> 4) &&
+	    IS_ENABLED(CONFIG_SPARSEMEM))
+		pages = present_pages;
+
+	return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
+}
+
 /*
  * Set up the zone data structures:
  *  - mark all pages reserved
@@ -4450,54 +4528,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	int ret;
 
 	pgdat_resize_init(pgdat);
+#ifdef CONFIG_NUMA_BALANCING
+	spin_lock_init(&pgdat->numabalancing_migrate_lock);
+	pgdat->numabalancing_migrate_nr_pages = 0;
+	pgdat->numabalancing_migrate_next_window = jiffies;
+#endif
 	init_waitqueue_head(&pgdat->kswapd_wait);
 	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat_page_cgroup_init(pgdat);
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
-		unsigned long size, realsize, memmap_pages;
+		unsigned long size, realsize, freesize, memmap_pages;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
-		realsize = size - zone_absent_pages_in_node(nid, j,
+		realsize = freesize = size - zone_absent_pages_in_node(nid, j,
								zholes_size);
 
 		/*
-		 * Adjust realsize so that it accounts for how much memory
+		 * Adjust freesize so that it accounts for how much memory
 		 * is used by this zone for memmap. This affects the watermark
 		 * and per-cpu initialisations
 		 */
-		memmap_pages =
-			PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
-		if (realsize >= memmap_pages) {
-			realsize -= memmap_pages;
+		memmap_pages = calc_memmap_size(size, realsize);
+		if (freesize >= memmap_pages) {
+			freesize -= memmap_pages;
 			if (memmap_pages)
 				printk(KERN_DEBUG
				       "  %s zone: %lu pages used for memmap\n",
				       zone_names[j], memmap_pages);
 		} else
 			printk(KERN_WARNING
-				"  %s zone: %lu pages exceeds realsize %lu\n",
-				zone_names[j], memmap_pages, realsize);
+				"  %s zone: %lu pages exceeds freesize %lu\n",
+				zone_names[j], memmap_pages, freesize);
 
 		/* Account for reserved pages */
-		if (j == 0 && realsize > dma_reserve) {
-			realsize -= dma_reserve;
+		if (j == 0 && freesize > dma_reserve) {
+			freesize -= dma_reserve;
 			printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
					zone_names[0], dma_reserve);
 		}
 
 		if (!is_highmem_idx(j))
-			nr_kernel_pages += realsize;
-		nr_all_pages += realsize;
+			nr_kernel_pages += freesize;
+		/* Charge for highmem memmap if there are enough kernel pages */
+		else if (nr_kernel_pages > memmap_pages * 2)
+			nr_kernel_pages -= memmap_pages;
+		nr_all_pages += freesize;
 
 		zone->spanned_pages = size;
-		zone->present_pages = realsize;
+		zone->present_pages = freesize;
+		/*
+		 * Set an approximate value for lowmem here, it will be adjusted
+		 * when the bootmem allocator frees pages into the buddy system.
+		 * And all highmem pages will be managed by the buddy system.
+		 */
+		zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
 #ifdef CONFIG_NUMA
 		zone->node = nid;
-		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
+		zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio)
						/ 100;
-		zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
+		zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
 #endif
 		zone->name = zone_names[j];
 		spin_lock_init(&zone->lock);
@@ -4688,7 +4779,7 @@ unsigned long __init find_min_pfn_with_active_regions(void)
 /*
  * early_calculate_totalpages()
  * Sum pages in active regions for movable zone.
- * Populate N_HIGH_MEMORY for calculating usable_nodes.
+ * Populate N_MEMORY for calculating usable_nodes.
  */
 static unsigned long __init early_calculate_totalpages(void)
 {
@@ -4701,7 +4792,7 @@ static unsigned long __init early_calculate_totalpages(void)
 
 		totalpages += pages;
 		if (pages)
-			node_set_state(nid, N_HIGH_MEMORY);
+			node_set_state(nid, N_MEMORY);
 	}
 	return totalpages;
 }
@@ -4718,9 +4809,9 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 	unsigned long usable_startpfn;
 	unsigned long kernelcore_node, kernelcore_remaining;
 	/* save the state before borrow the nodemask */
-	nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
+	nodemask_t saved_node_state = node_states[N_MEMORY];
 	unsigned long totalpages = early_calculate_totalpages();
-	int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
+	int usable_nodes = nodes_weight(node_states[N_MEMORY]);
 
 	/*
 	 * If movablecore was specified, calculate what size of
@@ -4755,7 +4846,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
 restart:
 	/* Spread kernelcore memory as evenly as possible throughout nodes */
 	kernelcore_node = required_kernelcore / usable_nodes;
-	for_each_node_state(nid, N_HIGH_MEMORY) {
+	for_each_node_state(nid, N_MEMORY) {
 		unsigned long start_pfn, end_pfn;
 
 		/*
@@ -4847,23 +4938,27 @@ restart:
 
 out:
 	/* restore the node_state */
-	node_states[N_HIGH_MEMORY] = saved_node_state;
+	node_states[N_MEMORY] = saved_node_state;
 }
 
-/* Any regular memory on that node ? */
-static void __init check_for_regular_memory(pg_data_t *pgdat)
+/* Any regular or high memory on that node ? */
+static void check_for_memory(pg_data_t *pgdat, int nid)
 {
-#ifdef CONFIG_HIGHMEM
 	enum zone_type zone_type;
 
-	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
+	if (N_MEMORY == N_NORMAL_MEMORY)
+		return;
+
+	for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
 		struct zone *zone = &pgdat->node_zones[zone_type];
 		if (zone->present_pages) {
-			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+			node_set_state(nid, N_HIGH_MEMORY);
+			if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
+			    zone_type <= ZONE_NORMAL)
+				node_set_state(nid, N_NORMAL_MEMORY);
 			break;
 		}
 	}
-#endif
 }
 
 /**
@@ -4946,8 +5041,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
 		/* Any memory on that node */
 		if (pgdat->node_present_pages)
-			node_set_state(nid, N_HIGH_MEMORY);
-		check_for_regular_memory(pgdat);
+			node_set_state(nid, N_MEMORY);
+		check_for_memory(pgdat, nid);
 	}
 }
 
@@ -5175,10 +5270,6 @@ static void __setup_per_zone_wmarks(void)
 		zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2);
 		zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1);
 
-		zone->watermark[WMARK_MIN] += cma_wmark_pages(zone);
-		zone->watermark[WMARK_LOW] += cma_wmark_pages(zone);
-		zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone);
-
 		setup_zone_migrate_reserve(zone);
 		spin_unlock_irqrestore(&zone->lock, flags);
 	}
@@ -5576,7 +5667,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
  * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
  * expect this function should be exact.
  */
-bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
+			 bool skip_hwpoisoned_pages)
 {
 	unsigned long pfn, iter, found;
 	int mt;
@@ -5611,6 +5703,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
 			continue;
 		}
 
+		/*
+		 * The HWPoisoned page may be not in buddy system, and
+		 * page_count() is not 0.
+		 */
+		if (skip_hwpoisoned_pages && PageHWPoison(page))
+			continue;
+
 		if (!PageLRU(page))
 			found++;
 		/*
@@ -5653,7 +5752,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 			zone->zone_start_pfn + zone->spanned_pages <= pfn)
 		return false;
 
-	return !has_unmovable_pages(zone, page, 0);
+	return !has_unmovable_pages(zone, page, 0, true);
 }
 
 #ifdef CONFIG_CMA
@@ -5680,7 +5779,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 	unsigned int tries = 0;
 	int ret = 0;
 
-	migrate_prep_local();
+	migrate_prep();
 
 	while (pfn < end || !list_empty(&cc->migratepages)) {
 		if (fatal_signal_pending(current)) {
@@ -5708,61 +5807,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
 
 		ret = migrate_pages(&cc->migratepages,
 				    alloc_migrate_target,
-				    0, false, MIGRATE_SYNC);
+				    0, false, MIGRATE_SYNC,
+				    MR_CMA);
 	}
 
-	putback_lru_pages(&cc->migratepages);
+	putback_movable_pages(&cc->migratepages);
 	return ret > 0 ? 0 : ret;
 }
 
-/*
- * Update zone's cma pages counter used for watermark level calculation.
- */
-static inline void __update_cma_watermarks(struct zone *zone, int count)
-{
-	unsigned long flags;
-	spin_lock_irqsave(&zone->lock, flags);
-	zone->min_cma_pages += count;
-	spin_unlock_irqrestore(&zone->lock, flags);
-	setup_per_zone_wmarks();
-}
-
-/*
- * Trigger memory pressure bump to reclaim some pages in order to be able to
- * allocate 'count' pages in single page units. Does similar work as
- *__alloc_pages_slowpath() function.
- */
-static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
-{
-	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	struct zonelist *zonelist = node_zonelist(0, gfp_mask);
-	int did_some_progress = 0;
-	int order = 1;
-
-	/*
-	 * Increase level of watermarks to force kswapd do his job
-	 * to stabilise at new watermark level.
-	 */
-	__update_cma_watermarks(zone, count);
-
-	/* Obey watermarks as if the page was being allocated */
-	while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) {
-		wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone));
-
-		did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
-						      NULL);
-		if (!did_some_progress) {
-			/* Exhausted what can be done so it's blamo time */
-			out_of_memory(zonelist, gfp_mask, order, NULL, false);
-		}
-	}
-
-	/* Restore original watermark levels. */
-	__update_cma_watermarks(zone, -count);
-
-	return count;
-}
-
 /**
  * alloc_contig_range() -- tries to allocate given range of pages
  * @start:	start PFN to allocate
@@ -5786,7 +5838,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count)
 int alloc_contig_range(unsigned long start, unsigned long end,
 		       unsigned migratetype)
 {
-	struct zone *zone = page_zone(pfn_to_page(start));
 	unsigned long outer_start, outer_end;
 	int ret = 0, order;
 
@@ -5824,7 +5875,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	 */
 
 	ret = start_isolate_page_range(pfn_max_align_down(start),
-				       pfn_max_align_up(end), migratetype);
+				       pfn_max_align_up(end), migratetype,
+				       false);
 	if (ret)
 		return ret;
 
@@ -5863,18 +5915,13 @@ int alloc_contig_range(unsigned long start, unsigned long end,
 	}
 
 	/* Make sure the range is really isolated. */
-	if (test_pages_isolated(outer_start, end)) {
+	if (test_pages_isolated(outer_start, end, false)) {
 		pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n",
 		       outer_start, end);
 		ret = -EBUSY;
 		goto done;
 	}
 
-	/*
-	 * Reclaim enough pages to make sure that contiguous allocation
-	 * will not starve the system.
-	 */
-	__reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start);
 
 	/* Grab isolated pages from freelists. */
 	outer_end = isolate_freepages_range(&cc, outer_start, end);
@@ -5932,7 +5979,6 @@ void __meminit zone_pcp_update(struct zone *zone)
 }
 #endif
 
-#ifdef CONFIG_MEMORY_HOTREMOVE
 void zone_pcp_reset(struct zone *zone)
 {
 	unsigned long flags;
@@ -5952,6 +5998,7 @@ void zone_pcp_reset(struct zone *zone)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
 /*
  * All pages in the range must be isolated before calling this.
 */
@@ -5978,6 +6025,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 			continue;
 		}
 		page = pfn_to_page(pfn);
+		/*
+		 * The HWPoisoned page may be not in buddy system, and
+		 * page_count() is not 0.
+		 */
+		if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
+			pfn++;
+			SetPageReserved(page);
+			continue;
+		}
+
 		BUG_ON(page_count(page));
 		BUG_ON(!PageBuddy(page));
 		order = page_order(page);
@@ -5988,8 +6045,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 		list_del(&page->lru);
 		rmv_page_order(page);
 		zone->free_area[order].nr_free--;
-		__mod_zone_page_state(zone, NR_FREE_PAGES,
-				      - (1UL << order));
 		for (i = 0; i < (1 << order); i++)
 			SetPageReserved((page+i));
 		pfn += (1 << order);