Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r-- | mm/page_alloc.c | 238 |
1 file changed, 143 insertions, 95 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eee3efa58c91..df2022ff0c8a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -90,6 +90,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { | |||
90 | #ifdef CONFIG_HIGHMEM | 90 | #ifdef CONFIG_HIGHMEM |
91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, | 91 | [N_HIGH_MEMORY] = { { [0] = 1UL } }, |
92 | #endif | 92 | #endif |
93 | #ifdef CONFIG_MOVABLE_NODE | ||
94 | [N_MEMORY] = { { [0] = 1UL } }, | ||
95 | #endif | ||
93 | [N_CPU] = { { [0] = 1UL } }, | 96 | [N_CPU] = { { [0] = 1UL } }, |
94 | #endif /* NUMA */ | 97 | #endif /* NUMA */ |
95 | }; | 98 | }; |
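For orientation, the node-mask hierarchy this hunk extends, summarised from the hunks in this diff (a sketch, not verbatim kernel code). When CONFIG_MOVABLE_NODE is not set, N_MEMORY simply aliases N_HIGH_MEMORY, which is why the initializer is conditional:

	/*
	 * Sketch of the node memory states after this change:
	 * every N_NORMAL_MEMORY node is also N_HIGH_MEMORY, and every
	 * N_HIGH_MEMORY node is also N_MEMORY.
	 *
	 * N_MEMORY: the node has memory of any kind (regular, high or movable).
	 * With CONFIG_MOVABLE_NODE, a node whose memory sits entirely in
	 * ZONE_MOVABLE is in N_MEMORY but not in N_HIGH_MEMORY or
	 * N_NORMAL_MEMORY, which is why the generic "node has memory" users
	 * below are switched from N_HIGH_MEMORY to N_MEMORY.
	 */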
@@ -218,11 +221,6 @@ EXPORT_SYMBOL(nr_online_nodes); | |||
218 | 221 | ||
219 | int page_group_by_mobility_disabled __read_mostly; | 222 | int page_group_by_mobility_disabled __read_mostly; |
220 | 223 | ||
221 | /* | ||
222 | * NOTE: | ||
223 | * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. | ||
224 | * Instead, use {un}set_pageblock_isolate. | ||
225 | */ | ||
226 | void set_pageblock_migratetype(struct page *page, int migratetype) | 224 | void set_pageblock_migratetype(struct page *page, int migratetype) |
227 | { | 225 | { |
228 | 226 | ||
@@ -368,8 +366,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
368 | int nr_pages = 1 << order; | 366 | int nr_pages = 1 << order; |
369 | int bad = 0; | 367 | int bad = 0; |
370 | 368 | ||
371 | if (unlikely(compound_order(page) != order) || | 369 | if (unlikely(compound_order(page) != order)) { |
372 | unlikely(!PageHead(page))) { | ||
373 | bad_page(page); | 370 | bad_page(page); |
374 | bad++; | 371 | bad++; |
375 | } | 372 | } |
@@ -608,6 +605,7 @@ static inline int free_pages_check(struct page *page) | |||
608 | bad_page(page); | 605 | bad_page(page); |
609 | return 1; | 606 | return 1; |
610 | } | 607 | } |
608 | reset_page_last_nid(page); | ||
611 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) | 609 | if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) |
612 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; | 610 | page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
613 | return 0; | 611 | return 0; |
@@ -732,6 +730,13 @@ static void __free_pages_ok(struct page *page, unsigned int order) | |||
732 | local_irq_restore(flags); | 730 | local_irq_restore(flags); |
733 | } | 731 | } |
734 | 732 | ||
733 | /* | ||
734 | * Read access to zone->managed_pages is safe because it's unsigned long, | ||
735 | * but we still need to serialize writers. Currently all callers of | ||
736 | * __free_pages_bootmem() except put_page_bootmem() run only at boot | ||
737 | * time, so to keep boot time short we shift the burden of serializing | ||
738 | * writers to put_page_bootmem(). | ||
739 | */ | ||
735 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | 740 | void __meminit __free_pages_bootmem(struct page *page, unsigned int order) |
736 | { | 741 | { |
737 | unsigned int nr_pages = 1 << order; | 742 | unsigned int nr_pages = 1 << order; |
@@ -747,6 +752,7 @@ void __meminit __free_pages_bootmem(struct page *page, unsigned int order) | |||
747 | set_page_count(p, 0); | 752 | set_page_count(p, 0); |
748 | } | 753 | } |
749 | 754 | ||
755 | page_zone(page)->managed_pages += 1 << order; | ||
750 | set_page_refcounted(page); | 756 | set_page_refcounted(page); |
751 | __free_pages(page, order); | 757 | __free_pages(page, order); |
752 | } | 758 | } |
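To illustrate the serialization rule in the comment above, a simplified sketch of the intended write discipline (an assumption about intent; the exact lock put_page_bootmem() would use is outside this hunk):

	/*
	 * Sketch, not from this patch:
	 *
	 *   boot time (single threaded)        after boot (e.g. memory hotplug)
	 *   ---------------------------        --------------------------------
	 *   __free_pages_bootmem()             put_page_bootmem()
	 *     zone->managed_pages += ...;        must serialize its own
	 *     no lock: nothing else is           managed_pages update against
	 *     touching the zone yet              any other writer
	 */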
@@ -1378,14 +1384,8 @@ void split_page(struct page *page, unsigned int order) | |||
1378 | set_page_refcounted(page + i); | 1384 | set_page_refcounted(page + i); |
1379 | } | 1385 | } |
1380 | 1386 | ||
1381 | /* | 1387 | static int __isolate_free_page(struct page *page, unsigned int order) |
1382 | * Similar to the split_page family of functions except that the page | ||
1383 | * required at the given order and being isolated now to prevent races | ||
1384 | * with parallel allocators | ||
1385 | */ | ||
1386 | int capture_free_page(struct page *page, int alloc_order, int migratetype) | ||
1387 | { | 1388 | { |
1388 | unsigned int order; | ||
1389 | unsigned long watermark; | 1389 | unsigned long watermark; |
1390 | struct zone *zone; | 1390 | struct zone *zone; |
1391 | int mt; | 1391 | int mt; |
@@ -1393,7 +1393,6 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) | |||
1393 | BUG_ON(!PageBuddy(page)); | 1393 | BUG_ON(!PageBuddy(page)); |
1394 | 1394 | ||
1395 | zone = page_zone(page); | 1395 | zone = page_zone(page); |
1396 | order = page_order(page); | ||
1397 | mt = get_pageblock_migratetype(page); | 1396 | mt = get_pageblock_migratetype(page); |
1398 | 1397 | ||
1399 | if (mt != MIGRATE_ISOLATE) { | 1398 | if (mt != MIGRATE_ISOLATE) { |
@@ -1402,7 +1401,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) | |||
1402 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) | 1401 | if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) |
1403 | return 0; | 1402 | return 0; |
1404 | 1403 | ||
1405 | __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); | 1404 | __mod_zone_freepage_state(zone, -(1UL << order), mt); |
1406 | } | 1405 | } |
1407 | 1406 | ||
1408 | /* Remove page from free list */ | 1407 | /* Remove page from free list */ |
@@ -1410,11 +1409,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) | |||
1410 | zone->free_area[order].nr_free--; | 1409 | zone->free_area[order].nr_free--; |
1411 | rmv_page_order(page); | 1410 | rmv_page_order(page); |
1412 | 1411 | ||
1413 | if (alloc_order != order) | 1412 | /* Set the pageblock if the isolated page is at least a pageblock */ |
1414 | expand(zone, page, alloc_order, order, | ||
1415 | &zone->free_area[order], migratetype); | ||
1416 | |||
1417 | /* Set the pageblock if the captured page is at least a pageblock */ | ||
1418 | if (order >= pageblock_order - 1) { | 1413 | if (order >= pageblock_order - 1) { |
1419 | struct page *endpage = page + (1 << order) - 1; | 1414 | struct page *endpage = page + (1 << order) - 1; |
1420 | for (; page < endpage; page += pageblock_nr_pages) { | 1415 | for (; page < endpage; page += pageblock_nr_pages) { |
@@ -1425,7 +1420,7 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) | |||
1425 | } | 1420 | } |
1426 | } | 1421 | } |
1427 | 1422 | ||
1428 | return 1UL << alloc_order; | 1423 | return 1UL << order; |
1429 | } | 1424 | } |
1430 | 1425 | ||
1431 | /* | 1426 | /* |
@@ -1443,10 +1438,9 @@ int split_free_page(struct page *page) | |||
1443 | unsigned int order; | 1438 | unsigned int order; |
1444 | int nr_pages; | 1439 | int nr_pages; |
1445 | 1440 | ||
1446 | BUG_ON(!PageBuddy(page)); | ||
1447 | order = page_order(page); | 1441 | order = page_order(page); |
1448 | 1442 | ||
1449 | nr_pages = capture_free_page(page, order, 0); | 1443 | nr_pages = __isolate_free_page(page, order); |
1450 | if (!nr_pages) | 1444 | if (!nr_pages) |
1451 | return 0; | 1445 | return 0; |
1452 | 1446 | ||
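A caller-side sketch (assumption, simplified) of what folding capture_free_page() into __isolate_free_page() means for users of split_free_page():

	/* Hypothetical caller, for illustration only. */
	if (!split_free_page(page)) {
		/*
		 * Zero now also covers the case where __isolate_free_page()
		 * refused because taking the page would breach the zone's low
		 * watermark; back off and leave the page on the free list.
		 */
		break;
	}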
@@ -1644,20 +1638,6 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, | |||
1644 | return true; | 1638 | return true; |
1645 | } | 1639 | } |
1646 | 1640 | ||
1647 | #ifdef CONFIG_MEMORY_ISOLATION | ||
1648 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1649 | { | ||
1650 | if (unlikely(zone->nr_pageblock_isolate)) | ||
1651 | return zone->nr_pageblock_isolate * pageblock_nr_pages; | ||
1652 | return 0; | ||
1653 | } | ||
1654 | #else | ||
1655 | static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) | ||
1656 | { | ||
1657 | return 0; | ||
1658 | } | ||
1659 | #endif | ||
1660 | |||
1661 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, | 1641 | bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, |
1662 | int classzone_idx, int alloc_flags) | 1642 | int classzone_idx, int alloc_flags) |
1663 | { | 1643 | { |
@@ -1673,14 +1653,6 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1673 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) | 1653 | if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) |
1674 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); | 1654 | free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); |
1675 | 1655 | ||
1676 | /* | ||
1677 | * If the zone has MIGRATE_ISOLATE type free pages, we should consider | ||
1678 | * it. nr_zone_isolate_freepages is never accurate so kswapd might not | ||
1679 | * sleep although it could do so. But this is more desirable for memory | ||
1680 | * hotplug than sleeping which can cause a livelock in the direct | ||
1681 | * reclaim path. | ||
1682 | */ | ||
1683 | free_pages -= nr_zone_isolate_freepages(z); | ||
1684 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, | 1656 | return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, |
1685 | free_pages); | 1657 | free_pages); |
1686 | } | 1658 | } |
@@ -1695,7 +1667,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, | |||
1695 | * | 1667 | * |
1696 | * If the zonelist cache is present in the passed in zonelist, then | 1668 | * If the zonelist cache is present in the passed in zonelist, then |
1697 | * returns a pointer to the allowed node mask (either the current | 1669 | * returns a pointer to the allowed node mask (either the current |
1698 | * tasks mems_allowed, or node_states[N_HIGH_MEMORY].) | 1670 | * tasks mems_allowed, or node_states[N_MEMORY].) |
1699 | * | 1671 | * |
1700 | * If the zonelist cache is not available for this zonelist, does | 1672 | * If the zonelist cache is not available for this zonelist, does |
1701 | * nothing and returns NULL. | 1673 | * nothing and returns NULL. |
@@ -1724,7 +1696,7 @@ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) | |||
1724 | 1696 | ||
1725 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? | 1697 | allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ? |
1726 | &cpuset_current_mems_allowed : | 1698 | &cpuset_current_mems_allowed : |
1727 | &node_states[N_HIGH_MEMORY]; | 1699 | &node_states[N_MEMORY]; |
1728 | return allowednodes; | 1700 | return allowednodes; |
1729 | } | 1701 | } |
1730 | 1702 | ||
@@ -2152,8 +2124,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2152 | bool *contended_compaction, bool *deferred_compaction, | 2124 | bool *contended_compaction, bool *deferred_compaction, |
2153 | unsigned long *did_some_progress) | 2125 | unsigned long *did_some_progress) |
2154 | { | 2126 | { |
2155 | struct page *page = NULL; | ||
2156 | |||
2157 | if (!order) | 2127 | if (!order) |
2158 | return NULL; | 2128 | return NULL; |
2159 | 2129 | ||
@@ -2165,16 +2135,12 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2165 | current->flags |= PF_MEMALLOC; | 2135 | current->flags |= PF_MEMALLOC; |
2166 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, | 2136 | *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, |
2167 | nodemask, sync_migration, | 2137 | nodemask, sync_migration, |
2168 | contended_compaction, &page); | 2138 | contended_compaction); |
2169 | current->flags &= ~PF_MEMALLOC; | 2139 | current->flags &= ~PF_MEMALLOC; |
2170 | 2140 | ||
2171 | /* If compaction captured a page, prep and use it */ | ||
2172 | if (page) { | ||
2173 | prep_new_page(page, order, gfp_mask); | ||
2174 | goto got_page; | ||
2175 | } | ||
2176 | |||
2177 | if (*did_some_progress != COMPACT_SKIPPED) { | 2141 | if (*did_some_progress != COMPACT_SKIPPED) { |
2142 | struct page *page; | ||
2143 | |||
2178 | /* Page migration frees to the PCP lists but we want merging */ | 2144 | /* Page migration frees to the PCP lists but we want merging */ |
2179 | drain_pages(get_cpu()); | 2145 | drain_pages(get_cpu()); |
2180 | put_cpu(); | 2146 | put_cpu(); |
@@ -2184,7 +2150,6 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, | |||
2184 | alloc_flags & ~ALLOC_NO_WATERMARKS, | 2150 | alloc_flags & ~ALLOC_NO_WATERMARKS, |
2185 | preferred_zone, migratetype); | 2151 | preferred_zone, migratetype); |
2186 | if (page) { | 2152 | if (page) { |
2187 | got_page: | ||
2188 | preferred_zone->compact_blockskip_flush = false; | 2153 | preferred_zone->compact_blockskip_flush = false; |
2189 | preferred_zone->compact_considered = 0; | 2154 | preferred_zone->compact_considered = 0; |
2190 | preferred_zone->compact_defer_shift = 0; | 2155 | preferred_zone->compact_defer_shift = 0; |
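A condensed view of the control flow that remains in __alloc_pages_direct_compact() once the capture path is gone (summarised from the hunks above, not a verbatim excerpt):

	*did_some_progress = try_to_compact_pages(...);  /* no &page argument */
	if (*did_some_progress != COMPACT_SKIPPED) {
		drain_pages(get_cpu());     /* merge PCP pages back into buddy */
		put_cpu();
		page = get_page_from_freelist(...);
		if (page) { /* reset compaction deferral, return the page */ }
	}

In other words, an ordinary allocation retry replaces the dedicated handoff of a captured page.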
@@ -2601,6 +2566,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2601 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2566 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2602 | unsigned int cpuset_mems_cookie; | 2567 | unsigned int cpuset_mems_cookie; |
2603 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; | 2568 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; |
2569 | struct mem_cgroup *memcg = NULL; | ||
2604 | 2570 | ||
2605 | gfp_mask &= gfp_allowed_mask; | 2571 | gfp_mask &= gfp_allowed_mask; |
2606 | 2572 | ||
@@ -2619,6 +2585,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2619 | if (unlikely(!zonelist->_zonerefs->zone)) | 2585 | if (unlikely(!zonelist->_zonerefs->zone)) |
2620 | return NULL; | 2586 | return NULL; |
2621 | 2587 | ||
2588 | /* | ||
2589 | * Will only have any effect when __GFP_KMEMCG is set. This is | ||
2590 | * verified in the (always inline) callee | ||
2591 | */ | ||
2592 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2593 | return NULL; | ||
2594 | |||
2622 | retry_cpuset: | 2595 | retry_cpuset: |
2623 | cpuset_mems_cookie = get_mems_allowed(); | 2596 | cpuset_mems_cookie = get_mems_allowed(); |
2624 | 2597 | ||
@@ -2654,6 +2627,8 @@ out: | |||
2654 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2627 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
2655 | goto retry_cpuset; | 2628 | goto retry_cpuset; |
2656 | 2629 | ||
2630 | memcg_kmem_commit_charge(page, memcg, order); | ||
2631 | |||
2657 | return page; | 2632 | return page; |
2658 | } | 2633 | } |
2659 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2634 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
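The charge is deliberately split into a pre-allocation and a post-allocation half; condensed from the hunks above (simplified):

	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
		return NULL;    /* memcg over its kmem limit: fail before
	                           entering the allocator at all */
	page = ...;             /* normal allocation path */
	memcg_kmem_commit_charge(page, memcg, order);
	                        /* bind the charge to the page; with a NULL
	                           page this is expected to undo the charge */

Both helpers are meant to be no-ops unless __GFP_KMEMCG is set, as the added comment notes.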
@@ -2706,6 +2681,31 @@ void free_pages(unsigned long addr, unsigned int order) | |||
2706 | 2681 | ||
2707 | EXPORT_SYMBOL(free_pages); | 2682 | EXPORT_SYMBOL(free_pages); |
2708 | 2683 | ||
2684 | /* | ||
2685 | * __free_memcg_kmem_pages and free_memcg_kmem_pages will free | ||
2686 | * pages allocated with __GFP_KMEMCG. | ||
2687 | * | ||
2688 | * Those pages are accounted to a particular memcg, embedded in the | ||
2689 | * corresponding page_cgroup. To avoid adding a hit in the allocator to search | ||
2690 | * for that information only to find out that it is NULL for users who have no | ||
2691 | * interest in that whatsoever, we provide these functions. | ||
2692 | * | ||
2693 | * The caller knows better which flags it relies on. | ||
2694 | */ | ||
2695 | void __free_memcg_kmem_pages(struct page *page, unsigned int order) | ||
2696 | { | ||
2697 | memcg_kmem_uncharge_pages(page, order); | ||
2698 | __free_pages(page, order); | ||
2699 | } | ||
2700 | |||
2701 | void free_memcg_kmem_pages(unsigned long addr, unsigned int order) | ||
2702 | { | ||
2703 | if (addr != 0) { | ||
2704 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | ||
2705 | __free_memcg_kmem_pages(virt_to_page((void *)addr), order); | ||
2706 | } | ||
2707 | } | ||
2708 | |||
2709 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) | 2709 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) |
2710 | { | 2710 | { |
2711 | if (addr) { | 2711 | if (addr) { |
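Putting the two halves together, a hedged usage sketch for a hypothetical caller (illustrative only; only the exported functions above come from this file):

	struct page *page;

	page = alloc_pages(GFP_KERNEL | __GFP_KMEMCG, order);
	if (!page)
		return -ENOMEM;
	/* ... the pages are charged to the allocating task's memcg ... */
	__free_memcg_kmem_pages(page, order);  /* uncharge, then __free_pages() */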
@@ -2981,6 +2981,7 @@ void show_free_areas(unsigned int filter) | |||
2981 | " isolated(anon):%lukB" | 2981 | " isolated(anon):%lukB" |
2982 | " isolated(file):%lukB" | 2982 | " isolated(file):%lukB" |
2983 | " present:%lukB" | 2983 | " present:%lukB" |
2984 | " managed:%lukB" | ||
2984 | " mlocked:%lukB" | 2985 | " mlocked:%lukB" |
2985 | " dirty:%lukB" | 2986 | " dirty:%lukB" |
2986 | " writeback:%lukB" | 2987 | " writeback:%lukB" |
@@ -3010,6 +3011,7 @@ void show_free_areas(unsigned int filter) | |||
3010 | K(zone_page_state(zone, NR_ISOLATED_ANON)), | 3011 | K(zone_page_state(zone, NR_ISOLATED_ANON)), |
3011 | K(zone_page_state(zone, NR_ISOLATED_FILE)), | 3012 | K(zone_page_state(zone, NR_ISOLATED_FILE)), |
3012 | K(zone->present_pages), | 3013 | K(zone->present_pages), |
3014 | K(zone->managed_pages), | ||
3013 | K(zone_page_state(zone, NR_MLOCK)), | 3015 | K(zone_page_state(zone, NR_MLOCK)), |
3014 | K(zone_page_state(zone, NR_FILE_DIRTY)), | 3016 | K(zone_page_state(zone, NR_FILE_DIRTY)), |
3015 | K(zone_page_state(zone, NR_WRITEBACK)), | 3017 | K(zone_page_state(zone, NR_WRITEBACK)), |
@@ -3238,7 +3240,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) | |||
3238 | return node; | 3240 | return node; |
3239 | } | 3241 | } |
3240 | 3242 | ||
3241 | for_each_node_state(n, N_HIGH_MEMORY) { | 3243 | for_each_node_state(n, N_MEMORY) { |
3242 | 3244 | ||
3243 | /* Don't want a node to appear more than once */ | 3245 | /* Don't want a node to appear more than once */ |
3244 | if (node_isset(n, *used_node_mask)) | 3246 | if (node_isset(n, *used_node_mask)) |
@@ -3380,7 +3382,7 @@ static int default_zonelist_order(void) | |||
3380 | * local memory, NODE_ORDER may be suitable. | 3382 | * local memory, NODE_ORDER may be suitable. |
3381 | */ | 3383 | */ |
3382 | average_size = total_size / | 3384 | average_size = total_size / |
3383 | (nodes_weight(node_states[N_HIGH_MEMORY]) + 1); | 3385 | (nodes_weight(node_states[N_MEMORY]) + 1); |
3384 | for_each_online_node(nid) { | 3386 | for_each_online_node(nid) { |
3385 | low_kmem_size = 0; | 3387 | low_kmem_size = 0; |
3386 | total_size = 0; | 3388 | total_size = 0; |
@@ -3870,6 +3872,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, | |||
3870 | mminit_verify_page_links(page, zone, nid, pfn); | 3872 | mminit_verify_page_links(page, zone, nid, pfn); |
3871 | init_page_count(page); | 3873 | init_page_count(page); |
3872 | reset_page_mapcount(page); | 3874 | reset_page_mapcount(page); |
3875 | reset_page_last_nid(page); | ||
3873 | SetPageReserved(page); | 3876 | SetPageReserved(page); |
3874 | /* | 3877 | /* |
3875 | * Mark the block movable so that blocks are reserved for | 3878 | * Mark the block movable so that blocks are reserved for |
@@ -4476,6 +4479,26 @@ void __init set_pageblock_order(void) | |||
4476 | 4479 | ||
4477 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ | 4480 | #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ |
4478 | 4481 | ||
4482 | static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, | ||
4483 | unsigned long present_pages) | ||
4484 | { | ||
4485 | unsigned long pages = spanned_pages; | ||
4486 | |||
4487 | /* | ||
4488 | * Provide a more accurate estimation if there are holes within | ||
4489 | * the zone and SPARSEMEM is in use. If there are holes within the | ||
4490 | * zone, each populated memory region may cost us one or two extra | ||
4491 | * memmap pages due to alignment because memmap pages for each | ||
4492 | * populated region may not be naturally aligned on a page boundary. | ||
4493 | * So the (present_pages >> 4) heuristic is a tradeoff for that. | ||
4494 | */ | ||
4495 | if (spanned_pages > present_pages + (present_pages >> 4) && | ||
4496 | IS_ENABLED(CONFIG_SPARSEMEM)) | ||
4497 | pages = present_pages; | ||
4498 | |||
4499 | return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; | ||
4500 | } | ||
4501 | |||
4479 | /* | 4502 | /* |
4480 | * Set up the zone data structures: | 4503 | * Set up the zone data structures: |
4481 | * - mark all pages reserved | 4504 | * - mark all pages reserved |
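A worked example of the heuristic above, with illustrative numbers that are not taken from the patch (assume 4 KiB pages, a 64-byte struct page and CONFIG_SPARSEMEM=y):

	spanned_pages = 1048576, present_pages = 524288
	spanned_pages > present_pages + (present_pages >> 4) = 557056
	    -> the zone is mostly holes, so estimate from present_pages
	memmap pages = PAGE_ALIGN(524288 * 64) >> PAGE_SHIFT = 8192

Estimating from spanned_pages would instead have charged 16384 memmap pages against the zone and left freesize (and hence the watermarks) too small.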
@@ -4493,54 +4516,67 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, | |||
4493 | int ret; | 4516 | int ret; |
4494 | 4517 | ||
4495 | pgdat_resize_init(pgdat); | 4518 | pgdat_resize_init(pgdat); |
4519 | #ifdef CONFIG_NUMA_BALANCING | ||
4520 | spin_lock_init(&pgdat->numabalancing_migrate_lock); | ||
4521 | pgdat->numabalancing_migrate_nr_pages = 0; | ||
4522 | pgdat->numabalancing_migrate_next_window = jiffies; | ||
4523 | #endif | ||
4496 | init_waitqueue_head(&pgdat->kswapd_wait); | 4524 | init_waitqueue_head(&pgdat->kswapd_wait); |
4497 | init_waitqueue_head(&pgdat->pfmemalloc_wait); | 4525 | init_waitqueue_head(&pgdat->pfmemalloc_wait); |
4498 | pgdat_page_cgroup_init(pgdat); | 4526 | pgdat_page_cgroup_init(pgdat); |
4499 | 4527 | ||
4500 | for (j = 0; j < MAX_NR_ZONES; j++) { | 4528 | for (j = 0; j < MAX_NR_ZONES; j++) { |
4501 | struct zone *zone = pgdat->node_zones + j; | 4529 | struct zone *zone = pgdat->node_zones + j; |
4502 | unsigned long size, realsize, memmap_pages; | 4530 | unsigned long size, realsize, freesize, memmap_pages; |
4503 | 4531 | ||
4504 | size = zone_spanned_pages_in_node(nid, j, zones_size); | 4532 | size = zone_spanned_pages_in_node(nid, j, zones_size); |
4505 | realsize = size - zone_absent_pages_in_node(nid, j, | 4533 | realsize = freesize = size - zone_absent_pages_in_node(nid, j, |
4506 | zholes_size); | 4534 | zholes_size); |
4507 | 4535 | ||
4508 | /* | 4536 | /* |
4509 | * Adjust realsize so that it accounts for how much memory | 4537 | * Adjust freesize so that it accounts for how much memory |
4510 | * is used by this zone for memmap. This affects the watermark | 4538 | * is used by this zone for memmap. This affects the watermark |
4511 | * and per-cpu initialisations | 4539 | * and per-cpu initialisations |
4512 | */ | 4540 | */ |
4513 | memmap_pages = | 4541 | memmap_pages = calc_memmap_size(size, realsize); |
4514 | PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; | 4542 | if (freesize >= memmap_pages) { |
4515 | if (realsize >= memmap_pages) { | 4543 | freesize -= memmap_pages; |
4516 | realsize -= memmap_pages; | ||
4517 | if (memmap_pages) | 4544 | if (memmap_pages) |
4518 | printk(KERN_DEBUG | 4545 | printk(KERN_DEBUG |
4519 | " %s zone: %lu pages used for memmap\n", | 4546 | " %s zone: %lu pages used for memmap\n", |
4520 | zone_names[j], memmap_pages); | 4547 | zone_names[j], memmap_pages); |
4521 | } else | 4548 | } else |
4522 | printk(KERN_WARNING | 4549 | printk(KERN_WARNING |
4523 | " %s zone: %lu pages exceeds realsize %lu\n", | 4550 | " %s zone: %lu pages exceeds freesize %lu\n", |
4524 | zone_names[j], memmap_pages, realsize); | 4551 | zone_names[j], memmap_pages, freesize); |
4525 | 4552 | ||
4526 | /* Account for reserved pages */ | 4553 | /* Account for reserved pages */ |
4527 | if (j == 0 && realsize > dma_reserve) { | 4554 | if (j == 0 && freesize > dma_reserve) { |
4528 | realsize -= dma_reserve; | 4555 | freesize -= dma_reserve; |
4529 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", | 4556 | printk(KERN_DEBUG " %s zone: %lu pages reserved\n", |
4530 | zone_names[0], dma_reserve); | 4557 | zone_names[0], dma_reserve); |
4531 | } | 4558 | } |
4532 | 4559 | ||
4533 | if (!is_highmem_idx(j)) | 4560 | if (!is_highmem_idx(j)) |
4534 | nr_kernel_pages += realsize; | 4561 | nr_kernel_pages += freesize; |
4535 | nr_all_pages += realsize; | 4562 | /* Charge for highmem memmap if there are enough kernel pages */ |
4563 | else if (nr_kernel_pages > memmap_pages * 2) | ||
4564 | nr_kernel_pages -= memmap_pages; | ||
4565 | nr_all_pages += freesize; | ||
4536 | 4566 | ||
4537 | zone->spanned_pages = size; | 4567 | zone->spanned_pages = size; |
4538 | zone->present_pages = realsize; | 4568 | zone->present_pages = freesize; |
4569 | /* | ||
4570 | * Set an approximate value for lowmem here, it will be adjusted | ||
4571 | * when the bootmem allocator frees pages into the buddy system. | ||
4572 | * And all highmem pages will be managed by the buddy system. | ||
4573 | */ | ||
4574 | zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; | ||
4539 | #ifdef CONFIG_NUMA | 4575 | #ifdef CONFIG_NUMA |
4540 | zone->node = nid; | 4576 | zone->node = nid; |
4541 | zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) | 4577 | zone->min_unmapped_pages = (freesize*sysctl_min_unmapped_ratio) |
4542 | / 100; | 4578 | / 100; |
4543 | zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100; | 4579 | zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100; |
4544 | #endif | 4580 | #endif |
4545 | zone->name = zone_names[j]; | 4581 | zone->name = zone_names[j]; |
4546 | spin_lock_init(&zone->lock); | 4582 | spin_lock_init(&zone->lock); |
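The three per-zone size fields touched in this hunk relate roughly as follows (a summary sketch, not verbatim kernel code):

	spanned_pages = pages the zone covers, holes included
	present_pages = spanned_pages - holes
	managed_pages = present_pages - pages kept back by the bootmem/memblock
	                allocator (memmap, dma_reserve, ...)

As the added comment says, the lowmem managed_pages value set here is only an approximation; it is adjusted as the bootmem allocator releases pages into the buddy system (see the __free_pages_bootmem() hunk earlier).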
@@ -4731,7 +4767,7 @@ unsigned long __init find_min_pfn_with_active_regions(void) | |||
4731 | /* | 4767 | /* |
4732 | * early_calculate_totalpages() | 4768 | * early_calculate_totalpages() |
4733 | * Sum pages in active regions for movable zone. | 4769 | * Sum pages in active regions for movable zone. |
4734 | * Populate N_HIGH_MEMORY for calculating usable_nodes. | 4770 | * Populate N_MEMORY for calculating usable_nodes. |
4735 | */ | 4771 | */ |
4736 | static unsigned long __init early_calculate_totalpages(void) | 4772 | static unsigned long __init early_calculate_totalpages(void) |
4737 | { | 4773 | { |
@@ -4744,7 +4780,7 @@ static unsigned long __init early_calculate_totalpages(void) | |||
4744 | 4780 | ||
4745 | totalpages += pages; | 4781 | totalpages += pages; |
4746 | if (pages) | 4782 | if (pages) |
4747 | node_set_state(nid, N_HIGH_MEMORY); | 4783 | node_set_state(nid, N_MEMORY); |
4748 | } | 4784 | } |
4749 | return totalpages; | 4785 | return totalpages; |
4750 | } | 4786 | } |
@@ -4761,9 +4797,9 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
4761 | unsigned long usable_startpfn; | 4797 | unsigned long usable_startpfn; |
4762 | unsigned long kernelcore_node, kernelcore_remaining; | 4798 | unsigned long kernelcore_node, kernelcore_remaining; |
4763 | /* save the state before borrow the nodemask */ | 4799 | /* save the state before borrow the nodemask */ |
4764 | nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; | 4800 | nodemask_t saved_node_state = node_states[N_MEMORY]; |
4765 | unsigned long totalpages = early_calculate_totalpages(); | 4801 | unsigned long totalpages = early_calculate_totalpages(); |
4766 | int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); | 4802 | int usable_nodes = nodes_weight(node_states[N_MEMORY]); |
4767 | 4803 | ||
4768 | /* | 4804 | /* |
4769 | * If movablecore was specified, calculate what size of | 4805 | * If movablecore was specified, calculate what size of |
@@ -4798,7 +4834,7 @@ static void __init find_zone_movable_pfns_for_nodes(void) | |||
4798 | restart: | 4834 | restart: |
4799 | /* Spread kernelcore memory as evenly as possible throughout nodes */ | 4835 | /* Spread kernelcore memory as evenly as possible throughout nodes */ |
4800 | kernelcore_node = required_kernelcore / usable_nodes; | 4836 | kernelcore_node = required_kernelcore / usable_nodes; |
4801 | for_each_node_state(nid, N_HIGH_MEMORY) { | 4837 | for_each_node_state(nid, N_MEMORY) { |
4802 | unsigned long start_pfn, end_pfn; | 4838 | unsigned long start_pfn, end_pfn; |
4803 | 4839 | ||
4804 | /* | 4840 | /* |
@@ -4890,23 +4926,27 @@ restart: | |||
4890 | 4926 | ||
4891 | out: | 4927 | out: |
4892 | /* restore the node_state */ | 4928 | /* restore the node_state */ |
4893 | node_states[N_HIGH_MEMORY] = saved_node_state; | 4929 | node_states[N_MEMORY] = saved_node_state; |
4894 | } | 4930 | } |
4895 | 4931 | ||
4896 | /* Any regular memory on that node ? */ | 4932 | /* Any regular or high memory on that node ? */ |
4897 | static void __init check_for_regular_memory(pg_data_t *pgdat) | 4933 | static void check_for_memory(pg_data_t *pgdat, int nid) |
4898 | { | 4934 | { |
4899 | #ifdef CONFIG_HIGHMEM | ||
4900 | enum zone_type zone_type; | 4935 | enum zone_type zone_type; |
4901 | 4936 | ||
4902 | for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) { | 4937 | if (N_MEMORY == N_NORMAL_MEMORY) |
4938 | return; | ||
4939 | |||
4940 | for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { | ||
4903 | struct zone *zone = &pgdat->node_zones[zone_type]; | 4941 | struct zone *zone = &pgdat->node_zones[zone_type]; |
4904 | if (zone->present_pages) { | 4942 | if (zone->present_pages) { |
4905 | node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 4943 | node_set_state(nid, N_HIGH_MEMORY); |
4944 | if (N_NORMAL_MEMORY != N_HIGH_MEMORY && | ||
4945 | zone_type <= ZONE_NORMAL) | ||
4946 | node_set_state(nid, N_NORMAL_MEMORY); | ||
4906 | break; | 4947 | break; |
4907 | } | 4948 | } |
4908 | } | 4949 | } |
4909 | #endif | ||
4910 | } | 4950 | } |
4911 | 4951 | ||
4912 | /** | 4952 | /** |
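By example, the node state bits the rewritten check_for_memory() leaves behind (illustrative, derived from the code above):

	node with ZONE_NORMAL (or lower) pages -> N_NORMAL_MEMORY + N_HIGH_MEMORY
	node with only ZONE_HIGHMEM pages      -> N_HIGH_MEMORY
	node with only ZONE_MOVABLE pages      -> neither; the caller still
	                                          marks it N_MEMORY

When N_MEMORY == N_NORMAL_MEMORY (no HIGHMEM, no MOVABLE_NODE) the states collapse into one and the function returns early.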
@@ -4989,8 +5029,8 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) | |||
4989 | 5029 | ||
4990 | /* Any memory on that node */ | 5030 | /* Any memory on that node */ |
4991 | if (pgdat->node_present_pages) | 5031 | if (pgdat->node_present_pages) |
4992 | node_set_state(nid, N_HIGH_MEMORY); | 5032 | node_set_state(nid, N_MEMORY); |
4993 | check_for_regular_memory(pgdat); | 5033 | check_for_memory(pgdat, nid); |
4994 | } | 5034 | } |
4995 | } | 5035 | } |
4996 | 5036 | ||
@@ -5545,7 +5585,7 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) | |||
5545 | pfn &= (PAGES_PER_SECTION-1); | 5585 | pfn &= (PAGES_PER_SECTION-1); |
5546 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 5586 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
5547 | #else | 5587 | #else |
5548 | pfn = pfn - zone->zone_start_pfn; | 5588 | pfn = pfn - round_down(zone->zone_start_pfn, pageblock_nr_pages); |
5549 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; | 5589 | return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
5550 | #endif /* CONFIG_SPARSEMEM */ | 5590 | #endif /* CONFIG_SPARSEMEM */ |
5551 | } | 5591 | } |
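A small worked example of why the flat-memory case now rounds the zone start down (numbers are illustrative assumptions): pageblock_order = 9, so pageblock_nr_pages = 512, and zone_start_pfn = 1000, which is not pageblock aligned.

	old: pfn 1024 -> (1024 - 1000) >> 9 = 0    pfn 1535 -> (1535 - 1000) >> 9 = 1
	new: pfn 1024 -> (1024 - 512)  >> 9 = 1    pfn 1535 -> (1535 - 512)  >> 9 = 1

pfns 1024..1535 form a single pageblock, so the old calculation could spread one pageblock's flags across two bitmap slots; rounding the base down keeps each pageblock in exactly one slot.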
@@ -5727,7 +5767,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5727 | unsigned int tries = 0; | 5767 | unsigned int tries = 0; |
5728 | int ret = 0; | 5768 | int ret = 0; |
5729 | 5769 | ||
5730 | migrate_prep_local(); | 5770 | migrate_prep(); |
5731 | 5771 | ||
5732 | while (pfn < end || !list_empty(&cc->migratepages)) { | 5772 | while (pfn < end || !list_empty(&cc->migratepages)) { |
5733 | if (fatal_signal_pending(current)) { | 5773 | if (fatal_signal_pending(current)) { |
@@ -5755,7 +5795,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, | |||
5755 | 5795 | ||
5756 | ret = migrate_pages(&cc->migratepages, | 5796 | ret = migrate_pages(&cc->migratepages, |
5757 | alloc_migrate_target, | 5797 | alloc_migrate_target, |
5758 | 0, false, MIGRATE_SYNC); | 5798 | 0, false, MIGRATE_SYNC, |
5799 | MR_CMA); | ||
5759 | } | 5800 | } |
5760 | 5801 | ||
5761 | putback_movable_pages(&cc->migratepages); | 5802 | putback_movable_pages(&cc->migratepages); |
@@ -5891,8 +5932,15 @@ done: | |||
5891 | 5932 | ||
5892 | void free_contig_range(unsigned long pfn, unsigned nr_pages) | 5933 | void free_contig_range(unsigned long pfn, unsigned nr_pages) |
5893 | { | 5934 | { |
5894 | for (; nr_pages--; ++pfn) | 5935 | unsigned int count = 0; |
5895 | __free_page(pfn_to_page(pfn)); | 5936 | |
5937 | for (; nr_pages--; pfn++) { | ||
5938 | struct page *page = pfn_to_page(pfn); | ||
5939 | |||
5940 | count += page_count(page) != 1; | ||
5941 | __free_page(page); | ||
5942 | } | ||
5943 | WARN(count != 0, "%d pages are still in use!\n", count); | ||
5896 | } | 5944 | } |
5897 | #endif | 5945 | #endif |
5898 | 5946 | ||
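For completeness, a hedged sketch of the alloc/free pairing the new WARN guards (the caller is hypothetical; only free_contig_range() and the WARN come from this hunk):

	ret = alloc_contig_range(start_pfn, end_pfn, MIGRATE_CMA);
	if (ret)
		return ret;
	/* ... use the range; every page must come back with page_count() == 1 ... */
	free_contig_range(start_pfn, end_pfn - start_pfn);
	/* the WARN now reports pages that were still referenced and therefore
	   not actually freed */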