Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--  mm/page_alloc.c | 318
1 file changed, 171 insertions(+), 147 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f9219683f..889532b8e6c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
-#include <linux/memory.h>
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+/*
+ * NOTE:
+ * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
+ * Instead, use {un}set_pageblock_isolate.
+ */
+void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 
 	if (unlikely(page_group_by_mobility_disabled))
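
Note on the new comment: this series begins counting isolated pageblocks per zone (zone->nr_pageblock_isolate, consumed by nr_zone_isolate_freepages() below), and writing MIGRATE_ISOLATE through set_pageblock_migratetype() directly would bypass that bookkeeping. The {un}set_pageblock_isolate helpers are added elsewhere in the series; a minimal sketch of what such a wrapper must do, assuming it is called under zone->lock:

    static void set_pageblock_isolate(struct page *page)
    {
            struct zone *zone = page_zone(page);

            /* assumed: caller holds zone->lock */
            if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) {
                    set_pageblock_migratetype(page, MIGRATE_ISOLATE);
                    zone->nr_pageblock_isolate++;
            }
    }
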
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
 	return pages_moved;
 }
 
-static int move_freepages_block(struct zone *zone, struct page *page,
+int move_freepages_block(struct zone *zone, struct page *page,
 				int migratetype)
 {
 	unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 		to_drain = pcp->batch;
 	else
 		to_drain = pcp->count;
-	free_pcppages_bulk(zone, to_drain, pcp);
-	pcp->count -= to_drain;
+	if (to_drain > 0) {
+		free_pcppages_bulk(zone, to_drain, pcp);
+		pcp->count -= to_drain;
+	}
 	local_irq_restore(flags);
 }
 #endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
 }
 __setup("fail_page_alloc=", setup_fail_page_alloc);
 
-static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
 	if (order < fail_page_alloc.min_order)
-		return 0;
+		return false;
 	if (gfp_mask & __GFP_NOFAIL)
-		return 0;
+		return false;
 	if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
-		return 0;
+		return false;
 	if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
-		return 0;
+		return false;
 
 	return should_fail(&fail_page_alloc.attr, 1 << order);
 }
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
 
 #else /* CONFIG_FAIL_PAGE_ALLOC */
 
-static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
-	return 0;
+	return false;
 }
 
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
+	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
 	int o;
 
 	free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
-	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+	if (free_pages <= min + lowmem_reserve)
 		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 	return true;
 }
 
+#ifdef CONFIG_MEMORY_ISOLATION
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+	if (unlikely(zone->nr_pageblock_isolate))
+		return zone->nr_pageblock_isolate * pageblock_nr_pages;
+	return 0;
+}
+#else
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+	return 0;
+}
+#endif
+
 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 		      int classzone_idx, int alloc_flags)
 {
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
 	if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
 		free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
+	/*
+	 * If the zone has MIGRATE_ISOLATE type free pages, we should consider
+	 * it. nr_zone_isolate_freepages is never accurate so kswapd might not
+	 * sleep although it could do so. But this is more desirable for memory
+	 * hotplug than sleeping which can cause a livelock in the direct
+	 * reclaim path.
+	 */
+	free_pages -= nr_zone_isolate_freepages(z);
 	return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
 								free_pages);
 }
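
Taken together, the hunks above make the safe watermark check treat free pages sitting in isolated pageblocks as unusable: nr_zone_isolate_freepages() is a deliberate over-estimate (it counts whole pageblocks), which errs on the side of keeping kswapd awake during memory hotplug. A self-contained userspace sketch of the resulting arithmetic; the names mirror the kernel's, but this is an illustration, not kernel code:

    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512UL    /* stand-in for pageblock_nr_pages */

    struct zone_demo {
            long free_pages;                     /* NR_FREE_PAGES estimate */
            unsigned long nr_pageblock_isolate;
            long lowmem_reserve;                 /* for the requested classzone */
    };

    /* mirrors nr_zone_isolate_freepages(): isolated blocks count whole */
    static unsigned long nr_isolate_freepages(const struct zone_demo *z)
    {
            return z->nr_pageblock_isolate * PAGEBLOCK_NR_PAGES;
    }

    /* order-0 core of the __zone_watermark_ok() test, post-patch */
    static int watermark_ok(const struct zone_demo *z, long mark)
    {
            long free = z->free_pages - nr_isolate_freepages(z);

            return free > mark + z->lowmem_reserve;
    }

    int main(void)
    {
            struct zone_demo z = { 4096, 4, 1024 };

            /* 4096 pages look free, but 4*512 are isolated: check fails */
            printf("watermark ok: %d\n", watermark_ok(&z, 1024));
            return 0;
    }
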
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
 		page = get_page_from_freelist(gfp_mask, nodemask,
 				order, zonelist, high_zoneidx,
-				alloc_flags, preferred_zone,
-				migratetype);
+				alloc_flags & ~ALLOC_NO_WATERMARKS,
+				preferred_zone, migratetype);
 		if (page) {
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 retry:
 	page = get_page_from_freelist(gfp_mask, nodemask, order,
 					zonelist, high_zoneidx,
-					alloc_flags, preferred_zone,
-					migratetype);
+					alloc_flags & ~ALLOC_NO_WATERMARKS,
+					preferred_zone, migratetype);
 
 	/*
 	 * If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 		alloc_flags |= ALLOC_HARDER;
 
 	if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
-		if (!in_interrupt() &&
-		    ((current->flags & PF_MEMALLOC) ||
-		     unlikely(test_thread_flag(TIF_MEMDIE))))
+		if (gfp_mask & __GFP_MEMALLOC)
+			alloc_flags |= ALLOC_NO_WATERMARKS;
+		else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+			alloc_flags |= ALLOC_NO_WATERMARKS;
+		else if (!in_interrupt() &&
+				((current->flags & PF_MEMALLOC) ||
+				 unlikely(test_thread_flag(TIF_MEMDIE))))
 			alloc_flags |= ALLOC_NO_WATERMARKS;
 	}
 
 	return alloc_flags;
 }
 
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
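
gfp_pfmemalloc_allowed() exposes the "may this context ignore watermarks?" decision as a predicate so other allocators can ask it without duplicating the policy above. In the swap-over-network series the slab layer is the intended consumer; the call site below is invented for illustration, and only the gfp_pfmemalloc_allowed() call itself is real:

    /* hedged sketch of a consumer; the demo_* names are not in this patch */
    static void *demo_cache_alloc(struct demo_cache *s, gfp_t flags)
    {
            void *obj = demo_alloc_from_partial(s);

            /*
             * No object available: a new backing page is needed. If the
             * context is entitled to pfmemalloc reserves, the page may come
             * from below the watermarks and must then be reserved for
             * memory-freeing (PFMEMALLOC) users only.
             */
            if (!obj && gfp_pfmemalloc_allowed(flags))
                    obj = demo_grow_from_reserves(s, flags);

            return obj;
    }
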
@@ -2340,11 +2378,27 @@ rebalance:
 
 	/* Allocate without watermarks if the context allows */
 	if (alloc_flags & ALLOC_NO_WATERMARKS) {
+		/*
+		 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
+		 * the allocation is high priority and these type of
+		 * allocations are system rather than user orientated
+		 */
+		zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
 		page = __alloc_pages_high_priority(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
-		if (page)
+		if (page) {
+			/*
+			 * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+			 * necessary to allocate the page. The expectation is
+			 * that the caller is taking steps that will free more
+			 * memory. The caller should avoid the page being used
+			 * for !PFMEMALLOC purposes.
+			 */
+			page->pfmemalloc = true;
 			goto got_pg;
+		}
 	}
 
 	/* Atomic allocations - we can't balance anything */
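
page->pfmemalloc is the taint bit this hunk introduces: a page that could only be allocated by ignoring watermarks is expected to be used strictly for work that frees memory. The rest of the series propagates the bit (e.g. into skbs for SOCK_MEMALLOC sockets); the consumer contract, as an illustrative sketch rather than code from this patch:

    /* sketch: how a consumer honours the pfmemalloc taint */
    static bool demo_may_keep_page(struct page *page, bool ctx_is_memalloc)
    {
            /*
             * Reserve-backed pages may only be kept by contexts that are
             * themselves helping reclaim (e.g. a swap-over-network socket);
             * everyone else should release the page and retry normally.
             */
            if (page->pfmemalloc && !ctx_is_memalloc)
                    return false;
            return true;
    }
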
@@ -2463,8 +2517,8 @@ nopage:
 got_pg:
 	if (kmemcheck_enabled)
 		kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-	return page;
 
+	return page;
 }
 
 /*
@@ -2515,6 +2569,8 @@ retry_cpuset:
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
+	else
+		page->pfmemalloc = false;
 
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 			user_zonelist_order = oldval;
 		} else if (oldval != user_zonelist_order) {
 			mutex_lock(&zonelists_mutex);
-			build_all_zonelists(NULL);
+			build_all_zonelists(NULL, NULL);
 			mutex_unlock(&zonelists_mutex);
 		}
 	}
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
 DEFINE_MUTEX(zonelists_mutex);
 
 /* return values int ....just for stop_machine() */
-static __init_refok int __build_all_zonelists(void *data)
+static int __build_all_zonelists(void *data)
 {
 	int nid;
 	int cpu;
+	pg_data_t *self = data;
 
 #ifdef CONFIG_NUMA
 	memset(node_load, 0, sizeof(node_load));
 #endif
+
+	if (self && !node_online(self->node_id)) {
+		build_zonelists(self);
+		build_zonelist_cache(self);
+	}
+
 	for_each_online_node(nid) {
 		pg_data_t *pgdat = NODE_DATA(nid);
 
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
  * Called with zonelists_mutex held always
  * unless system_state == SYSTEM_BOOTING.
  */
-void __ref build_all_zonelists(void *data)
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 {
 	set_zonelist_order();
 
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 		/* we have to stop all cpus to guarantee there is no user
 		   of zonelist */
 #ifdef CONFIG_MEMORY_HOTPLUG
-		if (data)
-			setup_zone_pageset((struct zone *)data);
+		if (zone)
+			setup_zone_pageset(zone);
 #endif
-		stop_machine(__build_all_zonelists, NULL, NULL);
+		stop_machine(__build_all_zonelists, pgdat, NULL);
 		/* cpuset refresh routine should be here */
 	}
 	vm_total_pages = nr_free_pagecache_pages();
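
The signature change lets memory hotplug build zonelists for a node that is not yet online: __build_all_zonelists() now handles "self" before walking the online nodes. The calling convention this implies on the hot-add side, as a hedged sketch (demo_hotadd_node() is invented; the calls inside it are the real APIs):

    static int demo_hotadd_node(int nid)
    {
            /* assumed zeroed; see the WARN_ON in free_area_init_node() */
            pg_data_t *pgdat = arch_alloc_nodedata(nid);

            if (!pgdat)
                    return -ENOMEM;
            arch_refresh_nodedata(nid, pgdat);

            /*
             * The node is not online yet, so pass its pgdat explicitly:
             * __build_all_zonelists() builds zonelists for "self" first.
             */
            mutex_lock(&zonelists_mutex);
            build_all_zonelists(pgdat, NULL);
            mutex_unlock(&zonelists_mutex);
            return 0;
    }
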
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
 	memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
-static int zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
 {
 #ifdef CONFIG_MMU
 	int batch;
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
 		pcp->batch = PAGE_SHIFT * 8;
 }
 
-static void setup_zone_pageset(struct zone *zone)
+static void __meminit setup_zone_pageset(struct zone *zone)
 {
 	int cpu;
 
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
 	return 0;
 }
 
-static int __zone_pcp_update(void *data)
-{
-	struct zone *zone = data;
-	int cpu;
-	unsigned long batch = zone_batchsize(zone), flags;
-
-	for_each_possible_cpu(cpu) {
-		struct per_cpu_pageset *pset;
-		struct per_cpu_pages *pcp;
-
-		pset = per_cpu_ptr(zone->pageset, cpu);
-		pcp = &pset->pcp;
-
-		local_irq_save(flags);
-		free_pcppages_bulk(zone, pcp->count, pcp);
-		setup_pageset(pset, batch);
-		local_irq_restore(flags);
-	}
-	return 0;
-}
-
-void zone_pcp_update(struct zone *zone)
-{
-	stop_machine(__zone_pcp_update, zone, NULL);
-}
-
 static __meminit void zone_pcp_init(struct zone *zone)
 {
 	/*
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
 						zone_batchsize(zone));
 }
 
-__meminit int init_currently_empty_zone(struct zone *zone,
+int __meminit init_currently_empty_zone(struct zone *zone,
 					unsigned long zone_start_pfn,
 					unsigned long size,
 					enum memmap_context context)
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 	unsigned int order;
 
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
  * include/linux/pageblock-flags.h for the values of pageblock_order based on
  * the kernel config
  */
-static inline void set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 }
 
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
  * - mark all pages reserved
  * - mark all memory queues empty
  * - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
  */
 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	int ret;
 
 	pgdat_resize_init(pgdat);
-	pgdat->nr_zones = 0;
 	init_waitqueue_head(&pgdat->kswapd_wait);
-	pgdat->kswapd_max_order = 0;
+	init_waitqueue_head(&pgdat->pfmemalloc_wait);
 	pgdat_page_cgroup_init(pgdat);
 
 	for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
 		zone->spanned_pages = size;
 		zone->present_pages = realsize;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+		zone->compact_cached_free_pfn = zone->zone_start_pfn +
+						zone->spanned_pages;
+		zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
+#endif
 #ifdef CONFIG_NUMA
 		zone->node = nid;
 		zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
 		zone_pcp_init(zone);
 		lruvec_init(&zone->lruvec, zone);
-		zap_zone_vm_stats(zone);
-		zone->flags = 0;
 		if (!size)
 			continue;
 
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 {
 	pg_data_t *pgdat = NODE_DATA(nid);
 
+	/* pg_data_t should be reset to zero when it's allocated */
+	WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx);
+
 	pgdat->node_id = nid;
 	pgdat->node_start_pfn = node_start_pfn;
 	calculate_node_totalpages(pgdat, zones_size, zholes_size);
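
The WARN_ON documents the new contract: free_area_init_core() no longer zeroes individual fields (pgdat->nr_zones and pgdat->kswapd_max_order in the earlier hunk), so whoever provides the pg_data_t must hand it over already zeroed. Boot-time pgdats come from zeroed BSS; a hot-added node would satisfy the contract with a zeroing allocation, e.g.:

    /* sketch: any dynamically allocated pgdat must be zero-filled */
    pg_data_t *pgdat = kzalloc(sizeof(*pgdat), GFP_KERNEL);
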
@@ -4750,7 +4794,7 @@ out:
 }
 
 /* Any regular memory on that node ? */
-static void check_for_regular_memory(pg_data_t *pgdat)
+static void __init check_for_regular_memory(pg_data_t *pgdat)
 {
 #ifdef CONFIG_HIGHMEM
 	enum zone_type zone_type;
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 }
 
 /*
- * This is designed as sub function...plz see page_isolation.c also.
- * set/clear page block's type to be ISOLATE.
- * page allocater never alloc memory from ISOLATE block.
+ * This function checks whether pageblock includes unmovable pages or not.
+ * If @count is not zero, it is okay to include less @count unmovable pages
+ *
+ * PageLRU check wihtout isolation or lru_lock could race so that
+ * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
+ * expect this function should be exact.
  */
-
-static int
-__count_immobile_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
 {
 	unsigned long pfn, iter, found;
 	int mt;
 
 	/*
 	 * For avoiding noise data, lru_add_drain_all() should be called
-	 * If ZONE_MOVABLE, the zone never contains immobile pages
+	 * If ZONE_MOVABLE, the zone never contains unmovable pages
 	 */
 	if (zone_idx(zone) == ZONE_MOVABLE)
-		return true;
+		return false;
 	mt = get_pageblock_migratetype(page);
 	if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
-		return true;
+		return false;
 
 	pfn = page_to_pfn(page);
 	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 			continue;
 
 		page = pfn_to_page(check);
-		if (!page_count(page)) {
+		/*
+		 * We can't use page_count without pin a page
+		 * because another CPU can free compound page.
+		 * This check already skips compound tails of THP
+		 * because their page->_count is zero at all time.
+		 */
+		if (!atomic_read(&page->_count)) {
 			if (PageBuddy(page))
 				iter += (1 << page_order(page)) - 1;
 			continue;
 		}
+
 		if (!PageLRU(page))
 			found++;
 		/*
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 		 * page at boot.
 		 */
 		if (found > count)
-			return false;
+			return true;
 	}
-	return true;
+	return false;
 }
 
 bool is_pageblock_removable_nolock(struct page *page)
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 	    zone->zone_start_pfn + zone->spanned_pages <= pfn)
 		return false;
 
-	return __count_immobile_pages(zone, page, 0);
-}
-
-int set_migratetype_isolate(struct page *page)
-{
-	struct zone *zone;
-	unsigned long flags, pfn;
-	struct memory_isolate_notify arg;
-	int notifier_ret;
-	int ret = -EBUSY;
-
-	zone = page_zone(page);
-
-	spin_lock_irqsave(&zone->lock, flags);
-
-	pfn = page_to_pfn(page);
-	arg.start_pfn = pfn;
-	arg.nr_pages = pageblock_nr_pages;
-	arg.pages_found = 0;
-
-	/*
-	 * It may be possible to isolate a pageblock even if the
-	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
-	 * notifier chain is used by balloon drivers to return the
-	 * number of pages in a range that are held by the balloon
-	 * driver to shrink memory. If all the pages are accounted for
-	 * by balloons, are free, or on the LRU, isolation can continue.
-	 * Later, for example, when memory hotplug notifier runs, these
-	 * pages reported as "can be isolated" should be isolated(freed)
-	 * by the balloon driver through the memory notifier chain.
-	 */
-	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
-	notifier_ret = notifier_to_errno(notifier_ret);
-	if (notifier_ret)
-		goto out;
-	/*
-	 * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
-	 * We just check MOVABLE pages.
-	 */
-	if (__count_immobile_pages(zone, page, arg.pages_found))
-		ret = 0;
-
-	/*
-	 * immobile means "not-on-lru" paes. If immobile is larger than
-	 * removable-by-driver pages reported by notifier, we'll fail.
-	 */
-
-out:
-	if (!ret) {
-		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-		move_freepages_block(zone, page, MIGRATE_ISOLATE);
-	}
-
-	spin_unlock_irqrestore(&zone->lock, flags);
-	if (!ret)
-		drain_all_pages();
-	return ret;
-}
-
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
-{
-	struct zone *zone;
-	unsigned long flags;
-	zone = page_zone(page);
-	spin_lock_irqsave(&zone->lock, flags);
-	if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
-		goto out;
-	set_pageblock_migratetype(page, migratetype);
-	move_freepages_block(zone, page, migratetype);
-out:
-	spin_unlock_irqrestore(&zone->lock, flags);
+	return !has_unmovable_pages(zone, page, 0);
 }
 
 #ifdef CONFIG_CMA
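
Note the polarity flip in the rename: __count_immobile_pages() answered "is this block movable enough?" (true = good), while has_unmovable_pages() answers exactly what its name asks (true = bad), so is_pageblock_removable_nolock() now negates the result. set_migratetype_isolate() and unset_migratetype_isolate() are not deleted but move to mm/page_isolation.c in this series, where the caller pattern looks roughly like this sketch (demo_try_isolate() is invented):

    static int demo_try_isolate(struct zone *zone, struct page *page)
    {
            /* true: the block holds pages that cannot be migrated away */
            if (has_unmovable_pages(zone, page, 0))
                    return -EBUSY;

            /* per the NOTE above, keep nr_pageblock_isolate in sync */
            set_pageblock_isolate(page);
            move_freepages_block(zone, page, MIGRATE_ISOLATE);
            return 0;
    }
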
@@ -5869,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
 }
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __meminit __zone_pcp_update(void *data)
+{
+	struct zone *zone = data;
+	int cpu;
+	unsigned long batch = zone_batchsize(zone), flags;
+
+	for_each_possible_cpu(cpu) {
+		struct per_cpu_pageset *pset;
+		struct per_cpu_pages *pcp;
+
+		pset = per_cpu_ptr(zone->pageset, cpu);
+		pcp = &pset->pcp;
+
+		local_irq_save(flags);
+		if (pcp->count > 0)
+			free_pcppages_bulk(zone, pcp->count, pcp);
+		setup_pageset(pset, batch);
+		local_irq_restore(flags);
+	}
+	return 0;
+}
+
+void __meminit zone_pcp_update(struct zone *zone)
+{
+	stop_machine(__zone_pcp_update, zone, NULL);
+}
+#endif
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
+void zone_pcp_reset(struct zone *zone)
+{
+	unsigned long flags;
+
+	/* avoid races with drain_pages() */
+	local_irq_save(flags);
+	if (zone->pageset != &boot_pageset) {
+		free_percpu(zone->pageset);
+		zone->pageset = &boot_pageset;
+	}
+	local_irq_restore(flags);
+}
+
 /*
  * All pages in the range must be isolated before calling this.
  */
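
zone_pcp_reset() is the teardown counterpart of setup_zone_pageset(): once hot-remove has emptied a zone, the per-cpu pagesets are freed and zone->pageset is parked back on boot_pageset, so any racing drain_pages() caller still dereferences valid memory (hence the local_irq_save()). Where it plausibly slots into the offline path, as a sketch (demo_offline_zone() is invented; __offline_isolated_pages() is real):

    static void demo_offline_zone(struct zone *zone,
                                  unsigned long start_pfn,
                                  unsigned long nr_pages)
    {
            /* all pages must already be isolated ... */
            __offline_isolated_pages(start_pfn, start_pfn + nr_pages);

            /* ... only then is dropping the per-cpu pagesets safe */
            zone_pcp_reset(zone);
    }
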