Diffstat (limited to 'mm/page_alloc.c')
 mm/page_alloc.c | 318
 1 file changed, 171 insertions(+), 147 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4a4f9219683f..889532b8e6c1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -51,7 +51,6 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
-#include <linux/memory.h>
 #include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
@@ -219,7 +218,12 @@ EXPORT_SYMBOL(nr_online_nodes);
 
 int page_group_by_mobility_disabled __read_mostly;
 
-static void set_pageblock_migratetype(struct page *page, int migratetype)
+/*
+ * NOTE:
+ * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly.
+ * Instead, use {un}set_pageblock_isolate.
+ */
+void set_pageblock_migratetype(struct page *page, int migratetype)
 {
 
         if (unlikely(page_group_by_mobility_disabled))
@@ -954,7 +958,7 @@ static int move_freepages(struct zone *zone,
         return pages_moved;
 }
 
-static int move_freepages_block(struct zone *zone, struct page *page,
+int move_freepages_block(struct zone *zone, struct page *page,
                                 int migratetype)
 {
         unsigned long start_pfn, end_pfn;
@@ -1158,8 +1162,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
                 to_drain = pcp->batch;
         else
                 to_drain = pcp->count;
-        free_pcppages_bulk(zone, to_drain, pcp);
-        pcp->count -= to_drain;
+        if (to_drain > 0) {
+                free_pcppages_bulk(zone, to_drain, pcp);
+                pcp->count -= to_drain;
+        }
         local_irq_restore(flags);
 }
 #endif
@@ -1529,16 +1535,16 @@ static int __init setup_fail_page_alloc(char *str)
 }
 __setup("fail_page_alloc=", setup_fail_page_alloc);
 
-static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
         if (order < fail_page_alloc.min_order)
-                return 0;
+                return false;
         if (gfp_mask & __GFP_NOFAIL)
-                return 0;
+                return false;
         if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
-                return 0;
+                return false;
         if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
-                return 0;
+                return false;
 
         return should_fail(&fail_page_alloc.attr, 1 << order);
 }
@@ -1578,9 +1584,9 @@ late_initcall(fail_page_alloc_debugfs);
 
 #else /* CONFIG_FAIL_PAGE_ALLOC */
 
-static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
 {
-        return 0;
+        return false;
 }
 
 #endif /* CONFIG_FAIL_PAGE_ALLOC */
@@ -1594,6 +1600,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
 {
         /* free_pages my go negative - that's OK */
         long min = mark;
+        long lowmem_reserve = z->lowmem_reserve[classzone_idx];
         int o;
 
         free_pages -= (1 << order) - 1;
@@ -1602,7 +1609,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
         if (alloc_flags & ALLOC_HARDER)
                 min -= min / 4;
 
-        if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+        if (free_pages <= min + lowmem_reserve)
                 return false;
         for (o = 0; o < order; o++) {
                 /* At the next order, this order's pages become unavailable */
@@ -1617,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark,
         return true;
 }
 
+#ifdef CONFIG_MEMORY_ISOLATION
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+        if (unlikely(zone->nr_pageblock_isolate))
+                return zone->nr_pageblock_isolate * pageblock_nr_pages;
+        return 0;
+}
+#else
+static inline unsigned long nr_zone_isolate_freepages(struct zone *zone)
+{
+        return 0;
+}
+#endif
+
 bool zone_watermark_ok(struct zone *z, int order, unsigned long mark,
                        int classzone_idx, int alloc_flags)
 {
@@ -1632,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
         if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
                 free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
 
+        /*
+         * If the zone has MIGRATE_ISOLATE type free pages, we should consider
+         * it. nr_zone_isolate_freepages is never accurate so kswapd might not
+         * sleep although it could do so. But this is more desirable for memory
+         * hotplug than sleeping which can cause a livelock in the direct
+         * reclaim path.
+         */
+        free_pages -= nr_zone_isolate_freepages(z);
         return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
                                                                 free_pages);
 }
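
For illustration only (this sketch is not part of the patch): free pages sitting in isolated pageblocks still inflate NR_FREE_PAGES, so zone_watermark_ok_safe() now subtracts a worst-case estimate of them before running the usual check. A minimal userspace model of that idea, with an invented model_zone standing in for struct zone and only the order-0 part of the watermark test:

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 512                /* illustrative pageblock size */

    /* Invented stand-in for struct zone: only the fields the check needs. */
    struct model_zone {
        long free_pages;                          /* raw NR_FREE_PAGES counter */
        long lowmem_reserve;                      /* reserve for the classzone */
        unsigned long nr_pageblock_isolate;       /* isolated pageblocks */
    };

    static long nr_zone_isolate_freepages(const struct model_zone *z)
    {
        /* Worst case: assume every isolated pageblock is entirely free. */
        return (long)z->nr_pageblock_isolate * PAGEBLOCK_NR_PAGES;
    }

    static bool watermark_ok_safe(const struct model_zone *z, long mark)
    {
        /* Order-0 core of the check: free pages must exceed mark + reserve. */
        long free_pages = z->free_pages - nr_zone_isolate_freepages(z);

        return free_pages > mark + z->lowmem_reserve;
    }

    int main(void)
    {
        struct model_zone z = {
            .free_pages = 4096,
            .lowmem_reserve = 256,
            .nr_pageblock_isolate = 6,            /* 6 * 512 = 3072 isolated pages */
        };

        /* Without the subtraction the raw counter (4096) would clear mark 1024. */
        printf("watermark ok: %s\n", watermark_ok_safe(&z, 1024) ? "yes" : "no");
        return 0;
    }

The estimate deliberately errs toward failing the watermark: as the comment in the hunk says, keeping kswapd working is preferable to risking a livelock in the direct reclaim path during memory hotplug.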
@@ -2087,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 
                 page = get_page_from_freelist(gfp_mask, nodemask,
                                 order, zonelist, high_zoneidx,
-                                alloc_flags, preferred_zone,
-                                migratetype);
+                                alloc_flags & ~ALLOC_NO_WATERMARKS,
+                                preferred_zone, migratetype);
                 if (page) {
                         preferred_zone->compact_considered = 0;
                         preferred_zone->compact_defer_shift = 0;
@@ -2180,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
 retry:
         page = get_page_from_freelist(gfp_mask, nodemask, order,
                                         zonelist, high_zoneidx,
-                                        alloc_flags, preferred_zone,
-                                        migratetype);
+                                        alloc_flags & ~ALLOC_NO_WATERMARKS,
+                                        preferred_zone, migratetype);
 
         /*
          * If an allocation failed after direct reclaim, it could be because
@@ -2265,15 +2294,24 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
                 alloc_flags |= ALLOC_HARDER;
 
         if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
-                if (!in_interrupt() &&
-                    ((current->flags & PF_MEMALLOC) ||
-                     unlikely(test_thread_flag(TIF_MEMDIE))))
+                if (gfp_mask & __GFP_MEMALLOC)
+                        alloc_flags |= ALLOC_NO_WATERMARKS;
+                else if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
+                        alloc_flags |= ALLOC_NO_WATERMARKS;
+                else if (!in_interrupt() &&
+                         ((current->flags & PF_MEMALLOC) ||
+                          unlikely(test_thread_flag(TIF_MEMDIE))))
                         alloc_flags |= ALLOC_NO_WATERMARKS;
         }
 
         return alloc_flags;
 }
 
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+        return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
+}
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         struct zonelist *zonelist, enum zone_type high_zoneidx,
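
For illustration only (this sketch is not part of the patch): the new gfp_pfmemalloc_allowed() is just a predicate over the same flag computation, answering "may this allocation ignore the watermarks and dip into reserves?". A simplified userspace model of the decision order added above, with the kernel's context tests replaced by plain booleans and invented MODEL_* flag values:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative flag bits; the values are arbitrary in this model. */
    #define MODEL_GFP_NOMEMALLOC      (1u << 0)
    #define MODEL_GFP_MEMALLOC        (1u << 1)
    #define MODEL_ALLOC_NO_WATERMARKS (1u << 0)

    /* Stand-ins for in_serving_softirq(), PF_MEMALLOC, TIF_MEMDIE, in_interrupt(). */
    static bool serving_softirq;
    static bool task_pf_memalloc;
    static bool task_oom_victim;
    static bool in_interrupt_ctx;

    static unsigned int gfp_to_alloc_flags(unsigned int gfp_mask)
    {
        unsigned int alloc_flags = 0;

        if (!(gfp_mask & MODEL_GFP_NOMEMALLOC)) {
            if (gfp_mask & MODEL_GFP_MEMALLOC)
                alloc_flags |= MODEL_ALLOC_NO_WATERMARKS;
            else if (serving_softirq && task_pf_memalloc)
                alloc_flags |= MODEL_ALLOC_NO_WATERMARKS;
            else if (!in_interrupt_ctx &&
                     (task_pf_memalloc || task_oom_victim))
                alloc_flags |= MODEL_ALLOC_NO_WATERMARKS;
        }
        return alloc_flags;
    }

    /* The new helper: reserves are allowed iff the flags say "no watermarks". */
    static bool gfp_pfmemalloc_allowed(unsigned int gfp_mask)
    {
        return !!(gfp_to_alloc_flags(gfp_mask) & MODEL_ALLOC_NO_WATERMARKS);
    }

    int main(void)
    {
        printf("MEMALLOC alone:        %d\n",
               gfp_pfmemalloc_allowed(MODEL_GFP_MEMALLOC));
        printf("MEMALLOC + NOMEMALLOC: %d\n",
               gfp_pfmemalloc_allowed(MODEL_GFP_MEMALLOC | MODEL_GFP_NOMEMALLOC));
        return 0;
    }

__GFP_NOMEMALLOC always wins, which is why the whole cascade sits inside the likely(!(gfp_mask & __GFP_NOMEMALLOC)) branch.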
@@ -2340,11 +2378,27 @@ rebalance:
 
         /* Allocate without watermarks if the context allows */
         if (alloc_flags & ALLOC_NO_WATERMARKS) {
+                /*
+                 * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds
+                 * the allocation is high priority and these types of
+                 * allocations are system rather than user oriented.
+                 */
+                zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
                 page = __alloc_pages_high_priority(gfp_mask, order,
                                 zonelist, high_zoneidx, nodemask,
                                 preferred_zone, migratetype);
-                if (page)
+                if (page) {
+                        /*
+                         * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
+                         * necessary to allocate the page. The expectation is
+                         * that the caller is taking steps that will free more
+                         * memory. The caller should avoid the page being used
+                         * for !PFMEMALLOC purposes.
+                         */
+                        page->pfmemalloc = true;
                         goto got_pg;
+                }
         }
 
         /* Atomic allocations - we can't balance anything */
@@ -2463,8 +2517,8 @@ nopage:
 got_pg:
         if (kmemcheck_enabled)
                 kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-        return page;
 
+        return page;
 }
 
 /*
@@ -2515,6 +2569,8 @@ retry_cpuset:
                 page = __alloc_pages_slowpath(gfp_mask, order,
                                 zonelist, high_zoneidx, nodemask,
                                 preferred_zone, migratetype);
+        else
+                page->pfmemalloc = false;
 
         trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 
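
For illustration only (this sketch is not part of the patch): taken together, the pfmemalloc hunks give the new page->pfmemalloc field a simple contract: it is cleared when the fast path succeeds and set when the page could only be obtained by ignoring the watermarks, so the caller knows not to use it for ordinary (!PFMEMALLOC) purposes. A compact userspace model of that contract, with invented names throughout:

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct model_page {
        bool pfmemalloc;    /* "this page came from the emergency reserves" */
    };

    /* Pretend fast path: succeeds only while ordinary memory remains. */
    static struct model_page *get_page_from_freelist(int *ordinary_pages)
    {
        if (*ordinary_pages <= 0)
            return NULL;
        (*ordinary_pages)--;
        return calloc(1, sizeof(struct model_page));
    }

    /* Pretend ALLOC_NO_WATERMARKS path: succeeds by dipping into reserves. */
    static struct model_page *alloc_high_priority(void)
    {
        return calloc(1, sizeof(struct model_page));
    }

    static struct model_page *alloc_page_model(int *ordinary_pages)
    {
        struct model_page *page = get_page_from_freelist(ordinary_pages);

        if (page) {
            page->pfmemalloc = false;        /* fast path: an ordinary page */
        } else {
            page = alloc_high_priority();
            if (page)
                page->pfmemalloc = true;     /* caller must treat it as scarce */
        }
        return page;
    }

    int main(void)
    {
        int ordinary = 1;
        struct model_page *a = alloc_page_model(&ordinary);
        struct model_page *b = alloc_page_model(&ordinary);

        printf("first:  pfmemalloc=%d\n", a->pfmemalloc);    /* 0: fast path */
        printf("second: pfmemalloc=%d\n", b->pfmemalloc);    /* 1: reserves */
        free(a);
        free(b);
        return 0;
    }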
@@ -3030,7 +3086,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
                         user_zonelist_order = oldval;
                 } else if (oldval != user_zonelist_order) {
                         mutex_lock(&zonelists_mutex);
-                        build_all_zonelists(NULL);
+                        build_all_zonelists(NULL, NULL);
                         mutex_unlock(&zonelists_mutex);
                 }
         }
@@ -3409,14 +3465,21 @@ static void setup_zone_pageset(struct zone *zone);
 DEFINE_MUTEX(zonelists_mutex);
 
 /* return values int ....just for stop_machine() */
-static __init_refok int __build_all_zonelists(void *data)
+static int __build_all_zonelists(void *data)
 {
         int nid;
         int cpu;
+        pg_data_t *self = data;
 
 #ifdef CONFIG_NUMA
         memset(node_load, 0, sizeof(node_load));
 #endif
+
+        if (self && !node_online(self->node_id)) {
+                build_zonelists(self);
+                build_zonelist_cache(self);
+        }
+
         for_each_online_node(nid) {
                 pg_data_t *pgdat = NODE_DATA(nid);
 
@@ -3461,7 +3524,7 @@ static __init_refok int __build_all_zonelists(void *data)
  * Called with zonelists_mutex held always
  * unless system_state == SYSTEM_BOOTING.
  */
-void __ref build_all_zonelists(void *data)
+void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
 {
         set_zonelist_order();
 
@@ -3473,10 +3536,10 @@ void __ref build_all_zonelists(void *data)
                 /* we have to stop all cpus to guarantee there is no user
                    of zonelist */
 #ifdef CONFIG_MEMORY_HOTPLUG
-                if (data)
-                        setup_zone_pageset((struct zone *)data);
+                if (zone)
+                        setup_zone_pageset(zone);
 #endif
-                stop_machine(__build_all_zonelists, NULL, NULL);
+                stop_machine(__build_all_zonelists, pgdat, NULL);
                 /* cpuset refresh routine should be here */
         }
         vm_total_pages = nr_free_pagecache_pages();
@@ -3746,7 +3809,7 @@ static void __meminit zone_init_free_lists(struct zone *zone)
         memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
 #endif
 
-static int zone_batchsize(struct zone *zone)
+static int __meminit zone_batchsize(struct zone *zone)
 {
 #ifdef CONFIG_MMU
         int batch;
@@ -3828,7 +3891,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p,
                 pcp->batch = PAGE_SHIFT * 8;
 }
 
-static void setup_zone_pageset(struct zone *zone)
+static void __meminit setup_zone_pageset(struct zone *zone)
 {
         int cpu;
 
@@ -3901,32 +3964,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
         return 0;
 }
 
-static int __zone_pcp_update(void *data)
-{
-        struct zone *zone = data;
-        int cpu;
-        unsigned long batch = zone_batchsize(zone), flags;
-
-        for_each_possible_cpu(cpu) {
-                struct per_cpu_pageset *pset;
-                struct per_cpu_pages *pcp;
-
-                pset = per_cpu_ptr(zone->pageset, cpu);
-                pcp = &pset->pcp;
-
-                local_irq_save(flags);
-                free_pcppages_bulk(zone, pcp->count, pcp);
-                setup_pageset(pset, batch);
-                local_irq_restore(flags);
-        }
-        return 0;
-}
-
-void zone_pcp_update(struct zone *zone)
-{
-        stop_machine(__zone_pcp_update, zone, NULL);
-}
-
 static __meminit void zone_pcp_init(struct zone *zone)
 {
         /*
@@ -3942,7 +3979,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
                                         zone_batchsize(zone));
 }
 
-__meminit int init_currently_empty_zone(struct zone *zone,
+int __meminit init_currently_empty_zone(struct zone *zone,
                                         unsigned long zone_start_pfn,
                                         unsigned long size,
                                         enum memmap_context context)
@@ -4301,7 +4338,7 @@ static inline void setup_usemap(struct pglist_data *pgdat,
 #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 
 /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
-static inline void __init set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
         unsigned int order;
 
@@ -4329,7 +4366,7 @@ static inline void __init set_pageblock_order(void)
  * include/linux/pageblock-flags.h for the values of pageblock_order based on
  * the kernel config
  */
-static inline void set_pageblock_order(void)
+void __init set_pageblock_order(void)
 {
 }
 
@@ -4340,6 +4377,8 @@ static inline void set_pageblock_order(void)
  * - mark all pages reserved
  * - mark all memory queues empty
  * - clear the memory bitmaps
+ *
+ * NOTE: pgdat should get zeroed by caller.
  */
 static void __paginginit free_area_init_core(struct pglist_data *pgdat,
                 unsigned long *zones_size, unsigned long *zholes_size)
@@ -4350,9 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
         int ret;
 
         pgdat_resize_init(pgdat);
-        pgdat->nr_zones = 0;
         init_waitqueue_head(&pgdat->kswapd_wait);
-        pgdat->kswapd_max_order = 0;
+        init_waitqueue_head(&pgdat->pfmemalloc_wait);
         pgdat_page_cgroup_init(pgdat);
 
         for (j = 0; j < MAX_NR_ZONES; j++) {
@@ -4394,6 +4432,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
                 zone->spanned_pages = size;
                 zone->present_pages = realsize;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+                zone->compact_cached_free_pfn = zone->zone_start_pfn +
+                                                zone->spanned_pages;
+                zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1);
+#endif
 #ifdef CONFIG_NUMA
                 zone->node = nid;
                 zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
@@ -4408,8 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 
                 zone_pcp_init(zone);
                 lruvec_init(&zone->lruvec, zone);
-                zap_zone_vm_stats(zone);
-                zone->flags = 0;
                 if (!size)
                         continue;
 
@@ -4469,6 +4510,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
 {
         pg_data_t *pgdat = NODE_DATA(nid);
 
+        /* pg_data_t should be reset to zero when it's allocated */
+        WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx);
+
         pgdat->node_id = nid;
         pgdat->node_start_pfn = node_start_pfn;
         calculate_node_totalpages(pgdat, zones_size, zholes_size);
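
For illustration only (this sketch is not part of the patch): the WARN_ON added here enforces the same contract as the "NOTE: pgdat should get zeroed by caller" comment above, which is also why the explicit field resets were dropped from free_area_init_core(). A small userspace model of that "must arrive zeroed" check, with invented names:

    #include <stdio.h>
    #include <string.h>

    struct model_pgdat {
        int nr_zones;
        unsigned long node_start_pfn;
        int classzone_idx;
        int node_id;
    };

    static void free_area_init_node_model(struct model_pgdat *pgdat, int nid,
                                          unsigned long start_pfn)
    {
        /* Mirrors the new WARN_ON: complain if the caller did not zero it. */
        if (pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx)
            fprintf(stderr, "warning: pgdat for node %d was not zeroed\n", nid);

        pgdat->node_id = nid;
        pgdat->node_start_pfn = start_pfn;
    }

    int main(void)
    {
        struct model_pgdat pgdat;

        memset(&pgdat, 0, sizeof(pgdat));    /* the caller's side of the contract */
        free_area_init_node_model(&pgdat, 0, 0x1000);
        printf("node %d starts at pfn %#lx\n", pgdat.node_id, pgdat.node_start_pfn);
        return 0;
    }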
@@ -4750,7 +4794,7 @@ out:
 }
 
 /* Any regular memory on that node ? */
-static void check_for_regular_memory(pg_data_t *pgdat)
+static void __init check_for_regular_memory(pg_data_t *pgdat)
 {
 #ifdef CONFIG_HIGHMEM
         enum zone_type zone_type;
@@ -5468,26 +5512,27 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 }
 
 /*
- * This is designed as sub function...plz see page_isolation.c also.
- * set/clear page block's type to be ISOLATE.
- * page allocater never alloc memory from ISOLATE block.
+ * This function checks whether the pageblock includes unmovable pages or not.
+ * If @count is not zero, it is okay to include up to @count unmovable pages.
+ *
+ * The PageLRU check without isolation or lru_lock could race so that a
+ * MIGRATE_MOVABLE block might include unmovable pages. It means you can't
+ * expect this function to be exact.
  */
-
-static int
-__count_immobile_pages(struct zone *zone, struct page *page, int count)
+bool has_unmovable_pages(struct zone *zone, struct page *page, int count)
 {
         unsigned long pfn, iter, found;
         int mt;
 
         /*
          * For avoiding noise data, lru_add_drain_all() should be called
-         * If ZONE_MOVABLE, the zone never contains immobile pages
+         * If ZONE_MOVABLE, the zone never contains unmovable pages
          */
         if (zone_idx(zone) == ZONE_MOVABLE)
-                return true;
+                return false;
         mt = get_pageblock_migratetype(page);
         if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
-                return true;
+                return false;
 
         pfn = page_to_pfn(page);
         for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
@@ -5497,11 +5542,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
                         continue;
 
                 page = pfn_to_page(check);
-                if (!page_count(page)) {
+                /*
+                 * We can't use page_count without pinning the page
+                 * because another CPU can free a compound page.
+                 * This check already skips compound tails of THP
+                 * because their page->_count is zero at all times.
+                 */
+                if (!atomic_read(&page->_count)) {
                         if (PageBuddy(page))
                                 iter += (1 << page_order(page)) - 1;
                         continue;
                 }
+
                 if (!PageLRU(page))
                         found++;
                 /*
@@ -5518,9 +5570,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
                  * page at boot.
                  */
                 if (found > count)
-                        return false;
+                        return true;
         }
-        return true;
+        return false;
 }
 
 bool is_pageblock_removable_nolock(struct page *page)
@@ -5544,77 +5596,7 @@ bool is_pageblock_removable_nolock(struct page *page)
                         zone->zone_start_pfn + zone->spanned_pages <= pfn)
                 return false;
 
-        return __count_immobile_pages(zone, page, 0);
-}
-
-int set_migratetype_isolate(struct page *page)
-{
-        struct zone *zone;
-        unsigned long flags, pfn;
-        struct memory_isolate_notify arg;
-        int notifier_ret;
-        int ret = -EBUSY;
-
-        zone = page_zone(page);
-
-        spin_lock_irqsave(&zone->lock, flags);
-
-        pfn = page_to_pfn(page);
-        arg.start_pfn = pfn;
-        arg.nr_pages = pageblock_nr_pages;
-        arg.pages_found = 0;
-
-        /*
-         * It may be possible to isolate a pageblock even if the
-         * migratetype is not MIGRATE_MOVABLE. The memory isolation
-         * notifier chain is used by balloon drivers to return the
-         * number of pages in a range that are held by the balloon
-         * driver to shrink memory. If all the pages are accounted for
-         * by balloons, are free, or on the LRU, isolation can continue.
-         * Later, for example, when memory hotplug notifier runs, these
-         * pages reported as "can be isolated" should be isolated(freed)
-         * by the balloon driver through the memory notifier chain.
-         */
-        notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
-        notifier_ret = notifier_to_errno(notifier_ret);
-        if (notifier_ret)
-                goto out;
-        /*
-         * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself.
-         * We just check MOVABLE pages.
-         */
-        if (__count_immobile_pages(zone, page, arg.pages_found))
-                ret = 0;
-
-        /*
-         * immobile means "not-on-lru" paes. If immobile is larger than
-         * removable-by-driver pages reported by notifier, we'll fail.
-         */
-
-out:
-        if (!ret) {
-                set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-                move_freepages_block(zone, page, MIGRATE_ISOLATE);
-        }
-
-        spin_unlock_irqrestore(&zone->lock, flags);
-        if (!ret)
-                drain_all_pages();
-        return ret;
-}
-
-void unset_migratetype_isolate(struct page *page, unsigned migratetype)
-{
-        struct zone *zone;
-        unsigned long flags;
-        zone = page_zone(page);
-        spin_lock_irqsave(&zone->lock, flags);
-        if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
-                goto out;
-        set_pageblock_migratetype(page, migratetype);
-        move_freepages_block(zone, page, migratetype);
-out:
-        spin_unlock_irqrestore(&zone->lock, flags);
+        return !has_unmovable_pages(zone, page, 0);
 }
 
 #ifdef CONFIG_CMA
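
For illustration only (this sketch is not part of the patch): besides the rename, the helper inverts its return convention: has_unmovable_pages() answers "does this pageblock hold more than @count pages that cannot be migrated?", and is_pageblock_removable_nolock() simply negates it. A userspace model of the scan, with the page-count, PageBuddy and PageLRU tests reduced to plain fields on an invented model_page:

    #include <stdbool.h>
    #include <stdio.h>

    #define PAGEBLOCK_NR_PAGES 8    /* tiny pageblock, just for the example */

    struct model_page {
        int refcount;       /* page->_count in the kernel */
        bool buddy;         /* a free page sitting in the buddy allocator */
        int buddy_order;    /* order of that free chunk */
        bool lru;           /* movable: the page is on an LRU list */
    };

    /* true if the block holds more than 'count' pages we cannot migrate */
    static bool has_unmovable_pages(const struct model_page *block, int count)
    {
        unsigned long iter;
        int found = 0;

        for (iter = 0; iter < PAGEBLOCK_NR_PAGES; iter++) {
            const struct model_page *page = &block[iter];

            if (page->refcount == 0) {
                /* Free pages are fine; skip the rest of a buddy chunk. */
                if (page->buddy)
                    iter += (1UL << page->buddy_order) - 1;
                continue;
            }
            if (!page->lru)
                found++;            /* pinned and not movable via the LRU */
            if (found > count)
                return true;
        }
        return false;
    }

    int main(void)
    {
        struct model_page block[PAGEBLOCK_NR_PAGES] = {
            { .refcount = 0, .buddy = true, .buddy_order = 2 },  /* 4 free pages */
            { 0 }, { 0 }, { 0 },
            { .refcount = 1, .lru = true },     /* movable */
            { .refcount = 1, .lru = false },    /* unmovable */
            { .refcount = 0 },
            { .refcount = 1, .lru = true },
        };

        printf("count=0: %s\n", has_unmovable_pages(block, 0) ? "unmovable" : "ok");
        printf("count=1: %s\n", has_unmovable_pages(block, 1) ? "unmovable" : "ok");
        return 0;
    }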
@@ -5869,7 +5851,49 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages)
 }
 #endif
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int __meminit __zone_pcp_update(void *data)
+{
+        struct zone *zone = data;
+        int cpu;
+        unsigned long batch = zone_batchsize(zone), flags;
+
+        for_each_possible_cpu(cpu) {
+                struct per_cpu_pageset *pset;
+                struct per_cpu_pages *pcp;
+
+                pset = per_cpu_ptr(zone->pageset, cpu);
+                pcp = &pset->pcp;
+
+                local_irq_save(flags);
+                if (pcp->count > 0)
+                        free_pcppages_bulk(zone, pcp->count, pcp);
+                setup_pageset(pset, batch);
+                local_irq_restore(flags);
+        }
+        return 0;
+}
+
+void __meminit zone_pcp_update(struct zone *zone)
+{
+        stop_machine(__zone_pcp_update, zone, NULL);
+}
+#endif
+
 #ifdef CONFIG_MEMORY_HOTREMOVE
+void zone_pcp_reset(struct zone *zone)
+{
+        unsigned long flags;
+
+        /* avoid races with drain_pages() */
+        local_irq_save(flags);
+        if (zone->pageset != &boot_pageset) {
+                free_percpu(zone->pageset);
+                zone->pageset = &boot_pageset;
+        }
+        local_irq_restore(flags);
+}
+
 /*
  * All pages in the range must be isolated before calling this.
  */