author		Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>	2012-04-18 15:52:50 -0400
committer	Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>	2012-04-18 15:52:50 -0400
commit		681e4a5e13c1c8315694eb4f44e0cdd84c9082d2 (patch)
tree		699f14527c118859026e8ce0214e689d0b9c88cb /mm/page_alloc.c
parent		b960d6c43a63ebd2d8518b328da3816b833ee8cc (diff)
parent		c104f1fa1ecf4ee0fc06e31b1f77630b2551be81 (diff)
Merge commit 'c104f1fa1ecf4ee0fc06e31b1f77630b2551be81' into stable/for-linus-3.4
* commit 'c104f1fa1ecf4ee0fc06e31b1f77630b2551be81': (14566 commits)
  cpufreq: OMAP: fix build errors: depends on ARCH_OMAP2PLUS
  sparc64: Eliminate obsolete __handle_softirq() function
  sparc64: Fix bootup crash on sun4v.
  kconfig: delete last traces of __enabled_ from autoconf.h
  Revert "kconfig: fix __enabled_ macros definition for invisible and un-selected symbols"
  kconfig: fix IS_ENABLED to not require all options to be defined
  irq_domain: fix type mismatch in debugfs output format
  staging: android: fix mem leaks in __persistent_ram_init()
  staging: vt6656: Don't leak memory in drivers/staging/vt6656/ioctl.c::private_ioctl()
  staging: iio: hmc5843: Fix crash in probe function.
  panic: fix stack dump print on direct call to panic()
  drivers/rtc/rtc-pl031.c: enable clock on all ST variants
  Revert "mm: vmscan: fix misused nr_reclaimed in shrink_mem_cgroup_zone()"
  hugetlb: fix race condition in hugetlb_fault()
  drivers/rtc/rtc-twl.c: use static register while reading time
  drivers/rtc/rtc-s3c.c: add placeholder for driver private data
  drivers/rtc/rtc-s3c.c: fix compilation error
  MAINTAINERS: add PCDP console maintainer
  memcg: do not open code accesses to res_counter members
  drivers/rtc/rtc-efi.c: fix section mismatch warning
  ...
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	174
1 file changed, 132 insertions(+), 42 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 794e6715c226..a712fb9e04ce 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg)
 }
 
 /*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * Note that this code is protected against sending an IPI to an offline
+ * CPU but does not guarantee sending an IPI to newly hotplugged CPUs:
+ * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but
+ * nothing keeps CPUs from showing up after we populated the cpumask and
+ * before the call to on_each_cpu_mask().
  */
 void drain_all_pages(void)
 {
-	on_each_cpu(drain_local_pages, NULL, 1);
+	int cpu;
+	struct per_cpu_pageset *pcp;
+	struct zone *zone;
+
+	/*
+	 * Allocate in the BSS so we wont require allocation in
+	 * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
+	 */
+	static cpumask_t cpus_with_pcps;
+
+	/*
+	 * We don't care about racing with CPU hotplug event
+	 * as offline notification will cause the notified
+	 * cpu to drain that CPU pcps and on_each_cpu_mask
+	 * disables preemption as part of its processing
+	 */
+	for_each_online_cpu(cpu) {
+		bool has_pcps = false;
+		for_each_populated_zone(zone) {
+			pcp = per_cpu_ptr(zone->pageset, cpu);
+			if (pcp->pcp.count) {
+				has_pcps = true;
+				break;
+			}
+		}
+		if (has_pcps)
+			cpumask_set_cpu(cpu, &cpus_with_pcps);
+		else
+			cpumask_clear_cpu(cpu, &cpus_with_pcps);
+	}
+	on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1);
 }
 
 #ifdef CONFIG_HIBERNATION
@@ -1968,7 +2004,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(zonelist, gfp_mask, order, nodemask);
+	out_of_memory(zonelist, gfp_mask, order, nodemask, false);
 
 out:
 	clear_zonelist_oom(zonelist, gfp_mask);
@@ -1981,14 +2017,20 @@ static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress,
-	bool sync_migration)
+	int migratetype, bool sync_migration,
+	bool *deferred_compaction,
+	unsigned long *did_some_progress)
 {
 	struct page *page;
 
-	if (!order || compaction_deferred(preferred_zone))
+	if (!order)
 		return NULL;
 
+	if (compaction_deferred(preferred_zone, order)) {
+		*deferred_compaction = true;
+		return NULL;
+	}
+
 	current->flags |= PF_MEMALLOC;
 	*did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
 						nodemask, sync_migration);
@@ -2006,6 +2048,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		if (page) {
 			preferred_zone->compact_considered = 0;
 			preferred_zone->compact_defer_shift = 0;
+			if (order >= preferred_zone->compact_order_failed)
+				preferred_zone->compact_order_failed = order + 1;
 			count_vm_event(COMPACTSUCCESS);
 			return page;
 		}
@@ -2016,7 +2060,13 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 		 * but not enough to satisfy watermarks.
 		 */
 		count_vm_event(COMPACTFAIL);
-		defer_compaction(preferred_zone);
+
+		/*
+		 * As async compaction considers a subset of pageblocks, only
+		 * defer if the failure was a sync compaction failure.
+		 */
+		if (sync_migration)
+			defer_compaction(preferred_zone, order);
 
 		cond_resched();
 	}
@@ -2028,8 +2078,9 @@ static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
 	nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
-	int migratetype, unsigned long *did_some_progress,
-	bool sync_migration)
+	int migratetype, bool sync_migration,
+	bool *deferred_compaction,
+	unsigned long *did_some_progress)
 {
 	return NULL;
 }
@@ -2179,6 +2230,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long pages_reclaimed = 0;
 	unsigned long did_some_progress;
 	bool sync_migration = false;
+	bool deferred_compaction = false;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -2259,12 +2311,22 @@ rebalance:
 					zonelist, high_zoneidx,
 					nodemask,
 					alloc_flags, preferred_zone,
-					migratetype, &did_some_progress,
-					sync_migration);
+					migratetype, sync_migration,
+					&deferred_compaction,
+					&did_some_progress);
 	if (page)
 		goto got_pg;
 	sync_migration = true;
 
+	/*
+	 * If compaction is deferred for high-order allocations, it is because
+	 * sync compaction recently failed. In this is the case and the caller
+	 * has requested the system not be heavily disrupted, fail the
+	 * allocation now instead of entering direct reclaim
+	 */
+	if (deferred_compaction && (gfp_mask & __GFP_NO_KSWAPD))
+		goto nopage;
+
 	/* Try direct reclaim and then allocating */
 	page = __alloc_pages_direct_reclaim(gfp_mask, order,
 					zonelist, high_zoneidx,
@@ -2282,6 +2344,10 @@ rebalance:
 	if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 		if (oom_killer_disabled)
 			goto nopage;
+		/* Coredumps can quickly deplete all memory reserves */
+		if ((current->flags & PF_DUMPCORE) &&
+		    !(gfp_mask & __GFP_NOFAIL))
+			goto nopage;
 		page = __alloc_pages_may_oom(gfp_mask, order,
 				zonelist, high_zoneidx,
 				nodemask, preferred_zone,
@@ -2328,8 +2394,9 @@ rebalance:
 					zonelist, high_zoneidx,
 					nodemask,
 					alloc_flags, preferred_zone,
-					migratetype, &did_some_progress,
-					sync_migration);
+					migratetype, sync_migration,
+					&deferred_compaction,
+					&did_some_progress);
 		if (page)
 			goto got_pg;
 	}
@@ -2353,8 +2420,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 {
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
 	struct zone *preferred_zone;
-	struct page *page;
+	struct page *page = NULL;
 	int migratetype = allocflags_to_migratetype(gfp_mask);
+	unsigned int cpuset_mems_cookie;
 
 	gfp_mask &= gfp_allowed_mask;
 
@@ -2373,15 +2441,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
-	get_mems_allowed();
+retry_cpuset:
+	cpuset_mems_cookie = get_mems_allowed();
+
 	/* The preferred zone is used for statistics later */
 	first_zones_zonelist(zonelist, high_zoneidx,
 				nodemask ? : &cpuset_current_mems_allowed,
 				&preferred_zone);
-	if (!preferred_zone) {
-		put_mems_allowed();
-		return NULL;
-	}
+	if (!preferred_zone)
+		goto out;
 
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
@@ -2391,9 +2459,19 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 		page = __alloc_pages_slowpath(gfp_mask, order,
 				zonelist, high_zoneidx, nodemask,
 				preferred_zone, migratetype);
-	put_mems_allowed();
 
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+
+out:
+	/*
+	 * When updating a task's mems_allowed, it is possible to race with
+	 * parallel threads in such a way that an allocation can fail while
+	 * the mask is being updated. If a page allocation is about to fail,
+	 * check if the cpuset changed during allocation and if so, retry.
+	 */
+	if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page))
+		goto retry_cpuset;
+
 	return page;
 }
 EXPORT_SYMBOL(__alloc_pages_nodemask);
@@ -2607,13 +2685,15 @@ void si_meminfo_node(struct sysinfo *val, int nid)
 bool skip_free_areas_node(unsigned int flags, int nid)
 {
 	bool ret = false;
+	unsigned int cpuset_mems_cookie;
 
 	if (!(flags & SHOW_MEM_FILTER_NODES))
 		goto out;
 
-	get_mems_allowed();
-	ret = !node_isset(nid, cpuset_current_mems_allowed);
-	put_mems_allowed();
+	do {
+		cpuset_mems_cookie = get_mems_allowed();
+		ret = !node_isset(nid, cpuset_current_mems_allowed);
+	} while (!put_mems_allowed(cpuset_mems_cookie));
 out:
 	return ret;
 }
@@ -3900,18 +3980,6 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
 	}
 }
 
-int __init add_from_early_node_map(struct range *range, int az,
-				   int nr_range, int nid)
-{
-	unsigned long start_pfn, end_pfn;
-	int i;
-
-	/* need to go over early_node_map to find out good range for node */
-	for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL)
-		nr_range = add_range(range, az, nr_range, start_pfn, end_pfn);
-	return nr_range;
-}
-
 /**
  * sparse_memory_present_with_active_regions - Call memory_present for each active range
  * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
@@ -4237,7 +4305,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 	for (j = 0; j < MAX_NR_ZONES; j++) {
 		struct zone *zone = pgdat->node_zones + j;
 		unsigned long size, realsize, memmap_pages;
-		enum lru_list l;
+		enum lru_list lru;
 
 		size = zone_spanned_pages_in_node(nid, j, zones_size);
 		realsize = size - zone_absent_pages_in_node(nid, j,
@@ -4287,8 +4355,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
 		zone->zone_pgdat = pgdat;
 
 		zone_pcp_init(zone);
-		for_each_lru(l)
-			INIT_LIST_HEAD(&zone->lru[l].list);
+		for_each_lru(lru)
+			INIT_LIST_HEAD(&zone->lruvec.lists[lru]);
 		zone->reclaim_stat.recent_rotated[0] = 0;
 		zone->reclaim_stat.recent_rotated[1] = 0;
 		zone->reclaim_stat.recent_scanned[0] = 0;
@@ -4496,7 +4564,7 @@ static unsigned long __init early_calculate_totalpages(void)
 * memory. When they don't, some nodes will have more kernelcore than
 * others
 */
-static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn)
+static void __init find_zone_movable_pfns_for_nodes(void)
 {
 	int i, nid;
 	unsigned long usable_startpfn;
@@ -4642,8 +4710,10 @@ static void check_for_regular_memory(pg_data_t *pgdat)
 
 	for (zone_type = 0; zone_type <= ZONE_NORMAL; zone_type++) {
 		struct zone *zone = &pgdat->node_zones[zone_type];
-		if (zone->present_pages)
+		if (zone->present_pages) {
 			node_set_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+			break;
+		}
 	}
 #endif
 }
@@ -4686,7 +4756,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
 
 	/* Find the PFNs that ZONE_MOVABLE begins at in each node */
 	memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
-	find_zone_movable_pfns_for_nodes(zone_movable_pfn);
+	find_zone_movable_pfns_for_nodes();
 
 	/* Print out the zone ranges */
 	printk("Zone PFN ranges:\n");
@@ -4796,6 +4866,7 @@ static int page_alloc_cpu_notify(struct notifier_block *self,
 	int cpu = (unsigned long)hcpu;
 
 	if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
+		lru_add_drain_cpu(cpu);
 		drain_pages(cpu);
 
 		/*
@@ -5209,6 +5280,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 		max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
 		do_div(max, bucketsize);
 	}
+	max = min(max, 0x80000000ULL);
 
 	if (numentries > max)
 		numentries = max;
@@ -5386,7 +5458,25 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count)
 
 bool is_pageblock_removable_nolock(struct page *page)
 {
-	struct zone *zone = page_zone(page);
+	struct zone *zone;
+	unsigned long pfn;
+
+	/*
+	 * We have to be careful here because we are iterating over memory
+	 * sections which are not zone aware so we might end up outside of
+	 * the zone but still within the section.
+	 * We have to take care about the node as well. If the node is offline
+	 * its NODE_DATA will be NULL - see page_zone.
+	 */
+	if (!node_online(page_to_nid(page)))
+		return false;
+
+	zone = page_zone(page);
+	pfn = page_to_pfn(page);
+	if (zone->zone_start_pfn > pfn ||
+			zone->zone_start_pfn + zone->spanned_pages <= pfn)
+		return false;
+
 	return __count_immobile_pages(zone, page, 0);
 }
 