path: root/mm/page_alloc.c
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--	mm/page_alloc.c	128
1 file changed, 101 insertions, 27 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2bc2ac63f41e..8deb9d0fd5b1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,6 +48,7 @@
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
+#include <linux/memory.h>
 #include <trace/events/kmem.h>
 
 #include <asm/tlbflush.h>
@@ -486,7 +487,6 @@ static inline void __free_one_page(struct page *page,
 	zone->free_area[order].nr_free++;
 }
 
-#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
 /*
  * free_page_mlock() -- clean up attempts to free and mlocked() page.
  * Page should not be on lru, so no need to fix that up.
@@ -497,9 +497,6 @@ static inline void free_page_mlock(struct page *page)
 	__dec_zone_page_state(page, NR_MLOCK);
 	__count_vm_event(UNEVICTABLE_MLOCKFREED);
 }
-#else
-static void free_page_mlock(struct page *page) { }
-#endif
 
 static inline int free_pages_check(struct page *page)
 {
@@ -559,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			page = list_entry(list->prev, struct page, lru);
 			/* must delete as __free_one_page list manipulates */
 			list_del(&page->lru);
-			__free_one_page(page, zone, 0, migratetype);
-			trace_mm_page_pcpu_drain(page, 0, migratetype);
+			/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
+			__free_one_page(page, zone, 0, page_private(page));
+			trace_mm_page_pcpu_drain(page, 0, page_private(page));
 		} while (--count && --batch_free && !list_empty(list));
 	}
 	spin_unlock(&zone->lock);
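
The one-line comment added in this hunk carries the rationale for the fix: the per-CPU MIGRATE_MOVABLE list can hold pages whose pageblock is really MIGRATE_RESERVE, so the drain has to free each page by the migratetype recorded in page->private when it was queued, not by the list it happens to sit on. Below is a minimal stand-alone model of that idea; the names (pcp_add, drain_list, free_area) are invented for illustration and none of the real kernel structures are used.

#include <stdio.h>

enum { MIGRATE_MOVABLE, MIGRATE_RESERVE, NR_TYPES };

/* toy "page": private remembers the migratetype recorded at free time */
struct page { int private; struct page *next; };

static struct page *pcp_list[NR_TYPES];	/* per-CPU free lists (toy) */
static int free_area[NR_TYPES];		/* buddy free counts per type (toy) */

static void pcp_add(struct page *p, int list_type, int real_type)
{
	p->private = real_type;		/* what the kernel keeps in page->private */
	p->next = pcp_list[list_type];
	pcp_list[list_type] = p;
}

/* stand-in for __free_one_page(page, zone, 0, migratetype) */
static void free_one_page(struct page *p, int migratetype)
{
	free_area[migratetype]++;
}

static void drain_list(int list_type)
{
	struct page *p;

	for (p = pcp_list[list_type]; p; p = p->next)
		free_one_page(p, p->private);	/* the fix: trust the page, not the list */
	pcp_list[list_type] = NULL;
}

int main(void)
{
	static struct page a, b;

	pcp_add(&a, MIGRATE_MOVABLE, MIGRATE_MOVABLE);
	pcp_add(&b, MIGRATE_MOVABLE, MIGRATE_RESERVE);	/* reserve page parked on the movable list */
	drain_list(MIGRATE_MOVABLE);
	printf("movable=%d reserve=%d\n",
	       free_area[MIGRATE_MOVABLE], free_area[MIGRATE_RESERVE]);	/* 1 and 1, not 2 and 0 */
	return 0;
}
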
@@ -1225,10 +1223,10 @@ again:
 		}
 		spin_lock_irqsave(&zone->lock, flags);
 		page = __rmqueue(zone, order, migratetype);
-		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
 		spin_unlock(&zone->lock);
 		if (!page)
 			goto failed;
+		__mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
 	}
 
 	__count_zone_vm_events(PGALLOC, zone, 1 << order);
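
The reordering in this hunk is an accounting fix: __mod_zone_page_state(zone, NR_FREE_PAGES, ...) used to run before the !page check, so a failed __rmqueue() still shrank the free-page counter. A small stand-alone illustration of the drift follows; alloc_old(), alloc_new() and rmqueue() are invented stand-ins, not kernel functions.

#include <stdbool.h>
#include <stdio.h>

static long nr_free_pages = 1024;			/* toy NR_FREE_PAGES counter */

static bool rmqueue(int order) { return false; }	/* pretend the free lists are empty */

/* old ordering: charge the counter before checking for failure */
static void alloc_old(int order)
{
	nr_free_pages -= 1L << order;
	if (!rmqueue(order))
		return;					/* failed, but the counter already moved */
}

/* new ordering: charge only after the allocation succeeded */
static void alloc_new(int order)
{
	if (!rmqueue(order))
		return;
	nr_free_pages -= 1L << order;
}

int main(void)
{
	alloc_new(3);
	printf("after failed alloc, fixed ordering: %ld\n", nr_free_pages);	/* 1024 */
	alloc_old(3);
	printf("after failed alloc, old ordering:   %ld\n", nr_free_pages);	/* 1016 */
	return 0;
}
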
@@ -1658,12 +1656,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	if (page)
 		goto out;
 
-	/* The OOM killer will not help higher order allocs */
-	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
-		goto out;
-
+	if (!(gfp_mask & __GFP_NOFAIL)) {
+		/* The OOM killer will not help higher order allocs */
+		if (order > PAGE_ALLOC_COSTLY_ORDER)
+			goto out;
+		/*
+		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
+		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
+		 * The caller should handle page allocation failure by itself if
+		 * it specifies __GFP_THISNODE.
+		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
+		 */
+		if (gfp_mask & __GFP_THISNODE)
+			goto out;
+	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(zonelist, gfp_mask, order);
+	out_of_memory(zonelist, gfp_mask, order, nodemask);
 
 out:
 	clear_zonelist_oom(zonelist, gfp_mask);
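
The new comment block explains the flag reasoning: the GFP_THISNODE composite includes __GFP_NORETRY, so allocations using it bail out of the slow path before ever reaching __alloc_pages_may_oom(); the check added here is for callers passing a bare __GFP_THISNODE, who are expected to cope with failure themselves instead of waking the OOM killer. The sketch below mirrors only the decision made inside this function; the bit values are illustrative rather than the real gfp.h encodings, and may_invoke_oom() is an invented name.

#include <stdbool.h>
#include <stdio.h>

/* illustrative bit values, not the real gfp.h encodings */
#define __GFP_THISNODE	0x01u
#define __GFP_NORETRY	0x02u
#define __GFP_NOWARN	0x04u
#define __GFP_NOFAIL	0x08u

/* the composite; on NUMA builds it carries __GFP_NORETRY and __GFP_NOWARN */
#define GFP_THISNODE	(__GFP_THISNODE | __GFP_NORETRY | __GFP_NOWARN)

#define PAGE_ALLOC_COSTLY_ORDER	3

/* mirrors only the check inside __alloc_pages_may_oom() after this patch */
static bool may_invoke_oom(unsigned int gfp_mask, int order)
{
	if (!(gfp_mask & __GFP_NOFAIL)) {
		/* the OOM killer will not help higher order allocs */
		if (order > PAGE_ALLOC_COSTLY_ORDER)
			return false;
		/* a node-pinned caller is expected to handle failure itself */
		if (gfp_mask & __GFP_THISNODE)
			return false;
	}
	return true;
}

int main(void)
{
	printf("bare __GFP_THISNODE, order 0:  %d\n", may_invoke_oom(__GFP_THISNODE, 0));	/* 0 */
	printf("plain allocation, order 0:     %d\n", may_invoke_oom(0, 0));			/* 1 */
	printf("order 4 without __GFP_NOFAIL:  %d\n", may_invoke_oom(0, 4));			/* 0 */
	return 0;
}
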
@@ -2395,13 +2403,14 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 {
 	char saved_string[NUMA_ZONELIST_ORDER_LEN];
 	int ret;
+	static DEFINE_MUTEX(zl_order_mutex);
 
+	mutex_lock(&zl_order_mutex);
 	if (write)
-		strncpy(saved_string, (char*)table->data,
-			NUMA_ZONELIST_ORDER_LEN);
+		strcpy(saved_string, (char*)table->data);
 	ret = proc_dostring(table, write, buffer, length, ppos);
 	if (ret)
-		return ret;
+		goto out;
 	if (write) {
 		int oldval = user_zonelist_order;
 		if (__parse_numa_zonelist_order((char*)table->data)) {
@@ -2414,7 +2423,9 @@ int numa_zonelist_order_handler(ctl_table *table, int write,
 		} else if (oldval != user_zonelist_order)
 			build_all_zonelists();
 	}
-	return 0;
+out:
+	mutex_unlock(&zl_order_mutex);
+	return ret;
 }
 
 
@@ -3127,7 +3138,7 @@ static int __cpuinit process_zones(int cpu)
 
 		if (percpu_pagelist_fraction)
 			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			 	(zone->present_pages / percpu_pagelist_fraction));
+				(zone->present_pages / percpu_pagelist_fraction));
 	}
 
 	return 0;
@@ -3573,7 +3584,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
  * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
  * then all holes in the requested range will be accounted for.
  */
-static unsigned long __meminit __absent_pages_in_range(int nid,
+unsigned long __meminit __absent_pages_in_range(int nid,
 				unsigned long range_start_pfn,
 				unsigned long range_end_pfn)
 {
@@ -3988,7 +3999,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn,
 	}
 
 	/* Merge backward if suitable */
-	if (start_pfn < early_node_map[i].end_pfn &&
+	if (start_pfn < early_node_map[i].start_pfn &&
 			end_pfn >= early_node_map[i].start_pfn) {
 		early_node_map[i].start_pfn = start_pfn;
 		return;
@@ -4102,7 +4113,7 @@ static int __init cmp_node_active_region(const void *a, const void *b)
 }
 
 /* sort the node_map by start_pfn */
-static void __init sort_node_map(void)
+void __init sort_node_map(void)
 {
 	sort(early_node_map, (size_t)nr_nodemap_entries,
 			sizeof(struct node_active_region),
@@ -5002,23 +5013,65 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags,
 int set_migratetype_isolate(struct page *page)
 {
 	struct zone *zone;
-	unsigned long flags;
+	struct page *curr_page;
+	unsigned long flags, pfn, iter;
+	unsigned long immobile = 0;
+	struct memory_isolate_notify arg;
+	int notifier_ret;
 	int ret = -EBUSY;
 	int zone_idx;
 
 	zone = page_zone(page);
 	zone_idx = zone_idx(zone);
+
 	spin_lock_irqsave(&zone->lock, flags);
+	if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE ||
+	    zone_idx == ZONE_MOVABLE) {
+		ret = 0;
+		goto out;
+	}
+
+	pfn = page_to_pfn(page);
+	arg.start_pfn = pfn;
+	arg.nr_pages = pageblock_nr_pages;
+	arg.pages_found = 0;
+
 	/*
-	 * In future, more migrate types will be able to be isolation target.
+	 * It may be possible to isolate a pageblock even if the
+	 * migratetype is not MIGRATE_MOVABLE. The memory isolation
+	 * notifier chain is used by balloon drivers to return the
+	 * number of pages in a range that are held by the balloon
+	 * driver to shrink memory. If all the pages are accounted for
+	 * by balloons, are free, or on the LRU, isolation can continue.
+	 * Later, for example, when memory hotplug notifier runs, these
+	 * pages reported as "can be isolated" should be isolated(freed)
+	 * by the balloon driver through the memory notifier chain.
 	 */
-	if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
-	    zone_idx != ZONE_MOVABLE)
+	notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg);
+	notifier_ret = notifier_to_errno(notifier_ret);
+	if (notifier_ret || !arg.pages_found)
 		goto out;
-	set_pageblock_migratetype(page, MIGRATE_ISOLATE);
-	move_freepages_block(zone, page, MIGRATE_ISOLATE);
-	ret = 0;
+
+	for (iter = pfn; iter < (pfn + pageblock_nr_pages); iter++) {
+		if (!pfn_valid_within(pfn))
+			continue;
+
+		curr_page = pfn_to_page(iter);
+		if (!page_count(curr_page) || PageLRU(curr_page))
+			continue;
+
+		immobile++;
+	}
+
+	if (arg.pages_found == immobile)
+		ret = 0;
+
 out:
+	if (!ret) {
+		set_pageblock_migratetype(page, MIGRATE_ISOLATE);
+		move_freepages_block(zone, page, MIGRATE_ISOLATE);
+	}
+
 	spin_unlock_irqrestore(&zone->lock, flags);
 	if (!ret)
 		drain_all_pages();
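
The new comment describes the contract of the notifier chain this function now consults: on MEM_ISOLATE_COUNT, registered drivers (typically memory balloons) report how many pages in the pageblock they hold, and isolation proceeds only if those reports plus the free/LRU pages found by the loop account for everything. Below is a hedged sketch of the driver side, assuming the register_memory_isolate_notifier()/struct memory_isolate_notify interface added alongside this change; balloon_owns_pfn() is an invented placeholder, not a real API.

#include <linux/memory.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/types.h>

/* placeholder: a real driver would look the pfn up in its own page lists */
static bool balloon_owns_pfn(unsigned long pfn)
{
	return false;
}

static int balloon_isolate_notify(struct notifier_block *nb,
				  unsigned long action, void *data)
{
	struct memory_isolate_notify *arg = data;
	unsigned long pfn;

	if (action != MEM_ISOLATE_COUNT)
		return NOTIFY_OK;

	/* count the pages in [start_pfn, start_pfn + nr_pages) held by the balloon */
	for (pfn = arg->start_pfn; pfn < arg->start_pfn + arg->nr_pages; pfn++)
		if (balloon_owns_pfn(pfn))
			arg->pages_found++;

	return NOTIFY_OK;
}

static struct notifier_block balloon_isolate_nb = {
	.notifier_call = balloon_isolate_notify,
};

static int __init balloon_isolate_sketch_init(void)
{
	return register_memory_isolate_notifier(&balloon_isolate_nb);
}
module_init(balloon_isolate_sketch_init);

static void __exit balloon_isolate_sketch_exit(void)
{
	unregister_memory_isolate_notifier(&balloon_isolate_nb);
}
module_exit(balloon_isolate_sketch_exit);

MODULE_LICENSE("GPL");
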
@@ -5085,3 +5138,24 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
 	spin_unlock_irqrestore(&zone->lock, flags);
 }
 #endif
+
+#ifdef CONFIG_MEMORY_FAILURE
+bool is_free_buddy_page(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+	unsigned long pfn = page_to_pfn(page);
+	unsigned long flags;
+	int order;
+
+	spin_lock_irqsave(&zone->lock, flags);
+	for (order = 0; order < MAX_ORDER; order++) {
+		struct page *page_head = page - (pfn & ((1 << order) - 1));
+
+		if (PageBuddy(page_head) && page_order(page_head) >= order)
+			break;
+	}
+	spin_unlock_irqrestore(&zone->lock, flags);
+
+	return order < MAX_ORDER;
+}
+#endif
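
is_free_buddy_page() gives the CONFIG_MEMORY_FAILURE (hwpoison) path a way to ask whether a page is already sitting inside a free buddy block: for each order it steps to the aligned head of the order-sized block containing the pfn and checks PageBuddy()/page_order() there. The head computation is just masking off the low pfn bits, as this small stand-alone demonstration shows; the pfn value is an arbitrary example.

#include <stdio.h>

/* head pfn of the order-aligned block that contains pfn */
static unsigned long buddy_head_pfn(unsigned long pfn, int order)
{
	return pfn - (pfn & ((1UL << order) - 1));
}

int main(void)
{
	unsigned long pfn = 0x12345;
	int order;

	for (order = 0; order <= 4; order++)
		printf("order %d: block head of pfn 0x%lx is 0x%lx\n",
		       order, pfn, buddy_head_pfn(pfn, order));
	return 0;
}
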