aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichal Hocko <mhocko@suse.com>2018-01-31 19:20:48 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2018-01-31 20:18:40 -0500
commitab5ac90aecf5685eb630c42c396f5f14726b0afd (patch)
treea10a1a6a3ea2bb742c1cae7d89e472bcbec25742
parentd9cc948f6fa1c3384037f500e0acd35f03850d15 (diff)
mm, hugetlb: do not rely on overcommit limit during migration
hugepage migration relies on __alloc_buddy_huge_page to get a new page. This has 2 main disadvantages. 1) it doesn't allow to migrate any huge page if the pool is used completely which is not an exceptional case as the pool is static and unused memory is just wasted. 2) it leads to a weird semantic when migration between two numa nodes might increase the pool size of the destination NUMA node while the page is in use. The issue is caused by per NUMA node surplus pages tracking (see free_huge_page). Address both issues by changing the way how we allocate and account pages allocated for migration. Those should be temporal by definition. So we mark them that way (we will abuse page flags in the 3rd page) and update free_huge_page to free such pages to the page allocator. Page migration path then just transfers the temporal status from the new page to the old one which will be freed on the last reference. The global surplus count will never change during this path but we still have to be careful when migrating a per-node surplus page. This is now handled in move_hugetlb_state which is called from the migration path and it copies the hugetlb specific page state and fixes up the accounting when needed. Rename __alloc_buddy_huge_page to __alloc_surplus_huge_page to better reflect its purpose. The new allocation routine for the migration path is __alloc_migrate_huge_page.
The user visible effect of this patch is that migrated pages are really temporal and they travel between NUMA nodes as per the migration request: Before migration /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:1 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0 After /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:1 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0 with the previous implementation, both nodes would have nr_hugepages:1 until the page is freed. Link: http://lkml.kernel.org/r/20180103093213.26329-4-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Andrea Reale <ar@linux.vnet.ibm.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Zi Yan <zi.yan@cs.rutgers.edu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/hugetlb.h3
-rw-r--r--mm/hugetlb.c111
-rw-r--r--mm/migrate.c3
3 files changed, 99 insertions, 18 deletions
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 944e6e8bd572..66992348531e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
119 long freed); 119 long freed);
120bool isolate_huge_page(struct page *page, struct list_head *list); 120bool isolate_huge_page(struct page *page, struct list_head *list);
121void putback_active_hugepage(struct page *page); 121void putback_active_hugepage(struct page *page);
122void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
122void free_huge_page(struct page *page); 123void free_huge_page(struct page *page);
123void hugetlb_fix_reserve_counts(struct inode *inode); 124void hugetlb_fix_reserve_counts(struct inode *inode);
124extern struct mutex *hugetlb_fault_mutex_table; 125extern struct mutex *hugetlb_fault_mutex_table;
@@ -157,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
157 unsigned long address, unsigned long end, pgprot_t newprot); 158 unsigned long address, unsigned long end, pgprot_t newprot);
158 159
159bool is_hugetlb_entry_migration(pte_t pte); 160bool is_hugetlb_entry_migration(pte_t pte);
161
160#else /* !CONFIG_HUGETLB_PAGE */ 162#else /* !CONFIG_HUGETLB_PAGE */
161 163
162static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 164static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
@@ -197,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
197 return false; 199 return false;
198} 200}
199#define putback_active_hugepage(p) do {} while (0) 201#define putback_active_hugepage(p) do {} while (0)
202#define move_hugetlb_state(old, new, reason) do {} while (0)
200 203
201static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 204static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
202 unsigned long address, unsigned long end, pgprot_t newprot) 205 unsigned long address, unsigned long end, pgprot_t newprot)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 360765156c7c..f260ffa26363 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -34,6 +34,7 @@
34#include <linux/hugetlb_cgroup.h> 34#include <linux/hugetlb_cgroup.h>
35#include <linux/node.h> 35#include <linux/node.h>
36#include <linux/userfaultfd_k.h> 36#include <linux/userfaultfd_k.h>
37#include <linux/page_owner.h>
37#include "internal.h" 38#include "internal.h"
38 39
39int hugetlb_max_hstate __read_mostly; 40int hugetlb_max_hstate __read_mostly;
@@ -1219,6 +1220,28 @@ static void clear_page_huge_active(struct page *page)
1219 ClearPagePrivate(&page[1]); 1220 ClearPagePrivate(&page[1]);
1220} 1221}
1221 1222
1223/*
1224 * Internal hugetlb specific page flag. Do not use outside of the hugetlb
1225 * code
1226 */
1227static inline bool PageHugeTemporary(struct page *page)
1228{
1229 if (!PageHuge(page))
1230 return false;
1231
1232 return (unsigned long)page[2].mapping == -1U;
1233}
1234
1235static inline void SetPageHugeTemporary(struct page *page)
1236{
1237 page[2].mapping = (void *)-1U;
1238}
1239
1240static inline void ClearPageHugeTemporary(struct page *page)
1241{
1242 page[2].mapping = NULL;
1243}
1244
1222void free_huge_page(struct page *page) 1245void free_huge_page(struct page *page)
1223{ 1246{
1224 /* 1247 /*
@@ -1253,7 +1276,11 @@ void free_huge_page(struct page *page)
1253 if (restore_reserve) 1276 if (restore_reserve)
1254 h->resv_huge_pages++; 1277 h->resv_huge_pages++;
1255 1278
1256 if (h->surplus_huge_pages_node[nid]) { 1279 if (PageHugeTemporary(page)) {
1280 list_del(&page->lru);
1281 ClearPageHugeTemporary(page);
1282 update_and_free_page(h, page);
1283 } else if (h->surplus_huge_pages_node[nid]) {
1257 /* remove the page from active list */ 1284 /* remove the page from active list */
1258 list_del(&page->lru); 1285 list_del(&page->lru);
1259 update_and_free_page(h, page); 1286 update_and_free_page(h, page);
@@ -1507,7 +1534,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1507 return rc; 1534 return rc;
1508} 1535}
1509 1536
1510static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, 1537/*
1538 * Allocates a fresh surplus page from the page allocator.
1539 */
1540static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
1511 int nid, nodemask_t *nmask) 1541 int nid, nodemask_t *nmask)
1512{ 1542{
1513 struct page *page; 1543 struct page *page;
@@ -1571,6 +1601,28 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
1571 return page; 1601 return page;
1572} 1602}
1573 1603
1604static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
1605 int nid, nodemask_t *nmask)
1606{
1607 struct page *page;
1608
1609 if (hstate_is_gigantic(h))
1610 return NULL;
1611
1612 page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
1613 if (!page)
1614 return NULL;
1615
1616 /*
1617 * We do not account these pages as surplus because they are only
1618 * temporary and will be released properly on the last reference
1619 */
1620 prep_new_huge_page(h, page, page_to_nid(page));
1621 SetPageHugeTemporary(page);
1622
1623 return page;
1624}
1625
1574/* 1626/*
1575 * Use the VMA's mpolicy to allocate a huge page from the buddy. 1627 * Use the VMA's mpolicy to allocate a huge page from the buddy.
1576 */ 1628 */
@@ -1585,17 +1637,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
1585 nodemask_t *nodemask; 1637 nodemask_t *nodemask;
1586 1638
1587 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); 1639 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
1588 page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask); 1640 page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
1589 mpol_cond_put(mpol); 1641 mpol_cond_put(mpol);
1590 1642
1591 return page; 1643 return page;
1592} 1644}
1593 1645
1594/* 1646/* page migration callback function */
1595 * This allocation function is useful in the context where vma is irrelevant.
1596 * E.g. soft-offlining uses this function because it only cares physical
1597 * address of error page.
1598 */
1599struct page *alloc_huge_page_node(struct hstate *h, int nid) 1647struct page *alloc_huge_page_node(struct hstate *h, int nid)
1600{ 1648{
1601 gfp_t gfp_mask = htlb_alloc_mask(h); 1649 gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1610,12 +1658,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
1610 spin_unlock(&hugetlb_lock); 1658 spin_unlock(&hugetlb_lock);
1611 1659
1612 if (!page) 1660 if (!page)
1613 page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL); 1661 page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
1614 1662
1615 return page; 1663 return page;
1616} 1664}
1617 1665
1618 1666/* page migration callback function */
1619struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, 1667struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
1620 nodemask_t *nmask) 1668 nodemask_t *nmask)
1621{ 1669{
@@ -1633,9 +1681,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
1633 } 1681 }
1634 spin_unlock(&hugetlb_lock); 1682 spin_unlock(&hugetlb_lock);
1635 1683
1636 /* No reservations, try to overcommit */ 1684 return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
1637
1638 return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
1639} 1685}
1640 1686
1641/* 1687/*
@@ -1663,7 +1709,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
1663retry: 1709retry:
1664 spin_unlock(&hugetlb_lock); 1710 spin_unlock(&hugetlb_lock);
1665 for (i = 0; i < needed; i++) { 1711 for (i = 0; i < needed; i++) {
1666 page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h), 1712 page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h),
1667 NUMA_NO_NODE, NULL); 1713 NUMA_NO_NODE, NULL);
1668 if (!page) { 1714 if (!page) {
1669 alloc_ok = false; 1715 alloc_ok = false;
@@ -2260,7 +2306,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
2260 * First take pages out of surplus state. Then make up the 2306 * First take pages out of surplus state. Then make up the
2261 * remaining difference by allocating fresh huge pages. 2307 * remaining difference by allocating fresh huge pages.
2262 * 2308 *
2263 * We might race with __alloc_buddy_huge_page() here and be unable 2309 * We might race with __alloc_surplus_huge_page() here and be unable
2264 * to convert a surplus huge page to a normal huge page. That is 2310 * to convert a surplus huge page to a normal huge page. That is
2265 * not critical, though, it just means the overall size of the 2311 * not critical, though, it just means the overall size of the
2266 * pool might be one hugepage larger than it needs to be, but 2312 * pool might be one hugepage larger than it needs to be, but
@@ -2303,7 +2349,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
2303 * By placing pages into the surplus state independent of the 2349 * By placing pages into the surplus state independent of the
2304 * overcommit value, we are allowing the surplus pool size to 2350 * overcommit value, we are allowing the surplus pool size to
2305 * exceed overcommit. There are few sane options here. Since 2351 * exceed overcommit. There are few sane options here. Since
2306 * __alloc_buddy_huge_page() is checking the global counter, 2352 * __alloc_surplus_huge_page() is checking the global counter,
2307 * though, we'll note that we're not allowed to exceed surplus 2353 * though, we'll note that we're not allowed to exceed surplus
2308 * and won't grow the pool anywhere else. Not until one of the 2354 * and won't grow the pool anywhere else. Not until one of the
2309 * sysctls are changed, or the surplus pages go out of use. 2355 * sysctls are changed, or the surplus pages go out of use.
@@ -4779,3 +4825,36 @@ void putback_active_hugepage(struct page *page)
4779 spin_unlock(&hugetlb_lock); 4825 spin_unlock(&hugetlb_lock);
4780 put_page(page); 4826 put_page(page);
4781} 4827}
4828
4829void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
4830{
4831 struct hstate *h = page_hstate(oldpage);
4832
4833 hugetlb_cgroup_migrate(oldpage, newpage);
4834 set_page_owner_migrate_reason(newpage, reason);
4835
4836 /*
4837 * transfer temporary state of the new huge page. This is
4838 * reverse to other transitions because the newpage is going to
4839 * be final while the old one will be freed so it takes over
4840 * the temporary status.
4841 *
4842 * Also note that we have to transfer the per-node surplus state
4843 * here as well otherwise the global surplus count will not match
4844 * the per-node's.
4845 */
4846 if (PageHugeTemporary(newpage)) {
4847 int old_nid = page_to_nid(oldpage);
4848 int new_nid = page_to_nid(newpage);
4849
4850 SetPageHugeTemporary(oldpage);
4851 ClearPageHugeTemporary(newpage);
4852
4853 spin_lock(&hugetlb_lock);
4854 if (h->surplus_huge_pages_node[old_nid]) {
4855 h->surplus_huge_pages_node[old_nid]--;
4856 h->surplus_huge_pages_node[new_nid]++;
4857 }
4858 spin_unlock(&hugetlb_lock);
4859 }
4860}
diff --git a/mm/migrate.c b/mm/migrate.c
index 4d0be47a322a..1e5525a25691 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1323,9 +1323,8 @@ put_anon:
1323 put_anon_vma(anon_vma); 1323 put_anon_vma(anon_vma);
1324 1324
1325 if (rc == MIGRATEPAGE_SUCCESS) { 1325 if (rc == MIGRATEPAGE_SUCCESS) {
1326 hugetlb_cgroup_migrate(hpage, new_hpage); 1326 move_hugetlb_state(hpage, new_hpage, reason);
1327 put_new_page = NULL; 1327 put_new_page = NULL;
1328 set_page_owner_migrate_reason(new_hpage, reason);
1329 } 1328 }
1330 1329
1331 unlock_page(hpage); 1330 unlock_page(hpage);