aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichal Hocko <mhocko@suse.com>2018-01-31 19:20:48 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2018-01-31 20:18:40 -0500
commitab5ac90aecf5685eb630c42c396f5f14726b0afd (patch)
treea10a1a6a3ea2bb742c1cae7d89e472bcbec25742
parentd9cc948f6fa1c3384037f500e0acd35f03850d15 (diff)
mm, hugetlb: do not rely on overcommit limit during migration
hugepage migration relies on __alloc_buddy_huge_page to get a new page. This has 2 main disadvantages. 1) it doesn't allow to migrate any huge page if the pool is used completely which is not an exceptional case as the pool is static and unused memory is just wasted. 2) it leads to a weird semantic when migration between two numa nodes might increase the pool size of the destination NUMA node while the page is in use. The issue is caused by per NUMA node surplus pages tracking (see free_huge_page). Address both issues by changing the way how we allocate and account pages allocated for migration. Those should be temporal by definition. So we mark them that way (we will abuse page flags in the 3rd page) and update free_huge_page to free such pages to the page allocator. Page migration path then just transfers the temporal status from the new page to the old one which will be freed on the last reference. The global surplus count will never change during this path but we still have to be careful when migrating a per-node surplus page. This is now handled in move_hugetlb_state which is called from the migration path and it copies the hugetlb specific page state and fixes up the accounting when needed. Rename __alloc_buddy_huge_page to __alloc_surplus_huge_page to better reflect its purpose. The new allocation routine for the migration path is __alloc_migrate_huge_page.
The user visible effect of this patch is that migrated pages are really temporal and they travel between NUMA nodes as per the migration request: Before migration /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:1 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0 After /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:1 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0 with the previous implementation, both nodes would have nr_hugepages:1 until the page is freed. Link: http://lkml.kernel.org/r/20180103093213.26329-4-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com> Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com> Cc: Andrea Reale <ar@linux.vnet.ibm.com> Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com> Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Vlastimil Babka <vbabka@suse.cz> Cc: Zi Yan <zi.yan@cs.rutgers.edu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--include/linux/hugetlb.h3
-rw-r--r--mm/hugetlb.c111
-rw-r--r--mm/migrate.c3
3 files changed, 99 insertions, 18 deletions
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 944e6e8bd572..66992348531e 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
119 long freed); 119 long freed);
120bool isolate_huge_page(struct page *page, struct list_head *list); 120bool isolate_huge_page(struct page *page, struct list_head *list);
121void putback_active_hugepage(struct page *page); 121void putback_active_hugepage(struct page *page);
122void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason);
122void free_huge_page(struct page *page); 123void free_huge_page(struct page *page);
123void hugetlb_fix_reserve_counts(struct inode *inode); 124void hugetlb_fix_reserve_counts(struct inode *inode);
124extern struct mutex *hugetlb_fault_mutex_table; 125extern struct mutex *hugetlb_fault_mutex_table;
@@ -157,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
157 unsigned long address, unsigned long end, pgprot_t newprot); 158 unsigned long address, unsigned long end, pgprot_t newprot);
158 159
159bool is_hugetlb_entry_migration(pte_t pte); 160bool is_hugetlb_entry_migration(pte_t pte);
161
160#else /* !CONFIG_HUGETLB_PAGE */ 162#else /* !CONFIG_HUGETLB_PAGE */
161 163
162static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) 164static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
@@ -197,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list)
197 return false; 199 return false;
198} 200}
199#define putback_active_hugepage(p) do {} while (0) 201#define putback_active_hugepage(p) do {} while (0)
202#define move_hugetlb_state(old, new, reason) do {} while (0)
200 203
201static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, 204static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
202 unsigned long address, unsigned long end, pgprot_t newprot) 205 unsigned long address, unsigned long end, pgprot_t newprot)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 360765156c7c..f260ffa26363 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -34,6 +34,7 @@
34#include <linux/hugetlb_cgroup.h> 34#include <linux/hugetlb_cgroup.h>
35#include <linux/node.h> 35#include <linux/node.h>
36#include <linux/userfaultfd_k.h> 36#include <linux/userfaultfd_k.h>
37#include <linux/page_owner.h>
37#include "internal.h" 38#include "internal.h"
38 39
39int hugetlb_max_hstate __read_mostly; 40int hugetlb_max_hstate __read_mostly;
@@ -1219,6 +1220,28 @@ static void clear_page_huge_active(struct page *page)
1219 ClearPagePrivate(&page[1]); 1220 ClearPagePrivate(&page[1]);
1220} 1221}
1221 1222
1223/*
1224 * Internal hugetlb specific page flag. Do not use outside of the hugetlb
1225 * code
1226 */
1227static inline bool PageHugeTemporary(struct page *page)
1228{
1229 if (!PageHuge(page))
1230 return false;
1231
1232 return (unsigned long)page[2].mapping == -1U;
1233}
1234
1235static inline void SetPageHugeTemporary(struct page *page)
1236{
1237 page[2].mapping = (void *)-1U;
1238}
1239
1240static inline void ClearPageHugeTemporary(struct page *page)
1241{
1242 page[2].mapping = NULL;
1243}
1244
1222void free_huge_page(struct page *page) 1245void free_huge_page(struct page *page)
1223{ 1246{
1224 /* 1247 /*
@@ -1253,7 +1276,11 @@ void free_huge_page(struct page *page)
1253 if (restore_reserve) 1276 if (restore_reserve)
1254 h->resv_huge_pages++; 1277 h->resv_huge_pages++;
1255 1278
1256 if (h->surplus_huge_pages_node[nid]) { 1279 if (PageHugeTemporary(page)) {
1280 list_del(&page->lru);
1281 ClearPageHugeTemporary(page);
1282 update_and_free_page(h, page);
1283 } else if (h->surplus_huge_pages_node[nid]) {
1257 /* remove the page from active list */ 1284 /* remove the page from active list */
1258 list_del(&page->lru); 1285 list_del(&page->lru);
1259 update_and_free_page(h, page); 1286 update_and_free_page(h, page);
@@ -1507,7 +1534,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
1507 return rc; 1534 return rc;
1508} 1535}
1509 1536
1510static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, 1537/*
1538 * Allocates a fresh surplus page from the page allocator.
1539 */
1540static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
1511 int nid, nodemask_t *nmask) 1541 int nid, nodemask_t *nmask)
1512{ 1542{
1513 struct page *page; 1543 struct page *page;
@@ -1571,6 +1601,28 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask,
1571 return page; 1601 return page;
1572} 1602}
1573 1603
1604static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
1605 int nid, nodemask_t *nmask)
1606{
1607 struct page *page;
1608
1609 if (hstate_is_gigantic(h))
1610 return NULL;
1611
1612 page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask);
1613 if (!page)
1614 return NULL;
1615
1616 /*
1617 * We do not account these pages as surplus because they are only
1618 * temporary and will be released properly on the last reference
1619 */
1620 prep_new_huge_page(h, page, page_to_nid(page));
1621 SetPageHugeTemporary(page);
1622
1623 return page;
1624}
1625
1574/* 1626/*
1575 * Use the VMA's mpolicy to allocate a huge page from the buddy. 1627 * Use the VMA's mpolicy to allocate a huge page from the buddy.
1576 */ 1628 */
@@ -1585,17 +1637,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h,
1585 nodemask_t *nodemask; 1637 nodemask_t *nodemask;
1586 1638
1587 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); 1639 nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
1588 page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask); 1640 page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
1589 mpol_cond_put(mpol); 1641 mpol_cond_put(mpol);
1590 1642
1591 return page; 1643 return page;
1592} 1644}
1593 1645
1594/* 1646/* page migration callback function */
1595 * This allocation function is useful in the context where vma is irrelevant.
1596 * E.g. soft-offlining uses this function because it only cares physical
1597 * address of error page.
1598 */
1599struct page *alloc_huge_page_node(struct hstate *h, int nid) 1647struct page *alloc_huge_page_node(struct hstate *h, int nid)
1600{ 1648{
1601 gfp_t gfp_mask = htlb_alloc_mask(h); 1649 gfp_t gfp_mask = htlb_alloc_mask(h);
@@ -1610,12 +1658,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid)
1610 spin_unlock(&hugetlb_lock); 1658 spin_unlock(&hugetlb_lock);
1611 1659
1612 if (!page) 1660 if (!page)
1613 page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL); 1661 page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
1614 1662
1615 return page; 1663 return page;
1616} 1664}
1617 1665
1618 1666/* page migration callback function */
1619struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, 1667struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
1620 nodemask_t *nmask) 1668 nodemask_t *nmask)
1621{ 1669{
@@ -1633,9 +1681,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
1633 } 1681 }
1634 spin_unlock(&hugetlb_lock); 1682 spin_unlock(&hugetlb_lock);
1635 1683
1636 /* No reservations, try to overcommit */ 1684 return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
1637
1638 return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask);
1639} 1685}
1640 1686
1641/* 1687/*
@@ -1663,7 +1709,7 @@ static int gather_surplus_pages(struct hstate *h, int delta)
1663retry: 1709retry:
1664 spin_unlock(&hugetlb_lock); 1710 spin_unlock(&hugetlb_lock);
1665 for (i = 0; i < needed; i++) { 1711 for (i = 0; i < needed; i++) {
1666 page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h), 1712 page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h),
1667 NUMA_NO_NODE, NULL); 1713 NUMA_NO_NODE, NULL);
1668 if (!page) { 1714 if (!page) {
1669 alloc_ok = false; 1715 alloc_ok = false;
@@ -2260,7 +2306,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
2260 * First take pages out of surplus state. Then make up the 2306 * First take pages out of surplus state. Then make up the
2261 * remaining difference by allocating fresh huge pages. 2307 * remaining difference by allocating fresh huge pages.
2262 * 2308 *
2263 * We might race with __alloc_buddy_huge_page() here and be unable 2309 * We might race with __alloc_surplus_huge_page() here and be unable
2264 * to convert a surplus huge page to a normal huge page. That is 2310 * to convert a surplus huge page to a normal huge page. That is
2265 * not critical, though, it just means the overall size of the 2311 * not critical, though, it just means the overall size of the
2266 * pool might be one hugepage larger than it needs to be, but 2312 * pool might be one hugepage larger than it needs to be, but
@@ -2303,7 +2349,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
2303 * By placing pages into the surplus state independent of the 2349 * By placing pages into the surplus state independent of the
2304 * overcommit value, we are allowing the surplus pool size to 2350 * overcommit value, we are allowing the surplus pool size to
2305 * exceed overcommit. There are few sane options here. Since 2351 * exceed overcommit. There are few sane options here. Since
2306 * __alloc_buddy_huge_page() is checking the global counter, 2352 * __alloc_surplus_huge_page() is checking the global counter,
2307 * though, we'll note that we're not allowed to exceed surplus 2353 * though, we'll note that we're not allowed to exceed surplus
2308 * and won't grow the pool anywhere else. Not until one of the 2354 * and won't grow the pool anywhere else. Not until one of the
2309 * sysctls are changed, or the surplus pages go out of use. 2355 * sysctls are changed, or the surplus pages go out of use.
@@ -4779,3 +4825,36 @@ void putback_active_hugepage(struct page *page)
4779 spin_unlock(&hugetlb_lock); 4825 spin_unlock(&hugetlb_lock);
4780 put_page(page); 4826 put_page(page);
4781} 4827}
4828
4829void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
4830{
4831 struct hstate *h = page_hstate(oldpage);
4832
4833 hugetlb_cgroup_migrate(oldpage, newpage);
4834 set_page_owner_migrate_reason(newpage, reason);
4835
4836 /*
4837 * transfer temporary state of the new huge page. This is
4838 * reverse to other transitions because the newpage is going to
4839 * be final while the old one will be freed so it takes over
4840 * the temporary status.
4841 *
4842 * Also note that we have to transfer the per-node surplus state
4843 * here as well otherwise the global surplus count will not match
4844 * the per-node's.
4845 */
4846 if (PageHugeTemporary(newpage)) {
4847 int old_nid = page_to_nid(oldpage);
4848 int new_nid = page_to_nid(newpage);
4849
4850 SetPageHugeTemporary(oldpage);
4851 ClearPageHugeTemporary(newpage);
4852
4853 spin_lock(&hugetlb_lock);
4854 if (h->surplus_huge_pages_node[old_nid]) {
4855 h->surplus_huge_pages_node[old_nid]--;
4856 h->surplus_huge_pages_node[new_nid]++;
4857 }
4858 spin_unlock(&hugetlb_lock);
4859 }
4860}
diff --git a/mm/migrate.c b/mm/migrate.c
index 4d0be47a322a..1e5525a25691 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1323,9 +1323,8 @@ put_anon:
1323 put_anon_vma(anon_vma); 1323 put_anon_vma(anon_vma);
1324 1324
1325 if (rc == MIGRATEPAGE_SUCCESS) { 1325 if (rc == MIGRATEPAGE_SUCCESS) {
1326 hugetlb_cgroup_migrate(hpage, new_hpage); 1326 move_hugetlb_state(hpage, new_hpage, reason);
1327 put_new_page = NULL; 1327 put_new_page = NULL;
1328 set_page_owner_migrate_reason(new_hpage, reason);
1329 } 1328 }
1330 1329
1331 unlock_page(hpage); 1330 unlock_page(hpage);