Diffstat (limited to 'mm/hugetlb.c')
 mm/hugetlb.c | 234
 1 file changed, 165 insertions(+), 69 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8874c8ad55aa..271e4432734c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -61,6 +61,9 @@ DEFINE_SPINLOCK(hugetlb_lock);
 static int num_fault_mutexes;
 static struct mutex *htlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
+/* Forward declaration */
+static int hugetlb_acct_memory(struct hstate *h, long delta);
+
 static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
 {
         bool free = (spool->count == 0) && (spool->used_hpages == 0);
@@ -68,23 +71,36 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
         spin_unlock(&spool->lock);
 
         /* If no pages are used, and no other handles to the subpool
-         * remain, free the subpool the subpool remain */
-        if (free)
+         * remain, give up any reservations based on minimum size and
+         * free the subpool */
+        if (free) {
+                if (spool->min_hpages != -1)
+                        hugetlb_acct_memory(spool->hstate,
+                                                -spool->min_hpages);
                 kfree(spool);
+        }
 }
 
-struct hugepage_subpool *hugepage_new_subpool(long nr_blocks)
+struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
+                                                long min_hpages)
 {
         struct hugepage_subpool *spool;
 
-        spool = kmalloc(sizeof(*spool), GFP_KERNEL);
+        spool = kzalloc(sizeof(*spool), GFP_KERNEL);
         if (!spool)
                 return NULL;
 
         spin_lock_init(&spool->lock);
         spool->count = 1;
-        spool->max_hpages = nr_blocks;
-        spool->used_hpages = 0;
+        spool->max_hpages = max_hpages;
+        spool->hstate = h;
+        spool->min_hpages = min_hpages;
+
+        if (min_hpages != -1 && hugetlb_acct_memory(h, min_hpages)) {
+                kfree(spool);
+                return NULL;
+        }
+        spool->rsv_hpages = min_hpages;
 
         return spool;
 }
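The two hunks above change the subpool lifetime: a subpool created with a minimum size charges that many pages against the global reserve up front (via hugetlb_acct_memory), and the charge is dropped again when the subpool is finally released. What follows is a minimal, userspace-only sketch of that lifetime under simplified assumptions; demo_subpool, demo_new_subpool, demo_release_subpool and demo_acct_memory are made-up stand-ins for the kernel structures and functions, and the count/used_hpages checks done by unlock_or_release_subpool() as well as all locking are omitted.

#include <stdio.h>
#include <stdlib.h>

static long demo_global_reserved;               /* stand-in for the global reserve count */
static const long demo_global_limit = 100;      /* pretend global capacity */

struct demo_subpool {
        long max_hpages;        /* -1 means "no maximum" */
        long min_hpages;        /* -1 means "no minimum" */
        long used_hpages;
        long rsv_hpages;        /* reserves still held on behalf of the subpool */
};

/* stand-in for hugetlb_acct_memory(): adjust the global reserve, fail on overcommit */
static int demo_acct_memory(long delta)
{
        if (demo_global_reserved + delta > demo_global_limit)
                return -1;
        demo_global_reserved += delta;
        return 0;
}

static struct demo_subpool *demo_new_subpool(long max_hpages, long min_hpages)
{
        struct demo_subpool *spool = calloc(1, sizeof(*spool));

        if (!spool)
                return NULL;
        spool->max_hpages = max_hpages;
        spool->min_hpages = min_hpages;
        /* a minimum size is reserved globally up front, as in the patch */
        if (min_hpages != -1 && demo_acct_memory(min_hpages)) {
                free(spool);
                return NULL;
        }
        spool->rsv_hpages = min_hpages;
        return spool;
}

static void demo_release_subpool(struct demo_subpool *spool)
{
        /* give back whatever was reserved for the minimum size */
        if (spool->min_hpages != -1)
                demo_acct_memory(-spool->min_hpages);
        free(spool);
}

int main(void)
{
        struct demo_subpool *spool = demo_new_subpool(-1, 10);

        printf("after create:  global reserved = %ld\n", demo_global_reserved); /* 10 */
        demo_release_subpool(spool);
        printf("after release: global reserved = %ld\n", demo_global_reserved); /* 0 */
        return 0;
}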
@@ -97,36 +113,89 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
         unlock_or_release_subpool(spool);
 }
 
-static int hugepage_subpool_get_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for allocating and reserving pages.
+ * Return -ENOMEM if there are not enough resources to satisfy the
+ * request.  Otherwise, return the number of pages by which the
+ * global pools must be adjusted (upward).  The returned value may
+ * only be different than the passed value (delta) in the case where
+ * a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
                                        long delta)
 {
-        int ret = 0;
+        long ret = delta;
 
         if (!spool)
-                return 0;
+                return ret;
 
         spin_lock(&spool->lock);
-        if ((spool->used_hpages + delta) <= spool->max_hpages) {
-                spool->used_hpages += delta;
-        } else {
-                ret = -ENOMEM;
+
+        if (spool->max_hpages != -1) {          /* maximum size accounting */
+                if ((spool->used_hpages + delta) <= spool->max_hpages)
+                        spool->used_hpages += delta;
+                else {
+                        ret = -ENOMEM;
+                        goto unlock_ret;
+                }
+        }
+
+        if (spool->min_hpages != -1) {          /* minimum size accounting */
+                if (delta > spool->rsv_hpages) {
+                        /*
+                         * Asking for more reserves than those already taken on
+                         * behalf of subpool.  Return difference.
+                         */
+                        ret = delta - spool->rsv_hpages;
+                        spool->rsv_hpages = 0;
+                } else {
+                        ret = 0;        /* reserves already accounted for */
+                        spool->rsv_hpages -= delta;
+                }
         }
-        spin_unlock(&spool->lock);
 
+unlock_ret:
+        spin_unlock(&spool->lock);
         return ret;
 }
 
-static void hugepage_subpool_put_pages(struct hugepage_subpool *spool,
+/*
+ * Subpool accounting for freeing and unreserving pages.
+ * Return the number of global page reservations that must be dropped.
+ * The return value may only be different than the passed value (delta)
+ * in the case where a subpool minimum size must be maintained.
+ */
+static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                        long delta)
 {
+        long ret = delta;
+
         if (!spool)
-                return;
+                return delta;
 
         spin_lock(&spool->lock);
-        spool->used_hpages -= delta;
-        /* If hugetlbfs_put_super couldn't free spool due to
-         * an outstanding quota reference, free it now. */
+
+        if (spool->max_hpages != -1)            /* maximum size accounting */
+                spool->used_hpages -= delta;
+
+        if (spool->min_hpages != -1) {          /* minimum size accounting */
+                if (spool->rsv_hpages + delta <= spool->min_hpages)
+                        ret = 0;
+                else
+                        ret = spool->rsv_hpages + delta - spool->min_hpages;
+
+                spool->rsv_hpages += delta;
+                if (spool->rsv_hpages > spool->min_hpages)
+                        spool->rsv_hpages = spool->min_hpages;
+        }
+
+        /*
+         * If hugetlbfs_put_super couldn't free spool due to an outstanding
+         * quota reference, free it now.
+         */
         unlock_or_release_subpool(spool);
+
+        return ret;
 }
 
 static inline struct hugepage_subpool *subpool_inode(struct inode *inode)
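The new return-value convention of the two helpers above is easiest to see in isolation: the value returned is the adjustment that must be applied to the global pools, and it differs from delta only when a minimum size is configured. Below is a self-contained C sketch of the same arithmetic; the demo_* names are illustrative, and locking, the hstate plumbing and the real -ENOMEM errno are left out.

#include <assert.h>
#include <stdio.h>

struct demo_subpool {
        long max_hpages;        /* -1: no maximum */
        long min_hpages;        /* -1: no minimum */
        long used_hpages;
        long rsv_hpages;        /* global reserves still held for the subpool */
};

/* returns how many pages the *global* pools must additionally provide */
static long demo_get_pages(struct demo_subpool *s, long delta)
{
        long ret = delta;

        if (s->max_hpages != -1) {              /* maximum size accounting */
                if (s->used_hpages + delta > s->max_hpages)
                        return -1;              /* -ENOMEM in the kernel */
                s->used_hpages += delta;
        }
        if (s->min_hpages != -1) {              /* minimum size accounting */
                if (delta > s->rsv_hpages) {
                        ret = delta - s->rsv_hpages;    /* only the shortfall */
                        s->rsv_hpages = 0;
                } else {
                        ret = 0;                /* fully covered by reserves */
                        s->rsv_hpages -= delta;
                }
        }
        return ret;
}

/* returns how many global reservations may be dropped */
static long demo_put_pages(struct demo_subpool *s, long delta)
{
        long ret = delta;

        if (s->max_hpages != -1)                /* maximum size accounting */
                s->used_hpages -= delta;
        if (s->min_hpages != -1) {              /* minimum size accounting */
                if (s->rsv_hpages + delta <= s->min_hpages)
                        ret = 0;                /* keep everything as reserves */
                else
                        ret = s->rsv_hpages + delta - s->min_hpages;
                s->rsv_hpages += delta;
                if (s->rsv_hpages > s->min_hpages)
                        s->rsv_hpages = s->min_hpages;
        }
        return ret;
}

int main(void)
{
        /* no maximum, minimum of 5 pages reserved up front */
        struct demo_subpool s = { .max_hpages = -1, .min_hpages = 5,
                                  .used_hpages = 0, .rsv_hpages = 5 };

        assert(demo_get_pages(&s, 3) == 0);     /* covered by the 5 reserves */
        assert(demo_get_pages(&s, 4) == 2);     /* 2 reserves left, need 2 more */
        assert(demo_put_pages(&s, 7) == 2);     /* refill to min, drop the rest */
        assert(s.rsv_hpages == 5);
        printf("subpool accounting sketch: all assertions passed\n");
        return 0;
}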
@@ -855,6 +924,31 @@ struct hstate *size_to_hstate(unsigned long size)
         return NULL;
 }
 
+/*
+ * Test to determine whether the hugepage is "active/in-use" (i.e. being linked
+ * to hstate->hugepage_activelist.)
+ *
+ * This function can be called for tail pages, but never returns true for them.
+ */
+bool page_huge_active(struct page *page)
+{
+        VM_BUG_ON_PAGE(!PageHuge(page), page);
+        return PageHead(page) && PagePrivate(&page[1]);
+}
+
+/* never called for tail page */
+static void set_page_huge_active(struct page *page)
+{
+        VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+        SetPagePrivate(&page[1]);
+}
+
+static void clear_page_huge_active(struct page *page)
+{
+        VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
+        ClearPagePrivate(&page[1]);
+}
+
 void free_huge_page(struct page *page)
 {
         /*
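The helpers above keep the "active" state of a huge page in the Private flag of the first tail page (page[1]), so the head page's own Private flag stays free for the reserve-restore logic. A toy model of that layout is sketched below; demo_page and its flag helpers are made up for illustration and stand in for struct page and the PagePrivate accessors.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_PG_HEAD    (1u << 0)
#define DEMO_PG_PRIVATE (1u << 1)

struct demo_page {
        unsigned int flags;
};

/* head and tails laid out contiguously, like a compound page */
static bool demo_page_huge_active(struct demo_page *page)
{
        /* query only makes sense on the head; the state lives on page[1] */
        return (page->flags & DEMO_PG_HEAD) &&
               (page[1].flags & DEMO_PG_PRIVATE);
}

static void demo_set_page_huge_active(struct demo_page *page)
{
        page[1].flags |= DEMO_PG_PRIVATE;
}

static void demo_clear_page_huge_active(struct demo_page *page)
{
        page[1].flags &= ~DEMO_PG_PRIVATE;
}

int main(void)
{
        struct demo_page hpage[8] = { { .flags = DEMO_PG_HEAD } };

        printf("active after alloc: %d\n", demo_page_huge_active(hpage)); /* 0 */
        demo_set_page_huge_active(hpage);
        printf("active after set:   %d\n", demo_page_huge_active(hpage)); /* 1 */
        demo_clear_page_huge_active(hpage);
        printf("active after clear: %d\n", demo_page_huge_active(hpage)); /* 0 */
        return 0;
}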
@@ -874,7 +968,16 @@ void free_huge_page(struct page *page)
         restore_reserve = PagePrivate(page);
         ClearPagePrivate(page);
 
+        /*
+         * A return code of zero implies that the subpool will be under its
+         * minimum size if the reservation is not restored after the page is
+         * freed.  Therefore, force the restore_reserve operation.
+         */
+        if (hugepage_subpool_put_pages(spool, 1) == 0)
+                restore_reserve = true;
+
         spin_lock(&hugetlb_lock);
+        clear_page_huge_active(page);
         hugetlb_cgroup_uncharge_page(hstate_index(h),
                                      pages_per_huge_page(h), page);
         if (restore_reserve)
@@ -891,7 +994,6 @@ void free_huge_page(struct page *page)
                 enqueue_huge_page(h, page);
         }
         spin_unlock(&hugetlb_lock);
-        hugepage_subpool_put_pages(spool, 1);
 }
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
@@ -1386,7 +1488,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         if (chg < 0)
                 return ERR_PTR(-ENOMEM);
         if (chg || avoid_reserve)
-                if (hugepage_subpool_get_pages(spool, 1))
+                if (hugepage_subpool_get_pages(spool, 1) < 0)
                         return ERR_PTR(-ENOSPC);
 
         ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
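This one-line change follows from hugepage_subpool_get_pages() now returning a long where any non-negative value, including a positive one, means success (it is the number of pages to charge globally). A tiny sketch of why the old truthiness test would misfire, using a hypothetical demo_get_pages() stand-in:

#include <stdio.h>

static long demo_get_pages(void)
{
        return 1;       /* success: one page must be charged globally */
}

int main(void)
{
        long ret = demo_get_pages();

        if (ret)                /* old-style check: wrongly treats 1 as failure */
                printf("old check: would return -ENOSPC\n");
        if (ret < 0)            /* new check: only negative values are errors */
                printf("new check: error\n");
        else
                printf("new check: proceed, charge %ld page(s) globally\n", ret);
        return 0;
}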
@@ -2454,6 +2556,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
         struct resv_map *resv = vma_resv_map(vma);
         struct hugepage_subpool *spool = subpool_vma(vma);
         unsigned long reserve, start, end;
+        long gbl_reserve;
 
         if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                 return;
@@ -2466,8 +2569,12 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
         kref_put(&resv->refs, resv_map_release);
 
         if (reserve) {
-                hugetlb_acct_memory(h, -reserve);
-                hugepage_subpool_put_pages(spool, reserve);
+                /*
+                 * Decrement reserve counts.  The global reserve count may be
+                 * adjusted if the subpool has a minimum size.
+                 */
+                gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
+                hugetlb_acct_memory(h, -gbl_reserve);
         }
 }
 
@@ -2891,6 +2998,7 @@ retry_avoidcopy:
         copy_user_huge_page(new_page, old_page, address, vma,
                             pages_per_huge_page(h));
         __SetPageUptodate(new_page);
+        set_page_huge_active(new_page);
 
         mmun_start = address & huge_page_mask(h);
         mmun_end = mmun_start + huge_page_size(h);
@@ -3003,6 +3111,7 @@ retry:
         }
         clear_huge_page(page, address, pages_per_huge_page(h));
         __SetPageUptodate(page);
+        set_page_huge_active(page);
 
         if (vma->vm_flags & VM_MAYSHARE) {
                 int err;
@@ -3447,6 +3556,7 @@ int hugetlb_reserve_pages(struct inode *inode,
         struct hstate *h = hstate_inode(inode);
         struct hugepage_subpool *spool = subpool_inode(inode);
         struct resv_map *resv_map;
+        long gbl_reserve;
 
         /*
          * Only apply hugepage reservation if asked.  At fault time, an
@@ -3483,8 +3593,13 @@ int hugetlb_reserve_pages(struct inode *inode,
                 goto out_err;
         }
 
-        /* There must be enough pages in the subpool for the mapping */
-        if (hugepage_subpool_get_pages(spool, chg)) {
+        /*
+         * There must be enough pages in the subpool for the mapping. If
+         * the subpool has a minimum size, there may be some global
+         * reservations already in place (gbl_reserve).
+         */
+        gbl_reserve = hugepage_subpool_get_pages(spool, chg);
+        if (gbl_reserve < 0) {
                 ret = -ENOSPC;
                 goto out_err;
         }
@@ -3493,9 +3608,10 @@ int hugetlb_reserve_pages(struct inode *inode,
          * Check enough hugepages are available for the reservation.
          * Hand the pages back to the subpool if there are not
          */
-        ret = hugetlb_acct_memory(h, chg);
+        ret = hugetlb_acct_memory(h, gbl_reserve);
         if (ret < 0) {
-                hugepage_subpool_put_pages(spool, chg);
+                /* put back original number of pages, chg */
+                (void)hugepage_subpool_put_pages(spool, chg);
                 goto out_err;
         }
 
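Taken together, the two hunks above give hugetlb_reserve_pages() the following shape: charge the subpool for chg pages, charge the global pools only for the part not already covered by subpool reserves (gbl_reserve), and on failure hand the full chg back to the subpool. A compact sketch of that ordering, with hypothetical, stateless demo_* stand-ins for the real accounting functions:

#include <stdio.h>

/* pretend the subpool already holds 2 pages of reserves */
static long demo_subpool_get_pages(long delta)
{
        long rsv = 2;

        return delta > rsv ? delta - rsv : 0;
}

static void demo_subpool_put_pages(long delta)
{
        printf("handing %ld page(s) back to the subpool\n", delta);
}

/* pretend the global pools can only cover 1 more page */
static int demo_acct_memory(long delta)
{
        return delta > 1 ? -1 : 0;
}

static int demo_reserve(long chg)
{
        long gbl_reserve = demo_subpool_get_pages(chg);

        if (gbl_reserve < 0)
                return -1;                      /* -ENOSPC in the kernel */

        if (demo_acct_memory(gbl_reserve) < 0) {
                /* unwind with the original chg, not gbl_reserve */
                demo_subpool_put_pages(chg);
                return -1;
        }
        printf("reserved: chg=%ld, charged globally=%ld\n", chg, gbl_reserve);
        return 0;
}

int main(void)
{
        demo_reserve(3);        /* succeeds: only 1 page charged globally */
        demo_reserve(5);        /* global charge of 3 fails, all 5 go back */
        return 0;
}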
@@ -3525,6 +3641,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
         struct resv_map *resv_map = inode_resv_map(inode);
         long chg = 0;
         struct hugepage_subpool *spool = subpool_inode(inode);
+        long gbl_reserve;
 
         if (resv_map)
                 chg = region_truncate(resv_map, offset);
@@ -3532,8 +3649,12 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
         inode->i_blocks -= (blocks_per_huge_page(h) * freed);
         spin_unlock(&inode->i_lock);
 
-        hugepage_subpool_put_pages(spool, (chg - freed));
-        hugetlb_acct_memory(h, -(chg - freed));
+        /*
+         * If the subpool has a minimum size, the number of global
+         * reservations to be released may be adjusted.
+         */
+        gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
+        hugetlb_acct_memory(h, -gbl_reserve);
 }
 
 #ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
@@ -3775,20 +3896,6 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 
 #ifdef CONFIG_MEMORY_FAILURE
 
-/* Should be called in hugetlb_lock */
-static int is_hugepage_on_freelist(struct page *hpage)
-{
-        struct page *page;
-        struct page *tmp;
-        struct hstate *h = page_hstate(hpage);
-        int nid = page_to_nid(hpage);
-
-        list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru)
-                if (page == hpage)
-                        return 1;
-        return 0;
-}
-
 /*
  * This function is called from memory failure code.
  * Assume the caller holds page lock of the head page.
@@ -3800,7 +3907,11 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
         int ret = -EBUSY;
 
         spin_lock(&hugetlb_lock);
-        if (is_hugepage_on_freelist(hpage)) {
+        /*
+         * Just checking !page_huge_active is not enough, because that could be
+         * an isolated/hwpoisoned hugepage (which have >0 refcount).
+         */
+        if (!page_huge_active(hpage) && !page_count(hpage)) {
                 /*
                  * Hwpoisoned hugepage isn't linked to activelist or freelist,
                  * but dangling hpage->lru can trigger list-debug warnings
@@ -3820,42 +3931,27 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage)
 
 bool isolate_huge_page(struct page *page, struct list_head *list)
 {
+        bool ret = true;
+
         VM_BUG_ON_PAGE(!PageHead(page), page);
-        if (!get_page_unless_zero(page))
-                return false;
         spin_lock(&hugetlb_lock);
+        if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+                ret = false;
+                goto unlock;
+        }
+        clear_page_huge_active(page);
         list_move_tail(&page->lru, list);
+unlock:
         spin_unlock(&hugetlb_lock);
-        return true;
+        return ret;
 }
 
 void putback_active_hugepage(struct page *page)
 {
         VM_BUG_ON_PAGE(!PageHead(page), page);
         spin_lock(&hugetlb_lock);
+        set_page_huge_active(page);
         list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
         spin_unlock(&hugetlb_lock);
         put_page(page);
 }
-
-bool is_hugepage_active(struct page *page)
-{
-        VM_BUG_ON_PAGE(!PageHuge(page), page);
-        /*
-         * This function can be called for a tail page because the caller,
-         * scan_movable_pages, scans through a given pfn-range which typically
-         * covers one memory block. In systems using gigantic hugepage (1GB
-         * for x86_64,) a hugepage is larger than a memory block, and we don't
-         * support migrating such large hugepages for now, so return false
-         * when called for tail pages.
-         */
-        if (PageTail(page))
-                return false;
-        /*
-         * Refcount of a hwpoisoned hugepages is 1, but they are not active,
-         * so we should return false for them.
-         */
-        if (unlikely(PageHWPoison(page)))
-                return false;
-        return page_count(page) > 0;
-}
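Finally, the isolation paths above now gate on the active flag: only an active huge page may be isolated for migration, the flag is cleared while the page is off the activelist, and putback_active_hugepage() restores it. The sequence is sketched below with a made-up demo_hpage type and no real locking, lists or refcounting.

#include <stdbool.h>
#include <stdio.h>

struct demo_hpage {
        bool active;    /* stand-in for page_huge_active() */
        bool isolated;
};

static bool demo_isolate(struct demo_hpage *p)
{
        if (!p->active)         /* e.g. still being set up, or already isolated */
                return false;
        p->active = false;      /* clear_page_huge_active() */
        p->isolated = true;     /* list_move_tail() onto the migration list */
        return true;
}

static void demo_putback(struct demo_hpage *p)
{
        p->active = true;       /* set_page_huge_active() */
        p->isolated = false;    /* back onto hugepage_activelist */
}

int main(void)
{
        struct demo_hpage p = { .active = true };

        printf("first isolate:  %d\n", demo_isolate(&p));      /* 1 */
        printf("second isolate: %d\n", demo_isolate(&p));      /* 0: already isolated */
        demo_putback(&p);
        printf("after putback:  %d\n", demo_isolate(&p));      /* 1 */
        return 0;
}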