Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--   mm/hugetlb.c   251
1 file changed, 150 insertions, 101 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b16d63634777..815dbd4a6dcb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -456,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 	h->free_huge_pages_node[nid]++;
 }
 
-static struct page *dequeue_huge_page(struct hstate *h)
-{
-	int nid;
-	struct page *page = NULL;
-
-	for (nid = 0; nid < MAX_NUMNODES; ++nid) {
-		if (!list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-			break;
-		}
-	}
-	return page;
-}
-
 static struct page *dequeue_huge_page_vma(struct hstate *h,
 			struct vm_area_struct *vma,
 			unsigned long address, int avoid_reserve)
@@ -641,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 
 /*
  * Use a helper variable to find the next node and then
- * copy it back to hugetlb_next_nid afterwards:
+ * copy it back to next_nid_to_alloc afterwards:
  * otherwise there's a window in which a racer might
  * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
  * But we don't need to use a spin_lock here: it really
@@ -650,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
  * if we just successfully allocated a hugepage so that
  * the next caller gets hugepages on the next node.
  */
-static int hstate_next_node(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h)
 {
 	int next_nid;
-	next_nid = next_node(h->hugetlb_next_nid, node_online_map);
+	next_nid = next_node(h->next_nid_to_alloc, node_online_map);
 	if (next_nid == MAX_NUMNODES)
 		next_nid = first_node(node_online_map);
-	h->hugetlb_next_nid = next_nid;
+	h->next_nid_to_alloc = next_nid;
 	return next_nid;
 }
 
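For illustration, the renamed helper is nothing more than a round-robin cursor over the online nodes that wraps when next_node() runs past MAX_NUMNODES. A minimal user-space sketch of the same pattern, assuming a fixed node count and plain ints instead of nodemask_t (the names are illustrative, not kernel API):

#include <stdio.h>

#define NR_NODES 4                      /* pretend every node is online */

static int next_nid;                    /* analogue of h->next_nid_to_alloc */

/* Advance the round-robin cursor, wrapping past the last node. */
static int next_node_to_alloc(void)
{
        next_nid = (next_nid + 1) % NR_NODES;
        return next_nid;
}

int main(void)
{
        int i;

        /* Successive allocations land on nodes 0,1,2,3,0,1,2,3,... */
        for (i = 0; i < 2 * NR_NODES; i++) {
                printf("allocate on node %d\n", next_nid);
                next_node_to_alloc();
        }
        return 0;
}

alloc_fresh_huge_page() in the next hunk drives the cursor exactly this way: use the current value, then advance, so consecutive allocations interleave across nodes.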
@@ -667,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h)
 	int next_nid;
 	int ret = 0;
 
-	start_nid = h->hugetlb_next_nid;
+	start_nid = h->next_nid_to_alloc;
+	next_nid = start_nid;
 
 	do {
-		page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid);
+		page = alloc_fresh_huge_page_node(h, next_nid);
 		if (page)
 			ret = 1;
-		next_nid = hstate_next_node(h);
-	} while (!page && h->hugetlb_next_nid != start_nid);
+		next_nid = hstate_next_node_to_alloc(h);
+	} while (!page && next_nid != start_nid);
 
 	if (ret)
 		count_vm_event(HTLB_BUDDY_PGALLOC);
@@ -684,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h)
 	return ret;
 }
 
+/*
+ * helper for free_pool_huge_page() - find next node
+ * from which to free a huge page
+ */
+static int hstate_next_node_to_free(struct hstate *h)
+{
+	int next_nid;
+	next_nid = next_node(h->next_nid_to_free, node_online_map);
+	if (next_nid == MAX_NUMNODES)
+		next_nid = first_node(node_online_map);
+	h->next_nid_to_free = next_nid;
+	return next_nid;
+}
+
+/*
+ * Free huge page from pool from next node to free.
+ * Attempt to keep persistent huge pages more or less
+ * balanced over allowed nodes.
+ * Called with hugetlb_lock locked.
+ */
+static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+{
+	int start_nid;
+	int next_nid;
+	int ret = 0;
+
+	start_nid = h->next_nid_to_free;
+	next_nid = start_nid;
+
+	do {
+		/*
+		 * If we're returning unused surplus pages, only examine
+		 * nodes with surplus pages.
+		 */
+		if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
+		    !list_empty(&h->hugepage_freelists[next_nid])) {
+			struct page *page =
+				list_entry(h->hugepage_freelists[next_nid].next,
+					  struct page, lru);
+			list_del(&page->lru);
+			h->free_huge_pages--;
+			h->free_huge_pages_node[next_nid]--;
+			if (acct_surplus) {
+				h->surplus_huge_pages--;
+				h->surplus_huge_pages_node[next_nid]--;
+			}
+			update_and_free_page(h, page);
+			ret = 1;
+		}
+		next_nid = hstate_next_node_to_free(h);
+	} while (!ret && next_nid != start_nid);
+
+	return ret;
+}
+
 static struct page *alloc_buddy_huge_page(struct hstate *h,
 			struct vm_area_struct *vma, unsigned long address)
 {
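The new free_pool_huge_page() walks at most one full cycle of nodes starting at the saved cursor, frees the first eligible page it finds, and advances the cursor even on success so the next call starts on a different node. A self-contained user-space sketch of that loop shape, with per-node counters standing in for the free lists (illustrative names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

static int free_pages_node[NR_NODES] = { 0, 3, 0, 1 };
static int next_nid_to_free;            /* analogue of h->next_nid_to_free */

static int next_node_to_free(void)
{
        next_nid_to_free = (next_nid_to_free + 1) % NR_NODES;
        return next_nid_to_free;
}

/* Free one page from the next node that has any, or report failure. */
static bool free_pool_page(void)
{
        int start_nid = next_nid_to_free;
        int nid = start_nid;
        bool freed = false;

        do {
                if (free_pages_node[nid] > 0) {
                        free_pages_node[nid]--;
                        printf("freed a page from node %d\n", nid);
                        freed = true;
                }
                nid = next_node_to_free();      /* advance even after success */
        } while (!freed && nid != start_nid);

        return freed;   /* false only after a full cycle found nothing */
}

int main(void)
{
        while (free_pool_page())
                ;       /* keeps draining until every node's list is empty */
        return 0;
}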
@@ -855,22 +893,13 @@ free:
  * When releasing a hugetlb pool reservation, any surplus pages that were
  * allocated to satisfy the reservation must be explicitly freed if they were
  * never used.
+ * Called with hugetlb_lock held.
  */
 static void return_unused_surplus_pages(struct hstate *h,
 					unsigned long unused_resv_pages)
 {
-	static int nid = -1;
-	struct page *page;
 	unsigned long nr_pages;
 
-	/*
-	 * We want to release as many surplus pages as possible, spread
-	 * evenly across all nodes. Iterate across all nodes until we
-	 * can no longer free unreserved surplus pages. This occurs when
-	 * the nodes with surplus pages have no free pages.
-	 */
-	unsigned long remaining_iterations = nr_online_nodes;
-
 	/* Uncommit the reservation */
 	h->resv_huge_pages -= unused_resv_pages;
 
@@ -880,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h,
 
 	nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
 
-	while (remaining_iterations-- && nr_pages) {
-		nid = next_node(nid, node_online_map);
-		if (nid == MAX_NUMNODES)
-			nid = first_node(node_online_map);
-
-		if (!h->surplus_huge_pages_node[nid])
-			continue;
-
-		if (!list_empty(&h->hugepage_freelists[nid])) {
-			page = list_entry(h->hugepage_freelists[nid].next,
-					  struct page, lru);
-			list_del(&page->lru);
-			update_and_free_page(h, page);
-			h->free_huge_pages--;
-			h->free_huge_pages_node[nid]--;
-			h->surplus_huge_pages--;
-			h->surplus_huge_pages_node[nid]--;
-			nr_pages--;
-			remaining_iterations = nr_online_nodes;
-		}
+	/*
+	 * We want to release as many surplus pages as possible, spread
+	 * evenly across all nodes. Iterate across all nodes until we
+	 * can no longer free unreserved surplus pages. This occurs when
+	 * the nodes with surplus pages have no free pages.
+	 * free_pool_huge_page() will balance the the frees across the
+	 * on-line nodes for us and will handle the hstate accounting.
+	 */
+	while (nr_pages--) {
+		if (!free_pool_huge_page(h, 1))
+			break;
 	}
 }
 
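With the loop rewritten this way, nr_pages only bounds the amount of work: for example, with three unused reservations but two surplus pages, nr_pages = min(3, 2) = 2, so at most two calls to free_pool_huge_page(h, 1) are made, and the loop also stops early as soon as a whole pass over the online nodes finds no surplus page left on any free list.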
@@ -1008,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 		void *addr;
 
 		addr = __alloc_bootmem_node_nopanic(
-				NODE_DATA(h->hugetlb_next_nid),
+				NODE_DATA(h->next_nid_to_alloc),
 				huge_page_size(h), huge_page_size(h), 0);
 
+		hstate_next_node_to_alloc(h);
 		if (addr) {
 			/*
 			 * Use the beginning of the huge page to store the
@@ -1020,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
 			m = addr;
 			goto found;
 		}
-		hstate_next_node(h);
 		nr_nodes--;
 	}
 	return 0;
@@ -1141,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
  */
 static int adjust_pool_surplus(struct hstate *h, int delta)
 {
-	static int prev_nid;
-	int nid = prev_nid;
+	int start_nid, next_nid;
 	int ret = 0;
 
 	VM_BUG_ON(delta != -1 && delta != 1);
-	do {
-		nid = next_node(nid, node_online_map);
-		if (nid == MAX_NUMNODES)
-			nid = first_node(node_online_map);
 
-		/* To shrink on this node, there must be a surplus page */
-		if (delta < 0 && !h->surplus_huge_pages_node[nid])
-			continue;
-		/* Surplus cannot exceed the total number of pages */
-		if (delta > 0 && h->surplus_huge_pages_node[nid] >=
+	if (delta < 0)
+		start_nid = h->next_nid_to_alloc;
+	else
+		start_nid = h->next_nid_to_free;
+	next_nid = start_nid;
+
+	do {
+		int nid = next_nid;
+		if (delta < 0) {
+			next_nid = hstate_next_node_to_alloc(h);
+			/*
+			 * To shrink on this node, there must be a surplus page
+			 */
+			if (!h->surplus_huge_pages_node[nid])
+				continue;
+		}
+		if (delta > 0) {
+			next_nid = hstate_next_node_to_free(h);
+			/*
+			 * Surplus cannot exceed the total number of pages
+			 */
+			if (h->surplus_huge_pages_node[nid] >=
 						h->nr_huge_pages_node[nid])
-			continue;
+				continue;
+		}
 
 		h->surplus_huge_pages += delta;
 		h->surplus_huge_pages_node[nid] += delta;
 		ret = 1;
 		break;
-	} while (nid != prev_nid);
+	} while (next_nid != start_nid);
 
-	prev_nid = nid;
 	return ret;
 }
 
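Note the asymmetry in the rewritten adjust_pool_surplus(): delta < 0 (converting surplus pages into persistent ones) walks nodes with the allocation cursor, while delta > 0 (converting persistent pages into surplus) walks with the freeing cursor, presumably so that each conversion advances the same round-robin state as the allocation or free it stands in for, keeping the per-node distribution balanced.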
@@ -1227,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
 	min_count = max(count, min_count);
 	try_to_free_low(h, min_count);
 	while (min_count < persistent_huge_pages(h)) {
-		struct page *page = dequeue_huge_page(h);
-		if (!page)
+		if (!free_pool_huge_page(h, 0))
 			break;
-		update_and_free_page(h, page);
 	}
 	while (count < persistent_huge_pages(h)) {
 		if (!adjust_pool_surplus(h, 1))
@@ -1442,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order)
 	h->free_huge_pages = 0;
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&h->hugepage_freelists[i]);
-	h->hugetlb_next_nid = first_node(node_online_map);
+	h->next_nid_to_alloc = first_node(node_online_map);
+	h->next_nid_to_free = first_node(node_online_map);
 	snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
 					huge_page_size(h)/1024);
 
@@ -1985,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h,
 	return find_lock_page(mapping, idx);
 }
 
+/*
+ * Return whether there is a pagecache page to back given address within VMA.
+ * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
+ */
+static bool hugetlbfs_pagecache_present(struct hstate *h,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	struct address_space *mapping;
+	pgoff_t idx;
+	struct page *page;
+
+	mapping = vma->vm_file->f_mapping;
+	idx = vma_hugecache_offset(h, vma, address);
+
+	page = find_get_page(mapping, idx);
+	if (page)
+		put_page(page);
+	return page != NULL;
+}
+
 static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, unsigned int flags)
 {
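The new helper deliberately uses find_get_page() rather than the find_lock_page() used by hugetlbfs_pagecache_page() just above: its caller holds mm->page_table_lock, a spinlock, and lock_page() can sleep, so all the helper may do is take a reference long enough to learn whether a page exists and then drop it again with put_page().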
@@ -2180,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
-static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
-{
-	if (!ptep || write || shared)
-		return 0;
-	else
-		return huge_pte_none(huge_ptep_get(ptep));
-}
-
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i,
-			int write)
+			unsigned int flags)
 {
 	unsigned long pfn_offset;
 	unsigned long vaddr = *position;
 	int remainder = *length;
 	struct hstate *h = hstate_vma(vma);
-	int zeropage_ok = 0;
-	int shared = vma->vm_flags & VM_SHARED;
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
+		int absent;
 		struct page *page;
 
 		/*
 		 * Some archs (sparc64, sh*) have multiple pte_ts to
-		 * each hugepage. We have to make * sure we get the
+		 * each hugepage. We have to make sure we get the
 		 * first, for the page indexing below to work.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
-		if (huge_zeropage_ok(pte, write, shared))
-			zeropage_ok = 1;
+		absent = !pte || huge_pte_none(huge_ptep_get(pte));
+
+		/*
+		 * When coredumping, it suits get_dump_page if we just return
+		 * an error where there's an empty slot with no huge pagecache
+		 * to back it. This way, we avoid allocating a hugepage, and
+		 * the sparse dumpfile avoids allocating disk blocks, but its
+		 * huge holes still show up with zeroes where they need to be.
+		 */
+		if (absent && (flags & FOLL_DUMP) &&
+		    !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+			remainder = 0;
+			break;
+		}
 
-		if (!pte ||
-		    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
-		    (write && !pte_write(huge_ptep_get(pte)))) {
+		if (absent ||
+		    ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
 			spin_unlock(&mm->page_table_lock);
-			ret = hugetlb_fault(mm, vma, vaddr, write);
+			ret = hugetlb_fault(mm, vma, vaddr,
+				(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
 			spin_lock(&mm->page_table_lock);
 			if (!(ret & VM_FAULT_ERROR))
 				continue;
 
 			remainder = 0;
-			if (!i)
-				i = -EFAULT;
 			break;
 		}
 
@@ -2235,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
-			if (zeropage_ok)
-				pages[i] = ZERO_PAGE(0);
-			else
-				pages[i] = mem_map_offset(page, pfn_offset);
+			pages[i] = mem_map_offset(page, pfn_offset);
 			get_page(pages[i]);
 		}
 
@@ -2262,7 +2311,7 @@ same_page:
 	*length = remainder;
 	*position = vaddr;
 
-	return i;
+	return i ? i : -EFAULT;
 }
 
 void hugetlb_change_protection(struct vm_area_struct *vma,
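The last two hunks also move the error handling to a single exit point: rather than turning i into -EFAULT inside the loop, follow_hugetlb_page() now returns i ? i : -EFAULT, i.e. the count of pages gathered so far when any progress was made, and -EFAULT only when there was none (in the kernel, i can already be non-zero on entry; the sketch below simplifies that away). A small user-space illustration of that partial-success convention, with made-up data and names:

#include <errno.h>
#include <stdio.h>

/*
 * Copy up to n "slots" into out[], stopping at the first bad slot.
 * Mirrors the i ? i : -EFAULT convention: partial progress is reported
 * as a positive count, total failure as -EFAULT.
 */
static int fill_slots(const int *src, int *out, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                if (src[i] < 0)         /* pretend a negative value is a hole */
                        break;
                out[i] = src[i];
        }
        return i ? i : -EFAULT;
}

int main(void)
{
        int all_good[] = { 1, 2, 3 };
        int partial[]  = { 1, -1, 3 };
        int none[]     = { -1, -1, -1 };
        int out[3];

        printf("%d\n", fill_slots(all_good, out, 3));   /* 3 */
        printf("%d\n", fill_slots(partial, out, 3));    /* 1: progress reported */
        printf("%d\n", fill_slots(none, out, 3));       /* -EFAULT: no progress */
        return 0;
}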