Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 266 |
1 file changed, 158 insertions, 108 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index cafdcee154e8..5d7601b02874 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -234,6 +234,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) | |||
234 | 234 | ||
235 | return 1UL << (hstate->order + PAGE_SHIFT); | 235 | return 1UL << (hstate->order + PAGE_SHIFT); |
236 | } | 236 | } |
237 | EXPORT_SYMBOL_GPL(vma_kernel_pagesize); | ||
237 | 238 | ||
238 | /* | 239 | /* |
239 | * Return the page size being used by the MMU to back a VMA. In the majority | 240 | * Return the page size being used by the MMU to back a VMA. In the majority |
@@ -455,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) | |||
455 | h->free_huge_pages_node[nid]++; | 456 | h->free_huge_pages_node[nid]++; |
456 | } | 457 | } |
457 | 458 | ||
458 | static struct page *dequeue_huge_page(struct hstate *h) | ||
459 | { | ||
460 | int nid; | ||
461 | struct page *page = NULL; | ||
462 | |||
463 | for (nid = 0; nid < MAX_NUMNODES; ++nid) { | ||
464 | if (!list_empty(&h->hugepage_freelists[nid])) { | ||
465 | page = list_entry(h->hugepage_freelists[nid].next, | ||
466 | struct page, lru); | ||
467 | list_del(&page->lru); | ||
468 | h->free_huge_pages--; | ||
469 | h->free_huge_pages_node[nid]--; | ||
470 | break; | ||
471 | } | ||
472 | } | ||
473 | return page; | ||
474 | } | ||
475 | |||
476 | static struct page *dequeue_huge_page_vma(struct hstate *h, | 459 | static struct page *dequeue_huge_page_vma(struct hstate *h, |
477 | struct vm_area_struct *vma, | 460 | struct vm_area_struct *vma, |
478 | unsigned long address, int avoid_reserve) | 461 | unsigned long address, int avoid_reserve) |
@@ -640,7 +623,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
640 | 623 | ||
641 | /* | 624 | /* |
642 | * Use a helper variable to find the next node and then | 625 | * Use a helper variable to find the next node and then |
643 | * copy it back to hugetlb_next_nid afterwards: | 626 | * copy it back to next_nid_to_alloc afterwards: |
644 | * otherwise there's a window in which a racer might | 627 | * otherwise there's a window in which a racer might |
645 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. | 628 | * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. |
646 | * But we don't need to use a spin_lock here: it really | 629 | * But we don't need to use a spin_lock here: it really |
@@ -649,13 +632,13 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) | |||
649 | * if we just successfully allocated a hugepage so that | 632 | * if we just successfully allocated a hugepage so that |
650 | * the next caller gets hugepages on the next node. | 633 | * the next caller gets hugepages on the next node. |
651 | */ | 634 | */ |
652 | static int hstate_next_node(struct hstate *h) | 635 | static int hstate_next_node_to_alloc(struct hstate *h) |
653 | { | 636 | { |
654 | int next_nid; | 637 | int next_nid; |
655 | next_nid = next_node(h->hugetlb_next_nid, node_online_map); | 638 | next_nid = next_node(h->next_nid_to_alloc, node_online_map); |
656 | if (next_nid == MAX_NUMNODES) | 639 | if (next_nid == MAX_NUMNODES) |
657 | next_nid = first_node(node_online_map); | 640 | next_nid = first_node(node_online_map); |
658 | h->hugetlb_next_nid = next_nid; | 641 | h->next_nid_to_alloc = next_nid; |
659 | return next_nid; | 642 | return next_nid; |
660 | } | 643 | } |
661 | 644 | ||
@@ -666,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
666 | int next_nid; | 649 | int next_nid; |
667 | int ret = 0; | 650 | int ret = 0; |
668 | 651 | ||
669 | start_nid = h->hugetlb_next_nid; | 652 | start_nid = h->next_nid_to_alloc; |
653 | next_nid = start_nid; | ||
670 | 654 | ||
671 | do { | 655 | do { |
672 | page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); | 656 | page = alloc_fresh_huge_page_node(h, next_nid); |
673 | if (page) | 657 | if (page) |
674 | ret = 1; | 658 | ret = 1; |
675 | next_nid = hstate_next_node(h); | 659 | next_nid = hstate_next_node_to_alloc(h); |
676 | } while (!page && h->hugetlb_next_nid != start_nid); | 660 | } while (!page && next_nid != start_nid); |
677 | 661 | ||
678 | if (ret) | 662 | if (ret) |
679 | count_vm_event(HTLB_BUDDY_PGALLOC); | 663 | count_vm_event(HTLB_BUDDY_PGALLOC); |
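The alloc-side interleaving above comes down to a saved per-hstate cursor that is advanced with wrap-around after every attempt, plus a do/while loop that gives up once the walk returns to its starting node. The standalone C sketch below mirrors that pattern with made-up names (MAX_NODES, node_online[] and try_alloc_on_node() are illustrative stand-ins, not kernel APIs):

/*
 * Illustrative userspace sketch of the round-robin node walk used above.
 * MAX_NODES, node_online[] and try_alloc_on_node() stand in for the
 * kernel's node_online_map and page allocator; they are not real APIs.
 */
#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

static bool node_online[MAX_NODES] = { true, true, false, true };
static int next_nid_to_alloc;	/* the per-hstate cursor in the patch */

/* Advance the cursor to the next online node, wrapping past the end. */
static int advance_alloc_cursor(void)
{
	int nid = next_nid_to_alloc;

	do {
		nid = (nid + 1) % MAX_NODES;
	} while (!node_online[nid]);
	next_nid_to_alloc = nid;
	return nid;
}

/* Pretend allocation: in this example only node 3 has free memory. */
static bool try_alloc_on_node(int nid)
{
	return nid == 3;
}

int main(void)
{
	int start_nid = next_nid_to_alloc;
	int nid = start_nid;
	bool allocated = false;

	/* Try each online node at most once, starting at the saved cursor. */
	do {
		allocated = try_alloc_on_node(nid);
		nid = advance_alloc_cursor();
	} while (!allocated && nid != start_nid);

	printf("allocation %s; cursor now at node %d\n",
	       allocated ? "succeeded" : "failed", next_nid_to_alloc);
	return 0;
}

As in alloc_fresh_huge_page(), the cursor is left pointing one node past the one that satisfied the allocation, so the next caller starts the walk somewhere else.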
@@ -683,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h) | |||
683 | return ret; | 667 | return ret; |
684 | } | 668 | } |
685 | 669 | ||
670 | /* | ||
671 | * helper for free_pool_huge_page() - find next node | ||
672 | * from which to free a huge page | ||
673 | */ | ||
674 | static int hstate_next_node_to_free(struct hstate *h) | ||
675 | { | ||
676 | int next_nid; | ||
677 | next_nid = next_node(h->next_nid_to_free, node_online_map); | ||
678 | if (next_nid == MAX_NUMNODES) | ||
679 | next_nid = first_node(node_online_map); | ||
680 | h->next_nid_to_free = next_nid; | ||
681 | return next_nid; | ||
682 | } | ||
683 | |||
684 | /* | ||
685 | * Free huge page from pool from next node to free. | ||
686 | * Attempt to keep persistent huge pages more or less | ||
687 | * balanced over allowed nodes. | ||
688 | * Called with hugetlb_lock locked. | ||
689 | */ | ||
690 | static int free_pool_huge_page(struct hstate *h, bool acct_surplus) | ||
691 | { | ||
692 | int start_nid; | ||
693 | int next_nid; | ||
694 | int ret = 0; | ||
695 | |||
696 | start_nid = h->next_nid_to_free; | ||
697 | next_nid = start_nid; | ||
698 | |||
699 | do { | ||
700 | /* | ||
701 | * If we're returning unused surplus pages, only examine | ||
702 | * nodes with surplus pages. | ||
703 | */ | ||
704 | if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && | ||
705 | !list_empty(&h->hugepage_freelists[next_nid])) { | ||
706 | struct page *page = | ||
707 | list_entry(h->hugepage_freelists[next_nid].next, | ||
708 | struct page, lru); | ||
709 | list_del(&page->lru); | ||
710 | h->free_huge_pages--; | ||
711 | h->free_huge_pages_node[next_nid]--; | ||
712 | if (acct_surplus) { | ||
713 | h->surplus_huge_pages--; | ||
714 | h->surplus_huge_pages_node[next_nid]--; | ||
715 | } | ||
716 | update_and_free_page(h, page); | ||
717 | ret = 1; | ||
718 | } | ||
719 | next_nid = hstate_next_node_to_free(h); | ||
720 | } while (!ret && next_nid != start_nid); | ||
721 | |||
722 | return ret; | ||
723 | } | ||
724 | |||
686 | static struct page *alloc_buddy_huge_page(struct hstate *h, | 725 | static struct page *alloc_buddy_huge_page(struct hstate *h, |
687 | struct vm_area_struct *vma, unsigned long address) | 726 | struct vm_area_struct *vma, unsigned long address) |
688 | { | 727 | { |
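free_pool_huge_page() walks its own cursor the same way, but adds an eligibility test: when it is called to return unused surplus pages (acct_surplus true) it skips nodes with no surplus to account, and in either case the node needs a non-empty free list. A small hypothetical C sketch of just that predicate (the array names and contents are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

/*
 * Invented per-node counters standing in for h->surplus_huge_pages_node[]
 * and the per-node huge page free lists.
 */
static int surplus_on_node[MAX_NODES] = { 0, 2, 0, 1 };
static int free_on_node[MAX_NODES]    = { 3, 2, 0, 1 };

/*
 * Mirrors the check in free_pool_huge_page(): the node must have a free
 * huge page, and when only surplus pages may be freed it must also have
 * surplus pages to account against.
 */
static bool can_free_from_node(int nid, bool acct_surplus)
{
	if (acct_surplus && surplus_on_node[nid] == 0)
		return false;
	return free_on_node[nid] > 0;
}

int main(void)
{
	for (int nid = 0; nid < MAX_NODES; nid++)
		printf("node %d: pool shrink %s, surplus return %s\n", nid,
		       can_free_from_node(nid, false) ? "ok" : "skip",
		       can_free_from_node(nid, true)  ? "ok" : "skip");
	return 0;
}

Both shrink paths later in the patch (return_unused_surplus_pages() and set_max_huge_pages()) go through this one helper, so frees stay spread across online nodes just like allocations.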
@@ -854,22 +893,13 @@ free: | |||
854 | * When releasing a hugetlb pool reservation, any surplus pages that were | 893 | * When releasing a hugetlb pool reservation, any surplus pages that were |
855 | * allocated to satisfy the reservation must be explicitly freed if they were | 894 | * allocated to satisfy the reservation must be explicitly freed if they were |
856 | * never used. | 895 | * never used. |
896 | * Called with hugetlb_lock held. | ||
857 | */ | 897 | */ |
858 | static void return_unused_surplus_pages(struct hstate *h, | 898 | static void return_unused_surplus_pages(struct hstate *h, |
859 | unsigned long unused_resv_pages) | 899 | unsigned long unused_resv_pages) |
860 | { | 900 | { |
861 | static int nid = -1; | ||
862 | struct page *page; | ||
863 | unsigned long nr_pages; | 901 | unsigned long nr_pages; |
864 | 902 | ||
865 | /* | ||
866 | * We want to release as many surplus pages as possible, spread | ||
867 | * evenly across all nodes. Iterate across all nodes until we | ||
868 | * can no longer free unreserved surplus pages. This occurs when | ||
869 | * the nodes with surplus pages have no free pages. | ||
870 | */ | ||
871 | unsigned long remaining_iterations = nr_online_nodes; | ||
872 | |||
873 | /* Uncommit the reservation */ | 903 | /* Uncommit the reservation */ |
874 | h->resv_huge_pages -= unused_resv_pages; | 904 | h->resv_huge_pages -= unused_resv_pages; |
875 | 905 | ||
@@ -879,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h, | |||
879 | 909 | ||
880 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); | 910 | nr_pages = min(unused_resv_pages, h->surplus_huge_pages); |
881 | 911 | ||
882 | while (remaining_iterations-- && nr_pages) { | 912 | /* |
883 | nid = next_node(nid, node_online_map); | 913 | * We want to release as many surplus pages as possible, spread |
884 | if (nid == MAX_NUMNODES) | 914 | * evenly across all nodes. Iterate across all nodes until we |
885 | nid = first_node(node_online_map); | 915 | * can no longer free unreserved surplus pages. This occurs when |
886 | 916 | * the nodes with surplus pages have no free pages. | |
887 | if (!h->surplus_huge_pages_node[nid]) | 917 | * free_pool_huge_page() will balance the frees across the |
888 | continue; | 918 | * on-line nodes for us and will handle the hstate accounting. |
889 | 919 | */ | |
890 | if (!list_empty(&h->hugepage_freelists[nid])) { | 920 | while (nr_pages--) { |
891 | page = list_entry(h->hugepage_freelists[nid].next, | 921 | if (!free_pool_huge_page(h, 1)) |
892 | struct page, lru); | 922 | break; |
893 | list_del(&page->lru); | ||
894 | update_and_free_page(h, page); | ||
895 | h->free_huge_pages--; | ||
896 | h->free_huge_pages_node[nid]--; | ||
897 | h->surplus_huge_pages--; | ||
898 | h->surplus_huge_pages_node[nid]--; | ||
899 | nr_pages--; | ||
900 | remaining_iterations = nr_online_nodes; | ||
901 | } | ||
902 | } | 923 | } |
903 | } | 924 | } |
904 | 925 | ||
@@ -1007,9 +1028,10 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
1007 | void *addr; | 1028 | void *addr; |
1008 | 1029 | ||
1009 | addr = __alloc_bootmem_node_nopanic( | 1030 | addr = __alloc_bootmem_node_nopanic( |
1010 | NODE_DATA(h->hugetlb_next_nid), | 1031 | NODE_DATA(h->next_nid_to_alloc), |
1011 | huge_page_size(h), huge_page_size(h), 0); | 1032 | huge_page_size(h), huge_page_size(h), 0); |
1012 | 1033 | ||
1034 | hstate_next_node_to_alloc(h); | ||
1013 | if (addr) { | 1035 | if (addr) { |
1014 | /* | 1036 | /* |
1015 | * Use the beginning of the huge page to store the | 1037 | * Use the beginning of the huge page to store the |
@@ -1019,7 +1041,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) | |||
1019 | m = addr; | 1041 | m = addr; |
1020 | goto found; | 1042 | goto found; |
1021 | } | 1043 | } |
1022 | hstate_next_node(h); | ||
1023 | nr_nodes--; | 1044 | nr_nodes--; |
1024 | } | 1045 | } |
1025 | return 0; | 1046 | return 0; |
@@ -1140,31 +1161,43 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) | |||
1140 | */ | 1161 | */ |
1141 | static int adjust_pool_surplus(struct hstate *h, int delta) | 1162 | static int adjust_pool_surplus(struct hstate *h, int delta) |
1142 | { | 1163 | { |
1143 | static int prev_nid; | 1164 | int start_nid, next_nid; |
1144 | int nid = prev_nid; | ||
1145 | int ret = 0; | 1165 | int ret = 0; |
1146 | 1166 | ||
1147 | VM_BUG_ON(delta != -1 && delta != 1); | 1167 | VM_BUG_ON(delta != -1 && delta != 1); |
1148 | do { | ||
1149 | nid = next_node(nid, node_online_map); | ||
1150 | if (nid == MAX_NUMNODES) | ||
1151 | nid = first_node(node_online_map); | ||
1152 | 1168 | ||
1153 | /* To shrink on this node, there must be a surplus page */ | 1169 | if (delta < 0) |
1154 | if (delta < 0 && !h->surplus_huge_pages_node[nid]) | 1170 | start_nid = h->next_nid_to_alloc; |
1155 | continue; | 1171 | else |
1156 | /* Surplus cannot exceed the total number of pages */ | 1172 | start_nid = h->next_nid_to_free; |
1157 | if (delta > 0 && h->surplus_huge_pages_node[nid] >= | 1173 | next_nid = start_nid; |
1174 | |||
1175 | do { | ||
1176 | int nid = next_nid; | ||
1177 | if (delta < 0) { | ||
1178 | next_nid = hstate_next_node_to_alloc(h); | ||
1179 | /* | ||
1180 | * To shrink on this node, there must be a surplus page | ||
1181 | */ | ||
1182 | if (!h->surplus_huge_pages_node[nid]) | ||
1183 | continue; | ||
1184 | } | ||
1185 | if (delta > 0) { | ||
1186 | next_nid = hstate_next_node_to_free(h); | ||
1187 | /* | ||
1188 | * Surplus cannot exceed the total number of pages | ||
1189 | */ | ||
1190 | if (h->surplus_huge_pages_node[nid] >= | ||
1158 | h->nr_huge_pages_node[nid]) | 1191 | h->nr_huge_pages_node[nid]) |
1159 | continue; | 1192 | continue; |
1193 | } | ||
1160 | 1194 | ||
1161 | h->surplus_huge_pages += delta; | 1195 | h->surplus_huge_pages += delta; |
1162 | h->surplus_huge_pages_node[nid] += delta; | 1196 | h->surplus_huge_pages_node[nid] += delta; |
1163 | ret = 1; | 1197 | ret = 1; |
1164 | break; | 1198 | break; |
1165 | } while (nid != prev_nid); | 1199 | } while (next_nid != start_nid); |
1166 | 1200 | ||
1167 | prev_nid = nid; | ||
1168 | return ret; | 1201 | return ret; |
1169 | } | 1202 | } |
1170 | 1203 | ||
@@ -1226,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) | |||
1226 | min_count = max(count, min_count); | 1259 | min_count = max(count, min_count); |
1227 | try_to_free_low(h, min_count); | 1260 | try_to_free_low(h, min_count); |
1228 | while (min_count < persistent_huge_pages(h)) { | 1261 | while (min_count < persistent_huge_pages(h)) { |
1229 | struct page *page = dequeue_huge_page(h); | 1262 | if (!free_pool_huge_page(h, 0)) |
1230 | if (!page) | ||
1231 | break; | 1263 | break; |
1232 | update_and_free_page(h, page); | ||
1233 | } | 1264 | } |
1234 | while (count < persistent_huge_pages(h)) { | 1265 | while (count < persistent_huge_pages(h)) { |
1235 | if (!adjust_pool_surplus(h, 1)) | 1266 | if (!adjust_pool_surplus(h, 1)) |
@@ -1441,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1441 | h->free_huge_pages = 0; | 1472 | h->free_huge_pages = 0; |
1442 | for (i = 0; i < MAX_NUMNODES; ++i) | 1473 | for (i = 0; i < MAX_NUMNODES; ++i) |
1443 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); | 1474 | INIT_LIST_HEAD(&h->hugepage_freelists[i]); |
1444 | h->hugetlb_next_nid = first_node(node_online_map); | 1475 | h->next_nid_to_alloc = first_node(node_online_map); |
1476 | h->next_nid_to_free = first_node(node_online_map); | ||
1445 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1477 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1446 | huge_page_size(h)/1024); | 1478 | huge_page_size(h)/1024); |
1447 | 1479 | ||
@@ -1505,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) | |||
1505 | 1537 | ||
1506 | #ifdef CONFIG_SYSCTL | 1538 | #ifdef CONFIG_SYSCTL |
1507 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, | 1539 | int hugetlb_sysctl_handler(struct ctl_table *table, int write, |
1508 | struct file *file, void __user *buffer, | 1540 | void __user *buffer, |
1509 | size_t *length, loff_t *ppos) | 1541 | size_t *length, loff_t *ppos) |
1510 | { | 1542 | { |
1511 | struct hstate *h = &default_hstate; | 1543 | struct hstate *h = &default_hstate; |
@@ -1516,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
1516 | 1548 | ||
1517 | table->data = &tmp; | 1549 | table->data = &tmp; |
1518 | table->maxlen = sizeof(unsigned long); | 1550 | table->maxlen = sizeof(unsigned long); |
1519 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1551 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1520 | 1552 | ||
1521 | if (write) | 1553 | if (write) |
1522 | h->max_huge_pages = set_max_huge_pages(h, tmp); | 1554 | h->max_huge_pages = set_max_huge_pages(h, tmp); |
@@ -1525,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, | |||
1525 | } | 1557 | } |
1526 | 1558 | ||
1527 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | 1559 | int hugetlb_treat_movable_handler(struct ctl_table *table, int write, |
1528 | struct file *file, void __user *buffer, | 1560 | void __user *buffer, |
1529 | size_t *length, loff_t *ppos) | 1561 | size_t *length, loff_t *ppos) |
1530 | { | 1562 | { |
1531 | proc_dointvec(table, write, file, buffer, length, ppos); | 1563 | proc_dointvec(table, write, buffer, length, ppos); |
1532 | if (hugepages_treat_as_movable) | 1564 | if (hugepages_treat_as_movable) |
1533 | htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; | 1565 | htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; |
1534 | else | 1566 | else |
@@ -1537,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, | |||
1537 | } | 1569 | } |
1538 | 1570 | ||
1539 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, | 1571 | int hugetlb_overcommit_handler(struct ctl_table *table, int write, |
1540 | struct file *file, void __user *buffer, | 1572 | void __user *buffer, |
1541 | size_t *length, loff_t *ppos) | 1573 | size_t *length, loff_t *ppos) |
1542 | { | 1574 | { |
1543 | struct hstate *h = &default_hstate; | 1575 | struct hstate *h = &default_hstate; |
@@ -1548,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, | |||
1548 | 1580 | ||
1549 | table->data = &tmp; | 1581 | table->data = &tmp; |
1550 | table->maxlen = sizeof(unsigned long); | 1582 | table->maxlen = sizeof(unsigned long); |
1551 | proc_doulongvec_minmax(table, write, file, buffer, length, ppos); | 1583 | proc_doulongvec_minmax(table, write, buffer, length, ppos); |
1552 | 1584 | ||
1553 | if (write) { | 1585 | if (write) { |
1554 | spin_lock(&hugetlb_lock); | 1586 | spin_lock(&hugetlb_lock); |
@@ -1689,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
1689 | return 0; | 1721 | return 0; |
1690 | } | 1722 | } |
1691 | 1723 | ||
1692 | struct vm_operations_struct hugetlb_vm_ops = { | 1724 | const struct vm_operations_struct hugetlb_vm_ops = { |
1693 | .fault = hugetlb_vm_op_fault, | 1725 | .fault = hugetlb_vm_op_fault, |
1694 | .open = hugetlb_vm_op_open, | 1726 | .open = hugetlb_vm_op_open, |
1695 | .close = hugetlb_vm_op_close, | 1727 | .close = hugetlb_vm_op_close, |
@@ -1984,6 +2016,26 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, | |||
1984 | return find_lock_page(mapping, idx); | 2016 | return find_lock_page(mapping, idx); |
1985 | } | 2017 | } |
1986 | 2018 | ||
2019 | /* | ||
2020 | * Return whether there is a pagecache page to back given address within VMA. | ||
2021 | * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. | ||
2022 | */ | ||
2023 | static bool hugetlbfs_pagecache_present(struct hstate *h, | ||
2024 | struct vm_area_struct *vma, unsigned long address) | ||
2025 | { | ||
2026 | struct address_space *mapping; | ||
2027 | pgoff_t idx; | ||
2028 | struct page *page; | ||
2029 | |||
2030 | mapping = vma->vm_file->f_mapping; | ||
2031 | idx = vma_hugecache_offset(h, vma, address); | ||
2032 | |||
2033 | page = find_get_page(mapping, idx); | ||
2034 | if (page) | ||
2035 | put_page(page); | ||
2036 | return page != NULL; | ||
2037 | } | ||
2038 | |||
1987 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2039 | static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, |
1988 | unsigned long address, pte_t *ptep, unsigned int flags) | 2040 | unsigned long address, pte_t *ptep, unsigned int flags) |
1989 | { | 2041 | { |
@@ -2179,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, | |||
2179 | return NULL; | 2231 | return NULL; |
2180 | } | 2232 | } |
2181 | 2233 | ||
2182 | static int huge_zeropage_ok(pte_t *ptep, int write, int shared) | ||
2183 | { | ||
2184 | if (!ptep || write || shared) | ||
2185 | return 0; | ||
2186 | else | ||
2187 | return huge_pte_none(huge_ptep_get(ptep)); | ||
2188 | } | ||
2189 | |||
2190 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | 2234 | int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, |
2191 | struct page **pages, struct vm_area_struct **vmas, | 2235 | struct page **pages, struct vm_area_struct **vmas, |
2192 | unsigned long *position, int *length, int i, | 2236 | unsigned long *position, int *length, int i, |
2193 | int write) | 2237 | unsigned int flags) |
2194 | { | 2238 | { |
2195 | unsigned long pfn_offset; | 2239 | unsigned long pfn_offset; |
2196 | unsigned long vaddr = *position; | 2240 | unsigned long vaddr = *position; |
2197 | int remainder = *length; | 2241 | int remainder = *length; |
2198 | struct hstate *h = hstate_vma(vma); | 2242 | struct hstate *h = hstate_vma(vma); |
2199 | int zeropage_ok = 0; | ||
2200 | int shared = vma->vm_flags & VM_SHARED; | ||
2201 | 2243 | ||
2202 | spin_lock(&mm->page_table_lock); | 2244 | spin_lock(&mm->page_table_lock); |
2203 | while (vaddr < vma->vm_end && remainder) { | 2245 | while (vaddr < vma->vm_end && remainder) { |
2204 | pte_t *pte; | 2246 | pte_t *pte; |
2247 | int absent; | ||
2205 | struct page *page; | 2248 | struct page *page; |
2206 | 2249 | ||
2207 | /* | 2250 | /* |
2208 | * Some archs (sparc64, sh*) have multiple pte_ts to | 2251 | * Some archs (sparc64, sh*) have multiple pte_ts to |
2209 | * each hugepage. We have to make * sure we get the | 2252 | * each hugepage. We have to make sure we get the |
2210 | * first, for the page indexing below to work. | 2253 | * first, for the page indexing below to work. |
2211 | */ | 2254 | */ |
2212 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); | 2255 | pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); |
2213 | if (huge_zeropage_ok(pte, write, shared)) | 2256 | absent = !pte || huge_pte_none(huge_ptep_get(pte)); |
2214 | zeropage_ok = 1; | 2257 | |
2258 | /* | ||
2259 | * When coredumping, it suits get_dump_page if we just return | ||
2260 | * an error where there's an empty slot with no huge pagecache | ||
2261 | * to back it. This way, we avoid allocating a hugepage, and | ||
2262 | * the sparse dumpfile avoids allocating disk blocks, but its | ||
2263 | * huge holes still show up with zeroes where they need to be. | ||
2264 | */ | ||
2265 | if (absent && (flags & FOLL_DUMP) && | ||
2266 | !hugetlbfs_pagecache_present(h, vma, vaddr)) { | ||
2267 | remainder = 0; | ||
2268 | break; | ||
2269 | } | ||
2215 | 2270 | ||
2216 | if (!pte || | 2271 | if (absent || |
2217 | (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || | 2272 | ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { |
2218 | (write && !pte_write(huge_ptep_get(pte)))) { | ||
2219 | int ret; | 2273 | int ret; |
2220 | 2274 | ||
2221 | spin_unlock(&mm->page_table_lock); | 2275 | spin_unlock(&mm->page_table_lock); |
2222 | ret = hugetlb_fault(mm, vma, vaddr, write); | 2276 | ret = hugetlb_fault(mm, vma, vaddr, |
2277 | (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); | ||
2223 | spin_lock(&mm->page_table_lock); | 2278 | spin_lock(&mm->page_table_lock); |
2224 | if (!(ret & VM_FAULT_ERROR)) | 2279 | if (!(ret & VM_FAULT_ERROR)) |
2225 | continue; | 2280 | continue; |
2226 | 2281 | ||
2227 | remainder = 0; | 2282 | remainder = 0; |
2228 | if (!i) | ||
2229 | i = -EFAULT; | ||
2230 | break; | 2283 | break; |
2231 | } | 2284 | } |
2232 | 2285 | ||
@@ -2234,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, | |||
2234 | page = pte_page(huge_ptep_get(pte)); | 2287 | page = pte_page(huge_ptep_get(pte)); |
2235 | same_page: | 2288 | same_page: |
2236 | if (pages) { | 2289 | if (pages) { |
2237 | if (zeropage_ok) | 2290 | pages[i] = mem_map_offset(page, pfn_offset); |
2238 | pages[i] = ZERO_PAGE(0); | ||
2239 | else | ||
2240 | pages[i] = mem_map_offset(page, pfn_offset); | ||
2241 | get_page(pages[i]); | 2291 | get_page(pages[i]); |
2242 | } | 2292 | } |
2243 | 2293 | ||
@@ -2261,7 +2311,7 @@ same_page: | |||
2261 | *length = remainder; | 2311 | *length = remainder; |
2262 | *position = vaddr; | 2312 | *position = vaddr; |
2263 | 2313 | ||
2264 | return i; | 2314 | return i ? i : -EFAULT; |
2265 | } | 2315 | } |
2266 | 2316 | ||
2267 | void hugetlb_change_protection(struct vm_area_struct *vma, | 2317 | void hugetlb_change_protection(struct vm_area_struct *vma, |