Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	398
1 file changed, 326 insertions(+), 72 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eab8c428cc93..ae2959bb59cb 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -23,12 +23,16 @@
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
+static unsigned long surplus_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
+static unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 static gfp_t htlb_alloc_mask = GFP_HIGHUSER;
 unsigned long hugepages_treat_as_movable;
+int hugetlb_dynamic_pool;
+static int hugetlb_next_nid;
 
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
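
The new hugetlb_dynamic_pool flag gates whether surplus huge pages may be taken from the buddy allocator at fault time. Its sysctl wiring lives outside this file (in kernel/sysctl.c, which is not part of this diff); a plausible sketch of such an entry against the 2.6.23-era ctl_table layout, with the procname and placement under vm assumed rather than confirmed by this diff:

	/* Hypothetical kernel/sysctl.c entry exposing the flag as
	 * /proc/sys/vm/hugetlb_dynamic_pool; name and placement are
	 * assumptions, since the wiring is not shown in this hunk. */
	{
		.ctl_name	= CTL_UNNUMBERED,
		.procname	= "hugetlb_dynamic_pool",
		.data		= &hugetlb_dynamic_pool,
		.maxlen		= sizeof(hugetlb_dynamic_pool),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},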
@@ -85,6 +89,8 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 			list_del(&page->lru);
 			free_huge_pages--;
 			free_huge_pages_node[nid]--;
+			if (vma && vma->vm_flags & VM_MAYSHARE)
+				resv_huge_pages--;
 			break;
 		}
 	}
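
The two added lines fold shared-mapping reservation accounting into the dequeue itself: handing a free huge page to a VM_MAYSHARE vma also consumes one reservation. A minimal userspace program that exercises this path, assuming hugetlbfs is mounted at /mnt/huge, the pool holds at least one page, and the huge page size is 2 MB:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define LENGTH (2UL * 1024 * 1024)	/* assumes 2 MB huge pages */

	int main(void)
	{
		/* assumes hugetlbfs is mounted at /mnt/huge */
		int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);
		char *p;

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* mmap of a hugetlbfs file reserves pages: HugePages_Rsvd rises */
		p = mmap(NULL, LENGTH, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* first touch faults a page in via the dequeue path above:
		 * HugePages_Free and HugePages_Rsvd both drop by one */
		memset(p, 0, LENGTH);

		munmap(p, LENGTH);
		close(fd);
		unlink("/mnt/huge/demo");
		return 0;
	}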
@@ -92,58 +98,269 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
+static void update_and_free_page(struct page *page)
+{
+	int i;
+	nr_huge_pages--;
+	nr_huge_pages_node[page_to_nid(page)]--;
+	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
+		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
+				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
+				1 << PG_private | 1 << PG_writeback);
+	}
+	set_compound_page_dtor(page, NULL);
+	set_page_refcounted(page);
+	__free_pages(page, HUGETLB_PAGE_ORDER);
+}
+
 static void free_huge_page(struct page *page)
 {
-	BUG_ON(page_count(page));
+	int nid = page_to_nid(page);
 
+	BUG_ON(page_count(page));
 	INIT_LIST_HEAD(&page->lru);
 
 	spin_lock(&hugetlb_lock);
-	enqueue_huge_page(page);
+	if (surplus_huge_pages_node[nid]) {
+		update_and_free_page(page);
+		surplus_huge_pages--;
+		surplus_huge_pages_node[nid]--;
+	} else {
+		enqueue_huge_page(page);
+	}
 	spin_unlock(&hugetlb_lock);
 }
 
-static int alloc_fresh_huge_page(void)
+/*
+ * Increment or decrement surplus_huge_pages.  Keep node-specific counters
+ * balanced by operating on them in a round-robin fashion.
+ * Returns 1 if an adjustment was made.
+ */
+static int adjust_pool_surplus(int delta)
 {
 	static int prev_nid;
-	struct page *page;
-	int nid;
+	int nid = prev_nid;
+	int ret = 0;
+
+	VM_BUG_ON(delta != -1 && delta != 1);
+	do {
+		nid = next_node(nid, node_online_map);
+		if (nid == MAX_NUMNODES)
+			nid = first_node(node_online_map);
+
+		/* To shrink on this node, there must be a surplus page */
+		if (delta < 0 && !surplus_huge_pages_node[nid])
+			continue;
+		/* Surplus cannot exceed the total number of pages */
+		if (delta > 0 && surplus_huge_pages_node[nid] >=
+						nr_huge_pages_node[nid])
+			continue;
+
+		surplus_huge_pages += delta;
+		surplus_huge_pages_node[nid] += delta;
+		ret = 1;
+		break;
+	} while (nid != prev_nid);
 
-	/*
-	 * Copy static prev_nid to local nid, work on that, then copy it
-	 * back to prev_nid afterwards: otherwise there's a window in which
-	 * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node.
-	 * But we don't need to use a spin_lock here: it really doesn't
-	 * matter if occasionally a racer chooses the same nid as we do.
-	 */
-	nid = next_node(prev_nid, node_online_map);
-	if (nid == MAX_NUMNODES)
-		nid = first_node(node_online_map);
 	prev_nid = nid;
+	return ret;
+}
+
+static struct page *alloc_fresh_huge_page_node(int nid)
+{
+	struct page *page;
 
-	page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
+	page = alloc_pages_node(nid,
+		htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|__GFP_NOWARN,
+		HUGETLB_PAGE_ORDER);
+	if (page) {
+		set_compound_page_dtor(page, free_huge_page);
+		spin_lock(&hugetlb_lock);
+		nr_huge_pages++;
+		nr_huge_pages_node[nid]++;
+		spin_unlock(&hugetlb_lock);
+		put_page(page); /* free it into the hugepage allocator */
+	}
+
+	return page;
+}
+
+static int alloc_fresh_huge_page(void)
+{
+	struct page *page;
+	int start_nid;
+	int next_nid;
+	int ret = 0;
+
+	start_nid = hugetlb_next_nid;
+
+	do {
+		page = alloc_fresh_huge_page_node(hugetlb_next_nid);
+		if (page)
+			ret = 1;
+		/*
+		 * Use a helper variable to find the next node and then
+		 * copy it back to hugetlb_next_nid afterwards:
+		 * otherwise there's a window in which a racer might
+		 * pass invalid nid MAX_NUMNODES to alloc_pages_node.
+		 * But we don't need to use a spin_lock here: it really
+		 * doesn't matter if occasionally a racer chooses the
+		 * same nid as we do.  Move nid forward in the mask even
+		 * if we just successfully allocated a hugepage so that
+		 * the next caller gets hugepages on the next node.
+		 */
+		next_nid = next_node(hugetlb_next_nid, node_online_map);
+		if (next_nid == MAX_NUMNODES)
+			next_nid = first_node(node_online_map);
+		hugetlb_next_nid = next_nid;
+	} while (!page && hugetlb_next_nid != start_nid);
+
+	return ret;
+}
+
+static struct page *alloc_buddy_huge_page(struct vm_area_struct *vma,
+						unsigned long address)
+{
+	struct page *page;
+
+	/* Check if the dynamic pool is enabled */
+	if (!hugetlb_dynamic_pool)
+		return NULL;
+
+	page = alloc_pages(htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN,
 				HUGETLB_PAGE_ORDER);
 	if (page) {
 		set_compound_page_dtor(page, free_huge_page);
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
+		surplus_huge_pages++;
+		surplus_huge_pages_node[page_to_nid(page)]++;
 		spin_unlock(&hugetlb_lock);
-		put_page(page); /* free it into the hugepage allocator */
-		return 1;
 	}
-	return 0;
+
+	return page;
+}
+
+/*
+ * Increase the hugetlb pool such that it can accommodate a reservation
+ * of size 'delta'.
+ */
+static int gather_surplus_pages(int delta)
+{
+	struct list_head surplus_list;
+	struct page *page, *tmp;
+	int ret, i;
+	int needed, allocated;
+
+	needed = (resv_huge_pages + delta) - free_huge_pages;
+	if (needed <= 0)
+		return 0;
+
+	allocated = 0;
+	INIT_LIST_HEAD(&surplus_list);
+
+	ret = -ENOMEM;
+retry:
+	spin_unlock(&hugetlb_lock);
+	for (i = 0; i < needed; i++) {
+		page = alloc_buddy_huge_page(NULL, 0);
+		if (!page) {
+			/*
+			 * We were not able to allocate enough pages to
+			 * satisfy the entire reservation so we free what
+			 * we've allocated so far.
+			 */
+			spin_lock(&hugetlb_lock);
+			needed = 0;
+			goto free;
+		}
+
+		list_add(&page->lru, &surplus_list);
+	}
+	allocated += needed;
+
+	/*
+	 * After retaking hugetlb_lock, we need to recalculate 'needed'
+	 * because either resv_huge_pages or free_huge_pages may have changed.
+	 */
+	spin_lock(&hugetlb_lock);
+	needed = (resv_huge_pages + delta) - (free_huge_pages + allocated);
+	if (needed > 0)
+		goto retry;
+
+	/*
+	 * The surplus_list now contains _at_least_ the number of extra pages
+	 * needed to accommodate the reservation.  Add the appropriate number
+	 * of pages to the hugetlb pool and free the extras back to the buddy
+	 * allocator.
+	 */
+	needed += allocated;
+	ret = 0;
+free:
+	list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
+		list_del(&page->lru);
+		if ((--needed) >= 0)
+			enqueue_huge_page(page);
+		else {
+			/*
+			 * Decrement the refcount and free the page using its
+			 * destructor.  This must be done with hugetlb_lock
+			 * unlocked which is safe because free_huge_page takes
+			 * hugetlb_lock before deciding how to free the page.
+			 */
+			spin_unlock(&hugetlb_lock);
+			put_page(page);
+			spin_lock(&hugetlb_lock);
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * When releasing a hugetlb pool reservation, any surplus pages that were
+ * allocated to satisfy the reservation must be explicitly freed if they were
+ * never used.
+ */
+void return_unused_surplus_pages(unsigned long unused_resv_pages)
+{
+	static int nid = -1;
+	struct page *page;
+	unsigned long nr_pages;
+
+	nr_pages = min(unused_resv_pages, surplus_huge_pages);
+
+	while (nr_pages) {
+		nid = next_node(nid, node_online_map);
+		if (nid == MAX_NUMNODES)
+			nid = first_node(node_online_map);
+
+		if (!surplus_huge_pages_node[nid])
+			continue;
+
+		if (!list_empty(&hugepage_freelists[nid])) {
+			page = list_entry(hugepage_freelists[nid].next,
+					  struct page, lru);
+			list_del(&page->lru);
+			update_and_free_page(page);
+			free_huge_pages--;
+			free_huge_pages_node[nid]--;
+			surplus_huge_pages--;
+			surplus_huge_pages_node[nid]--;
+			nr_pages--;
+		}
+	}
 }
 
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
 				    unsigned long addr)
 {
-	struct page *page;
+	struct page *page = NULL;
+	int use_reserved_page = vma->vm_flags & VM_MAYSHARE;
 
 	spin_lock(&hugetlb_lock);
-	if (vma->vm_flags & VM_MAYSHARE)
-		resv_huge_pages--;
-	else if (free_huge_pages <= resv_huge_pages)
+	if (!use_reserved_page && (free_huge_pages <= resv_huge_pages))
 		goto fail;
 
 	page = dequeue_huge_page(vma, addr);
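
The retry accounting in gather_surplus_pages() above is easiest to follow with concrete numbers. Below is a small userspace model of the arithmetic only (plain integers stand in for the locked kernel counters, and every allocation is assumed to succeed), showing that the pool ends up with at least resv + delta free pages:

	#include <stdio.h>

	/* Userspace model of the gather_surplus_pages() arithmetic. */
	int main(void)
	{
		int resv_huge_pages = 3;	/* outstanding reservations */
		int free_huge_pages = 2;	/* free pages in the pool */
		int delta = 4;			/* size of the new reservation */
		int allocated = 0;
		int needed = (resv_huge_pages + delta) - free_huge_pages;

		while (needed > 0) {
			/* the alloc_buddy_huge_page() loop: grab 'needed' pages */
			allocated += needed;
			/* recalculated under the lock in the kernel; with no
			 * racing changes this second pass yields 0 and we stop */
			needed = (resv_huge_pages + delta) -
				 (free_huge_pages + allocated);
		}

		/* pages to keep; any excess goes back to the buddy allocator */
		needed += allocated;
		free_huge_pages += needed;

		printf("allocated %d, kept %d, free now %d (>= resv + delta = %d)\n",
		       allocated, needed, free_huge_pages,
		       resv_huge_pages + delta);
		return 0;
	}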
@@ -155,10 +372,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;
 
 fail:
-	if (vma->vm_flags & VM_MAYSHARE)
-		resv_huge_pages++;
 	spin_unlock(&hugetlb_lock);
-	return NULL;
+
+	/*
+	 * Private mappings do not use reserved huge pages so the allocation
+	 * may have failed due to an undersized hugetlb pool. Try to grab a
+	 * surplus huge page from the buddy allocator.
+	 */
+	if (!use_reserved_page)
+		page = alloc_buddy_huge_page(vma, addr);
+
+	return page;
 }
 
 static int __init hugetlb_init(void)
@@ -171,6 +395,8 @@ static int __init hugetlb_init(void)
 	for (i = 0; i < MAX_NUMNODES; ++i)
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
 
+	hugetlb_next_nid = first_node(node_online_map);
+
 	for (i = 0; i < max_huge_pages; ++i) {
 		if (!alloc_fresh_huge_page())
 			break;
@@ -201,21 +427,6 @@ static unsigned int cpuset_mems_nr(unsigned int *array)
 }
 
 #ifdef CONFIG_SYSCTL
-static void update_and_free_page(struct page *page)
-{
-	int i;
-	nr_huge_pages--;
-	nr_huge_pages_node[page_to_nid(page)]--;
-	for (i = 0; i < (HPAGE_SIZE / PAGE_SIZE); i++) {
-		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
-				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
-				1 << PG_private | 1 << PG_writeback);
-	}
-	set_compound_page_dtor(page, NULL);
-	set_page_refcounted(page);
-	__free_pages(page, HUGETLB_PAGE_ORDER);
-}
-
 #ifdef CONFIG_HIGHMEM
 static void try_to_free_low(unsigned long count)
 {
@@ -224,14 +435,14 @@ static void try_to_free_low(unsigned long count)
 	for (i = 0; i < MAX_NUMNODES; ++i) {
 		struct page *page, *next;
 		list_for_each_entry_safe(page, next, &hugepage_freelists[i], lru) {
+			if (count >= nr_huge_pages)
+				return;
 			if (PageHighMem(page))
 				continue;
 			list_del(&page->lru);
 			update_and_free_page(page);
 			free_huge_pages--;
 			free_huge_pages_node[page_to_nid(page)]--;
-			if (count >= nr_huge_pages)
-				return;
 		}
 	}
 }
@@ -241,26 +452,61 @@ static inline void try_to_free_low(unsigned long count)
 }
 #endif
 
+#define persistent_huge_pages (nr_huge_pages - surplus_huge_pages)
 static unsigned long set_max_huge_pages(unsigned long count)
 {
-	while (count > nr_huge_pages) {
-		if (!alloc_fresh_huge_page())
-			return nr_huge_pages;
-	}
-	if (count >= nr_huge_pages)
-		return nr_huge_pages;
+	unsigned long min_count, ret;
 
+	/*
+	 * Increase the pool size
+	 * First take pages out of surplus state.  Then make up the
+	 * remaining difference by allocating fresh huge pages.
+	 */
 	spin_lock(&hugetlb_lock);
-	count = max(count, resv_huge_pages);
-	try_to_free_low(count);
-	while (count < nr_huge_pages) {
+	while (surplus_huge_pages && count > persistent_huge_pages) {
+		if (!adjust_pool_surplus(-1))
+			break;
+	}
+
+	while (count > persistent_huge_pages) {
+		int ret;
+		/*
+		 * If this allocation races such that we no longer need the
+		 * page, free_huge_page will handle it by freeing the page
+		 * and reducing the surplus.
+		 */
+		spin_unlock(&hugetlb_lock);
+		ret = alloc_fresh_huge_page();
+		spin_lock(&hugetlb_lock);
+		if (!ret)
+			goto out;
+
+	}
+
+	/*
+	 * Decrease the pool size
+	 * First return free pages to the buddy allocator (being careful
+	 * to keep enough around to satisfy reservations).  Then place
+	 * pages into surplus state as needed so the pool will shrink
+	 * to the desired size as pages become free.
+	 */
+	min_count = resv_huge_pages + nr_huge_pages - free_huge_pages;
+	min_count = max(count, min_count);
+	try_to_free_low(min_count);
+	while (min_count < persistent_huge_pages) {
 		struct page *page = dequeue_huge_page(NULL, 0);
 		if (!page)
 			break;
 		update_and_free_page(page);
 	}
+	while (count < persistent_huge_pages) {
+		if (!adjust_pool_surplus(1))
+			break;
+	}
+out:
+	ret = persistent_huge_pages;
 	spin_unlock(&hugetlb_lock);
-	return nr_huge_pages;
+	return ret;
 }
 
 int hugetlb_sysctl_handler(struct ctl_table *table, int write,
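
The shrink path's min_count guard deserves a worked example: with 10 pages in the pool, 4 free, and 3 reserved, 6 pages are in use, so the pool may not drop below in-use plus reserved = 9 no matter how small the requested count is. A standalone sketch of that arithmetic:

	#include <stdio.h>

	/* Worked example of the set_max_huge_pages() shrink guard: the
	 * pool must keep every in-use page plus every reserved page. */
	int main(void)
	{
		unsigned long nr_huge_pages = 10;	/* total pool size */
		unsigned long free_huge_pages = 4;	/* so 6 pages are in use */
		unsigned long resv_huge_pages = 3;	/* promised to mappings */
		unsigned long count = 0;		/* admin asks for an empty pool */
		unsigned long min_count;

		min_count = resv_huge_pages + nr_huge_pages - free_huge_pages; /* 9 */
		if (count > min_count)
			min_count = count;

		/* only free - resv = 1 page can be released immediately; the
		 * rest become surplus and shrink away as users free them */
		printf("pool can shrink to %lu pages right away\n", min_count);
		return 0;
	}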
@@ -292,10 +538,12 @@ int hugetlb_report_meminfo(char *buf)
292 "HugePages_Total: %5lu\n" 538 "HugePages_Total: %5lu\n"
293 "HugePages_Free: %5lu\n" 539 "HugePages_Free: %5lu\n"
294 "HugePages_Rsvd: %5lu\n" 540 "HugePages_Rsvd: %5lu\n"
541 "HugePages_Surp: %5lu\n"
295 "Hugepagesize: %5lu kB\n", 542 "Hugepagesize: %5lu kB\n",
296 nr_huge_pages, 543 nr_huge_pages,
297 free_huge_pages, 544 free_huge_pages,
298 resv_huge_pages, 545 resv_huge_pages,
546 surplus_huge_pages,
299 HPAGE_SIZE/1024); 547 HPAGE_SIZE/1024);
300} 548}
301 549
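
With the extra line in /proc/meminfo, the surplus count becomes visible to userspace. A small reader for the new field (field name taken from the format string above):

	#include <stdio.h>
	#include <string.h>

	/* Print the HugePages_Surp line emitted by hugetlb_report_meminfo(). */
	int main(void)
	{
		char line[128];
		FILE *f = fopen("/proc/meminfo", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		while (fgets(line, sizeof(line), f)) {
			if (!strncmp(line, "HugePages_Surp:", 15)) {
				fputs(line, stdout);
				break;
			}
		}
		fclose(f);
		return 0;
	}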
@@ -355,7 +603,6 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma,
 	entry = pte_mkwrite(pte_mkdirty(*ptep));
 	if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
 		update_mmu_cache(vma, address, entry);
-		lazy_mmu_prot_update(entry);
 	}
 }
 
@@ -708,7 +955,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 			pte = huge_ptep_get_and_clear(mm, address, ptep);
 			pte = pte_mkhuge(pte_modify(pte, newprot));
 			set_huge_pte_at(mm, address, ptep, pte);
-			lazy_mmu_prot_update(pte);
 		}
 	}
 	spin_unlock(&mm->page_table_lock);
@@ -843,21 +1089,6 @@ static int hugetlb_acct_memory(long delta)
 	int ret = -ENOMEM;
 
 	spin_lock(&hugetlb_lock);
-	if ((delta + resv_huge_pages) <= free_huge_pages) {
-		resv_huge_pages += delta;
-		ret = 0;
-	}
-	spin_unlock(&hugetlb_lock);
-	return ret;
-}
-
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
-{
-	long ret, chg;
-
-	chg = region_chg(&inode->i_mapping->private_list, from, to);
-	if (chg < 0)
-		return chg;
 	/*
 	 * When cpuset is configured, it breaks the strict hugetlb page
 	 * reservation as the accounting is done on a global variable. Such
@@ -875,8 +1106,31 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
 	 * a best attempt and hopefully to minimize the impact of changing
 	 * semantics that cpuset has.
 	 */
-	if (chg > cpuset_mems_nr(free_huge_pages_node))
-		return -ENOMEM;
+	if (delta > 0) {
+		if (gather_surplus_pages(delta) < 0)
+			goto out;
+
+		if (delta > cpuset_mems_nr(free_huge_pages_node))
+			goto out;
+	}
+
+	ret = 0;
+	resv_huge_pages += delta;
+	if (delta < 0)
+		return_unused_surplus_pages((unsigned long) -delta);
+
+out:
+	spin_unlock(&hugetlb_lock);
+	return ret;
+}
+
+int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+{
+	long ret, chg;
+
+	chg = region_chg(&inode->i_mapping->private_list, from, to);
+	if (chg < 0)
+		return chg;
 
 	ret = hugetlb_acct_memory(chg);
 	if (ret < 0