Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  286
1 file changed, 233 insertions, 53 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d2c..ebad6bbb3501 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,24 +13,48 @@
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
+#include <linux/mutex.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
 #include <linux/hugetlb.h>
+#include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
-
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+static void clear_huge_page(struct page *page, unsigned long addr)
+{
+	int i;
+
+	might_sleep();
+	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
+		cond_resched();
+		clear_user_highpage(page + i, addr);
+	}
+}
+
+static void copy_huge_page(struct page *dst, struct page *src,
+			   unsigned long addr)
+{
+	int i;
+
+	might_sleep();
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
+		cond_resched();
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+	}
+}
+
 static void enqueue_huge_page(struct page *page)
 {
 	int nid = page_to_nid(page);
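/*
 * A minimal user-space sketch (not from this patch) of the pattern
 * clear_huge_page() and copy_huge_page() use above: touching a
 * multi-megabyte hugepage in one go would monopolize the CPU, so the
 * kernel works one base page at a time with cond_resched() between
 * chunks.  sched_yield() stands in for cond_resched(); a 2MB hugepage
 * built from 4kB base pages is assumed.
 */
#include <sched.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE	4096UL
#define HPAGE_SIZE	(512 * PAGE_SIZE)

static void clear_huge_buffer(unsigned char *buf)
{
	unsigned long i;

	for (i = 0; i < HPAGE_SIZE / PAGE_SIZE; i++) {
		sched_yield();	/* let other runnable tasks in, as cond_resched() does */
		memset(buf + i * PAGE_SIZE, 0, PAGE_SIZE);
	}
}

int main(void)
{
	unsigned char *buf = malloc(HPAGE_SIZE);

	if (buf)
		clear_huge_buffer(buf);
	free(buf);
	return 0;
}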
@@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
-static struct page *alloc_fresh_huge_page(void)
+static void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	INIT_LIST_HEAD(&page->lru);
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	spin_unlock(&hugetlb_lock);
+}
+
+static int alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
 	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
 				HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % num_online_nodes();
+	nid = next_node(nid, node_online_map);
+	if (nid == MAX_NUMNODES)
+		nid = first_node(node_online_map);
 	if (page) {
+		page[1].lru.next = (void *)free_huge_page;	/* dtor */
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
 		spin_unlock(&hugetlb_lock);
+		put_page(page); /* free it into the hugepage allocator */
+		return 1;
 	}
-	return page;
+	return 0;
 }
 
-void free_huge_page(struct page *page)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+				    unsigned long addr)
 {
-	BUG_ON(page_count(page));
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct page *page;
+	int use_reserve = 0;
+	unsigned long idx;
 
-	INIT_LIST_HEAD(&page->lru);
-	page[1].lru.next = NULL;	/* reset dtor */
+	spin_lock(&hugetlb_lock);
+
+	if (vma->vm_flags & VM_MAYSHARE) {
+
+		/* idx = radix tree index, i.e. offset into file in
+		 * HPAGE_SIZE units */
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+		/* The hugetlbfs specific inode info stores the number
+		 * of "guaranteed available" (huge) pages.  That is,
+		 * the first 'prereserved_hpages' pages of the inode
+		 * are either already instantiated, or have been
+		 * pre-reserved (by hugetlb_reserve_for_inode()). Here
+		 * we're in the process of instantiating the page, so
+		 * we use this to determine whether to draw from the
+		 * pre-reserved pool or the truly free pool. */
+		if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+			use_reserve = 1;
+	}
+
+	if (!use_reserve) {
+		if (free_huge_pages <= reserved_huge_pages)
+			goto fail;
+	} else {
+		BUG_ON(reserved_huge_pages == 0);
+		reserved_huge_pages--;
+	}
+
+	page = dequeue_huge_page(vma, addr);
+	if (!page)
+		goto fail;
+
+	spin_unlock(&hugetlb_lock);
+	set_page_refcounted(page);
+	return page;
+
+ fail:
+	WARN_ON(use_reserve);	/* reserved allocations shouldn't fail */
+	spin_unlock(&hugetlb_lock);
+	return NULL;
+}
+
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode.  If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+			       unsigned long atleast)
+{
+	struct inode *inode = &info->vfs_inode;
+	unsigned long change_in_reserve = 0;
+	int ret = 0;
 
 	spin_lock(&hugetlb_lock);
-	enqueue_huge_page(page);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages >= atleast)
+		goto out;
+
+	/* Because we always call this on shared mappings, none of the
+	 * pages beyond info->prereserved_hpages can have been
+	 * instantiated, so we need to reserve all of them now. */
+	change_in_reserve = atleast - info->prereserved_hpages;
+
+	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	reserved_huge_pages += change_in_reserve;
+	info->prereserved_hpages = atleast;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
 	spin_unlock(&hugetlb_lock);
+
+	return ret;
 }
 
-struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool.  If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
+ * them.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+				  unsigned long atmost)
 {
+	struct inode *inode = &info->vfs_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long idx;
+	unsigned long change_in_reserve = 0;
 	struct page *page;
-	int i;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page(vma, addr);
-	if (!page) {
-		spin_unlock(&hugetlb_lock);
-		return NULL;
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages <= atmost)
+		goto out;
+
+	/* Count pages which were reserved, but not instantiated, and
+	 * which we can now release. */
+	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+		page = radix_tree_lookup(&mapping->page_tree, idx);
+		if (!page)
+			/* Pages which are already instantiated can't
+			 * be unreserved (and in fact have already
+			 * been removed from the reserved pool) */
+			change_in_reserve++;
 	}
+
+	BUG_ON(reserved_huge_pages < change_in_reserve);
+	reserved_huge_pages -= change_in_reserve;
+	info->prereserved_hpages = atmost;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
 	spin_unlock(&hugetlb_lock);
-	set_page_count(page, 1);
-	page[1].lru.next = (void *)free_huge_page;	/* set dtor */
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_user_highpage(&page[i], addr);
-	return page;
 }
 
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
-	struct page *page;
 
 	if (HPAGE_SHIFT == 0)
 		return 0;
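/*
 * A minimal user-space model (not from this patch) of the accounting
 * that alloc_huge_page() implements above.  free_pages and
 * reserved_pages stand in for free_huge_pages and reserved_huge_pages;
 * use_reserve corresponds to a fault at an index below the inode's
 * prereserved_hpages.  The invariant: unreserved allocations may never
 * dip into the reserve, so reserved faults cannot fail.
 */
#include <assert.h>
#include <stdio.h>

static unsigned long free_pages = 8, reserved_pages = 3;

static int model_alloc(int use_reserve)
{
	if (!use_reserve) {
		if (free_pages <= reserved_pages)
			return 0;	/* only reserved pages left: fail */
	} else {
		assert(reserved_pages > 0);	/* mirrors the BUG_ON() */
		reserved_pages--;	/* consume one unit of reserve */
	}
	free_pages--;			/* the dequeue_huge_page() step */
	return 1;
}

int main(void)
{
	int i, ok = 0, r;

	while (model_alloc(0))		/* unreserved faults */
		ok++;
	/* prints: unreserved allocs: 5, free=3, reserved=3 */
	printf("unreserved allocs: %d, free=%lu, reserved=%lu\n",
	       ok, free_pages, reserved_pages);
	for (i = 0; i < 3; i++) {	/* the reserved faults still succeed */
		r = model_alloc(1);
		assert(r);
	}
	return 0;
}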
@@ -123,12 +266,8 @@ static int __init hugetlb_init(void)
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
 
 	for (i = 0; i < max_huge_pages; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
+		if (!alloc_fresh_huge_page())
 			break;
-		spin_lock(&hugetlb_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&hugetlb_lock);
 	}
 	max_huge_pages = free_huge_pages = nr_huge_pages = i;
 	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page)
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1<< PG_writeback);
-		set_page_count(&page[i], 0);
 	}
-	set_page_count(page, 1);
+	page[1].lru.next = NULL;
+	set_page_refcounted(page);
 	__free_pages(page, HUGETLB_PAGE_ORDER);
 }
 
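/*
 * A sketch (not from this patch) of the destructor convention the hunks
 * above rely on: a compound page has no dedicated destructor field in
 * this kernel, so a function pointer is parked in the otherwise-unused
 * page[1].lru.next.  alloc_fresh_huge_page() sets it to free_huge_page,
 * and update_and_free_page() clears it before handing the page back to
 * the buddy allocator.  A plain two-element array models the head page
 * and its first tail page.
 */
#include <stdio.h>

struct fake_page {
	void *lru_next;			/* stand-in for page->lru.next */
};

typedef void (*page_dtor_t)(struct fake_page *);

static void fake_free_huge_page(struct fake_page *head)
{
	printf("returning %p to the hugepage freelist\n", (void *)head);
}

static void last_put(struct fake_page *head)
{
	page_dtor_t dtor = (page_dtor_t)head[1].lru_next;

	if (dtor)
		dtor(head);		/* hugepage: back to the freelist */
	else
		printf("no dtor: free %p to the buddy allocator\n",
		       (void *)head);
}

int main(void)
{
	struct fake_page compound[2] = { { 0 }, { 0 } };

	compound[1].lru_next = (void *)fake_free_huge_page;	/* set dtor */
	last_put(compound);		/* goes to the hugepage freelist */
	compound[1].lru_next = NULL;	/* update_and_free_page() reset */
	last_put(compound);		/* now a plain buddy free */
	return 0;
}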
@@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count)
 static unsigned long set_max_huge_pages(unsigned long count)
 {
 	while (count > nr_huge_pages) {
-		struct page *page = alloc_fresh_huge_page();
-		if (!page)
+		if (!alloc_fresh_huge_page())
 			return nr_huge_pages;
-		spin_lock(&hugetlb_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&hugetlb_lock);
 	}
 	if (count >= nr_huge_pages)
 		return nr_huge_pages;
@@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf)
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
+			"HugePages_Rsvd:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
 			nr_huge_pages,
 			free_huge_pages,
+			reserved_huge_pages,
 			HPAGE_SIZE/1024);
 }
 
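/*
 * With the hunk above applied, the reserve becomes visible as a new
 * HugePages_Rsvd line in /proc/meminfo.  A minimal way to watch the
 * counters from user space (assuming a kernel carrying this change):
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "HugePages", 9) ||
		    !strncmp(line, "Hugepagesize", 12))
			fputs(line, stdout);	/* Total, Free, Rsvd, size */
	fclose(f);
	return 0;
}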
@@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 			nid, free_huge_pages_node[nid]);
 }
 
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
-
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
@@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *ptep, pte_t pte)
 {
 	struct page *old_page, *new_page;
-	int i, avoidcopy;
+	int avoidcopy;
 
 	old_page = pte_page(pte);
 
@@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_unlock(&mm->page_table_lock);
-	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
-		copy_user_highpage(new_page + i, old_page + i,
-				   address + i*PAGE_SIZE);
+	copy_huge_page(new_page, old_page, address);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -442,6 +572,7 @@ retry:
 		ret = VM_FAULT_OOM;
 		goto out;
 	}
+	clear_huge_page(page, address);
 
 	if (vma->vm_flags & VM_SHARED) {
 		int err;
@@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
 		return VM_FAULT_OOM;
 
+	/*
+	 * Serialize hugepage allocation and instantiation, so that we don't
+	 * get spurious allocation failures if two CPUs race to instantiate
+	 * the same page in the page cache.
+	 */
+	mutex_lock(&hugetlb_instantiation_mutex);
 	entry = *ptep;
-	if (pte_none(entry))
-		return hugetlb_no_page(mm, vma, address, ptep, write_access);
+	if (pte_none(entry)) {
+		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
+		mutex_unlock(&hugetlb_instantiation_mutex);
+		return ret;
+	}
 
 	ret = VM_FAULT_MINOR;
 
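/*
 * A deterministic replay (not from this patch) of the race that
 * hugetlb_instantiation_mutex closes.  Both CPUs observe pte_none()
 * before either instantiates the page, so with a single free hugepage
 * the loser sees a spurious allocation failure (a SIGBUS for the
 * application) even though the page it wanted was already being set up
 * by the winner.
 */
#include <stdio.h>

static int pool = 1;		/* one free hugepage */
static int instantiated;	/* the shared page-cache slot */

static int pte_is_none(void)
{
	return !instantiated;
}

static int instantiate(void)
{
	if (pool == 0)
		return 0;	/* allocation failure */
	pool--;
	instantiated = 1;
	return 1;
}

int main(void)
{
	/* Unserialized interleaving: check, check, allocate, allocate. */
	int cpu_a_faults = pte_is_none();
	int cpu_b_faults = pte_is_none();	/* B also sees an empty pte */

	if (cpu_a_faults && !instantiate())
		puts("CPU A: spurious failure");
	if (cpu_b_faults && !instantiate())
		puts("CPU B: spurious failure");	/* pool is empty now */
	return 0;
}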
@@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (write_access && !pte_write(entry))
 		ret = hugetlb_cow(mm, vma, address, ptep, entry);
 	spin_unlock(&mm->page_table_lock);
+	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
 }
@@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct page **pages, struct vm_area_struct **vmas,
 		unsigned long *position, int *length, int i)
 {
-	unsigned long vpfn, vaddr = *position;
+	unsigned long pfn_offset;
+	unsigned long vaddr = *position;
 	int remainder = *length;
 
-	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
@@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			break;
 		}
 
-		if (pages) {
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-			get_page(page);
-			pages[i] = page;
-		}
+		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		page = pte_page(*pte);
+same_page:
+		get_page(page);
+		if (pages)
+			pages[i] = page + pfn_offset;
 
 		if (vmas)
 			vmas[i] = vma;
 
 		vaddr += PAGE_SIZE;
-		++vpfn;
+		++pfn_offset;
 		--remainder;
 		++i;
+		if (vaddr < vma->vm_end && remainder &&
+		    pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+			/*
+			 * We use pfn_offset to avoid touching the pageframes
+			 * of this compound page.
+			 */
+			goto same_page;
+		}
 	}
 	spin_unlock(&mm->page_table_lock);
 	*length = remainder;
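/*
 * A sketch (not from this patch) of the subpage arithmetic behind the
 * same_page loop above, assuming 4kB base pages and 2MB hugepages
 * (HPAGE_SHIFT = 21, as on x86 PAE).  Every address inside one hugepage
 * shares a single pte_page(); pfn_offset picks the struct page of the
 * individual 4kB subpage, so the loop never re-walks the page tables.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define HPAGE_SHIFT	21
#define HPAGE_SIZE	(1UL << HPAGE_SHIFT)
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))

int main(void)
{
	unsigned long base = 0x40000000UL;	/* hugepage-aligned vaddr */
	unsigned long offs[] = { 0, PAGE_SIZE, 5 * PAGE_SIZE,
				 HPAGE_SIZE - PAGE_SIZE };
	unsigned long i;

	for (i = 0; i < sizeof(offs) / sizeof(offs[0]); i++) {
		unsigned long vaddr = base + offs[i];
		unsigned long pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;

		/* prints subpage indices 0, 1, 5 and 511 */
		printf("vaddr %#lx -> subpage %lu of %lu\n",
		       vaddr, pfn_offset, HPAGE_SIZE / PAGE_SIZE);
	}
	return 0;
}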
@@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	return i;
 }
+
+void hugetlb_change_protection(struct vm_area_struct *vma,
+		unsigned long address, unsigned long end, pgprot_t newprot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long start = address;
+	pte_t *ptep;
+	pte_t pte;
+
+	BUG_ON(address >= end);
+	flush_cache_range(vma, address, end);
+
+	spin_lock(&mm->page_table_lock);
+	for (; address < end; address += HPAGE_SIZE) {
+		ptep = huge_pte_offset(mm, address);
+		if (!ptep)
+			continue;
+		if (!pte_none(*ptep)) {
+			pte = huge_ptep_get_and_clear(mm, address, ptep);
+			pte = pte_mkhuge(pte_modify(pte, newprot));
+			set_huge_pte_at(mm, address, ptep, pte);
+			lazy_mmu_prot_update(pte);
+		}
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	flush_tlb_range(vma, start, end);
+}
+