author     Chen, Kenneth W <kenneth.w.chen@intel.com>    2006-06-23 05:03:15 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>         2006-06-23 10:42:48 -0400
commit     a43a8c39bbb493c9e93f6764b350de2e33e18e92 (patch)
tree       a3f0042371810ce6d076751d8e403baaa3d2630e
parent     e8f03d02080b25f53cd6bba8dc3a297803f18c01 (diff)
[PATCH] tightening hugetlb strict accounting
Current hugetlb strict accounting for shared mappings always assumes the mapping starts at file offset zero and reserves pages between zero and the size of the file.  This assumption often reserves (or locks down) many more pages than necessary if the application maps at a non-zero file offset.  libhugetlbfs is one example that requires proper reservation for shared mappings starting at a non-zero offset.

This patch extends the reservation and hugetlb strict accounting to support any arbitrary pair of (offset, len), resulting in a much more robust and accurate scheme.  More importantly, it won't lock down any hugetlb pages outside the file mapping.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--   fs/hugetlbfs/inode.c        21
-rw-r--r--   include/linux/hugetlb.h      8
-rw-r--r--   mm/hugetlb.c               282
3 files changed, 173 insertions, 138 deletions
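
The heart of the new scheme is per-range accounting: each inode keeps a list of disjoint [from, to) intervals (in huge-page units) that are already reserved; region_chg() computes how many additional huge pages a new mapping would need, and region_add() merges the committed range into the list.  The standalone userspace C sketch below is not part of the patch; its array-backed store, MAX_REGIONS limit, and demo offsets are illustrative assumptions only (the kernel uses a linked list hung off inode->i_mapping->private_list), but it shows why a mapping at a non-zero offset is charged only for the pages it actually covers.

/*
 * Standalone sketch (NOT kernel code) of the interval accounting idea
 * behind region_chg()/region_add().  Hypothetical storage and values.
 */
#include <stdio.h>

#define MAX_REGIONS 16

struct file_region { long from, to; };          /* [from, to) in huge pages */

static struct file_region regions[MAX_REGIONS]; /* disjoint reserved ranges */
static int nregions;

/* How many extra huge pages must be reserved to cover [f, t)? */
static long region_chg(long f, long t)
{
        long covered = 0;

        for (int i = 0; i < nregions; i++) {
                long lo = regions[i].from > f ? regions[i].from : f;
                long hi = regions[i].to   < t ? regions[i].to   : t;

                if (hi > lo)
                        covered += hi - lo;     /* already reserved part */
        }
        return (t - f) - covered;
}

/* Commit [f, t): absorb every range it touches into one merged range. */
static void region_add(long f, long t)
{
        struct file_region keep[MAX_REGIONS];
        int n = 0;

        for (int i = 0; i < nregions; i++) {
                if (regions[i].to < f || regions[i].from > t) {
                        keep[n++] = regions[i]; /* disjoint: keep as-is */
                } else {                        /* overlapping: absorb */
                        if (regions[i].from < f)
                                f = regions[i].from;
                        if (regions[i].to > t)
                                t = regions[i].to;
                }
        }
        keep[n].from = f;
        keep[n].to = t;
        nregions = n + 1;
        for (int i = 0; i < nregions; i++)
                regions[i] = keep[i];
}

int main(void)
{
        /* Map 4 huge pages at huge-page offset 10: only 4 pages are
         * charged; nothing below offset 10 is locked down. */
        printf("charge for [10,14): %ld\n", region_chg(10, 14));  /* 4 */
        region_add(10, 14);

        /* An overlapping second mapping is charged only for the part
         * not already covered by an existing reservation. */
        printf("charge for [12,20): %ld\n", region_chg(12, 20));  /* 6 */
        region_add(12, 20);
        return 0;
}

Under the old prereserved_hpages scheme the same shared mapping would have been accounted from file offset zero, reserving huge pages the mapping never touches; the per-range accounting only ever charges the covered interval.
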
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 678fc72c3646..e6410d8edd0e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -59,7 +59,6 @@ static void huge_pagevec_release(struct pagevec *pvec)
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
         struct inode *inode = file->f_dentry->d_inode;
-        struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
         loff_t len, vma_len;
         int ret;
 
@@ -87,9 +86,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
                 goto out;
 
-        if (vma->vm_flags & VM_MAYSHARE)
-                if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
-                        goto out;
+        if (vma->vm_flags & VM_MAYSHARE &&
+            hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
+                                  len >> HPAGE_SHIFT))
+                goto out;
 
         ret = 0;
         hugetlb_prefault_arch_hook(vma->vm_mm);
@@ -195,12 +195,8 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
         const pgoff_t start = lstart >> HPAGE_SHIFT;
         struct pagevec pvec;
         pgoff_t next;
-        int i;
+        int i, freed = 0;
 
-        hugetlb_truncate_reservation(HUGETLBFS_I(inode),
-                                     lstart >> HPAGE_SHIFT);
-        if (!mapping->nrpages)
-                return;
         pagevec_init(&pvec, 0);
         next = start;
         while (1) {
@@ -221,10 +217,12 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
                         truncate_huge_page(page);
                         unlock_page(page);
                         hugetlb_put_quota(mapping);
+                        freed++;
                 }
                 huge_pagevec_release(&pvec);
         }
         BUG_ON(!lstart && mapping->nrpages);
+        hugetlb_unreserve_pages(inode, start, freed);
 }
 
 static void hugetlbfs_delete_inode(struct inode *inode)
@@ -366,6 +364,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
         inode->i_mapping->a_ops = &hugetlbfs_aops;
         inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
         inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        INIT_LIST_HEAD(&inode->i_mapping->private_list);
         info = HUGETLBFS_I(inode);
         mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
         switch (mode & S_IFMT) {
@@ -538,7 +537,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
                 hugetlbfs_inc_free_inodes(sbinfo);
                 return NULL;
         }
-        p->prereserved_hpages = 0;
         return &p->vfs_inode;
 }
 
@@ -781,8 +779,7 @@ struct file *hugetlb_zero_setup(size_t size)
                 goto out_file;
 
         error = -ENOMEM;
-        if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
-                                       size >> HPAGE_SHIFT) != 0)
+        if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
                 goto out_inode;
 
         d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4c5e610fe442..c25a38d8f600 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -23,6 +23,8 @@ int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         unsigned long address, int write_access);
+int hugetlb_reserve_pages(struct inode *inode, long from, long to);
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long max_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -139,8 +141,6 @@ struct hugetlbfs_sb_info {
 
 struct hugetlbfs_inode_info {
         struct shared_policy policy;
-        /* Protected by the (global) hugetlb_lock */
-        unsigned long prereserved_hpages;
         struct inode vfs_inode;
 };
 
@@ -157,10 +157,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_zero_setup(size_t);
-int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
-                               unsigned long atleast_hpages);
-void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
-                                  unsigned long atmost_hpages);
 int hugetlb_get_quota(struct address_space *mapping);
 void hugetlb_put_quota(struct address_space *mapping);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca038..df499973255f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void)
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                     unsigned long addr)
 {
-        struct inode *inode = vma->vm_file->f_dentry->d_inode;
         struct page *page;
-        int use_reserve = 0;
-        unsigned long idx;
 
         spin_lock(&hugetlb_lock);
-
-        if (vma->vm_flags & VM_MAYSHARE) {
-
-                /* idx = radix tree index, i.e. offset into file in
-                 * HPAGE_SIZE units */
-                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-
-                /* The hugetlbfs specific inode info stores the number
-                 * of "guaranteed available" (huge) pages.  That is,
-                 * the first 'prereserved_hpages' pages of the inode
-                 * are either already instantiated, or have been
-                 * pre-reserved (by hugetlb_reserve_for_inode()). Here
-                 * we're in the process of instantiating the page, so
-                 * we use this to determine whether to draw from the
-                 * pre-reserved pool or the truly free pool. */
-                if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
-                        use_reserve = 1;
-        }
-
-        if (!use_reserve) {
-                if (free_huge_pages <= reserved_huge_pages)
-                        goto fail;
-        } else {
-                BUG_ON(reserved_huge_pages == 0);
-                reserved_huge_pages--;
-        }
+        if (vma->vm_flags & VM_MAYSHARE)
+                resv_huge_pages--;
+        else if (free_huge_pages <= resv_huge_pages)
+                goto fail;
 
         page = dequeue_huge_page(vma, addr);
         if (!page)
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         set_page_refcounted(page);
         return page;
 
- fail:
-        WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+fail:
         spin_unlock(&hugetlb_lock);
         return NULL;
 }
 
-/* hugetlb_extend_reservation()
- *
- * Ensure that at least 'atleast' hugepages are, and will remain,
- * available to instantiate the first 'atleast' pages of the given
- * inode.  If the inode doesn't already have this many pages reserved
- * or instantiated, set aside some hugepages in the reserved pool to
- * satisfy later faults (or fail now if there aren't enough, rather
- * than getting the SIGBUS later).
- */
-int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
-                               unsigned long atleast)
-{
-        struct inode *inode = &info->vfs_inode;
-        unsigned long change_in_reserve = 0;
-        int ret = 0;
-
-        spin_lock(&hugetlb_lock);
-        read_lock_irq(&inode->i_mapping->tree_lock);
-
-        if (info->prereserved_hpages >= atleast)
-                goto out;
-
-        /* Because we always call this on shared mappings, none of the
-         * pages beyond info->prereserved_hpages can have been
-         * instantiated, so we need to reserve all of them now. */
-        change_in_reserve = atleast - info->prereserved_hpages;
-
-        if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
-                ret = -ENOMEM;
-                goto out;
-        }
-
-        reserved_huge_pages += change_in_reserve;
-        info->prereserved_hpages = atleast;
-
- out:
-        read_unlock_irq(&inode->i_mapping->tree_lock);
-        spin_unlock(&hugetlb_lock);
-
-        return ret;
-}
-
-/* hugetlb_truncate_reservation()
- *
- * This returns pages reserved for the given inode to the general free
- * hugepage pool.  If the inode has any pages prereserved, but not
- * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
- * them.
- */
-void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
-                                  unsigned long atmost)
-{
-        struct inode *inode = &info->vfs_inode;
-        struct address_space *mapping = inode->i_mapping;
-        unsigned long idx;
-        unsigned long change_in_reserve = 0;
-        struct page *page;
-
-        spin_lock(&hugetlb_lock);
-        read_lock_irq(&inode->i_mapping->tree_lock);
-
-        if (info->prereserved_hpages <= atmost)
-                goto out;
-
-        /* Count pages which were reserved, but not instantiated, and
-         * which we can now release. */
-        for (idx = atmost; idx < info->prereserved_hpages; idx++) {
-                page = radix_tree_lookup(&mapping->page_tree, idx);
-                if (!page)
-                        /* Pages which are already instantiated can't
-                         * be unreserved (and in fact have already
-                         * been removed from the reserved pool) */
-                        change_in_reserve++;
-        }
-
-        BUG_ON(reserved_huge_pages < change_in_reserve);
-        reserved_huge_pages -= change_in_reserve;
-        info->prereserved_hpages = atmost;
-
- out:
-        read_unlock_irq(&inode->i_mapping->tree_lock);
-        spin_unlock(&hugetlb_lock);
-}
-
 static int __init hugetlb_init(void)
 {
         unsigned long i;
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
                 return nr_huge_pages;
 
         spin_lock(&hugetlb_lock);
-        count = max(count, reserved_huge_pages);
+        count = max(count, resv_huge_pages);
         try_to_free_low(count);
         while (count < nr_huge_pages) {
                 struct page *page = dequeue_huge_page(NULL, 0);
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf)
         return sprintf(buf,
                         "HugePages_Total: %5lu\n"
                         "HugePages_Free:  %5lu\n"
                         "HugePages_Rsvd:  %5lu\n"
                         "Hugepagesize:    %5lu kB\n",
                         nr_huge_pages,
                         free_huge_pages,
-                        reserved_huge_pages,
+                        resv_huge_pages,
                         HPAGE_SIZE/1024);
 }
 
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
         flush_tlb_range(vma, start, end);
 }
 
+struct file_region {
+        struct list_head link;
+        long from;
+        long to;
+};
+
+static long region_add(struct list_head *head, long f, long t)
+{
+        struct file_region *rg, *nrg, *trg;
+
+        /* Locate the region we are either in or before. */
+        list_for_each_entry(rg, head, link)
+                if (f <= rg->to)
+                        break;
+
+        /* Round our left edge to the current segment if it encloses us. */
+        if (f > rg->from)
+                f = rg->from;
+
+        /* Check for and consume any regions we now overlap with. */
+        nrg = rg;
+        list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+                if (&rg->link == head)
+                        break;
+                if (rg->from > t)
+                        break;
+
+                /* If this area reaches higher then extend our area to
+                 * include it completely.  If this is not the first area
+                 * which we intend to reuse, free it. */
+                if (rg->to > t)
+                        t = rg->to;
+                if (rg != nrg) {
+                        list_del(&rg->link);
+                        kfree(rg);
+                }
+        }
+        nrg->from = f;
+        nrg->to = t;
+        return 0;
+}
+
+static long region_chg(struct list_head *head, long f, long t)
+{
+        struct file_region *rg, *nrg;
+        long chg = 0;
+
+        /* Locate the region we are before or in. */
+        list_for_each_entry(rg, head, link)
+                if (f <= rg->to)
+                        break;
+
+        /* If we are below the current region then a new region is required.
+         * Subtle, allocate a new region at the position but make it zero
+         * size such that we can guarantee to record the reservation. */
+        if (&rg->link == head || t < rg->from) {
+                nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+                if (!nrg)
+                        return -ENOMEM;
+                nrg->from = f;
+                nrg->to = f;
+                INIT_LIST_HEAD(&nrg->link);
+                list_add(&nrg->link, rg->link.prev);
+
+                return t - f;
+        }
+
+        /* Round our left edge to the current segment if it encloses us. */
+        if (f > rg->from)
+                f = rg->from;
+        chg = t - f;
+
+        /* Check for and consume any regions we now overlap with. */
+        list_for_each_entry(rg, rg->link.prev, link) {
+                if (&rg->link == head)
+                        break;
+                if (rg->from > t)
+                        return chg;
+
+                /* We overlap with this area, if it extends further than
+                 * us then we must extend ourselves.  Account for its
+                 * existing reservation. */
+                if (rg->to > t) {
+                        chg += rg->to - t;
+                        t = rg->to;
+                }
+                chg -= rg->to - rg->from;
+        }
+        return chg;
+}
+
+static long region_truncate(struct list_head *head, long end)
+{
+        struct file_region *rg, *trg;
+        long chg = 0;
+
+        /* Locate the region we are either in or before. */
+        list_for_each_entry(rg, head, link)
+                if (end <= rg->to)
+                        break;
+        if (&rg->link == head)
+                return 0;
+
+        /* If we are in the middle of a region then adjust it. */
+        if (end > rg->from) {
+                chg = rg->to - end;
+                rg->to = end;
+                rg = list_entry(rg->link.next, typeof(*rg), link);
+        }
+
+        /* Drop any remaining regions. */
+        list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+                if (&rg->link == head)
+                        break;
+                chg += rg->to - rg->from;
+                list_del(&rg->link);
+                kfree(rg);
+        }
+        return chg;
+}
+
+static int hugetlb_acct_memory(long delta)
+{
+        int ret = -ENOMEM;
+
+        spin_lock(&hugetlb_lock);
+        if ((delta + resv_huge_pages) <= free_huge_pages) {
+                resv_huge_pages += delta;
+                ret = 0;
+        }
+        spin_unlock(&hugetlb_lock);
+        return ret;
+}
+
+int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+{
+        long ret, chg;
+
+        chg = region_chg(&inode->i_mapping->private_list, from, to);
+        if (chg < 0)
+                return chg;
+        ret = hugetlb_acct_memory(chg);
+        if (ret < 0)
+                return ret;
+        region_add(&inode->i_mapping->private_list, from, to);
+        return 0;
+}
+
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+{
+        long chg = region_truncate(&inode->i_mapping->private_list, offset);
+        hugetlb_acct_memory(freed - chg);
+}