author     Chen, Kenneth W <kenneth.w.chen@intel.com>    2006-06-23 05:03:15 -0400
committer  Linus Torvalds <torvalds@g5.osdl.org>         2006-06-23 10:42:48 -0400
commit     a43a8c39bbb493c9e93f6764b350de2e33e18e92 (patch)
tree       a3f0042371810ce6d076751d8e403baaa3d2630e
parent     e8f03d02080b25f53cd6bba8dc3a297803f18c01 (diff)
[PATCH] tightening hugetlb strict accounting
Current hugetlb strict accounting for shared mappings always assumes the mapping starts at file offset zero and reserves pages between zero and the size of the file.  This assumption often reserves (or locks down) many more pages than necessary if the application maps at a non-zero file offset.  libhugetlbfs is one example that requires proper reservation for shared mappings starting at a non-zero offset.

This patch extends the reservation and hugetlb strict accounting to support any arbitrary pair of (offset, len), resulting in a much more robust and accurate scheme.  More importantly, it won't lock down any hugetlb pages outside the file mapping.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
-rw-r--r--   fs/hugetlbfs/inode.c        21
-rw-r--r--   include/linux/hugetlb.h      8
-rw-r--r--   mm/hugetlb.c               282
3 files changed, 173 insertions, 138 deletions
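
The heart of the new scheme is per-range accounting: each inode keeps a list of disjoint [from, to) intervals (in huge-page units) that are already reserved; region_chg() computes how many additional huge pages a new mapping would need, and region_add() merges the committed range into the list.  The standalone userspace C sketch below is not part of the patch; its array-backed store, MAX_REGIONS limit, and demo offsets are illustrative assumptions only (the kernel uses a linked list hung off inode->i_mapping->private_list), but it shows why a mapping at a non-zero offset is charged only for the pages it actually covers.

/*
 * Standalone sketch (NOT kernel code) of the interval accounting idea
 * behind region_chg()/region_add().  Hypothetical storage and values.
 */
#include <stdio.h>

#define MAX_REGIONS 16

struct file_region { long from, to; };          /* [from, to) in huge pages */

static struct file_region regions[MAX_REGIONS]; /* disjoint reserved ranges */
static int nregions;

/* How many extra huge pages must be reserved to cover [f, t)? */
static long region_chg(long f, long t)
{
        long covered = 0;

        for (int i = 0; i < nregions; i++) {
                long lo = regions[i].from > f ? regions[i].from : f;
                long hi = regions[i].to   < t ? regions[i].to   : t;

                if (hi > lo)
                        covered += hi - lo;     /* already reserved part */
        }
        return (t - f) - covered;
}

/* Commit [f, t): absorb every range it touches into one merged range. */
static void region_add(long f, long t)
{
        struct file_region keep[MAX_REGIONS];
        int n = 0;

        for (int i = 0; i < nregions; i++) {
                if (regions[i].to < f || regions[i].from > t) {
                        keep[n++] = regions[i]; /* disjoint: keep as-is */
                } else {                        /* overlapping: absorb */
                        if (regions[i].from < f)
                                f = regions[i].from;
                        if (regions[i].to > t)
                                t = regions[i].to;
                }
        }
        keep[n].from = f;
        keep[n].to = t;
        nregions = n + 1;
        for (int i = 0; i < nregions; i++)
                regions[i] = keep[i];
}

int main(void)
{
        /* Map 4 huge pages at huge-page offset 10: only 4 pages are
         * charged; nothing below offset 10 is locked down. */
        printf("charge for [10,14): %ld\n", region_chg(10, 14));  /* 4 */
        region_add(10, 14);

        /* An overlapping second mapping is charged only for the part
         * not already covered by an existing reservation. */
        printf("charge for [12,20): %ld\n", region_chg(12, 20));  /* 6 */
        region_add(12, 20);
        return 0;
}

Under the old prereserved_hpages scheme the same shared mapping would have been accounted from file offset zero, reserving huge pages the mapping never touches; the per-range accounting only ever charges the covered interval.
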
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 678fc72c3646..e6410d8edd0e 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -59,7 +59,6 @@ static void huge_pagevec_release(struct pagevec *pvec)
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
         struct inode *inode = file->f_dentry->d_inode;
-        struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
         loff_t len, vma_len;
         int ret;
 
@@ -87,9 +86,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
         if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
                 goto out;
 
-        if (vma->vm_flags & VM_MAYSHARE)
-                if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
-                        goto out;
+        if (vma->vm_flags & VM_MAYSHARE &&
+            hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
+                                  len >> HPAGE_SHIFT))
+                goto out;
 
         ret = 0;
         hugetlb_prefault_arch_hook(vma->vm_mm);
@@ -195,12 +195,8 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
         const pgoff_t start = lstart >> HPAGE_SHIFT;
         struct pagevec pvec;
         pgoff_t next;
-        int i;
+        int i, freed = 0;
 
-        hugetlb_truncate_reservation(HUGETLBFS_I(inode),
-                                     lstart >> HPAGE_SHIFT);
-        if (!mapping->nrpages)
-                return;
         pagevec_init(&pvec, 0);
         next = start;
         while (1) {
@@ -221,10 +217,12 @@ static void truncate_hugepages(struct inode *inode, loff_t lstart)
                         truncate_huge_page(page);
                         unlock_page(page);
                         hugetlb_put_quota(mapping);
+                        freed++;
                 }
                 huge_pagevec_release(&pvec);
         }
         BUG_ON(!lstart && mapping->nrpages);
+        hugetlb_unreserve_pages(inode, start, freed);
 }
 
 static void hugetlbfs_delete_inode(struct inode *inode)
@@ -366,6 +364,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
         inode->i_mapping->a_ops = &hugetlbfs_aops;
         inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
         inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+        INIT_LIST_HEAD(&inode->i_mapping->private_list);
         info = HUGETLBFS_I(inode);
         mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL);
         switch (mode & S_IFMT) {
@@ -538,7 +537,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
                 hugetlbfs_inc_free_inodes(sbinfo);
                 return NULL;
         }
-        p->prereserved_hpages = 0;
         return &p->vfs_inode;
 }
 
@@ -781,8 +779,7 @@ struct file *hugetlb_zero_setup(size_t size)
                 goto out_file;
 
         error = -ENOMEM;
-        if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
-                                       size >> HPAGE_SHIFT) != 0)
+        if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
                 goto out_inode;
 
         d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 4c5e610fe442..c25a38d8f600 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -23,6 +23,8 @@ int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         unsigned long address, int write_access);
+int hugetlb_reserve_pages(struct inode *inode, long from, long to);
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long max_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
@@ -139,8 +141,6 @@ struct hugetlbfs_sb_info {
 
 struct hugetlbfs_inode_info {
         struct shared_policy policy;
-        /* Protected by the (global) hugetlb_lock */
-        unsigned long prereserved_hpages;
         struct inode vfs_inode;
 };
 
@@ -157,10 +157,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_zero_setup(size_t);
-int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
-                               unsigned long atleast_hpages);
-void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
-                                  unsigned long atmost_hpages);
 int hugetlb_get_quota(struct address_space *mapping);
 void hugetlb_put_quota(struct address_space *mapping);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca038..df499973255f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void)
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                     unsigned long addr)
 {
-        struct inode *inode = vma->vm_file->f_dentry->d_inode;
         struct page *page;
-        int use_reserve = 0;
-        unsigned long idx;
 
         spin_lock(&hugetlb_lock);
-
-        if (vma->vm_flags & VM_MAYSHARE) {
-
-                /* idx = radix tree index, i.e. offset into file in
-                 * HPAGE_SIZE units */
-                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-
-                /* The hugetlbfs specific inode info stores the number
-                 * of "guaranteed available" (huge) pages.  That is,
-                 * the first 'prereserved_hpages' pages of the inode
-                 * are either already instantiated, or have been
-                 * pre-reserved (by hugetlb_reserve_for_inode()). Here
-                 * we're in the process of instantiating the page, so
-                 * we use this to determine whether to draw from the
-                 * pre-reserved pool or the truly free pool. */
-                if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
-                        use_reserve = 1;
-        }
-
-        if (!use_reserve) {
-                if (free_huge_pages <= reserved_huge_pages)
-                        goto fail;
-        } else {
-                BUG_ON(reserved_huge_pages == 0);
-                reserved_huge_pages--;
-        }
+        if (vma->vm_flags & VM_MAYSHARE)
+                resv_huge_pages--;
+        else if (free_huge_pages <= resv_huge_pages)
+                goto fail;
 
         page = dequeue_huge_page(vma, addr);
         if (!page)
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         set_page_refcounted(page);
         return page;
 
- fail:
-        WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+fail:
         spin_unlock(&hugetlb_lock);
         return NULL;
 }
 
-/* hugetlb_extend_reservation()
- *
- * Ensure that at least 'atleast' hugepages are, and will remain,
- * available to instantiate the first 'atleast' pages of the given
- * inode.  If the inode doesn't already have this many pages reserved
- * or instantiated, set aside some hugepages in the reserved pool to
- * satisfy later faults (or fail now if there aren't enough, rather
- * than getting the SIGBUS later).
- */
-int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
-                               unsigned long atleast)
-{
-        struct inode *inode = &info->vfs_inode;
-        unsigned long change_in_reserve = 0;
-        int ret = 0;
-
-        spin_lock(&hugetlb_lock);
-        read_lock_irq(&inode->i_mapping->tree_lock);
-
-        if (info->prereserved_hpages >= atleast)
-                goto out;
-
-        /* Because we always call this on shared mappings, none of the
-         * pages beyond info->prereserved_hpages can have been
-         * instantiated, so we need to reserve all of them now. */
-        change_in_reserve = atleast - info->prereserved_hpages;
-
-        if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
-                ret = -ENOMEM;
-                goto out;
-        }
-
-        reserved_huge_pages += change_in_reserve;
-        info->prereserved_hpages = atleast;
-
- out:
-        read_unlock_irq(&inode->i_mapping->tree_lock);
-        spin_unlock(&hugetlb_lock);
-
-        return ret;
-}
-
-/* hugetlb_truncate_reservation()
- *
- * This returns pages reserved for the given inode to the general free
- * hugepage pool.  If the inode has any pages prereserved, but not
- * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
- * them.
- */
-void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
-                                  unsigned long atmost)
-{
-        struct inode *inode = &info->vfs_inode;
-        struct address_space *mapping = inode->i_mapping;
-        unsigned long idx;
-        unsigned long change_in_reserve = 0;
-        struct page *page;
-
-        spin_lock(&hugetlb_lock);
-        read_lock_irq(&inode->i_mapping->tree_lock);
-
-        if (info->prereserved_hpages <= atmost)
-                goto out;
-
-        /* Count pages which were reserved, but not instantiated, and
-         * which we can now release. */
-        for (idx = atmost; idx < info->prereserved_hpages; idx++) {
-                page = radix_tree_lookup(&mapping->page_tree, idx);
-                if (!page)
-                        /* Pages which are already instantiated can't
-                         * be unreserved (and in fact have already
-                         * been removed from the reserved pool) */
-                        change_in_reserve++;
-        }
-
-        BUG_ON(reserved_huge_pages < change_in_reserve);
-        reserved_huge_pages -= change_in_reserve;
-        info->prereserved_hpages = atmost;
-
- out:
-        read_unlock_irq(&inode->i_mapping->tree_lock);
-        spin_unlock(&hugetlb_lock);
-}
-
 static int __init hugetlb_init(void)
 {
         unsigned long i;
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
                 return nr_huge_pages;
 
         spin_lock(&hugetlb_lock);
-        count = max(count, reserved_huge_pages);
+        count = max(count, resv_huge_pages);
         try_to_free_low(count);
         while (count < nr_huge_pages) {
                 struct page *page = dequeue_huge_page(NULL, 0);
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf)
         return sprintf(buf,
                         "HugePages_Total: %5lu\n"
                         "HugePages_Free:  %5lu\n"
                         "HugePages_Rsvd:  %5lu\n"
                         "Hugepagesize:    %5lu kB\n",
                         nr_huge_pages,
                         free_huge_pages,
-                        reserved_huge_pages,
+                        resv_huge_pages,
                         HPAGE_SIZE/1024);
 }
 
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
         flush_tlb_range(vma, start, end);
 }
 
+struct file_region {
+        struct list_head link;
+        long from;
+        long to;
+};
+
+static long region_add(struct list_head *head, long f, long t)
+{
+        struct file_region *rg, *nrg, *trg;
+
+        /* Locate the region we are either in or before. */
+        list_for_each_entry(rg, head, link)
+                if (f <= rg->to)
+                        break;
+
+        /* Round our left edge to the current segment if it encloses us. */
+        if (f > rg->from)
+                f = rg->from;
+
+        /* Check for and consume any regions we now overlap with. */
+        nrg = rg;
+        list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+                if (&rg->link == head)
+                        break;
+                if (rg->from > t)
+                        break;
+
+                /* If this area reaches higher then extend our area to
+                 * include it completely.  If this is not the first area
+                 * which we intend to reuse, free it. */
+                if (rg->to > t)
+                        t = rg->to;
+                if (rg != nrg) {
+                        list_del(&rg->link);
+                        kfree(rg);
+                }
+        }
+        nrg->from = f;
+        nrg->to = t;
+        return 0;
+}
+
+static long region_chg(struct list_head *head, long f, long t)
+{
+        struct file_region *rg, *nrg;
+        long chg = 0;
+
+        /* Locate the region we are before or in. */
+        list_for_each_entry(rg, head, link)
+                if (f <= rg->to)
+                        break;
+
+        /* If we are below the current region then a new region is required.
+         * Subtle, allocate a new region at the position but make it zero
+         * size such that we can guarantee to record the reservation. */
+        if (&rg->link == head || t < rg->from) {
+                nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+                if (!nrg)
+                        return -ENOMEM;
+                nrg->from = f;
+                nrg->to = f;
+                INIT_LIST_HEAD(&nrg->link);
+                list_add(&nrg->link, rg->link.prev);
+
+                return t - f;
+        }
+
+        /* Round our left edge to the current segment if it encloses us. */
+        if (f > rg->from)
+                f = rg->from;
+        chg = t - f;
+
+        /* Check for and consume any regions we now overlap with. */
+        list_for_each_entry(rg, rg->link.prev, link) {
+                if (&rg->link == head)
+                        break;
+                if (rg->from > t)
+                        return chg;
+
+                /* We overlap with this area, if it extends further than
+                 * us then we must extend ourselves.  Account for its
+                 * existing reservation. */
+                if (rg->to > t) {
+                        chg += rg->to - t;
+                        t = rg->to;
+                }
+                chg -= rg->to - rg->from;
+        }
+        return chg;
+}
+
+static long region_truncate(struct list_head *head, long end)
+{
+        struct file_region *rg, *trg;
+        long chg = 0;
+
+        /* Locate the region we are either in or before. */
+        list_for_each_entry(rg, head, link)
+                if (end <= rg->to)
+                        break;
+        if (&rg->link == head)
+                return 0;
+
+        /* If we are in the middle of a region then adjust it. */
+        if (end > rg->from) {
+                chg = rg->to - end;
+                rg->to = end;
+                rg = list_entry(rg->link.next, typeof(*rg), link);
+        }
+
+        /* Drop any remaining regions. */
+        list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+                if (&rg->link == head)
+                        break;
+                chg += rg->to - rg->from;
+                list_del(&rg->link);
+                kfree(rg);
+        }
+        return chg;
+}
+
+static int hugetlb_acct_memory(long delta)
+{
+        int ret = -ENOMEM;
+
+        spin_lock(&hugetlb_lock);
+        if ((delta + resv_huge_pages) <= free_huge_pages) {
+                resv_huge_pages += delta;
+                ret = 0;
+        }
+        spin_unlock(&hugetlb_lock);
+        return ret;
+}
+
+int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+{
+        long ret, chg;
+
+        chg = region_chg(&inode->i_mapping->private_list, from, to);
+        if (chg < 0)
+                return chg;
+        ret = hugetlb_acct_memory(chg);
+        if (ret < 0)
+                return ret;
+        region_add(&inode->i_mapping->private_list, from, to);
+        return 0;
+}
+
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+{
+        long chg = region_truncate(&inode->i_mapping->private_list, offset);
+        hugetlb_acct_memory(freed - chg);
+}