diff options
author | Chen, Kenneth W <kenneth.w.chen@intel.com> | 2006-06-23 05:03:15 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-06-23 10:42:48 -0400 |
commit | a43a8c39bbb493c9e93f6764b350de2e33e18e92 (patch) | |
tree | a3f0042371810ce6d076751d8e403baaa3d2630e /mm/hugetlb.c | |
parent | e8f03d02080b25f53cd6bba8dc3a297803f18c01 (diff) |
[PATCH] tightening hugetlb strict accounting
Current hugetlb strict accounting for shared mappings always assumes the
mapping starts at file offset zero and reserves pages between zero and the
size of the file. This assumption often reserves (or locks down) a lot more
pages than necessary if the application maps at a non-zero file offset.
libhugetlbfs is one example that requires proper reservation for shared
mappings that start at a non-zero offset.
This patch extends the reservation and hugetlb strict accounting to support
any arbitrary pair of (offset, len), resulting in a much more robust and
accurate scheme. More importantly, it won't lock down any hugetlb pages
outside the file mapping.
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Adam Litke <agl@us.ibm.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r-- | mm/hugetlb.c | 282 |
1 files changed, 162 insertions, 120 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 832f676ca03..df499973255 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -22,7 +22,7 @@ | |||
22 | #include "internal.h" | 22 | #include "internal.h" |
23 | 23 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; |
26 | unsigned long max_huge_pages; | 26 | unsigned long max_huge_pages; |
27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void) | |||
123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, | 123 | static struct page *alloc_huge_page(struct vm_area_struct *vma, |
124 | unsigned long addr) | 124 | unsigned long addr) |
125 | { | 125 | { |
126 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | ||
127 | struct page *page; | 126 | struct page *page; |
128 | int use_reserve = 0; | ||
129 | unsigned long idx; | ||
130 | 127 | ||
131 | spin_lock(&hugetlb_lock); | 128 | spin_lock(&hugetlb_lock); |
132 | 129 | if (vma->vm_flags & VM_MAYSHARE) | |
133 | if (vma->vm_flags & VM_MAYSHARE) { | 130 | resv_huge_pages--; |
134 | 131 | else if (free_huge_pages <= resv_huge_pages) | |
135 | /* idx = radix tree index, i.e. offset into file in | 132 | goto fail; |
136 | * HPAGE_SIZE units */ | ||
137 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
138 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
139 | |||
140 | /* The hugetlbfs specific inode info stores the number | ||
141 | * of "guaranteed available" (huge) pages. That is, | ||
142 | * the first 'prereserved_hpages' pages of the inode | ||
143 | * are either already instantiated, or have been | ||
144 | * pre-reserved (by hugetlb_reserve_for_inode()). Here | ||
145 | * we're in the process of instantiating the page, so | ||
146 | * we use this to determine whether to draw from the | ||
147 | * pre-reserved pool or the truly free pool. */ | ||
148 | if (idx < HUGETLBFS_I(inode)->prereserved_hpages) | ||
149 | use_reserve = 1; | ||
150 | } | ||
151 | |||
152 | if (!use_reserve) { | ||
153 | if (free_huge_pages <= reserved_huge_pages) | ||
154 | goto fail; | ||
155 | } else { | ||
156 | BUG_ON(reserved_huge_pages == 0); | ||
157 | reserved_huge_pages--; | ||
158 | } | ||
159 | 133 | ||
160 | page = dequeue_huge_page(vma, addr); | 134 | page = dequeue_huge_page(vma, addr); |
161 | if (!page) | 135 | if (!page) |
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, | |||
165 | set_page_refcounted(page); | 139 | set_page_refcounted(page); |
166 | return page; | 140 | return page; |
167 | 141 | ||
168 | fail: | 142 | fail: |
169 | WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ | ||
170 | spin_unlock(&hugetlb_lock); | 143 | spin_unlock(&hugetlb_lock); |
171 | return NULL; | 144 | return NULL; |
172 | } | 145 | } |
173 | 146 | ||
174 | /* hugetlb_extend_reservation() | ||
175 | * | ||
176 | * Ensure that at least 'atleast' hugepages are, and will remain, | ||
177 | * available to instantiate the first 'atleast' pages of the given | ||
178 | * inode. If the inode doesn't already have this many pages reserved | ||
179 | * or instantiated, set aside some hugepages in the reserved pool to | ||
180 | * satisfy later faults (or fail now if there aren't enough, rather | ||
181 | * than getting the SIGBUS later). | ||
182 | */ | ||
183 | int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, | ||
184 | unsigned long atleast) | ||
185 | { | ||
186 | struct inode *inode = &info->vfs_inode; | ||
187 | unsigned long change_in_reserve = 0; | ||
188 | int ret = 0; | ||
189 | |||
190 | spin_lock(&hugetlb_lock); | ||
191 | read_lock_irq(&inode->i_mapping->tree_lock); | ||
192 | |||
193 | if (info->prereserved_hpages >= atleast) | ||
194 | goto out; | ||
195 | |||
196 | /* Because we always call this on shared mappings, none of the | ||
197 | * pages beyond info->prereserved_hpages can have been | ||
198 | * instantiated, so we need to reserve all of them now. */ | ||
199 | change_in_reserve = atleast - info->prereserved_hpages; | ||
200 | |||
201 | if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { | ||
202 | ret = -ENOMEM; | ||
203 | goto out; | ||
204 | } | ||
205 | |||
206 | reserved_huge_pages += change_in_reserve; | ||
207 | info->prereserved_hpages = atleast; | ||
208 | |||
209 | out: | ||
210 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
211 | spin_unlock(&hugetlb_lock); | ||
212 | |||
213 | return ret; | ||
214 | } | ||
215 | |||
216 | /* hugetlb_truncate_reservation() | ||
217 | * | ||
218 | * This returns pages reserved for the given inode to the general free | ||
219 | * hugepage pool. If the inode has any pages prereserved, but not | ||
220 | * instantiated, beyond offset (atmost << HPAGE_SIZE), then release | ||
221 | * them. | ||
222 | */ | ||
223 | void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, | ||
224 | unsigned long atmost) | ||
225 | { | ||
226 | struct inode *inode = &info->vfs_inode; | ||
227 | struct address_space *mapping = inode->i_mapping; | ||
228 | unsigned long idx; | ||
229 | unsigned long change_in_reserve = 0; | ||
230 | struct page *page; | ||
231 | |||
232 | spin_lock(&hugetlb_lock); | ||
233 | read_lock_irq(&inode->i_mapping->tree_lock); | ||
234 | |||
235 | if (info->prereserved_hpages <= atmost) | ||
236 | goto out; | ||
237 | |||
238 | /* Count pages which were reserved, but not instantiated, and | ||
239 | * which we can now release. */ | ||
240 | for (idx = atmost; idx < info->prereserved_hpages; idx++) { | ||
241 | page = radix_tree_lookup(&mapping->page_tree, idx); | ||
242 | if (!page) | ||
243 | /* Pages which are already instantiated can't | ||
244 | * be unreserved (and in fact have already | ||
245 | * been removed from the reserved pool) */ | ||
246 | change_in_reserve++; | ||
247 | } | ||
248 | |||
249 | BUG_ON(reserved_huge_pages < change_in_reserve); | ||
250 | reserved_huge_pages -= change_in_reserve; | ||
251 | info->prereserved_hpages = atmost; | ||
252 | |||
253 | out: | ||
254 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
255 | spin_unlock(&hugetlb_lock); | ||
256 | } | ||
257 | |||
258 | static int __init hugetlb_init(void) | 147 | static int __init hugetlb_init(void) |
259 | { | 148 | { |
260 | unsigned long i; | 149 | unsigned long i; |
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count) | |||
334 | return nr_huge_pages; | 223 | return nr_huge_pages; |
335 | 224 | ||
336 | spin_lock(&hugetlb_lock); | 225 | spin_lock(&hugetlb_lock); |
337 | count = max(count, reserved_huge_pages); | 226 | count = max(count, resv_huge_pages); |
338 | try_to_free_low(count); | 227 | try_to_free_low(count); |
339 | while (count < nr_huge_pages) { | 228 | while (count < nr_huge_pages) { |
340 | struct page *page = dequeue_huge_page(NULL, 0); | 229 | struct page *page = dequeue_huge_page(NULL, 0); |
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf) | |||
361 | return sprintf(buf, | 250 | return sprintf(buf, |
362 | "HugePages_Total: %5lu\n" | 251 | "HugePages_Total: %5lu\n" |
363 | "HugePages_Free: %5lu\n" | 252 | "HugePages_Free: %5lu\n" |
364 | "HugePages_Rsvd: %5lu\n" | 253 | "HugePages_Rsvd: %5lu\n" |
365 | "Hugepagesize: %5lu kB\n", | 254 | "Hugepagesize: %5lu kB\n", |
366 | nr_huge_pages, | 255 | nr_huge_pages, |
367 | free_huge_pages, | 256 | free_huge_pages, |
368 | reserved_huge_pages, | 257 | resv_huge_pages, |
369 | HPAGE_SIZE/1024); | 258 | HPAGE_SIZE/1024); |
370 | } | 259 | } |
371 | 260 | ||
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma, | |||
754 | flush_tlb_range(vma, start, end); | 643 | flush_tlb_range(vma, start, end); |
755 | } | 644 | } |
756 | 645 | ||
646 | struct file_region { | ||
647 | struct list_head link; | ||
648 | long from; | ||
649 | long to; | ||
650 | }; | ||
651 | |||
652 | static long region_add(struct list_head *head, long f, long t) | ||
653 | { | ||
654 | struct file_region *rg, *nrg, *trg; | ||
655 | |||
656 | /* Locate the region we are either in or before. */ | ||
657 | list_for_each_entry(rg, head, link) | ||
658 | if (f <= rg->to) | ||
659 | break; | ||
660 | |||
661 | /* Round our left edge to the current segment if it encloses us. */ | ||
662 | if (f > rg->from) | ||
663 | f = rg->from; | ||
664 | |||
665 | /* Check for and consume any regions we now overlap with. */ | ||
666 | nrg = rg; | ||
667 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
668 | if (&rg->link == head) | ||
669 | break; | ||
670 | if (rg->from > t) | ||
671 | break; | ||
672 | |||
673 | /* If this area reaches higher then extend our area to | ||
674 | * include it completely. If this is not the first area | ||
675 | * which we intend to reuse, free it. */ | ||
676 | if (rg->to > t) | ||
677 | t = rg->to; | ||
678 | if (rg != nrg) { | ||
679 | list_del(&rg->link); | ||
680 | kfree(rg); | ||
681 | } | ||
682 | } | ||
683 | nrg->from = f; | ||
684 | nrg->to = t; | ||
685 | return 0; | ||
686 | } | ||
687 | |||
688 | static long region_chg(struct list_head *head, long f, long t) | ||
689 | { | ||
690 | struct file_region *rg, *nrg; | ||
691 | long chg = 0; | ||
692 | |||
693 | /* Locate the region we are before or in. */ | ||
694 | list_for_each_entry(rg, head, link) | ||
695 | if (f <= rg->to) | ||
696 | break; | ||
697 | |||
698 | /* If we are below the current region then a new region is required. | ||
699 | * Subtle, allocate a new region at the position but make it zero | ||
700 | * size such that we can guarentee to record the reservation. */ | ||
701 | if (&rg->link == head || t < rg->from) { | ||
702 | nrg = kmalloc(sizeof(*nrg), GFP_KERNEL); | ||
703 | if (nrg == 0) | ||
704 | return -ENOMEM; | ||
705 | nrg->from = f; | ||
706 | nrg->to = f; | ||
707 | INIT_LIST_HEAD(&nrg->link); | ||
708 | list_add(&nrg->link, rg->link.prev); | ||
709 | |||
710 | return t - f; | ||
711 | } | ||
712 | |||
713 | /* Round our left edge to the current segment if it encloses us. */ | ||
714 | if (f > rg->from) | ||
715 | f = rg->from; | ||
716 | chg = t - f; | ||
717 | |||
718 | /* Check for and consume any regions we now overlap with. */ | ||
719 | list_for_each_entry(rg, rg->link.prev, link) { | ||
720 | if (&rg->link == head) | ||
721 | break; | ||
722 | if (rg->from > t) | ||
723 | return chg; | ||
724 | |||
725 | /* We overlap with this area, if it extends futher than | ||
726 | * us then we must extend ourselves. Account for its | ||
727 | * existing reservation. */ | ||
728 | if (rg->to > t) { | ||
729 | chg += rg->to - t; | ||
730 | t = rg->to; | ||
731 | } | ||
732 | chg -= rg->to - rg->from; | ||
733 | } | ||
734 | return chg; | ||
735 | } | ||
736 | |||
737 | static long region_truncate(struct list_head *head, long end) | ||
738 | { | ||
739 | struct file_region *rg, *trg; | ||
740 | long chg = 0; | ||
741 | |||
742 | /* Locate the region we are either in or before. */ | ||
743 | list_for_each_entry(rg, head, link) | ||
744 | if (end <= rg->to) | ||
745 | break; | ||
746 | if (&rg->link == head) | ||
747 | return 0; | ||
748 | |||
749 | /* If we are in the middle of a region then adjust it. */ | ||
750 | if (end > rg->from) { | ||
751 | chg = rg->to - end; | ||
752 | rg->to = end; | ||
753 | rg = list_entry(rg->link.next, typeof(*rg), link); | ||
754 | } | ||
755 | |||
756 | /* Drop any remaining regions. */ | ||
757 | list_for_each_entry_safe(rg, trg, rg->link.prev, link) { | ||
758 | if (&rg->link == head) | ||
759 | break; | ||
760 | chg += rg->to - rg->from; | ||
761 | list_del(&rg->link); | ||
762 | kfree(rg); | ||
763 | } | ||
764 | return chg; | ||
765 | } | ||
766 | |||
767 | static int hugetlb_acct_memory(long delta) | ||
768 | { | ||
769 | int ret = -ENOMEM; | ||
770 | |||
771 | spin_lock(&hugetlb_lock); | ||
772 | if ((delta + resv_huge_pages) <= free_huge_pages) { | ||
773 | resv_huge_pages += delta; | ||
774 | ret = 0; | ||
775 | } | ||
776 | spin_unlock(&hugetlb_lock); | ||
777 | return ret; | ||
778 | } | ||
779 | |||
780 | int hugetlb_reserve_pages(struct inode *inode, long from, long to) | ||
781 | { | ||
782 | long ret, chg; | ||
783 | |||
784 | chg = region_chg(&inode->i_mapping->private_list, from, to); | ||
785 | if (chg < 0) | ||
786 | return chg; | ||
787 | ret = hugetlb_acct_memory(chg); | ||
788 | if (ret < 0) | ||
789 | return ret; | ||
790 | region_add(&inode->i_mapping->private_list, from, to); | ||
791 | return 0; | ||
792 | } | ||
793 | |||
794 | void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) | ||
795 | { | ||
796 | long chg = region_truncate(&inode->i_mapping->private_list, offset); | ||
797 | hugetlb_acct_memory(freed - chg); | ||
798 | } | ||