Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c | 282
1 files changed, 162 insertions, 120 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 832f676ca038..df499973255f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -123,39 +123,13 @@ static int alloc_fresh_huge_page(void)
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                 unsigned long addr)
 {
-        struct inode *inode = vma->vm_file->f_dentry->d_inode;
         struct page *page;
-        int use_reserve = 0;
-        unsigned long idx;
 
         spin_lock(&hugetlb_lock);
-
-        if (vma->vm_flags & VM_MAYSHARE) {
-
-                /* idx = radix tree index, i.e. offset into file in
-                 * HPAGE_SIZE units */
-                idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-                        + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-
-                /* The hugetlbfs specific inode info stores the number
-                 * of "guaranteed available" (huge) pages. That is,
-                 * the first 'prereserved_hpages' pages of the inode
-                 * are either already instantiated, or have been
-                 * pre-reserved (by hugetlb_reserve_for_inode()). Here
-                 * we're in the process of instantiating the page, so
-                 * we use this to determine whether to draw from the
-                 * pre-reserved pool or the truly free pool. */
-                if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
-                        use_reserve = 1;
-        }
-
-        if (!use_reserve) {
-                if (free_huge_pages <= reserved_huge_pages)
-                        goto fail;
-        } else {
-                BUG_ON(reserved_huge_pages == 0);
-                reserved_huge_pages--;
-        }
+        if (vma->vm_flags & VM_MAYSHARE)
+                resv_huge_pages--;
+        else if (free_huge_pages <= resv_huge_pages)
+                goto fail;
 
         page = dequeue_huge_page(vma, addr);
         if (!page)
@@ -165,96 +139,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
         set_page_refcounted(page);
         return page;
 
 fail:
-        WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
         spin_unlock(&hugetlb_lock);
         return NULL;
 }
 
-/* hugetlb_extend_reservation()
- *
- * Ensure that at least 'atleast' hugepages are, and will remain,
- * available to instantiate the first 'atleast' pages of the given
- * inode. If the inode doesn't already have this many pages reserved
- * or instantiated, set aside some hugepages in the reserved pool to
- * satisfy later faults (or fail now if there aren't enough, rather
- * than getting the SIGBUS later).
- */
-int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
-                               unsigned long atleast)
-{
-        struct inode *inode = &info->vfs_inode;
-        unsigned long change_in_reserve = 0;
-        int ret = 0;
-
-        spin_lock(&hugetlb_lock);
-        read_lock_irq(&inode->i_mapping->tree_lock);
-
-        if (info->prereserved_hpages >= atleast)
-                goto out;
-
-        /* Because we always call this on shared mappings, none of the
-         * pages beyond info->prereserved_hpages can have been
-         * instantiated, so we need to reserve all of them now. */
-        change_in_reserve = atleast - info->prereserved_hpages;
-
-        if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
-                ret = -ENOMEM;
-                goto out;
-        }
-
-        reserved_huge_pages += change_in_reserve;
-        info->prereserved_hpages = atleast;
-
- out:
-        read_unlock_irq(&inode->i_mapping->tree_lock);
-        spin_unlock(&hugetlb_lock);
-
-        return ret;
-}
-
-/* hugetlb_truncate_reservation()
- *
- * This returns pages reserved for the given inode to the general free
- * hugepage pool. If the inode has any pages prereserved, but not
- * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
- * them.
- */
-void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
-                                  unsigned long atmost)
-{
-        struct inode *inode = &info->vfs_inode;
-        struct address_space *mapping = inode->i_mapping;
-        unsigned long idx;
-        unsigned long change_in_reserve = 0;
-        struct page *page;
-
-        spin_lock(&hugetlb_lock);
-        read_lock_irq(&inode->i_mapping->tree_lock);
-
-        if (info->prereserved_hpages <= atmost)
-                goto out;
-
-        /* Count pages which were reserved, but not instantiated, and
-         * which we can now release. */
-        for (idx = atmost; idx < info->prereserved_hpages; idx++) {
-                page = radix_tree_lookup(&mapping->page_tree, idx);
-                if (!page)
-                        /* Pages which are already instantiated can't
-                         * be unreserved (and in fact have already
-                         * been removed from the reserved pool) */
-                        change_in_reserve++;
-        }
-
-        BUG_ON(reserved_huge_pages < change_in_reserve);
-        reserved_huge_pages -= change_in_reserve;
-        info->prereserved_hpages = atmost;
-
- out:
-        read_unlock_irq(&inode->i_mapping->tree_lock);
-        spin_unlock(&hugetlb_lock);
-}
-
 static int __init hugetlb_init(void)
 {
         unsigned long i;
@@ -334,7 +223,7 @@ static unsigned long set_max_huge_pages(unsigned long count)
                 return nr_huge_pages;
 
         spin_lock(&hugetlb_lock);
-        count = max(count, reserved_huge_pages);
+        count = max(count, resv_huge_pages);
         try_to_free_low(count);
         while (count < nr_huge_pages) {
                 struct page *page = dequeue_huge_page(NULL, 0);
@@ -361,11 +250,11 @@ int hugetlb_report_meminfo(char *buf)
         return sprintf(buf,
                         "HugePages_Total: %5lu\n"
                         "HugePages_Free:  %5lu\n"
                         "HugePages_Rsvd:  %5lu\n"
                         "Hugepagesize:    %5lu kB\n",
                         nr_huge_pages,
                         free_huge_pages,
-                        reserved_huge_pages,
+                        resv_huge_pages,
                         HPAGE_SIZE/1024);
 }
 
@@ -754,3 +643,156 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
         flush_tlb_range(vma, start, end);
 }
 
+struct file_region {
+        struct list_head link;
+        long from;
+        long to;
+};
+
+static long region_add(struct list_head *head, long f, long t)
+{
+        struct file_region *rg, *nrg, *trg;
+
+        /* Locate the region we are either in or before. */
+        list_for_each_entry(rg, head, link)
+                if (f <= rg->to)
+                        break;
+
+        /* Round our left edge to the current segment if it encloses us. */
+        if (f > rg->from)
+                f = rg->from;
+
+        /* Check for and consume any regions we now overlap with. */
+        nrg = rg;
+        list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+                if (&rg->link == head)
+                        break;
+                if (rg->from > t)
+                        break;
+
+                /* If this area reaches higher then extend our area to
+                 * include it completely. If this is not the first area
+                 * which we intend to reuse, free it. */
+                if (rg->to > t)
+                        t = rg->to;
+                if (rg != nrg) {
+                        list_del(&rg->link);
+                        kfree(rg);
+                }
+        }
+        nrg->from = f;
+        nrg->to = t;
+        return 0;
+}
+
+static long region_chg(struct list_head *head, long f, long t)
+{
+        struct file_region *rg, *nrg;
+        long chg = 0;
+
+        /* Locate the region we are before or in. */
+        list_for_each_entry(rg, head, link)
+                if (f <= rg->to)
+                        break;
+
+        /* If we are below the current region then a new region is required.
+         * Subtle, allocate a new region at the position but make it zero
+         * size such that we can guarentee to record the reservation. */
+        if (&rg->link == head || t < rg->from) {
+                nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
+                if (nrg == 0)
+                        return -ENOMEM;
+                nrg->from = f;
+                nrg->to = f;
+                INIT_LIST_HEAD(&nrg->link);
+                list_add(&nrg->link, rg->link.prev);
+
+                return t - f;
+        }
+
+        /* Round our left edge to the current segment if it encloses us. */
+        if (f > rg->from)
+                f = rg->from;
+        chg = t - f;
+
+        /* Check for and consume any regions we now overlap with. */
+        list_for_each_entry(rg, rg->link.prev, link) {
+                if (&rg->link == head)
+                        break;
+                if (rg->from > t)
+                        return chg;
+
+                /* We overlap with this area, if it extends futher than
+                 * us then we must extend ourselves. Account for its
+                 * existing reservation. */
+                if (rg->to > t) {
+                        chg += rg->to - t;
+                        t = rg->to;
+                }
+                chg -= rg->to - rg->from;
+        }
+        return chg;
+}
+
+static long region_truncate(struct list_head *head, long end)
+{
+        struct file_region *rg, *trg;
+        long chg = 0;
+
+        /* Locate the region we are either in or before. */
+        list_for_each_entry(rg, head, link)
+                if (end <= rg->to)
+                        break;
+        if (&rg->link == head)
+                return 0;
+
+        /* If we are in the middle of a region then adjust it. */
+        if (end > rg->from) {
+                chg = rg->to - end;
+                rg->to = end;
+                rg = list_entry(rg->link.next, typeof(*rg), link);
+        }
+
+        /* Drop any remaining regions. */
+        list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
+                if (&rg->link == head)
+                        break;
+                chg += rg->to - rg->from;
+                list_del(&rg->link);
+                kfree(rg);
+        }
+        return chg;
+}
+
+static int hugetlb_acct_memory(long delta)
+{
+        int ret = -ENOMEM;
+
+        spin_lock(&hugetlb_lock);
+        if ((delta + resv_huge_pages) <= free_huge_pages) {
+                resv_huge_pages += delta;
+                ret = 0;
+        }
+        spin_unlock(&hugetlb_lock);
+        return ret;
+}
+
+int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+{
+        long ret, chg;
+
+        chg = region_chg(&inode->i_mapping->private_list, from, to);
+        if (chg < 0)
+                return chg;
+        ret = hugetlb_acct_memory(chg);
+        if (ret < 0)
+                return ret;
+        region_add(&inode->i_mapping->private_list, from, to);
+        return 0;
+}
+
+void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
+{
+        long chg = region_truncate(&inode->i_mapping->private_list, offset);
+        hugetlb_acct_memory(freed - chg);
+}
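
Note: this diff is limited to mm/hugetlb.c, so the filesystem-side callers of the new interface are not shown. The sketch below is not part of the commit; it only illustrates, under assumed hugetlbfs context, how a caller such as hugetlbfs_file_mmap() might convert a shared mapping into a range of huge-page indexes and reserve it up front, so mmap() fails with -ENOMEM instead of the process taking a SIGBUS at fault time. The helper name example_mmap_reserve() is hypothetical.

/* Hypothetical illustration only -- not part of this diff. */
static int example_mmap_reserve(struct inode *inode,
                                struct vm_area_struct *vma)
{
        /* Express the mapping as a [from, to) range of huge-page indexes
         * into the file, the same units used by region_chg()/region_add(). */
        long from = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
        long to = from + ((vma->vm_end - vma->vm_start) >> HPAGE_SHIFT);

        /* Private mappings are not charged against resv_huge_pages here;
         * in this patch alloc_huge_page() only draws on the reserve for
         * VM_MAYSHARE mappings. */
        if (!(vma->vm_flags & VM_MAYSHARE))
                return 0;

        /* Reserves only the huge pages not already covered by an existing
         * region on inode->i_mapping->private_list; returns -ENOMEM if the
         * free pool cannot cover the shortfall. */
        return hugetlb_reserve_pages(inode, from, to);
}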