Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--  mm/hugetlb.c  286
1 file changed, 233 insertions, 53 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d2c..ebad6bbb3501 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,24 +13,48 @@
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
+#include <linux/mutex.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
 
 #include <linux/hugetlb.h>
+#include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
-
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+static void clear_huge_page(struct page *page, unsigned long addr)
+{
+	int i;
+
+	might_sleep();
+	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
+		cond_resched();
+		clear_user_highpage(page + i, addr);
+	}
+}
+
+static void copy_huge_page(struct page *dst, struct page *src,
+			   unsigned long addr)
+{
+	int i;
+
+	might_sleep();
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
+		cond_resched();
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+	}
+}
+
 static void enqueue_huge_page(struct page *page)
 {
 	int nid = page_to_nid(page);
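/*
 * A minimal user-space sketch (not from this patch) of the pattern
 * clear_huge_page() and copy_huge_page() use above: touching a
 * multi-megabyte hugepage in one go would monopolize the CPU, so the
 * kernel works one base page at a time with cond_resched() between
 * chunks.  sched_yield() stands in for cond_resched(); a 2MB hugepage
 * built from 4kB base pages is assumed.
 */
#include <sched.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE	4096UL
#define HPAGE_SIZE	(512 * PAGE_SIZE)

static void clear_huge_buffer(unsigned char *buf)
{
	unsigned long i;

	for (i = 0; i < HPAGE_SIZE / PAGE_SIZE; i++) {
		sched_yield();	/* let other runnable tasks in, as cond_resched() does */
		memset(buf + i * PAGE_SIZE, 0, PAGE_SIZE);
	}
}

int main(void)
{
	unsigned char *buf = malloc(HPAGE_SIZE);

	if (buf)
		clear_huge_buffer(buf);
	free(buf);
	return 0;
}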
@@ -64,57 +88,176 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
-static struct page *alloc_fresh_huge_page(void)
+static void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	INIT_LIST_HEAD(&page->lru);
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	spin_unlock(&hugetlb_lock);
+}
+
+static int alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
 	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
 				HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % num_online_nodes();
+	nid = next_node(nid, node_online_map);
+	if (nid == MAX_NUMNODES)
+		nid = first_node(node_online_map);
 	if (page) {
+		page[1].lru.next = (void *)free_huge_page;	/* dtor */
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
 		spin_unlock(&hugetlb_lock);
+		put_page(page); /* free it into the hugepage allocator */
+		return 1;
 	}
-	return page;
+	return 0;
 }
 
-void free_huge_page(struct page *page)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+				    unsigned long addr)
 {
-	BUG_ON(page_count(page));
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
+	struct page *page;
+	int use_reserve = 0;
+	unsigned long idx;
 
-	INIT_LIST_HEAD(&page->lru);
-	page[1].lru.next = NULL;	/* reset dtor */
+	spin_lock(&hugetlb_lock);
+
+	if (vma->vm_flags & VM_MAYSHARE) {
+
+		/* idx = radix tree index, i.e. offset into file in
+		 * HPAGE_SIZE units */
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+		/* The hugetlbfs specific inode info stores the number
+		 * of "guaranteed available" (huge) pages.  That is,
+		 * the first 'prereserved_hpages' pages of the inode
+		 * are either already instantiated, or have been
+		 * pre-reserved (by hugetlb_reserve_for_inode()). Here
+		 * we're in the process of instantiating the page, so
+		 * we use this to determine whether to draw from the
+		 * pre-reserved pool or the truly free pool. */
+		if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+			use_reserve = 1;
+	}
+
+	if (!use_reserve) {
+		if (free_huge_pages <= reserved_huge_pages)
+			goto fail;
+	} else {
+		BUG_ON(reserved_huge_pages == 0);
+		reserved_huge_pages--;
+	}
+
+	page = dequeue_huge_page(vma, addr);
+	if (!page)
+		goto fail;
+
+	spin_unlock(&hugetlb_lock);
+	set_page_refcounted(page);
+	return page;
+
+ fail:
+	WARN_ON(use_reserve);	/* reserved allocations shouldn't fail */
+	spin_unlock(&hugetlb_lock);
+	return NULL;
+}
+
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode.  If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+			       unsigned long atleast)
+{
+	struct inode *inode = &info->vfs_inode;
+	unsigned long change_in_reserve = 0;
+	int ret = 0;
 
 	spin_lock(&hugetlb_lock);
-	enqueue_huge_page(page);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages >= atleast)
+		goto out;
+
+	/* Because we always call this on shared mappings, none of the
+	 * pages beyond info->prereserved_hpages can have been
+	 * instantiated, so we need to reserve all of them now. */
+	change_in_reserve = atleast - info->prereserved_hpages;
+
+	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	reserved_huge_pages += change_in_reserve;
+	info->prereserved_hpages = atleast;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
 	spin_unlock(&hugetlb_lock);
+
+	return ret;
 }
 
-struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool.  If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
+ * them.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+				  unsigned long atmost)
 {
+	struct inode *inode = &info->vfs_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long idx;
+	unsigned long change_in_reserve = 0;
 	struct page *page;
-	int i;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page(vma, addr);
-	if (!page) {
-		spin_unlock(&hugetlb_lock);
-		return NULL;
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages <= atmost)
+		goto out;
+
+	/* Count pages which were reserved, but not instantiated, and
+	 * which we can now release. */
+	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+		page = radix_tree_lookup(&mapping->page_tree, idx);
+		if (!page)
+			/* Pages which are already instantiated can't
+			 * be unreserved (and in fact have already
+			 * been removed from the reserved pool) */
+			change_in_reserve++;
 	}
+
+	BUG_ON(reserved_huge_pages < change_in_reserve);
+	reserved_huge_pages -= change_in_reserve;
+	info->prereserved_hpages = atmost;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
 	spin_unlock(&hugetlb_lock);
-	set_page_count(page, 1);
-	page[1].lru.next = (void *)free_huge_page;	/* set dtor */
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_user_highpage(&page[i], addr);
-	return page;
 }
 
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
-	struct page *page;
 
 	if (HPAGE_SHIFT == 0)
 		return 0;
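/*
 * A minimal user-space model (not from this patch) of the accounting
 * that alloc_huge_page() implements above.  free_pages and
 * reserved_pages stand in for free_huge_pages and reserved_huge_pages;
 * use_reserve corresponds to a fault at an index below the inode's
 * prereserved_hpages.  The invariant: unreserved allocations may never
 * dip into the reserve, so reserved faults cannot fail.
 */
#include <assert.h>
#include <stdio.h>

static unsigned long free_pages = 8, reserved_pages = 3;

static int model_alloc(int use_reserve)
{
	if (!use_reserve) {
		if (free_pages <= reserved_pages)
			return 0;	/* only reserved pages left: fail */
	} else {
		assert(reserved_pages > 0);	/* mirrors the BUG_ON() */
		reserved_pages--;	/* consume one unit of reserve */
	}
	free_pages--;			/* the dequeue_huge_page() step */
	return 1;
}

int main(void)
{
	int i, ok = 0, r;

	while (model_alloc(0))		/* unreserved faults */
		ok++;
	/* prints: unreserved allocs: 5, free=3, reserved=3 */
	printf("unreserved allocs: %d, free=%lu, reserved=%lu\n",
	       ok, free_pages, reserved_pages);
	for (i = 0; i < 3; i++) {	/* the reserved faults still succeed */
		r = model_alloc(1);
		assert(r);
	}
	return 0;
}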
@@ -123,12 +266,8 @@ static int __init hugetlb_init(void)
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
 
 	for (i = 0; i < max_huge_pages; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
+		if (!alloc_fresh_huge_page())
 			break;
-		spin_lock(&hugetlb_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&hugetlb_lock);
 	}
 	max_huge_pages = free_huge_pages = nr_huge_pages = i;
 	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,9 +293,9 @@ static void update_and_free_page(struct page *page)
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1<< PG_writeback);
-		set_page_count(&page[i], 0);
 	}
-	set_page_count(page, 1);
+	page[1].lru.next = NULL;
+	set_page_refcounted(page);
 	__free_pages(page, HUGETLB_PAGE_ORDER);
 }
 
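/*
 * A sketch (not from this patch) of the destructor convention the hunks
 * above rely on: a compound page has no dedicated destructor field in
 * this kernel, so a function pointer is parked in the otherwise-unused
 * page[1].lru.next.  alloc_fresh_huge_page() sets it to free_huge_page,
 * and update_and_free_page() clears it before handing the page back to
 * the buddy allocator.  A plain two-element array models the head page
 * and its first tail page.
 */
#include <stdio.h>

struct fake_page {
	void *lru_next;			/* stand-in for page->lru.next */
};

typedef void (*page_dtor_t)(struct fake_page *);

static void fake_free_huge_page(struct fake_page *head)
{
	printf("returning %p to the hugepage freelist\n", (void *)head);
}

static void last_put(struct fake_page *head)
{
	page_dtor_t dtor = (page_dtor_t)head[1].lru_next;

	if (dtor)
		dtor(head);		/* hugepage: back to the freelist */
	else
		printf("no dtor: free %p to the buddy allocator\n",
		       (void *)head);
}

int main(void)
{
	struct fake_page compound[2] = { { 0 }, { 0 } };

	compound[1].lru_next = (void *)fake_free_huge_page;	/* set dtor */
	last_put(compound);		/* goes to the hugepage freelist */
	compound[1].lru_next = NULL;	/* update_and_free_page() reset */
	last_put(compound);		/* now a plain buddy free */
	return 0;
}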
@@ -188,12 +327,8 @@ static inline void try_to_free_low(unsigned long count)
 static unsigned long set_max_huge_pages(unsigned long count)
 {
 	while (count > nr_huge_pages) {
-		struct page *page = alloc_fresh_huge_page();
-		if (!page)
+		if (!alloc_fresh_huge_page())
 			return nr_huge_pages;
-		spin_lock(&hugetlb_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&hugetlb_lock);
 	}
 	if (count >= nr_huge_pages)
 		return nr_huge_pages;
@@ -225,9 +360,11 @@ int hugetlb_report_meminfo(char *buf)
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
+			"HugePages_Rsvd:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
 			nr_huge_pages,
 			free_huge_pages,
+			reserved_huge_pages,
 			HPAGE_SIZE/1024);
 }
 
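/*
 * With the hunk above applied, the reserve becomes visible as a new
 * HugePages_Rsvd line in /proc/meminfo.  A minimal way to watch the
 * counters from user space (assuming a kernel carrying this change):
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (!strncmp(line, "HugePages", 9) ||
		    !strncmp(line, "Hugepagesize", 12))
			fputs(line, stdout);	/* Total, Free, Rsvd, size */
	fclose(f);
	return 0;
}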
@@ -240,11 +377,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 			nid, free_huge_pages_node[nid]);
 }
 
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
-
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
@@ -374,7 +506,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *ptep, pte_t pte)
 {
 	struct page *old_page, *new_page;
-	int i, avoidcopy;
+	int avoidcopy;
 
 	old_page = pte_page(pte);
 
@@ -395,9 +527,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_unlock(&mm->page_table_lock);
-	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
-		copy_user_highpage(new_page + i, old_page + i,
-				   address + i*PAGE_SIZE);
+	copy_huge_page(new_page, old_page, address);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -442,6 +572,7 @@ retry:
 		ret = VM_FAULT_OOM;
 		goto out;
 	}
+	clear_huge_page(page, address);
 
 	if (vma->vm_flags & VM_SHARED) {
 		int err;
@@ -496,14 +627,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
 		return VM_FAULT_OOM;
 
+	/*
+	 * Serialize hugepage allocation and instantiation, so that we don't
+	 * get spurious allocation failures if two CPUs race to instantiate
+	 * the same page in the page cache.
+	 */
+	mutex_lock(&hugetlb_instantiation_mutex);
 	entry = *ptep;
-	if (pte_none(entry))
-		return hugetlb_no_page(mm, vma, address, ptep, write_access);
+	if (pte_none(entry)) {
+		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
+		mutex_unlock(&hugetlb_instantiation_mutex);
+		return ret;
+	}
 
 	ret = VM_FAULT_MINOR;
 
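/*
 * A deterministic replay (not from this patch) of the race that
 * hugetlb_instantiation_mutex closes.  Both CPUs observe pte_none()
 * before either instantiates the page, so with a single free hugepage
 * the loser sees a spurious allocation failure (a SIGBUS for the
 * application) even though the page it wanted was already being set up
 * by the winner.
 */
#include <stdio.h>

static int pool = 1;		/* one free hugepage */
static int instantiated;	/* the shared page-cache slot */

static int pte_is_none(void)
{
	return !instantiated;
}

static int instantiate(void)
{
	if (pool == 0)
		return 0;	/* allocation failure */
	pool--;
	instantiated = 1;
	return 1;
}

int main(void)
{
	/* Unserialized interleaving: check, check, allocate, allocate. */
	int cpu_a_faults = pte_is_none();
	int cpu_b_faults = pte_is_none();	/* B also sees an empty pte */

	if (cpu_a_faults && !instantiate())
		puts("CPU A: spurious failure");
	if (cpu_b_faults && !instantiate())
		puts("CPU B: spurious failure");	/* pool is empty now */
	return 0;
}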
@@ -513,6 +654,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (write_access && !pte_write(entry))
 		ret = hugetlb_cow(mm, vma, address, ptep, entry);
 	spin_unlock(&mm->page_table_lock);
+	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
 }
@@ -521,10 +663,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct page **pages, struct vm_area_struct **vmas,
 		unsigned long *position, int *length, int i)
 {
-	unsigned long vpfn, vaddr = *position;
+	unsigned long pfn_offset;
+	unsigned long vaddr = *position;
 	int remainder = *length;
 
-	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
@@ -552,19 +694,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			break;
 		}
 
-		if (pages) {
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-			get_page(page);
-			pages[i] = page;
-		}
+		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		page = pte_page(*pte);
+same_page:
+		get_page(page);
+		if (pages)
+			pages[i] = page + pfn_offset;
 
 		if (vmas)
 			vmas[i] = vma;
 
 		vaddr += PAGE_SIZE;
-		++vpfn;
+		++pfn_offset;
 		--remainder;
 		++i;
+		if (vaddr < vma->vm_end && remainder &&
+		    pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+			/*
+			 * We use pfn_offset to avoid touching the pageframes
+			 * of this compound page.
+			 */
+			goto same_page;
+		}
 	}
 	spin_unlock(&mm->page_table_lock);
 	*length = remainder;
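/*
 * A sketch (not from this patch) of the subpage arithmetic behind the
 * same_page loop above, assuming 4kB base pages and 2MB hugepages
 * (HPAGE_SHIFT = 21, as on x86 PAE).  Every address inside one hugepage
 * shares a single pte_page(); pfn_offset picks the struct page of the
 * individual 4kB subpage, so the loop never re-walks the page tables.
 */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define HPAGE_SHIFT	21
#define HPAGE_SIZE	(1UL << HPAGE_SHIFT)
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))

int main(void)
{
	unsigned long base = 0x40000000UL;	/* hugepage-aligned vaddr */
	unsigned long offs[] = { 0, PAGE_SIZE, 5 * PAGE_SIZE,
				 HPAGE_SIZE - PAGE_SIZE };
	unsigned long i;

	for (i = 0; i < sizeof(offs) / sizeof(offs[0]); i++) {
		unsigned long vaddr = base + offs[i];
		unsigned long pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;

		/* prints subpage indices 0, 1, 5 and 511 */
		printf("vaddr %#lx -> subpage %lu of %lu\n",
		       vaddr, pfn_offset, HPAGE_SIZE / PAGE_SIZE);
	}
	return 0;
}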
@@ -572,3 +723,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	return i;
 }
+
+void hugetlb_change_protection(struct vm_area_struct *vma,
+		unsigned long address, unsigned long end, pgprot_t newprot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long start = address;
+	pte_t *ptep;
+	pte_t pte;
+
+	BUG_ON(address >= end);
+	flush_cache_range(vma, address, end);
+
+	spin_lock(&mm->page_table_lock);
+	for (; address < end; address += HPAGE_SIZE) {
+		ptep = huge_pte_offset(mm, address);
+		if (!ptep)
+			continue;
+		if (!pte_none(*ptep)) {
+			pte = huge_ptep_get_and_clear(mm, address, ptep);
+			pte = pte_mkhuge(pte_modify(pte, newprot));
+			set_huge_pte_at(mm, address, ptep, pte);
+			lazy_mmu_prot_update(pte);
+		}
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	flush_tlb_range(vma, start, end);
+}
+