 fs/hugetlbfs/inode.c    |  74 ++++++-----------
 include/linux/hugetlb.h |   8 ++-
 mm/hugetlb.c            | 136 ++++++++++++++++++++++++++++-----
 3 files changed, 154 insertions(+), 64 deletions(-)
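
Overview for reviewers: this patch removes the advisory is_hugepage_mem_enough() heuristic and instead takes a strict per-inode hugepage reservation at mmap()/shmget() time, tracked by the new global reserved_huge_pages counter and the per-inode prereserved_hpages field. The key accounting point, sketched below in plain C (an illustrative sketch, not part of the patch): reserved-but-uninstantiated pages remain on the free lists and are counted in free_huge_pages, so a non-reserved allocation must leave the whole reserve behind.

    /* Illustrative sketch (not from the patch) of the invariant the
     * new allocation path enforces: reserved pages are a subset of
     * free pages, so unreserved allocations may not eat into them. */
    static int may_dequeue(unsigned long free_huge_pages,
                           unsigned long reserved_huge_pages,
                           int use_reserve)
    {
        if (use_reserve)
            return reserved_huge_pages > 0; /* draws down the reserve */
        return free_huge_pages > reserved_huge_pages; /* reserve intact */
    }
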
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b35195289945..1a1c2fcb7823 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
-/*
- * huge_pages_needed tries to determine the number of new huge pages that
- * will be required to fully populate this VMA.  This will be equal to
- * the size of the VMA in huge pages minus the number of huge pages
- * (covered by this VMA) that are found in the page cache.
- *
- * Result is in bytes to be compatible with is_hugepage_mem_enough()
- */
-static unsigned long
-huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	int i;
-	struct pagevec pvec;
-	unsigned long start = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
-	pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
-	pgoff_t endpg = next + hugepages;
-
-	pagevec_init(&pvec, 0);
-	while (next < endpg) {
-		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
-			break;
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
-			if (page->index > next)
-				next = page->index;
-			if (page->index >= endpg)
-				break;
-			next++;
-			hugepages--;
-		}
-		huge_pagevec_release(&pvec);
-	}
-	return hugepages << HPAGE_SHIFT;
-}
-
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long bytes;
+	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 	loff_t len, vma_len;
 	int ret;
 
@@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
 		return -EINVAL;
 
-	bytes = huge_pages_needed(mapping, vma);
-	if (!is_hugepage_mem_enough(bytes))
-		return -ENOMEM;
-
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
 	mutex_lock(&inode->i_mutex);
@@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
+	if (vma->vm_flags & VM_MAYSHARE)
+		if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
+			goto out;
+
 	ret = 0;
 	hugetlb_prefault_arch_hook(vma->vm_mm);
 	if (inode->i_size < len)
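
Note on the hunk above: for shared mappings (VM_MAYSHARE) the whole reservation is now taken at mmap() time, so a shortage of hugepages is reported as -ENOMEM from mmap() rather than as a SIGBUS at first touch. A userspace sketch of the newly visible behaviour (the mount point, file name and size are assumptions for illustration):

    /* Illustrative userspace test, not part of the patch: with this
     * change a shared hugetlbfs mapping fails cleanly at mmap() time
     * if the reservation cannot be satisfied. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    #define LEN (16UL * 1024 * 1024) /* must be a hugepage multiple */

    int main(void)
    {
        int fd = open("/mnt/huge/testfile", O_CREAT | O_RDWR, 0600);
        void *p;

        if (fd < 0) {
            perror("open");
            return 1;
        }
        p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
            perror("mmap"); /* ENOMEM here, instead of SIGBUS later */
        close(fd);
        return 0;
    }
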
@@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page)
 	put_page(page);
 }
 
-static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
+static void truncate_hugepages(struct inode *inode, loff_t lstart)
 {
+	struct address_space *mapping = &inode->i_data;
 	const pgoff_t start = lstart >> HPAGE_SHIFT;
 	struct pagevec pvec;
 	pgoff_t next;
 	int i;
 
+	hugetlb_truncate_reservation(HUGETLBFS_I(inode),
+				     lstart >> HPAGE_SHIFT);
+	if (!mapping->nrpages)
+		return;
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (1) {
@@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
 
 static void hugetlbfs_delete_inode(struct inode *inode)
 {
-	if (inode->i_data.nrpages)
-		truncate_hugepages(&inode->i_data, 0);
+	truncate_hugepages(inode, 0);
 	clear_inode(inode);
 }
 
@@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode)
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
-	if (inode->i_data.nrpages)
-		truncate_hugepages(&inode->i_data, 0);
+	truncate_hugepages(inode, 0);
 	clear_inode(inode);
 	destroy_inode(inode);
 }
@@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	if (!prio_tree_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
 	spin_unlock(&mapping->i_mmap_lock);
-	truncate_hugepages(mapping, offset);
+	truncate_hugepages(inode, offset);
 	return 0;
 }
 
@@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
 	}
+	p->prereserved_hpages = 0;
 	return &p->vfs_inode;
 }
 
@@ -805,9 +771,6 @@ struct file *hugetlb_zero_setup(size_t size)
 	if (!can_do_hugetlb_shm())
 		return ERR_PTR(-EPERM);
 
-	if (!is_hugepage_mem_enough(size))
-		return ERR_PTR(-ENOMEM);
-
 	if (!user_shm_lock(size, current->user))
 		return ERR_PTR(-ENOMEM);
 
@@ -831,6 +794,11 @@ struct file *hugetlb_zero_setup(size_t size)
 	if (!inode)
 		goto out_file;
 
+	error = -ENOMEM;
+	if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
+				       size >> HPAGE_SHIFT) != 0)
+		goto out_inode;
+
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;
@@ -841,6 +809,8 @@ struct file *hugetlb_zero_setup(size_t size)
 	file->f_mode = FMODE_WRITE | FMODE_READ;
 	return file;
 
+out_inode:
+	iput(inode);
 out_file:
 	put_filp(file);
 out_dentry:
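
The hugetlb_zero_setup() hunks replace the racy up-front is_hugepage_mem_enough() check with a firm reservation against the new inode, and the new out_inode label unwinds it via iput() on failure. Since hugetlb_zero_setup() backs SysV shared memory, userspace sees the effect roughly as below (an illustrative sketch; the segment size is assumed to be a hugepage multiple):

    /* Illustrative sketch, not part of the patch: SHM_HUGETLB
     * segments now fail with ENOMEM at shmget() if the hugepage
     * reservation cannot be taken. */
    #include <stdio.h>
    #include <sys/ipc.h>
    #include <sys/shm.h>

    #ifndef SHM_HUGETLB
    #define SHM_HUGETLB 04000
    #endif

    int main(void)
    {
        int id = shmget(IPC_PRIVATE, 16UL * 1024 * 1024,
                        IPC_CREAT | SHM_HUGETLB | 0600);
        if (id < 0) {
            perror("shmget"); /* ENOMEM if the reservation fails */
            return 1;
        }
        shmctl(id, IPC_RMID, NULL);
        return 0;
    }
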
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fa83836b63d2..cafe73eecb05 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,7 +20,6 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long)
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
-int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
 struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
 void free_huge_page(struct page *);
@@ -89,7 +88,6 @@ static inline unsigned long hugetlb_total_pages(void)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
 #define unmap_hugepage_range(vma, start, end)	BUG()
-#define is_hugepage_mem_enough(size)		0
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
@@ -132,6 +130,8 @@ struct hugetlbfs_sb_info {
 
 struct hugetlbfs_inode_info {
 	struct shared_policy policy;
+	/* Protected by the (global) hugetlb_lock */
+	unsigned long prereserved_hpages;
 	struct inode vfs_inode;
 };
 
@@ -148,6 +148,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 extern struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_zero_setup(size_t);
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+			       unsigned long atleast_hpages);
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+				  unsigned long atmost_hpages);
 int hugetlb_get_quota(struct address_space *mapping);
 void hugetlb_put_quota(struct address_space *mapping);
 
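
These two prototypes are the whole of the new reservation API: hugetlb_extend_reservation() grows an inode's guaranteed pool (returning -ENOMEM if it cannot) and hugetlb_truncate_reservation() shrinks it. A hypothetical caller sketch mirroring the hugetlbfs call sites above (the example_* names are invented for illustration):

    /* Hypothetical caller sketch, not part of the patch. */
    static int example_mmap_grow(struct hugetlbfs_inode_info *info,
                                 loff_t len)
    {
        /* fail the mmap() up front rather than SIGBUS at fault time */
        return hugetlb_extend_reservation(info, len >> HPAGE_SHIFT);
    }

    static void example_truncate_shrink(struct hugetlbfs_inode_info *info,
                                        loff_t offset)
    {
        /* release reserved-but-uninstantiated pages past offset */
        hugetlb_truncate_reservation(info, offset >> HPAGE_SHIFT);
    }
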
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d5987a87bbe5..27fad5d9bcf6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -120,17 +120,136 @@ void free_huge_page(struct page *page)
 
 struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
+	int use_reserve = 0;
+	unsigned long idx;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page(vma, addr);
-	if (!page) {
-		spin_unlock(&hugetlb_lock);
-		return NULL;
+
+	if (vma->vm_flags & VM_MAYSHARE) {
+
+		/* idx = radix tree index, i.e. offset into file in
+		 * HPAGE_SIZE units */
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+		/* The hugetlbfs specific inode info stores the number
+		 * of "guaranteed available" (huge) pages.  That is,
+		 * the first 'prereserved_hpages' pages of the inode
+		 * are either already instantiated, or have been
+		 * pre-reserved (by hugetlb_extend_reservation()). Here
+		 * we're in the process of instantiating the page, so
+		 * we use this to determine whether to draw from the
+		 * pre-reserved pool or the truly free pool. */
+		if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+			use_reserve = 1;
 	}
+
+	if (!use_reserve) {
+		if (free_huge_pages <= reserved_huge_pages)
+			goto fail;
+	} else {
+		BUG_ON(reserved_huge_pages == 0);
+		reserved_huge_pages--;
+	}
+
+	page = dequeue_huge_page(vma, addr);
+	if (!page)
+		goto fail;
+
 	spin_unlock(&hugetlb_lock);
 	set_page_refcounted(page);
 	return page;
+
+ fail:
+	WARN_ON(use_reserve);	/* reserved allocations shouldn't fail */
+	spin_unlock(&hugetlb_lock);
+	return NULL;
+}
+
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode.  If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+			       unsigned long atleast)
+{
+	struct inode *inode = &info->vfs_inode;
+	unsigned long change_in_reserve = 0;
+	int ret = 0;
+
+	spin_lock(&hugetlb_lock);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages >= atleast)
+		goto out;
+
+	/* Because we always call this on shared mappings, none of the
+	 * pages beyond info->prereserved_hpages can have been
+	 * instantiated, so we need to reserve all of them now. */
+	change_in_reserve = atleast - info->prereserved_hpages;
+
+	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	reserved_huge_pages += change_in_reserve;
+	info->prereserved_hpages = atleast;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock(&hugetlb_lock);
+
+	return ret;
+}
+
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool.  If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
+ * them.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+				  unsigned long atmost)
+{
+	struct inode *inode = &info->vfs_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long idx;
+	unsigned long change_in_reserve = 0;
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages <= atmost)
+		goto out;
+
+	/* Count pages which were reserved, but not instantiated, and
+	 * which we can now release. */
+	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+		page = radix_tree_lookup(&mapping->page_tree, idx);
+		if (!page)
+			/* Pages which are already instantiated can't
+			 * be unreserved (and in fact have already
+			 * been removed from the reserved pool) */
+			change_in_reserve++;
+	}
+
+	BUG_ON(reserved_huge_pages < change_in_reserve);
+	reserved_huge_pages -= change_in_reserve;
+	info->prereserved_hpages = atmost;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock(&hugetlb_lock);
 }
 
 static int __init hugetlb_init(void)
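
A worked example of the radix-tree index computed in alloc_huge_page() above, with illustrative values assuming 2MB hugepages (HPAGE_SHIFT == 21, PAGE_SHIFT == 12): a VMA whose vm_pgoff is 1024 small pages (4MB into the file), faulting 4MB past vm_start, instantiates hugepage index 4 of the inode, so use_reserve is set whenever prereserved_hpages > 4.

    /* Standalone arithmetic check of the idx computation (values
     * are illustrative, not from the patch). */
    #include <stdio.h>

    int main(void)
    {
        unsigned long vm_start = 0x40000000UL; /* VMA start */
        unsigned long addr = 0x40400000UL;     /* faulting address */
        unsigned long vm_pgoff = 1024;         /* file offset, PAGE_SIZE units */
        unsigned long idx;

        idx = ((addr - vm_start) >> 21) + (vm_pgoff >> (21 - 12));
        printf("hugepage index = %lu\n", idx); /* prints 4 */
        return 0;
    }
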
@@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf)
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
+			"HugePages_Rsvd:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
 			nr_huge_pages,
 			free_huge_pages,
+			reserved_huge_pages,
 			HPAGE_SIZE/1024);
 }
 
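
With the new HugePages_Rsvd line, the hugepage section of /proc/meminfo reads as below (the counts are invented for illustration; the %5lu format above fixes the field widths, and Rsvd can never exceed Free):

    HugePages_Total:    20
    HugePages_Free:     18
    HugePages_Rsvd:     10
    Hugepagesize:     2048 kB
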
@@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 		nid, free_huge_pages_node[nid]);
 }
 
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
-
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {