diff options
-rw-r--r-- | fs/hugetlbfs/inode.c | 74 | ||||
-rw-r--r-- | include/linux/hugetlb.h | 8 | ||||
-rw-r--r-- | mm/hugetlb.c | 136 |
3 files changed, 154 insertions, 64 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b35195289945..1a1c2fcb7823 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec) | |||
56 | pagevec_reinit(pvec); | 56 | pagevec_reinit(pvec); |
57 | } | 57 | } |
58 | 58 | ||
59 | /* | ||
60 | * huge_pages_needed tries to determine the number of new huge pages that | ||
61 | * will be required to fully populate this VMA. This will be equal to | ||
62 | * the size of the VMA in huge pages minus the number of huge pages | ||
63 | * (covered by this VMA) that are found in the page cache. | ||
64 | * | ||
65 | * Result is in bytes to be compatible with is_hugepage_mem_enough() | ||
66 | */ | ||
67 | static unsigned long | ||
68 | huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma) | ||
69 | { | ||
70 | int i; | ||
71 | struct pagevec pvec; | ||
72 | unsigned long start = vma->vm_start; | ||
73 | unsigned long end = vma->vm_end; | ||
74 | unsigned long hugepages = (end - start) >> HPAGE_SHIFT; | ||
75 | pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT); | ||
76 | pgoff_t endpg = next + hugepages; | ||
77 | |||
78 | pagevec_init(&pvec, 0); | ||
79 | while (next < endpg) { | ||
80 | if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) | ||
81 | break; | ||
82 | for (i = 0; i < pagevec_count(&pvec); i++) { | ||
83 | struct page *page = pvec.pages[i]; | ||
84 | if (page->index > next) | ||
85 | next = page->index; | ||
86 | if (page->index >= endpg) | ||
87 | break; | ||
88 | next++; | ||
89 | hugepages--; | ||
90 | } | ||
91 | huge_pagevec_release(&pvec); | ||
92 | } | ||
93 | return hugepages << HPAGE_SHIFT; | ||
94 | } | ||
95 | |||
96 | static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) | 59 | static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) |
97 | { | 60 | { |
98 | struct inode *inode = file->f_dentry->d_inode; | 61 | struct inode *inode = file->f_dentry->d_inode; |
99 | struct address_space *mapping = inode->i_mapping; | 62 | struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); |
100 | unsigned long bytes; | ||
101 | loff_t len, vma_len; | 63 | loff_t len, vma_len; |
102 | int ret; | 64 | int ret; |
103 | 65 | ||
@@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
113 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) | 75 | if (vma->vm_end - vma->vm_start < HPAGE_SIZE) |
114 | return -EINVAL; | 76 | return -EINVAL; |
115 | 77 | ||
116 | bytes = huge_pages_needed(mapping, vma); | ||
117 | if (!is_hugepage_mem_enough(bytes)) | ||
118 | return -ENOMEM; | ||
119 | |||
120 | vma_len = (loff_t)(vma->vm_end - vma->vm_start); | 78 | vma_len = (loff_t)(vma->vm_end - vma->vm_start); |
121 | 79 | ||
122 | mutex_lock(&inode->i_mutex); | 80 | mutex_lock(&inode->i_mutex); |
@@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) | |||
129 | if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) | 87 | if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size) |
130 | goto out; | 88 | goto out; |
131 | 89 | ||
90 | if (vma->vm_flags & VM_MAYSHARE) | ||
91 | if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0) | ||
92 | goto out; | ||
93 | |||
132 | ret = 0; | 94 | ret = 0; |
133 | hugetlb_prefault_arch_hook(vma->vm_mm); | 95 | hugetlb_prefault_arch_hook(vma->vm_mm); |
134 | if (inode->i_size < len) | 96 | if (inode->i_size < len) |
@@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page) | |||
227 | put_page(page); | 189 | put_page(page); |
228 | } | 190 | } |
229 | 191 | ||
230 | static void truncate_hugepages(struct address_space *mapping, loff_t lstart) | 192 | static void truncate_hugepages(struct inode *inode, loff_t lstart) |
231 | { | 193 | { |
194 | struct address_space *mapping = &inode->i_data; | ||
232 | const pgoff_t start = lstart >> HPAGE_SHIFT; | 195 | const pgoff_t start = lstart >> HPAGE_SHIFT; |
233 | struct pagevec pvec; | 196 | struct pagevec pvec; |
234 | pgoff_t next; | 197 | pgoff_t next; |
235 | int i; | 198 | int i; |
236 | 199 | ||
200 | hugetlb_truncate_reservation(HUGETLBFS_I(inode), | ||
201 | lstart >> HPAGE_SHIFT); | ||
202 | if (!mapping->nrpages) | ||
203 | return; | ||
237 | pagevec_init(&pvec, 0); | 204 | pagevec_init(&pvec, 0); |
238 | next = start; | 205 | next = start; |
239 | while (1) { | 206 | while (1) { |
@@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart) | |||
262 | 229 | ||
263 | static void hugetlbfs_delete_inode(struct inode *inode) | 230 | static void hugetlbfs_delete_inode(struct inode *inode) |
264 | { | 231 | { |
265 | if (inode->i_data.nrpages) | 232 | truncate_hugepages(inode, 0); |
266 | truncate_hugepages(&inode->i_data, 0); | ||
267 | clear_inode(inode); | 233 | clear_inode(inode); |
268 | } | 234 | } |
269 | 235 | ||
@@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode) | |||
296 | inode->i_state |= I_FREEING; | 262 | inode->i_state |= I_FREEING; |
297 | inodes_stat.nr_inodes--; | 263 | inodes_stat.nr_inodes--; |
298 | spin_unlock(&inode_lock); | 264 | spin_unlock(&inode_lock); |
299 | if (inode->i_data.nrpages) | 265 | truncate_hugepages(inode, 0); |
300 | truncate_hugepages(&inode->i_data, 0); | ||
301 | clear_inode(inode); | 266 | clear_inode(inode); |
302 | destroy_inode(inode); | 267 | destroy_inode(inode); |
303 | } | 268 | } |
@@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) | |||
356 | if (!prio_tree_empty(&mapping->i_mmap)) | 321 | if (!prio_tree_empty(&mapping->i_mmap)) |
357 | hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); | 322 | hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff); |
358 | spin_unlock(&mapping->i_mmap_lock); | 323 | spin_unlock(&mapping->i_mmap_lock); |
359 | truncate_hugepages(mapping, offset); | 324 | truncate_hugepages(inode, offset); |
360 | return 0; | 325 | return 0; |
361 | } | 326 | } |
362 | 327 | ||
@@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) | |||
573 | hugetlbfs_inc_free_inodes(sbinfo); | 538 | hugetlbfs_inc_free_inodes(sbinfo); |
574 | return NULL; | 539 | return NULL; |
575 | } | 540 | } |
541 | p->prereserved_hpages = 0; | ||
576 | return &p->vfs_inode; | 542 | return &p->vfs_inode; |
577 | } | 543 | } |
578 | 544 | ||
@@ -805,9 +771,6 @@ struct file *hugetlb_zero_setup(size_t size) | |||
805 | if (!can_do_hugetlb_shm()) | 771 | if (!can_do_hugetlb_shm()) |
806 | return ERR_PTR(-EPERM); | 772 | return ERR_PTR(-EPERM); |
807 | 773 | ||
808 | if (!is_hugepage_mem_enough(size)) | ||
809 | return ERR_PTR(-ENOMEM); | ||
810 | |||
811 | if (!user_shm_lock(size, current->user)) | 774 | if (!user_shm_lock(size, current->user)) |
812 | return ERR_PTR(-ENOMEM); | 775 | return ERR_PTR(-ENOMEM); |
813 | 776 | ||
@@ -831,6 +794,11 @@ struct file *hugetlb_zero_setup(size_t size) | |||
831 | if (!inode) | 794 | if (!inode) |
832 | goto out_file; | 795 | goto out_file; |
833 | 796 | ||
797 | error = -ENOMEM; | ||
798 | if (hugetlb_extend_reservation(HUGETLBFS_I(inode), | ||
799 | size >> HPAGE_SHIFT) != 0) | ||
800 | goto out_inode; | ||
801 | |||
834 | d_instantiate(dentry, inode); | 802 | d_instantiate(dentry, inode); |
835 | inode->i_size = size; | 803 | inode->i_size = size; |
836 | inode->i_nlink = 0; | 804 | inode->i_nlink = 0; |
@@ -841,6 +809,8 @@ struct file *hugetlb_zero_setup(size_t size) | |||
841 | file->f_mode = FMODE_WRITE | FMODE_READ; | 809 | file->f_mode = FMODE_WRITE | FMODE_READ; |
842 | return file; | 810 | return file; |
843 | 811 | ||
812 | out_inode: | ||
813 | iput(inode); | ||
844 | out_file: | 814 | out_file: |
845 | put_filp(file); | 815 | put_filp(file); |
846 | out_dentry: | 816 | out_dentry: |
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index fa83836b63d2..cafe73eecb05 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h | |||
@@ -20,7 +20,6 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long) | |||
20 | int hugetlb_prefault(struct address_space *, struct vm_area_struct *); | 20 | int hugetlb_prefault(struct address_space *, struct vm_area_struct *); |
21 | int hugetlb_report_meminfo(char *); | 21 | int hugetlb_report_meminfo(char *); |
22 | int hugetlb_report_node_meminfo(int, char *); | 22 | int hugetlb_report_node_meminfo(int, char *); |
23 | int is_hugepage_mem_enough(size_t); | ||
24 | unsigned long hugetlb_total_pages(void); | 23 | unsigned long hugetlb_total_pages(void); |
25 | struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); | 24 | struct page *alloc_huge_page(struct vm_area_struct *, unsigned long); |
26 | void free_huge_page(struct page *); | 25 | void free_huge_page(struct page *); |
@@ -89,7 +88,6 @@ static inline unsigned long hugetlb_total_pages(void) | |||
89 | #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) | 88 | #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) |
90 | #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) | 89 | #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) |
91 | #define unmap_hugepage_range(vma, start, end) BUG() | 90 | #define unmap_hugepage_range(vma, start, end) BUG() |
92 | #define is_hugepage_mem_enough(size) 0 | ||
93 | #define hugetlb_report_meminfo(buf) 0 | 91 | #define hugetlb_report_meminfo(buf) 0 |
94 | #define hugetlb_report_node_meminfo(n, buf) 0 | 92 | #define hugetlb_report_node_meminfo(n, buf) 0 |
95 | #define follow_huge_pmd(mm, addr, pmd, write) NULL | 93 | #define follow_huge_pmd(mm, addr, pmd, write) NULL |
@@ -132,6 +130,8 @@ struct hugetlbfs_sb_info { | |||
132 | 130 | ||
133 | struct hugetlbfs_inode_info { | 131 | struct hugetlbfs_inode_info { |
134 | struct shared_policy policy; | 132 | struct shared_policy policy; |
133 | /* Protected by the (global) hugetlb_lock */ | ||
134 | unsigned long prereserved_hpages; | ||
135 | struct inode vfs_inode; | 135 | struct inode vfs_inode; |
136 | }; | 136 | }; |
137 | 137 | ||
@@ -148,6 +148,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) | |||
148 | extern struct file_operations hugetlbfs_file_operations; | 148 | extern struct file_operations hugetlbfs_file_operations; |
149 | extern struct vm_operations_struct hugetlb_vm_ops; | 149 | extern struct vm_operations_struct hugetlb_vm_ops; |
150 | struct file *hugetlb_zero_setup(size_t); | 150 | struct file *hugetlb_zero_setup(size_t); |
151 | int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, | ||
152 | unsigned long atleast_hpages); | ||
153 | void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, | ||
154 | unsigned long atmost_hpages); | ||
151 | int hugetlb_get_quota(struct address_space *mapping); | 155 | int hugetlb_get_quota(struct address_space *mapping); |
152 | void hugetlb_put_quota(struct address_space *mapping); | 156 | void hugetlb_put_quota(struct address_space *mapping); |
153 | 157 | ||
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d5987a87bbe5..27fad5d9bcf6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
@@ -22,7 +22,7 @@ | |||
22 | #include "internal.h" | 22 | #include "internal.h" |
23 | 23 | ||
24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; | 24 | const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; |
25 | static unsigned long nr_huge_pages, free_huge_pages; | 25 | static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; |
26 | unsigned long max_huge_pages; | 26 | unsigned long max_huge_pages; |
27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; | 27 | static struct list_head hugepage_freelists[MAX_NUMNODES]; |
28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; | 28 | static unsigned int nr_huge_pages_node[MAX_NUMNODES]; |
@@ -120,17 +120,136 @@ void free_huge_page(struct page *page) | |||
120 | 120 | ||
121 | struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) | 121 | struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) |
122 | { | 122 | { |
123 | struct inode *inode = vma->vm_file->f_dentry->d_inode; | ||
123 | struct page *page; | 124 | struct page *page; |
125 | int use_reserve = 0; | ||
126 | unsigned long idx; | ||
124 | 127 | ||
125 | spin_lock(&hugetlb_lock); | 128 | spin_lock(&hugetlb_lock); |
126 | page = dequeue_huge_page(vma, addr); | 129 | |
127 | if (!page) { | 130 | if (vma->vm_flags & VM_MAYSHARE) { |
128 | spin_unlock(&hugetlb_lock); | 131 | |
129 | return NULL; | 132 | /* idx = radix tree index, i.e. offset into file in |
133 | * HPAGE_SIZE units */ | ||
134 | idx = ((addr - vma->vm_start) >> HPAGE_SHIFT) | ||
135 | + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT)); | ||
136 | |||
137 | /* The hugetlbfs specific inode info stores the number | ||
138 | * of "guaranteed available" (huge) pages. That is, | ||
139 | * the first 'prereserved_hpages' pages of the inode | ||
140 | * are either already instantiated, or have been | ||
141 | * pre-reserved (by hugetlb_extend_reservation()). Here | ||
142 | * we're in the process of instantiating the page, so | ||
143 | * we use this to determine whether to draw from the | ||
144 | * pre-reserved pool or the truly free pool. */ | ||
145 | if (idx < HUGETLBFS_I(inode)->prereserved_hpages) | ||
146 | use_reserve = 1; | ||
147 | } | ||
148 | |||
149 | if (!use_reserve) { | ||
150 | if (free_huge_pages <= reserved_huge_pages) | ||
151 | goto fail; | ||
152 | } else { | ||
153 | BUG_ON(reserved_huge_pages == 0); | ||
154 | reserved_huge_pages--; | ||
130 | } | 155 | } |
156 | |||
157 | page = dequeue_huge_page(vma, addr); | ||
158 | if (!page) | ||
159 | goto fail; | ||
160 | |||
131 | spin_unlock(&hugetlb_lock); | 161 | spin_unlock(&hugetlb_lock); |
132 | set_page_refcounted(page); | 162 | set_page_refcounted(page); |
133 | return page; | 163 | return page; |
164 | |||
165 | fail: | ||
166 | WARN_ON(use_reserve); /* reserved allocations shouldn't fail */ | ||
167 | spin_unlock(&hugetlb_lock); | ||
168 | return NULL; | ||
169 | } | ||
170 | |||
171 | /* hugetlb_extend_reservation() | ||
172 | * | ||
173 | * Ensure that at least 'atleast' hugepages are, and will remain, | ||
174 | * available to instantiate the first 'atleast' pages of the given | ||
175 | * inode. If the inode doesn't already have this many pages reserved | ||
176 | * or instantiated, set aside some hugepages in the reserved pool to | ||
177 | * satisfy later faults (or fail now if there aren't enough, rather | ||
178 | * than getting the SIGBUS later). | ||
179 | */ | ||
180 | int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info, | ||
181 | unsigned long atleast) | ||
182 | { | ||
183 | struct inode *inode = &info->vfs_inode; | ||
184 | unsigned long change_in_reserve = 0; | ||
185 | int ret = 0; | ||
186 | |||
187 | spin_lock(&hugetlb_lock); | ||
188 | read_lock_irq(&inode->i_mapping->tree_lock); | ||
189 | |||
190 | if (info->prereserved_hpages >= atleast) | ||
191 | goto out; | ||
192 | |||
193 | /* Because we always call this on shared mappings, none of the | ||
194 | * pages beyond info->prereserved_hpages can have been | ||
195 | * instantiated, so we need to reserve all of them now. */ | ||
196 | change_in_reserve = atleast - info->prereserved_hpages; | ||
197 | |||
198 | if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) { | ||
199 | ret = -ENOMEM; | ||
200 | goto out; | ||
201 | } | ||
202 | |||
203 | reserved_huge_pages += change_in_reserve; | ||
204 | info->prereserved_hpages = atleast; | ||
205 | |||
206 | out: | ||
207 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
208 | spin_unlock(&hugetlb_lock); | ||
209 | |||
210 | return ret; | ||
211 | } | ||
212 | |||
213 | /* hugetlb_truncate_reservation() | ||
214 | * | ||
215 | * This returns pages reserved for the given inode to the general free | ||
216 | * hugepage pool. If the inode has any pages prereserved, but not | ||
217 | * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release | ||
218 | * them. | ||
219 | */ | ||
220 | void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info, | ||
221 | unsigned long atmost) | ||
222 | { | ||
223 | struct inode *inode = &info->vfs_inode; | ||
224 | struct address_space *mapping = inode->i_mapping; | ||
225 | unsigned long idx; | ||
226 | unsigned long change_in_reserve = 0; | ||
227 | struct page *page; | ||
228 | |||
229 | spin_lock(&hugetlb_lock); | ||
230 | read_lock_irq(&inode->i_mapping->tree_lock); | ||
231 | |||
232 | if (info->prereserved_hpages <= atmost) | ||
233 | goto out; | ||
234 | |||
235 | /* Count pages which were reserved, but not instantiated, and | ||
236 | * which we can now release. */ | ||
237 | for (idx = atmost; idx < info->prereserved_hpages; idx++) { | ||
238 | page = radix_tree_lookup(&mapping->page_tree, idx); | ||
239 | if (!page) | ||
240 | /* Only uninstantiated pages are counted here: | ||
241 | * instantiated pages can't be unreserved (they have | ||
242 | * already been removed from the reserved pool) */ | ||
243 | change_in_reserve++; | ||
244 | } | ||
245 | |||
246 | BUG_ON(reserved_huge_pages < change_in_reserve); | ||
247 | reserved_huge_pages -= change_in_reserve; | ||
248 | info->prereserved_hpages = atmost; | ||
249 | |||
250 | out: | ||
251 | read_unlock_irq(&inode->i_mapping->tree_lock); | ||
252 | spin_unlock(&hugetlb_lock); | ||
134 | } | 253 | } |
135 | 254 | ||
136 | static int __init hugetlb_init(void) | 255 | static int __init hugetlb_init(void) |
@@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf) | |||
238 | return sprintf(buf, | 357 | return sprintf(buf, |
239 | "HugePages_Total: %5lu\n" | 358 | "HugePages_Total: %5lu\n" |
240 | "HugePages_Free: %5lu\n" | 359 | "HugePages_Free: %5lu\n" |
360 | "HugePages_Rsvd: %5lu\n" | ||
241 | "Hugepagesize: %5lu kB\n", | 361 | "Hugepagesize: %5lu kB\n", |
242 | nr_huge_pages, | 362 | nr_huge_pages, |
243 | free_huge_pages, | 363 | free_huge_pages, |
364 | reserved_huge_pages, | ||
244 | HPAGE_SIZE/1024); | 365 | HPAGE_SIZE/1024); |
245 | } | 366 | } |
246 | 367 | ||
@@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf) | |||
253 | nid, free_huge_pages_node[nid]); | 374 | nid, free_huge_pages_node[nid]); |
254 | } | 375 | } |
255 | 376 | ||
256 | int is_hugepage_mem_enough(size_t size) | ||
257 | { | ||
258 | return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages; | ||
259 | } | ||
260 | |||
261 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ | 377 | /* Return the number of pages of memory we physically have, in PAGE_SIZE units. */ |
262 | unsigned long hugetlb_total_pages(void) | 378 | unsigned long hugetlb_total_pages(void) |
263 | { | 379 | { |