-rw-r--r--  fs/hugetlbfs/inode.c     |   8
-rw-r--r--  include/linux/hugetlb.h  |   9
-rw-r--r--  kernel/fork.c            |   9
-rw-r--r--  mm/hugetlb.c             | 158
4 files changed, 140 insertions, 44 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index aeabf80f81a5..1576bbecd084 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -103,9 +103,9 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	ret = -ENOMEM;
 	len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 
-	if (vma->vm_flags & VM_MAYSHARE &&
-	    hugetlb_reserve_pages(inode, vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
-				  len >> HPAGE_SHIFT))
+	if (hugetlb_reserve_pages(inode,
+			vma->vm_pgoff >> (HPAGE_SHIFT-PAGE_SHIFT),
+			len >> HPAGE_SHIFT, vma))
 		goto out;
 
 	ret = 0;
@@ -942,7 +942,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 		goto out_dentry;
 
 	error = -ENOMEM;
-	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT))
+	if (hugetlb_reserve_pages(inode, 0, size >> HPAGE_SHIFT, NULL))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
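The window handed to hugetlb_reserve_pages() is expressed in huge pages rather than base pages: the start comes from vm_pgoff (a base-page index) shifted down by HPAGE_SHIFT - PAGE_SHIFT, and the end comes from len, the byte offset of the end of the mapping within the file, shifted down by HPAGE_SHIFT. A stand-alone sketch of that arithmetic, assuming 4 KiB base pages and 2 MiB huge pages (the real values are per-architecture):

#include <stdio.h>

/* Assumed example values; the kernel derives these per architecture. */
#define PAGE_SHIFT	12	/* 4 KiB base pages */
#define HPAGE_SHIFT	21	/* 2 MiB huge pages */

int main(void)
{
	unsigned long vm_pgoff = 1024;			/* mapping starts 4 MiB into the file */
	unsigned long long vma_len = 8ULL << 20;	/* 8 MiB mapped */

	/* Mirrors the conversions hugetlbfs_file_mmap() performs. */
	unsigned long long len = vma_len + ((unsigned long long)vm_pgoff << PAGE_SHIFT);
	long from = vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);	/* 1024 >> 9   = 2 */
	long to = len >> HPAGE_SHIFT;				/* 12 MiB >> 21 = 6 */

	printf("reserve huge pages [%ld, %ld) -> %ld pages\n", from, to, to - from);
	return 0;
}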
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index a79e80b689d8..185b14c9f021 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -17,6 +17,7 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 	return vma->vm_flags & VM_HUGETLB;
 }
 
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
 int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
 int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *);
@@ -30,7 +31,8 @@ int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
-int hugetlb_reserve_pages(struct inode *inode, long from, long to);
+int hugetlb_reserve_pages(struct inode *inode, long from, long to,
+						struct vm_area_struct *vma);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long max_huge_pages;
@@ -58,6 +60,11 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
 	return 0;
 }
+
+static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+}
+
 static inline unsigned long hugetlb_total_pages(void)
 {
 	return 0;
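The !CONFIG_HUGETLB_PAGE branch gains an empty inline stub so callers such as dup_mmap() can invoke reset_vma_resv_huge_pages() unconditionally and have the call compile away when hugetlb support is not built. A condensed, stand-alone illustration of the pattern (the struct is left opaque and the surrounding context is assumed):

struct vm_area_struct;	/* opaque for the illustration */

#ifdef CONFIG_HUGETLB_PAGE
/* Real implementation lives in mm/hugetlb.c and clears vm_private_data. */
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
#else
/* No hugetlb support: the call site still compiles, but does nothing. */
static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}
#endif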
diff --git a/kernel/fork.c b/kernel/fork.c
index adefc1131f27..552c8d8e77ad 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -33,6 +33,7 @@
 #include <linux/cpu.h>
 #include <linux/cgroup.h>
 #include <linux/security.h>
+#include <linux/hugetlb.h>
 #include <linux/swap.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -307,6 +308,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		}
 
 		/*
+		 * Clear hugetlb-related page reserves for children. This only
+		 * affects MAP_PRIVATE mappings. Faults generated by the child
+		 * are not guaranteed to succeed, even if read-only
+		 */
+		if (is_vm_hugetlb_page(tmp))
+			reset_vma_resv_huge_pages(tmp);
+
+		/*
 		 * Link in the new vma and copy the page table entries.
 		 */
 		*pprev = tmp;
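The effect of clearing the reserve at fork() is visible from userspace: only the process that created a MAP_PRIVATE hugetlbfs mapping keeps the guarantee that its faults will succeed, while a forked child competes for whatever huge pages are genuinely free and may be killed if none remain. A rough user-space sketch of the scenario; the mount point, page size and outcome are all assumptions and depend on the system's huge page pool:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

#define LEN (4UL << 21)		/* four 2 MiB huge pages, an assumed size */

int main(void)
{
	/* /dev/hugepages is an assumed hugetlbfs mount point. */
	int fd = open("/dev/hugepages/example", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0) {
		perror("open");
		return 1;
	}
	p = mmap(NULL, LEN, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	if (fork() == 0) {
		/* Child: its copy of the vma had the reservation cleared at
		 * fork(), so these faults may fail if the pool is empty. */
		memset(p, 0xab, LEN);
		_exit(0);
	}

	/* Parent: it called mmap(), so its faults are backed by the reserve. */
	memset(p, 0xcd, LEN);
	wait(NULL);
	return 0;
}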
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a4dbba8965f3..0af500db3632 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -40,6 +40,69 @@ static int hugetlb_next_nid;
  */
 static DEFINE_SPINLOCK(hugetlb_lock);
 
+/*
+ * These helpers are used to track how many pages are reserved for
+ * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
+ * is guaranteed to have their future faults succeed.
+ *
+ * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * the reserve counters are updated with the hugetlb_lock held. It is safe
+ * to reset the VMA at fork() time as it is not in use yet and there is no
+ * chance of the global counters getting corrupted as a result of the values.
+ */
+static unsigned long vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	if (!(vma->vm_flags & VM_SHARED))
+		return (unsigned long)vma->vm_private_data;
+	return 0;
+}
+
+static void set_vma_resv_huge_pages(struct vm_area_struct *vma,
+						unsigned long reserve)
+{
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	VM_BUG_ON(vma->vm_flags & VM_SHARED);
+
+	vma->vm_private_data = (void *)reserve;
+}
+
+/* Decrement the reserved pages in the hugepage pool by one */
+static void decrement_hugepage_resv_vma(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_SHARED) {
+		/* Shared mappings always use reserves */
+		resv_huge_pages--;
+	} else {
+		/*
+		 * Only the process that called mmap() has reserves for
+		 * private mappings.
+		 */
+		if (vma_resv_huge_pages(vma)) {
+			unsigned long reserve;
+
+			resv_huge_pages--;
+			reserve = (unsigned long)vma->vm_private_data - 1;
+			vma->vm_private_data = (void *)reserve;
+		}
+	}
+}
+
+void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+{
+	VM_BUG_ON(!is_vm_hugetlb_page(vma));
+	if (!(vma->vm_flags & VM_SHARED))
+		vma->vm_private_data = (void *)0;
+}
+
+/* Returns true if the VMA has associated reserve pages */
+static int vma_has_private_reserves(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_SHARED)
+		return 0;
+	if (!vma_resv_huge_pages(vma))
+		return 0;
+	return 1;
+}
+
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
 	int i;
@@ -101,6 +164,15 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 	struct zone *zone;
 	struct zoneref *z;
 
+	/*
+	 * A child process with MAP_PRIVATE mappings created by its parent
+	 * has no page reserves. This check ensures that reservations are
+	 * not "stolen". The child may still get SIGKILLed.
+	 */
+	if (!vma_has_private_reserves(vma) &&
+			free_huge_pages - resv_huge_pages == 0)
+		return NULL;
+
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
 		nid = zone_to_nid(zone);
@@ -111,8 +183,8 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
 			list_del(&page->lru);
 			free_huge_pages--;
 			free_huge_pages_node[nid]--;
-			if (vma && vma->vm_flags & VM_MAYSHARE)
-				resv_huge_pages--;
+			decrement_hugepage_resv_vma(vma);
+
 			break;
 		}
 	}
@@ -461,55 +533,40 @@ static void return_unused_surplus_pages(unsigned long unused_resv_pages)
 	}
 }
 
-
-static struct page *alloc_huge_page_shared(struct vm_area_struct *vma,
-						unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+				    unsigned long addr)
 {
 	struct page *page;
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned int chg = 0;
+
+	/*
+	 * Processes that did not create the mapping will have no reserves and
+	 * will not have accounted against quota. Check that the quota can be
+	 * made before satisfying the allocation
+	 */
+	if (!vma_has_private_reserves(vma)) {
+		chg = 1;
+		if (hugetlb_get_quota(inode->i_mapping, chg))
+			return ERR_PTR(-ENOSPC);
+	}
 
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page_vma(vma, addr);
 	spin_unlock(&hugetlb_lock);
-	return page ? page : ERR_PTR(-VM_FAULT_OOM);
-}
 
-static struct page *alloc_huge_page_private(struct vm_area_struct *vma,
-						unsigned long addr)
-{
-	struct page *page = NULL;
-
-	if (hugetlb_get_quota(vma->vm_file->f_mapping, 1))
-		return ERR_PTR(-VM_FAULT_SIGBUS);
-
-	spin_lock(&hugetlb_lock);
-	if (free_huge_pages > resv_huge_pages)
-		page = dequeue_huge_page_vma(vma, addr);
-	spin_unlock(&hugetlb_lock);
 	if (!page) {
 		page = alloc_buddy_huge_page(vma, addr);
 		if (!page) {
-			hugetlb_put_quota(vma->vm_file->f_mapping, 1);
+			hugetlb_put_quota(inode->i_mapping, chg);
 			return ERR_PTR(-VM_FAULT_OOM);
 		}
 	}
-	return page;
-}
 
-static struct page *alloc_huge_page(struct vm_area_struct *vma,
-						unsigned long addr)
-{
-	struct page *page;
-	struct address_space *mapping = vma->vm_file->f_mapping;
-
-	if (vma->vm_flags & VM_MAYSHARE)
-		page = alloc_huge_page_shared(vma, addr);
-	else
-		page = alloc_huge_page_private(vma, addr);
+	set_page_refcounted(page);
+	set_page_private(page, (unsigned long) mapping);
 
-	if (!IS_ERR(page)) {
-		set_page_refcounted(page);
-		set_page_private(page, (unsigned long) mapping);
-	}
 	return page;
 }
 
@@ -757,6 +814,13 @@ out:
 	return ret;
 }
 
+static void hugetlb_vm_op_close(struct vm_area_struct *vma)
+{
+	unsigned long reserve = vma_resv_huge_pages(vma);
+	if (reserve)
+		hugetlb_acct_memory(-reserve);
+}
+
 /*
  * We cannot handle pagefaults against hugetlb pages at all. They cause
  * handle_mm_fault() to try to instantiate regular-sized pages in the
@@ -771,6 +835,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 struct vm_operations_struct hugetlb_vm_ops = {
 	.fault = hugetlb_vm_op_fault,
+	.close = hugetlb_vm_op_close,
 };
 
 static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
@@ -1289,11 +1354,25 @@ static long region_truncate(struct list_head *head, long end)
 	return chg;
 }
 
-int hugetlb_reserve_pages(struct inode *inode, long from, long to)
+int hugetlb_reserve_pages(struct inode *inode,
+					long from, long to,
+					struct vm_area_struct *vma)
 {
 	long ret, chg;
 
-	chg = region_chg(&inode->i_mapping->private_list, from, to);
+	/*
+	 * Shared mappings base their reservation on the number of pages that
+	 * are already allocated on behalf of the file. Private mappings need
+	 * to reserve the full area even if read-only as mprotect() may be
+	 * called to make the mapping read-write. Assume !vma is a shm mapping
+	 */
+	if (!vma || vma->vm_flags & VM_SHARED)
+		chg = region_chg(&inode->i_mapping->private_list, from, to);
+	else {
+		chg = to - from;
+		set_vma_resv_huge_pages(vma, chg);
+	}
+
 	if (chg < 0)
 		return chg;
 
@@ -1304,7 +1383,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to)
 		hugetlb_put_quota(inode->i_mapping, chg);
 		return ret;
 	}
-	region_add(&inode->i_mapping->private_list, from, to);
+	if (!vma || vma->vm_flags & VM_SHARED)
+		region_add(&inode->i_mapping->private_list, from, to);
 	return 0;
 }
 
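Taken together, the bookkeeping is a single counter smuggled into vma->vm_private_data: hugetlb_reserve_pages() stores to - from there for private mappings, decrement_hugepage_resv_vma() ticks it down as reserved pages are consumed by faults, and hugetlb_vm_op_close() hands any unused remainder back to the global pool. A compressed model of that lifecycle with the kernel types, quota handling and locking stripped out (illustrative only, not kernel code):

#include <assert.h>
#include <stdio.h>

/* Toy model of the per-VMA private reservation lifecycle. */
static unsigned long free_huge_pages = 8;	/* pool size picked for the example */
static unsigned long resv_huge_pages;

struct toy_vma {
	unsigned long resv;	/* stands in for vma->vm_private_data */
};

/* hugetlb_reserve_pages(): a private mapping reserves its whole range up front. */
static void reserve(struct toy_vma *vma, unsigned long npages)
{
	vma->resv = npages;
	resv_huge_pages += npages;
}

/* dequeue_huge_page_vma() + decrement_hugepage_resv_vma() for a private vma. */
static int fault(struct toy_vma *vma)
{
	if (vma->resv == 0 && free_huge_pages - resv_huge_pages == 0)
		return -1;		/* no reserve and nothing truly free */
	free_huge_pages--;
	if (vma->resv) {
		vma->resv--;
		resv_huge_pages--;
	}
	return 0;
}

/* hugetlb_vm_op_close(): give back reserves that were never faulted in. */
static void close_vma(struct toy_vma *vma)
{
	resv_huge_pages -= vma->resv;
	vma->resv = 0;
}

int main(void)
{
	struct toy_vma vma = { 0 };

	reserve(&vma, 4);		/* e.g. an 8 MiB private mapping of 2 MiB pages */
	assert(fault(&vma) == 0);	/* one reserved page consumed */
	close_vma(&vma);		/* three unused reserves returned */
	printf("free=%lu resv=%lu\n", free_huge_pages, resv_huge_pages);
	return 0;
}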