aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Mel Gorman <mel@csn.ul.ie> 2009-02-10 09:02:27 -0500
committer Linus Torvalds <torvalds@linux-foundation.org> 2009-02-10 13:48:42 -0500
commit 5a6fe125950676015f5108fb71b2a67441755003 (patch)
tree c985fac46de39392466c4917c497b50bdc9c0757
parent 4c098bcd55fad34dcf224bf8343db6a9ac58fc68 (diff)
Do not account for the address space used by hugetlbfs using VM_ACCOUNT
When overcommit is disabled, the core VM accounts for pages used by anonymous shared mappings, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE. Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. It avoids overcommitting on both shared and private mappings using reservation counters that are checked and updated during mmap(). This ensures (within limits) that hugepages exist in the future when faults occur; otherwise it is too easy for applications to be SIGKILLed. As hugetlbfs makes its own reservations in a different unit from the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request that no reserves be made for hugetlbfs, at the risk of getting killed later. With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs, and can trigger an OOM storm when hugepage pools are too small, and lockups and corrupted counters otherwise. This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set. Signed-off-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- fs/hugetlbfs/inode.c 8
-rw-r--r-- include/linux/hugetlb.h 5
-rw-r--r-- include/linux/mm.h 3
-rw-r--r-- ipc/shm.c 8
-rw-r--r-- mm/fremap.c 2
-rw-r--r-- mm/hugetlb.c 39
-rw-r--r-- mm/mmap.c 38
-rw-r--r-- mm/mprotect.c 5
8 files changed, 65 insertions, 43 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6903d37af037..9b800d97a687 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
108 108
109 if (hugetlb_reserve_pages(inode, 109 if (hugetlb_reserve_pages(inode,
110 vma->vm_pgoff >> huge_page_order(h), 110 vma->vm_pgoff >> huge_page_order(h),
111 len >> huge_page_shift(h), vma)) 111 len >> huge_page_shift(h), vma,
112 vma->vm_flags))
112 goto out; 113 goto out;
113 114
114 ret = 0; 115 ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
947 can_do_mlock()); 948 can_do_mlock());
948} 949}
949 950
950struct file *hugetlb_file_setup(const char *name, size_t size) 951struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
951{ 952{
952 int error = -ENOMEM; 953 int error = -ENOMEM;
953 struct file *file; 954 struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
981 982
982 error = -ENOMEM; 983 error = -ENOMEM;
983 if (hugetlb_reserve_pages(inode, 0, 984 if (hugetlb_reserve_pages(inode, 0,
984 size >> huge_page_shift(hstate_inode(inode)), NULL)) 985 size >> huge_page_shift(hstate_inode(inode)), NULL,
986 acctflag))
985 goto out_inode; 987 goto out_inode;
986 988
987 d_instantiate(dentry, inode); 989 d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f1d2fba19ea0..af09660001c7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -33,7 +33,8 @@ unsigned long hugetlb_total_pages(void);
33int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, 33int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
34 unsigned long address, int write_access); 34 unsigned long address, int write_access);
35int hugetlb_reserve_pages(struct inode *inode, long from, long to, 35int hugetlb_reserve_pages(struct inode *inode, long from, long to,
36 struct vm_area_struct *vma); 36 struct vm_area_struct *vma,
37 int acctflags);
37void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); 38void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
38 39
39extern unsigned long hugepages_treat_as_movable; 40extern unsigned long hugepages_treat_as_movable;
@@ -138,7 +139,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
138 139
139extern const struct file_operations hugetlbfs_file_operations; 140extern const struct file_operations hugetlbfs_file_operations;
140extern struct vm_operations_struct hugetlb_vm_ops; 141extern struct vm_operations_struct hugetlb_vm_ops;
141struct file *hugetlb_file_setup(const char *name, size_t); 142struct file *hugetlb_file_setup(const char *name, size_t, int);
142int hugetlb_get_quota(struct address_space *mapping, long delta); 143int hugetlb_get_quota(struct address_space *mapping, long delta);
143void hugetlb_put_quota(struct address_space *mapping, long delta); 144void hugetlb_put_quota(struct address_space *mapping, long delta);
144 145
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8ddc98b8405..323561582c10 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1129,8 +1129,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1129 unsigned long flag, unsigned long pgoff); 1129 unsigned long flag, unsigned long pgoff);
1130extern unsigned long mmap_region(struct file *file, unsigned long addr, 1130extern unsigned long mmap_region(struct file *file, unsigned long addr,
1131 unsigned long len, unsigned long flags, 1131 unsigned long len, unsigned long flags,
1132 unsigned int vm_flags, unsigned long pgoff, 1132 unsigned int vm_flags, unsigned long pgoff);
1133 int accountable);
1134 1133
1135static inline unsigned long do_mmap(struct file *file, unsigned long addr, 1134static inline unsigned long do_mmap(struct file *file, unsigned long addr,
1136 unsigned long len, unsigned long prot, 1135 unsigned long len, unsigned long prot,
diff --git a/ipc/shm.c b/ipc/shm.c
index f8f69fad3a27..05d51d2a792c 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -340,6 +340,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
340 struct file * file; 340 struct file * file;
341 char name[13]; 341 char name[13];
342 int id; 342 int id;
343 int acctflag = 0;
343 344
344 if (size < SHMMIN || size > ns->shm_ctlmax) 345 if (size < SHMMIN || size > ns->shm_ctlmax)
345 return -EINVAL; 346 return -EINVAL;
@@ -364,11 +365,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
364 365
365 sprintf (name, "SYSV%08x", key); 366 sprintf (name, "SYSV%08x", key);
366 if (shmflg & SHM_HUGETLB) { 367 if (shmflg & SHM_HUGETLB) {
367 /* hugetlb_file_setup takes care of mlock user accounting */ 368 /* hugetlb_file_setup applies strict accounting */
368 file = hugetlb_file_setup(name, size); 369 if (shmflg & SHM_NORESERVE)
370 acctflag = VM_NORESERVE;
371 file = hugetlb_file_setup(name, size, acctflag);
369 shp->mlock_user = current_user(); 372 shp->mlock_user = current_user();
370 } else { 373 } else {
371 int acctflag = 0;
372 /* 374 /*
373 * Do not allow no accounting for OVERCOMMIT_NEVER, even 375 * Do not allow no accounting for OVERCOMMIT_NEVER, even
374 * if it's asked for. 376 * if it's asked for.
diff --git a/mm/fremap.c b/mm/fremap.c
index 736ba7f3306a..b6ec85abbb39 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
198 flags &= MAP_NONBLOCK; 198 flags &= MAP_NONBLOCK;
199 get_file(file); 199 get_file(file);
200 addr = mmap_region(file, start, size, 200 addr = mmap_region(file, start, size,
201 flags, vma->vm_flags, pgoff, 1); 201 flags, vma->vm_flags, pgoff);
202 fput(file); 202 fput(file);
203 if (IS_ERR_VALUE(addr)) { 203 if (IS_ERR_VALUE(addr)) {
204 err = addr; 204 err = addr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 618e98304080..207464209546 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
2269 2269
2270int hugetlb_reserve_pages(struct inode *inode, 2270int hugetlb_reserve_pages(struct inode *inode,
2271 long from, long to, 2271 long from, long to,
2272 struct vm_area_struct *vma) 2272 struct vm_area_struct *vma,
2273 int acctflag)
2273{ 2274{
2274 long ret, chg; 2275 long ret = 0, chg;
2275 struct hstate *h = hstate_inode(inode); 2276 struct hstate *h = hstate_inode(inode);
2276 2277
2277 if (vma && vma->vm_flags & VM_NORESERVE)
2278 return 0;
2279
2280 /* 2278 /*
2281 * Shared mappings base their reservation on the number of pages that 2279 * Shared mappings base their reservation on the number of pages that
2282 * are already allocated on behalf of the file. Private mappings need 2280 * are already allocated on behalf of the file. Private mappings need
@@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode,
2285 */ 2283 */
2286 if (!vma || vma->vm_flags & VM_SHARED) 2284 if (!vma || vma->vm_flags & VM_SHARED)
2287 chg = region_chg(&inode->i_mapping->private_list, from, to); 2285 chg = region_chg(&inode->i_mapping->private_list, from, to);
2288 else { 2286 else
2289 struct resv_map *resv_map = resv_map_alloc();
2290 if (!resv_map)
2291 return -ENOMEM;
2292
2293 chg = to - from; 2287 chg = to - from;
2294 2288
2295 set_vma_resv_map(vma, resv_map);
2296 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2297 }
2298
2299 if (chg < 0) 2289 if (chg < 0)
2300 return chg; 2290 return chg;
2301 2291
2302 if (hugetlb_get_quota(inode->i_mapping, chg)) 2292 if (hugetlb_get_quota(inode->i_mapping, chg))
2303 return -ENOSPC; 2293 return -ENOSPC;
2294
2295 /*
2296 * Only apply hugepage reservation if asked. We still have to
2297 * take the filesystem quota because it is an upper limit
2298 * defined for the mount and not necessarily memory as a whole
2299 */
2300 if (acctflag & VM_NORESERVE) {
2301 reset_vma_resv_huge_pages(vma);
2302 return 0;
2303 }
2304
2304 ret = hugetlb_acct_memory(h, chg); 2305 ret = hugetlb_acct_memory(h, chg);
2305 if (ret < 0) { 2306 if (ret < 0) {
2306 hugetlb_put_quota(inode->i_mapping, chg); 2307 hugetlb_put_quota(inode->i_mapping, chg);
@@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode,
2308 } 2309 }
2309 if (!vma || vma->vm_flags & VM_SHARED) 2310 if (!vma || vma->vm_flags & VM_SHARED)
2310 region_add(&inode->i_mapping->private_list, from, to); 2311 region_add(&inode->i_mapping->private_list, from, to);
2312 else {
2313 struct resv_map *resv_map = resv_map_alloc();
2314
2315 if (!resv_map)
2316 return -ENOMEM;
2317
2318 set_vma_resv_map(vma, resv_map);
2319 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
2320 }
2321
2311 return 0; 2322 return 0;
2312} 2323}
2313 2324
diff --git a/mm/mmap.c b/mm/mmap.c
index 214b6a258eeb..eb1270bebe67 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
918 struct inode *inode; 918 struct inode *inode;
919 unsigned int vm_flags; 919 unsigned int vm_flags;
920 int error; 920 int error;
921 int accountable = 1;
922 unsigned long reqprot = prot; 921 unsigned long reqprot = prot;
923 922
924 /* 923 /*
@@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1019 return -EPERM; 1018 return -EPERM;
1020 vm_flags &= ~VM_MAYEXEC; 1019 vm_flags &= ~VM_MAYEXEC;
1021 } 1020 }
1022 if (is_file_hugepages(file))
1023 accountable = 0;
1024 1021
1025 if (!file->f_op || !file->f_op->mmap) 1022 if (!file->f_op || !file->f_op->mmap)
1026 return -ENODEV; 1023 return -ENODEV;
@@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
1053 if (error) 1050 if (error)
1054 return error; 1051 return error;
1055 1052
1056 return mmap_region(file, addr, len, flags, vm_flags, pgoff, 1053 return mmap_region(file, addr, len, flags, vm_flags, pgoff);
1057 accountable);
1058} 1054}
1059EXPORT_SYMBOL(do_mmap_pgoff); 1055EXPORT_SYMBOL(do_mmap_pgoff);
1060 1056
@@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1092 1088
1093/* 1089/*
1094 * We account for memory if it's a private writeable mapping, 1090 * We account for memory if it's a private writeable mapping,
1095 * and VM_NORESERVE wasn't set. 1091 * not hugepages and VM_NORESERVE wasn't set.
1096 */ 1092 */
1097static inline int accountable_mapping(unsigned int vm_flags) 1093static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
1098{ 1094{
1095 /*
1096 * hugetlb has its own accounting separate from the core VM
1097 * VM_HUGETLB may not be set yet so we cannot check for that flag.
1098 */
1099 if (file && is_file_hugepages(file))
1100 return 0;
1101
1099 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; 1102 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1100} 1103}
1101 1104
1102unsigned long mmap_region(struct file *file, unsigned long addr, 1105unsigned long mmap_region(struct file *file, unsigned long addr,
1103 unsigned long len, unsigned long flags, 1106 unsigned long len, unsigned long flags,
1104 unsigned int vm_flags, unsigned long pgoff, 1107 unsigned int vm_flags, unsigned long pgoff)
1105 int accountable)
1106{ 1108{
1107 struct mm_struct *mm = current->mm; 1109 struct mm_struct *mm = current->mm;
1108 struct vm_area_struct *vma, *prev; 1110 struct vm_area_struct *vma, *prev;
@@ -1128,18 +1130,22 @@ munmap_back:
1128 1130
1129 /* 1131 /*
1130 * Set 'VM_NORESERVE' if we should not account for the 1132 * Set 'VM_NORESERVE' if we should not account for the
1131 * memory use of this mapping. We only honor MAP_NORESERVE 1133 * memory use of this mapping.
1132 * if we're allowed to overcommit memory.
1133 */ 1134 */
1134 if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) 1135 if ((flags & MAP_NORESERVE)) {
1135 vm_flags |= VM_NORESERVE; 1136 /* We honor MAP_NORESERVE if allowed to overcommit */
1136 if (!accountable) 1137 if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1137 vm_flags |= VM_NORESERVE; 1138 vm_flags |= VM_NORESERVE;
1139
1140 /* hugetlb applies strict overcommit unless MAP_NORESERVE */
1141 if (file && is_file_hugepages(file))
1142 vm_flags |= VM_NORESERVE;
1143 }
1138 1144
1139 /* 1145 /*
1140 * Private writable mapping: check memory availability 1146 * Private writable mapping: check memory availability
1141 */ 1147 */
1142 if (accountable_mapping(vm_flags)) { 1148 if (accountable_mapping(file, vm_flags)) {
1143 charged = len >> PAGE_SHIFT; 1149 charged = len >> PAGE_SHIFT;
1144 if (security_vm_enough_memory(charged)) 1150 if (security_vm_enough_memory(charged))
1145 return -ENOMEM; 1151 return -ENOMEM;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abe2694e13f4..258197b76fb4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
151 /* 151 /*
152 * If we make a private mapping writable we increase our commit; 152 * If we make a private mapping writable we increase our commit;
153 * but (without finer accounting) cannot reduce our commit if we 153 * but (without finer accounting) cannot reduce our commit if we
154 * make it unwritable again. 154 * make it unwritable again. hugetlb mapping were accounted for
155 * even if read-only so there is no need to account for them here
155 */ 156 */
156 if (newflags & VM_WRITE) { 157 if (newflags & VM_WRITE) {
157 if (!(oldflags & (VM_ACCOUNT|VM_WRITE| 158 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
158 VM_SHARED|VM_NORESERVE))) { 159 VM_SHARED|VM_NORESERVE))) {
159 charged = nrpages; 160 charged = nrpages;
160 if (security_vm_enough_memory(charged)) 161 if (security_vm_enough_memory(charged))