diff options
author | Michel Lespinasse <walken@google.com> | 2013-02-22 19:32:37 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-02-23 20:50:10 -0500 |
commit | bebeb3d68b24bb4132d452c5707fe321208bcbcd (patch) | |
tree | 6e609cb7323fb1b4b7026fa0e35867145a181094 | |
parent | 940e7da5163029978c2f6b5bbe213607add59062 (diff) |
mm: introduce mm_populate() for populating new vmas
When creating new mappings using the MAP_POPULATE / MAP_LOCKED flags (or
with MCL_FUTURE in effect), we want to populate the pages within the
newly created vmas. This may take a while as we may have to read pages
from disk, so ideally we want to do this outside of the write-locked
mmap_sem region.
This change introduces mm_populate(), which is used to defer populating
such mappings until after the mmap_sem write lock has been released.
This is implemented as a generalization of the former do_mlock_pages(),
which accomplished the same task but was used during mlock() /
mlockall().
Signed-off-by: Michel Lespinasse <walken@google.com>
Reported-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Rik van Riel <riel@redhat.com>
Tested-by: Andy Lutomirski <luto@amacapital.net>
Cc: Greg Ungerer <gregungerer@westnet.com.au>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | fs/aio.c | 6 | ||||
-rw-r--r-- | include/linux/mm.h | 18 | ||||
-rw-r--r-- | ipc/shm.c | 12 | ||||
-rw-r--r-- | mm/mlock.c | 17 | ||||
-rw-r--r-- | mm/mmap.c | 20 | ||||
-rw-r--r-- | mm/nommu.c | 5 | ||||
-rw-r--r-- | mm/util.c | 6 |
7 files changed, 62 insertions, 22 deletions
@@ -103,6 +103,7 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
103 | unsigned nr_events = ctx->max_reqs; | 103 | unsigned nr_events = ctx->max_reqs; |
104 | unsigned long size; | 104 | unsigned long size; |
105 | int nr_pages; | 105 | int nr_pages; |
106 | bool populate; | ||
106 | 107 | ||
107 | /* Compensate for the ring buffer's head/tail overlap entry */ | 108 | /* Compensate for the ring buffer's head/tail overlap entry */ |
108 | nr_events += 2; /* 1 is required, 2 for good luck */ | 109 | nr_events += 2; /* 1 is required, 2 for good luck */ |
@@ -129,7 +130,8 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
129 | down_write(&ctx->mm->mmap_sem); | 130 | down_write(&ctx->mm->mmap_sem); |
130 | info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, | 131 | info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, |
131 | PROT_READ|PROT_WRITE, | 132 | PROT_READ|PROT_WRITE, |
132 | MAP_ANONYMOUS|MAP_PRIVATE, 0); | 133 | MAP_ANONYMOUS|MAP_PRIVATE, 0, |
134 | &populate); | ||
133 | if (IS_ERR((void *)info->mmap_base)) { | 135 | if (IS_ERR((void *)info->mmap_base)) { |
134 | up_write(&ctx->mm->mmap_sem); | 136 | up_write(&ctx->mm->mmap_sem); |
135 | info->mmap_size = 0; | 137 | info->mmap_size = 0; |
@@ -147,6 +149,8 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
147 | aio_free_ring(ctx); | 149 | aio_free_ring(ctx); |
148 | return -EAGAIN; | 150 | return -EAGAIN; |
149 | } | 151 | } |
152 | if (populate) | ||
153 | mm_populate(info->mmap_base, info->mmap_size); | ||
150 | 154 | ||
151 | ctx->user_id = info->mmap_base; | 155 | ctx->user_id = info->mmap_base; |
152 | 156 | ||
diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d9dcc35d6a1..da0a0fe970c2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -1474,11 +1474,23 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo | |||
1474 | extern unsigned long mmap_region(struct file *file, unsigned long addr, | 1474 | extern unsigned long mmap_region(struct file *file, unsigned long addr, |
1475 | unsigned long len, unsigned long flags, | 1475 | unsigned long len, unsigned long flags, |
1476 | vm_flags_t vm_flags, unsigned long pgoff); | 1476 | vm_flags_t vm_flags, unsigned long pgoff); |
1477 | extern unsigned long do_mmap_pgoff(struct file *, unsigned long, | 1477 | extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
1478 | unsigned long, unsigned long, | 1478 | unsigned long len, unsigned long prot, unsigned long flags, |
1479 | unsigned long, unsigned long); | 1479 | unsigned long pgoff, bool *populate); |
1480 | extern int do_munmap(struct mm_struct *, unsigned long, size_t); | 1480 | extern int do_munmap(struct mm_struct *, unsigned long, size_t); |
1481 | 1481 | ||
1482 | #ifdef CONFIG_MMU | ||
1483 | extern int __mm_populate(unsigned long addr, unsigned long len, | ||
1484 | int ignore_errors); | ||
1485 | static inline void mm_populate(unsigned long addr, unsigned long len) | ||
1486 | { | ||
1487 | /* Ignore errors */ | ||
1488 | (void) __mm_populate(addr, len, 1); | ||
1489 | } | ||
1490 | #else | ||
1491 | static inline void mm_populate(unsigned long addr, unsigned long len) {} | ||
1492 | #endif | ||
1493 | |||
1482 | /* These take the mm semaphore themselves */ | 1494 | /* These take the mm semaphore themselves */ |
1483 | extern unsigned long vm_brk(unsigned long, unsigned long); | 1495 | extern unsigned long vm_brk(unsigned long, unsigned long); |
1484 | extern int vm_munmap(unsigned long, size_t); | 1496 | extern int vm_munmap(unsigned long, size_t); |
@@ -967,11 +967,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, | |||
967 | unsigned long flags; | 967 | unsigned long flags; |
968 | unsigned long prot; | 968 | unsigned long prot; |
969 | int acc_mode; | 969 | int acc_mode; |
970 | unsigned long user_addr; | ||
971 | struct ipc_namespace *ns; | 970 | struct ipc_namespace *ns; |
972 | struct shm_file_data *sfd; | 971 | struct shm_file_data *sfd; |
973 | struct path path; | 972 | struct path path; |
974 | fmode_t f_mode; | 973 | fmode_t f_mode; |
974 | bool populate = false; | ||
975 | 975 | ||
976 | err = -EINVAL; | 976 | err = -EINVAL; |
977 | if (shmid < 0) | 977 | if (shmid < 0) |
@@ -1070,13 +1070,15 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, | |||
1070 | goto invalid; | 1070 | goto invalid; |
1071 | } | 1071 | } |
1072 | 1072 | ||
1073 | user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0); | 1073 | addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate); |
1074 | *raddr = user_addr; | 1074 | *raddr = addr; |
1075 | err = 0; | 1075 | err = 0; |
1076 | if (IS_ERR_VALUE(user_addr)) | 1076 | if (IS_ERR_VALUE(addr)) |
1077 | err = (long)user_addr; | 1077 | err = (long)addr; |
1078 | invalid: | 1078 | invalid: |
1079 | up_write(¤t->mm->mmap_sem); | 1079 | up_write(¤t->mm->mmap_sem); |
1080 | if (populate) | ||
1081 | mm_populate(addr, size); | ||
1080 | 1082 | ||
1081 | out_fput: | 1083 | out_fput: |
1082 | fput(file); | 1084 | fput(file); |
diff --git a/mm/mlock.c b/mm/mlock.c index c9bd528b01d2..a296a49865df 100644 --- a/mm/mlock.c +++ b/mm/mlock.c | |||
@@ -416,7 +416,14 @@ static int do_mlock(unsigned long start, size_t len, int on) | |||
416 | return error; | 416 | return error; |
417 | } | 417 | } |
418 | 418 | ||
419 | static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) | 419 | /* |
420 | * __mm_populate - populate and/or mlock pages within a range of address space. | ||
421 | * | ||
422 | * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap | ||
423 | * flags. VMAs must be already marked with the desired vm_flags, and | ||
424 | * mmap_sem must not be held. | ||
425 | */ | ||
426 | int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) | ||
420 | { | 427 | { |
421 | struct mm_struct *mm = current->mm; | 428 | struct mm_struct *mm = current->mm; |
422 | unsigned long end, nstart, nend; | 429 | unsigned long end, nstart, nend; |
@@ -498,7 +505,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) | |||
498 | error = do_mlock(start, len, 1); | 505 | error = do_mlock(start, len, 1); |
499 | up_write(¤t->mm->mmap_sem); | 506 | up_write(¤t->mm->mmap_sem); |
500 | if (!error) | 507 | if (!error) |
501 | error = do_mlock_pages(start, len, 0); | 508 | error = __mm_populate(start, len, 0); |
502 | return error; | 509 | return error; |
503 | } | 510 | } |
504 | 511 | ||
@@ -564,10 +571,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) | |||
564 | capable(CAP_IPC_LOCK)) | 571 | capable(CAP_IPC_LOCK)) |
565 | ret = do_mlockall(flags); | 572 | ret = do_mlockall(flags); |
566 | up_write(¤t->mm->mmap_sem); | 573 | up_write(¤t->mm->mmap_sem); |
567 | if (!ret && (flags & MCL_CURRENT)) { | 574 | if (!ret && (flags & MCL_CURRENT)) |
568 | /* Ignore errors */ | 575 | mm_populate(0, TASK_SIZE); |
569 | do_mlock_pages(0, TASK_SIZE, 1); | ||
570 | } | ||
571 | out: | 576 | out: |
572 | return ret; | 577 | return ret; |
573 | } | 578 | } |
@@ -1154,12 +1154,15 @@ static inline unsigned long round_hint_to_min(unsigned long hint) | |||
1154 | 1154 | ||
1155 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | 1155 | unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, |
1156 | unsigned long len, unsigned long prot, | 1156 | unsigned long len, unsigned long prot, |
1157 | unsigned long flags, unsigned long pgoff) | 1157 | unsigned long flags, unsigned long pgoff, |
1158 | bool *populate) | ||
1158 | { | 1159 | { |
1159 | struct mm_struct * mm = current->mm; | 1160 | struct mm_struct * mm = current->mm; |
1160 | struct inode *inode; | 1161 | struct inode *inode; |
1161 | vm_flags_t vm_flags; | 1162 | vm_flags_t vm_flags; |
1162 | 1163 | ||
1164 | *populate = false; | ||
1165 | |||
1163 | /* | 1166 | /* |
1164 | * Does the application expect PROT_READ to imply PROT_EXEC? | 1167 | * Does the application expect PROT_READ to imply PROT_EXEC? |
1165 | * | 1168 | * |
@@ -1280,7 +1283,12 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, | |||
1280 | } | 1283 | } |
1281 | } | 1284 | } |
1282 | 1285 | ||
1283 | return mmap_region(file, addr, len, flags, vm_flags, pgoff); | 1286 | addr = mmap_region(file, addr, len, flags, vm_flags, pgoff); |
1287 | if (!IS_ERR_VALUE(addr) && | ||
1288 | ((vm_flags & VM_LOCKED) || | ||
1289 | (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE)) | ||
1290 | *populate = true; | ||
1291 | return addr; | ||
1284 | } | 1292 | } |
1285 | 1293 | ||
1286 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, | 1294 | SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, |
@@ -1531,10 +1539,12 @@ out: | |||
1531 | 1539 | ||
1532 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); | 1540 | vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); |
1533 | if (vm_flags & VM_LOCKED) { | 1541 | if (vm_flags & VM_LOCKED) { |
1534 | if (!mlock_vma_pages_range(vma, addr, addr + len)) | 1542 | if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || |
1543 | vma == get_gate_vma(current->mm))) | ||
1535 | mm->locked_vm += (len >> PAGE_SHIFT); | 1544 | mm->locked_vm += (len >> PAGE_SHIFT); |
1536 | } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) | 1545 | else |
1537 | make_pages_present(addr, addr + len); | 1546 | vma->vm_flags &= ~VM_LOCKED; |
1547 | } | ||
1538 | 1548 | ||
1539 | if (file) | 1549 | if (file) |
1540 | uprobe_mmap(vma); | 1550 | uprobe_mmap(vma); |
diff --git a/mm/nommu.c b/mm/nommu.c index b20db4e22263..7296a5a280e7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -1250,7 +1250,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1250 | unsigned long len, | 1250 | unsigned long len, |
1251 | unsigned long prot, | 1251 | unsigned long prot, |
1252 | unsigned long flags, | 1252 | unsigned long flags, |
1253 | unsigned long pgoff) | 1253 | unsigned long pgoff, |
1254 | bool *populate) | ||
1254 | { | 1255 | { |
1255 | struct vm_area_struct *vma; | 1256 | struct vm_area_struct *vma; |
1256 | struct vm_region *region; | 1257 | struct vm_region *region; |
@@ -1260,6 +1261,8 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1260 | 1261 | ||
1261 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); | 1262 | kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); |
1262 | 1263 | ||
1264 | *populate = false; | ||
1265 | |||
1263 | /* decide whether we should attempt the mapping, and if so what sort of | 1266 | /* decide whether we should attempt the mapping, and if so what sort of |
1264 | * mapping */ | 1267 | * mapping */ |
1265 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, | 1268 | ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, |
@@ -355,12 +355,16 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, | |||
355 | { | 355 | { |
356 | unsigned long ret; | 356 | unsigned long ret; |
357 | struct mm_struct *mm = current->mm; | 357 | struct mm_struct *mm = current->mm; |
358 | bool populate; | ||
358 | 359 | ||
359 | ret = security_mmap_file(file, prot, flag); | 360 | ret = security_mmap_file(file, prot, flag); |
360 | if (!ret) { | 361 | if (!ret) { |
361 | down_write(&mm->mmap_sem); | 362 | down_write(&mm->mmap_sem); |
362 | ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff); | 363 | ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, |
364 | &populate); | ||
363 | up_write(&mm->mmap_sem); | 365 | up_write(&mm->mmap_sem); |
366 | if (!IS_ERR_VALUE(ret) && populate) | ||
367 | mm_populate(ret, len); | ||
364 | } | 368 | } |
365 | return ret; | 369 | return ret; |
366 | } | 370 | } |