aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMichal Hocko <mhocko@suse.com>2016-05-23 19:25:27 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-05-23 20:04:14 -0400
commitdc0ef0df7b6a90892ec41933212ac701152a254c (patch)
treeaf3ab3813d51334e8af5762e0bd4580bc33ea764
parente10af1328b13554dee3de91b713496704cb5822e (diff)
mm: make mmap_sem for write waits killable for mm syscalls
This is a follow up work for oom_reaper [1]. As the async OOM killing depends on oom_sem for read we would really appreciate if a holder for write didn't stand in the way. This patchset is changing many of down_write calls to be killable to help those cases when the writer is blocked and waiting for readers to release the lock and so help __oom_reap_task to process the oom victim. Most of the patches are really trivial because the lock is held from shallow syscall paths where we can return EINTR trivially and allow the current task to die (note that EINTR will never get to the userspace as the task has fatal signal pending). Others seem to be easy as well as the callers are already handling fatal errors and bail and return to userspace which should be sufficient to handle the failure gracefully. I am not familiar with all those code paths so a deeper review is really appreciated. As this work is touching more areas which are not directly connected I have tried to keep the CC list as small as possible and people who I believed would be familiar are CCed only to the specific patches (all should have received the cover though). This patchset is based on linux-next and it depends on down_write_killable for rw_semaphores which got merged into tip locking/rwsem branch and it is merged into this next tree. I guess it would be easiest to route these patches via mmotm because of the dependency on the tip tree but if respective maintainers prefer another way I have no objections. I haven't covered all the mmap_write(mm->mmap_sem) instances here $ git grep "down_write(.*\<mmap_sem\>)" next/master | wc -l 98 $ git grep "down_write(.*\<mmap_sem\>)" | wc -l 62 I have tried to cover those which should be relatively easy to review in this series because this alone should be a nice improvement. Other places can be changed on top. 
[0] http://lkml.kernel.org/r/1456752417-9626-1-git-send-email-mhocko@kernel.org [1] http://lkml.kernel.org/r/1452094975-551-1-git-send-email-mhocko@kernel.org [2] http://lkml.kernel.org/r/1456750705-7141-1-git-send-email-mhocko@kernel.org This patch (of 18): This is the first step in making mmap_sem write waiters killable. It focuses on the trivial ones which are taking the lock early after entering the syscall and they are not changing state before. Therefore it is very easy to change them to use down_write_killable and immediately return with -EINTR. This will allow the waiter to pass away without blocking the mmap_sem which might be required to make a forward progress. E.g. the oom reaper will need the lock for reading to dismantle the OOM victim address space. The only tricky function in this patch is vm_mmap_pgoff which has many call sites via vm_mmap. To reduce the risk keep vm_mmap with the original non-killable semantic for now. vm_munmap callers do not bother checking the return value so open code it into the munmap syscall path for now for simplicity. Signed-off-by: Michal Hocko <mhocko@suse.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Cc: Mel Gorman <mgorman@suse.de> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Konstantin Khlebnikov <koct9i@gmail.com> Cc: Hugh Dickins <hughd@google.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: David Rientjes <rientjes@google.com> Cc: Dave Hansen <dave.hansen@linux.intel.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--mm/internal.h5
-rw-r--r--mm/madvise.c8
-rw-r--r--mm/mlock.c16
-rw-r--r--mm/mmap.c27
-rw-r--r--mm/mprotect.c3
-rw-r--r--mm/mremap.c3
-rw-r--r--mm/nommu.c2
-rw-r--r--mm/util.c12
8 files changed, 55 insertions, 21 deletions
diff --git a/mm/internal.h b/mm/internal.h
index f6f3353b0868..bff7fd702331 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -442,9 +442,10 @@ extern u64 hwpoison_filter_flags_value;
442extern u64 hwpoison_filter_memcg; 442extern u64 hwpoison_filter_memcg;
443extern u32 hwpoison_filter_enable; 443extern u32 hwpoison_filter_enable;
444 444
445extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, 445extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
446 unsigned long, unsigned long, 446 unsigned long, unsigned long,
447 unsigned long, unsigned long); 447 unsigned long, unsigned long,
448 bool);
448 449
449extern void set_pageblock_order(void); 450extern void set_pageblock_order(void);
450unsigned long reclaim_clean_pages_from_list(struct zone *zone, 451unsigned long reclaim_clean_pages_from_list(struct zone *zone,
diff --git a/mm/madvise.c b/mm/madvise.c
index 07427d3fcead..93fb63e88b5e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -707,10 +707,12 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
707 return error; 707 return error;
708 708
709 write = madvise_need_mmap_write(behavior); 709 write = madvise_need_mmap_write(behavior);
710 if (write) 710 if (write) {
711 down_write(&current->mm->mmap_sem); 711 if (down_write_killable(&current->mm->mmap_sem))
712 else 712 return -EINTR;
713 } else {
713 down_read(&current->mm->mmap_sem); 714 down_read(&current->mm->mmap_sem);
715 }
714 716
715 /* 717 /*
716 * If the interval [start,end) covers some unmapped address 718 * If the interval [start,end) covers some unmapped address
diff --git a/mm/mlock.c b/mm/mlock.c
index 96f001041928..ef8dc9f395c4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -617,7 +617,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len,
617 return error; 617 return error;
618} 618}
619 619
620static int do_mlock(unsigned long start, size_t len, vm_flags_t flags) 620static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
621{ 621{
622 unsigned long locked; 622 unsigned long locked;
623 unsigned long lock_limit; 623 unsigned long lock_limit;
@@ -635,7 +635,8 @@ static int do_mlock(unsigned long start, size_t len, vm_flags_t flags)
635 lock_limit >>= PAGE_SHIFT; 635 lock_limit >>= PAGE_SHIFT;
636 locked = len >> PAGE_SHIFT; 636 locked = len >> PAGE_SHIFT;
637 637
638 down_write(&current->mm->mmap_sem); 638 if (down_write_killable(&current->mm->mmap_sem))
639 return -EINTR;
639 640
640 locked += current->mm->locked_vm; 641 locked += current->mm->locked_vm;
641 642
@@ -678,7 +679,8 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
678 len = PAGE_ALIGN(len + (offset_in_page(start))); 679 len = PAGE_ALIGN(len + (offset_in_page(start)));
679 start &= PAGE_MASK; 680 start &= PAGE_MASK;
680 681
681 down_write(&current->mm->mmap_sem); 682 if (down_write_killable(&current->mm->mmap_sem))
683 return -EINTR;
682 ret = apply_vma_lock_flags(start, len, 0); 684 ret = apply_vma_lock_flags(start, len, 0);
683 up_write(&current->mm->mmap_sem); 685 up_write(&current->mm->mmap_sem);
684 686
@@ -748,9 +750,10 @@ SYSCALL_DEFINE1(mlockall, int, flags)
748 lock_limit = rlimit(RLIMIT_MEMLOCK); 750 lock_limit = rlimit(RLIMIT_MEMLOCK);
749 lock_limit >>= PAGE_SHIFT; 751 lock_limit >>= PAGE_SHIFT;
750 752
751 ret = -ENOMEM; 753 if (down_write_killable(&current->mm->mmap_sem))
752 down_write(&current->mm->mmap_sem); 754 return -EINTR;
753 755
756 ret = -ENOMEM;
754 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) || 757 if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
755 capable(CAP_IPC_LOCK)) 758 capable(CAP_IPC_LOCK))
756 ret = apply_mlockall_flags(flags); 759 ret = apply_mlockall_flags(flags);
@@ -765,7 +768,8 @@ SYSCALL_DEFINE0(munlockall)
765{ 768{
766 int ret; 769 int ret;
767 770
768 down_write(&current->mm->mmap_sem); 771 if (down_write_killable(&current->mm->mmap_sem))
772 return -EINTR;
769 ret = apply_mlockall_flags(0); 773 ret = apply_mlockall_flags(0);
770 up_write(&current->mm->mmap_sem); 774 up_write(&current->mm->mmap_sem);
771 return ret; 775 return ret;
diff --git a/mm/mmap.c b/mm/mmap.c
index b9274a0c82c9..11e1f2ca72af 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -178,7 +178,8 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
178 unsigned long min_brk; 178 unsigned long min_brk;
179 bool populate; 179 bool populate;
180 180
181 down_write(&mm->mmap_sem); 181 if (down_write_killable(&mm->mmap_sem))
182 return -EINTR;
182 183
183#ifdef CONFIG_COMPAT_BRK 184#ifdef CONFIG_COMPAT_BRK
184 /* 185 /*
@@ -1332,7 +1333,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1332 1333
1333 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1334 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1334 1335
1335 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1336 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true);
1336out_fput: 1337out_fput:
1337 if (file) 1338 if (file)
1338 fput(file); 1339 fput(file);
@@ -2493,6 +2494,10 @@ int vm_munmap(unsigned long start, size_t len)
2493 int ret; 2494 int ret;
2494 struct mm_struct *mm = current->mm; 2495 struct mm_struct *mm = current->mm;
2495 2496
2497 /*
2498 * XXX convert to down_write_killable as soon as all users are able
2499 * to handle the error.
2500 */
2496 down_write(&mm->mmap_sem); 2501 down_write(&mm->mmap_sem);
2497 ret = do_munmap(mm, start, len); 2502 ret = do_munmap(mm, start, len);
2498 up_write(&mm->mmap_sem); 2503 up_write(&mm->mmap_sem);
@@ -2502,8 +2507,15 @@ EXPORT_SYMBOL(vm_munmap);
2502 2507
2503SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) 2508SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
2504{ 2509{
2510 int ret;
2511 struct mm_struct *mm = current->mm;
2512
2505 profile_munmap(addr); 2513 profile_munmap(addr);
2506 return vm_munmap(addr, len); 2514 if (down_write_killable(&mm->mmap_sem))
2515 return -EINTR;
2516 ret = do_munmap(mm, addr, len);
2517 up_write(&mm->mmap_sem);
2518 return ret;
2507} 2519}
2508 2520
2509 2521
@@ -2535,7 +2547,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
2535 if (pgoff + (size >> PAGE_SHIFT) < pgoff) 2547 if (pgoff + (size >> PAGE_SHIFT) < pgoff)
2536 return ret; 2548 return ret;
2537 2549
2538 down_write(&mm->mmap_sem); 2550 if (down_write_killable(&mm->mmap_sem))
2551 return -EINTR;
2552
2539 vma = find_vma(mm, start); 2553 vma = find_vma(mm, start);
2540 2554
2541 if (!vma || !(vma->vm_flags & VM_SHARED)) 2555 if (!vma || !(vma->vm_flags & VM_SHARED))
@@ -2700,6 +2714,11 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
2700 unsigned long ret; 2714 unsigned long ret;
2701 bool populate; 2715 bool populate;
2702 2716
2717 /*
2718	 * XXX not all users are checking the return value, convert
2719 * to down_write_killable after they are able to cope with
2720 * error
2721 */
2703 down_write(&mm->mmap_sem); 2722 down_write(&mm->mmap_sem);
2704 ret = do_brk(addr, len); 2723 ret = do_brk(addr, len);
2705 populate = ((mm->def_flags & VM_LOCKED) != 0); 2724 populate = ((mm->def_flags & VM_LOCKED) != 0);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index b650c5412f58..5019a1ef2848 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -379,7 +379,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
379 379
380 reqprot = prot; 380 reqprot = prot;
381 381
382 down_write(&current->mm->mmap_sem); 382 if (down_write_killable(&current->mm->mmap_sem))
383 return -EINTR;
383 384
384 vma = find_vma(current->mm, start); 385 vma = find_vma(current->mm, start);
385 error = -ENOMEM; 386 error = -ENOMEM;
diff --git a/mm/mremap.c b/mm/mremap.c
index 9dc499977924..1f157adfdaf9 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -503,7 +503,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
503 if (!new_len) 503 if (!new_len)
504 return ret; 504 return ret;
505 505
506 down_write(&current->mm->mmap_sem); 506 if (down_write_killable(&current->mm->mmap_sem))
507 return -EINTR;
507 508
508 if (flags & MREMAP_FIXED) { 509 if (flags & MREMAP_FIXED) {
509 ret = mremap_to(addr, old_len, new_addr, new_len, 510 ret = mremap_to(addr, old_len, new_addr, new_len,
diff --git a/mm/nommu.c b/mm/nommu.c
index c8bd59a03c71..b74512746aae 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1446,7 +1446,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
1446 1446
1447 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); 1447 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
1448 1448
1449 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); 1449 retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff, true);
1450 1450
1451 if (file) 1451 if (file)
1452 fput(file); 1452 fput(file);
diff --git a/mm/util.c b/mm/util.c
index 8a1b3a1fb595..03b237746850 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
289 289
290unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr, 290unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
291 unsigned long len, unsigned long prot, 291 unsigned long len, unsigned long prot,
292 unsigned long flag, unsigned long pgoff) 292 unsigned long flag, unsigned long pgoff, bool killable)
293{ 293{
294 unsigned long ret; 294 unsigned long ret;
295 struct mm_struct *mm = current->mm; 295 struct mm_struct *mm = current->mm;
@@ -297,7 +297,12 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
297 297
298 ret = security_mmap_file(file, prot, flag); 298 ret = security_mmap_file(file, prot, flag);
299 if (!ret) { 299 if (!ret) {
300 down_write(&mm->mmap_sem); 300 if (killable) {
301 if (down_write_killable(&mm->mmap_sem))
302 return -EINTR;
303 } else {
304 down_write(&mm->mmap_sem);
305 }
301 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff, 306 ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
302 &populate); 307 &populate);
303 up_write(&mm->mmap_sem); 308 up_write(&mm->mmap_sem);
@@ -307,6 +312,7 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
307 return ret; 312 return ret;
308} 313}
309 314
315/* XXX are all callers checking an error */
310unsigned long vm_mmap(struct file *file, unsigned long addr, 316unsigned long vm_mmap(struct file *file, unsigned long addr,
311 unsigned long len, unsigned long prot, 317 unsigned long len, unsigned long prot,
312 unsigned long flag, unsigned long offset) 318 unsigned long flag, unsigned long offset)
@@ -316,7 +322,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
316 if (unlikely(offset_in_page(offset))) 322 if (unlikely(offset_in_page(offset)))
317 return -EINVAL; 323 return -EINVAL;
318 324
319 return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT); 325 return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT, false);
320} 326}
321EXPORT_SYMBOL(vm_mmap); 327EXPORT_SYMBOL(vm_mmap);
322 328