author     Yang Shi <yang.shi@linux.alibaba.com>            2018-10-26 18:08:50 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2018-10-26 19:26:35 -0400
commit     85a06835f6f1ba79f0f00838ccd5ad840dd1eafb (patch)
tree       91ec31587d1e465de5d5940a11f469695c7320a3 /mm/mremap.c
parent     3c0513243a4a07ebad2d59f3d972bef483818ec6 (diff)
mm: mremap: downgrade mmap_sem to read when shrinking
Besides munmap, mremap can also be used to shrink a memory mapping, so it
too may hold the write mmap_sem for a long time when shrinking a large
mapping, as commit ("mm: mmap: zap pages with read mmap_sem in munmap")
describes.
In the shrink case, mremap() does not manipulate VMAs anymore after the
__do_munmap() call, so it is safe to downgrade mmap_sem to read there.
The same optimization that downgrades mmap_sem to read while zapping
pages is therefore feasible and reasonable for this case as well.
With this optimization, the period for which the exclusive mmap_sem is
held while shrinking a large mapping is reduced significantly.
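
For illustration, here is a minimal sketch of the lock-downgrade pattern
this builds on. The function and the commented-out steps are hypothetical
stand-ins, not the actual __do_munmap() code; only the rwsem calls
(down_write(), downgrade_write(), up_read()) are the real kernel API:

#include <linux/mm_types.h>
#include <linux/rwsem.h>

/*
 * Illustrative sketch only: the VMA detach and page-zap steps are
 * left as comments; the real logic lives in __do_munmap().
 */
static int shrink_mapping_sketch(struct mm_struct *mm,
				 unsigned long start, size_t len)
{
	down_write(&mm->mmap_sem);
	/* ...detach the VMAs covering [start, start + len)... */

	/* Atomically turn the held write lock into a read lock. */
	downgrade_write(&mm->mmap_sem);

	/* ...zap the pages: the long-running part, now under read... */
	up_read(&mm->mmap_sem);
	return 1;	/* mirrors __do_munmap(): 1 means "downgraded" */
}

The key point is that downgrade_write() converts the lock without ever
dropping it, so no other writer can slip in between the VMA updates and
the page zapping.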
MREMAP_FIXED and MREMAP_MAYMOVE are more complicated to adapt to this
optimization since they need to manipulate VMAs after do_munmap();
downgrading mmap_sem there could open a race window.
Simple mapping shrink is the low-hanging fruit, and together with the
munmap optimization it should cover most unmap cases.
[akpm@linux-foundation.org: tweak comment]
[yang.shi@linux.alibaba.com: fix unsigned compare against 0 issue]
Link: http://lkml.kernel.org/r/1538687672-17795-2-git-send-email-yang.shi@linux.alibaba.com
Link: http://lkml.kernel.org/r/1538067582-60038-1-git-send-email-yang.shi@linux.alibaba.com
Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Laurent Dufour <ldufour@linux.vnet.ibm.com>
Cc: Colin Ian King <colin.king@canonical.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/mremap.c')
-rw-r--r--  mm/mremap.c  20
1 file changed, 16 insertions(+), 4 deletions(-)
diff --git a/mm/mremap.c b/mm/mremap.c
index a9617e72e6b7..7f9f9180e401 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -521,6 +521,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	unsigned long ret = -EINVAL;
 	unsigned long charged = 0;
 	bool locked = false;
+	bool downgraded = false;
 	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
 	LIST_HEAD(uf_unmap_early);
 	LIST_HEAD(uf_unmap);
@@ -557,12 +558,20 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	/*
 	 * Always allow a shrinking remap: that just unmaps
 	 * the unnecessary pages..
-	 * do_munmap does all the needed commit accounting
+	 * __do_munmap does all the needed commit accounting, and
+	 * downgrades mmap_sem to read if so directed.
 	 */
 	if (old_len >= new_len) {
-		ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
-		if (ret && old_len != new_len)
+		int retval;
+
+		retval = __do_munmap(mm, addr+new_len, old_len - new_len,
+				  &uf_unmap, true);
+		if (retval < 0 && old_len != new_len) {
+			ret = retval;
 			goto out;
+		/* Returning 1 indicates mmap_sem is downgraded to read. */
+		} else if (retval == 1)
+			downgraded = true;
 		ret = addr;
 		goto out;
 	}
@@ -627,7 +636,10 @@ out:
 		vm_unacct_memory(charged);
 		locked = 0;
 	}
-	up_write(&current->mm->mmap_sem);
+	if (downgraded)
+		up_read(&current->mm->mmap_sem);
+	else
+		up_write(&current->mm->mmap_sem);
 	if (locked && new_len > old_len)
 		mm_populate(new_addr + old_len, new_len - old_len);
 	userfaultfd_unmap_complete(mm, &uf_unmap_early);
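
For context, here is a small userspace program (my illustration, not part
of the patch) that exercises the fast path above: a plain in-place shrink
with no MREMAP_MAYMOVE or MREMAP_FIXED. The sizes are arbitrary; any
mremap() call with new_len < old_len and flags == 0 takes this route:

#define _GNU_SOURCE		/* for mremap() */
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 1UL << 30;	/* 1 GB anonymous mapping */
	size_t new_len = 4096;		/* shrink it down to one page */
	void *p;

	p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/*
	 * old_len > new_len and flags == 0, so the kernel only unmaps
	 * the tail; with this patch the zap runs under the read mmap_sem.
	 */
	if (mremap(p, old_len, new_len, 0) == MAP_FAILED) {
		perror("mremap");
		return 1;
	}

	return munmap(p, new_len);
}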