author     Eric B Munson <emunson@akamai.com>              2015-11-05 21:51:36 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2015-11-05 22:34:48 -0500
commit     de60f5f10c58d4f34b68622442c0e04180367f3f (patch)
tree       be0a15ded3c2177e26e69c316bf730d393e6357a
parent     a8ca5d0ecbdde5cc3d7accacbd69968b0c98764e (diff)
mm: introduce VM_LOCKONFAULT
The cost of faulting in all memory to be locked can be very high when working with large mappings.  If only portions of the mapping will be used, this incurs a high penalty for locking.

For the large-file example, this is the usage pattern for a large statistical language model (and probably applies to other statistical or graphical models as well).  For the security example, consider any application transacting in data that cannot be swapped out (credit card data, medical records, etc.).

This patch introduces the ability to request that pages are not pre-faulted, but are placed on the unevictable LRU when they are finally faulted in.  The VM_LOCKONFAULT flag is used together with VM_LOCKED and has no effect when set without VM_LOCKED.  Setting VM_LOCKONFAULT on a VMA causes pages faulted into that VMA to be added to the unevictable LRU when they are faulted in, or immediately if they are already present, but it does not cause any missing pages to be faulted in.

Exposing this new lock state means that we can no longer overload the meaning of the FOLL_POPULATE flag.  Prior to this patch it was used to mean that the VMA for a fault was locked, so we need the new FOLL_MLOCK flag to communicate the locked state of a VMA.  FOLL_POPULATE now only controls whether the VMA should be populated, and in the VM_LOCKONFAULT case it is not set.

Signed-off-by: Eric B Munson <emunson@akamai.com>
Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guenter Roeck <linux@roeck-us.net>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
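[Editorial sketch, not part of this patch] To make the intended semantics concrete, below is a minimal userspace example of lock-on-fault usage.  It assumes the mlock2() system call and MLOCK_ONFAULT flag that expose VM_LOCKONFAULT to userspace (introduced elsewhere in this patch series), a glibc new enough to provide the mlock2() wrapper (2.27 or later), and _GNU_SOURCE for the declarations.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 64UL << 20;	/* large buffer, e.g. a model or record store */
	void *buf;

	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return EXIT_FAILURE;
	}

	/*
	 * Request lock-on-fault: nothing is pre-faulted here, but any page
	 * touched later is added to the unevictable LRU and will not be
	 * swapped out.
	 */
	if (mlock2(buf, len, MLOCK_ONFAULT)) {
		perror("mlock2(MLOCK_ONFAULT)");
		return EXIT_FAILURE;
	}

	/* Only the pages actually touched are faulted in and locked. */
	memset(buf, 0, 4096);

	munlock(buf, len);
	munmap(buf, len);
	return EXIT_SUCCESS;
}

An mlockall(MCL_ONFAULT) variant with the same on-fault semantics for whole-address-space locking belongs to the same series; RLIMIT_MEMLOCK accounting applies in either case.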
-rw-r--r--  include/linux/mm.h  |  5
-rw-r--r--  kernel/fork.c       |  3
-rw-r--r--  mm/debug.c          |  1
-rw-r--r--  mm/gup.c            | 10
-rw-r--r--  mm/huge_memory.c    |  2
-rw-r--r--  mm/hugetlb.c        |  4
-rw-r--r--  mm/mlock.c          |  2
-rw-r--r--  mm/mmap.c           |  2
8 files changed, 21 insertions, 8 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3c258f8eb9ae..906c46a05707 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -139,6 +139,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #define VM_DONTCOPY	0x00020000	/* Do not copy this vma on fork */
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
+#define VM_LOCKONFAULT	0x00080000	/* Lock the pages covered when they are faulted in */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
@@ -202,6 +203,9 @@ extern unsigned int kobjsize(const void *objp);
 /* This mask defines which mm->def_flags a process can inherit its parent */
 #define VM_INIT_DEF_MASK	VM_NOHUGEPAGE
 
+/* This mask is used to clear all the VMA flags used by mlock */
+#define VM_LOCKED_CLEAR_MASK	(~(VM_LOCKED | VM_LOCKONFAULT))
+
 /*
  * mapping from the currently active vm_flags protection bits (the
  * low four bits) to a page protection mask..
@@ -2137,6 +2141,7 @@ static inline struct page *follow_page(struct vm_area_struct *vma,
 #define FOLL_NUMA	0x200	/* force NUMA hinting page fault */
 #define FOLL_MIGRATION	0x400	/* wait for page to replace migration entry */
 #define FOLL_TRIED	0x800	/* a retry, previous pass started an IO */
+#define FOLL_MLOCK	0x1000	/* lock present pages */
 
 typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr,
 			void *data);
diff --git a/kernel/fork.c b/kernel/fork.c
index 6ac894244d39..a30fae45b486 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,7 +454,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		tmp->vm_mm = mm;
 		if (anon_vma_fork(tmp, mpnt))
 			goto fail_nomem_anon_vma_fork;
-		tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
+		tmp->vm_flags &=
+			~(VM_LOCKED|VM_LOCKONFAULT|VM_UFFD_MISSING|VM_UFFD_WP);
 		tmp->vm_next = tmp->vm_prev = NULL;
 		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
 		file = tmp->vm_file;
diff --git a/mm/debug.c b/mm/debug.c
index 6c1b3ea61bfd..e784110fb51d 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -125,6 +125,7 @@ static const struct trace_print_flags vmaflags_names[] = {
 	{VM_GROWSDOWN,			"growsdown"	},
 	{VM_PFNMAP,			"pfnmap"	},
 	{VM_DENYWRITE,			"denywrite"	},
+	{VM_LOCKONFAULT,		"lockonfault"	},
 	{VM_LOCKED,			"locked"	},
 	{VM_IO,				"io"		},
 	{VM_SEQ_READ,			"seqread"	},
diff --git a/mm/gup.c b/mm/gup.c
index a798293fc648..deafa2c91b36 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -129,7 +129,7 @@ retry:
 		 */
 		mark_page_accessed(page);
 	}
-	if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		/*
 		 * The preliminary mapping check is mainly to avoid the
 		 * pointless overhead of lock_page on the ZERO_PAGE
@@ -299,6 +299,9 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
 	unsigned int fault_flags = 0;
 	int ret;
 
+	/* mlock all present pages, but do not fault in new pages */
+	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
+		return -ENOENT;
 	/* For mm_populate(), just skip the stack guard page. */
 	if ((*flags & FOLL_POPULATE) &&
 			(stack_guard_page_start(vma, address) ||
@@ -890,7 +893,10 @@ long populate_vma_page_range(struct vm_area_struct *vma,
 	VM_BUG_ON_VMA(end   > vma->vm_end, vma);
 	VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
 
-	gup_flags = FOLL_TOUCH | FOLL_POPULATE;
+	gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
+	if (vma->vm_flags & VM_LOCKONFAULT)
+		gup_flags &= ~FOLL_POPULATE;
+
 	/*
 	 * We want to touch writable mappings with a write fault in order
 	 * to break COW, except for shared mappings because these don't COW
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 3fd0311c3ba7..f5c08b46fef8 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1307,7 +1307,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 					    pmd, _pmd,  1))
 			update_mmu_cache_pmd(vma, addr, pmd);
 	}
-	if ((flags & FOLL_POPULATE) && (vma->vm_flags & VM_LOCKED)) {
+	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
 		if (page->mapping && trylock_page(page)) {
 			lru_add_drain();
 			if (page->mapping)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 241de2712b36..74ef0c6a25dd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4137,8 +4137,8 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
 	unsigned long s_end = sbase + PUD_SIZE;
 
 	/* Allow segments to share if only one is marked locked */
-	unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED;
-	unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED;
+	unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+	unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
 
 	/*
 	 * match the virtual addresses, permission and the alignment of the
diff --git a/mm/mlock.c b/mm/mlock.c
index 35dcf8fa7195..ca3894113b97 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -422,7 +422,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
 void munlock_vma_pages_range(struct vm_area_struct *vma,
 			     unsigned long start, unsigned long end)
 {
-	vma->vm_flags &= ~VM_LOCKED;
+	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
 
 	while (start < end) {
 		struct page *page = NULL;
diff --git a/mm/mmap.c b/mm/mmap.c
index 220effde8ea3..2ce04a649f6b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1661,7 +1661,7 @@ out:
 					vma == get_gate_vma(current->mm)))
 			mm->locked_vm += (len >> PAGE_SHIFT);
 		else
-			vma->vm_flags &= ~VM_LOCKED;
+			vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
 	}
 
 	if (file)