From b291f000393f5a0b679012b39d79fbc85c018233 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 18 Oct 2008 20:26:44 -0700 Subject: mlock: mlocked pages are unevictable Make sure that mlocked pages also live on the unevictable LRU, so kswapd will not scan them over and over again. This is achieved through various strategies: 1) add yet another page flag--PG_mlocked--to indicate that the page is locked for efficient testing in vmscan and, optionally, fault path. This allows early culling of unevictable pages, preventing them from getting to page_referenced()/try_to_unmap(). Also allows separate accounting of mlock'd pages, as Nick's original patch did. Note: Nick's original mlock patch used a PG_mlocked flag. I had removed this in favor of the PG_unevictable flag + an mlock_count [new page struct member]. I restored the PG_mlocked flag to eliminate the new count field. 2) add the mlock/unevictable infrastructure to mm/mlock.c, with internal APIs in mm/internal.h. This is a rework of Nick's original patch to these files, taking into account that mlocked pages are now kept on unevictable LRU list. 3) update vmscan.c:page_evictable() to check PageMlocked() and, if vma passed in, the vm_flags. Note that the vma will only be passed in for new pages in the fault path; and then only if the "cull unevictable pages in fault path" patch is included. 4) add try_to_unlock() to rmap.c to walk a page's rmap and ClearPageMlocked() if no other vmas have it mlocked. Reuses as much of try_to_unmap() as possible. This effectively replaces the use of one of the lru list links as an mlock count. If this mechanism let's pages in mlocked vmas leak through w/o PG_mlocked set [I don't know that it does], we should catch them later in try_to_unmap(). One hopes this will be rare, as it will be relatively expensive. Original mm/internal.h, mm/rmap.c and mm/mlock.c changes: Signed-off-by: Nick Piggin splitlru: introduce __get_user_pages(): New munlock processing need to GUP_FLAGS_IGNORE_VMA_PERMISSIONS. because current get_user_pages() can't grab PROT_NONE pages theresore it cause PROT_NONE pages can't munlock. [akpm@linux-foundation.org: fix this for pagemap-pass-mm-into-pagewalkers.patch] [akpm@linux-foundation.org: untangle patch interdependencies] [akpm@linux-foundation.org: fix things after out-of-order merging] [hugh@veritas.com: fix page-flags mess] [lee.schermerhorn@hp.com: fix munlock page table walk - now requires 'mm'] [kosaki.motohiro@jp.fujitsu.com: build fix] [kosaki.motohiro@jp.fujitsu.com: fix truncate race and sevaral comments] [kosaki.motohiro@jp.fujitsu.com: splitlru: introduce __get_user_pages()] Signed-off-by: KOSAKI Motohiro Signed-off-by: Rik van Riel Signed-off-by: Lee Schermerhorn Cc: Nick Piggin Cc: Dave Hansen Cc: Matt Mackall Signed-off-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index e7a5a68a9c2e..7bdfd2661f17 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -662,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) - static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { -- cgit v1.2.2 From ba470de43188cdbff795b5da43a1474523c6c2fb Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Sat, 18 Oct 2008 20:26:50 -0700 Subject: mmap: handle mlocked pages during map, remap, unmap Originally by Nick Piggin Remove mlocked pages from the LRU using "unevictable infrastructure" during mmap(), munmap(), mremap() and truncate(). Try to move back to normal LRU lists on munmap() when last mlocked mapping removed. Remove PageMlocked() status when page truncated from file. [akpm@linux-foundation.org: cleanup] [kamezawa.hiroyu@jp.fujitsu.com: fix double unlock_page()] [kosaki.motohiro@jp.fujitsu.com: split LRU: munlock rework] [lee.schermerhorn@hp.com: mlock: fix __mlock_vma_pages_range comment block] [akpm@linux-foundation.org: remove bogus kerneldoc token] Signed-off-by: Nick Piggin Signed-off-by: Lee Schermerhorn Signed-off-by: Rik van Riel Signed-off-by: KOSAKI Motohiro Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 52 insertions(+), 19 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 7bdfd2661f17..505a454f365e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -970,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, return -EPERM; vm_flags |= VM_LOCKED; } + /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; @@ -1137,10 +1138,12 @@ munmap_back: * The VM_SHARED test is necessary because shmem_zero_setup * will create the file object for a shared anonymous map below. */ - if (!file && !(vm_flags & VM_SHARED) && - vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, NULL, pgoff, NULL)) - goto out; + if (!file && !(vm_flags & VM_SHARED)) { + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, NULL, pgoff, NULL); + if (vma) + goto out; + } /* * Determine the object being mapped and call the appropriate @@ -1222,10 +1225,14 @@ out: mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); - } - if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) + /* + * makes pages present; downgrades, drops, reacquires mmap_sem + */ + long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); + if (nr_pages < 0) + return nr_pages; /* vma gone! */ + mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; + } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); return addr; @@ -1698,8 +1705,10 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!prev || expand_stack(prev, addr)) return NULL; - if (prev->vm_flags & VM_LOCKED) - make_pages_present(addr, prev->vm_end); + if (prev->vm_flags & VM_LOCKED) { + if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) + return NULL; /* vma gone! */ + } return prev; } #else @@ -1725,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) - make_pages_present(addr, start); + if (vma->vm_flags & VM_LOCKED) { + if (mlock_vma_pages_range(vma, addr, start) < 0) + return NULL; /* vma gone! */ + } return vma; } #endif @@ -1745,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) long nrpages = vma_pages(vma); mm->total_vm -= nrpages; - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); } while (vma); @@ -1911,6 +1920,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) } vma = prev? prev->vm_next: mm->mmap; + /* + * unlock any mlock()ed ranges before detaching vmas + */ + if (mm->locked_vm) { + struct vm_area_struct *tmp = vma; + while (tmp && tmp->vm_start < end) { + if (tmp->vm_flags & VM_LOCKED) { + mm->locked_vm -= vma_pages(tmp); + munlock_vma_pages_all(tmp); + } + tmp = tmp->vm_next; + } + } + /* * Remove the vma's, and unmap the actual pages */ @@ -2023,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len) return -ENOMEM; /* Can we just expand an old private anonymous mapping? */ - if (vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL)) + vma = vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL); + if (vma) goto out; /* @@ -2046,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) out: mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); + if (!mlock_vma_pages_range(vma, addr, addr + len)) + mm->locked_vm += (len >> PAGE_SHIFT); } return addr; } @@ -2058,7 +2082,7 @@ EXPORT_SYMBOL(do_brk); void exit_mmap(struct mm_struct *mm) { struct mmu_gather *tlb; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; unsigned long nr_accounted = 0; unsigned long end; @@ -2066,6 +2090,15 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); mmu_notifier_release(mm); + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { + if (vma->vm_flags & VM_LOCKED) + munlock_vma_pages_all(vma); + vma = vma->vm_next; + } + } + vma = mm->mmap; lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); -- cgit v1.2.2 From cb8f488c33539f096580e202f5438a809195008f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 18 Oct 2008 20:27:01 -0700 Subject: mmap.c: deinline a few functions __vma_link_file and expand_downwards functions are not small, yeat they are marked inline. They probably had one callsite sometime in the past, but now they have more. In order to prevent similar thing, I also deinlined expand_upwards, despite it having only pne callsite. Nowadays gcc auto-inlines such static functions anyway. In find_extend_vma, I removed one extra level of indirection. Patch is deliberately generated with -U $BIGNUM to make it easier to see that functions are big. Result: # size */*/mmap.o */vmlinux text data bss dec hex filename 9514 188 16 9718 25f6 0.org/mm/mmap.o 9237 188 16 9441 24e1 deinline/mm/mmap.o 6124402 858996 389480 7372878 70804e 0.org/vmlinux 6124113 858996 389480 7372589 707f2d deinline/vmlinux Signed-off-by: Denys Vlasenko Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm/mmap.c') diff --git a/mm/mmap.c b/mm/mmap.c index 505a454f365e..74f4d158022e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -410,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, rb_insert_color(&vma->vm_rb, &mm->mm_rb); } -static inline void __vma_link_file(struct vm_area_struct *vma) +static void __vma_link_file(struct vm_area_struct *vma) { struct file * file; @@ -1591,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un * vma is the last one with address > vma->vm_end. Have to extend vma. */ #ifndef CONFIG_IA64 -static inline +static #endif int expand_upwards(struct vm_area_struct *vma, unsigned long address) { @@ -1641,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -static inline int expand_downwards(struct vm_area_struct *vma, +static int expand_downwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1703,7 +1703,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + if (expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) { if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) -- cgit v1.2.2