path: root/mm/mlock.c
Diffstat (limited to 'mm/mlock.c')
-rw-r--r--	mm/mlock.c	439
1 file changed, 421 insertions(+), 18 deletions(-)
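Before the patch itself, a note on the userspace contract that the new __mlock_posix_error_return() helper below preserves: mlock()/munlock() fault pages in and pin them in RAM, and failures surface as EAGAIN or ENOMEM per POSIX. A minimal userspace sketch for reference (plain <sys/mman.h>, not part of the patch):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/mman.h>

	int main(void)
	{
		size_t len = 16 * 4096;
		void *buf = malloc(len);

		if (!buf)
			return 1;

		/* Fault the pages in and pin them (they become unevictable). */
		if (mlock(buf, len) != 0) {
			perror("mlock");	/* EAGAIN or ENOMEM per POSIX */
			free(buf);
			return 1;
		}

		memset(buf, 0, len);	/* pages are resident, so no major faults */

		/* Unpin; the pages go back onto the normal LRU lists. */
		if (munlock(buf, len) != 0)
			perror("munlock");

		free(buf);
		return 0;
	}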
diff --git a/mm/mlock.c b/mm/mlock.c
index 01fbe93eff5c..a6da2aee940a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@
 #include <linux/capability.h>
 #include <linux/mman.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
 #include <linux/module.h>
+#include <linux/rmap.h>
+#include <linux/mmzone.h>
+#include <linux/hugetlb.h>
+
+#include "internal.h"
 
 int can_do_mlock(void)
 {
@@ -23,17 +31,373 @@ int can_do_mlock(void)
 }
 EXPORT_SYMBOL(can_do_mlock);
 
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * Mlocked pages are marked with the PageMlocked() flag for efficient testing
+ * in vmscan and, possibly, the fault path; and to support semi-accurate
+ * statistics.
+ *
+ * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
+ * be placed on the LRU "unevictable" list, rather than the [in]active lists.
+ * The unevictable list is an LRU sibling list to the [in]active lists.
+ * PageUnevictable is set to indicate the unevictable state.
+ *
+ * When lazy mlocking via vmscan, it is important to ensure that the
+ * vma's VM_LOCKED status is not concurrently being modified, otherwise we
+ * may have mlocked a page that is being munlocked. So lazy mlock must take
+ * the mmap_sem for read, and verify that the vma really is locked
+ * (see mm/rmap.c).
+ */
+
+/*
+ * LRU accounting for clear_page_mlock()
+ */
+void __clear_page_mlock(struct page *page)
+{
+	VM_BUG_ON(!PageLocked(page));
+
+	if (!page->mapping) {	/* truncated ? */
+		return;
+	}
+
+	dec_zone_page_state(page, NR_MLOCK);
+	count_vm_event(UNEVICTABLE_PGCLEARED);
+	if (!isolate_lru_page(page)) {
+		putback_lru_page(page);
+	} else {
+		/*
+		 * We lost the race. The page already moved to the evictable list.
+		 */
+		if (PageUnevictable(page))
+			count_vm_event(UNEVICTABLE_PGSTRANDED);
+	}
+}
+
+/*
+ * Mark page as mlocked if not already.
+ * If page on LRU, isolate and putback to move to unevictable list.
+ */
+void mlock_vma_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!TestSetPageMlocked(page)) {
+		inc_zone_page_state(page, NR_MLOCK);
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+		if (!isolate_lru_page(page))
+			putback_lru_page(page);
+	}
+}
+
+/*
+ * called from munlock()/munmap() path with page supposedly on the LRU.
+ *
+ * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
+ * [in try_to_munlock()] and then attempt to isolate the page. We must
+ * isolate the page to keep others from messing with its unevictable
+ * and mlocked state while trying to munlock. However, we pre-clear the
+ * mlocked state anyway as we might lose the isolation race and we might
+ * not get another chance to clear PageMlocked. If we successfully
+ * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
+ * mapping the page, it will restore the PageMlocked state, unless the page
+ * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
+ * perhaps redundantly.
+ * If we lose the isolation race, and the page is mapped by other VM_LOCKED
+ * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap(),
+ * either of which will restore the PageMlocked state by calling
+ * mlock_vma_page() above, if it can grab the vma's mmap sem.
+ */
+static void munlock_vma_page(struct page *page)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (TestClearPageMlocked(page)) {
+		dec_zone_page_state(page, NR_MLOCK);
+		if (!isolate_lru_page(page)) {
+			int ret = try_to_munlock(page);
+			/*
+			 * did try_to_munlock() succeed or punt?
+			 */
+			if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
+				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+			putback_lru_page(page);
+		} else {
+			/*
+			 * We lost the race. Let try_to_unmap() deal
+			 * with it. At least we get the page state and
+			 * mlock stats right. However, the page is still on
+			 * the unevictable list. We'll fix that up when
+			 * the page is eventually freed or we scan the
+			 * unevictable list.
+			 */
+			if (PageUnevictable(page))
+				count_vm_event(UNEVICTABLE_PGSTRANDED);
+			else
+				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+		}
+	}
+}
+
+/**
+ * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
+ * @vma:   target vma
+ * @start: start address
+ * @end:   end address
+ * @mlock: 0 indicates munlock, otherwise mlock.
+ *
+ * If @mlock == 0, unlock an mlocked range;
+ * else mlock the range of pages. This takes care of making the pages present,
+ * too.
+ *
+ * return 0 on success, negative error code on error.
+ *
+ * vma->vm_mm->mmap_sem must be held for at least read.
+ */
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end,
+			int mlock)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = start;
+	struct page *pages[16]; /* 16 gives a reasonable batch */
+	int nr_pages = (end - start) / PAGE_SIZE;
+	int ret;
+	int gup_flags = 0;
+
+	VM_BUG_ON(start & ~PAGE_MASK);
+	VM_BUG_ON(end & ~PAGE_MASK);
+	VM_BUG_ON(start < vma->vm_start);
+	VM_BUG_ON(end > vma->vm_end);
+	VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
+		  (atomic_read(&mm->mm_users) != 0));
+
+	/*
+	 * mlock:   don't populate pages if the vma has PROT_NONE permission.
+	 * munlock: always munlock the pages, even when the vma has
+	 *          PROT_NONE permission.
+	 */
+	if (!mlock)
+		gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
+
+	if (vma->vm_flags & VM_WRITE)
+		gup_flags |= GUP_FLAGS_WRITE;
+
+	while (nr_pages > 0) {
+		int i;
+
+		cond_resched();
+
+		/*
+		 * get_user_pages() makes pages present if we are
+		 * mlocking, and the extra reference count will
+		 * disable migration of the page. However, the page may
+		 * still be truncated out from under us.
+		 */
+		ret = __get_user_pages(current, mm, addr,
+				min_t(int, nr_pages, ARRAY_SIZE(pages)),
+				gup_flags, pages, NULL);
+		/*
+		 * This can happen for, e.g., VM_NONLINEAR regions before
+		 * a page has been allocated and mapped at a given offset,
+		 * or for addresses that map beyond end of a file.
+		 * We'll mlock the pages if/when they get faulted in.
+		 */
+		if (ret < 0)
+			break;
+		if (ret == 0) {
+			/*
+			 * We know the vma is there, so the only time
+			 * we cannot get a single page should be an
+			 * error (ret < 0) case.
+			 */
+			WARN_ON(1);
+			break;
+		}
+
+		lru_add_drain();	/* push cached pages to LRU */
+
+		for (i = 0; i < ret; i++) {
+			struct page *page = pages[i];
+
+			lock_page(page);
+			/*
+			 * Because we lock page here and migration is blocked
+			 * by the elevated reference, we need only check for
+			 * page truncation (file-cache only).
+			 */
+			if (page->mapping) {
+				if (mlock)
+					mlock_vma_page(page);
+				else
+					munlock_vma_page(page);
+			}
+			unlock_page(page);
+			put_page(page);		/* ref from get_user_pages() */
+
+			/*
+			 * Here we assume that get_user_pages() has given us
+			 * a list of virtually contiguous pages.
+			 */
+			addr += PAGE_SIZE;	/* for next get_user_pages() */
+			nr_pages--;
+		}
+		ret = 0;
+	}
+
+	return ret;	/* count entire vma as locked_vm */
+}
+
+/*
+ * convert get_user_pages() return value to posix mlock() error
+ */
+static int __mlock_posix_error_return(long retval)
+{
+	if (retval == -EFAULT)
+		retval = -ENOMEM;
+	else if (retval == -ENOMEM)
+		retval = -EAGAIN;
+	return retval;
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+
+/*
+ * Just make pages present if VM_LOCKED. No-op if unlocking.
+ */
+static long __mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end,
+			int mlock)
+{
+	if (mlock && (vma->vm_flags & VM_LOCKED))
+		return make_pages_present(start, end);
+	return 0;
+}
+
+static inline int __mlock_posix_error_return(long retval)
+{
+	return 0;
+}
+
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+/**
+ * mlock_vma_pages_range() - mlock pages in specified vma range.
+ * @vma:   the vma containing the specified address range
+ * @start: starting address in @vma to mlock
+ * @end:   end address [+1] in @vma to mlock
+ *
+ * For mmap()/mremap()/expansion of mlocked vma.
+ *
+ * return 0 on success for "normal" vmas.
+ *
+ * return number of pages [> 0] to be removed from locked_vm on success
+ * of "special" vmas.
+ *
+ * return negative error if the vma spanning @start-@end disappears while
+ * the mmap semaphore is dropped. Unlikely?
+ */
+long mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	int nr_pages = (end - start) / PAGE_SIZE;
+	BUG_ON(!(vma->vm_flags & VM_LOCKED));
+
+	/*
+	 * filter unlockable vmas
+	 */
+	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+		goto no_mlock;
+
+	if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+			is_vm_hugetlb_page(vma) ||
+			vma == get_gate_vma(current))) {
+		long error;
+		downgrade_write(&mm->mmap_sem);
+
+		error = __mlock_vma_pages_range(vma, start, end, 1);
+
+		up_read(&mm->mmap_sem);
+		/* vma can change or disappear */
+		down_write(&mm->mmap_sem);
+		vma = find_vma(mm, start);
+		/* non-NULL vma must contain @start, but need to check @end */
+		if (!vma || end > vma->vm_end)
+			return -ENOMEM;
+
+		return 0;	/* hide other errors from mmap(), et al */
+	}
+
+	/*
+	 * User mapped kernel pages or huge pages:
+	 * make these pages present to populate the ptes, but
+	 * fall through to reset VM_LOCKED--no need to unlock, and
+	 * return nr_pages so these don't get counted against task's
+	 * locked limit. huge pages are already counted against
+	 * locked vm limit.
+	 */
+	make_pages_present(start, end);
+
+no_mlock:
+	vma->vm_flags &= ~VM_LOCKED;	/* and don't come back! */
+	return nr_pages;		/* error or pages NOT mlocked */
+}
+
+
+/*
+ * munlock_vma_pages_range() - munlock all pages in the vma range.
+ * @vma:   vma containing range to be munlock()ed.
+ * @start: start address in @vma of the range
+ * @end:   end of range in @vma.
+ *
+ * For mremap(), munmap() and exit().
+ *
+ * Called with @vma VM_LOCKED.
+ *
+ * Returns with VM_LOCKED cleared. Callers must be prepared to
+ * deal with this.
+ *
+ * We don't save and restore VM_LOCKED here because pages are
+ * still on lru. In unmap path, pages might be scanned by reclaim
+ * and re-mlocked by try_to_{munlock|unmap} before we unmap and
+ * free them. This will result in freeing mlocked pages.
+ */
+void munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	vma->vm_flags &= ~VM_LOCKED;
+	__mlock_vma_pages_range(vma, start, end, 0);
+}
+
+/*
+ * mlock_fixup - handle mlock[all]/munlock[all] requests.
+ *
+ * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
+ * munlock is a no-op. However, for some special vmas, we go ahead and
+ * populate the ptes via make_pages_present().
+ *
+ * For vmas that pass the filters, merge/split as appropriate.
+ */
 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	unsigned long start, unsigned long end, unsigned int newflags)
 {
-	struct mm_struct * mm = vma->vm_mm;
+	struct mm_struct *mm = vma->vm_mm;
 	pgoff_t pgoff;
-	int pages;
+	int nr_pages;
 	int ret = 0;
+	int lock = newflags & VM_LOCKED;
 
-	if (newflags == vma->vm_flags) {
-		*prev = vma;
-		goto out;
+	if (newflags == vma->vm_flags ||
+			(vma->vm_flags & (VM_IO | VM_PFNMAP)))
+		goto out;	/* don't set VM_LOCKED, don't count */
+
+	if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
+			is_vm_hugetlb_page(vma) ||
+			vma == get_gate_vma(current)) {
+		if (lock)
+			make_pages_present(start, end);
+		goto out;	/* don't set VM_LOCKED, don't count */
 	}
 
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +408,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 		goto success;
 	}
 
-	*prev = vma;
-
 	if (start != vma->vm_start) {
 		ret = split_vma(mm, vma, start, 1);
 		if (ret)
@@ -60,24 +422,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 
 success:
 	/*
+	 * Keep track of amount of locked VM.
+	 */
+	nr_pages = (end - start) >> PAGE_SHIFT;
+	if (!lock)
+		nr_pages = -nr_pages;
+	mm->locked_vm += nr_pages;
+
+	/*
 	 * vm_flags is protected by the mmap_sem held in write mode.
 	 * It's okay if try_to_unmap_one unmaps a page just after we
-	 * set VM_LOCKED, make_pages_present below will bring it back.
+	 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
 	 */
 	vma->vm_flags = newflags;
 
-	/*
-	 * Keep track of amount of locked VM.
-	 */
-	pages = (end - start) >> PAGE_SHIFT;
-	if (newflags & VM_LOCKED) {
-		pages = -pages;
-		if (!(newflags & VM_IO))
-			ret = make_pages_present(start, end);
+	if (lock) {
+		/*
+		 * mmap_sem is currently held for write. Downgrade the write
+		 * lock to a read lock so that other faults, mmap scans, ...
+		 * can proceed while we fault in all pages.
+		 */
+		downgrade_write(&mm->mmap_sem);
+
+		ret = __mlock_vma_pages_range(vma, start, end, 1);
+
+		/*
+		 * Need to reacquire mmap sem in write mode, as our callers
+		 * expect this. We have no support for atomically upgrading
+		 * a sem to write, so we need to check for ranges while sem
+		 * is unlocked.
+		 */
+		up_read(&mm->mmap_sem);
+		/* vma can change or disappear */
+		down_write(&mm->mmap_sem);
+		*prev = find_vma(mm, start);
+		/* non-NULL *prev must contain @start, but need to check @end */
+		if (!(*prev) || end > (*prev)->vm_end)
+			ret = -ENOMEM;
+		else if (ret > 0) {
+			mm->locked_vm -= ret;
+			ret = 0;
+		} else
+			ret = __mlock_posix_error_return(ret); /* translate if needed */
+	} else {
+		/*
+		 * TODO: for unlocking, pages will already be resident, so
+		 * we don't need to wait for allocations/reclaim/pagein, ...
+		 * However, unlocking a very large region can still take a
+		 * while. Should we downgrade the semaphore for both lock
+		 * AND unlock?
+		 */
+		__mlock_vma_pages_range(vma, start, end, 0);
 	}
 
-	mm->locked_vm -= pages;
 out:
+	*prev = vma;
 	return ret;
 }
 
@@ -139,6 +538,8 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
 	if (!can_do_mlock())
 		return -EPERM;
 
+	lru_add_drain_all();	/* flush pagevec */
+
 	down_write(&current->mm->mmap_sem);
 	len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
 	start &= PAGE_MASK;
@@ -205,6 +606,8 @@ asmlinkage long sys_mlockall(int flags)
 	if (!can_do_mlock())
 		goto out;
 
+	lru_add_drain_all();	/* flush pagevec */
+
 	down_write(&current->mm->mmap_sem);
 
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;