Diffstat (limited to 'mm')
-rw-r--r--  mm/hugetlb.c        |   1
-rw-r--r--  mm/memory-failure.c |  13
-rw-r--r--  mm/memory.c         |   3
-rw-r--r--  mm/mempolicy.c      |   2
-rw-r--r--  mm/msync.c          |   3
-rw-r--r--  mm/page_alloc.c     |  16
-rw-r--r--  mm/rmap.c           |  10
-rw-r--r--  mm/shmem.c          | 117
-rw-r--r--  mm/slub.c           |   6
-rw-r--r--  mm/truncate.c       |  11
10 files changed, 117 insertions(+), 65 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 2024bbd573d2..9221c02ed9e2 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2604,6 +2604,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		} else {
 			if (cow)
 				huge_ptep_set_wrprotect(src, addr, src_pte);
+			entry = huge_ptep_get(src_pte);
 			ptepage = pte_page(entry);
 			get_page(ptepage);
 			page_dup_rmap(ptepage);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index cd8989c1027e..7211a73ba14d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -435,7 +435,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
 
-	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff = page_to_pgoff(page);
 	read_lock(&tasklist_lock);
 	for_each_process (tsk) {
 		struct anon_vma_chain *vmac;
@@ -469,7 +469,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
 	mutex_lock(&mapping->i_mmap_mutex);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
-		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+		pgoff_t pgoff = page_to_pgoff(page);
 		struct task_struct *t = task_early_kill(tsk, force_early);
 
 		if (!t)
@@ -895,7 +895,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	struct page *hpage = *hpagep;
 	struct page *ppage;
 
-	if (PageReserved(p) || PageSlab(p))
+	if (PageReserved(p) || PageSlab(p) || !PageLRU(p))
 		return SWAP_SUCCESS;
 
 	/*
@@ -1159,9 +1159,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 				action_result(pfn, "free buddy, 2nd try", DELAYED);
 				return 0;
 			}
-			action_result(pfn, "non LRU", IGNORED);
-			put_page(p);
-			return -EBUSY;
 		}
 	}
 
@@ -1194,6 +1191,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		return 0;
 	}
 
+	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
+		goto identify_page_state;
+
 	/*
 	 * For error on the tail page, we should set PG_hwpoison
 	 * on the head page to show that the hugepage is hwpoisoned
@@ -1243,6 +1243,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 		goto out;
 	}
 
+identify_page_state:
 	res = -EBUSY;
 	/*
 	 * The first check uses the current page flags which may not have any
diff --git a/mm/memory.c b/mm/memory.c
index d67fd9fcf1f2..7e8d8205b610 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2882,7 +2882,8 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * if page by the offset is not ready to be mapped (cold cache or
 	 * something).
 	 */
-	if (vma->vm_ops->map_pages && fault_around_pages() > 1) {
+	if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
+	    fault_around_pages() > 1) {
 		pte = pte_offset_map_lock(mm, pmd, address, &ptl);
 		do_fault_around(vma, address, pte, pgoff, flags);
 		if (!pte_same(*pte, orig_pte))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eb58de19f815..8f5330d74f47 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2139,7 +2139,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 	} else
 		*new = *old;
 
-	rcu_read_lock();
 	if (current_cpuset_is_being_rebound()) {
 		nodemask_t mems = cpuset_mems_allowed(current);
 		if (new->flags & MPOL_F_REBINDING)
@@ -2147,7 +2146,6 @@ struct mempolicy *__mpol_dup(struct mempolicy *old)
 		else
 			mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
 	}
-	rcu_read_unlock();
 	atomic_set(&new->refcnt, 1);
 	return new;
 }
diff --git a/mm/msync.c b/mm/msync.c
index a5c673669ca6..992a1673d488 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -78,7 +78,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 			goto out_unlock;
 		}
 		file = vma->vm_file;
-		fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+		fstart = (start - vma->vm_start) +
+			((loff_t)vma->vm_pgoff << PAGE_SHIFT);
 		fend = fstart + (min(end, vma->vm_end) - start) - 1;
 		start = vma->vm_end;
 		if ((flags & MS_SYNC) && file &&
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 20d17f8266fe..0ea758b898fd 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -816,9 +816,21 @@ void __init init_cma_reserved_pageblock(struct page *page)
 		set_page_count(p, 0);
 	} while (++p, --i);
 
-	set_page_refcounted(page);
 	set_pageblock_migratetype(page, MIGRATE_CMA);
-	__free_pages(page, pageblock_order);
+
+	if (pageblock_order >= MAX_ORDER) {
+		i = pageblock_nr_pages;
+		p = page;
+		do {
+			set_page_refcounted(p);
+			__free_pages(p, MAX_ORDER - 1);
+			p += MAX_ORDER_NR_PAGES;
+		} while (i -= MAX_ORDER_NR_PAGES);
+	} else {
+		set_page_refcounted(page);
+		__free_pages(page, pageblock_order);
+	}
+
 	adjust_managed_page_count(page, pageblock_nr_pages);
 }
 #endif
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -517,11 +517,7 @@ void page_unlock_anon_vma_read(struct anon_vma *anon_vma)
 static inline unsigned long
 __vma_address(struct page *page, struct vm_area_struct *vma)
 {
-	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
-
-	if (unlikely(is_vm_hugetlb_page(vma)))
-		pgoff = page->index << huge_page_order(page_hstate(page));
-
+	pgoff_t pgoff = page_to_pgoff(page);
 	return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 }
 
@@ -1639,7 +1635,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
 static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct anon_vma *anon_vma;
-	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff_t pgoff = page_to_pgoff(page);
 	struct anon_vma_chain *avc;
 	int ret = SWAP_AGAIN;
 
@@ -1680,7 +1676,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
 static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct address_space *mapping = page->mapping;
-	pgoff_t pgoff = page->index << compound_order(page);
+	pgoff_t pgoff = page_to_pgoff(page);
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
diff --git a/mm/shmem.c b/mm/shmem.c
index 8f419cff9e34..af68b15a8fc1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -85,7 +85,7 @@ static struct vfsmount *shm_mnt;
  * a time): we would prefer not to enlarge the shmem inode just for that.
  */
 struct shmem_falloc {
-	int	mode;		/* FALLOC_FL mode currently operating */
+	wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
 	pgoff_t start;		/* start of range currently being fallocated */
 	pgoff_t next;		/* the next page offset to be fallocated */
 	pgoff_t nr_falloced;	/* how many new pages have been fallocated */
@@ -468,23 +468,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		return;
 
 	index = start;
-	for ( ; ; ) {
+	while (index < end) {
 		cond_resched();
 
 		pvec.nr = find_get_entries(mapping, index,
 				min(end - index, (pgoff_t)PAGEVEC_SIZE),
 				pvec.pages, indices);
 		if (!pvec.nr) {
-			if (index == start || unfalloc)
+			/* If all gone or hole-punch or unfalloc, we're done */
+			if (index == start || end != -1)
 				break;
+			/* But if truncating, restart to make sure all gone */
 			index = start;
 			continue;
 		}
-		if ((index == start || unfalloc) && indices[0] >= end) {
-			pagevec_remove_exceptionals(&pvec);
-			pagevec_release(&pvec);
-			break;
-		}
 		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
@@ -496,8 +493,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			if (radix_tree_exceptional_entry(page)) {
 				if (unfalloc)
 					continue;
-				nr_swaps_freed += !shmem_free_swap(mapping,
-								index, page);
+				if (shmem_free_swap(mapping, index, page)) {
+					/* Swap was replaced by page: retry */
+					index--;
+					break;
+				}
+				nr_swaps_freed++;
 				continue;
 			}
 
@@ -506,6 +507,11 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			if (page->mapping == mapping) {
 				VM_BUG_ON_PAGE(PageWriteback(page), page);
 				truncate_inode_page(mapping, page);
+			} else {
+				/* Page was replaced by swap: retry */
+				unlock_page(page);
+				index--;
+				break;
 			}
 		}
 		unlock_page(page);
@@ -760,7 +766,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 		spin_lock(&inode->i_lock);
 		shmem_falloc = inode->i_private;
 		if (shmem_falloc &&
-		    !shmem_falloc->mode &&
+		    !shmem_falloc->waitq &&
 		    index >= shmem_falloc->start &&
 		    index < shmem_falloc->next)
 			shmem_falloc->nr_unswapped++;
@@ -1029,6 +1035,9 @@ repeat:
 			goto failed;
 	}
 
+	if (page && sgp == SGP_WRITE)
+		mark_page_accessed(page);
+
 	/* fallocated page? */
 	if (page && !PageUptodate(page)) {
 		if (sgp != SGP_READ)
@@ -1110,6 +1119,9 @@ repeat:
 		shmem_recalc_inode(inode);
 		spin_unlock(&info->lock);
 
+		if (sgp == SGP_WRITE)
+			mark_page_accessed(page);
+
 		delete_from_swap_cache(page);
 		set_page_dirty(page);
 		swap_free(swap);
@@ -1136,6 +1148,9 @@ repeat:
 
 		__SetPageSwapBacked(page);
 		__set_page_locked(page);
+		if (sgp == SGP_WRITE)
+			init_page_accessed(page);
+
 		error = mem_cgroup_charge_file(page, current->mm,
 						gfp & GFP_RECLAIM_MASK);
 		if (error)
@@ -1239,38 +1254,58 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Trinity finds that probing a hole which tmpfs is punching can
 	 * prevent the hole-punch from ever completing: which in turn
 	 * locks writers out with its hold on i_mutex.  So refrain from
-	 * faulting pages into the hole while it's being punched, and
-	 * wait on i_mutex to be released if vmf->flags permits.
+	 * faulting pages into the hole while it's being punched.  Although
+	 * shmem_undo_range() does remove the additions, it may be unable to
+	 * keep up, as each new page needs its own unmap_mapping_range() call,
+	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
+	 *
+	 * It does not matter if we sometimes reach this check just before the
+	 * hole-punch begins, so that one fault then races with the punch:
+	 * we just need to make racing faults a rare case.
+	 *
+	 * The implementation below would be much simpler if we just used a
+	 * standard mutex or completion: but we cannot take i_mutex in fault,
+	 * and bloating every shmem inode for this unlikely case would be sad.
 	 */
 	if (unlikely(inode->i_private)) {
 		struct shmem_falloc *shmem_falloc;
 
 		spin_lock(&inode->i_lock);
 		shmem_falloc = inode->i_private;
-		if (!shmem_falloc ||
-		    shmem_falloc->mode != FALLOC_FL_PUNCH_HOLE ||
-		    vmf->pgoff < shmem_falloc->start ||
-		    vmf->pgoff >= shmem_falloc->next)
-			shmem_falloc = NULL;
-		spin_unlock(&inode->i_lock);
-		/*
-		 * i_lock has protected us from taking shmem_falloc seriously
-		 * once return from shmem_fallocate() went back up that stack.
-		 * i_lock does not serialize with i_mutex at all, but it does
-		 * not matter if sometimes we wait unnecessarily, or sometimes
-		 * miss out on waiting: we just need to make those cases rare.
-		 */
-		if (shmem_falloc) {
+		if (shmem_falloc &&
+		    shmem_falloc->waitq &&
+		    vmf->pgoff >= shmem_falloc->start &&
+		    vmf->pgoff < shmem_falloc->next) {
+			wait_queue_head_t *shmem_falloc_waitq;
+			DEFINE_WAIT(shmem_fault_wait);
+
+			ret = VM_FAULT_NOPAGE;
 			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
 			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+				/* It's polite to up mmap_sem if we can */
 				up_read(&vma->vm_mm->mmap_sem);
-				mutex_lock(&inode->i_mutex);
-				mutex_unlock(&inode->i_mutex);
-				return VM_FAULT_RETRY;
+				ret = VM_FAULT_RETRY;
 			}
-			/* cond_resched? Leave that to GUP or return to user */
-			return VM_FAULT_NOPAGE;
+
+			shmem_falloc_waitq = shmem_falloc->waitq;
+			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+					TASK_UNINTERRUPTIBLE);
+			spin_unlock(&inode->i_lock);
+			schedule();
+
+			/*
+			 * shmem_falloc_waitq points into the shmem_fallocate()
+			 * stack of the hole-punching task: shmem_falloc_waitq
+			 * is usually invalid by the time we reach here, but
+			 * finish_wait() does not dereference it in that case;
+			 * though i_lock needed lest racing with wake_up_all().
+			 */
+			spin_lock(&inode->i_lock);
+			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+			spin_unlock(&inode->i_lock);
+			return ret;
 		}
+		spin_unlock(&inode->i_lock);
 	}
 
 	error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
@@ -1412,13 +1447,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
-	int ret;
 	struct inode *inode = mapping->host;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-	ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
-	if (ret == 0 && *pagep)
-		init_page_accessed(*pagep);
-	return ret;
+	return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
 }
 
 static int
@@ -1769,13 +1800,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 
 	mutex_lock(&inode->i_mutex);
 
-	shmem_falloc.mode = mode & ~FALLOC_FL_KEEP_SIZE;
-
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		struct address_space *mapping = file->f_mapping;
 		loff_t unmap_start = round_up(offset, PAGE_SIZE);
 		loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+		DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
 
+		shmem_falloc.waitq = &shmem_falloc_waitq;
 		shmem_falloc.start = unmap_start >> PAGE_SHIFT;
 		shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
 		spin_lock(&inode->i_lock);
@@ -1787,8 +1818,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 					    1 + unmap_end - unmap_start, 0);
 		shmem_truncate_range(inode, offset, offset + len - 1);
 		/* No need to unmap again: hole-punching leaves COWed pages */
+
+		spin_lock(&inode->i_lock);
+		inode->i_private = NULL;
+		wake_up_all(&shmem_falloc_waitq);
+		spin_unlock(&inode->i_lock);
 		error = 0;
-		goto undone;
+		goto out;
 	}
 
 	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
@@ -1804,6 +1840,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		goto out;
 	}
 
+	shmem_falloc.waitq = NULL;
 	shmem_falloc.start = start;
 	shmem_falloc.next  = start;
 	shmem_falloc.nr_falloced = 0;
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1881,7 +1881,7 @@ redo:
 
 	new.frozen = 0;
 
-	if (!new.inuse && n->nr_partial > s->min_partial)
+	if (!new.inuse && n->nr_partial >= s->min_partial)
 		m = M_FREE;
 	else if (new.freelist) {
 		m = M_PARTIAL;
@@ -1992,7 +1992,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 			new.freelist, new.counters,
 			"unfreezing slab"));
 
-		if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
+		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
 			page->next = discard_page;
 			discard_page = page;
 		} else {
@@ -2620,7 +2620,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		return;
 	}
 
-	if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
+	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
 		goto slab_empty;
 
 	/*
diff --git a/mm/truncate.c b/mm/truncate.c
index 6a78c814bebf..eda247307164 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -355,14 +355,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	for ( ; ; ) {
 		cond_resched();
 		if (!pagevec_lookup_entries(&pvec, mapping, index,
-			min(end - index, (pgoff_t)PAGEVEC_SIZE),
-			indices)) {
+			min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) {
+			/* If all gone from start onwards, we're done */
 			if (index == start)
 				break;
+			/* Otherwise restart to make sure all gone */
 			index = start;
 			continue;
 		}
 		if (index == start && indices[0] >= end) {
+			/* All gone out of hole to be punched, we're done */
 			pagevec_remove_exceptionals(&pvec);
 			pagevec_release(&pvec);
 			break;
@@ -373,8 +375,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
 
 			/* We rely upon deletion not changing page->index */
 			index = indices[i];
-			if (index >= end)
+			if (index >= end) {
+				/* Restart punch to make sure all gone */
+				index = start - 1;
 				break;
+			}
 
 			if (radix_tree_exceptional_entry(page)) {
 				clear_exceptional_entry(mapping, index, page);