15 files changed, 120 insertions, 120 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 026771a9b097..394838f489eb 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -183,7 +183,7 @@ config MEMORY_HOTPLUG_SPARSE
 config MEMORY_HOTREMOVE
        bool "Allow for memory hot remove"
        select MEMORY_ISOLATION
-        select HAVE_BOOTMEM_INFO_NODE if X86_64
+        select HAVE_BOOTMEM_INFO_NODE if (X86_64 || PPC64)
        depends on MEMORY_HOTPLUG && ARCH_ENABLE_MEMORY_HOTREMOVE
        depends on MIGRATION
diff --git a/mm/filemap.c b/mm/filemap.c
index 1e6aec4a2d2e..ae4846ff4849 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        struct inode *inode = mapping->host;
        pgoff_t offset = vmf->pgoff;
        struct page *page;
-        bool memcg_oom;
        pgoff_t size;
        int ret = 0;
@@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                return VM_FAULT_SIGBUS;
        /*
-         * Do we have something in the page cache already?  Either
+         * Do we have something in the page cache already?
-         * way, try readahead, but disable the memcg OOM killer for it
-         * as readahead is optional and no errors are propagated up
-         * the fault stack.  The OOM killer is enabled while trying to
-         * instantiate the faulting page individually below.
         */
        page = find_get_page(mapping, offset);
        if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
@@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                 * We found the page, so try async readahead before
                 * waiting for the lock.
                 */
-                memcg_oom = mem_cgroup_toggle_oom(false);
                do_async_mmap_readahead(vma, ra, file, page, offset);
-                mem_cgroup_toggle_oom(memcg_oom);
        } else if (!page) {
                /* No page in the page cache at all */
-                memcg_oom = mem_cgroup_toggle_oom(false);
                do_sync_mmap_readahead(vma, ra, file, offset);
-                mem_cgroup_toggle_oom(memcg_oom);
                count_vm_event(PGMAJFAULT);
                mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
                ret = VM_FAULT_MAJOR;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7489884682d8..610e3df2768a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2697,6 +2697,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
        mmun_start = haddr;
        mmun_end   = haddr + HPAGE_PMD_SIZE;
+again:
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        spin_lock(&mm->page_table_lock);
        if (unlikely(!pmd_trans_huge(*pmd))) {
@@ -2719,7 +2720,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
        split_huge_page(page);
        put_page(page);
-        BUG_ON(pmd_trans_huge(*pmd));
+        /*
+         * We don't always have down_write of mmap_sem here: a racing
+         * do_huge_pmd_wp_page() might have copied-on-write to another
+         * huge page before our split_huge_page() got the anon_vma lock.
+         */
+        if (unlikely(pmd_trans_huge(*pmd)))
+                goto again;
 }
 void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b49579c7f2a5..0b7656e804d1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -653,6 +653,7 @@ static void free_huge_page(struct page *page)
        BUG_ON(page_count(page));
        BUG_ON(page_mapcount(page));
        restore_reserve = PagePrivate(page);
+        ClearPagePrivate(page);
        spin_lock(&hugetlb_lock);
        hugetlb_cgroup_uncharge_page(hstate_index(h),
@@ -695,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order)
        /* we rely on prep_new_huge_page to set the destructor */
        set_compound_order(page, order);
        __SetPageHead(page);
+        __ClearPageReserved(page);
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                __SetPageTail(p);
+                /*
+                 * For gigantic hugepages allocated through bootmem at
+                 * boot, it's safer to be consistent with the not-gigantic
+                 * hugepages and clear the PG_reserved bit from all tail pages
+                 * too.  Otherwise drivers using get_user_pages() to access tail
+                 * pages may get the reference counting wrong if they see
+                 * PG_reserved set on a tail page (despite the head page not
+                 * having PG_reserved set).  Enforcing this consistency between
+                 * head and tail pages allows drivers to optimize away a check
+                 * on the head page when they need to know if put_page() is
+                 * needed after get_user_pages().
+                 */
+                __ClearPageReserved(p);
                set_page_count(p, 0);
                p->first_page = page;
        }
@@ -1329,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void)
 #else
                page = virt_to_page(m);
 #endif
-                __ClearPageReserved(page);
                WARN_ON(page_count(page) != 1);
                prep_compound_huge_page(page, h->order);
+                WARN_ON(PageReserved(page));
                prep_new_huge_page(h, page, page_to_nid(page));
                /*
                 * If we had gigantic hugepages allocated at boot time, we need
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 28243f7d9c23..9c9c685e4ddc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -866,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        unsigned long val = 0;
        int cpu;
+        get_online_cpus();
        for_each_online_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
 #ifdef CONFIG_HOTPLUG_CPU
@@ -873,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        val += memcg->nocpu_base.events[idx];
        spin_unlock(&memcg->pcp_counter_lock);
 #endif
+        put_online_cpus();
        return val;
 }
@@ -2159,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
                memcg_wakeup_oom(memcg);
 }
-/*
- * try to call OOM killer
- */
 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-        bool locked;
-        int wakeups;
        if (!current->memcg_oom.may_oom)
                return;
-        current->memcg_oom.in_memcg_oom = 1;
        /*
-         * As with any blocking lock, a contender needs to start
+         * We are in the middle of the charge context here, so we
-         * listening for wakeups before attempting the trylock,
+         * don't want to block when potentially sitting on a callstack
-         * otherwise it can miss the wakeup from the unlock and sleep
+         * that holds all kinds of filesystem and mm locks.
-         * indefinitely.  This is just open-coded because our locking
+         *
-         * is so particular to memcg hierarchies.
+         * Also, the caller may handle a failed allocation gracefully
+         * (like optional page cache readahead) and so an OOM killer
+         * invocation might not even be necessary.
+         *
+         * That's why we don't do anything here except remember the
+         * OOM context and then deal with it at the end of the page
+         * fault when the stack is unwound, the locks are released,
+         * and when we know whether the fault was overall successful.
         */
-        wakeups = atomic_read(&memcg->oom_wakeups);
+        css_get(&memcg->css);
-        mem_cgroup_mark_under_oom(memcg);
+        current->memcg_oom.memcg = memcg;
+        current->memcg_oom.gfp_mask = mask;
-        locked = mem_cgroup_oom_trylock(memcg);
+        current->memcg_oom.order = order;
-        if (locked)
-                mem_cgroup_oom_notify(memcg);
-        if (locked && !memcg->oom_kill_disable) {
-                mem_cgroup_unmark_under_oom(memcg);
-                mem_cgroup_out_of_memory(memcg, mask, order);
-                mem_cgroup_oom_unlock(memcg);
-                /*
-                 * There is no guarantee that an OOM-lock contender
-                 * sees the wakeups triggered by the OOM kill
-                 * uncharges.  Wake any sleepers explicitely.
-                 */
-                memcg_oom_recover(memcg);
-        } else {
-                /*
-                 * A system call can just return -ENOMEM, but if this
-                 * is a page fault and somebody else is handling the
-                 * OOM already, we need to sleep on the OOM waitqueue
-                 * for this memcg until the situation is resolved.
-                 * Which can take some time because it might be
-                 * handled by a userspace task.
-                 *
-                 * However, this is the charge context, which means
-                 * that we may sit on a large call stack and hold
-                 * various filesystem locks, the mmap_sem etc. and we
-                 * don't want the OOM handler to deadlock on them
-                 * while we sit here and wait.  Store the current OOM
-                 * context in the task_struct, then return -ENOMEM.
-                 * At the end of the page fault handler, with the
-                 * stack unwound, pagefault_out_of_memory() will check
-                 * back with us by calling
-                 * mem_cgroup_oom_synchronize(), possibly putting the
-                 * task to sleep.
-                 */
-                current->memcg_oom.oom_locked = locked;
-                current->memcg_oom.wakeups = wakeups;
-                css_get(&memcg->css);
-                current->memcg_oom.wait_on_memcg = memcg;
-        }
 }
 /**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ * @handle: actually kill/wait or just clean up the OOM state
 *
- * This has to be called at the end of a page fault if the the memcg
+ * This has to be called at the end of a page fault if the memcg OOM
- * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ * handler was enabled.
 *
- * Memcg supports userspace OOM handling, so failed allocations must
+ * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea, instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
- * the end of the page fault to put the task to sleep and clean up the
+ * the end of the page fault to complete the OOM handling.
- * OOM state.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
- * finalized, %false otherwise.
+ * completed, %false otherwise.
 */
-bool mem_cgroup_oom_synchronize(void)
+bool mem_cgroup_oom_synchronize(bool handle)
 {
+        struct mem_cgroup *memcg = current->memcg_oom.memcg;
        struct oom_wait_info owait;
-        struct mem_cgroup *memcg;
+        bool locked;
        /* OOM is global, do not handle */
-        if (!current->memcg_oom.in_memcg_oom)
-                return false;
-        /*
-         * We invoked the OOM killer but there is a chance that a kill
-         * did not free up any charges.  Everybody else might already
-         * be sleeping, so restart the fault and keep the rampage
-         * going until some charges are released.
-         */
-        memcg = current->memcg_oom.wait_on_memcg;
        if (!memcg)
-                goto out;
+                return false;
-        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+        if (!handle)
-                goto out_memcg;
+                goto cleanup;
        owait.memcg = memcg;
        owait.wait.flags = 0;
@@ -2271,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void)
        INIT_LIST_HEAD(&owait.wait.task_list);
        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
-        /* Only sleep if we didn't miss any wakeups since OOM */
+        mem_cgroup_mark_under_oom(memcg);
-        if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+        locked = mem_cgroup_oom_trylock(memcg);
+        if (locked)
+                mem_cgroup_oom_notify(memcg);
+        if (locked && !memcg->oom_kill_disable) {
+                mem_cgroup_unmark_under_oom(memcg);
+                finish_wait(&memcg_oom_waitq, &owait.wait);
+                mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
+                                         current->memcg_oom.order);
+        } else {
                schedule();
-        finish_wait(&memcg_oom_waitq, &owait.wait);
+                mem_cgroup_unmark_under_oom(memcg);
-out_memcg:
+                finish_wait(&memcg_oom_waitq, &owait.wait);
-        mem_cgroup_unmark_under_oom(memcg);
+        }
-        if (current->memcg_oom.oom_locked) {
+        if (locked) {
                mem_cgroup_oom_unlock(memcg);
                /*
                 * There is no guarantee that an OOM-lock contender
@@ -2286,10 +2249,9 @@ out_memcg:
                 */
                memcg_oom_recover(memcg);
        }
+cleanup:
+        current->memcg_oom.memcg = NULL;
        css_put(&memcg->css);
-        current->memcg_oom.wait_on_memcg = NULL;
-out:
-        current->memcg_oom.in_memcg_oom = 0;
        return true;
 }
@@ -2703,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                     || fatal_signal_pending(current)))
                goto bypass;
+        if (unlikely(task_in_memcg_oom(current)))
+                goto bypass;
        /*
         * We always charge the cgroup the mm_struct belongs to.
         * The mm_struct's mem_cgroup changes on task migration if the
@@ -2801,6 +2766,8 @@ done:
        return 0;
 nomem:
        *ptr = NULL;
+        if (gfp_mask & __GFP_NOFAIL)
+                return 0;
        return -ENOMEM;
 bypass:
        *ptr = root_mem_cgroup;
diff --git a/mm/memory.c b/mm/memory.c
index ca0003947115..1311f26497e6 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -837,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                                         */
                                        make_migration_entry_read(&entry);
                                        pte = swp_entry_to_pte(entry);
+                                        if (pte_swp_soft_dirty(*src_pte))
+                                                pte = pte_swp_mksoft_dirty(pte);
                                        set_pte_at(src_mm, addr, src_pte, pte);
                                }
                        }
@@ -3863,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * space.  Kernel faults are handled more gracefully.
         */
        if (flags & FAULT_FLAG_USER)
-                mem_cgroup_enable_oom();
+                mem_cgroup_oom_enable();
        ret = __handle_mm_fault(mm, vma, address, flags);
-        if (flags & FAULT_FLAG_USER)
+        if (flags & FAULT_FLAG_USER) {
-                mem_cgroup_disable_oom();
+                mem_cgroup_oom_disable();
+                /*
-        if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+                 * The task may have entered a memcg OOM situation but
-                mem_cgroup_oom_synchronize();
+                 * if the allocation error was handled gracefully (no
+                 * VM_FAULT_OOM), there is no need to kill anything.
+                 * Just clean up the OOM state peacefully.
+                 */
+                if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))
+                        mem_cgroup_oom_synchronize(false);
+        }
        return ret;
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index a26bccd44ccb..7a7325ee1d08 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
        get_page(new);
        pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
+        if (pte_swp_soft_dirty(*ptep))
+                pte = pte_mksoft_dirty(pte);
        if (is_write_migration_entry(entry))
                pte = pte_mkwrite(pte);
 #ifdef CONFIG_HUGETLB_PAGE
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94722a4d6b43..a3af058f68e4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -94,13 +94,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        swp_entry_t entry = pte_to_swp_entry(oldpte);
                        if (is_write_migration_entry(entry)) {
+                                pte_t newpte;
                                /*
                                 * A protection check is difficult so
                                 * just be safe and disable write
                                 */
                                make_migration_entry_read(&entry);
-                                set_pte_at(mm, addr, pte,
+                                newpte = swp_entry_to_pte(entry);
-                                        swp_entry_to_pte(entry));
+                                if (pte_swp_soft_dirty(oldpte))
+                                        newpte = pte_swp_mksoft_dirty(newpte);
+                                set_pte_at(mm, addr, pte, newpte);
                        }
                        pages++;
                }
diff --git a/mm/mremap.c b/mm/mremap.c
index 91b13d6a16d4..0843feb66f3d 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,7 +25,6 @@
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
 #include "internal.h"
@@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
                return NULL;
        pmd = pmd_alloc(mm, pud, addr);
-        if (!pmd) {
+        if (!pmd)
-                pud_free(mm, pud);
                return NULL;
-        }
        VM_BUG_ON(pmd_trans_huge(*pmd));
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 314e9d274381..6738c47f1f72 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -680,7 +680,7 @@ void pagefault_out_of_memory(void)
 {
        struct zonelist *zonelist;
-        if (mem_cgroup_oom_synchronize())
+        if (mem_cgroup_oom_synchronize(true))
                return;
        zonelist = node_zonelist(first_online_node, GFP_KERNEL);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f5236f804aa6..63807583d8e8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
        return 1;
 }
-static long bdi_max_pause(struct backing_dev_info *bdi,
+static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-                          unsigned long bdi_dirty)
+                                   unsigned long bdi_dirty)
 {
-        long bw = bdi->avg_write_bandwidth;
+        unsigned long bw = bdi->avg_write_bandwidth;
-        long t;
+        unsigned long t;
        /*
         * Limit pause time for small memory systems. If sleeping for too long
@@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi,
        t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
        t++;
-        return min_t(long, t, MAX_PAUSE);
+        return min_t(unsigned long, t, MAX_PAUSE);
 }
 static long bdi_min_pause(struct backing_dev_info *bdi,
diff --git a/mm/slab_common.c b/mm/slab_common.c
index a3443278ce3a..e2e98af703ea 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -56,6 +56,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
                        continue;
                }
+#if !defined(CONFIG_SLUB) || !defined(CONFIG_SLUB_DEBUG_ON)
                /*
                 * For simplicity, we won't check this in the list of memcg
                 * caches. We have control over memcg naming, and if there
@@ -69,6 +70,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
                        s = NULL;
                        return -EINVAL;
                }
+#endif
        }
        WARN_ON(strchr(name, ' '));     /* It confuses parsers */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 3963fc24fcc1..de7c904e52e5 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        struct filename *pathname;
        int i, type, prev;
        int err;
+        unsigned int old_block_size;
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
@@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        }
        swap_file = p->swap_file;
+        old_block_size = p->old_block_size;
        p->swap_file = NULL;
        p->max = 0;
        swap_map = p->swap_map;
@@ -1938,7 +1940,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        inode = mapping->host;
        if (S_ISBLK(inode->i_mode)) {
                struct block_device *bdev = I_BDEV(inode);
-                set_blocksize(bdev, p->old_block_size);
+                set_blocksize(bdev, old_block_size);
                blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
        } else {
                mutex_lock(&inode->i_mutex);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 53f2f82f83ae..eea668d9cff6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -211,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker)
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
        up_write(&shrinker_rwsem);
+        kfree(shrinker->nr_deferred);
 }
 EXPORT_SYMBOL(unregister_shrinker);
diff --git a/mm/zswap.c b/mm/zswap.c
index 841e35f1db22..d93510c6aa2d 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -804,6 +804,10 @@ static void zswap_frontswap_invalidate_area(unsigned type)
        }
        tree->rbroot = RB_ROOT;
        spin_unlock(&tree->lock);
+        zbud_destroy_pool(tree->pool);
+        kfree(tree);
+        zswap_trees[type] = NULL;
 }
 static struct zbud_ops zswap_zbud_ops = {