Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c      448
-rw-r--r--  mm/filemap.c            2
-rw-r--r--  mm/hugetlb.c          110
-rw-r--r--  mm/hwpoison-inject.c   15
-rw-r--r--  mm/init-mm.c            6
-rw-r--r--  mm/kmemleak.c         100
-rw-r--r--  mm/ksm.c               71
-rw-r--r--  mm/memblock.c           2
-rw-r--r--  mm/memcontrol.c       462
-rw-r--r--  mm/memory-failure.c   120
-rw-r--r--  mm/memory.c            51
-rw-r--r--  mm/mempolicy.c         82
-rw-r--r--  mm/migrate.c           10
-rw-r--r--  mm/mlock.c             19
-rw-r--r--  mm/mmap.c              65
-rw-r--r--  mm/nommu.c             12
-rw-r--r--  mm/oom_kill.c         687
-rw-r--r--  mm/page-writeback.c   255
-rw-r--r--  mm/page_alloc.c        33
-rw-r--r--  mm/page_io.c            2
-rw-r--r--  mm/rmap.c             186
-rw-r--r--  mm/shmem.c            139
-rw-r--r--  mm/slab.c               2
-rw-r--r--  mm/swapfile.c         100
-rw-r--r--  mm/truncate.c          38
-rw-r--r--  mm/util.c              11
-rw-r--r--  mm/vmalloc.c           11
-rw-r--r--  mm/vmscan.c           548
-rw-r--r--  mm/vmstat.c             8
29 files changed, 2185 insertions, 1410 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f9fd3dd3916b..eaa4a5bbe063 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/device.h> 12#include <linux/device.h>
13#include <trace/events/writeback.h>
13 14
14static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); 15static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
15 16
@@ -49,8 +50,6 @@ static struct timer_list sync_supers_timer;
49static int bdi_sync_supers(void *); 50static int bdi_sync_supers(void *);
50static void sync_supers_timer_fn(unsigned long); 51static void sync_supers_timer_fn(unsigned long);
51 52
52static void bdi_add_default_flusher_task(struct backing_dev_info *bdi);
53
54#ifdef CONFIG_DEBUG_FS 53#ifdef CONFIG_DEBUG_FS
55#include <linux/debugfs.h> 54#include <linux/debugfs.h>
56#include <linux/seq_file.h> 55#include <linux/seq_file.h>
@@ -65,31 +64,25 @@ static void bdi_debug_init(void)
65static int bdi_debug_stats_show(struct seq_file *m, void *v) 64static int bdi_debug_stats_show(struct seq_file *m, void *v)
66{ 65{
67 struct backing_dev_info *bdi = m->private; 66 struct backing_dev_info *bdi = m->private;
68 struct bdi_writeback *wb; 67 struct bdi_writeback *wb = &bdi->wb;
69 unsigned long background_thresh; 68 unsigned long background_thresh;
70 unsigned long dirty_thresh; 69 unsigned long dirty_thresh;
71 unsigned long bdi_thresh; 70 unsigned long bdi_thresh;
72 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; 71 unsigned long nr_dirty, nr_io, nr_more_io, nr_wb;
73 struct inode *inode; 72 struct inode *inode;
74 73
75 /*
76 * inode lock is enough here, the bdi->wb_list is protected by
77 * RCU on the reader side
78 */
79 nr_wb = nr_dirty = nr_io = nr_more_io = 0; 74 nr_wb = nr_dirty = nr_io = nr_more_io = 0;
80 spin_lock(&inode_lock); 75 spin_lock(&inode_lock);
81 list_for_each_entry(wb, &bdi->wb_list, list) { 76 list_for_each_entry(inode, &wb->b_dirty, i_list)
82 nr_wb++; 77 nr_dirty++;
83 list_for_each_entry(inode, &wb->b_dirty, i_list) 78 list_for_each_entry(inode, &wb->b_io, i_list)
84 nr_dirty++; 79 nr_io++;
85 list_for_each_entry(inode, &wb->b_io, i_list) 80 list_for_each_entry(inode, &wb->b_more_io, i_list)
86 nr_io++; 81 nr_more_io++;
87 list_for_each_entry(inode, &wb->b_more_io, i_list)
88 nr_more_io++;
89 }
90 spin_unlock(&inode_lock); 82 spin_unlock(&inode_lock);
91 83
92 get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); 84 global_dirty_limits(&background_thresh, &dirty_thresh);
85 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
93 86
94#define K(x) ((x) << (PAGE_SHIFT - 10)) 87#define K(x) ((x) << (PAGE_SHIFT - 10))
95 seq_printf(m, 88 seq_printf(m,
@@ -98,19 +91,16 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
98 "BdiDirtyThresh: %8lu kB\n" 91 "BdiDirtyThresh: %8lu kB\n"
99 "DirtyThresh: %8lu kB\n" 92 "DirtyThresh: %8lu kB\n"
100 "BackgroundThresh: %8lu kB\n" 93 "BackgroundThresh: %8lu kB\n"
101 "WritebackThreads: %8lu\n"
102 "b_dirty: %8lu\n" 94 "b_dirty: %8lu\n"
103 "b_io: %8lu\n" 95 "b_io: %8lu\n"
104 "b_more_io: %8lu\n" 96 "b_more_io: %8lu\n"
105 "bdi_list: %8u\n" 97 "bdi_list: %8u\n"
106 "state: %8lx\n" 98 "state: %8lx\n",
107 "wb_list: %8u\n",
108 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), 99 (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
109 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), 100 (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
110 K(bdi_thresh), K(dirty_thresh), 101 K(bdi_thresh), K(dirty_thresh),
111 K(background_thresh), nr_wb, nr_dirty, nr_io, nr_more_io, 102 K(background_thresh), nr_dirty, nr_io, nr_more_io,
112 !list_empty(&bdi->bdi_list), bdi->state, 103 !list_empty(&bdi->bdi_list), bdi->state);
113 !list_empty(&bdi->wb_list));
114#undef K 104#undef K
115 105
116 return 0; 106 return 0;
@@ -247,7 +237,6 @@ static int __init default_bdi_init(void)
247 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); 237 sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers");
248 BUG_ON(IS_ERR(sync_supers_tsk)); 238 BUG_ON(IS_ERR(sync_supers_tsk));
249 239
250 init_timer(&sync_supers_timer);
251 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); 240 setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0);
252 bdi_arm_supers_timer(); 241 bdi_arm_supers_timer();
253 242
@@ -259,77 +248,6 @@ static int __init default_bdi_init(void)
259} 248}
260subsys_initcall(default_bdi_init); 249subsys_initcall(default_bdi_init);
261 250
262static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
263{
264 memset(wb, 0, sizeof(*wb));
265
266 wb->bdi = bdi;
267 wb->last_old_flush = jiffies;
268 INIT_LIST_HEAD(&wb->b_dirty);
269 INIT_LIST_HEAD(&wb->b_io);
270 INIT_LIST_HEAD(&wb->b_more_io);
271}
272
273static void bdi_task_init(struct backing_dev_info *bdi,
274 struct bdi_writeback *wb)
275{
276 struct task_struct *tsk = current;
277
278 spin_lock(&bdi->wb_lock);
279 list_add_tail_rcu(&wb->list, &bdi->wb_list);
280 spin_unlock(&bdi->wb_lock);
281
282 tsk->flags |= PF_FLUSHER | PF_SWAPWRITE;
283 set_freezable();
284
285 /*
286 * Our parent may run at a different priority, just set us to normal
287 */
288 set_user_nice(tsk, 0);
289}
290
291static int bdi_start_fn(void *ptr)
292{
293 struct bdi_writeback *wb = ptr;
294 struct backing_dev_info *bdi = wb->bdi;
295 int ret;
296
297 /*
298 * Add us to the active bdi_list
299 */
300 spin_lock_bh(&bdi_lock);
301 list_add_rcu(&bdi->bdi_list, &bdi_list);
302 spin_unlock_bh(&bdi_lock);
303
304 bdi_task_init(bdi, wb);
305
306 /*
307 * Clear pending bit and wakeup anybody waiting to tear us down
308 */
309 clear_bit(BDI_pending, &bdi->state);
310 smp_mb__after_clear_bit();
311 wake_up_bit(&bdi->state, BDI_pending);
312
313 ret = bdi_writeback_task(wb);
314
315 /*
316 * Remove us from the list
317 */
318 spin_lock(&bdi->wb_lock);
319 list_del_rcu(&wb->list);
320 spin_unlock(&bdi->wb_lock);
321
322 /*
323 * Flush any work that raced with us exiting. No new work
324 * will be added, since this bdi isn't discoverable anymore.
325 */
326 if (!list_empty(&bdi->work_list))
327 wb_do_writeback(wb, 1);
328
329 wb->task = NULL;
330 return ret;
331}
332
333int bdi_has_dirty_io(struct backing_dev_info *bdi) 251int bdi_has_dirty_io(struct backing_dev_info *bdi)
334{ 252{
335 return wb_has_dirty_io(&bdi->wb); 253 return wb_has_dirty_io(&bdi->wb);
@@ -348,10 +266,10 @@ static void bdi_flush_io(struct backing_dev_info *bdi)
348} 266}
349 267
350/* 268/*
351 * kupdated() used to do this. We cannot do it from the bdi_forker_task() 269 * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
352 * or we risk deadlocking on ->s_umount. The longer term solution would be 270 * or we risk deadlocking on ->s_umount. The longer term solution would be
353 * to implement sync_supers_bdi() or similar and simply do it from the 271 * to implement sync_supers_bdi() or similar and simply do it from the
354 * bdi writeback tasks individually. 272 * bdi writeback thread individually.
355 */ 273 */
356static int bdi_sync_supers(void *unused) 274static int bdi_sync_supers(void *unused)
357{ 275{
@@ -387,144 +305,198 @@ static void sync_supers_timer_fn(unsigned long unused)
387 bdi_arm_supers_timer(); 305 bdi_arm_supers_timer();
388} 306}
389 307
390static int bdi_forker_task(void *ptr) 308static void wakeup_timer_fn(unsigned long data)
309{
310 struct backing_dev_info *bdi = (struct backing_dev_info *)data;
311
312 spin_lock_bh(&bdi->wb_lock);
313 if (bdi->wb.task) {
314 trace_writeback_wake_thread(bdi);
315 wake_up_process(bdi->wb.task);
316 } else {
317 /*
318 * When bdi tasks are inactive for long time, they are killed.
319 * In this case we have to wake-up the forker thread which
320 * should create and run the bdi thread.
321 */
322 trace_writeback_wake_forker_thread(bdi);
323 wake_up_process(default_backing_dev_info.wb.task);
324 }
325 spin_unlock_bh(&bdi->wb_lock);
326}
327
328/*
329 * This function is used when the first inode for this bdi is marked dirty. It
330 * wakes-up the corresponding bdi thread which should then take care of the
331 * periodic background write-out of dirty inodes. Since the write-out would
332 * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
333 * set up a timer which wakes the bdi thread up later.
334 *
335 * Note, we wouldn't bother setting up the timer, but this function is on the
336 * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
337 * by delaying the wake-up.
338 */
339void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
340{
341 unsigned long timeout;
342
343 timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
344 mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
345}
346
347/*
348 * Calculate the longest interval (jiffies) bdi threads are allowed to be
349 * inactive.
350 */
351static unsigned long bdi_longest_inactive(void)
352{
353 unsigned long interval;
354
355 interval = msecs_to_jiffies(dirty_writeback_interval * 10);
356 return max(5UL * 60 * HZ, interval);
357}
358
359static int bdi_forker_thread(void *ptr)
391{ 360{
392 struct bdi_writeback *me = ptr; 361 struct bdi_writeback *me = ptr;
393 362
394 bdi_task_init(me->bdi, me); 363 current->flags |= PF_FLUSHER | PF_SWAPWRITE;
364 set_freezable();
365
366 /*
367 * Our parent may run at a different priority, just set us to normal
368 */
369 set_user_nice(current, 0);
395 370
396 for (;;) { 371 for (;;) {
397 struct backing_dev_info *bdi, *tmp; 372 struct task_struct *task = NULL;
398 struct bdi_writeback *wb; 373 struct backing_dev_info *bdi;
374 enum {
375 NO_ACTION, /* Nothing to do */
376 FORK_THREAD, /* Fork bdi thread */
377 KILL_THREAD, /* Kill inactive bdi thread */
378 } action = NO_ACTION;
399 379
400 /* 380 /*
401 * Temporary measure, we want to make sure we don't see 381 * Temporary measure, we want to make sure we don't see
402 * dirty data on the default backing_dev_info 382 * dirty data on the default backing_dev_info
403 */ 383 */
404 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) 384 if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
385 del_timer(&me->wakeup_timer);
405 wb_do_writeback(me, 0); 386 wb_do_writeback(me, 0);
387 }
406 388
407 spin_lock_bh(&bdi_lock); 389 spin_lock_bh(&bdi_lock);
390 set_current_state(TASK_INTERRUPTIBLE);
408 391
409 /* 392 list_for_each_entry(bdi, &bdi_list, bdi_list) {
410 * Check if any existing bdi's have dirty data without 393 bool have_dirty_io;
411 * a thread registered. If so, set that up. 394
412 */ 395 if (!bdi_cap_writeback_dirty(bdi) ||
413 list_for_each_entry_safe(bdi, tmp, &bdi_list, bdi_list) { 396 bdi_cap_flush_forker(bdi))
414 if (bdi->wb.task)
415 continue;
416 if (list_empty(&bdi->work_list) &&
417 !bdi_has_dirty_io(bdi))
418 continue; 397 continue;
419 398
420 bdi_add_default_flusher_task(bdi); 399 WARN(!test_bit(BDI_registered, &bdi->state),
421 } 400 "bdi %p/%s is not registered!\n", bdi, bdi->name);
422 401
423 set_current_state(TASK_INTERRUPTIBLE); 402 have_dirty_io = !list_empty(&bdi->work_list) ||
403 wb_has_dirty_io(&bdi->wb);
424 404
425 if (list_empty(&bdi_pending_list)) { 405 /*
426 unsigned long wait; 406 * If the bdi has work to do, but the thread does not
407 * exist - create it.
408 */
409 if (!bdi->wb.task && have_dirty_io) {
410 /*
411 * Set the pending bit - if someone will try to
412 * unregister this bdi - it'll wait on this bit.
413 */
414 set_bit(BDI_pending, &bdi->state);
415 action = FORK_THREAD;
416 break;
417 }
418
419 spin_lock(&bdi->wb_lock);
420
421 /*
422 * If there is no work to do and the bdi thread was
423 * inactive long enough - kill it. The wb_lock is taken
424 * to make sure no-one adds more work to this bdi and
425 * wakes the bdi thread up.
426 */
427 if (bdi->wb.task && !have_dirty_io &&
428 time_after(jiffies, bdi->wb.last_active +
429 bdi_longest_inactive())) {
430 task = bdi->wb.task;
431 bdi->wb.task = NULL;
432 spin_unlock(&bdi->wb_lock);
433 set_bit(BDI_pending, &bdi->state);
434 action = KILL_THREAD;
435 break;
436 }
437 spin_unlock(&bdi->wb_lock);
438 }
439 spin_unlock_bh(&bdi_lock);
427 440
428 spin_unlock_bh(&bdi_lock); 441 /* Keep working if default bdi still has things to do */
429 wait = msecs_to_jiffies(dirty_writeback_interval * 10); 442 if (!list_empty(&me->bdi->work_list))
430 if (wait) 443 __set_current_state(TASK_RUNNING);
431 schedule_timeout(wait); 444
445 switch (action) {
446 case FORK_THREAD:
447 __set_current_state(TASK_RUNNING);
448 task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
449 dev_name(bdi->dev));
450 if (IS_ERR(task)) {
451 /*
452 * If thread creation fails, force writeout of
453 * the bdi from the thread.
454 */
455 bdi_flush_io(bdi);
456 } else {
457 /*
458 * The spinlock makes sure we do not lose
459 * wake-ups when racing with 'bdi_queue_work()'.
460 */
461 spin_lock_bh(&bdi->wb_lock);
462 bdi->wb.task = task;
463 spin_unlock_bh(&bdi->wb_lock);
464 }
465 break;
466
467 case KILL_THREAD:
468 __set_current_state(TASK_RUNNING);
469 kthread_stop(task);
470 break;
471
472 case NO_ACTION:
473 if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
474 /*
475 * There are no dirty data. The only thing we
476 * should now care about is checking for
477 * inactive bdi threads and killing them. Thus,
478 * let's sleep for longer time, save energy and
479 * be friendly for battery-driven devices.
480 */
481 schedule_timeout(bdi_longest_inactive());
432 else 482 else
433 schedule(); 483 schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
434 try_to_freeze(); 484 try_to_freeze();
485 /* Back to the main loop */
435 continue; 486 continue;
436 } 487 }
437 488
438 __set_current_state(TASK_RUNNING);
439
440 /*
441 * This is our real job - check for pending entries in
442 * bdi_pending_list, and create the tasks that got added
443 */
444 bdi = list_entry(bdi_pending_list.next, struct backing_dev_info,
445 bdi_list);
446 list_del_init(&bdi->bdi_list);
447 spin_unlock_bh(&bdi_lock);
448
449 wb = &bdi->wb;
450 wb->task = kthread_run(bdi_start_fn, wb, "flush-%s",
451 dev_name(bdi->dev));
452 /* 489 /*
453 * If task creation fails, then readd the bdi to 490 * Clear pending bit and wakeup anybody waiting to tear us down.
454 * the pending list and force writeout of the bdi
455 * from this forker thread. That will free some memory
456 * and we can try again.
457 */ 491 */
458 if (IS_ERR(wb->task)) { 492 clear_bit(BDI_pending, &bdi->state);
459 wb->task = NULL; 493 smp_mb__after_clear_bit();
460 494 wake_up_bit(&bdi->state, BDI_pending);
461 /*
462 * Add this 'bdi' to the back, so we get
463 * a chance to flush other bdi's to free
464 * memory.
465 */
466 spin_lock_bh(&bdi_lock);
467 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
468 spin_unlock_bh(&bdi_lock);
469
470 bdi_flush_io(bdi);
471 }
472 } 495 }
473 496
474 return 0; 497 return 0;
475} 498}
476 499
477static void bdi_add_to_pending(struct rcu_head *head)
478{
479 struct backing_dev_info *bdi;
480
481 bdi = container_of(head, struct backing_dev_info, rcu_head);
482 INIT_LIST_HEAD(&bdi->bdi_list);
483
484 spin_lock(&bdi_lock);
485 list_add_tail(&bdi->bdi_list, &bdi_pending_list);
486 spin_unlock(&bdi_lock);
487
488 /*
489 * We are now on the pending list, wake up bdi_forker_task()
490 * to finish the job and add us back to the active bdi_list
491 */
492 wake_up_process(default_backing_dev_info.wb.task);
493}
494
495/*
496 * Add the default flusher task that gets created for any bdi
497 * that has dirty data pending writeout
498 */
499void static bdi_add_default_flusher_task(struct backing_dev_info *bdi)
500{
501 if (!bdi_cap_writeback_dirty(bdi))
502 return;
503
504 if (WARN_ON(!test_bit(BDI_registered, &bdi->state))) {
505 printk(KERN_ERR "bdi %p/%s is not registered!\n",
506 bdi, bdi->name);
507 return;
508 }
509
510 /*
511 * Check with the helper whether to proceed adding a task. Will only
512 * abort if we two or more simultanous calls to
513 * bdi_add_default_flusher_task() occured, further additions will block
514 * waiting for previous additions to finish.
515 */
516 if (!test_and_set_bit(BDI_pending, &bdi->state)) {
517 list_del_rcu(&bdi->bdi_list);
518
519 /*
520 * We must wait for the current RCU period to end before
521 * moving to the pending list. So schedule that operation
522 * from an RCU callback.
523 */
524 call_rcu(&bdi->rcu_head, bdi_add_to_pending);
525 }
526}
527
528/* 500/*
529 * Remove bdi from bdi_list, and ensure that it is no longer visible 501 * Remove bdi from bdi_list, and ensure that it is no longer visible
530 */ 502 */
@@ -541,23 +513,16 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
541 const char *fmt, ...) 513 const char *fmt, ...)
542{ 514{
543 va_list args; 515 va_list args;
544 int ret = 0;
545 struct device *dev; 516 struct device *dev;
546 517
547 if (bdi->dev) /* The driver needs to use separate queues per device */ 518 if (bdi->dev) /* The driver needs to use separate queues per device */
548 goto exit; 519 return 0;
549 520
550 va_start(args, fmt); 521 va_start(args, fmt);
551 dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); 522 dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
552 va_end(args); 523 va_end(args);
553 if (IS_ERR(dev)) { 524 if (IS_ERR(dev))
554 ret = PTR_ERR(dev); 525 return PTR_ERR(dev);
555 goto exit;
556 }
557
558 spin_lock_bh(&bdi_lock);
559 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
560 spin_unlock_bh(&bdi_lock);
561 526
562 bdi->dev = dev; 527 bdi->dev = dev;
563 528
@@ -569,21 +534,21 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
569 if (bdi_cap_flush_forker(bdi)) { 534 if (bdi_cap_flush_forker(bdi)) {
570 struct bdi_writeback *wb = &bdi->wb; 535 struct bdi_writeback *wb = &bdi->wb;
571 536
572 wb->task = kthread_run(bdi_forker_task, wb, "bdi-%s", 537 wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
573 dev_name(dev)); 538 dev_name(dev));
574 if (IS_ERR(wb->task)) { 539 if (IS_ERR(wb->task))
575 wb->task = NULL; 540 return PTR_ERR(wb->task);
576 ret = -ENOMEM;
577
578 bdi_remove_from_list(bdi);
579 goto exit;
580 }
581 } 541 }
582 542
583 bdi_debug_register(bdi, dev_name(dev)); 543 bdi_debug_register(bdi, dev_name(dev));
584 set_bit(BDI_registered, &bdi->state); 544 set_bit(BDI_registered, &bdi->state);
585exit: 545
586 return ret; 546 spin_lock_bh(&bdi_lock);
547 list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
548 spin_unlock_bh(&bdi_lock);
549
550 trace_writeback_bdi_register(bdi);
551 return 0;
587} 552}
588EXPORT_SYMBOL(bdi_register); 553EXPORT_SYMBOL(bdi_register);
589 554
@@ -598,31 +563,29 @@ EXPORT_SYMBOL(bdi_register_dev);
598 */ 563 */
599static void bdi_wb_shutdown(struct backing_dev_info *bdi) 564static void bdi_wb_shutdown(struct backing_dev_info *bdi)
600{ 565{
601 struct bdi_writeback *wb;
602
603 if (!bdi_cap_writeback_dirty(bdi)) 566 if (!bdi_cap_writeback_dirty(bdi))
604 return; 567 return;
605 568
606 /* 569 /*
607 * If setup is pending, wait for that to complete first 570 * Make sure nobody finds us on the bdi_list anymore
608 */ 571 */
609 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, 572 bdi_remove_from_list(bdi);
610 TASK_UNINTERRUPTIBLE);
611 573
612 /* 574 /*
613 * Make sure nobody finds us on the bdi_list anymore 575 * If setup is pending, wait for that to complete first
614 */ 576 */
615 bdi_remove_from_list(bdi); 577 wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
578 TASK_UNINTERRUPTIBLE);
616 579
617 /* 580 /*
618 * Finally, kill the kernel threads. We don't need to be RCU 581 * Finally, kill the kernel thread. We don't need to be RCU
619 * safe anymore, since the bdi is gone from visibility. Force 582 * safe anymore, since the bdi is gone from visibility. Force
620 * unfreeze of the thread before calling kthread_stop(), otherwise 583 * unfreeze of the thread before calling kthread_stop(), otherwise
621 * it would never exet if it is currently stuck in the refrigerator. 584 * it would never exet if it is currently stuck in the refrigerator.
622 */ 585 */
623 list_for_each_entry(wb, &bdi->wb_list, list) { 586 if (bdi->wb.task) {
624 thaw_process(wb->task); 587 thaw_process(bdi->wb.task);
625 kthread_stop(wb->task); 588 kthread_stop(bdi->wb.task);
626 } 589 }
627} 590}
628 591
@@ -644,7 +607,9 @@ static void bdi_prune_sb(struct backing_dev_info *bdi)
644void bdi_unregister(struct backing_dev_info *bdi) 607void bdi_unregister(struct backing_dev_info *bdi)
645{ 608{
646 if (bdi->dev) { 609 if (bdi->dev) {
610 trace_writeback_bdi_unregister(bdi);
647 bdi_prune_sb(bdi); 611 bdi_prune_sb(bdi);
612 del_timer_sync(&bdi->wb.wakeup_timer);
648 613
649 if (!bdi_cap_flush_forker(bdi)) 614 if (!bdi_cap_flush_forker(bdi))
650 bdi_wb_shutdown(bdi); 615 bdi_wb_shutdown(bdi);
@@ -655,6 +620,18 @@ void bdi_unregister(struct backing_dev_info *bdi)
655} 620}
656EXPORT_SYMBOL(bdi_unregister); 621EXPORT_SYMBOL(bdi_unregister);
657 622
623static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
624{
625 memset(wb, 0, sizeof(*wb));
626
627 wb->bdi = bdi;
628 wb->last_old_flush = jiffies;
629 INIT_LIST_HEAD(&wb->b_dirty);
630 INIT_LIST_HEAD(&wb->b_io);
631 INIT_LIST_HEAD(&wb->b_more_io);
632 setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
633}
634
658int bdi_init(struct backing_dev_info *bdi) 635int bdi_init(struct backing_dev_info *bdi)
659{ 636{
660 int i, err; 637 int i, err;
@@ -666,7 +643,6 @@ int bdi_init(struct backing_dev_info *bdi)
666 bdi->max_prop_frac = PROP_FRAC_BASE; 643 bdi->max_prop_frac = PROP_FRAC_BASE;
667 spin_lock_init(&bdi->wb_lock); 644 spin_lock_init(&bdi->wb_lock);
668 INIT_LIST_HEAD(&bdi->bdi_list); 645 INIT_LIST_HEAD(&bdi->bdi_list);
669 INIT_LIST_HEAD(&bdi->wb_list);
670 INIT_LIST_HEAD(&bdi->work_list); 646 INIT_LIST_HEAD(&bdi->work_list);
671 647
672 bdi_wb_init(&bdi->wb, bdi); 648 bdi_wb_init(&bdi->wb, bdi);
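[Editor's note] The mm/backing-dev.c changes above replace the per-bdi flusher tasks and the bdi_pending_list machinery with a single bdi_forker_thread() that walks bdi_list and, on each pass, either forks a flusher thread for a bdi that has dirty IO but no thread, kills a thread that has been idle longer than bdi_longest_inactive(), or simply sleeps. A minimal userspace sketch of that three-way decision follows; the struct, fields and function names are invented for illustration and are not kernel code.

/* Illustrative sketch only -- not part of the patch. It models the
 * three-way decision bdi_forker_thread() makes for each bdi. */
#include <stdbool.h>
#include <stdio.h>

enum forker_action { NO_ACTION, FORK_THREAD, KILL_THREAD };

struct bdi_state {
	bool has_thread;	/* bdi->wb.task != NULL */
	bool has_dirty_io;	/* work queued or dirty inodes */
	unsigned long idle;	/* time since last activity */
	unsigned long max_idle;	/* bdi_longest_inactive() */
};

static enum forker_action pick_action(const struct bdi_state *b)
{
	if (!b->has_thread && b->has_dirty_io)
		return FORK_THREAD;	/* work but no thread: create one */
	if (b->has_thread && !b->has_dirty_io && b->idle > b->max_idle)
		return KILL_THREAD;	/* idle too long: reclaim the thread */
	return NO_ACTION;		/* nothing to do, just sleep */
}

int main(void)
{
	struct bdi_state busy = { false, true, 0, 300 };
	struct bdi_state idle = { true, false, 400, 300 };

	printf("%d %d\n", pick_action(&busy), pick_action(&idle));
	return 0;
}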
diff --git a/mm/filemap.c b/mm/filemap.c
index 20e5642e9f9f..3d4df44e4221 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2238,14 +2238,12 @@ static ssize_t generic_perform_write(struct file *file,
 
 	do {
 		struct page *page;
-		pgoff_t index;		/* Pagecache index for current page */
 		unsigned long offset;	/* Offset into pagecache page */
 		unsigned long bytes;	/* Bytes to write to page */
 		size_t copied;		/* Bytes copied from user */
 		void *fsdata;
 
 		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		index = pos >> PAGE_CACHE_SHIFT;
 		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
 						iov_iter_count(i));
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 54d42b009dbe..cc5be788a39f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,9 @@
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/sysfs.h> 19#include <linux/sysfs.h>
20#include <linux/slab.h> 20#include <linux/slab.h>
21#include <linux/rmap.h>
22#include <linux/swap.h>
23#include <linux/swapops.h>
21 24
22#include <asm/page.h> 25#include <asm/page.h>
23#include <asm/pgtable.h> 26#include <asm/pgtable.h>
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
220 (vma->vm_pgoff >> huge_page_order(h)); 223 (vma->vm_pgoff >> huge_page_order(h));
221} 224}
222 225
226pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
227 unsigned long address)
228{
229 return vma_hugecache_offset(hstate_vma(vma), vma, address);
230}
231
223/* 232/*
224 * Return the size of the pages allocated when backing a VMA. In the majority 233 * Return the size of the pages allocated when backing a VMA. In the majority
225 * cases this will be same size as used by the page table entries. 234 * cases this will be same size as used by the page table entries.
@@ -552,6 +561,7 @@ static void free_huge_page(struct page *page)
552 set_page_private(page, 0); 561 set_page_private(page, 0);
553 page->mapping = NULL; 562 page->mapping = NULL;
554 BUG_ON(page_count(page)); 563 BUG_ON(page_count(page));
564 BUG_ON(page_mapcount(page));
555 INIT_LIST_HEAD(&page->lru); 565 INIT_LIST_HEAD(&page->lru);
556 566
557 spin_lock(&hugetlb_lock); 567 spin_lock(&hugetlb_lock);
@@ -605,6 +615,8 @@ int PageHuge(struct page *page)
605 return dtor == free_huge_page; 615 return dtor == free_huge_page;
606} 616}
607 617
618EXPORT_SYMBOL_GPL(PageHuge);
619
608static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) 620static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
609{ 621{
610 struct page *page; 622 struct page *page;
@@ -2129,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
2129 entry = huge_ptep_get(src_pte); 2141 entry = huge_ptep_get(src_pte);
2130 ptepage = pte_page(entry); 2142 ptepage = pte_page(entry);
2131 get_page(ptepage); 2143 get_page(ptepage);
2144 page_dup_rmap(ptepage);
2132 set_huge_pte_at(dst, addr, dst_pte, entry); 2145 set_huge_pte_at(dst, addr, dst_pte, entry);
2133 } 2146 }
2134 spin_unlock(&src->page_table_lock); 2147 spin_unlock(&src->page_table_lock);
@@ -2140,6 +2153,19 @@ nomem:
2140 return -ENOMEM; 2153 return -ENOMEM;
2141} 2154}
2142 2155
2156static int is_hugetlb_entry_hwpoisoned(pte_t pte)
2157{
2158 swp_entry_t swp;
2159
2160 if (huge_pte_none(pte) || pte_present(pte))
2161 return 0;
2162 swp = pte_to_swp_entry(pte);
2163 if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
2164 return 1;
2165 } else
2166 return 0;
2167}
2168
2143void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, 2169void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2144 unsigned long end, struct page *ref_page) 2170 unsigned long end, struct page *ref_page)
2145{ 2171{
@@ -2198,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2198 if (huge_pte_none(pte)) 2224 if (huge_pte_none(pte))
2199 continue; 2225 continue;
2200 2226
2227 /*
2228 * HWPoisoned hugepage is already unmapped and dropped reference
2229 */
2230 if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
2231 continue;
2232
2201 page = pte_page(pte); 2233 page = pte_page(pte);
2202 if (pte_dirty(pte)) 2234 if (pte_dirty(pte))
2203 set_page_dirty(page); 2235 set_page_dirty(page);
@@ -2207,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
2207 flush_tlb_range(vma, start, end); 2239 flush_tlb_range(vma, start, end);
2208 mmu_notifier_invalidate_range_end(mm, start, end); 2240 mmu_notifier_invalidate_range_end(mm, start, end);
2209 list_for_each_entry_safe(page, tmp, &page_list, lru) { 2241 list_for_each_entry_safe(page, tmp, &page_list, lru) {
2242 page_remove_rmap(page);
2210 list_del(&page->lru); 2243 list_del(&page->lru);
2211 put_page(page); 2244 put_page(page);
2212 } 2245 }
@@ -2272,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
2272 return 1; 2305 return 1;
2273} 2306}
2274 2307
2308/*
2309 * Hugetlb_cow() should be called with page lock of the original hugepage held.
2310 */
2275static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, 2311static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2276 unsigned long address, pte_t *ptep, pte_t pte, 2312 unsigned long address, pte_t *ptep, pte_t pte,
2277 struct page *pagecache_page) 2313 struct page *pagecache_page)
@@ -2286,8 +2322,13 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
2286retry_avoidcopy: 2322retry_avoidcopy:
2287 /* If no-one else is actually using this page, avoid the copy 2323 /* If no-one else is actually using this page, avoid the copy
2288 * and just make the page writable */ 2324 * and just make the page writable */
2289 avoidcopy = (page_count(old_page) == 1); 2325 avoidcopy = (page_mapcount(old_page) == 1);
2290 if (avoidcopy) { 2326 if (avoidcopy) {
2327 if (!trylock_page(old_page)) {
2328 if (PageAnon(old_page))
2329 page_move_anon_rmap(old_page, vma, address);
2330 } else
2331 unlock_page(old_page);
2291 set_huge_ptep_writable(vma, address, ptep); 2332 set_huge_ptep_writable(vma, address, ptep);
2292 return 0; 2333 return 0;
2293 } 2334 }
@@ -2338,6 +2379,13 @@ retry_avoidcopy:
2338 return -PTR_ERR(new_page); 2379 return -PTR_ERR(new_page);
2339 } 2380 }
2340 2381
2382 /*
2383 * When the original hugepage is shared one, it does not have
2384 * anon_vma prepared.
2385 */
2386 if (unlikely(anon_vma_prepare(vma)))
2387 return VM_FAULT_OOM;
2388
2341 copy_huge_page(new_page, old_page, address, vma); 2389 copy_huge_page(new_page, old_page, address, vma);
2342 __SetPageUptodate(new_page); 2390 __SetPageUptodate(new_page);
2343 2391
@@ -2349,11 +2397,19 @@ retry_avoidcopy:
2349 ptep = huge_pte_offset(mm, address & huge_page_mask(h)); 2397 ptep = huge_pte_offset(mm, address & huge_page_mask(h));
2350 if (likely(pte_same(huge_ptep_get(ptep), pte))) { 2398 if (likely(pte_same(huge_ptep_get(ptep), pte))) {
2351 /* Break COW */ 2399 /* Break COW */
2400 mmu_notifier_invalidate_range_start(mm,
2401 address & huge_page_mask(h),
2402 (address & huge_page_mask(h)) + huge_page_size(h));
2352 huge_ptep_clear_flush(vma, address, ptep); 2403 huge_ptep_clear_flush(vma, address, ptep);
2353 set_huge_pte_at(mm, address, ptep, 2404 set_huge_pte_at(mm, address, ptep,
2354 make_huge_pte(vma, new_page, 1)); 2405 make_huge_pte(vma, new_page, 1));
2406 page_remove_rmap(old_page);
2407 hugepage_add_anon_rmap(new_page, vma, address);
2355 /* Make the old page be freed below */ 2408 /* Make the old page be freed below */
2356 new_page = old_page; 2409 new_page = old_page;
2410 mmu_notifier_invalidate_range_end(mm,
2411 address & huge_page_mask(h),
2412 (address & huge_page_mask(h)) + huge_page_size(h));
2357 } 2413 }
2358 page_cache_release(new_page); 2414 page_cache_release(new_page);
2359 page_cache_release(old_page); 2415 page_cache_release(old_page);
@@ -2452,10 +2508,29 @@ retry:
2452 spin_lock(&inode->i_lock); 2508 spin_lock(&inode->i_lock);
2453 inode->i_blocks += blocks_per_huge_page(h); 2509 inode->i_blocks += blocks_per_huge_page(h);
2454 spin_unlock(&inode->i_lock); 2510 spin_unlock(&inode->i_lock);
2511 page_dup_rmap(page);
2455 } else { 2512 } else {
2456 lock_page(page); 2513 lock_page(page);
2457 page->mapping = HUGETLB_POISON; 2514 if (unlikely(anon_vma_prepare(vma))) {
2515 ret = VM_FAULT_OOM;
2516 goto backout_unlocked;
2517 }
2518 hugepage_add_new_anon_rmap(page, vma, address);
2458 } 2519 }
2520 } else {
2521 page_dup_rmap(page);
2522 }
2523
2524 /*
2525 * Since memory error handler replaces pte into hwpoison swap entry
2526 * at the time of error handling, a process which reserved but not have
2527 * the mapping to the error hugepage does not have hwpoison swap entry.
2528 * So we need to block accesses from such a process by checking
2529 * PG_hwpoison bit here.
2530 */
2531 if (unlikely(PageHWPoison(page))) {
2532 ret = VM_FAULT_HWPOISON;
2533 goto backout_unlocked;
2459 } 2534 }
2460 2535
2461 /* 2536 /*
@@ -2507,10 +2582,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2507 pte_t *ptep; 2582 pte_t *ptep;
2508 pte_t entry; 2583 pte_t entry;
2509 int ret; 2584 int ret;
2585 struct page *page = NULL;
2510 struct page *pagecache_page = NULL; 2586 struct page *pagecache_page = NULL;
2511 static DEFINE_MUTEX(hugetlb_instantiation_mutex); 2587 static DEFINE_MUTEX(hugetlb_instantiation_mutex);
2512 struct hstate *h = hstate_vma(vma); 2588 struct hstate *h = hstate_vma(vma);
2513 2589
2590 ptep = huge_pte_offset(mm, address);
2591 if (ptep) {
2592 entry = huge_ptep_get(ptep);
2593 if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
2594 return VM_FAULT_HWPOISON;
2595 }
2596
2514 ptep = huge_pte_alloc(mm, address, huge_page_size(h)); 2597 ptep = huge_pte_alloc(mm, address, huge_page_size(h));
2515 if (!ptep) 2598 if (!ptep)
2516 return VM_FAULT_OOM; 2599 return VM_FAULT_OOM;
@@ -2548,6 +2631,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2548 vma, address); 2631 vma, address);
2549 } 2632 }
2550 2633
2634 if (!pagecache_page) {
2635 page = pte_page(entry);
2636 lock_page(page);
2637 }
2638
2551 spin_lock(&mm->page_table_lock); 2639 spin_lock(&mm->page_table_lock);
2552 /* Check for a racing update before calling hugetlb_cow */ 2640 /* Check for a racing update before calling hugetlb_cow */
2553 if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) 2641 if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2573,6 +2661,8 @@ out_page_table_lock:
2573 if (pagecache_page) { 2661 if (pagecache_page) {
2574 unlock_page(pagecache_page); 2662 unlock_page(pagecache_page);
2575 put_page(pagecache_page); 2663 put_page(pagecache_page);
2664 } else {
2665 unlock_page(page);
2576 } 2666 }
2577 2667
2578out_mutex: 2668out_mutex:
@@ -2785,3 +2875,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
2785 hugetlb_put_quota(inode->i_mapping, (chg - freed)); 2875 hugetlb_put_quota(inode->i_mapping, (chg - freed));
2786 hugetlb_acct_memory(h, -(chg - freed)); 2876 hugetlb_acct_memory(h, -(chg - freed));
2787} 2877}
2878
2879/*
2880 * This function is called from memory failure code.
2881 * Assume the caller holds page lock of the head page.
2882 */
2883void __isolate_hwpoisoned_huge_page(struct page *hpage)
2884{
2885 struct hstate *h = page_hstate(hpage);
2886 int nid = page_to_nid(hpage);
2887
2888 spin_lock(&hugetlb_lock);
2889 list_del(&hpage->lru);
2890 h->free_huge_pages--;
2891 h->free_huge_pages_node[nid]--;
2892 spin_unlock(&hugetlb_lock);
2893}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 10ea71905c1f..0948f1072d6b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -5,6 +5,7 @@
 #include <linux/mm.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
+#include <linux/hugetlb.h>
 #include "internal.h"
 
 static struct dentry *hwpoison_dir;
@@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val)
 {
 	unsigned long pfn = val;
 	struct page *p;
+	struct page *hpage;
 	int err;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val)
 		return -ENXIO;
 
 	p = pfn_to_page(pfn);
+	hpage = compound_head(p);
 	/*
 	 * This implies unable to support free buddy pages.
 	 */
-	if (!get_page_unless_zero(p))
+	if (!get_page_unless_zero(hpage))
 		return 0;
 
-	if (!PageLRU(p))
+	if (!PageLRU(p) && !PageHuge(p))
 		shake_page(p, 0);
 	/*
 	 * This implies unable to support non-LRU pages.
 	 */
-	if (!PageLRU(p))
+	if (!PageLRU(p) && !PageHuge(p))
 		return 0;
 
 	/*
@@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val)
 	 * We temporarily take page lock for try_get_mem_cgroup_from_page().
 	 * __memory_failure() will redo the check reliably inside page lock.
 	 */
-	lock_page(p);
-	err = hwpoison_filter(p);
-	unlock_page(p);
+	lock_page(hpage);
+	err = hwpoison_filter(hpage);
+	unlock_page(hpage);
 	if (err)
 		return 0;
 
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 57aba0da9668..1d29cdfe8ebb 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -7,6 +7,11 @@
 
 #include <asm/atomic.h>
 #include <asm/pgtable.h>
+#include <asm/mmu.h>
+
+#ifndef INIT_MM_CONTEXT
+#define INIT_MM_CONTEXT(name)
+#endif
 
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
@@ -17,4 +22,5 @@ struct mm_struct init_mm = {
 	.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.cpu_vm_mask	= CPU_MASK_ALL,
+	INIT_MM_CONTEXT(init_mm)
 };
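[Editor's note] The init-mm.c hunk introduces an overridable, default-empty INIT_MM_CONTEXT() macro so that an architecture can append extra initializers to init_mm without touching the generic file. A small standalone sketch of the same pattern follows; every name in it is made up for the example.

/* Illustrative sketch only: the override-able empty-macro pattern used
 * above for INIT_MM_CONTEXT(). An "architecture" that needs extra fields
 * would define INIT_CTX(name) before this point, e.g.
 *   #define INIT_CTX(name) .ctx_id = 42,
 */
#include <stdio.h>

#ifndef INIT_CTX
#define INIT_CTX(name)		/* default: expands to nothing */
#endif

struct mm_like {
	int users;
	int ctx_id;		/* only set when INIT_CTX is overridden */
};

static struct mm_like init_mm_like = {
	.users = 1,
	INIT_CTX(init_mm_like)	/* empty by default, so this is harmless */
};

int main(void)
{
	printf("users=%d ctx_id=%d\n", init_mm_like.users, init_mm_like.ctx_id);
	return 0;
}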
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2c0d032ac898..bd9bc214091b 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -211,6 +211,9 @@ static signed long jiffies_scan_wait;
 static int kmemleak_stack_scan = 1;
 /* protects the memory scanning, parameters and debug/kmemleak file access */
 static DEFINE_MUTEX(scan_mutex);
+/* setting kmemleak=on, will set this var, skipping the disable */
+static int kmemleak_skip_disable;
+
 
 /*
  * Early object allocation/freeing logging. Kmemleak is initialized after the
@@ -398,7 +401,9 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
 		object = prio_tree_entry(node, struct kmemleak_object,
 					 tree_node);
 		if (!alias && object->pointer != ptr) {
-			kmemleak_warn("Found object by alias");
+			pr_warning("Found object by alias at 0x%08lx\n", ptr);
+			dump_stack();
+			dump_object_info(object);
 			object = NULL;
 		}
 	} else
@@ -695,7 +700,7 @@ static void paint_ptr(unsigned long ptr, int color)
 }
 
 /*
- * Make a object permanently as gray-colored so that it can no longer be
+ * Mark an object permanently as gray-colored so that it can no longer be
  * reported as a leak. This is used in general to mark a false positive.
  */
 static void make_gray_object(unsigned long ptr)
@@ -838,10 +843,19 @@ out:
 	rcu_read_unlock();
 }
 
-/*
- * Memory allocation function callback. This function is called from the
- * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
- * vmalloc etc.).
+/**
+ * kmemleak_alloc - register a newly allocated object
+ * @ptr: pointer to beginning of the object
+ * @size: size of the object
+ * @min_count: minimum number of references to this object. If during memory
+ *	scanning a number of references less than @min_count is found,
+ *	the object is reported as a memory leak. If @min_count is 0,
+ *	the object is never reported as a leak. If @min_count is -1,
+ *	the object is ignored (not scanned and not reported as a leak)
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel allocators when a new object
+ * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.).
  */
 void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
 			  gfp_t gfp)
@@ -855,9 +869,12 @@ void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count,
 }
 EXPORT_SYMBOL_GPL(kmemleak_alloc);
 
-/*
- * Memory freeing function callback. This function is called from the kernel
- * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
+/**
+ * kmemleak_free - unregister a previously registered object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function is called from the kernel allocators when an object (memory
+ * block) is freed (kmem_cache_free, kfree, vfree etc.).
  */
 void __ref kmemleak_free(const void *ptr)
 {
@@ -870,9 +887,14 @@ void __ref kmemleak_free(const void *ptr)
 }
 EXPORT_SYMBOL_GPL(kmemleak_free);
 
-/*
- * Partial memory freeing function callback. This function is usually called
- * from bootmem allocator when (part of) a memory block is freed.
+/**
+ * kmemleak_free_part - partially unregister a previously registered object
+ * @ptr: pointer to the beginning or inside the object. This also
+ *	represents the start of the range to be freed
+ * @size: size to be unregistered
+ *
+ * This function is called when only a part of a memory block is freed
+ * (usually from the bootmem allocator).
  */
 void __ref kmemleak_free_part(const void *ptr, size_t size)
 {
@@ -885,9 +907,12 @@ void __ref kmemleak_free_part(const void *ptr, size_t size)
 }
 EXPORT_SYMBOL_GPL(kmemleak_free_part);
 
-/*
- * Mark an already allocated memory block as a false positive. This will cause
- * the block to no longer be reported as leak and always be scanned.
+/**
+ * kmemleak_not_leak - mark an allocated object as false positive
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to no longer
+ * be reported as leak and always be scanned.
  */
 void __ref kmemleak_not_leak(const void *ptr)
 {
@@ -900,10 +925,14 @@ void __ref kmemleak_not_leak(const void *ptr)
 }
 EXPORT_SYMBOL(kmemleak_not_leak);
 
-/*
- * Ignore a memory block. This is usually done when it is known that the
- * corresponding block is not a leak and does not contain any references to
- * other allocated memory blocks.
+/**
+ * kmemleak_ignore - ignore an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to be
+ * ignored (not scanned and not reported as a leak). This is usually done when
+ * it is known that the corresponding block is not a leak and does not contain
+ * any references to other allocated memory blocks.
  */
 void __ref kmemleak_ignore(const void *ptr)
 {
@@ -916,8 +945,16 @@ void __ref kmemleak_ignore(const void *ptr)
 }
 EXPORT_SYMBOL(kmemleak_ignore);
 
-/*
- * Limit the range to be scanned in an allocated memory block.
+/**
+ * kmemleak_scan_area - limit the range to be scanned in an allocated object
+ * @ptr: pointer to beginning or inside the object. This also
+ *	represents the start of the scan area
+ * @size: size of the scan area
+ * @gfp: kmalloc() flags used for kmemleak internal memory allocations
+ *
+ * This function is used when it is known that only certain parts of an object
+ * contain references to other objects. Kmemleak will only scan these areas
+ * reducing the number false negatives.
  */
 void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
 {
@@ -930,8 +967,14 @@ void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp)
 }
 EXPORT_SYMBOL(kmemleak_scan_area);
 
-/*
- * Inform kmemleak not to scan the given memory block.
+/**
+ * kmemleak_no_scan - do not scan an allocated object
+ * @ptr: pointer to beginning of the object
+ *
+ * This function notifies kmemleak not to scan the given memory block. Useful
+ * in situations where it is known that the given object does not contain any
+ * references to other objects. Kmemleak will not scan such objects reducing
+ * the number of false negatives.
  */
 void __ref kmemleak_no_scan(const void *ptr)
 {
@@ -1602,7 +1645,9 @@ static int kmemleak_boot_config(char *str)
 		return -EINVAL;
 	if (strcmp(str, "off") == 0)
 		kmemleak_disable();
-	else if (strcmp(str, "on") != 0)
+	else if (strcmp(str, "on") == 0)
+		kmemleak_skip_disable = 1;
+	else
 		return -EINVAL;
 	return 0;
 }
@@ -1616,6 +1661,13 @@ void __init kmemleak_init(void)
 	int i;
 	unsigned long flags;
 
+#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
+	if (!kmemleak_skip_disable) {
+		kmemleak_disable();
+		return;
+	}
+#endif
+
 	jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
 	jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
 
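[Editor's note] Taken together, the kmemleak hunks mean that with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF the checker now disables itself at init unless kmemleak=on was passed on the command line, while kmemleak=off still disables it unconditionally. A rough userspace model of that decision follows; the helper names are invented for the example.

/* Illustrative sketch only: models how the "kmemleak=" boot option and
 * CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF interact after this patch. */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool explicitly_off;	/* kmemleak=off */
static bool skip_disable;	/* kmemleak=on (kmemleak_skip_disable) */

static int parse_boot_option(const char *str)
{
	if (!str)
		return -1;
	if (strcmp(str, "off") == 0)
		explicitly_off = true;
	else if (strcmp(str, "on") == 0)
		skip_disable = true;
	else
		return -1;
	return 0;
}

/* Returns true if scanning stays enabled, mimicking kmemleak_init(). */
static bool kmemleak_enabled(bool default_off)
{
	if (explicitly_off)
		return false;
	if (default_off && !skip_disable)
		return false;	/* DEFAULT_OFF kernels need kmemleak=on */
	return true;
}

int main(void)
{
	parse_boot_option("on");
	printf("enabled: %d\n", kmemleak_enabled(true));
	return 0;
}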
diff --git a/mm/ksm.c b/mm/ksm.c
index 6c3e99b4ae7c..e2ae00458320 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -33,6 +33,7 @@
33#include <linux/mmu_notifier.h> 33#include <linux/mmu_notifier.h>
34#include <linux/swap.h> 34#include <linux/swap.h>
35#include <linux/ksm.h> 35#include <linux/ksm.h>
36#include <linux/hash.h>
36 37
37#include <asm/tlbflush.h> 38#include <asm/tlbflush.h>
38#include "internal.h" 39#include "internal.h"
@@ -153,8 +154,9 @@ struct rmap_item {
153static struct rb_root root_stable_tree = RB_ROOT; 154static struct rb_root root_stable_tree = RB_ROOT;
154static struct rb_root root_unstable_tree = RB_ROOT; 155static struct rb_root root_unstable_tree = RB_ROOT;
155 156
156#define MM_SLOTS_HASH_HEADS 1024 157#define MM_SLOTS_HASH_SHIFT 10
157static struct hlist_head *mm_slots_hash; 158#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT)
159static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS];
158 160
159static struct mm_slot ksm_mm_head = { 161static struct mm_slot ksm_mm_head = {
160 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), 162 .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list),
@@ -269,28 +271,13 @@ static inline void free_mm_slot(struct mm_slot *mm_slot)
269 kmem_cache_free(mm_slot_cache, mm_slot); 271 kmem_cache_free(mm_slot_cache, mm_slot);
270} 272}
271 273
272static int __init mm_slots_hash_init(void)
273{
274 mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head),
275 GFP_KERNEL);
276 if (!mm_slots_hash)
277 return -ENOMEM;
278 return 0;
279}
280
281static void __init mm_slots_hash_free(void)
282{
283 kfree(mm_slots_hash);
284}
285
286static struct mm_slot *get_mm_slot(struct mm_struct *mm) 274static struct mm_slot *get_mm_slot(struct mm_struct *mm)
287{ 275{
288 struct mm_slot *mm_slot; 276 struct mm_slot *mm_slot;
289 struct hlist_head *bucket; 277 struct hlist_head *bucket;
290 struct hlist_node *node; 278 struct hlist_node *node;
291 279
292 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 280 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
293 % MM_SLOTS_HASH_HEADS];
294 hlist_for_each_entry(mm_slot, node, bucket, link) { 281 hlist_for_each_entry(mm_slot, node, bucket, link) {
295 if (mm == mm_slot->mm) 282 if (mm == mm_slot->mm)
296 return mm_slot; 283 return mm_slot;
@@ -303,8 +290,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
303{ 290{
304 struct hlist_head *bucket; 291 struct hlist_head *bucket;
305 292
306 bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) 293 bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)];
307 % MM_SLOTS_HASH_HEADS];
308 mm_slot->mm = mm; 294 mm_slot->mm = mm;
309 hlist_add_head(&mm_slot->link, bucket); 295 hlist_add_head(&mm_slot->link, bucket);
310} 296}
@@ -318,19 +304,14 @@ static void hold_anon_vma(struct rmap_item *rmap_item,
318 struct anon_vma *anon_vma) 304 struct anon_vma *anon_vma)
319{ 305{
320 rmap_item->anon_vma = anon_vma; 306 rmap_item->anon_vma = anon_vma;
321 atomic_inc(&anon_vma->external_refcount); 307 get_anon_vma(anon_vma);
322} 308}
323 309
324static void drop_anon_vma(struct rmap_item *rmap_item) 310static void ksm_drop_anon_vma(struct rmap_item *rmap_item)
325{ 311{
326 struct anon_vma *anon_vma = rmap_item->anon_vma; 312 struct anon_vma *anon_vma = rmap_item->anon_vma;
327 313
328 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { 314 drop_anon_vma(anon_vma);
329 int empty = list_empty(&anon_vma->head);
330 spin_unlock(&anon_vma->lock);
331 if (empty)
332 anon_vma_free(anon_vma);
333 }
334} 315}
335 316
336/* 317/*
@@ -415,7 +396,7 @@ static void break_cow(struct rmap_item *rmap_item)
415 * It is not an accident that whenever we want to break COW 396 * It is not an accident that whenever we want to break COW
416 * to undo, we also need to drop a reference to the anon_vma. 397 * to undo, we also need to drop a reference to the anon_vma.
417 */ 398 */
418 drop_anon_vma(rmap_item); 399 ksm_drop_anon_vma(rmap_item);
419 400
420 down_read(&mm->mmap_sem); 401 down_read(&mm->mmap_sem);
421 if (ksm_test_exit(mm)) 402 if (ksm_test_exit(mm))
@@ -470,7 +451,7 @@ static void remove_node_from_stable_tree(struct stable_node *stable_node)
470 ksm_pages_sharing--; 451 ksm_pages_sharing--;
471 else 452 else
472 ksm_pages_shared--; 453 ksm_pages_shared--;
473 drop_anon_vma(rmap_item); 454 ksm_drop_anon_vma(rmap_item);
474 rmap_item->address &= PAGE_MASK; 455 rmap_item->address &= PAGE_MASK;
475 cond_resched(); 456 cond_resched();
476 } 457 }
@@ -558,7 +539,7 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
558 else 539 else
559 ksm_pages_shared--; 540 ksm_pages_shared--;
560 541
561 drop_anon_vma(rmap_item); 542 ksm_drop_anon_vma(rmap_item);
562 rmap_item->address &= PAGE_MASK; 543 rmap_item->address &= PAGE_MASK;
563 544
564 } else if (rmap_item->address & UNSTABLE_FLAG) { 545 } else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -1566,7 +1547,7 @@ again:
1566 struct anon_vma_chain *vmac; 1547 struct anon_vma_chain *vmac;
1567 struct vm_area_struct *vma; 1548 struct vm_area_struct *vma;
1568 1549
1569 spin_lock(&anon_vma->lock); 1550 anon_vma_lock(anon_vma);
1570 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1551 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1571 vma = vmac->vma; 1552 vma = vmac->vma;
1572 if (rmap_item->address < vma->vm_start || 1553 if (rmap_item->address < vma->vm_start ||
@@ -1589,7 +1570,7 @@ again:
1589 if (!search_new_forks || !mapcount) 1570 if (!search_new_forks || !mapcount)
1590 break; 1571 break;
1591 } 1572 }
1592 spin_unlock(&anon_vma->lock); 1573 anon_vma_unlock(anon_vma);
1593 if (!mapcount) 1574 if (!mapcount)
1594 goto out; 1575 goto out;
1595 } 1576 }
@@ -1619,7 +1600,7 @@ again:
1619 struct anon_vma_chain *vmac; 1600 struct anon_vma_chain *vmac;
1620 struct vm_area_struct *vma; 1601 struct vm_area_struct *vma;
1621 1602
1622 spin_lock(&anon_vma->lock); 1603 anon_vma_lock(anon_vma);
1623 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1604 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1624 vma = vmac->vma; 1605 vma = vmac->vma;
1625 if (rmap_item->address < vma->vm_start || 1606 if (rmap_item->address < vma->vm_start ||
@@ -1637,11 +1618,11 @@ again:
1637 ret = try_to_unmap_one(page, vma, 1618 ret = try_to_unmap_one(page, vma,
1638 rmap_item->address, flags); 1619 rmap_item->address, flags);
1639 if (ret != SWAP_AGAIN || !page_mapped(page)) { 1620 if (ret != SWAP_AGAIN || !page_mapped(page)) {
1640 spin_unlock(&anon_vma->lock); 1621 anon_vma_unlock(anon_vma);
1641 goto out; 1622 goto out;
1642 } 1623 }
1643 } 1624 }
1644 spin_unlock(&anon_vma->lock); 1625 anon_vma_unlock(anon_vma);
1645 } 1626 }
1646 if (!search_new_forks++) 1627 if (!search_new_forks++)
1647 goto again; 1628 goto again;
@@ -1671,7 +1652,7 @@ again:
1671 struct anon_vma_chain *vmac; 1652 struct anon_vma_chain *vmac;
1672 struct vm_area_struct *vma; 1653 struct vm_area_struct *vma;
1673 1654
1674 spin_lock(&anon_vma->lock); 1655 anon_vma_lock(anon_vma);
1675 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { 1656 list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) {
1676 vma = vmac->vma; 1657 vma = vmac->vma;
1677 if (rmap_item->address < vma->vm_start || 1658 if (rmap_item->address < vma->vm_start ||
@@ -1688,11 +1669,11 @@ again:
1688 1669
1689 ret = rmap_one(page, vma, rmap_item->address, arg); 1670 ret = rmap_one(page, vma, rmap_item->address, arg);
1690 if (ret != SWAP_AGAIN) { 1671 if (ret != SWAP_AGAIN) {
1691 spin_unlock(&anon_vma->lock); 1672 anon_vma_unlock(anon_vma);
1692 goto out; 1673 goto out;
1693 } 1674 }
1694 } 1675 }
1695 spin_unlock(&anon_vma->lock); 1676 anon_vma_unlock(anon_vma);
1696 } 1677 }
1697 if (!search_new_forks++) 1678 if (!search_new_forks++)
1698 goto again; 1679 goto again;
@@ -1943,15 +1924,11 @@ static int __init ksm_init(void)
1943 if (err) 1924 if (err)
1944 goto out; 1925 goto out;
1945 1926
1946 err = mm_slots_hash_init();
1947 if (err)
1948 goto out_free1;
1949
1950 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); 1927 ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
1951 if (IS_ERR(ksm_thread)) { 1928 if (IS_ERR(ksm_thread)) {
1952 printk(KERN_ERR "ksm: creating kthread failed\n"); 1929 printk(KERN_ERR "ksm: creating kthread failed\n");
1953 err = PTR_ERR(ksm_thread); 1930 err = PTR_ERR(ksm_thread);
1954 goto out_free2; 1931 goto out_free;
1955 } 1932 }
1956 1933
1957#ifdef CONFIG_SYSFS 1934#ifdef CONFIG_SYSFS
@@ -1959,7 +1936,7 @@ static int __init ksm_init(void)
1959 if (err) { 1936 if (err) {
1960 printk(KERN_ERR "ksm: register sysfs failed\n"); 1937 printk(KERN_ERR "ksm: register sysfs failed\n");
1961 kthread_stop(ksm_thread); 1938 kthread_stop(ksm_thread);
1962 goto out_free2; 1939 goto out_free;
1963 } 1940 }
1964#else 1941#else
1965 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ 1942 ksm_run = KSM_RUN_MERGE; /* no way for user to start it */
@@ -1975,9 +1952,7 @@ static int __init ksm_init(void)
1975#endif 1952#endif
1976 return 0; 1953 return 0;
1977 1954
1978out_free2: 1955out_free:
1979 mm_slots_hash_free();
1980out_free1:
1981 ksm_slab_free(); 1956 ksm_slab_free();
1982out: 1957out:
1983 return err; 1958 return err;
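[Editor's note] The ksm.c change above drops the kzalloc'ed mm_slots_hash that was indexed by a pointer-modulo expression and replaces it with a statically sized table indexed by hash_ptr() from <linux/hash.h>. The sketch below shows why a multiplicative pointer hash into a fixed-size bucket array spreads pool-allocated objects better than a plain modulo; the constant and helper are illustrative and are not the kernel's hash_ptr().

/* Illustrative sketch only: fixed-size bucket selection via a
 * multiplicative pointer hash, in the spirit of the mm_slots_hash change. */
#include <stdint.h>
#include <stdio.h>

#define HASH_SHIFT 10
#define HASH_HEADS (1 << HASH_SHIFT)

/* Multiply by a large odd constant and keep the top bits, which mix far
 * better than "pointer % HASH_HEADS" for objects carved from one pool. */
static unsigned int hash_ptr_bits(const void *p, unsigned int bits)
{
	uint64_t v = (uint64_t)(uintptr_t)p * 0x9e3779b97f4a7c15ULL;

	return (unsigned int)(v >> (64 - bits));
}

int main(void)
{
	int objects[4];

	for (int i = 0; i < 4; i++)
		printf("obj %d -> bucket %u\n", i,
		       hash_ptr_bits(&objects[i], HASH_SHIFT));
	return 0;
}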
diff --git a/mm/memblock.c b/mm/memblock.c
index 3024eb30fc27..43840b305ecb 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -504,7 +504,7 @@ int __init memblock_is_reserved(u64 addr)
 
 int memblock_is_region_reserved(u64 base, u64 size)
 {
-	return memblock_overlaps_region(&memblock.reserved, base, size);
+	return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
}
 
 /*
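[Editor's note] The one-line memblock fix matters because memblock_overlaps_region() returns the index of the first overlapping region, with a negative value meaning no overlap; returning it directly treats a hit in region 0 as "not reserved". A self-contained sketch of the same pitfall follows, using an invented helper rather than the real memblock API.

/* Illustrative sketch only: the find-index-vs-boolean pitfall fixed in
 * memblock_is_region_reserved() above. first_overlap() returns the index
 * of the first overlapping range, or -1 if none overlaps. */
#include <stdio.h>

struct range { unsigned long base, size; };

static long first_overlap(const struct range *r, int n,
			  unsigned long base, unsigned long size)
{
	for (int i = 0; i < n; i++)
		if (base < r[i].base + r[i].size && r[i].base < base + size)
			return i;	/* index 0 is a valid hit! */
	return -1;
}

static int is_reserved(const struct range *r, int n,
		       unsigned long base, unsigned long size)
{
	/* "return first_overlap(...)" would wrongly report 0 (false)
	 * when the overlap happens to be with the first range. */
	return first_overlap(r, n, base, size) >= 0;
}

int main(void)
{
	struct range reserved[] = { { 0, 4096 }, { 65536, 4096 } };

	printf("%d\n", is_reserved(reserved, 2, 0, 512));	/* prints 1 */
	return 0;
}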
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af8..3eed583895a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,10 +47,13 @@
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
+#include <linux/oom.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
 
+#include <trace/events/vmscan.h>
+
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -211,8 +214,6 @@ struct mem_cgroup {
 	 */
 	spinlock_t reclaim_param_lock;
 
-	int	prev_priority;	/* for recording reclaim priority */
-
 	/*
 	 * While reclaiming in a hierarchy, we cache the last child we
 	 * reclaimed from.
@@ -268,6 +269,7 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
+	spinlock_t	lock; /* for from, to, moving_task */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
@@ -276,6 +278,7 @@ static struct move_charge_struct {
 	struct task_struct *moving_task;	/* a task moving charges */
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
+	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
 	int ret;
 	struct mem_cgroup *curr = NULL;
+	struct task_struct *p;
 
-	task_lock(task);
-	rcu_read_lock();
-	curr = try_get_mem_cgroup_from_mm(task->mm);
-	rcu_read_unlock();
-	task_unlock(task);
+	p = find_lock_task_mm(task);
+	if (!p)
+		return 0;
+	curr = try_get_mem_cgroup_from_mm(p->mm);
+	task_unlock(p);
 	if (!curr)
 		return 0;
 	/*
@@ -858,35 +862,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	return ret;
 }
 
-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
-	int prev_priority;
-
-	spin_lock(&mem->reclaim_param_lock);
-	prev_priority = mem->prev_priority;
-	spin_unlock(&mem->reclaim_param_lock);
-
-	return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-	spin_lock(&mem->reclaim_param_lock);
-	if (priority < mem->prev_priority)
-		mem->prev_priority = priority;
-	spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-	spin_lock(&mem->reclaim_param_lock);
-	mem->prev_priority = priority;
-	spin_unlock(&mem->reclaim_param_lock);
-}
-
 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
 {
 	unsigned long active;
@@ -944,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 					struct zone *zone,
 					enum lru_list lru)
 {
-	int nid = zone->zone_pgdat->node_id;
+	int nid = zone_to_nid(zone);
948 int zid = zone_idx(zone); 923 int zid = zone_idx(zone);
949 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 924 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
950 925
@@ -954,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
954struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, 929struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
955 struct zone *zone) 930 struct zone *zone)
956{ 931{
957 int nid = zone->zone_pgdat->node_id; 932 int nid = zone_to_nid(zone);
958 int zid = zone_idx(zone); 933 int zid = zone_idx(zone);
959 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); 934 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
960 935
@@ -999,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
999 LIST_HEAD(pc_list); 974 LIST_HEAD(pc_list);
1000 struct list_head *src; 975 struct list_head *src;
1001 struct page_cgroup *pc, *tmp; 976 struct page_cgroup *pc, *tmp;
1002 int nid = z->zone_pgdat->node_id; 977 int nid = zone_to_nid(z);
1003 int zid = zone_idx(z); 978 int zid = zone_idx(z);
1004 struct mem_cgroup_per_zone *mz; 979 struct mem_cgroup_per_zone *mz;
1005 int lru = LRU_FILE * file + active; 980 int lru = LRU_FILE * file + active;
@@ -1038,6 +1013,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
1038 } 1013 }
1039 1014
1040 *scanned = scan; 1015 *scanned = scan;
1016
1017 trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
1018 0, 0, 0, mode);
1019
1041 return nr_taken; 1020 return nr_taken;
1042} 1021}
1043 1022
@@ -1072,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
1072 return swappiness; 1051 return swappiness;
1073} 1052}
1074 1053
1054/* A routine for testing mem is not under move_account */
1055
1056static bool mem_cgroup_under_move(struct mem_cgroup *mem)
1057{
1058 struct mem_cgroup *from;
1059 struct mem_cgroup *to;
1060 bool ret = false;
1061 /*
1062 * Unlike task_move routines, we access mc.to, mc.from not under
1063 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1064 */
1065 spin_lock(&mc.lock);
1066 from = mc.from;
1067 to = mc.to;
1068 if (!from)
1069 goto unlock;
1070 if (from == mem || to == mem
1071 || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
1072 || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
1073 ret = true;
1074unlock:
1075 spin_unlock(&mc.lock);
1076 return ret;
1077}
1078
1079static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
1080{
1081 if (mc.moving_task && current != mc.moving_task) {
1082 if (mem_cgroup_under_move(mem)) {
1083 DEFINE_WAIT(wait);
1084 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1085 /* moving charge context might have finished. */
1086 if (mc.moving_task)
1087 schedule();
1088 finish_wait(&mc.waitq, &wait);
1089 return true;
1090 }
1091 }
1092 return false;
1093}
1094
1075static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) 1095static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
1076{ 1096{
1077 int *val = data; 1097 int *val = data;
@@ -1158,6 +1178,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
1158} 1178}
1159 1179
1160/* 1180/*
1181 * Return the memory (and swap, if configured) limit for a memcg.
1182 */
1183u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
1184{
1185 u64 limit;
1186 u64 memsw;
1187
1188 limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
1189 total_swap_pages;
1190 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1191 /*
1192 * If memsw is finite and limits the amount of swap space available
1193 * to this memcg, return that limit.
1194 */
1195 return min(limit, memsw);
1196}
1197
1198/*
1161 * Visit the first child (need not be the first child as per the ordering 1199 * Visit the first child (need not be the first child as per the ordering
1162 * of the cgroup list, since we track last_scanned_child) of @mem and use 1200 * of the cgroup list, since we track last_scanned_child) of @mem and use
1163 * that to reclaim free pages from. 1201 * that to reclaim free pages from.
@@ -1262,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
1262 /* we use swappiness of local cgroup */ 1300 /* we use swappiness of local cgroup */
1263 if (check_soft) 1301 if (check_soft)
1264 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, 1302 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1265 noswap, get_swappiness(victim), zone, 1303 noswap, get_swappiness(victim), zone);
1266 zone->zone_pgdat->node_id);
1267 else 1304 else
1268 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, 1305 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1269 noswap, get_swappiness(victim)); 1306 noswap, get_swappiness(victim));
@@ -1370,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
1370 1407
1371static void memcg_oom_recover(struct mem_cgroup *mem) 1408static void memcg_oom_recover(struct mem_cgroup *mem)
1372{ 1409{
1373 if (atomic_read(&mem->oom_lock)) 1410 if (mem && atomic_read(&mem->oom_lock))
1374 memcg_wakeup_oom(mem); 1411 memcg_wakeup_oom(mem);
1375} 1412}
1376 1413
@@ -1582,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
1582 return NOTIFY_OK; 1619 return NOTIFY_OK;
1583} 1620}
1584 1621
1622
1623/* See __mem_cgroup_try_charge() for details */
1624enum {
1625 CHARGE_OK, /* success */
1626 CHARGE_RETRY, /* need to retry but retry is not bad */
1627 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
1628 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
1629 CHARGE_OOM_DIE, /* the current is killed because of OOM */
1630};
1631
1632static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
1633 int csize, bool oom_check)
1634{
1635 struct mem_cgroup *mem_over_limit;
1636 struct res_counter *fail_res;
1637 unsigned long flags = 0;
1638 int ret;
1639
1640 ret = res_counter_charge(&mem->res, csize, &fail_res);
1641
1642 if (likely(!ret)) {
1643 if (!do_swap_account)
1644 return CHARGE_OK;
1645 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1646 if (likely(!ret))
1647 return CHARGE_OK;
1648
1649 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
1650 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1651 } else
1652 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
1653
1654 if (csize > PAGE_SIZE) /* change csize and retry */
1655 return CHARGE_RETRY;
1656
1657 if (!(gfp_mask & __GFP_WAIT))
1658 return CHARGE_WOULDBLOCK;
1659
1660 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1661 gfp_mask, flags);
1662 /*
1663 * try_to_free_mem_cgroup_pages() might not give us a full
1664 * picture of reclaim. Some pages are reclaimed and might be
1665 * moved to swap cache or just unmapped from the cgroup.
1666 * Check the limit again to see if the reclaim reduced the
1667 * current usage of the cgroup before giving up
1668 */
1669 if (ret || mem_cgroup_check_under_limit(mem_over_limit))
1670 return CHARGE_RETRY;
1671
1672 /*
1673 * At task move, charge accounts can be doubly counted. So, it's
1674 * better to wait until the end of task_move if something is going on.
1675 */
1676 if (mem_cgroup_wait_acct_move(mem_over_limit))
1677 return CHARGE_RETRY;
1678
 1679	/* If we don't need to call oom-killer at all, return immediately */
1680 if (!oom_check)
1681 return CHARGE_NOMEM;
1682 /* check OOM */
1683 if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
1684 return CHARGE_OOM_DIE;
1685
1686 return CHARGE_RETRY;
1687}
1688
1585/* 1689/*
1586 * Unlike exported interface, "oom" parameter is added. if oom==true, 1690 * Unlike exported interface, "oom" parameter is added. if oom==true,
1587 * oom-killer can be invoked. 1691 * oom-killer can be invoked.
1588 */ 1692 */
1589static int __mem_cgroup_try_charge(struct mm_struct *mm, 1693static int __mem_cgroup_try_charge(struct mm_struct *mm,
1590 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) 1694 gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
1591{ 1695{
1592 struct mem_cgroup *mem, *mem_over_limit; 1696 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1593 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1697 struct mem_cgroup *mem = NULL;
1594 struct res_counter *fail_res; 1698 int ret;
1595 int csize = CHARGE_SIZE; 1699 int csize = CHARGE_SIZE;
1596 1700
1597 /* 1701 /*
@@ -1609,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1609 * thread group leader migrates. It's possible that mm is not 1713 * thread group leader migrates. It's possible that mm is not
1610 * set, if so charge the init_mm (happens for pagecache usage). 1714 * set, if so charge the init_mm (happens for pagecache usage).
1611 */ 1715 */
1612 mem = *memcg; 1716 if (!*memcg && !mm)
1613 if (likely(!mem)) { 1717 goto bypass;
1614 mem = try_get_mem_cgroup_from_mm(mm); 1718again:
1615 *memcg = mem; 1719 if (*memcg) { /* css should be a valid one */
1616 } else { 1720 mem = *memcg;
1617 css_get(&mem->css); 1721 VM_BUG_ON(css_is_removed(&mem->css));
1618 } 1722 if (mem_cgroup_is_root(mem))
1619 if (unlikely(!mem)) 1723 goto done;
1620 return 0;
1621
1622 VM_BUG_ON(css_is_removed(&mem->css));
1623 if (mem_cgroup_is_root(mem))
1624 goto done;
1625
1626 while (1) {
1627 int ret = 0;
1628 unsigned long flags = 0;
1629
1630 if (consume_stock(mem)) 1724 if (consume_stock(mem))
1631 goto done; 1725 goto done;
1726 css_get(&mem->css);
1727 } else {
1728 struct task_struct *p;
1632 1729
1633 ret = res_counter_charge(&mem->res, csize, &fail_res); 1730 rcu_read_lock();
1634 if (likely(!ret)) { 1731 p = rcu_dereference(mm->owner);
1635 if (!do_swap_account) 1732 VM_BUG_ON(!p);
1636 break;
1637 ret = res_counter_charge(&mem->memsw, csize, &fail_res);
1638 if (likely(!ret))
1639 break;
1640 /* mem+swap counter fails */
1641 res_counter_uncharge(&mem->res, csize);
1642 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1643 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1644 memsw);
1645 } else
1646 /* mem counter fails */
1647 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1648 res);
1649
1650 /* reduce request size and retry */
1651 if (csize > PAGE_SIZE) {
1652 csize = PAGE_SIZE;
1653 continue;
1654 }
1655 if (!(gfp_mask & __GFP_WAIT))
1656 goto nomem;
1657
1658 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1659 gfp_mask, flags);
1660 if (ret)
1661 continue;
1662
1663 /* 1733 /*
1664 * try_to_free_mem_cgroup_pages() might not give us a full 1734 * because we don't have task_lock(), "p" can exit while
1665 * picture of reclaim. Some pages are reclaimed and might be 1735 * we're here. In that case, "mem" can point to root
1666 * moved to swap cache or just unmapped from the cgroup. 1736 * cgroup but never be NULL. (and task_struct itself is freed
1667 * Check the limit again to see if the reclaim reduced the 1737 * by RCU, cgroup itself is RCU safe.) Then, we have small
1668 * current usage of the cgroup before giving up 1738 * risk here to get wrong cgroup. But such kind of mis-account
1669 * 1739 * by race always happens because we don't have cgroup_mutex().
1740 * It's overkill and we allow that small race, here.
1670 */ 1741 */
1671 if (mem_cgroup_check_under_limit(mem_over_limit)) 1742 mem = mem_cgroup_from_task(p);
1672 continue; 1743 VM_BUG_ON(!mem);
1673 1744 if (mem_cgroup_is_root(mem)) {
1674 /* try to avoid oom while someone is moving charge */ 1745 rcu_read_unlock();
1675 if (mc.moving_task && current != mc.moving_task) { 1746 goto done;
1676 struct mem_cgroup *from, *to; 1747 }
1677 bool do_continue = false; 1748 if (consume_stock(mem)) {
1678 /* 1749 /*
1679 * There is a small race that "from" or "to" can be 1750 * It seems dagerous to access memcg without css_get().
1680 * freed by rmdir, so we use css_tryget(). 1751 * But considering how consume_stok works, it's not
 1752	 * necessary. If consume_stock succeeds, some charges
1753 * from this memcg are cached on this cpu. So, we
1754 * don't need to call css_get()/css_tryget() before
1755 * calling consume_stock().
1681 */ 1756 */
1682 from = mc.from; 1757 rcu_read_unlock();
1683 to = mc.to; 1758 goto done;
1684 if (from && css_tryget(&from->css)) { 1759 }
1685 if (mem_over_limit->use_hierarchy) 1760 /* after here, we may be blocked. we need to get refcnt */
1686 do_continue = css_is_ancestor( 1761 if (!css_tryget(&mem->css)) {
1687 &from->css, 1762 rcu_read_unlock();
1688 &mem_over_limit->css); 1763 goto again;
1689 else 1764 }
1690 do_continue = (from == mem_over_limit); 1765 rcu_read_unlock();
1691 css_put(&from->css); 1766 }
1692 } 1767
1693 if (!do_continue && to && css_tryget(&to->css)) { 1768 do {
1694 if (mem_over_limit->use_hierarchy) 1769 bool oom_check;
1695 do_continue = css_is_ancestor( 1770
1696 &to->css, 1771 /* If killed, bypass charge */
1697 &mem_over_limit->css); 1772 if (fatal_signal_pending(current)) {
1698 else 1773 css_put(&mem->css);
1699 do_continue = (to == mem_over_limit); 1774 goto bypass;
1700 css_put(&to->css); 1775 }
1701 } 1776
1702 if (do_continue) { 1777 oom_check = false;
1703 DEFINE_WAIT(wait); 1778 if (oom && !nr_oom_retries) {
1704 prepare_to_wait(&mc.waitq, &wait, 1779 oom_check = true;
1705 TASK_INTERRUPTIBLE); 1780 nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
1706 /* moving charge context might have finished. */
1707 if (mc.moving_task)
1708 schedule();
1709 finish_wait(&mc.waitq, &wait);
1710 continue;
1711 }
1712 } 1781 }
1713 1782
1714 if (!nr_retries--) { 1783 ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
1715 if (!oom) 1784
1785 switch (ret) {
1786 case CHARGE_OK:
1787 break;
1788 case CHARGE_RETRY: /* not in OOM situation but retry */
1789 csize = PAGE_SIZE;
1790 css_put(&mem->css);
1791 mem = NULL;
1792 goto again;
1793 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
1794 css_put(&mem->css);
1795 goto nomem;
1796 case CHARGE_NOMEM: /* OOM routine works */
1797 if (!oom) {
1798 css_put(&mem->css);
1716 goto nomem; 1799 goto nomem;
1717 if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
1718 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1719 continue;
1720 } 1800 }
1721 /* When we reach here, current task is dying .*/ 1801 /* If oom, we never return -ENOMEM */
1802 nr_oom_retries--;
1803 break;
1804 case CHARGE_OOM_DIE: /* Killed by OOM Killer */
1722 css_put(&mem->css); 1805 css_put(&mem->css);
1723 goto bypass; 1806 goto bypass;
1724 } 1807 }
1725 } 1808 } while (ret != CHARGE_OK);
1809
1726 if (csize > PAGE_SIZE) 1810 if (csize > PAGE_SIZE)
1727 refill_stock(mem, csize - PAGE_SIZE); 1811 refill_stock(mem, csize - PAGE_SIZE);
1812 css_put(&mem->css);
1728done: 1813done:
1814 *memcg = mem;
1729 return 0; 1815 return 0;
1730nomem: 1816nomem:
1731 css_put(&mem->css); 1817 *memcg = NULL;
1732 return -ENOMEM; 1818 return -ENOMEM;
1733bypass: 1819bypass:
1734 *memcg = NULL; 1820 *memcg = NULL;
@@ -1747,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
1747 res_counter_uncharge(&mem->res, PAGE_SIZE * count); 1833 res_counter_uncharge(&mem->res, PAGE_SIZE * count);
1748 if (do_swap_account) 1834 if (do_swap_account)
1749 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); 1835 res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
1750 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
1751 WARN_ON_ONCE(count > INT_MAX);
1752 __css_put(&mem->css, (int)count);
1753 } 1836 }
1754 /* we don't need css_put for root */
1755} 1837}
1756 1838
1757static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) 1839static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1979,10 +2061,9 @@ out:
1979 * < 0 if the cgroup is over its limit 2061 * < 0 if the cgroup is over its limit
1980 */ 2062 */
1981static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 2063static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1982 gfp_t gfp_mask, enum charge_type ctype, 2064 gfp_t gfp_mask, enum charge_type ctype)
1983 struct mem_cgroup *memcg)
1984{ 2065{
1985 struct mem_cgroup *mem; 2066 struct mem_cgroup *mem = NULL;
1986 struct page_cgroup *pc; 2067 struct page_cgroup *pc;
1987 int ret; 2068 int ret;
1988 2069
@@ -1992,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1992 return 0; 2073 return 0;
1993 prefetchw(pc); 2074 prefetchw(pc);
1994 2075
1995 mem = memcg;
1996 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); 2076 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1997 if (ret || !mem) 2077 if (ret || !mem)
1998 return ret; 2078 return ret;
@@ -2020,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
2020 if (unlikely(!mm)) 2100 if (unlikely(!mm))
2021 mm = &init_mm; 2101 mm = &init_mm;
2022 return mem_cgroup_charge_common(page, mm, gfp_mask, 2102 return mem_cgroup_charge_common(page, mm, gfp_mask,
2023 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 2103 MEM_CGROUP_CHARGE_TYPE_MAPPED);
2024} 2104}
2025 2105
2026static void 2106static void
@@ -2030,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
2030int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 2110int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2031 gfp_t gfp_mask) 2111 gfp_t gfp_mask)
2032{ 2112{
2033 struct mem_cgroup *mem = NULL;
2034 int ret; 2113 int ret;
2035 2114
2036 if (mem_cgroup_disabled()) 2115 if (mem_cgroup_disabled())
@@ -2051,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2051 if (!(gfp_mask & __GFP_WAIT)) { 2130 if (!(gfp_mask & __GFP_WAIT)) {
2052 struct page_cgroup *pc; 2131 struct page_cgroup *pc;
2053 2132
2054
2055 pc = lookup_page_cgroup(page); 2133 pc = lookup_page_cgroup(page);
2056 if (!pc) 2134 if (!pc)
2057 return 0; 2135 return 0;
@@ -2063,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
2063 unlock_page_cgroup(pc); 2141 unlock_page_cgroup(pc);
2064 } 2142 }
2065 2143
2066 if (unlikely(!mm && !mem)) 2144 if (unlikely(!mm))
2067 mm = &init_mm; 2145 mm = &init_mm;
2068 2146
2069 if (page_is_file_cache(page)) 2147 if (page_is_file_cache(page))
2070 return mem_cgroup_charge_common(page, mm, gfp_mask, 2148 return mem_cgroup_charge_common(page, mm, gfp_mask,
2071 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 2149 MEM_CGROUP_CHARGE_TYPE_CACHE);
2072 2150
2073 /* shmem */ 2151 /* shmem */
2074 if (PageSwapCache(page)) { 2152 if (PageSwapCache(page)) {
2153 struct mem_cgroup *mem = NULL;
2154
2075 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); 2155 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
2076 if (!ret) 2156 if (!ret)
2077 __mem_cgroup_commit_charge_swapin(page, mem, 2157 __mem_cgroup_commit_charge_swapin(page, mem,
2078 MEM_CGROUP_CHARGE_TYPE_SHMEM); 2158 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2079 } else 2159 } else
2080 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 2160 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
2081 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 2161 MEM_CGROUP_CHARGE_TYPE_SHMEM);
2082 2162
2083 return ret; 2163 return ret;
2084} 2164}
@@ -2114,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
2114 goto charge_cur_mm; 2194 goto charge_cur_mm;
2115 *ptr = mem; 2195 *ptr = mem;
2116 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); 2196 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
2117 /* drop extra refcnt from tryget */
2118 css_put(&mem->css); 2197 css_put(&mem->css);
2119 return ret; 2198 return ret;
2120charge_cur_mm: 2199charge_cur_mm:
@@ -2245,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2245{ 2324{
2246 struct page_cgroup *pc; 2325 struct page_cgroup *pc;
2247 struct mem_cgroup *mem = NULL; 2326 struct mem_cgroup *mem = NULL;
2248 struct mem_cgroup_per_zone *mz;
2249 2327
2250 if (mem_cgroup_disabled()) 2328 if (mem_cgroup_disabled())
2251 return NULL; 2329 return NULL;
@@ -2285,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2285 break; 2363 break;
2286 } 2364 }
2287 2365
2288 if (!mem_cgroup_is_root(mem))
2289 __do_uncharge(mem, ctype);
2290 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
2291 mem_cgroup_swap_statistics(mem, true);
2292 mem_cgroup_charge_statistics(mem, pc, false); 2366 mem_cgroup_charge_statistics(mem, pc, false);
2293 2367
2294 ClearPageCgroupUsed(pc); 2368 ClearPageCgroupUsed(pc);
@@ -2299,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
2299 * special functions. 2373 * special functions.
2300 */ 2374 */
2301 2375
2302 mz = page_cgroup_zoneinfo(pc);
2303 unlock_page_cgroup(pc); 2376 unlock_page_cgroup(pc);
2304 2377 /*
2378 * even after unlock, we have mem->res.usage here and this memcg
2379 * will never be freed.
2380 */
2305 memcg_check_events(mem, page); 2381 memcg_check_events(mem, page);
2306 /* at swapout, this memcg will be accessed to record to swap */ 2382 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
2307 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 2383 mem_cgroup_swap_statistics(mem, true);
2308 css_put(&mem->css); 2384 mem_cgroup_get(mem);
2385 }
2386 if (!mem_cgroup_is_root(mem))
2387 __do_uncharge(mem, ctype);
2309 2388
2310 return mem; 2389 return mem;
2311 2390
@@ -2392,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
2392 2471
2393 memcg = __mem_cgroup_uncharge_common(page, ctype); 2472 memcg = __mem_cgroup_uncharge_common(page, ctype);
2394 2473
2395 /* record memcg information */ 2474 /*
2396 if (do_swap_account && swapout && memcg) { 2475 * record memcg information, if swapout && memcg != NULL,
2476 * mem_cgroup_get() was called in uncharge().
2477 */
2478 if (do_swap_account && swapout && memcg)
2397 swap_cgroup_record(ent, css_id(&memcg->css)); 2479 swap_cgroup_record(ent, css_id(&memcg->css));
2398 mem_cgroup_get(memcg);
2399 }
2400 if (swapout && memcg)
2401 css_put(&memcg->css);
2402} 2480}
2403#endif 2481#endif
2404 2482
@@ -2476,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
2476 */ 2554 */
2477 if (!mem_cgroup_is_root(to)) 2555 if (!mem_cgroup_is_root(to))
2478 res_counter_uncharge(&to->res, PAGE_SIZE); 2556 res_counter_uncharge(&to->res, PAGE_SIZE);
2479 css_put(&to->css);
2480 } 2557 }
2481 return 0; 2558 return 0;
2482 } 2559 }
@@ -2611,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
2611 ClearPageCgroupMigration(pc); 2688 ClearPageCgroupMigration(pc);
2612 unlock_page_cgroup(pc); 2689 unlock_page_cgroup(pc);
2613 2690
2614 if (unused != oldpage)
2615 pc = lookup_page_cgroup(unused);
2616 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); 2691 __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
2617 2692
2618 pc = lookup_page_cgroup(used);
2619 /* 2693 /*
2620 * If a page is a file cache, radix-tree replacement is very atomic 2694 * If a page is a file cache, radix-tree replacement is very atomic
2621 * and we can skip this check. When it was an Anon page, its mapcount 2695 * and we can skip this check. When it was an Anon page, its mapcount
@@ -2791,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2791} 2865}
2792 2866
2793unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 2867unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2794 gfp_t gfp_mask, int nid, 2868 gfp_t gfp_mask)
2795 int zid)
2796{ 2869{
2797 unsigned long nr_reclaimed = 0; 2870 unsigned long nr_reclaimed = 0;
2798 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 2871 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2804,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2804 if (order > 0) 2877 if (order > 0)
2805 return 0; 2878 return 0;
2806 2879
2807 mctz = soft_limit_tree_node_zone(nid, zid); 2880 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
2808 /* 2881 /*
2809 * This loop can run a while, specially if mem_cgroup's continuously 2882 * This loop can run a while, specially if mem_cgroup's continuously
2810 * keep exceeding their soft limit and putting the system under 2883 * keep exceeding their soft limit and putting the system under
@@ -3759,8 +3832,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
3759 return 0; 3832 return 0;
3760} 3833}
3761 3834
3762/*
3763 */
3764static int mem_cgroup_oom_control_write(struct cgroup *cgrp, 3835static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
3765 struct cftype *cft, u64 val) 3836 struct cftype *cft, u64 val)
3766{ 3837{
@@ -4180,9 +4251,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
4180 goto one_by_one; 4251 goto one_by_one;
4181 } 4252 }
4182 mc.precharge += count; 4253 mc.precharge += count;
4183 VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
4184 WARN_ON_ONCE(count > INT_MAX);
4185 __css_get(&mem->css, (int)count);
4186 return ret; 4254 return ret;
4187 } 4255 }
4188one_by_one: 4256one_by_one:
@@ -4400,11 +4468,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4400 4468
4401static void mem_cgroup_clear_mc(void) 4469static void mem_cgroup_clear_mc(void)
4402{ 4470{
4471 struct mem_cgroup *from = mc.from;
4472 struct mem_cgroup *to = mc.to;
4473
4403 /* we must uncharge all the leftover precharges from mc.to */ 4474 /* we must uncharge all the leftover precharges from mc.to */
4404 if (mc.precharge) { 4475 if (mc.precharge) {
4405 __mem_cgroup_cancel_charge(mc.to, mc.precharge); 4476 __mem_cgroup_cancel_charge(mc.to, mc.precharge);
4406 mc.precharge = 0; 4477 mc.precharge = 0;
4407 memcg_oom_recover(mc.to);
4408 } 4478 }
4409 /* 4479 /*
4410 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so 4480 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4413,11 +4483,9 @@ static void mem_cgroup_clear_mc(void)
4413 if (mc.moved_charge) { 4483 if (mc.moved_charge) {
4414 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); 4484 __mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
4415 mc.moved_charge = 0; 4485 mc.moved_charge = 0;
4416 memcg_oom_recover(mc.from);
4417 } 4486 }
4418 /* we must fixup refcnts and charges */ 4487 /* we must fixup refcnts and charges */
4419 if (mc.moved_swap) { 4488 if (mc.moved_swap) {
4420 WARN_ON_ONCE(mc.moved_swap > INT_MAX);
4421 /* uncharge swap account from the old cgroup */ 4489 /* uncharge swap account from the old cgroup */
4422 if (!mem_cgroup_is_root(mc.from)) 4490 if (!mem_cgroup_is_root(mc.from))
4423 res_counter_uncharge(&mc.from->memsw, 4491 res_counter_uncharge(&mc.from->memsw,
@@ -4431,16 +4499,18 @@ static void mem_cgroup_clear_mc(void)
4431 */ 4499 */
4432 res_counter_uncharge(&mc.to->res, 4500 res_counter_uncharge(&mc.to->res,
4433 PAGE_SIZE * mc.moved_swap); 4501 PAGE_SIZE * mc.moved_swap);
4434 VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
4435 __css_put(&mc.to->css, mc.moved_swap);
4436 } 4502 }
4437 /* we've already done mem_cgroup_get(mc.to) */ 4503 /* we've already done mem_cgroup_get(mc.to) */
4438 4504
4439 mc.moved_swap = 0; 4505 mc.moved_swap = 0;
4440 } 4506 }
4507 spin_lock(&mc.lock);
4441 mc.from = NULL; 4508 mc.from = NULL;
4442 mc.to = NULL; 4509 mc.to = NULL;
4443 mc.moving_task = NULL; 4510 mc.moving_task = NULL;
4511 spin_unlock(&mc.lock);
4512 memcg_oom_recover(from);
4513 memcg_oom_recover(to);
4444 wake_up_all(&mc.waitq); 4514 wake_up_all(&mc.waitq);
4445} 4515}
4446 4516
@@ -4469,12 +4539,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
4469 VM_BUG_ON(mc.moved_charge); 4539 VM_BUG_ON(mc.moved_charge);
4470 VM_BUG_ON(mc.moved_swap); 4540 VM_BUG_ON(mc.moved_swap);
4471 VM_BUG_ON(mc.moving_task); 4541 VM_BUG_ON(mc.moving_task);
4542 spin_lock(&mc.lock);
4472 mc.from = from; 4543 mc.from = from;
4473 mc.to = mem; 4544 mc.to = mem;
4474 mc.precharge = 0; 4545 mc.precharge = 0;
4475 mc.moved_charge = 0; 4546 mc.moved_charge = 0;
4476 mc.moved_swap = 0; 4547 mc.moved_swap = 0;
4477 mc.moving_task = current; 4548 mc.moving_task = current;
4549 spin_unlock(&mc.lock);
4478 4550
4479 ret = mem_cgroup_precharge_mc(mm); 4551 ret = mem_cgroup_precharge_mc(mm);
4480 if (ret) 4552 if (ret)
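
The biggest memcontrol.c change above factors the old open-coded retry loop into __mem_cgroup_do_charge(), which reports one of the CHARGE_* results while the caller decides whether to retry, fail, or fall back to the OOM path. A compressed userspace sketch of that result-code plus driver-loop shape, with hypothetical names and none of the real memcg machinery:

#include <stdio.h>

enum charge_result {
	CHARGE_OK,          /* success */
	CHARGE_RETRY,       /* retry is reasonable */
	CHARGE_NOMEM,       /* give up with -ENOMEM */
	CHARGE_WOULDBLOCK,  /* caller may not sleep */
};

/* Hypothetical single charge attempt: succeed once some budget is freed. */
static enum charge_result do_charge(int *budget, int size, int can_sleep)
{
	if (*budget >= size) {
		*budget -= size;
		return CHARGE_OK;
	}
	if (!can_sleep)
		return CHARGE_WOULDBLOCK;
	*budget += 1;       /* pretend reclaim freed a little */
	return CHARGE_RETRY;
}

/* Driver loop: retries bounded, each failure mode handled in one place. */
static int try_charge(int *budget, int size, int can_sleep)
{
	int retries = 5;

	do {
		switch (do_charge(budget, size, can_sleep)) {
		case CHARGE_OK:
			return 0;
		case CHARGE_RETRY:
			if (retries-- > 0)
				continue;
			return -12; /* treated like -ENOMEM */
		case CHARGE_WOULDBLOCK:
		case CHARGE_NOMEM:
			return -12;
		}
	} while (1);
}

int main(void)
{
	int budget = 2;

	printf("charge: %d (budget left %d)\n",
	       try_charge(&budget, 4, 1), budget);
	return 0;
}

Keeping the per-attempt outcome as an enum and the retry policy in the caller is what lets the diff drop the deeply nested while(1) body without changing behaviour.
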
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6b44e52cacaa..9c26eeca1342 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -46,6 +46,7 @@
46#include <linux/suspend.h> 46#include <linux/suspend.h>
47#include <linux/slab.h> 47#include <linux/slab.h>
48#include <linux/swapops.h> 48#include <linux/swapops.h>
49#include <linux/hugetlb.h>
49#include "internal.h" 50#include "internal.h"
50 51
51int sysctl_memory_failure_early_kill __read_mostly = 0; 52int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
690/* 691/*
691 * Huge pages. Needs work. 692 * Huge pages. Needs work.
692 * Issues: 693 * Issues:
693 * No rmap support so we cannot find the original mapper. In theory could walk 694 * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
694 * all MMs and look for the mappings, but that would be non atomic and racy. 695 * To narrow down kill region to one page, we need to break up pmd.
695 * Need rmap for hugepages for this. Alternatively we could employ a heuristic, 696 * - To support soft-offlining for hugepage, we need to support hugepage
696 * like just walking the current process and hoping it has it mapped (that 697 * migration.
697 * should be usually true for the common "shared database cache" case)
698 * Should handle free huge pages and dequeue them too, but this needs to
699 * handle huge page accounting correctly.
700 */ 698 */
701static int me_huge_page(struct page *p, unsigned long pfn) 699static int me_huge_page(struct page *p, unsigned long pfn)
702{ 700{
703 return FAILED; 701 struct page *hpage = compound_head(p);
702 /*
703 * We can safely recover from error on free or reserved (i.e.
704 * not in-use) hugepage by dequeuing it from freelist.
705 * To check whether a hugepage is in-use or not, we can't use
706 * page->lru because it can be used in other hugepage operations,
707 * such as __unmap_hugepage_range() and gather_surplus_pages().
708 * So instead we use page_mapping() and PageAnon().
709 * We assume that this function is called with page lock held,
710 * so there is no race between isolation and mapping/unmapping.
711 */
712 if (!(page_mapping(hpage) || PageAnon(hpage))) {
713 __isolate_hwpoisoned_huge_page(hpage);
714 return RECOVERED;
715 }
716 return DELAYED;
704} 717}
705 718
706/* 719/*
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
838 int ret; 851 int ret;
839 int i; 852 int i;
840 int kill = 1; 853 int kill = 1;
854 struct page *hpage = compound_head(p);
841 855
842 if (PageReserved(p) || PageSlab(p)) 856 if (PageReserved(p) || PageSlab(p))
843 return SWAP_SUCCESS; 857 return SWAP_SUCCESS;
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
846 * This check implies we don't kill processes if their pages 860 * This check implies we don't kill processes if their pages
847 * are in the swap cache early. Those are always late kills. 861 * are in the swap cache early. Those are always late kills.
848 */ 862 */
849 if (!page_mapped(p)) 863 if (!page_mapped(hpage))
850 return SWAP_SUCCESS; 864 return SWAP_SUCCESS;
851 865
852 if (PageCompound(p) || PageKsm(p)) 866 if (PageKsm(p))
853 return SWAP_FAIL; 867 return SWAP_FAIL;
854 868
855 if (PageSwapCache(p)) { 869 if (PageSwapCache(p)) {
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
864 * XXX: the dirty test could be racy: set_page_dirty() may not always 878 * XXX: the dirty test could be racy: set_page_dirty() may not always
865 * be called inside page lock (it's recommended but not enforced). 879 * be called inside page lock (it's recommended but not enforced).
866 */ 880 */
867 mapping = page_mapping(p); 881 mapping = page_mapping(hpage);
868 if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { 882 if (!PageDirty(hpage) && mapping &&
869 if (page_mkclean(p)) { 883 mapping_cap_writeback_dirty(mapping)) {
870 SetPageDirty(p); 884 if (page_mkclean(hpage)) {
885 SetPageDirty(hpage);
871 } else { 886 } else {
872 kill = 0; 887 kill = 0;
873 ttu |= TTU_IGNORE_HWPOISON; 888 ttu |= TTU_IGNORE_HWPOISON;
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
886 * there's nothing that can be done. 901 * there's nothing that can be done.
887 */ 902 */
888 if (kill) 903 if (kill)
889 collect_procs(p, &tokill); 904 collect_procs(hpage, &tokill);
890 905
891 /* 906 /*
892 * try_to_unmap can fail temporarily due to races. 907 * try_to_unmap can fail temporarily due to races.
893 * Try a few times (RED-PEN better strategy?) 908 * Try a few times (RED-PEN better strategy?)
894 */ 909 */
895 for (i = 0; i < N_UNMAP_TRIES; i++) { 910 for (i = 0; i < N_UNMAP_TRIES; i++) {
896 ret = try_to_unmap(p, ttu); 911 ret = try_to_unmap(hpage, ttu);
897 if (ret == SWAP_SUCCESS) 912 if (ret == SWAP_SUCCESS)
898 break; 913 break;
899 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); 914 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
901 916
902 if (ret != SWAP_SUCCESS) 917 if (ret != SWAP_SUCCESS)
903 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", 918 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
904 pfn, page_mapcount(p)); 919 pfn, page_mapcount(hpage));
905 920
906 /* 921 /*
907 * Now that the dirty bit has been propagated to the 922 * Now that the dirty bit has been propagated to the
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
912 * use a more force-full uncatchable kill to prevent 927 * use a more force-full uncatchable kill to prevent
913 * any accesses to the poisoned memory. 928 * any accesses to the poisoned memory.
914 */ 929 */
915 kill_procs_ao(&tokill, !!PageDirty(p), trapno, 930 kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
916 ret != SWAP_SUCCESS, pfn); 931 ret != SWAP_SUCCESS, pfn);
917 932
918 return ret; 933 return ret;
919} 934}
920 935
936static void set_page_hwpoison_huge_page(struct page *hpage)
937{
938 int i;
939 int nr_pages = 1 << compound_order(hpage);
940 for (i = 0; i < nr_pages; i++)
941 SetPageHWPoison(hpage + i);
942}
943
944static void clear_page_hwpoison_huge_page(struct page *hpage)
945{
946 int i;
947 int nr_pages = 1 << compound_order(hpage);
948 for (i = 0; i < nr_pages; i++)
949 ClearPageHWPoison(hpage + i);
950}
951
921int __memory_failure(unsigned long pfn, int trapno, int flags) 952int __memory_failure(unsigned long pfn, int trapno, int flags)
922{ 953{
923 struct page_state *ps; 954 struct page_state *ps;
924 struct page *p; 955 struct page *p;
956 struct page *hpage;
925 int res; 957 int res;
958 unsigned int nr_pages;
926 959
927 if (!sysctl_memory_failure_recovery) 960 if (!sysctl_memory_failure_recovery)
928 panic("Memory failure from trap %d on page %lx", trapno, pfn); 961 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -935,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
935 } 968 }
936 969
937 p = pfn_to_page(pfn); 970 p = pfn_to_page(pfn);
971 hpage = compound_head(p);
938 if (TestSetPageHWPoison(p)) { 972 if (TestSetPageHWPoison(p)) {
939 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); 973 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
940 return 0; 974 return 0;
941 } 975 }
942 976
943 atomic_long_add(1, &mce_bad_pages); 977 nr_pages = 1 << compound_order(hpage);
978 atomic_long_add(nr_pages, &mce_bad_pages);
944 979
945 /* 980 /*
946 * We need/can do nothing about count=0 pages. 981 * We need/can do nothing about count=0 pages.
@@ -954,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
954 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. 989 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
955 */ 990 */
956 if (!(flags & MF_COUNT_INCREASED) && 991 if (!(flags & MF_COUNT_INCREASED) &&
957 !get_page_unless_zero(compound_head(p))) { 992 !get_page_unless_zero(hpage)) {
958 if (is_free_buddy_page(p)) { 993 if (is_free_buddy_page(p)) {
959 action_result(pfn, "free buddy", DELAYED); 994 action_result(pfn, "free buddy", DELAYED);
960 return 0; 995 return 0;
@@ -972,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
972 * The check (unnecessarily) ignores LRU pages being isolated and 1007 * The check (unnecessarily) ignores LRU pages being isolated and
973 * walked by the page reclaim code, however that's not a big loss. 1008 * walked by the page reclaim code, however that's not a big loss.
974 */ 1009 */
975 if (!PageLRU(p)) 1010 if (!PageLRU(p) && !PageHuge(p))
976 shake_page(p, 0); 1011 shake_page(p, 0);
977 if (!PageLRU(p)) { 1012 if (!PageLRU(p) && !PageHuge(p)) {
978 /* 1013 /*
979 * shake_page could have turned it free. 1014 * shake_page could have turned it free.
980 */ 1015 */
@@ -992,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
992 * It's very difficult to mess with pages currently under IO 1027 * It's very difficult to mess with pages currently under IO
993 * and in many cases impossible, so we just avoid it here. 1028 * and in many cases impossible, so we just avoid it here.
994 */ 1029 */
995 lock_page_nosync(p); 1030 lock_page_nosync(hpage);
996 1031
997 /* 1032 /*
998 * unpoison always clear PG_hwpoison inside page lock 1033 * unpoison always clear PG_hwpoison inside page lock
@@ -1004,11 +1039,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1004 } 1039 }
1005 if (hwpoison_filter(p)) { 1040 if (hwpoison_filter(p)) {
1006 if (TestClearPageHWPoison(p)) 1041 if (TestClearPageHWPoison(p))
1007 atomic_long_dec(&mce_bad_pages); 1042 atomic_long_sub(nr_pages, &mce_bad_pages);
1008 unlock_page(p); 1043 unlock_page(hpage);
1009 put_page(p); 1044 put_page(hpage);
1045 return 0;
1046 }
1047
1048 /*
1049 * For error on the tail page, we should set PG_hwpoison
1050 * on the head page to show that the hugepage is hwpoisoned
1051 */
1052 if (PageTail(p) && TestSetPageHWPoison(hpage)) {
1053 action_result(pfn, "hugepage already hardware poisoned",
1054 IGNORED);
1055 unlock_page(hpage);
1056 put_page(hpage);
1010 return 0; 1057 return 0;
1011 } 1058 }
1059 /*
1060 * Set PG_hwpoison on all pages in an error hugepage,
1061 * because containment is done in hugepage unit for now.
1062 * Since we have done TestSetPageHWPoison() for the head page with
1063 * page lock held, we can safely set PG_hwpoison bits on tail pages.
1064 */
1065 if (PageHuge(p))
1066 set_page_hwpoison_huge_page(hpage);
1012 1067
1013 wait_on_page_writeback(p); 1068 wait_on_page_writeback(p);
1014 1069
@@ -1039,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
1039 } 1094 }
1040 } 1095 }
1041out: 1096out:
1042 unlock_page(p); 1097 unlock_page(hpage);
1043 return res; 1098 return res;
1044} 1099}
1045EXPORT_SYMBOL_GPL(__memory_failure); 1100EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1083,6 +1138,7 @@ int unpoison_memory(unsigned long pfn)
1083 struct page *page; 1138 struct page *page;
1084 struct page *p; 1139 struct page *p;
1085 int freeit = 0; 1140 int freeit = 0;
1141 unsigned int nr_pages;
1086 1142
1087 if (!pfn_valid(pfn)) 1143 if (!pfn_valid(pfn))
1088 return -ENXIO; 1144 return -ENXIO;
@@ -1095,9 +1151,11 @@ int unpoison_memory(unsigned long pfn)
1095 return 0; 1151 return 0;
1096 } 1152 }
1097 1153
1154 nr_pages = 1 << compound_order(page);
1155
1098 if (!get_page_unless_zero(page)) { 1156 if (!get_page_unless_zero(page)) {
1099 if (TestClearPageHWPoison(p)) 1157 if (TestClearPageHWPoison(p))
1100 atomic_long_dec(&mce_bad_pages); 1158 atomic_long_sub(nr_pages, &mce_bad_pages);
1101 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); 1159 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
1102 return 0; 1160 return 0;
1103 } 1161 }
@@ -1109,11 +1167,13 @@ int unpoison_memory(unsigned long pfn)
1109 * the PG_hwpoison page will be caught and isolated on the entrance to 1167 * the PG_hwpoison page will be caught and isolated on the entrance to
1110 * the free buddy page pool. 1168 * the free buddy page pool.
1111 */ 1169 */
1112 if (TestClearPageHWPoison(p)) { 1170 if (TestClearPageHWPoison(page)) {
1113 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); 1171 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
1114 atomic_long_dec(&mce_bad_pages); 1172 atomic_long_sub(nr_pages, &mce_bad_pages);
1115 freeit = 1; 1173 freeit = 1;
1116 } 1174 }
1175 if (PageHuge(p))
1176 clear_page_hwpoison_huge_page(page);
1117 unlock_page(page); 1177 unlock_page(page);
1118 1178
1119 put_page(page); 1179 put_page(page);
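
Several memory-failure.c hunks switch from single-page to hugepage-wide bookkeeping: the number of affected subpages is derived from compound_order() and every tail page is marked alongside the head. A toy sketch of that per-subpage accounting over a plain array, with a hypothetical flag in place of PG_hwpoison and a plain counter in place of mce_bad_pages:

#include <stdio.h>

#define MAX_PAGES 16

/* Hypothetical per-page poison flag standing in for PG_hwpoison. */
static int hwpoison[MAX_PAGES];
static long bad_pages;          /* analogue of mce_bad_pages */

/* Mark every subpage of a compound page of the given order. */
static void poison_huge_page(int head, unsigned int order)
{
	int nr_pages = 1 << order;
	int i;

	for (i = 0; i < nr_pages; i++)
		hwpoison[head + i] = 1;
	bad_pages += nr_pages;  /* account all subpages, not just one */
}

static void unpoison_huge_page(int head, unsigned int order)
{
	int nr_pages = 1 << order;
	int i;

	for (i = 0; i < nr_pages; i++)
		hwpoison[head + i] = 0;
	bad_pages -= nr_pages;
}

int main(void)
{
	poison_huge_page(0, 2);   /* a 4-subpage "hugepage" at index 0 */
	printf("bad_pages after poison: %ld\n", bad_pages);
	unpoison_huge_page(0, 2);
	printf("bad_pages after unpoison: %ld\n", bad_pages);
	return 0;
}
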
diff --git a/mm/memory.c b/mm/memory.c
index bde42c6d3633..2ed2267439df 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -307,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb,
307{ 307{
308 pgd_t *pgd; 308 pgd_t *pgd;
309 unsigned long next; 309 unsigned long next;
310 unsigned long start;
311 310
312 /* 311 /*
313 * The next few lines have given us lots of grief... 312 * The next few lines have given us lots of grief...
@@ -351,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb,
351 if (addr > end - 1) 350 if (addr > end - 1)
352 return; 351 return;
353 352
354 start = addr;
355 pgd = pgd_offset(tlb->mm, addr); 353 pgd = pgd_offset(tlb->mm, addr);
356 do { 354 do {
357 next = pgd_addr_end(addr, end); 355 next = pgd_addr_end(addr, end);
@@ -2008,11 +2006,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2008{ 2006{
2009 pgd_t *pgd; 2007 pgd_t *pgd;
2010 unsigned long next; 2008 unsigned long next;
2011 unsigned long start = addr, end = addr + size; 2009 unsigned long end = addr + size;
2012 int err; 2010 int err;
2013 2011
2014 BUG_ON(addr >= end); 2012 BUG_ON(addr >= end);
2015 mmu_notifier_invalidate_range_start(mm, start, end);
2016 pgd = pgd_offset(mm, addr); 2013 pgd = pgd_offset(mm, addr);
2017 do { 2014 do {
2018 next = pgd_addr_end(addr, end); 2015 next = pgd_addr_end(addr, end);
@@ -2020,7 +2017,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
2020 if (err) 2017 if (err)
2021 break; 2018 break;
2022 } while (pgd++, addr = next, addr != end); 2019 } while (pgd++, addr = next, addr != end);
2023 mmu_notifier_invalidate_range_end(mm, start, end); 2020
2024 return err; 2021 return err;
2025} 2022}
2026EXPORT_SYMBOL_GPL(apply_to_page_range); 2023EXPORT_SYMBOL_GPL(apply_to_page_range);
@@ -2630,6 +2627,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2630 swp_entry_t entry; 2627 swp_entry_t entry;
2631 pte_t pte; 2628 pte_t pte;
2632 struct mem_cgroup *ptr = NULL; 2629 struct mem_cgroup *ptr = NULL;
2630 int exclusive = 0;
2633 int ret = 0; 2631 int ret = 0;
2634 2632
2635 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2633 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2724,10 +2722,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2724 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { 2722 if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
2725 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2723 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2726 flags &= ~FAULT_FLAG_WRITE; 2724 flags &= ~FAULT_FLAG_WRITE;
2725 ret |= VM_FAULT_WRITE;
2726 exclusive = 1;
2727 } 2727 }
2728 flush_icache_page(vma, page); 2728 flush_icache_page(vma, page);
2729 set_pte_at(mm, address, page_table, pte); 2729 set_pte_at(mm, address, page_table, pte);
2730 page_add_anon_rmap(page, vma, address); 2730 do_page_add_anon_rmap(page, vma, address, exclusive);
2731 /* It's better to call commit-charge after rmap is established */ 2731 /* It's better to call commit-charge after rmap is established */
2732 mem_cgroup_commit_charge_swapin(page, ptr); 2732 mem_cgroup_commit_charge_swapin(page, ptr);
2733 2733
@@ -2760,6 +2760,33 @@ out_release:
2760} 2760}
2761 2761
2762/* 2762/*
2763 * This is like a special single-page "expand_downwards()",
2764 * except we must first make sure that 'address-PAGE_SIZE'
2765 * doesn't hit another vma.
2766 *
2767 * The "find_vma()" will do the right thing even if we wrap
2768 */
2769static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
2770{
2771 address &= PAGE_MASK;
2772 if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
2773 struct vm_area_struct *prev = vma->vm_prev;
2774
2775 /*
2776 * Is there a mapping abutting this one below?
2777 *
2778 * That's only ok if it's the same stack mapping
2779 * that has gotten split..
2780 */
2781 if (prev && prev->vm_end == address)
2782 return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
2783
2784 expand_stack(vma, address - PAGE_SIZE);
2785 }
2786 return 0;
2787}
2788
2789/*
2763 * We enter with non-exclusive mmap_sem (to exclude vma changes, 2790 * We enter with non-exclusive mmap_sem (to exclude vma changes,
2764 * but allow concurrent faults), and pte mapped but not yet locked. 2791 * but allow concurrent faults), and pte mapped but not yet locked.
2765 * We return with mmap_sem still held, but pte unmapped and unlocked. 2792 * We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2772,19 +2799,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2772 spinlock_t *ptl; 2799 spinlock_t *ptl;
2773 pte_t entry; 2800 pte_t entry;
2774 2801
2802 pte_unmap(page_table);
2803
2804 /* Check if we need to add a guard page to the stack */
2805 if (check_stack_guard_page(vma, address) < 0)
2806 return VM_FAULT_SIGBUS;
2807
2808 /* Use the zero-page for reads */
2775 if (!(flags & FAULT_FLAG_WRITE)) { 2809 if (!(flags & FAULT_FLAG_WRITE)) {
2776 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), 2810 entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
2777 vma->vm_page_prot)); 2811 vma->vm_page_prot));
2778 ptl = pte_lockptr(mm, pmd); 2812 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2779 spin_lock(ptl);
2780 if (!pte_none(*page_table)) 2813 if (!pte_none(*page_table))
2781 goto unlock; 2814 goto unlock;
2782 goto setpte; 2815 goto setpte;
2783 } 2816 }
2784 2817
2785 /* Allocate our own private page. */ 2818 /* Allocate our own private page. */
2786 pte_unmap(page_table);
2787
2788 if (unlikely(anon_vma_prepare(vma))) 2819 if (unlikely(anon_vma_prepare(vma)))
2789 goto oom; 2820 goto oom;
2790 page = alloc_zeroed_user_highpage_movable(vma, address); 2821 page = alloc_zeroed_user_highpage_movable(vma, address);
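
The new check_stack_guard_page() above keeps the bottom page of a VM_GROWSDOWN vma as a guard page unless the mapping directly below it is the same stack that was split. A simplified userspace model of that address test, using a hypothetical mini_vma struct instead of vm_area_struct:

#include <stdio.h>

#define PAGE_SIZE    4096UL
#define PAGE_MASK    (~(PAGE_SIZE - 1))
#define VM_GROWSDOWN 0x1

/* Hypothetical miniature vma: just the fields the check needs. */
struct mini_vma {
	unsigned long start, end, flags;
	struct mini_vma *prev;
};

/* Returns 1 if faulting at addr would land on the stack guard page and
 * the mapping below is unrelated, so the fault should be refused. */
static int hits_guard_page(struct mini_vma *vma, unsigned long addr)
{
	addr &= PAGE_MASK;
	if (!(vma->flags & VM_GROWSDOWN) || addr != vma->start)
		return 0;
	/* A directly abutting mapping is only ok if it is stack too. */
	if (vma->prev && vma->prev->end == addr)
		return !(vma->prev->flags & VM_GROWSDOWN);
	return 0;   /* no neighbour: the stack could simply expand */
}

int main(void)
{
	struct mini_vma below = { 0x10000, 0x20000, 0, NULL };
	struct mini_vma stack = { 0x20000, 0x30000, VM_GROWSDOWN, &below };

	printf("fault at stack bottom hits guard page: %d\n",
	       hits_guard_page(&stack, 0x20000));
	return 0;
}

In the kernel version the non-refused case also expands the stack by one page, which is why do_anonymous_page() can call it before touching the pte.
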
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5bc0a96beb51..f969da5dd8a2 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1275,33 +1275,42 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1275 const unsigned long __user *, new_nodes) 1275 const unsigned long __user *, new_nodes)
1276{ 1276{
1277 const struct cred *cred = current_cred(), *tcred; 1277 const struct cred *cred = current_cred(), *tcred;
1278 struct mm_struct *mm; 1278 struct mm_struct *mm = NULL;
1279 struct task_struct *task; 1279 struct task_struct *task;
1280 nodemask_t old;
1281 nodemask_t new;
1282 nodemask_t task_nodes; 1280 nodemask_t task_nodes;
1283 int err; 1281 int err;
1282 nodemask_t *old;
1283 nodemask_t *new;
1284 NODEMASK_SCRATCH(scratch);
1285
1286 if (!scratch)
1287 return -ENOMEM;
1288
1289 old = &scratch->mask1;
1290 new = &scratch->mask2;
1284 1291
1285 err = get_nodes(&old, old_nodes, maxnode); 1292 err = get_nodes(old, old_nodes, maxnode);
1286 if (err) 1293 if (err)
1287 return err; 1294 goto out;
1288 1295
1289 err = get_nodes(&new, new_nodes, maxnode); 1296 err = get_nodes(new, new_nodes, maxnode);
1290 if (err) 1297 if (err)
1291 return err; 1298 goto out;
1292 1299
1293 /* Find the mm_struct */ 1300 /* Find the mm_struct */
1294 read_lock(&tasklist_lock); 1301 read_lock(&tasklist_lock);
1295 task = pid ? find_task_by_vpid(pid) : current; 1302 task = pid ? find_task_by_vpid(pid) : current;
1296 if (!task) { 1303 if (!task) {
1297 read_unlock(&tasklist_lock); 1304 read_unlock(&tasklist_lock);
1298 return -ESRCH; 1305 err = -ESRCH;
1306 goto out;
1299 } 1307 }
1300 mm = get_task_mm(task); 1308 mm = get_task_mm(task);
1301 read_unlock(&tasklist_lock); 1309 read_unlock(&tasklist_lock);
1302 1310
1311 err = -EINVAL;
1303 if (!mm) 1312 if (!mm)
1304 return -EINVAL; 1313 goto out;
1305 1314
1306 /* 1315 /*
1307 * Check if this process has the right to modify the specified 1316 * Check if this process has the right to modify the specified
@@ -1322,12 +1331,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1322 1331
1323 task_nodes = cpuset_mems_allowed(task); 1332 task_nodes = cpuset_mems_allowed(task);
1324 /* Is the user allowed to access the target nodes? */ 1333 /* Is the user allowed to access the target nodes? */
1325 if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { 1334 if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) {
1326 err = -EPERM; 1335 err = -EPERM;
1327 goto out; 1336 goto out;
1328 } 1337 }
1329 1338
1330 if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { 1339 if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) {
1331 err = -EINVAL; 1340 err = -EINVAL;
1332 goto out; 1341 goto out;
1333 } 1342 }
@@ -1336,10 +1345,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1336 if (err) 1345 if (err)
1337 goto out; 1346 goto out;
1338 1347
1339 err = do_migrate_pages(mm, &old, &new, 1348 err = do_migrate_pages(mm, old, new,
1340 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); 1349 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1341out: 1350out:
1342 mmput(mm); 1351 if (mm)
1352 mmput(mm);
1353 NODEMASK_SCRATCH_FREE(scratch);
1354
1343 return err; 1355 return err;
1344} 1356}
1345 1357
@@ -1712,6 +1724,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
1712} 1724}
1713#endif 1725#endif
1714 1726
1727/*
1728 * mempolicy_nodemask_intersects
1729 *
1730 * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
1731 * policy. Otherwise, check for intersection between mask and the policy
 1732 * nodemask for 'bind' or 'interleave' policy. For 'preferred' or 'local'
1733 * policy, always return true since it may allocate elsewhere on fallback.
1734 *
1735 * Takes task_lock(tsk) to prevent freeing of its mempolicy.
1736 */
1737bool mempolicy_nodemask_intersects(struct task_struct *tsk,
1738 const nodemask_t *mask)
1739{
1740 struct mempolicy *mempolicy;
1741 bool ret = true;
1742
1743 if (!mask)
1744 return ret;
1745 task_lock(tsk);
1746 mempolicy = tsk->mempolicy;
1747 if (!mempolicy)
1748 goto out;
1749
1750 switch (mempolicy->mode) {
1751 case MPOL_PREFERRED:
1752 /*
1753 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
1754 * allocate from, they may fallback to other nodes when oom.
1755 * Thus, it's possible for tsk to have allocated memory from
1756 * nodes in mask.
1757 */
1758 break;
1759 case MPOL_BIND:
1760 case MPOL_INTERLEAVE:
1761 ret = nodes_intersects(mempolicy->v.nodes, *mask);
1762 break;
1763 default:
1764 BUG();
1765 }
1766out:
1767 task_unlock(tsk);
1768 return ret;
1769}
1770
1715/* Allocate a page in interleaved policy. 1771/* Allocate a page in interleaved policy.
1716 Own path because it needs to do special accounting. */ 1772 Own path because it needs to do special accounting. */
1717static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, 1773static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
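
The sys_migrate_pages() rework above stops keeping two nodemask_t values on the kernel stack and works through a heap-allocated NODEMASK_SCRATCH instead, released on a single exit path. The same shape in plain C, with a hypothetical scratch struct and a stub in place of get_nodes():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-ins: a "nodemask" too big to keep on the stack. */
typedef struct { unsigned long bits[64]; } nodemask_t;

struct nodemask_scratch {
	nodemask_t mask1;
	nodemask_t mask2;
};

static int copy_mask_from_user(nodemask_t *dst)  /* pretend get_nodes() */
{
	memset(dst, 0, sizeof(*dst));
	return 0;
}

static int migrate_pages_example(void)
{
	struct nodemask_scratch *scratch = malloc(sizeof(*scratch));
	nodemask_t *old, *new;
	int err;

	if (!scratch)
		return -12;     /* treated like -ENOMEM */
	old = &scratch->mask1;
	new = &scratch->mask2;

	err = copy_mask_from_user(old);
	if (err)
		goto out;
	err = copy_mask_from_user(new);
	if (err)
		goto out;

	/* ... this is where do_migrate_pages(mm, old, new, ...) would run ... */
	err = 0;
out:
	free(scratch);          /* one exit path releases the scratch */
	return err;
}

int main(void)
{
	printf("migrate_pages_example: %d\n", migrate_pages_example());
	return 0;
}
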
diff --git a/mm/migrate.c b/mm/migrate.c
index 4205b1d6049e..38e7cad782f4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -639,7 +639,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
639 * exist when the page is remapped later 639 * exist when the page is remapped later
640 */ 640 */
641 anon_vma = page_anon_vma(page); 641 anon_vma = page_anon_vma(page);
642 atomic_inc(&anon_vma->external_refcount); 642 get_anon_vma(anon_vma);
643 } 643 }
644 } 644 }
645 645
@@ -682,12 +682,8 @@ skip_unmap:
682rcu_unlock: 682rcu_unlock:
683 683
684 /* Drop an anon_vma reference if we took one */ 684 /* Drop an anon_vma reference if we took one */
685 if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->lock)) { 685 if (anon_vma)
686 int empty = list_empty(&anon_vma->head); 686 drop_anon_vma(anon_vma);
687 spin_unlock(&anon_vma->lock);
688 if (empty)
689 anon_vma_free(anon_vma);
690 }
691 687
692 if (rcu_locked) 688 if (rcu_locked)
693 rcu_read_unlock(); 689 rcu_read_unlock();
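
The migrate.c hunk replaces the open-coded atomic_dec_and_lock()/list_empty()/free sequence with get_anon_vma()/drop_anon_vma(). A minimal pthread sketch of the underlying idea, where the holder of the last reference frees the object and decides so under the lock, using a hypothetical refcounted struct rather than anon_vma:

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

/* Hypothetical refcounted object standing in for anon_vma. */
struct ref_obj {
	pthread_mutex_t lock;
	int refcount;
	int empty;      /* analogue of list_empty(&anon_vma->head) */
};

static void get_obj(struct ref_obj *obj)
{
	pthread_mutex_lock(&obj->lock);
	obj->refcount++;
	pthread_mutex_unlock(&obj->lock);
}

/* Drop a reference; the dropper of the last reference frees the object,
 * and the "is it really unused" check happens while the lock is held. */
static void drop_obj(struct ref_obj *obj)
{
	int free_it;

	pthread_mutex_lock(&obj->lock);
	free_it = (--obj->refcount == 0) && obj->empty;
	pthread_mutex_unlock(&obj->lock);
	if (free_it)
		free(obj);
}

int main(void)
{
	struct ref_obj *obj = calloc(1, sizeof(*obj));

	pthread_mutex_init(&obj->lock, NULL);
	obj->empty = 1;
	get_obj(obj);   /* migration path takes a reference ... */
	drop_obj(obj);  /* ... and drops it when the copy is done */
	printf("done\n");
	return 0;
}
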
diff --git a/mm/mlock.c b/mm/mlock.c
index 3f82720e0515..cbae7c5b9568 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,6 +135,19 @@ void munlock_vma_page(struct page *page)
135 } 135 }
136} 136}
137 137
138/* Is the vma a continuation of the stack vma above it? */
139static inline int vma_stack_continue(struct vm_area_struct *vma, unsigned long addr)
140{
141 return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
142}
143
144static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
145{
146 return (vma->vm_flags & VM_GROWSDOWN) &&
147 (vma->vm_start == addr) &&
148 !vma_stack_continue(vma->vm_prev, addr);
149}
150
138/** 151/**
139 * __mlock_vma_pages_range() - mlock a range of pages in the vma. 152 * __mlock_vma_pages_range() - mlock a range of pages in the vma.
140 * @vma: target vma 153 * @vma: target vma
@@ -167,6 +180,12 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
167 if (vma->vm_flags & VM_WRITE) 180 if (vma->vm_flags & VM_WRITE)
168 gup_flags |= FOLL_WRITE; 181 gup_flags |= FOLL_WRITE;
169 182
183 /* We don't try to access the guard page of a stack vma */
184 if (stack_guard_page(vma, start)) {
185 addr += PAGE_SIZE;
186 nr_pages--;
187 }
188
170 while (nr_pages > 0) { 189 while (nr_pages > 0) {
171 int i; 190 int i;
172 191
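The two mlock.c hunks above cooperate: the guard page at the low end of a VM_GROWSDOWN stack vma must never be faulted in by mlock(). A compressed restatement of that check as one helper, assuming start is page-aligned and nr_pages counts whole pages (the helper name is illustrative only):

/* Illustrative only: skip the guard page of a grows-down stack vma, unless
 * this "vma" is really a continuation of the stack vma directly above it
 * (its vm_prev ends exactly where this one starts). */
static void mlock_skip_stack_guard(struct vm_area_struct *vma,
				   unsigned long *addr, long *nr_pages)
{
	struct vm_area_struct *prev = vma->vm_prev;

	if ((vma->vm_flags & VM_GROWSDOWN) && vma->vm_start == *addr &&
	    !(prev && prev->vm_end == *addr && (prev->vm_flags & VM_GROWSDOWN))) {
		*addr += PAGE_SIZE;	/* step over the guard page */
		*nr_pages -= 1;		/* and request one page fewer */
	}
}

Note that vma_stack_continue() relies on the vm_prev pointer introduced by the mm/mmap.c changes below.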
diff --git a/mm/mmap.c b/mm/mmap.c
index e38e910cb756..331e51af38c9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -388,17 +388,23 @@ static inline void
388__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, 388__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
389 struct vm_area_struct *prev, struct rb_node *rb_parent) 389 struct vm_area_struct *prev, struct rb_node *rb_parent)
390{ 390{
391 struct vm_area_struct *next;
392
393 vma->vm_prev = prev;
391 if (prev) { 394 if (prev) {
392 vma->vm_next = prev->vm_next; 395 next = prev->vm_next;
393 prev->vm_next = vma; 396 prev->vm_next = vma;
394 } else { 397 } else {
395 mm->mmap = vma; 398 mm->mmap = vma;
396 if (rb_parent) 399 if (rb_parent)
397 vma->vm_next = rb_entry(rb_parent, 400 next = rb_entry(rb_parent,
398 struct vm_area_struct, vm_rb); 401 struct vm_area_struct, vm_rb);
399 else 402 else
400 vma->vm_next = NULL; 403 next = NULL;
401 } 404 }
405 vma->vm_next = next;
406 if (next)
407 next->vm_prev = vma;
402} 408}
403 409
404void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, 410void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
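The point of threading vm_prev through __vma_link_list() here (and through __vma_unlink(), detach_vmas_to_be_unmapped() and nommu's add_vma_to_mm() below) is that the per-mm vma list becomes doubly linked, so a vma's predecessor is reachable in O(1). A minimal sketch of code that can rely on the new invariant; the helper name is hypothetical:

/* Hypothetical helper: size of the unmapped gap directly below 'vma'.
 * Only valid because vm_prev is now kept consistent by __vma_link_list(),
 * __vma_unlink() and detach_vmas_to_be_unmapped(). */
static unsigned long gap_below(struct vm_area_struct *vma)
{
	struct vm_area_struct *prev = vma->vm_prev;

	return prev ? vma->vm_start - prev->vm_end : vma->vm_start;
}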
@@ -452,12 +458,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
452 spin_lock(&mapping->i_mmap_lock); 458 spin_lock(&mapping->i_mmap_lock);
453 vma->vm_truncate_count = mapping->truncate_count; 459 vma->vm_truncate_count = mapping->truncate_count;
454 } 460 }
455 anon_vma_lock(vma);
456 461
457 __vma_link(mm, vma, prev, rb_link, rb_parent); 462 __vma_link(mm, vma, prev, rb_link, rb_parent);
458 __vma_link_file(vma); 463 __vma_link_file(vma);
459 464
460 anon_vma_unlock(vma);
461 if (mapping) 465 if (mapping)
462 spin_unlock(&mapping->i_mmap_lock); 466 spin_unlock(&mapping->i_mmap_lock);
463 467
@@ -485,7 +489,11 @@ static inline void
485__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, 489__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
486 struct vm_area_struct *prev) 490 struct vm_area_struct *prev)
487{ 491{
488 prev->vm_next = vma->vm_next; 492 struct vm_area_struct *next = vma->vm_next;
493
494 prev->vm_next = next;
495 if (next)
496 next->vm_prev = prev;
489 rb_erase(&vma->vm_rb, &mm->mm_rb); 497 rb_erase(&vma->vm_rb, &mm->mm_rb);
490 if (mm->mmap_cache == vma) 498 if (mm->mmap_cache == vma)
491 mm->mmap_cache = prev; 499 mm->mmap_cache = prev;
@@ -506,6 +514,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
506 struct vm_area_struct *importer = NULL; 514 struct vm_area_struct *importer = NULL;
507 struct address_space *mapping = NULL; 515 struct address_space *mapping = NULL;
508 struct prio_tree_root *root = NULL; 516 struct prio_tree_root *root = NULL;
517 struct anon_vma *anon_vma = NULL;
509 struct file *file = vma->vm_file; 518 struct file *file = vma->vm_file;
510 long adjust_next = 0; 519 long adjust_next = 0;
511 int remove_next = 0; 520 int remove_next = 0;
@@ -578,6 +587,17 @@ again: remove_next = 1 + (end > next->vm_end);
578 } 587 }
579 } 588 }
580 589
590 /*
591 * When changing only vma->vm_end, we don't really need anon_vma
592 * lock. This is a fairly rare case by itself, but the anon_vma
593 * lock may be shared between many sibling processes. Skipping
594 * the lock for brk adjustments makes a difference sometimes.
595 */
596 if (vma->anon_vma && (insert || importer || start != vma->vm_start)) {
597 anon_vma = vma->anon_vma;
598 anon_vma_lock(anon_vma);
599 }
600
581 if (root) { 601 if (root) {
582 flush_dcache_mmap_lock(mapping); 602 flush_dcache_mmap_lock(mapping);
583 vma_prio_tree_remove(vma, root); 603 vma_prio_tree_remove(vma, root);
@@ -617,6 +637,8 @@ again: remove_next = 1 + (end > next->vm_end);
617 __insert_vm_struct(mm, insert); 637 __insert_vm_struct(mm, insert);
618 } 638 }
619 639
640 if (anon_vma)
641 anon_vma_unlock(anon_vma);
620 if (mapping) 642 if (mapping)
621 spin_unlock(&mapping->i_mmap_lock); 643 spin_unlock(&mapping->i_mmap_lock);
622 644
@@ -1710,7 +1732,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1710 */ 1732 */
1711 if (unlikely(anon_vma_prepare(vma))) 1733 if (unlikely(anon_vma_prepare(vma)))
1712 return -ENOMEM; 1734 return -ENOMEM;
1713 anon_vma_lock(vma); 1735 vma_lock_anon_vma(vma);
1714 1736
1715 /* 1737 /*
1716 * vma->vm_start/vm_end cannot change under us because the caller 1738 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1721,7 +1743,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1721 if (address < PAGE_ALIGN(address+4)) 1743 if (address < PAGE_ALIGN(address+4))
1722 address = PAGE_ALIGN(address+4); 1744 address = PAGE_ALIGN(address+4);
1723 else { 1745 else {
1724 anon_vma_unlock(vma); 1746 vma_unlock_anon_vma(vma);
1725 return -ENOMEM; 1747 return -ENOMEM;
1726 } 1748 }
1727 error = 0; 1749 error = 0;
@@ -1739,7 +1761,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1739 perf_event_mmap(vma); 1761 perf_event_mmap(vma);
1740 } 1762 }
1741 } 1763 }
1742 anon_vma_unlock(vma); 1764 vma_unlock_anon_vma(vma);
1743 return error; 1765 return error;
1744} 1766}
1745#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ 1767#endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -1764,7 +1786,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1764 if (error) 1786 if (error)
1765 return error; 1787 return error;
1766 1788
1767 anon_vma_lock(vma); 1789 vma_lock_anon_vma(vma);
1768 1790
1769 /* 1791 /*
1770 * vma->vm_start/vm_end cannot change under us because the caller 1792 * vma->vm_start/vm_end cannot change under us because the caller
@@ -1786,7 +1808,7 @@ static int expand_downwards(struct vm_area_struct *vma,
1786 perf_event_mmap(vma); 1808 perf_event_mmap(vma);
1787 } 1809 }
1788 } 1810 }
1789 anon_vma_unlock(vma); 1811 vma_unlock_anon_vma(vma);
1790 return error; 1812 return error;
1791} 1813}
1792 1814
@@ -1903,6 +1925,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1903 unsigned long addr; 1925 unsigned long addr;
1904 1926
1905 insertion_point = (prev ? &prev->vm_next : &mm->mmap); 1927 insertion_point = (prev ? &prev->vm_next : &mm->mmap);
1928 vma->vm_prev = NULL;
1906 do { 1929 do {
1907 rb_erase(&vma->vm_rb, &mm->mm_rb); 1930 rb_erase(&vma->vm_rb, &mm->mm_rb);
1908 mm->map_count--; 1931 mm->map_count--;
@@ -1910,6 +1933,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
1910 vma = vma->vm_next; 1933 vma = vma->vm_next;
1911 } while (vma && vma->vm_start < end); 1934 } while (vma && vma->vm_start < end);
1912 *insertion_point = vma; 1935 *insertion_point = vma;
1936 if (vma)
1937 vma->vm_prev = prev;
1913 tail_vma->vm_next = NULL; 1938 tail_vma->vm_next = NULL;
1914 if (mm->unmap_area == arch_unmap_area) 1939 if (mm->unmap_area == arch_unmap_area)
1915 addr = prev ? prev->vm_end : mm->mmap_base; 1940 addr = prev ? prev->vm_end : mm->mmap_base;
@@ -2470,23 +2495,23 @@ static DEFINE_MUTEX(mm_all_locks_mutex);
2470 2495
2471static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) 2496static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
2472{ 2497{
2473 if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { 2498 if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
2474 /* 2499 /*
2475 * The LSB of head.next can't change from under us 2500 * The LSB of head.next can't change from under us
2476 * because we hold the mm_all_locks_mutex. 2501 * because we hold the mm_all_locks_mutex.
2477 */ 2502 */
2478 spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); 2503 spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem);
2479 /* 2504 /*
2480 * We can safely modify head.next after taking the 2505 * We can safely modify head.next after taking the
2481 * anon_vma->lock. If some other vma in this mm shares 2506 * anon_vma->root->lock. If some other vma in this mm shares
2482 * the same anon_vma we won't take it again. 2507 * the same anon_vma we won't take it again.
2483 * 2508 *
2484 * No need of atomic instructions here, head.next 2509 * No need of atomic instructions here, head.next
2485 * can't change from under us thanks to the 2510 * can't change from under us thanks to the
2486 * anon_vma->lock. 2511 * anon_vma->root->lock.
2487 */ 2512 */
2488 if (__test_and_set_bit(0, (unsigned long *) 2513 if (__test_and_set_bit(0, (unsigned long *)
2489 &anon_vma->head.next)) 2514 &anon_vma->root->head.next))
2490 BUG(); 2515 BUG();
2491 } 2516 }
2492} 2517}
@@ -2577,7 +2602,7 @@ out_unlock:
2577 2602
2578static void vm_unlock_anon_vma(struct anon_vma *anon_vma) 2603static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2579{ 2604{
2580 if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { 2605 if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) {
2581 /* 2606 /*
2582 * The LSB of head.next can't change to 0 from under 2607 * The LSB of head.next can't change to 0 from under
2583 * us because we hold the mm_all_locks_mutex. 2608 * us because we hold the mm_all_locks_mutex.
@@ -2588,12 +2613,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
2588 * 2613 *
2589 * No need of atomic instructions here, head.next 2614 * No need of atomic instructions here, head.next
2590 * can't change from under us until we release the 2615 * can't change from under us until we release the
2591 * anon_vma->lock. 2616 * anon_vma->root->lock.
2592 */ 2617 */
2593 if (!__test_and_clear_bit(0, (unsigned long *) 2618 if (!__test_and_clear_bit(0, (unsigned long *)
2594 &anon_vma->head.next)) 2619 &anon_vma->root->head.next))
2595 BUG(); 2620 BUG();
2596 spin_unlock(&anon_vma->lock); 2621 anon_vma_unlock(anon_vma);
2597 } 2622 }
2598} 2623}
2599 2624
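mm_take_all_locks() keeps using the low bit of head.next as an "anon_vma already locked" marker; the only change in the hunks above is that the bit and the spinlock now live in the shared anon_vma->root. For readers unfamiliar with the idiom, a self-contained sketch with invented names:

#include <stdbool.h>
#include <stdint.h>

/* Bit 0 of a pointer to any naturally aligned structure is always zero,
 * so it can double as a one-bit flag while the lock protecting the
 * structure is held; the real pointer is recovered by masking it off. */
static bool mark_once(uintptr_t *nextp)
{
	if (*nextp & 1)
		return false;		/* already marked by an earlier vma */
	*nextp |= 1;			/* set the marker */
	return true;
}

static void unmark(uintptr_t *nextp)
{
	*nextp &= ~(uintptr_t)1;	/* clear the marker on unlock */
}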
diff --git a/mm/nommu.c b/mm/nommu.c
index b76f3ee0abe0..88ff091eb07a 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -36,11 +36,6 @@
36#include <asm/mmu_context.h> 36#include <asm/mmu_context.h>
37#include "internal.h" 37#include "internal.h"
38 38
39static inline __attribute__((format(printf, 1, 2)))
40void no_printk(const char *fmt, ...)
41{
42}
43
44#if 0 39#if 0
45#define kenter(FMT, ...) \ 40#define kenter(FMT, ...) \
46 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) 41 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
@@ -609,7 +604,7 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
609 */ 604 */
610static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) 605static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
611{ 606{
612 struct vm_area_struct *pvma, **pp; 607 struct vm_area_struct *pvma, **pp, *next;
613 struct address_space *mapping; 608 struct address_space *mapping;
614 struct rb_node **p, *parent; 609 struct rb_node **p, *parent;
615 610
@@ -669,8 +664,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
669 break; 664 break;
670 } 665 }
671 666
672 vma->vm_next = *pp; 667 next = *pp;
673 *pp = vma; 668 *pp = vma;
669 vma->vm_next = next;
670 if (next)
671 next->vm_prev = vma;
674} 672}
675 673
676/* 674/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 709aedfaa014..fc81cb22869e 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
4 * Copyright (C) 1998,2000 Rik van Riel 4 * Copyright (C) 1998,2000 Rik van Riel
5 * Thanks go out to Claus Fischer for some serious inspiration and 5 * Thanks go out to Claus Fischer for some serious inspiration and
6 * for goading me into coding this file... 6 * for goading me into coding this file...
7 * Copyright (C) 2010 Google, Inc.
8 * Rewritten by David Rientjes
7 * 9 *
8 * The routines in this file are used to kill a process when 10 * The routines in this file are used to kill a process when
9 * we're seriously out of memory. This gets called from __alloc_pages() 11 * we're seriously out of memory. This gets called from __alloc_pages()
@@ -27,171 +29,188 @@
27#include <linux/module.h> 29#include <linux/module.h>
28#include <linux/notifier.h> 30#include <linux/notifier.h>
29#include <linux/memcontrol.h> 31#include <linux/memcontrol.h>
32#include <linux/mempolicy.h>
30#include <linux/security.h> 33#include <linux/security.h>
31 34
32int sysctl_panic_on_oom; 35int sysctl_panic_on_oom;
33int sysctl_oom_kill_allocating_task; 36int sysctl_oom_kill_allocating_task;
34int sysctl_oom_dump_tasks; 37int sysctl_oom_dump_tasks = 1;
35static DEFINE_SPINLOCK(zone_scan_lock); 38static DEFINE_SPINLOCK(zone_scan_lock);
36/* #define DEBUG */ 39
40#ifdef CONFIG_NUMA
41/**
 42 * has_intersects_mems_allowed() - check task eligibility for kill
43 * @tsk: task struct of which task to consider
44 * @mask: nodemask passed to page allocator for mempolicy ooms
45 *
46 * Task eligibility is determined by whether or not a candidate task, @tsk,
47 * shares the same mempolicy nodes as current if it is bound by such a policy
48 * and whether or not it has the same set of allowed cpuset nodes.
49 */
50static bool has_intersects_mems_allowed(struct task_struct *tsk,
51 const nodemask_t *mask)
52{
53 struct task_struct *start = tsk;
54
55 do {
56 if (mask) {
57 /*
58 * If this is a mempolicy constrained oom, tsk's
59 * cpuset is irrelevant. Only return true if its
60 * mempolicy intersects current, otherwise it may be
61 * needlessly killed.
62 */
63 if (mempolicy_nodemask_intersects(tsk, mask))
64 return true;
65 } else {
66 /*
67 * This is not a mempolicy constrained oom, so only
68 * check the mems of tsk's cpuset.
69 */
70 if (cpuset_mems_allowed_intersects(current, tsk))
71 return true;
72 }
73 } while_each_thread(start, tsk);
74
75 return false;
76}
77#else
78static bool has_intersects_mems_allowed(struct task_struct *tsk,
79 const nodemask_t *mask)
80{
81 return true;
82}
83#endif /* CONFIG_NUMA */
37 84
38/* 85/*
39 * Is all threads of the target process nodes overlap ours? 86 * If this is a system OOM (not a memcg OOM) and the task selected to be
87 * killed is not already running at high (RT) priorities, speed up the
88 * recovery by boosting the dying task to the lowest FIFO priority.
89 * That helps with the recovery and avoids interfering with RT tasks.
40 */ 90 */
41static int has_intersects_mems_allowed(struct task_struct *tsk) 91static void boost_dying_task_prio(struct task_struct *p,
92 struct mem_cgroup *mem)
42{ 93{
43 struct task_struct *t; 94 struct sched_param param = { .sched_priority = 1 };
95
96 if (mem)
97 return;
98
99 if (!rt_task(p))
100 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
101}
102
103/*
104 * The process p may have detached its own ->mm while exiting or through
105 * use_mm(), but one or more of its subthreads may still have a valid
106 * pointer. Return p, or any of its subthreads with a valid ->mm, with
107 * task_lock() held.
108 */
109struct task_struct *find_lock_task_mm(struct task_struct *p)
110{
111 struct task_struct *t = p;
44 112
45 t = tsk;
46 do { 113 do {
47 if (cpuset_mems_allowed_intersects(current, t)) 114 task_lock(t);
48 return 1; 115 if (likely(t->mm))
49 t = next_thread(t); 116 return t;
50 } while (t != tsk); 117 task_unlock(t);
118 } while_each_thread(p, t);
51 119
52 return 0; 120 return NULL;
121}
122
123/* return true if the task is not adequate as candidate victim task. */
124static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
125 const nodemask_t *nodemask)
126{
127 if (is_global_init(p))
128 return true;
129 if (p->flags & PF_KTHREAD)
130 return true;
131
132 /* When mem_cgroup_out_of_memory() and p is not member of the group */
133 if (mem && !task_in_mem_cgroup(p, mem))
134 return true;
135
136 /* p may not have freeable memory in nodemask */
137 if (!has_intersects_mems_allowed(p, nodemask))
138 return true;
139
140 return false;
53} 141}
54 142
55/** 143/**
56 * badness - calculate a numeric value for how bad this task has been 144 * oom_badness - heuristic function to determine which candidate task to kill
57 * @p: task struct of which task we should calculate 145 * @p: task struct of which task we should calculate
58 * @uptime: current uptime in seconds 146 * @totalpages: total present RAM allowed for page allocation
59 *
60 * The formula used is relatively simple and documented inline in the
61 * function. The main rationale is that we want to select a good task
62 * to kill when we run out of memory.
63 * 147 *
64 * Good in this context means that: 148 * The heuristic for determining which task to kill is made to be as simple and
65 * 1) we lose the minimum amount of work done 149 * predictable as possible. The goal is to return the highest value for the
66 * 2) we recover a large amount of memory 150 * task consuming the most memory to avoid subsequent oom failures.
67 * 3) we don't kill anything innocent of eating tons of memory
68 * 4) we want to kill the minimum amount of processes (one)
69 * 5) we try to kill the process the user expects us to kill, this
70 * algorithm has been meticulously tuned to meet the principle
71 * of least surprise ... (be careful when you change it)
72 */ 151 */
73 152unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
74unsigned long badness(struct task_struct *p, unsigned long uptime) 153 const nodemask_t *nodemask, unsigned long totalpages)
75{ 154{
76 unsigned long points, cpu_time, run_time; 155 int points;
77 struct mm_struct *mm;
78 struct task_struct *child;
79 int oom_adj = p->signal->oom_adj;
80 struct task_cputime task_time;
81 unsigned long utime;
82 unsigned long stime;
83 156
84 if (oom_adj == OOM_DISABLE) 157 if (oom_unkillable_task(p, mem, nodemask))
85 return 0; 158 return 0;
86 159
87 task_lock(p); 160 p = find_lock_task_mm(p);
88 mm = p->mm; 161 if (!p)
89 if (!mm) {
90 task_unlock(p);
91 return 0; 162 return 0;
92 }
93
94 /*
95 * The memory size of the process is the basis for the badness.
96 */
97 points = mm->total_vm;
98 163
99 /* 164 /*
100 * After this unlock we can no longer dereference local variable `mm' 165 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
166 * need to be executed for something that cannot be killed.
101 */ 167 */
102 task_unlock(p); 168 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
103 169 task_unlock(p);
104 /* 170 return 0;
105 * swapoff can easily use up all memory, so kill those first.
106 */
107 if (p->flags & PF_OOM_ORIGIN)
108 return ULONG_MAX;
109
110 /*
111 * Processes which fork a lot of child processes are likely
112 * a good choice. We add half the vmsize of the children if they
113 * have an own mm. This prevents forking servers to flood the
114 * machine with an endless amount of children. In case a single
115 * child is eating the vast majority of memory, adding only half
116 * to the parents will make the child our kill candidate of choice.
117 */
118 list_for_each_entry(child, &p->children, sibling) {
119 task_lock(child);
120 if (child->mm != mm && child->mm)
121 points += child->mm->total_vm/2 + 1;
122 task_unlock(child);
123 } 171 }
124 172
125 /* 173 /*
126 * CPU time is in tens of seconds and run time is in thousands 174 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
127 * of seconds. There is no particular reason for this other than 175 * priority for oom killing.
128 * that it turned out to work very well in practice.
129 */
130 thread_group_cputime(p, &task_time);
131 utime = cputime_to_jiffies(task_time.utime);
132 stime = cputime_to_jiffies(task_time.stime);
133 cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
134
135
136 if (uptime >= p->start_time.tv_sec)
137 run_time = (uptime - p->start_time.tv_sec) >> 10;
138 else
139 run_time = 0;
140
141 if (cpu_time)
142 points /= int_sqrt(cpu_time);
143 if (run_time)
144 points /= int_sqrt(int_sqrt(run_time));
145
146 /*
147 * Niced processes are most likely less important, so double
148 * their badness points.
149 */ 176 */
150 if (task_nice(p) > 0) 177 if (p->flags & PF_OOM_ORIGIN) {
151 points *= 2; 178 task_unlock(p);
179 return 1000;
180 }
152 181
153 /* 182 /*
154 * Superuser processes are usually more important, so we make it 183 * The memory controller may have a limit of 0 bytes, so avoid a divide
155 * less likely that we kill those. 184 * by zero, if necessary.
156 */ 185 */
157 if (has_capability_noaudit(p, CAP_SYS_ADMIN) || 186 if (!totalpages)
158 has_capability_noaudit(p, CAP_SYS_RESOURCE)) 187 totalpages = 1;
159 points /= 4;
160 188
161 /* 189 /*
162 * We don't want to kill a process with direct hardware access. 190 * The baseline for the badness score is the proportion of RAM that each
163 * Not only could that mess up the hardware, but usually users 191 * task's rss and swap space use.
164 * tend to only have this flag set on applications they think
165 * of as important.
166 */ 192 */
167 if (has_capability_noaudit(p, CAP_SYS_RAWIO)) 193 points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
168 points /= 4; 194 totalpages;
195 task_unlock(p);
169 196
170 /* 197 /*
171 * If p's nodes don't overlap ours, it may still help to kill p 198 * Root processes get 3% bonus, just like the __vm_enough_memory()
172 * because p may have allocated or otherwise mapped memory on 199 * implementation used by LSMs.
173 * this node before. However it will be less likely.
174 */ 200 */
175 if (!has_intersects_mems_allowed(p)) 201 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
176 points /= 8; 202 points -= 30;
177 203
178 /* 204 /*
179 * Adjust the score by oom_adj. 205 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
206 * either completely disable oom killing or always prefer a certain
207 * task.
180 */ 208 */
181 if (oom_adj) { 209 points += p->signal->oom_score_adj;
182 if (oom_adj > 0) {
183 if (!points)
184 points = 1;
185 points <<= oom_adj;
186 } else
187 points >>= -(oom_adj);
188 }
189 210
190#ifdef DEBUG 211 if (points < 0)
191 printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", 212 return 0;
192 p->pid, p->comm, points); 213 return (points < 1000) ? points : 1000;
193#endif
194 return points;
195} 214}
196 215
197/* 216/*
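To make the rescaled heuristic concrete, a worked example with invented numbers: totalpages = 1,000,000 (RAM plus swap, in pages) and a root-owned task whose rss plus swap entries come to 200,000 pages, with oom_score_adj = 0:

	/* Illustrative arithmetic only -- follows oom_badness() above. */
	points  = 200000 * 1000 / 1000000;	/* 200: task uses 20.0% of memory */
	points -= 30;				/* CAP_SYS_ADMIN bonus, 3% of the scale */
	points += 0;				/* oom_score_adj */
	/* final badness: 170 on the 0..1000 scale */

OOM_SCORE_ADJ_MIN (-1000) short-circuits the whole calculation to 0, and PF_OOM_ORIGIN pins the task at 1000.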
@@ -199,12 +218,20 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
199 */ 218 */
200#ifdef CONFIG_NUMA 219#ifdef CONFIG_NUMA
201static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 220static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
202 gfp_t gfp_mask, nodemask_t *nodemask) 221 gfp_t gfp_mask, nodemask_t *nodemask,
222 unsigned long *totalpages)
203{ 223{
204 struct zone *zone; 224 struct zone *zone;
205 struct zoneref *z; 225 struct zoneref *z;
206 enum zone_type high_zoneidx = gfp_zone(gfp_mask); 226 enum zone_type high_zoneidx = gfp_zone(gfp_mask);
227 bool cpuset_limited = false;
228 int nid;
229
230 /* Default to all available memory */
231 *totalpages = totalram_pages + total_swap_pages;
207 232
233 if (!zonelist)
234 return CONSTRAINT_NONE;
208 /* 235 /*
209 * Reach here only when __GFP_NOFAIL is used. So, we should avoid 236 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
 210 * killing current. We have to kill a task at random in this case. 237
@@ -214,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
214 return CONSTRAINT_NONE; 241 return CONSTRAINT_NONE;
215 242
216 /* 243 /*
217 * The nodemask here is a nodemask passed to alloc_pages(). Now, 244 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
218 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy 245 * the page allocator means a mempolicy is in effect. Cpuset policy
219 * feature. mempolicy is an only user of nodemask here. 246 * is enforced in get_page_from_freelist().
220 * check mempolicy's nodemask contains all N_HIGH_MEMORY
221 */ 247 */
222 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) 248 if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
249 *totalpages = total_swap_pages;
250 for_each_node_mask(nid, *nodemask)
251 *totalpages += node_spanned_pages(nid);
223 return CONSTRAINT_MEMORY_POLICY; 252 return CONSTRAINT_MEMORY_POLICY;
253 }
224 254
225 /* Check this allocation failure is caused by cpuset's wall function */ 255 /* Check this allocation failure is caused by cpuset's wall function */
226 for_each_zone_zonelist_nodemask(zone, z, zonelist, 256 for_each_zone_zonelist_nodemask(zone, z, zonelist,
227 high_zoneidx, nodemask) 257 high_zoneidx, nodemask)
228 if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) 258 if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
229 return CONSTRAINT_CPUSET; 259 cpuset_limited = true;
230 260
261 if (cpuset_limited) {
262 *totalpages = total_swap_pages;
263 for_each_node_mask(nid, cpuset_current_mems_allowed)
264 *totalpages += node_spanned_pages(nid);
265 return CONSTRAINT_CPUSET;
266 }
231 return CONSTRAINT_NONE; 267 return CONSTRAINT_NONE;
232} 268}
233#else 269#else
234static enum oom_constraint constrained_alloc(struct zonelist *zonelist, 270static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
235 gfp_t gfp_mask, nodemask_t *nodemask) 271 gfp_t gfp_mask, nodemask_t *nodemask,
272 unsigned long *totalpages)
236{ 273{
274 *totalpages = totalram_pages + total_swap_pages;
237 return CONSTRAINT_NONE; 275 return CONSTRAINT_NONE;
238} 276}
239#endif 277#endif
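The baseline that oom_badness() divides by now shrinks with the constraint. A hedged example with invented numbers: total_swap_pages = 500,000 and an MPOL_BIND policy over two nodes spanning 250,000 pages each gives:

	/* Illustrative only: mempolicy-constrained baseline */
	*totalpages = 500000			/* total_swap_pages */
		    + 250000 + 250000;		/* node_spanned_pages() of each node in *nodemask */
	/* = 1,000,000 pages rather than totalram_pages + total_swap_pages */

The cpuset-constrained branch performs the same sum over cpuset_current_mems_allowed.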
@@ -244,28 +282,18 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
244 * 282 *
245 * (not docbooked, we don't want this one cluttering up the manual) 283 * (not docbooked, we don't want this one cluttering up the manual)
246 */ 284 */
247static struct task_struct *select_bad_process(unsigned long *ppoints, 285static struct task_struct *select_bad_process(unsigned int *ppoints,
248 struct mem_cgroup *mem) 286 unsigned long totalpages, struct mem_cgroup *mem,
287 const nodemask_t *nodemask)
249{ 288{
250 struct task_struct *p; 289 struct task_struct *p;
251 struct task_struct *chosen = NULL; 290 struct task_struct *chosen = NULL;
252 struct timespec uptime;
253 *ppoints = 0; 291 *ppoints = 0;
254 292
255 do_posix_clock_monotonic_gettime(&uptime);
256 for_each_process(p) { 293 for_each_process(p) {
257 unsigned long points; 294 unsigned int points;
258 295
259 /* 296 if (oom_unkillable_task(p, mem, nodemask))
260 * skip kernel threads and tasks which have already released
261 * their mm.
262 */
263 if (!p->mm)
264 continue;
265 /* skip the init task */
266 if (is_global_init(p))
267 continue;
268 if (mem && !task_in_mem_cgroup(p, mem))
269 continue; 297 continue;
270 298
271 /* 299 /*
@@ -290,19 +318,16 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
290 * the process of exiting and releasing its resources. 318 * the process of exiting and releasing its resources.
291 * Otherwise we could get an easy OOM deadlock. 319 * Otherwise we could get an easy OOM deadlock.
292 */ 320 */
293 if (p->flags & PF_EXITING) { 321 if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) {
294 if (p != current) 322 if (p != current)
295 return ERR_PTR(-1UL); 323 return ERR_PTR(-1UL);
296 324
297 chosen = p; 325 chosen = p;
298 *ppoints = ULONG_MAX; 326 *ppoints = 1000;
299 } 327 }
300 328
301 if (p->signal->oom_adj == OOM_DISABLE) 329 points = oom_badness(p, mem, nodemask, totalpages);
302 continue; 330 if (points > *ppoints) {
303
304 points = badness(p, uptime.tv_sec);
305 if (points > *ppoints || !chosen) {
306 chosen = p; 331 chosen = p;
307 *ppoints = points; 332 *ppoints = points;
308 } 333 }
@@ -313,11 +338,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
313 338
314/** 339/**
315 * dump_tasks - dump current memory state of all system tasks 340 * dump_tasks - dump current memory state of all system tasks
316 * @mem: target memory controller 341 * @mem: current's memory controller, if constrained
317 * 342 *
318 * Dumps the current memory state of all system tasks, excluding kernel threads. 343 * Dumps the current memory state of all system tasks, excluding kernel threads.
319 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj 344 * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
320 * score, and name. 345 * value, oom_score_adj value, and name.
321 * 346 *
322 * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are 347 * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
323 * shown. 348 * shown.
@@ -326,44 +351,43 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
326 */ 351 */
327static void dump_tasks(const struct mem_cgroup *mem) 352static void dump_tasks(const struct mem_cgroup *mem)
328{ 353{
329 struct task_struct *g, *p; 354 struct task_struct *p;
330 355 struct task_struct *task;
331 printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
332 "name\n");
333 do_each_thread(g, p) {
334 struct mm_struct *mm;
335 356
336 if (mem && !task_in_mem_cgroup(p, mem)) 357 pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
358 for_each_process(p) {
359 if (p->flags & PF_KTHREAD)
337 continue; 360 continue;
338 if (!thread_group_leader(p)) 361 if (mem && !task_in_mem_cgroup(p, mem))
339 continue; 362 continue;
340 363
341 task_lock(p); 364 task = find_lock_task_mm(p);
342 mm = p->mm; 365 if (!task) {
343 if (!mm) {
344 /* 366 /*
345 * total_vm and rss sizes do not exist for tasks with no 367 * This is a kthread or all of p's threads have already
346 * mm so there's no need to report them; they can't be 368 * detached their mm's. There's no need to report
347 * oom killed anyway. 369 * them; they can't be oom killed anyway.
348 */ 370 */
349 task_unlock(p);
350 continue; 371 continue;
351 } 372 }
352 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 373
353 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 374 pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
354 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, 375 task->pid, task_uid(task), task->tgid,
355 p->comm); 376 task->mm->total_vm, get_mm_rss(task->mm),
356 task_unlock(p); 377 task_cpu(task), task->signal->oom_adj,
357 } while_each_thread(g, p); 378 task->signal->oom_score_adj, task->comm);
379 task_unlock(task);
380 }
358} 381}
359 382
360static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, 383static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
361 struct mem_cgroup *mem) 384 struct mem_cgroup *mem)
362{ 385{
363 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
364 "oom_adj=%d\n",
365 current->comm, gfp_mask, order, current->signal->oom_adj);
366 task_lock(current); 386 task_lock(current);
387 pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
388 "oom_adj=%d, oom_score_adj=%d\n",
389 current->comm, gfp_mask, order, current->signal->oom_adj,
390 current->signal->oom_score_adj);
367 cpuset_print_task_mems_allowed(current); 391 cpuset_print_task_mems_allowed(current);
368 task_unlock(current); 392 task_unlock(current);
369 dump_stack(); 393 dump_stack();
@@ -374,72 +398,42 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
374} 398}
375 399
376#define K(x) ((x) << (PAGE_SHIFT-10)) 400#define K(x) ((x) << (PAGE_SHIFT-10))
377 401static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
378/*
379 * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
380 * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
381 * set.
382 */
383static void __oom_kill_task(struct task_struct *p, int verbose)
384{ 402{
385 if (is_global_init(p)) { 403 p = find_lock_task_mm(p);
386 WARN_ON(1); 404 if (!p)
387 printk(KERN_WARNING "tried to kill init!\n"); 405 return 1;
388 return;
389 }
390
391 task_lock(p);
392 if (!p->mm) {
393 WARN_ON(1);
394 printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
395 task_pid_nr(p), p->comm);
396 task_unlock(p);
397 return;
398 }
399 406
400 if (verbose) 407 pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
401 printk(KERN_ERR "Killed process %d (%s) " 408 task_pid_nr(p), p->comm, K(p->mm->total_vm),
402 "vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n", 409 K(get_mm_counter(p->mm, MM_ANONPAGES)),
403 task_pid_nr(p), p->comm, 410 K(get_mm_counter(p->mm, MM_FILEPAGES)));
404 K(p->mm->total_vm),
405 K(get_mm_counter(p->mm, MM_ANONPAGES)),
406 K(get_mm_counter(p->mm, MM_FILEPAGES)));
407 task_unlock(p); 411 task_unlock(p);
408 412
413
414 set_tsk_thread_flag(p, TIF_MEMDIE);
415 force_sig(SIGKILL, p);
416
409 /* 417 /*
410 * We give our sacrificial lamb high priority and access to 418 * We give our sacrificial lamb high priority and access to
411 * all the memory it needs. That way it should be able to 419 * all the memory it needs. That way it should be able to
412 * exit() and clear out its resources quickly... 420 * exit() and clear out its resources quickly...
413 */ 421 */
414 p->rt.time_slice = HZ; 422 boost_dying_task_prio(p, mem);
415 set_tsk_thread_flag(p, TIF_MEMDIE);
416
417 force_sig(SIGKILL, p);
418}
419
420static int oom_kill_task(struct task_struct *p)
421{
422 /* WARNING: mm may not be dereferenced since we did not obtain its
423 * value from get_task_mm(p). This is OK since all we need to do is
424 * compare mm to q->mm below.
425 *
426 * Furthermore, even if mm contains a non-NULL value, p->mm may
427 * change to NULL at any time since we do not hold task_lock(p).
428 * However, this is of no concern to us.
429 */
430 if (!p->mm || p->signal->oom_adj == OOM_DISABLE)
431 return 1;
432
433 __oom_kill_task(p, 1);
434 423
435 return 0; 424 return 0;
436} 425}
426#undef K
437 427
438static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, 428static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
439 unsigned long points, struct mem_cgroup *mem, 429 unsigned int points, unsigned long totalpages,
430 struct mem_cgroup *mem, nodemask_t *nodemask,
440 const char *message) 431 const char *message)
441{ 432{
442 struct task_struct *c; 433 struct task_struct *victim = p;
434 struct task_struct *child;
435 struct task_struct *t = p;
436 unsigned int victim_points = 0;
443 437
444 if (printk_ratelimit()) 438 if (printk_ratelimit())
445 dump_header(p, gfp_mask, order, mem); 439 dump_header(p, gfp_mask, order, mem);
@@ -449,40 +443,81 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
449 * its children or threads, just set TIF_MEMDIE so it can die quickly 443 * its children or threads, just set TIF_MEMDIE so it can die quickly
450 */ 444 */
451 if (p->flags & PF_EXITING) { 445 if (p->flags & PF_EXITING) {
452 __oom_kill_task(p, 0); 446 set_tsk_thread_flag(p, TIF_MEMDIE);
447 boost_dying_task_prio(p, mem);
453 return 0; 448 return 0;
454 } 449 }
455 450
456 printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", 451 task_lock(p);
457 message, task_pid_nr(p), p->comm, points); 452 pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
453 message, task_pid_nr(p), p->comm, points);
454 task_unlock(p);
458 455
459 /* Try to kill a child first */ 456 /*
460 list_for_each_entry(c, &p->children, sibling) { 457 * If any of p's children has a different mm and is eligible for kill,
461 if (c->mm == p->mm) 458 * the one with the highest badness() score is sacrificed for its
462 continue; 459 * parent. This attempts to lose the minimal amount of work done while
463 if (mem && !task_in_mem_cgroup(c, mem)) 460 * still freeing memory.
464 continue; 461 */
465 if (!oom_kill_task(c)) 462 do {
466 return 0; 463 list_for_each_entry(child, &t->children, sibling) {
464 unsigned int child_points;
465
466 /*
467 * oom_badness() returns 0 if the thread is unkillable
468 */
469 child_points = oom_badness(child, mem, nodemask,
470 totalpages);
471 if (child_points > victim_points) {
472 victim = child;
473 victim_points = child_points;
474 }
475 }
476 } while_each_thread(p, t);
477
478 return oom_kill_task(victim, mem);
479}
480
481/*
482 * Determines whether the kernel must panic because of the panic_on_oom sysctl.
483 */
484static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
485 int order)
486{
487 if (likely(!sysctl_panic_on_oom))
488 return;
489 if (sysctl_panic_on_oom != 2) {
490 /*
491 * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
492 * does not panic for cpuset, mempolicy, or memcg allocation
493 * failures.
494 */
495 if (constraint != CONSTRAINT_NONE)
496 return;
467 } 497 }
468 return oom_kill_task(p); 498 read_lock(&tasklist_lock);
499 dump_header(NULL, gfp_mask, order, NULL);
500 read_unlock(&tasklist_lock);
501 panic("Out of memory: %s panic_on_oom is enabled\n",
502 sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
469} 503}
470 504
471#ifdef CONFIG_CGROUP_MEM_RES_CTLR 505#ifdef CONFIG_CGROUP_MEM_RES_CTLR
472void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) 506void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
473{ 507{
474 unsigned long points = 0; 508 unsigned long limit;
509 unsigned int points = 0;
475 struct task_struct *p; 510 struct task_struct *p;
476 511
477 if (sysctl_panic_on_oom == 2) 512 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
478 panic("out of memory(memcg). panic_on_oom is selected.\n"); 513 limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
479 read_lock(&tasklist_lock); 514 read_lock(&tasklist_lock);
480retry: 515retry:
481 p = select_bad_process(&points, mem); 516 p = select_bad_process(&points, limit, mem, NULL);
482 if (!p || PTR_ERR(p) == -1UL) 517 if (!p || PTR_ERR(p) == -1UL)
483 goto out; 518 goto out;
484 519
485 if (oom_kill_process(p, gfp_mask, 0, points, mem, 520 if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
486 "Memory cgroup out of memory")) 521 "Memory cgroup out of memory"))
487 goto retry; 522 goto retry;
488out: 523out:
@@ -509,7 +544,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier);
509 * if a parallel OOM killing is already taking place that includes a zone in 544 * if a parallel OOM killing is already taking place that includes a zone in
510 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. 545 * the zonelist. Otherwise, locks all zones in the zonelist and returns 1.
511 */ 546 */
512int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) 547int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
513{ 548{
514 struct zoneref *z; 549 struct zoneref *z;
515 struct zone *zone; 550 struct zone *zone;
@@ -526,7 +561,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
526 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 561 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
527 /* 562 /*
528 * Lock each zone in the zonelist under zone_scan_lock so a 563 * Lock each zone in the zonelist under zone_scan_lock so a
529 * parallel invocation of try_set_zone_oom() doesn't succeed 564 * parallel invocation of try_set_zonelist_oom() doesn't succeed
530 * when it shouldn't. 565 * when it shouldn't.
531 */ 566 */
532 zone_set_flag(zone, ZONE_OOM_LOCKED); 567 zone_set_flag(zone, ZONE_OOM_LOCKED);
@@ -555,65 +590,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
555} 590}
556 591
557/* 592/*
558 * Must be called with tasklist_lock held for read. 593 * Try to acquire the oom killer lock for all system zones. Returns zero if a
594 * parallel oom killing is taking place, otherwise locks all zones and returns
595 * non-zero.
559 */ 596 */
560static void __out_of_memory(gfp_t gfp_mask, int order) 597static int try_set_system_oom(void)
561{ 598{
562 struct task_struct *p; 599 struct zone *zone;
563 unsigned long points; 600 int ret = 1;
564
565 if (sysctl_oom_kill_allocating_task)
566 if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
567 "Out of memory (oom_kill_allocating_task)"))
568 return;
569retry:
570 /*
571 * Rambo mode: Shoot down a process and hope it solves whatever
572 * issues we may have.
573 */
574 p = select_bad_process(&points, NULL);
575
576 if (PTR_ERR(p) == -1UL)
577 return;
578
579 /* Found nothing?!?! Either we hang forever, or we panic. */
580 if (!p) {
581 read_unlock(&tasklist_lock);
582 dump_header(NULL, gfp_mask, order, NULL);
583 panic("Out of memory and no killable processes...\n");
584 }
585 601
586 if (oom_kill_process(p, gfp_mask, order, points, NULL, 602 spin_lock(&zone_scan_lock);
587 "Out of memory")) 603 for_each_populated_zone(zone)
588 goto retry; 604 if (zone_is_oom_locked(zone)) {
605 ret = 0;
606 goto out;
607 }
608 for_each_populated_zone(zone)
609 zone_set_flag(zone, ZONE_OOM_LOCKED);
610out:
611 spin_unlock(&zone_scan_lock);
612 return ret;
589} 613}
590 614
591/* 615/*
592 * pagefault handler calls into here because it is out of memory but 616 * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation
593 * doesn't know exactly how or why. 617 * attempts or page faults may now recall the oom killer, if necessary.
594 */ 618 */
595void pagefault_out_of_memory(void) 619static void clear_system_oom(void)
596{ 620{
597 unsigned long freed = 0; 621 struct zone *zone;
598
599 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
600 if (freed > 0)
601 /* Got some memory back in the last second. */
602 return;
603
604 if (sysctl_panic_on_oom)
605 panic("out of memory from page fault. panic_on_oom is selected.\n");
606
607 read_lock(&tasklist_lock);
608 __out_of_memory(0, 0); /* unknown gfp_mask and order */
609 read_unlock(&tasklist_lock);
610 622
611 /* 623 spin_lock(&zone_scan_lock);
612 * Give "p" a good chance of killing itself before we 624 for_each_populated_zone(zone)
613 * retry to allocate memory. 625 zone_clear_flag(zone, ZONE_OOM_LOCKED);
614 */ 626 spin_unlock(&zone_scan_lock);
615 if (!test_thread_flag(TIF_MEMDIE))
616 schedule_timeout_uninterruptible(1);
617} 627}
618 628
619/** 629/**
@@ -621,6 +631,7 @@ void pagefault_out_of_memory(void)
621 * @zonelist: zonelist pointer 631 * @zonelist: zonelist pointer
622 * @gfp_mask: memory allocation flags 632 * @gfp_mask: memory allocation flags
623 * @order: amount of memory being requested as a power of 2 633 * @order: amount of memory being requested as a power of 2
634 * @nodemask: nodemask passed to page allocator
624 * 635 *
625 * If we run out of memory, we have the choice between either 636 * If we run out of memory, we have the choice between either
626 * killing a random task (bad), letting the system crash (worse) 637 * killing a random task (bad), letting the system crash (worse)
@@ -630,49 +641,93 @@ void pagefault_out_of_memory(void)
630void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, 641void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
631 int order, nodemask_t *nodemask) 642 int order, nodemask_t *nodemask)
632{ 643{
644 struct task_struct *p;
645 unsigned long totalpages;
633 unsigned long freed = 0; 646 unsigned long freed = 0;
634 enum oom_constraint constraint; 647 unsigned int points;
648 enum oom_constraint constraint = CONSTRAINT_NONE;
649 int killed = 0;
635 650
636 blocking_notifier_call_chain(&oom_notify_list, 0, &freed); 651 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
637 if (freed > 0) 652 if (freed > 0)
638 /* Got some memory back in the last second. */ 653 /* Got some memory back in the last second. */
639 return; 654 return;
640 655
641 if (sysctl_panic_on_oom == 2) { 656 /*
642 dump_header(NULL, gfp_mask, order, NULL); 657 * If current has a pending SIGKILL, then automatically select it. The
643 panic("out of memory. Compulsory panic_on_oom is selected.\n"); 658 * goal is to allow it to allocate so that it may quickly exit and free
659 * its memory.
660 */
661 if (fatal_signal_pending(current)) {
662 set_thread_flag(TIF_MEMDIE);
663 boost_dying_task_prio(current, NULL);
664 return;
644 } 665 }
645 666
646 /* 667 /*
647 * Check if there were limitations on the allocation (only relevant for 668 * Check if there were limitations on the allocation (only relevant for
648 * NUMA) that may require different handling. 669 * NUMA) that may require different handling.
649 */ 670 */
650 constraint = constrained_alloc(zonelist, gfp_mask, nodemask); 671 constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
672 &totalpages);
673 check_panic_on_oom(constraint, gfp_mask, order);
674
651 read_lock(&tasklist_lock); 675 read_lock(&tasklist_lock);
676 if (sysctl_oom_kill_allocating_task &&
677 !oom_unkillable_task(current, NULL, nodemask) &&
678 (current->signal->oom_adj != OOM_DISABLE)) {
679 /*
680 * oom_kill_process() needs tasklist_lock held. If it returns
681 * non-zero, current could not be killed so we must fallback to
682 * the tasklist scan.
683 */
684 if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
685 NULL, nodemask,
686 "Out of memory (oom_kill_allocating_task)"))
687 goto out;
688 }
652 689
653 switch (constraint) { 690retry:
654 case CONSTRAINT_MEMORY_POLICY: 691 p = select_bad_process(&points, totalpages, NULL,
655 oom_kill_process(current, gfp_mask, order, 0, NULL, 692 constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
656 "No available memory (MPOL_BIND)"); 693 NULL);
657 break; 694 if (PTR_ERR(p) == -1UL)
695 goto out;
658 696
659 case CONSTRAINT_NONE: 697 /* Found nothing?!?! Either we hang forever, or we panic. */
660 if (sysctl_panic_on_oom) { 698 if (!p) {
661 dump_header(NULL, gfp_mask, order, NULL); 699 dump_header(NULL, gfp_mask, order, NULL);
662 panic("out of memory. panic_on_oom is selected\n"); 700 read_unlock(&tasklist_lock);
663 } 701 panic("Out of memory and no killable processes...\n");
664 /* Fall-through */
665 case CONSTRAINT_CPUSET:
666 __out_of_memory(gfp_mask, order);
667 break;
668 } 702 }
669 703
704 if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
705 nodemask, "Out of memory"))
706 goto retry;
707 killed = 1;
708out:
670 read_unlock(&tasklist_lock); 709 read_unlock(&tasklist_lock);
671 710
672 /* 711 /*
673 * Give "p" a good chance of killing itself before we 712 * Give "p" a good chance of killing itself before we
674 * retry to allocate memory unless "p" is current 713 * retry to allocate memory unless "p" is current
675 */ 714 */
715 if (killed && !test_thread_flag(TIF_MEMDIE))
716 schedule_timeout_uninterruptible(1);
717}
718
719/*
720 * The pagefault handler calls here because it is out of memory, so kill a
721 * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel
722 * oom killing is already in progress so do nothing. If a task is found with
723 * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit.
724 */
725void pagefault_out_of_memory(void)
726{
727 if (try_set_system_oom()) {
728 out_of_memory(NULL, 0, 0, NULL);
729 clear_system_oom();
730 }
676 if (!test_thread_flag(TIF_MEMDIE)) 731 if (!test_thread_flag(TIF_MEMDIE))
677 schedule_timeout_uninterruptible(1); 732 schedule_timeout_uninterruptible(1);
678} 733}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 37498ef61548..c09ef5219cbe 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/buffer_head.h> 35#include <linux/buffer_head.h>
36#include <linux/pagevec.h> 36#include <linux/pagevec.h>
37#include <trace/events/writeback.h>
37 38
38/* 39/*
39 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited 40 * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
@@ -252,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
252 } 253 }
253} 254}
254 255
255/*
256 * Clip the earned share of dirty pages to that which is actually available.
257 * This avoids exceeding the total dirty_limit when the floating averages
258 * fluctuate too quickly.
259 */
260static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
261 unsigned long dirty, unsigned long *pbdi_dirty)
262{
263 unsigned long avail_dirty;
264
265 avail_dirty = global_page_state(NR_FILE_DIRTY) +
266 global_page_state(NR_WRITEBACK) +
267 global_page_state(NR_UNSTABLE_NFS) +
268 global_page_state(NR_WRITEBACK_TEMP);
269
270 if (avail_dirty < dirty)
271 avail_dirty = dirty - avail_dirty;
272 else
273 avail_dirty = 0;
274
275 avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
276 bdi_stat(bdi, BDI_WRITEBACK);
277
278 *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
279}
280
281static inline void task_dirties_fraction(struct task_struct *tsk, 256static inline void task_dirties_fraction(struct task_struct *tsk,
282 long *numerator, long *denominator) 257 long *numerator, long *denominator)
283{ 258{
@@ -286,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
286} 261}
287 262
288/* 263/*
289 * scale the dirty limit 264 * task_dirty_limit - scale down dirty throttling threshold for one task
290 * 265 *
291 * task specific dirty limit: 266 * task specific dirty limit:
292 * 267 *
293 * dirty -= (dirty/8) * p_{t} 268 * dirty -= (dirty/8) * p_{t}
269 *
270 * To protect light/slow dirtying tasks from heavier/fast ones, we start
271 * throttling individual tasks before reaching the bdi dirty limit.
272 * Relatively low thresholds will be allocated to heavy dirtiers. So when
273 * dirty pages grow large, heavy dirtiers will be throttled first, which will
274 * effectively curb the growth of dirty pages. Light dirtiers with high enough
275 * dirty threshold may never get throttled.
294 */ 276 */
295static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) 277static unsigned long task_dirty_limit(struct task_struct *tsk,
278 unsigned long bdi_dirty)
296{ 279{
297 long numerator, denominator; 280 long numerator, denominator;
298 unsigned long dirty = *pdirty; 281 unsigned long dirty = bdi_dirty;
299 u64 inv = dirty >> 3; 282 u64 inv = dirty >> 3;
300 283
301 task_dirties_fraction(tsk, &numerator, &denominator); 284 task_dirties_fraction(tsk, &numerator, &denominator);
@@ -303,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
303 do_div(inv, denominator); 286 do_div(inv, denominator);
304 287
305 dirty -= inv; 288 dirty -= inv;
306 if (dirty < *pdirty/2)
307 dirty = *pdirty/2;
308 289
309 *pdirty = dirty; 290 return max(dirty, bdi_dirty/2);
310} 291}
311 292
312/* 293/*
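A numerical illustration of the per-task scaling, with an invented dirtying fraction: bdi_dirty = 800 pages and a task responsible for half of the recent dirtying (numerator/denominator = 1/2):

	/* Illustrative arithmetic only -- follows task_dirty_limit() above. */
	inv    = 800 >> 3;		/* 100: the 12.5% window being redistributed */
	inv    = inv * 1 / 2;		/* 50: this task's share of that window */
	dirty  = 800 - 50;		/* 750 */
	return max(dirty, 800 / 2);	/* 750: never below half the bdi threshold */

So the heaviest dirtier gives up at most one eighth of the bdi threshold, while light dirtiers keep close to the full value.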
@@ -416,9 +397,16 @@ unsigned long determine_dirtyable_memory(void)
416 return x + 1; /* Ensure that we never return 0 */ 397 return x + 1; /* Ensure that we never return 0 */
417} 398}
418 399
419void 400/*
420get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, 401 * global_dirty_limits - background-writeback and dirty-throttling thresholds
421 unsigned long *pbdi_dirty, struct backing_dev_info *bdi) 402 *
403 * Calculate the dirty thresholds based on sysctl parameters
404 * - vm.dirty_background_ratio or vm.dirty_background_bytes
405 * - vm.dirty_ratio or vm.dirty_bytes
406 * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
 407 * real-time tasks.
408 */
409void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
422{ 410{
423 unsigned long background; 411 unsigned long background;
424 unsigned long dirty; 412 unsigned long dirty;
@@ -450,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
450 } 438 }
451 *pbackground = background; 439 *pbackground = background;
452 *pdirty = dirty; 440 *pdirty = dirty;
441}
453 442
454 if (bdi) { 443/*
455 u64 bdi_dirty; 444 * bdi_dirty_limit - @bdi's share of dirty throttling threshold
456 long numerator, denominator; 445 *
446 * Allocate high/low dirty limits to fast/slow devices, in order to prevent
447 * - starving fast devices
448 * - piling up dirty pages (that will take long time to sync) on slow devices
449 *
450 * The bdi's share of dirty limit will be adapting to its throughput and
451 * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
452 */
453unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
454{
455 u64 bdi_dirty;
456 long numerator, denominator;
457 457
458 /* 458 /*
459 * Calculate this BDI's share of the dirty ratio. 459 * Calculate this BDI's share of the dirty ratio.
460 */ 460 */
461 bdi_writeout_fraction(bdi, &numerator, &denominator); 461 bdi_writeout_fraction(bdi, &numerator, &denominator);
462 462
463 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; 463 bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
464 bdi_dirty *= numerator; 464 bdi_dirty *= numerator;
465 do_div(bdi_dirty, denominator); 465 do_div(bdi_dirty, denominator);
466 bdi_dirty += (dirty * bdi->min_ratio) / 100; 466
467 if (bdi_dirty > (dirty * bdi->max_ratio) / 100) 467 bdi_dirty += (dirty * bdi->min_ratio) / 100;
468 bdi_dirty = dirty * bdi->max_ratio / 100; 468 if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
469 469 bdi_dirty = dirty * bdi->max_ratio / 100;
470 *pbdi_dirty = bdi_dirty; 470
471 clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); 471 return bdi_dirty;
472 task_dirty_limit(current, pbdi_dirty);
473 }
474} 472}
475 473
476/* 474/*
@@ -490,7 +488,7 @@ static void balance_dirty_pages(struct address_space *mapping,
490 unsigned long bdi_thresh; 488 unsigned long bdi_thresh;
491 unsigned long pages_written = 0; 489 unsigned long pages_written = 0;
492 unsigned long pause = 1; 490 unsigned long pause = 1;
493 491 bool dirty_exceeded = false;
494 struct backing_dev_info *bdi = mapping->backing_dev_info; 492 struct backing_dev_info *bdi = mapping->backing_dev_info;
495 493
496 for (;;) { 494 for (;;) {
@@ -501,18 +499,11 @@ static void balance_dirty_pages(struct address_space *mapping,
501 .range_cyclic = 1, 499 .range_cyclic = 1,
502 }; 500 };
503 501
504 get_dirty_limits(&background_thresh, &dirty_thresh,
505 &bdi_thresh, bdi);
506
507 nr_reclaimable = global_page_state(NR_FILE_DIRTY) + 502 nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
508 global_page_state(NR_UNSTABLE_NFS); 503 global_page_state(NR_UNSTABLE_NFS);
509 nr_writeback = global_page_state(NR_WRITEBACK); 504 nr_writeback = global_page_state(NR_WRITEBACK);
510 505
511 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 506 global_dirty_limits(&background_thresh, &dirty_thresh);
512 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
513
514 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
515 break;
516 507
517 /* 508 /*
518 * Throttle it only when the background writeback cannot 509 * Throttle it only when the background writeback cannot
@@ -523,24 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping,
523 (background_thresh + dirty_thresh) / 2) 514 (background_thresh + dirty_thresh) / 2)
524 break; 515 break;
525 516
526 if (!bdi->dirty_exceeded) 517 bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
527 bdi->dirty_exceeded = 1; 518 bdi_thresh = task_dirty_limit(current, bdi_thresh);
528
529 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
530 * Unstable writes are a feature of certain networked
531 * filesystems (i.e. NFS) in which data may have been
532 * written to the server's write cache, but has not yet
533 * been flushed to permanent storage.
534 * Only move pages to writeback if this bdi is over its
535 * threshold otherwise wait until the disk writes catch
536 * up.
537 */
538 if (bdi_nr_reclaimable > bdi_thresh) {
539 writeback_inodes_wb(&bdi->wb, &wbc);
540 pages_written += write_chunk - wbc.nr_to_write;
541 get_dirty_limits(&background_thresh, &dirty_thresh,
542 &bdi_thresh, bdi);
543 }
544 519
545 /* 520 /*
546 * In order to avoid the stacked BDI deadlock we need 521 * In order to avoid the stacked BDI deadlock we need
@@ -555,16 +530,45 @@ static void balance_dirty_pages(struct address_space *mapping,
555 if (bdi_thresh < 2*bdi_stat_error(bdi)) { 530 if (bdi_thresh < 2*bdi_stat_error(bdi)) {
556 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); 531 bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
557 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); 532 bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
558 } else if (bdi_nr_reclaimable) { 533 } else {
559 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); 534 bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
560 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); 535 bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
561 } 536 }
562 537
563 if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) 538 /*
 539 * The bdi thresh is somewhat of a "soft" limit, derived from the
 540 * global "hard" limit. The former helps to prevent a heavy-IO
 541 * bdi or process from holding back light ones; the latter is
 542 * the last-resort safeguard.
543 */
544 dirty_exceeded =
545 (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
546 || (nr_reclaimable + nr_writeback >= dirty_thresh);
547
548 if (!dirty_exceeded)
564 break; 549 break;
565 if (pages_written >= write_chunk)
566 break; /* We've done our duty */
567 550
551 if (!bdi->dirty_exceeded)
552 bdi->dirty_exceeded = 1;
553
554 /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
555 * Unstable writes are a feature of certain networked
556 * filesystems (i.e. NFS) in which data may have been
557 * written to the server's write cache, but has not yet
558 * been flushed to permanent storage.
559 * Only move pages to writeback if this bdi is over its
560 * threshold otherwise wait until the disk writes catch
561 * up.
562 */
563 trace_wbc_balance_dirty_start(&wbc, bdi);
564 if (bdi_nr_reclaimable > bdi_thresh) {
565 writeback_inodes_wb(&bdi->wb, &wbc);
566 pages_written += write_chunk - wbc.nr_to_write;
567 trace_wbc_balance_dirty_written(&wbc, bdi);
568 if (pages_written >= write_chunk)
569 break; /* We've done our duty */
570 }
571 trace_wbc_balance_dirty_wait(&wbc, bdi);
568 __set_current_state(TASK_INTERRUPTIBLE); 572 __set_current_state(TASK_INTERRUPTIBLE);
569 io_schedule_timeout(pause); 573 io_schedule_timeout(pause);
570 574
@@ -577,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping,
577 pause = HZ / 10; 581 pause = HZ / 10;
578 } 582 }
579 583
580 if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && 584 if (!dirty_exceeded && bdi->dirty_exceeded)
581 bdi->dirty_exceeded)
582 bdi->dirty_exceeded = 0; 585 bdi->dirty_exceeded = 0;
583 586
584 if (writeback_in_progress(bdi)) 587 if (writeback_in_progress(bdi))
@@ -593,9 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping,
593 * background_thresh, to keep the amount of dirty memory low. 596 * background_thresh, to keep the amount of dirty memory low.
594 */ 597 */
595 if ((laptop_mode && pages_written) || 598 if ((laptop_mode && pages_written) ||
596 (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) 599 (!laptop_mode && (nr_reclaimable > background_thresh)))
597 + global_page_state(NR_UNSTABLE_NFS))
598 > background_thresh)))
599 bdi_start_background_writeback(bdi); 600 bdi_start_background_writeback(bdi);
600} 601}
601 602
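
The reworked loop above now recomputes the limits with global_dirty_limits()/bdi_dirty_limit() on each pass and keeps throttling while the task is over either the per-bdi "soft" threshold or the global "hard" one. A rough userspace sketch of that decision, assuming a simplified bdi_dirty_limit() in which a fixed fraction stands in for the kernel's measured per-bdi writeout share:

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified model of the new bdi_dirty_limit(): take this bdi's share of
 * the global limit (a plain fraction stands in for the kernel's measured
 * writeout fraction), then apply the min/max ratio floor and cap.
 */
static unsigned long bdi_dirty_limit(unsigned long dirty,
                                     unsigned long share_num, unsigned long share_den,
                                     unsigned int min_ratio, unsigned int max_ratio)
{
        unsigned long bdi_dirty = dirty * share_num / share_den;

        bdi_dirty += dirty * min_ratio / 100;
        if (bdi_dirty > dirty * max_ratio / 100)
                bdi_dirty = dirty * max_ratio / 100;
        return bdi_dirty;
}

/* Keep throttling while over the per-bdi soft limit or the global hard limit. */
static bool dirty_exceeded(unsigned long bdi_dirty, unsigned long bdi_thresh,
                           unsigned long global_dirty, unsigned long dirty_thresh)
{
        return bdi_dirty >= bdi_thresh || global_dirty >= dirty_thresh;
}

int main(void)
{
        unsigned long dirty_thresh = 10000;     /* global "hard" limit, in pages */
        unsigned long bdi_thresh = bdi_dirty_limit(dirty_thresh, 1, 4, 5, 20);

        printf("bdi_thresh = %lu\n", bdi_thresh);
        printf("throttle?  = %d\n",
               dirty_exceeded(2500, bdi_thresh, 6000, dirty_thresh));
        return 0;
}
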
@@ -659,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
659 unsigned long dirty_thresh; 660 unsigned long dirty_thresh;
660 661
661 for ( ; ; ) { 662 for ( ; ; ) {
662 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 663 global_dirty_limits(&background_thresh, &dirty_thresh);
663 664
664 /* 665 /*
665 * Boost the allowable dirty threshold a bit for page 666 * Boost the allowable dirty threshold a bit for page
@@ -805,6 +806,42 @@ void __init page_writeback_init(void)
805} 806}
806 807
807/** 808/**
809 * tag_pages_for_writeback - tag pages to be written by write_cache_pages
810 * @mapping: address space structure to write
811 * @start: starting page index
812 * @end: ending page index (inclusive)
813 *
814 * This function scans the page range from @start to @end (inclusive) and tags
815 * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is
816 * that write_cache_pages (or whoever calls this function) will then use
817 * TOWRITE tag to identify pages eligible for writeback. This mechanism is
818 * used to avoid livelocking of writeback by a process steadily creating new
819 * dirty pages in the file (thus it is important for this function to be quick
820 * so that it can tag pages faster than a dirtying process can create them).
821 */
822/*
823 * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
824 */
825void tag_pages_for_writeback(struct address_space *mapping,
826 pgoff_t start, pgoff_t end)
827{
828#define WRITEBACK_TAG_BATCH 4096
829 unsigned long tagged;
830
831 do {
832 spin_lock_irq(&mapping->tree_lock);
833 tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree,
834 &start, end, WRITEBACK_TAG_BATCH,
835 PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE);
836 spin_unlock_irq(&mapping->tree_lock);
837 WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
838 cond_resched();
839 /* We check 'start' to handle wrapping when end == ~0UL */
840 } while (tagged >= WRITEBACK_TAG_BATCH && start);
841}
842EXPORT_SYMBOL(tag_pages_for_writeback);
843
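
As the comment above says, tag_pages_for_writeback() works in WRITEBACK_TAG_BATCH chunks so that mapping->tree_lock is dropped and cond_resched() runs between batches. The same cursor-and-batch pattern, reduced to a hedged userspace sketch over a flat array of per-page flag words (the radix tree, its tags and the locking are only stand-ins here):

#include <stdio.h>

#define NPAGES    20
#define TAG_BATCH 4            /* stand-in for WRITEBACK_TAG_BATCH */
#define DIRTY     0x1
#define TOWRITE   0x2

static unsigned int flags[NPAGES];

/* Tag up to 'batch' DIRTY pages in [*start, end] with TOWRITE; advance *start. */
static unsigned long tag_batch(unsigned long *start, unsigned long end,
                               unsigned long batch)
{
        unsigned long tagged = 0;

        /* the kernel holds mapping->tree_lock around this loop */
        while (*start <= end && tagged < batch) {
                if (flags[*start] & DIRTY) {
                        flags[*start] |= TOWRITE;
                        tagged++;
                }
                (*start)++;
        }
        return tagged;
}

int main(void)
{
        unsigned long start = 0, end = NPAGES - 1, tagged;

        for (int i = 0; i < NPAGES; i += 3)
                flags[i] |= DIRTY;              /* some dirty pages */

        do {
                tagged = tag_batch(&start, end, TAG_BATCH);
                /* lock dropped here; the kernel also calls cond_resched() */
                printf("tagged %lu, cursor now %lu\n", tagged, start);
        } while (tagged >= TAG_BATCH && start); /* 'start' guards the ~0UL wrap */

        return 0;
}
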
844/**
808 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. 845 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
809 * @mapping: address space structure to write 846 * @mapping: address space structure to write
810 * @wbc: subtract the number of written pages from *@wbc->nr_to_write 847 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
@@ -818,6 +855,13 @@ void __init page_writeback_init(void)
818 * the call was made get new I/O started against them. If wbc->sync_mode is 855 * the call was made get new I/O started against them. If wbc->sync_mode is
819 * WB_SYNC_ALL then we were called for data integrity and we must wait for 856 * WB_SYNC_ALL then we were called for data integrity and we must wait for
820 * existing IO to complete. 857 * existing IO to complete.
858 *
859 * To avoid livelocks (when other process dirties new pages), we first tag
860 * pages which should be written back with TOWRITE tag and only then start
861 * writing them. For data-integrity sync we have to be careful so that we do
862 * not miss some pages (e.g., because some other process has cleared TOWRITE
863 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
864 * by the process clearing the DIRTY tag (and submitting the page for IO).
821 */ 865 */
822int write_cache_pages(struct address_space *mapping, 866int write_cache_pages(struct address_space *mapping,
823 struct writeback_control *wbc, writepage_t writepage, 867 struct writeback_control *wbc, writepage_t writepage,
@@ -833,6 +877,7 @@ int write_cache_pages(struct address_space *mapping,
833 pgoff_t done_index; 877 pgoff_t done_index;
834 int cycled; 878 int cycled;
835 int range_whole = 0; 879 int range_whole = 0;
880 int tag;
836 881
837 pagevec_init(&pvec, 0); 882 pagevec_init(&pvec, 0);
838 if (wbc->range_cyclic) { 883 if (wbc->range_cyclic) {
@@ -849,29 +894,19 @@ int write_cache_pages(struct address_space *mapping,
849 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 894 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
850 range_whole = 1; 895 range_whole = 1;
851 cycled = 1; /* ignore range_cyclic tests */ 896 cycled = 1; /* ignore range_cyclic tests */
852
853 /*
854 * If this is a data integrity sync, cap the writeback to the
855 * current end of file. Any extension to the file that occurs
856 * after this is a new write and we don't need to write those
857 * pages out to fulfil our data integrity requirements. If we
858 * try to write them out, we can get stuck in this scan until
859 * the concurrent writer stops adding dirty pages and extending
860 * EOF.
861 */
862 if (wbc->sync_mode == WB_SYNC_ALL &&
863 wbc->range_end == LLONG_MAX) {
864 end = i_size_read(mapping->host) >> PAGE_CACHE_SHIFT;
865 }
866 } 897 }
867 898 if (wbc->sync_mode == WB_SYNC_ALL)
899 tag = PAGECACHE_TAG_TOWRITE;
900 else
901 tag = PAGECACHE_TAG_DIRTY;
868retry: 902retry:
903 if (wbc->sync_mode == WB_SYNC_ALL)
904 tag_pages_for_writeback(mapping, index, end);
869 done_index = index; 905 done_index = index;
870 while (!done && (index <= end)) { 906 while (!done && (index <= end)) {
871 int i; 907 int i;
872 908
873 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 909 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
874 PAGECACHE_TAG_DIRTY,
875 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 910 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
876 if (nr_pages == 0) 911 if (nr_pages == 0)
877 break; 912 break;
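
The retry path above now tags the whole range first (for WB_SYNC_ALL) and then looks pages up by the TOWRITE tag, so pages dirtied after the sync started are never tagged and cannot livelock the walk. A minimal model of the two phases, again with plain flag words standing in for page-cache tags:

#include <stdio.h>

#define NPAGES  8
#define DIRTY   0x1
#define TOWRITE 0x2

static unsigned int flags[NPAGES];

static void tag_pages_for_writeback(void)
{
        for (int i = 0; i < NPAGES; i++)
                if (flags[i] & DIRTY)
                        flags[i] |= TOWRITE;
}

static void write_cache_pages_sync(void)
{
        for (int i = 0; i < NPAGES; i++) {
                if (!(flags[i] & TOWRITE))
                        continue;
                /*
                 * clear_page_dirty_for_io() + writepage(): only the writer
                 * clears TOWRITE, together with DIRTY.
                 */
                flags[i] &= ~(DIRTY | TOWRITE);
                printf("wrote page %d\n", i);
        }
}

int main(void)
{
        flags[1] = flags[3] = DIRTY;

        tag_pages_for_writeback();   /* phase 1: snapshot the dirty set */
        flags[5] = DIRTY;            /* concurrent dirtier: never tagged, so the
                                        sync below terminates without it */
        write_cache_pages_sync();    /* phase 2: write only TOWRITE pages */
        return 0;
}
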
@@ -929,6 +964,7 @@ continue_unlock:
929 if (!clear_page_dirty_for_io(page)) 964 if (!clear_page_dirty_for_io(page))
930 goto continue_unlock; 965 goto continue_unlock;
931 966
967 trace_wbc_writepage(wbc, mapping->backing_dev_info);
932 ret = (*writepage)(page, wbc, data); 968 ret = (*writepage)(page, wbc, data);
933 if (unlikely(ret)) { 969 if (unlikely(ret)) {
934 if (ret == AOP_WRITEPAGE_ACTIVATE) { 970 if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -1327,6 +1363,9 @@ int test_set_page_writeback(struct page *page)
1327 radix_tree_tag_clear(&mapping->page_tree, 1363 radix_tree_tag_clear(&mapping->page_tree,
1328 page_index(page), 1364 page_index(page),
1329 PAGECACHE_TAG_DIRTY); 1365 PAGECACHE_TAG_DIRTY);
1366 radix_tree_tag_clear(&mapping->page_tree,
1367 page_index(page),
1368 PAGECACHE_TAG_TOWRITE);
1330 spin_unlock_irqrestore(&mapping->tree_lock, flags); 1369 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1331 } else { 1370 } else {
1332 ret = TestSetPageWriteback(page); 1371 ret = TestSetPageWriteback(page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9bd339eb04c6..a9649f4b261e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1738,7 +1738,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1738 struct page *page; 1738 struct page *page;
1739 1739
1740 /* Acquire the OOM killer lock for the zones in zonelist */ 1740 /* Acquire the OOM killer lock for the zones in zonelist */
1741 if (!try_set_zone_oom(zonelist, gfp_mask)) { 1741 if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
1742 schedule_timeout_uninterruptible(1); 1742 schedule_timeout_uninterruptible(1);
1743 return NULL; 1743 return NULL;
1744 } 1744 }
@@ -1759,6 +1759,9 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
1759 /* The OOM killer will not help higher order allocs */ 1759 /* The OOM killer will not help higher order allocs */
1760 if (order > PAGE_ALLOC_COSTLY_ORDER) 1760 if (order > PAGE_ALLOC_COSTLY_ORDER)
1761 goto out; 1761 goto out;
1762 /* The OOM killer does not needlessly kill tasks for lowmem */
1763 if (high_zoneidx < ZONE_NORMAL)
1764 goto out;
1762 /* 1765 /*
1763 * GFP_THISNODE contains __GFP_NORETRY and we never hit this. 1766 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
1764 * Sanity check for bare calls of __GFP_THISNODE, not real OOM. 1767 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -2052,15 +2055,23 @@ rebalance:
2052 if (page) 2055 if (page)
2053 goto got_pg; 2056 goto got_pg;
2054 2057
2055 /* 2058 if (!(gfp_mask & __GFP_NOFAIL)) {
2056 * The OOM killer does not trigger for high-order 2059 /*
2057 * ~__GFP_NOFAIL allocations so if no progress is being 2060 * The oom killer is not called for high-order
2058 * made, there are no other options and retrying is 2061 * allocations that may fail, so if no progress
2059 * unlikely to help. 2062 * is being made, there are no other options and
2060 */ 2063 * retrying is unlikely to help.
2061 if (order > PAGE_ALLOC_COSTLY_ORDER && 2064 */
2062 !(gfp_mask & __GFP_NOFAIL)) 2065 if (order > PAGE_ALLOC_COSTLY_ORDER)
2063 goto nopage; 2066 goto nopage;
2067 /*
2068 * The oom killer is not called for lowmem
2069 * allocations to prevent needlessly killing
2070 * innocent tasks.
2071 */
2072 if (high_zoneidx < ZONE_NORMAL)
2073 goto nopage;
2074 }
2064 2075
2065 goto restart; 2076 goto restart;
2066 } 2077 }
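
Both hunks add the same gate in two places: falling back to the OOM killer only makes sense for requests that killing a task could actually satisfy, so costly high-order allocations and lowmem (below ZONE_NORMAL) allocations bail out instead. A compact sketch of that policy, with a simplified zone enum and an assumed PAGE_ALLOC_COSTLY_ORDER of 3:

#include <stdbool.h>
#include <stdio.h>

enum zone_type { ZONE_DMA, ZONE_DMA32, ZONE_NORMAL, ZONE_HIGHMEM };

#define PAGE_ALLOC_COSTLY_ORDER 3

/* Should a failed allocation fall back to the OOM killer? */
static bool oom_worth_trying(unsigned int order, enum zone_type high_zoneidx,
                             bool nofail)
{
        if (nofail)
                return true;    /* __GFP_NOFAIL must keep trying something */
        if (order > PAGE_ALLOC_COSTLY_ORDER)
                return false;   /* killing tasks rarely frees large contiguous blocks */
        if (high_zoneidx < ZONE_NORMAL)
                return false;   /* don't kill tasks over lowmem-only pressure */
        return true;
}

int main(void)
{
        printf("order-0 ZONE_NORMAL : %d\n", oom_worth_trying(0, ZONE_NORMAL, false));
        printf("order-4 ZONE_NORMAL : %d\n", oom_worth_trying(4, ZONE_NORMAL, false));
        printf("order-0 ZONE_DMA    : %d\n", oom_worth_trying(0, ZONE_DMA, false));
        return 0;
}
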
@@ -4089,8 +4100,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4089 zone_seqlock_init(zone); 4100 zone_seqlock_init(zone);
4090 zone->zone_pgdat = pgdat; 4101 zone->zone_pgdat = pgdat;
4091 4102
4092 zone->prev_priority = DEF_PRIORITY;
4093
4094 zone_pcp_init(zone); 4103 zone_pcp_init(zone);
4095 for_each_lru(l) { 4104 for_each_lru(l) {
4096 INIT_LIST_HEAD(&zone->lru[l].list); 4105 INIT_LIST_HEAD(&zone->lru[l].list);
diff --git a/mm/page_io.c b/mm/page_io.c
index 31a3b962230a..2dee975bf469 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -106,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
106 goto out; 106 goto out;
107 } 107 }
108 if (wbc->sync_mode == WB_SYNC_ALL) 108 if (wbc->sync_mode == WB_SYNC_ALL)
109 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 109 rw |= REQ_SYNC | REQ_UNPLUG;
110 count_vm_event(PSWPOUT); 110 count_vm_event(PSWPOUT);
111 set_page_writeback(page); 111 set_page_writeback(page);
112 unlock_page(page); 112 unlock_page(page);
diff --git a/mm/rmap.c b/mm/rmap.c
index 38a336e2eea1..87b9e8ad4509 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
56#include <linux/memcontrol.h> 56#include <linux/memcontrol.h>
57#include <linux/mmu_notifier.h> 57#include <linux/mmu_notifier.h>
58#include <linux/migrate.h> 58#include <linux/migrate.h>
59#include <linux/hugetlb.h>
59 60
60#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
61 62
@@ -132,9 +133,14 @@ int anon_vma_prepare(struct vm_area_struct *vma)
132 if (unlikely(!anon_vma)) 133 if (unlikely(!anon_vma))
133 goto out_enomem_free_avc; 134 goto out_enomem_free_avc;
134 allocated = anon_vma; 135 allocated = anon_vma;
136 /*
137 * This VMA had no anon_vma yet. This anon_vma is
138 * the root of any anon_vma tree that might form.
139 */
140 anon_vma->root = anon_vma;
135 } 141 }
136 142
137 spin_lock(&anon_vma->lock); 143 anon_vma_lock(anon_vma);
138 /* page_table_lock to protect against threads */ 144 /* page_table_lock to protect against threads */
139 spin_lock(&mm->page_table_lock); 145 spin_lock(&mm->page_table_lock);
140 if (likely(!vma->anon_vma)) { 146 if (likely(!vma->anon_vma)) {
@@ -142,12 +148,12 @@ int anon_vma_prepare(struct vm_area_struct *vma)
142 avc->anon_vma = anon_vma; 148 avc->anon_vma = anon_vma;
143 avc->vma = vma; 149 avc->vma = vma;
144 list_add(&avc->same_vma, &vma->anon_vma_chain); 150 list_add(&avc->same_vma, &vma->anon_vma_chain);
145 list_add(&avc->same_anon_vma, &anon_vma->head); 151 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
146 allocated = NULL; 152 allocated = NULL;
147 avc = NULL; 153 avc = NULL;
148 } 154 }
149 spin_unlock(&mm->page_table_lock); 155 spin_unlock(&mm->page_table_lock);
150 spin_unlock(&anon_vma->lock); 156 anon_vma_unlock(anon_vma);
151 157
152 if (unlikely(allocated)) 158 if (unlikely(allocated))
153 anon_vma_free(allocated); 159 anon_vma_free(allocated);
@@ -170,9 +176,9 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
170 avc->anon_vma = anon_vma; 176 avc->anon_vma = anon_vma;
171 list_add(&avc->same_vma, &vma->anon_vma_chain); 177 list_add(&avc->same_vma, &vma->anon_vma_chain);
172 178
173 spin_lock(&anon_vma->lock); 179 anon_vma_lock(anon_vma);
174 list_add_tail(&avc->same_anon_vma, &anon_vma->head); 180 list_add_tail(&avc->same_anon_vma, &anon_vma->head);
175 spin_unlock(&anon_vma->lock); 181 anon_vma_unlock(anon_vma);
176} 182}
177 183
178/* 184/*
@@ -224,9 +230,21 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
224 avc = anon_vma_chain_alloc(); 230 avc = anon_vma_chain_alloc();
225 if (!avc) 231 if (!avc)
226 goto out_error_free_anon_vma; 232 goto out_error_free_anon_vma;
227 anon_vma_chain_link(vma, avc, anon_vma); 233
234 /*
235 * The root anon_vma's spinlock is the lock actually used when we
236 * lock any of the anon_vmas in this anon_vma tree.
237 */
238 anon_vma->root = pvma->anon_vma->root;
239 /*
240 * With KSM refcounts, an anon_vma can stay around longer than the
241 * process it belongs to. The root anon_vma needs to be pinned
242 * until this anon_vma is freed, because the lock lives in the root.
243 */
244 get_anon_vma(anon_vma->root);
228 /* Mark this anon_vma as the one where our new (COWed) pages go. */ 245 /* Mark this anon_vma as the one where our new (COWed) pages go. */
229 vma->anon_vma = anon_vma; 246 vma->anon_vma = anon_vma;
247 anon_vma_chain_link(vma, avc, anon_vma);
230 248
231 return 0; 249 return 0;
232 250
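
anon_vma_fork() above points every new anon_vma at the root of its tree: anon_vma_lock() really takes the root's spinlock, and the root is reference-pinned so that lock cannot disappear while descendants still use it. A hedged userspace sketch of this "lock lives in the root" pattern, with a pthread mutex and a plain (non-atomic) refcount as stand-ins:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct anon_vma {
        struct anon_vma *root;    /* points to self for the root */
        pthread_mutex_t lock;     /* only the root's lock is ever taken */
        int refcount;             /* pins the root while children exist */
};

static struct anon_vma *anon_vma_alloc(struct anon_vma *parent)
{
        struct anon_vma *av = calloc(1, sizeof(*av));

        if (!av)
                return NULL;
        if (!parent) {
                av->root = av;                  /* new tree: we are the root */
                pthread_mutex_init(&av->lock, NULL);
        } else {
                av->root = parent->root;        /* share the root's lock */
                av->root->refcount++;           /* pin the root */
        }
        av->refcount++;
        return av;
}

static void anon_vma_lock(struct anon_vma *av)   { pthread_mutex_lock(&av->root->lock); }
static void anon_vma_unlock(struct anon_vma *av) { pthread_mutex_unlock(&av->root->lock); }

int main(void)
{
        struct anon_vma *root = anon_vma_alloc(NULL);
        struct anon_vma *child = anon_vma_alloc(root);

        anon_vma_lock(child);         /* actually takes root->lock */
        anon_vma_unlock(child);
        printf("root refcount = %d\n", root->refcount);
        free(child);
        free(root);
        return 0;
}
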
@@ -246,22 +264,29 @@ static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain)
246 if (!anon_vma) 264 if (!anon_vma)
247 return; 265 return;
248 266
249 spin_lock(&anon_vma->lock); 267 anon_vma_lock(anon_vma);
250 list_del(&anon_vma_chain->same_anon_vma); 268 list_del(&anon_vma_chain->same_anon_vma);
251 269
252 /* We must garbage collect the anon_vma if it's empty */ 270 /* We must garbage collect the anon_vma if it's empty */
253 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); 271 empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma);
254 spin_unlock(&anon_vma->lock); 272 anon_vma_unlock(anon_vma);
255 273
256 if (empty) 274 if (empty) {
275 /* We no longer need the root anon_vma */
276 if (anon_vma->root != anon_vma)
277 drop_anon_vma(anon_vma->root);
257 anon_vma_free(anon_vma); 278 anon_vma_free(anon_vma);
279 }
258} 280}
259 281
260void unlink_anon_vmas(struct vm_area_struct *vma) 282void unlink_anon_vmas(struct vm_area_struct *vma)
261{ 283{
262 struct anon_vma_chain *avc, *next; 284 struct anon_vma_chain *avc, *next;
263 285
264 /* Unlink each anon_vma chained to the VMA. */ 286 /*
287 * Unlink each anon_vma chained to the VMA. This list is ordered
288 * from newest to oldest, ensuring the root anon_vma gets freed last.
289 */
265 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { 290 list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
266 anon_vma_unlink(avc); 291 anon_vma_unlink(avc);
267 list_del(&avc->same_vma); 292 list_del(&avc->same_vma);
@@ -302,7 +327,7 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
302 goto out; 327 goto out;
303 328
304 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); 329 anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
305 spin_lock(&anon_vma->lock); 330 anon_vma_lock(anon_vma);
306 return anon_vma; 331 return anon_vma;
307out: 332out:
308 rcu_read_unlock(); 333 rcu_read_unlock();
@@ -311,7 +336,7 @@ out:
311 336
312void page_unlock_anon_vma(struct anon_vma *anon_vma) 337void page_unlock_anon_vma(struct anon_vma *anon_vma)
313{ 338{
314 spin_unlock(&anon_vma->lock); 339 anon_vma_unlock(anon_vma);
315 rcu_read_unlock(); 340 rcu_read_unlock();
316} 341}
317 342
@@ -326,6 +351,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
326 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 351 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
327 unsigned long address; 352 unsigned long address;
328 353
354 if (unlikely(is_vm_hugetlb_page(vma)))
355 pgoff = page->index << huge_page_order(page_hstate(page));
329 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); 356 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
330 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { 357 if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
331 /* page should be within @vma mapping range */ 358 /* page should be within @vma mapping range */
@@ -340,9 +367,10 @@ vma_address(struct page *page, struct vm_area_struct *vma)
340 */ 367 */
341unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) 368unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
342{ 369{
343 if (PageAnon(page)) 370 if (PageAnon(page)) {
344 ; 371 if (vma->anon_vma->root != page_anon_vma(page)->root)
345 else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { 372 return -EFAULT;
373 } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
346 if (!vma->vm_file || 374 if (!vma->vm_file ||
347 vma->vm_file->f_mapping != page->mapping) 375 vma->vm_file->f_mapping != page->mapping)
348 return -EFAULT; 376 return -EFAULT;
@@ -369,6 +397,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
369 pte_t *pte; 397 pte_t *pte;
370 spinlock_t *ptl; 398 spinlock_t *ptl;
371 399
400 if (unlikely(PageHuge(page))) {
401 pte = huge_pte_offset(mm, address);
402 ptl = &mm->page_table_lock;
403 goto check;
404 }
405
372 pgd = pgd_offset(mm, address); 406 pgd = pgd_offset(mm, address);
373 if (!pgd_present(*pgd)) 407 if (!pgd_present(*pgd))
374 return NULL; 408 return NULL;
@@ -389,6 +423,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
389 } 423 }
390 424
391 ptl = pte_lockptr(mm, pmd); 425 ptl = pte_lockptr(mm, pmd);
426check:
392 spin_lock(ptl); 427 spin_lock(ptl);
393 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { 428 if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
394 *ptlp = ptl; 429 *ptlp = ptl;
@@ -743,14 +778,20 @@ static void __page_set_anon_rmap(struct page *page,
743 * If the page isn't exclusively mapped into this vma, 778 * If the page isn't exclusively mapped into this vma,
744 * we must use the _oldest_ possible anon_vma for the 779 * we must use the _oldest_ possible anon_vma for the
745 * page mapping! 780 * page mapping!
746 *
747 * So take the last AVC chain entry in the vma, which is
748 * the deepest ancestor, and use the anon_vma from that.
749 */ 781 */
750 if (!exclusive) { 782 if (!exclusive) {
751 struct anon_vma_chain *avc; 783 if (PageAnon(page))
752 avc = list_entry(vma->anon_vma_chain.prev, struct anon_vma_chain, same_vma); 784 return;
753 anon_vma = avc->anon_vma; 785 anon_vma = anon_vma->root;
786 } else {
787 /*
788 * In this case, swapped-out-but-not-discarded swap-cache
789 * is remapped. So, no need to update page->mapping here.
 790 * We can be sure the anon_vma pointed to by page->mapping is not
 791 * obsolete, because vma->anon_vma must belong to the same family.
792 */
793 if (PageAnon(page))
794 return;
754 } 795 }
755 796
756 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; 797 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
@@ -780,6 +821,7 @@ static void __page_check_anon_rmap(struct page *page,
780 * are initially only visible via the pagetables, and the pte is locked 821 * are initially only visible via the pagetables, and the pte is locked
781 * over the call to page_add_new_anon_rmap. 822 * over the call to page_add_new_anon_rmap.
782 */ 823 */
824 BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
783 BUG_ON(page->index != linear_page_index(vma, address)); 825 BUG_ON(page->index != linear_page_index(vma, address));
784#endif 826#endif
785} 827}
@@ -798,6 +840,17 @@ static void __page_check_anon_rmap(struct page *page,
798void page_add_anon_rmap(struct page *page, 840void page_add_anon_rmap(struct page *page,
799 struct vm_area_struct *vma, unsigned long address) 841 struct vm_area_struct *vma, unsigned long address)
800{ 842{
843 do_page_add_anon_rmap(page, vma, address, 0);
844}
845
846/*
847 * Special version of the above for do_swap_page, which often runs
848 * into pages that are exclusively owned by the current process.
849 * Everybody else should continue to use page_add_anon_rmap above.
850 */
851void do_page_add_anon_rmap(struct page *page,
852 struct vm_area_struct *vma, unsigned long address, int exclusive)
853{
801 int first = atomic_inc_and_test(&page->_mapcount); 854 int first = atomic_inc_and_test(&page->_mapcount);
802 if (first) 855 if (first)
803 __inc_zone_page_state(page, NR_ANON_PAGES); 856 __inc_zone_page_state(page, NR_ANON_PAGES);
@@ -807,7 +860,7 @@ void page_add_anon_rmap(struct page *page,
807 VM_BUG_ON(!PageLocked(page)); 860 VM_BUG_ON(!PageLocked(page));
808 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); 861 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
809 if (first) 862 if (first)
810 __page_set_anon_rmap(page, vma, address, 0); 863 __page_set_anon_rmap(page, vma, address, exclusive);
811 else 864 else
812 __page_check_anon_rmap(page, vma, address); 865 __page_check_anon_rmap(page, vma, address);
813} 866}
@@ -873,6 +926,12 @@ void page_remove_rmap(struct page *page)
873 page_clear_dirty(page); 926 page_clear_dirty(page);
874 set_page_dirty(page); 927 set_page_dirty(page);
875 } 928 }
929 /*
930 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
931 * and not charged by memcg for now.
932 */
933 if (unlikely(PageHuge(page)))
934 return;
876 if (PageAnon(page)) { 935 if (PageAnon(page)) {
877 mem_cgroup_uncharge_page(page); 936 mem_cgroup_uncharge_page(page);
878 __dec_zone_page_state(page, NR_ANON_PAGES); 937 __dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1368,6 +1427,42 @@ int try_to_munlock(struct page *page)
1368 return try_to_unmap_file(page, TTU_MUNLOCK); 1427 return try_to_unmap_file(page, TTU_MUNLOCK);
1369} 1428}
1370 1429
1430#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION)
1431/*
1432 * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root
1433 * if necessary. Be careful to do all the tests under the lock. Once
1434 * we know we are the last user, nobody else can get a reference and we
1435 * can do the freeing without the lock.
1436 */
1437void drop_anon_vma(struct anon_vma *anon_vma)
1438{
1439 BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0);
1440 if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) {
1441 struct anon_vma *root = anon_vma->root;
1442 int empty = list_empty(&anon_vma->head);
1443 int last_root_user = 0;
1444 int root_empty = 0;
1445
1446 /*
1447 * The refcount on a non-root anon_vma got dropped. Drop
1448 * the refcount on the root and check if we need to free it.
1449 */
1450 if (empty && anon_vma != root) {
1451 BUG_ON(atomic_read(&root->external_refcount) <= 0);
1452 last_root_user = atomic_dec_and_test(&root->external_refcount);
1453 root_empty = list_empty(&root->head);
1454 }
1455 anon_vma_unlock(anon_vma);
1456
1457 if (empty) {
1458 anon_vma_free(anon_vma);
1459 if (root_empty && last_root_user)
1460 anon_vma_free(root);
1461 }
1462 }
1463}
1464#endif
1465
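
drop_anon_vma() above leans on atomic_dec_and_lock(): references are dropped without the lock, and only the thread that takes the count to zero ends up holding the root's lock for the list_empty() checks and the freeing. A minimal userspace analogue of that primitive, assuming C11 atomics and a pthread mutex (the real helper is a spinlock-based kernel primitive):

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * Drop one reference; if it was the last one, return true with *lock held.
 * The final 1 -> 0 transition happens under the lock, mirroring the kernel's
 * atomic_dec_and_lock() contract.
 */
static bool dec_and_lock(atomic_int *cnt, pthread_mutex_t *lock)
{
        int old = atomic_load(cnt);

        /* fast path: drop a reference without the lock while cnt > 1 */
        while (old > 1) {
                if (atomic_compare_exchange_weak(cnt, &old, old - 1))
                        return false;
        }
        /* slow path: possibly the last reference */
        pthread_mutex_lock(lock);
        if (atomic_fetch_sub(cnt, 1) == 1)
                return true;            /* caller frees, then unlocks */
        pthread_mutex_unlock(lock);
        return false;
}

int main(void)
{
        pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
        atomic_int refs = 2;

        (void)dec_and_lock(&refs, &lock);       /* 2 -> 1: false, lock not taken */
        if (dec_and_lock(&refs, &lock)) {       /* 1 -> 0: true, lock held */
                printf("last reference dropped; free under the root lock\n");
                pthread_mutex_unlock(&lock);
        }
        return 0;
}
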
1371#ifdef CONFIG_MIGRATION 1466#ifdef CONFIG_MIGRATION
1372/* 1467/*
1373 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): 1468 * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
@@ -1389,7 +1484,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1389 anon_vma = page_anon_vma(page); 1484 anon_vma = page_anon_vma(page);
1390 if (!anon_vma) 1485 if (!anon_vma)
1391 return ret; 1486 return ret;
1392 spin_lock(&anon_vma->lock); 1487 anon_vma_lock(anon_vma);
1393 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { 1488 list_for_each_entry(avc, &anon_vma->head, same_anon_vma) {
1394 struct vm_area_struct *vma = avc->vma; 1489 struct vm_area_struct *vma = avc->vma;
1395 unsigned long address = vma_address(page, vma); 1490 unsigned long address = vma_address(page, vma);
@@ -1399,7 +1494,7 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
1399 if (ret != SWAP_AGAIN) 1494 if (ret != SWAP_AGAIN)
1400 break; 1495 break;
1401 } 1496 }
1402 spin_unlock(&anon_vma->lock); 1497 anon_vma_unlock(anon_vma);
1403 return ret; 1498 return ret;
1404} 1499}
1405 1500
@@ -1445,3 +1540,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
1445 return rmap_walk_file(page, rmap_one, arg); 1540 return rmap_walk_file(page, rmap_one, arg);
1446} 1541}
1447#endif /* CONFIG_MIGRATION */ 1542#endif /* CONFIG_MIGRATION */
1543
1544#ifdef CONFIG_HUGETLB_PAGE
1545/*
1546 * The following three functions are for anonymous (private mapped) hugepages.
1547 * Unlike common anonymous pages, anonymous hugepages have no accounting code
1548 * and no lru code, because we handle hugepages differently from common pages.
1549 */
1550static void __hugepage_set_anon_rmap(struct page *page,
1551 struct vm_area_struct *vma, unsigned long address, int exclusive)
1552{
1553 struct anon_vma *anon_vma = vma->anon_vma;
1554 BUG_ON(!anon_vma);
1555 if (!exclusive) {
1556 struct anon_vma_chain *avc;
1557 avc = list_entry(vma->anon_vma_chain.prev,
1558 struct anon_vma_chain, same_vma);
1559 anon_vma = avc->anon_vma;
1560 }
1561 anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
1562 page->mapping = (struct address_space *) anon_vma;
1563 page->index = linear_page_index(vma, address);
1564}
1565
1566void hugepage_add_anon_rmap(struct page *page,
1567 struct vm_area_struct *vma, unsigned long address)
1568{
1569 struct anon_vma *anon_vma = vma->anon_vma;
1570 int first;
1571 BUG_ON(!anon_vma);
1572 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1573 first = atomic_inc_and_test(&page->_mapcount);
1574 if (first)
1575 __hugepage_set_anon_rmap(page, vma, address, 0);
1576}
1577
1578void hugepage_add_new_anon_rmap(struct page *page,
1579 struct vm_area_struct *vma, unsigned long address)
1580{
1581 BUG_ON(address < vma->vm_start || address >= vma->vm_end);
1582 atomic_set(&page->_mapcount, 0);
1583 __hugepage_set_anon_rmap(page, vma, address, 1);
1584}
1585#endif /* CONFIG_HUGETLB_PAGE */
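
The vma_address() change earlier in this file scales page->index by huge_page_order() before the usual linear-mapping arithmetic, because a hugetlb page's index is kept in huge-page units. The arithmetic itself as a small userspace calculation, assuming 4 KiB base pages and 2 MiB huge pages:

#include <stdio.h>

#define PAGE_SHIFT 12UL                          /* 4 KiB base pages (assumed) */

/* address = vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT) */
static unsigned long vma_address(unsigned long vm_start, unsigned long vm_pgoff,
                                 unsigned long pgoff)
{
        return vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT);
}

int main(void)
{
        unsigned long vm_start = 0x600000000000UL;
        unsigned long vm_pgoff = 0;              /* mapping starts at file offset 0 */
        unsigned long huge_order = 9;            /* 2 MiB huge page = 512 base pages */

        /* ordinary page: index is already in base-page units */
        printf("small page 10 -> %#lx\n", vma_address(vm_start, vm_pgoff, 10));

        /* hugetlb page: page->index counts huge pages, so scale it first */
        unsigned long huge_index = 3;
        printf("huge page 3   -> %#lx\n",
               vma_address(vm_start, vm_pgoff, huge_index << huge_order));
        return 0;
}
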
diff --git a/mm/shmem.c b/mm/shmem.c
index f65f84062db5..080b09a57a8f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,6 +28,7 @@
28#include <linux/file.h> 28#include <linux/file.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/percpu_counter.h>
31#include <linux/swap.h> 32#include <linux/swap.h>
32 33
33static struct vfsmount *shm_mnt; 34static struct vfsmount *shm_mnt;
@@ -233,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages)
233{ 234{
234 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); 235 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
235 if (sbinfo->max_blocks) { 236 if (sbinfo->max_blocks) {
236 spin_lock(&sbinfo->stat_lock); 237 percpu_counter_add(&sbinfo->used_blocks, -pages);
237 sbinfo->free_blocks += pages; 238 spin_lock(&inode->i_lock);
238 inode->i_blocks -= pages*BLOCKS_PER_PAGE; 239 inode->i_blocks -= pages*BLOCKS_PER_PAGE;
239 spin_unlock(&sbinfo->stat_lock); 240 spin_unlock(&inode->i_lock);
240 } 241 }
241} 242}
242 243
@@ -416,19 +417,17 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
416 if (sgp == SGP_READ) 417 if (sgp == SGP_READ)
417 return shmem_swp_map(ZERO_PAGE(0)); 418 return shmem_swp_map(ZERO_PAGE(0));
418 /* 419 /*
419 * Test free_blocks against 1 not 0, since we have 1 data 420 * Test used_blocks against 1 less max_blocks, since we have 1 data
420 * page (and perhaps indirect index pages) yet to allocate: 421 * page (and perhaps indirect index pages) yet to allocate:
421 * a waste to allocate index if we cannot allocate data. 422 * a waste to allocate index if we cannot allocate data.
422 */ 423 */
423 if (sbinfo->max_blocks) { 424 if (sbinfo->max_blocks) {
424 spin_lock(&sbinfo->stat_lock); 425 if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0)
425 if (sbinfo->free_blocks <= 1) {
426 spin_unlock(&sbinfo->stat_lock);
427 return ERR_PTR(-ENOSPC); 426 return ERR_PTR(-ENOSPC);
428 } 427 percpu_counter_inc(&sbinfo->used_blocks);
429 sbinfo->free_blocks--; 428 spin_lock(&inode->i_lock);
430 inode->i_blocks += BLOCKS_PER_PAGE; 429 inode->i_blocks += BLOCKS_PER_PAGE;
431 spin_unlock(&sbinfo->stat_lock); 430 spin_unlock(&inode->i_lock);
432 } 431 }
433 432
434 spin_unlock(&info->lock); 433 spin_unlock(&info->lock);
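
The shmem hunks replace the stat_lock-protected free_blocks count with a percpu counter of used blocks: callers compare against max_blocks - 1 and then increment, instead of serializing every allocation on one spinlock. A rough model of that check-then-increment pattern, with a single C11 atomic standing in for the percpu counter (which trades exactness for scalability in much the same way):

#include <stdatomic.h>
#include <stdio.h>

static const long max_blocks = 3;       /* tiny limit so the example hits ENOSPC */
static atomic_long used_blocks;         /* stands in for the percpu counter */

/* Reserve one data block, mirroring shmem_swp_alloc()'s ENOSPC check. */
static int shmem_reserve_block(void)
{
        /* leave room for the data page we are about to allocate */
        if (atomic_load(&used_blocks) > max_blocks - 1)
                return -1;              /* -ENOSPC */
        atomic_fetch_add(&used_blocks, 1);
        return 0;
}

static void shmem_free_block(void)
{
        atomic_fetch_sub(&used_blocks, 1);
}

int main(void)
{
        for (int i = 0; i < 5; i++)
                if (shmem_reserve_block())
                        printf("ENOSPC at block %d\n", i);
        printf("used_blocks = %ld\n", atomic_load(&used_blocks));
        shmem_free_block();
        return 0;
}
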
@@ -767,6 +766,10 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
767 loff_t newsize = attr->ia_size; 766 loff_t newsize = attr->ia_size;
768 int error; 767 int error;
769 768
769 error = inode_change_ok(inode, attr);
770 if (error)
771 return error;
772
770 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) 773 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)
771 && newsize != inode->i_size) { 774 && newsize != inode->i_size) {
772 struct page *page = NULL; 775 struct page *page = NULL;
@@ -801,25 +804,22 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
801 } 804 }
802 } 805 }
803 806
804 error = simple_setsize(inode, newsize); 807 /* XXX(truncate): truncate_setsize should be called last */
808 truncate_setsize(inode, newsize);
805 if (page) 809 if (page)
806 page_cache_release(page); 810 page_cache_release(page);
807 if (error)
808 return error;
809 shmem_truncate_range(inode, newsize, (loff_t)-1); 811 shmem_truncate_range(inode, newsize, (loff_t)-1);
810 } 812 }
811 813
812 error = inode_change_ok(inode, attr); 814 setattr_copy(inode, attr);
813 if (!error)
814 generic_setattr(inode, attr);
815#ifdef CONFIG_TMPFS_POSIX_ACL 815#ifdef CONFIG_TMPFS_POSIX_ACL
816 if (!error && (attr->ia_valid & ATTR_MODE)) 816 if (attr->ia_valid & ATTR_MODE)
817 error = generic_acl_chmod(inode); 817 error = generic_acl_chmod(inode);
818#endif 818#endif
819 return error; 819 return error;
820} 820}
821 821
822static void shmem_delete_inode(struct inode *inode) 822static void shmem_evict_inode(struct inode *inode)
823{ 823{
824 struct shmem_inode_info *info = SHMEM_I(inode); 824 struct shmem_inode_info *info = SHMEM_I(inode);
825 825
@@ -836,7 +836,7 @@ static void shmem_delete_inode(struct inode *inode)
836 } 836 }
837 BUG_ON(inode->i_blocks); 837 BUG_ON(inode->i_blocks);
838 shmem_free_inode(inode->i_sb); 838 shmem_free_inode(inode->i_sb);
839 clear_inode(inode); 839 end_writeback(inode);
840} 840}
841 841
842static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) 842static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
@@ -933,7 +933,7 @@ found:
933 933
934 /* 934 /*
935 * Move _head_ to start search for next from here. 935 * Move _head_ to start search for next from here.
936 * But be careful: shmem_delete_inode checks list_empty without taking 936 * But be careful: shmem_evict_inode checks list_empty without taking
937 * mutex, and there's an instant in list_move_tail when info->swaplist 937 * mutex, and there's an instant in list_move_tail when info->swaplist
938 * would appear empty, if it were the only one on shmem_swaplist. We 938 * would appear empty, if it were the only one on shmem_swaplist. We
939 * could avoid doing it if inode NULL; or use this minor optimization. 939 * could avoid doing it if inode NULL; or use this minor optimization.
@@ -1223,6 +1223,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx,
1223 struct shmem_sb_info *sbinfo; 1223 struct shmem_sb_info *sbinfo;
1224 struct page *filepage = *pagep; 1224 struct page *filepage = *pagep;
1225 struct page *swappage; 1225 struct page *swappage;
1226 struct page *prealloc_page = NULL;
1226 swp_entry_t *entry; 1227 swp_entry_t *entry;
1227 swp_entry_t swap; 1228 swp_entry_t swap;
1228 gfp_t gfp; 1229 gfp_t gfp;
@@ -1247,7 +1248,6 @@ repeat:
1247 filepage = find_lock_page(mapping, idx); 1248 filepage = find_lock_page(mapping, idx);
1248 if (filepage && PageUptodate(filepage)) 1249 if (filepage && PageUptodate(filepage))
1249 goto done; 1250 goto done;
1250 error = 0;
1251 gfp = mapping_gfp_mask(mapping); 1251 gfp = mapping_gfp_mask(mapping);
1252 if (!filepage) { 1252 if (!filepage) {
1253 /* 1253 /*
@@ -1258,7 +1258,19 @@ repeat:
1258 if (error) 1258 if (error)
1259 goto failed; 1259 goto failed;
1260 radix_tree_preload_end(); 1260 radix_tree_preload_end();
1261 if (sgp != SGP_READ && !prealloc_page) {
1262 /* We don't care if this fails */
1263 prealloc_page = shmem_alloc_page(gfp, info, idx);
1264 if (prealloc_page) {
1265 if (mem_cgroup_cache_charge(prealloc_page,
1266 current->mm, GFP_KERNEL)) {
1267 page_cache_release(prealloc_page);
1268 prealloc_page = NULL;
1269 }
1270 }
1271 }
1261 } 1272 }
1273 error = 0;
1262 1274
1263 spin_lock(&info->lock); 1275 spin_lock(&info->lock);
1264 shmem_recalc_inode(inode); 1276 shmem_recalc_inode(inode);
@@ -1387,17 +1399,16 @@ repeat:
1387 shmem_swp_unmap(entry); 1399 shmem_swp_unmap(entry);
1388 sbinfo = SHMEM_SB(inode->i_sb); 1400 sbinfo = SHMEM_SB(inode->i_sb);
1389 if (sbinfo->max_blocks) { 1401 if (sbinfo->max_blocks) {
1390 spin_lock(&sbinfo->stat_lock); 1402 if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) ||
1391 if (sbinfo->free_blocks == 0 ||
1392 shmem_acct_block(info->flags)) { 1403 shmem_acct_block(info->flags)) {
1393 spin_unlock(&sbinfo->stat_lock);
1394 spin_unlock(&info->lock); 1404 spin_unlock(&info->lock);
1395 error = -ENOSPC; 1405 error = -ENOSPC;
1396 goto failed; 1406 goto failed;
1397 } 1407 }
1398 sbinfo->free_blocks--; 1408 percpu_counter_inc(&sbinfo->used_blocks);
1409 spin_lock(&inode->i_lock);
1399 inode->i_blocks += BLOCKS_PER_PAGE; 1410 inode->i_blocks += BLOCKS_PER_PAGE;
1400 spin_unlock(&sbinfo->stat_lock); 1411 spin_unlock(&inode->i_lock);
1401 } else if (shmem_acct_block(info->flags)) { 1412 } else if (shmem_acct_block(info->flags)) {
1402 spin_unlock(&info->lock); 1413 spin_unlock(&info->lock);
1403 error = -ENOSPC; 1414 error = -ENOSPC;
@@ -1407,28 +1418,38 @@ repeat:
1407 if (!filepage) { 1418 if (!filepage) {
1408 int ret; 1419 int ret;
1409 1420
1410 spin_unlock(&info->lock); 1421 if (!prealloc_page) {
1411 filepage = shmem_alloc_page(gfp, info, idx); 1422 spin_unlock(&info->lock);
1412 if (!filepage) { 1423 filepage = shmem_alloc_page(gfp, info, idx);
1413 shmem_unacct_blocks(info->flags, 1); 1424 if (!filepage) {
1414 shmem_free_blocks(inode, 1); 1425 shmem_unacct_blocks(info->flags, 1);
1415 error = -ENOMEM; 1426 shmem_free_blocks(inode, 1);
1416 goto failed; 1427 error = -ENOMEM;
1417 } 1428 goto failed;
1418 SetPageSwapBacked(filepage); 1429 }
1430 SetPageSwapBacked(filepage);
1419 1431
1420 /* Precharge page while we can wait, compensate after */ 1432 /*
1421 error = mem_cgroup_cache_charge(filepage, current->mm, 1433 * Precharge page while we can wait, compensate
1422 GFP_KERNEL); 1434 * after
1423 if (error) { 1435 */
1424 page_cache_release(filepage); 1436 error = mem_cgroup_cache_charge(filepage,
1425 shmem_unacct_blocks(info->flags, 1); 1437 current->mm, GFP_KERNEL);
1426 shmem_free_blocks(inode, 1); 1438 if (error) {
1427 filepage = NULL; 1439 page_cache_release(filepage);
1428 goto failed; 1440 shmem_unacct_blocks(info->flags, 1);
1441 shmem_free_blocks(inode, 1);
1442 filepage = NULL;
1443 goto failed;
1444 }
1445
1446 spin_lock(&info->lock);
1447 } else {
1448 filepage = prealloc_page;
1449 prealloc_page = NULL;
1450 SetPageSwapBacked(filepage);
1429 } 1451 }
1430 1452
1431 spin_lock(&info->lock);
1432 entry = shmem_swp_alloc(info, idx, sgp); 1453 entry = shmem_swp_alloc(info, idx, sgp);
1433 if (IS_ERR(entry)) 1454 if (IS_ERR(entry))
1434 error = PTR_ERR(entry); 1455 error = PTR_ERR(entry);
@@ -1469,13 +1490,19 @@ repeat:
1469 } 1490 }
1470done: 1491done:
1471 *pagep = filepage; 1492 *pagep = filepage;
1472 return 0; 1493 error = 0;
1494 goto out;
1473 1495
1474failed: 1496failed:
1475 if (*pagep != filepage) { 1497 if (*pagep != filepage) {
1476 unlock_page(filepage); 1498 unlock_page(filepage);
1477 page_cache_release(filepage); 1499 page_cache_release(filepage);
1478 } 1500 }
1501out:
1502 if (prealloc_page) {
1503 mem_cgroup_uncharge_cache_page(prealloc_page);
1504 page_cache_release(prealloc_page);
1505 }
1479 return error; 1506 return error;
1480} 1507}
1481 1508
@@ -1791,17 +1818,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
1791 buf->f_type = TMPFS_MAGIC; 1818 buf->f_type = TMPFS_MAGIC;
1792 buf->f_bsize = PAGE_CACHE_SIZE; 1819 buf->f_bsize = PAGE_CACHE_SIZE;
1793 buf->f_namelen = NAME_MAX; 1820 buf->f_namelen = NAME_MAX;
1794 spin_lock(&sbinfo->stat_lock);
1795 if (sbinfo->max_blocks) { 1821 if (sbinfo->max_blocks) {
1796 buf->f_blocks = sbinfo->max_blocks; 1822 buf->f_blocks = sbinfo->max_blocks;
1797 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; 1823 buf->f_bavail = buf->f_bfree =
1824 sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
1798 } 1825 }
1799 if (sbinfo->max_inodes) { 1826 if (sbinfo->max_inodes) {
1800 buf->f_files = sbinfo->max_inodes; 1827 buf->f_files = sbinfo->max_inodes;
1801 buf->f_ffree = sbinfo->free_inodes; 1828 buf->f_ffree = sbinfo->free_inodes;
1802 } 1829 }
1803 /* else leave those fields 0 like simple_statfs */ 1830 /* else leave those fields 0 like simple_statfs */
1804 spin_unlock(&sbinfo->stat_lock);
1805 return 0; 1831 return 0;
1806} 1832}
1807 1833
@@ -2242,7 +2268,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2242{ 2268{
2243 struct shmem_sb_info *sbinfo = SHMEM_SB(sb); 2269 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2244 struct shmem_sb_info config = *sbinfo; 2270 struct shmem_sb_info config = *sbinfo;
2245 unsigned long blocks;
2246 unsigned long inodes; 2271 unsigned long inodes;
2247 int error = -EINVAL; 2272 int error = -EINVAL;
2248 2273
@@ -2250,9 +2275,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2250 return error; 2275 return error;
2251 2276
2252 spin_lock(&sbinfo->stat_lock); 2277 spin_lock(&sbinfo->stat_lock);
2253 blocks = sbinfo->max_blocks - sbinfo->free_blocks;
2254 inodes = sbinfo->max_inodes - sbinfo->free_inodes; 2278 inodes = sbinfo->max_inodes - sbinfo->free_inodes;
2255 if (config.max_blocks < blocks) 2279 if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
2256 goto out; 2280 goto out;
2257 if (config.max_inodes < inodes) 2281 if (config.max_inodes < inodes)
2258 goto out; 2282 goto out;
@@ -2269,7 +2293,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
2269 2293
2270 error = 0; 2294 error = 0;
2271 sbinfo->max_blocks = config.max_blocks; 2295 sbinfo->max_blocks = config.max_blocks;
2272 sbinfo->free_blocks = config.max_blocks - blocks;
2273 sbinfo->max_inodes = config.max_inodes; 2296 sbinfo->max_inodes = config.max_inodes;
2274 sbinfo->free_inodes = config.max_inodes - inodes; 2297 sbinfo->free_inodes = config.max_inodes - inodes;
2275 2298
@@ -2302,7 +2325,10 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
2302 2325
2303static void shmem_put_super(struct super_block *sb) 2326static void shmem_put_super(struct super_block *sb)
2304{ 2327{
2305 kfree(sb->s_fs_info); 2328 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
2329
2330 percpu_counter_destroy(&sbinfo->used_blocks);
2331 kfree(sbinfo);
2306 sb->s_fs_info = NULL; 2332 sb->s_fs_info = NULL;
2307} 2333}
2308 2334
@@ -2344,7 +2370,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
2344#endif 2370#endif
2345 2371
2346 spin_lock_init(&sbinfo->stat_lock); 2372 spin_lock_init(&sbinfo->stat_lock);
2347 sbinfo->free_blocks = sbinfo->max_blocks; 2373 if (percpu_counter_init(&sbinfo->used_blocks, 0))
2374 goto failed;
2348 sbinfo->free_inodes = sbinfo->max_inodes; 2375 sbinfo->free_inodes = sbinfo->max_inodes;
2349 2376
2350 sb->s_maxbytes = SHMEM_MAX_BYTES; 2377 sb->s_maxbytes = SHMEM_MAX_BYTES;
@@ -2496,7 +2523,7 @@ static const struct super_operations shmem_ops = {
2496 .remount_fs = shmem_remount_fs, 2523 .remount_fs = shmem_remount_fs,
2497 .show_options = shmem_show_options, 2524 .show_options = shmem_show_options,
2498#endif 2525#endif
2499 .delete_inode = shmem_delete_inode, 2526 .evict_inode = shmem_evict_inode,
2500 .drop_inode = generic_delete_inode, 2527 .drop_inode = generic_delete_inode,
2501 .put_super = shmem_put_super, 2528 .put_super = shmem_put_super,
2502}; 2529};
diff --git a/mm/slab.c b/mm/slab.c
index dd41b74c8322..fcae9815d3b3 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -394,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent)
394#define STATS_DEC_ACTIVE(x) do { } while (0) 394#define STATS_DEC_ACTIVE(x) do { } while (0)
395#define STATS_INC_ALLOCED(x) do { } while (0) 395#define STATS_INC_ALLOCED(x) do { } while (0)
396#define STATS_INC_GROWN(x) do { } while (0) 396#define STATS_INC_GROWN(x) do { } while (0)
397#define STATS_ADD_REAPED(x,y) do { } while (0) 397#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0)
398#define STATS_SET_HIGH(x) do { } while (0) 398#define STATS_SET_HIGH(x) do { } while (0)
399#define STATS_INC_ERR(x) do { } while (0) 399#define STATS_INC_ERR(x) do { } while (0)
400#define STATS_INC_NODEALLOCS(x) do { } while (0) 400#define STATS_INC_NODEALLOCS(x) do { } while (0)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 03aa2d55f1a2..1f3f9c59a73a 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,6 +47,8 @@ long nr_swap_pages;
47long total_swap_pages; 47long total_swap_pages;
48static int least_priority; 48static int least_priority;
49 49
50static bool swap_for_hibernation;
51
50static const char Bad_file[] = "Bad swap file entry "; 52static const char Bad_file[] = "Bad swap file entry ";
51static const char Unused_file[] = "Unused swap file entry "; 53static const char Unused_file[] = "Unused swap file entry ";
52static const char Bad_offset[] = "Bad swap offset entry "; 54static const char Bad_offset[] = "Bad swap offset entry ";
@@ -318,8 +320,10 @@ checks:
318 if (offset > si->highest_bit) 320 if (offset > si->highest_bit)
319 scan_base = offset = si->lowest_bit; 321 scan_base = offset = si->lowest_bit;
320 322
321 /* reuse swap entry of cache-only swap if not busy. */ 323 /* reuse swap entry of cache-only swap if not hibernation. */
322 if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) { 324 if (vm_swap_full()
325 && usage == SWAP_HAS_CACHE
326 && si->swap_map[offset] == SWAP_HAS_CACHE) {
323 int swap_was_freed; 327 int swap_was_freed;
324 spin_unlock(&swap_lock); 328 spin_unlock(&swap_lock);
325 swap_was_freed = __try_to_reclaim_swap(si, offset); 329 swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -449,6 +453,8 @@ swp_entry_t get_swap_page(void)
449 spin_lock(&swap_lock); 453 spin_lock(&swap_lock);
450 if (nr_swap_pages <= 0) 454 if (nr_swap_pages <= 0)
451 goto noswap; 455 goto noswap;
456 if (swap_for_hibernation)
457 goto noswap;
452 nr_swap_pages--; 458 nr_swap_pages--;
453 459
454 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { 460 for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@@ -481,28 +487,6 @@ noswap:
481 return (swp_entry_t) {0}; 487 return (swp_entry_t) {0};
482} 488}
483 489
484/* The only caller of this function is now susupend routine */
485swp_entry_t get_swap_page_of_type(int type)
486{
487 struct swap_info_struct *si;
488 pgoff_t offset;
489
490 spin_lock(&swap_lock);
491 si = swap_info[type];
492 if (si && (si->flags & SWP_WRITEOK)) {
493 nr_swap_pages--;
494 /* This is called for allocating swap entry, not cache */
495 offset = scan_swap_map(si, 1);
496 if (offset) {
497 spin_unlock(&swap_lock);
498 return swp_entry(type, offset);
499 }
500 nr_swap_pages++;
501 }
502 spin_unlock(&swap_lock);
503 return (swp_entry_t) {0};
504}
505
506static struct swap_info_struct *swap_info_get(swp_entry_t entry) 490static struct swap_info_struct *swap_info_get(swp_entry_t entry)
507{ 491{
508 struct swap_info_struct *p; 492 struct swap_info_struct *p;
@@ -762,6 +746,74 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
762#endif 746#endif
763 747
764#ifdef CONFIG_HIBERNATION 748#ifdef CONFIG_HIBERNATION
749
750static pgoff_t hibernation_offset[MAX_SWAPFILES];
751/*
752 * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
 753 * the swap_map[] image saved to disk would be incomplete, because it
 754 * keeps changing without synchronization with the hibernation snapshot.
 755 * At resume, we just set swap_for_hibernation=false and can forget
 756 * which entries were used.
757 */
758void hibernation_freeze_swap(void)
759{
760 int i;
761
762 spin_lock(&swap_lock);
763
764 printk(KERN_INFO "PM: Freeze Swap\n");
765 swap_for_hibernation = true;
766 for (i = 0; i < MAX_SWAPFILES; i++)
767 hibernation_offset[i] = 1;
768 spin_unlock(&swap_lock);
769}
770
771void hibernation_thaw_swap(void)
772{
773 spin_lock(&swap_lock);
774 if (swap_for_hibernation) {
775 printk(KERN_INFO "PM: Thaw Swap\n");
776 swap_for_hibernation = false;
777 }
778 spin_unlock(&swap_lock);
779}
780
781/*
 782 * Because updating swap_map[] would introduce state changes that are
 783 * not captured in the saved image, we use our own simple allocator.
 784 * Please see kernel/power/swap.c; used swap entries are recorded in
 785 * an RB-tree there.
786 */
787swp_entry_t get_swap_for_hibernation(int type)
788{
789 pgoff_t off;
790 swp_entry_t val = {0};
791 struct swap_info_struct *si;
792
793 spin_lock(&swap_lock);
794
795 si = swap_info[type];
796 if (!si || !(si->flags & SWP_WRITEOK))
797 goto done;
798
799 for (off = hibernation_offset[type]; off < si->max; ++off) {
800 if (!si->swap_map[off])
801 break;
802 }
803 if (off < si->max) {
804 val = swp_entry(type, off);
805 hibernation_offset[type] = off + 1;
806 }
807done:
808 spin_unlock(&swap_lock);
809 return val;
810}
811
812void swap_free_for_hibernation(swp_entry_t ent)
813{
814 /* Nothing to do */
815}
816
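
get_swap_for_hibernation() above is deliberately simple: swap_map[] must stay frozen for the image, so it only remembers a per-device cursor and hands out the next free offset after it. A hedged userspace model of that linear-scan allocator over a frozen use map:

#include <stdio.h>

#define SWAP_MAX 16

static unsigned char swap_map[SWAP_MAX];       /* non-zero = offset in use (frozen) */
static unsigned long hibernation_offset = 1;   /* offset 0 is never handed out */

/* Return the next free offset after the cursor, or 0 if none is left. */
static unsigned long get_swap_for_hibernation(void)
{
        unsigned long off;

        for (off = hibernation_offset; off < SWAP_MAX; off++)
                if (!swap_map[off])
                        break;
        if (off >= SWAP_MAX)
                return 0;
        hibernation_offset = off + 1;   /* never hand the same offset out twice */
        return off;
}

int main(void)
{
        swap_map[1] = swap_map[2] = swap_map[4] = 1;   /* already-used slots */

        for (int i = 0; i < 4; i++)
                printf("allocated offset %lu\n", get_swap_for_hibernation());
        return 0;
}
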
765/* 817/*
766 * Find the swap type that corresponds to given device (if any). 818 * Find the swap type that corresponds to given device (if any).
767 * 819 *
diff --git a/mm/truncate.c b/mm/truncate.c
index 937571b8b233..ba887bff48c5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -541,28 +541,48 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new)
541EXPORT_SYMBOL(truncate_pagecache); 541EXPORT_SYMBOL(truncate_pagecache);
542 542
543/** 543/**
544 * truncate_setsize - update inode and pagecache for a new file size
545 * @inode: inode
546 * @newsize: new file size
547 *
 548 * truncate_setsize updates i_size and performs pagecache
 549 * truncation (if necessary) for a file size update. It will
 550 * typically be called from the filesystem's setattr function when
 551 * ATTR_SIZE is passed in.
552 *
553 * Must be called with inode_mutex held and after all filesystem
554 * specific block truncation has been performed.
555 */
556void truncate_setsize(struct inode *inode, loff_t newsize)
557{
558 loff_t oldsize;
559
560 oldsize = inode->i_size;
561 i_size_write(inode, newsize);
562
563 truncate_pagecache(inode, oldsize, newsize);
564}
565EXPORT_SYMBOL(truncate_setsize);
566
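
truncate_setsize() bundles the i_size update and the pagecache truncation so callers cannot get the ordering wrong. A toy userspace model of that ordering (the partially covered last page stays cached, as in the real code, which additionally zeroes its tail; the structures here are stand-ins, not kernel types):

#include <stdio.h>

#define PAGE_SIZE 4096UL
#define NR_CACHE_PAGES 8

struct inode {
        unsigned long long i_size;
        int page_present[NR_CACHE_PAGES];   /* toy page cache for this file */
};

/* Drop cached pages that lie entirely beyond the new size. */
static void truncate_pagecache(struct inode *inode, unsigned long long newsize)
{
        unsigned long first = (newsize + PAGE_SIZE - 1) / PAGE_SIZE;

        for (unsigned long i = first; i < NR_CACHE_PAGES; i++)
                inode->page_present[i] = 0;
}

/* i_size must be updated before the pagecache is truncated. */
static void truncate_setsize(struct inode *inode, unsigned long long newsize)
{
        inode->i_size = newsize;
        truncate_pagecache(inode, newsize);
}

int main(void)
{
        struct inode inode = { .i_size = 8 * PAGE_SIZE };

        for (int i = 0; i < NR_CACHE_PAGES; i++)
                inode.page_present[i] = 1;

        truncate_setsize(&inode, 3 * PAGE_SIZE + 100);   /* shrink the file */
        printf("i_size=%llu, pages kept: ", inode.i_size);
        for (int i = 0; i < NR_CACHE_PAGES; i++)
                if (inode.page_present[i])
                        printf("%d ", i);
        printf("\n");
        return 0;
}
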
567/**
544 * vmtruncate - unmap mappings "freed" by truncate() syscall 568 * vmtruncate - unmap mappings "freed" by truncate() syscall
545 * @inode: inode of the file used 569 * @inode: inode of the file used
546 * @offset: file offset to start truncating 570 * @offset: file offset to start truncating
547 * 571 *
548 * NOTE! We have to be ready to update the memory sharing 572 * This function is deprecated and truncate_setsize or truncate_pagecache
549 * between the file and the memory map for a potential last 573 * should be used instead, together with filesystem specific block truncation.
550 * incomplete page. Ugly, but necessary.
551 *
552 * This function is deprecated and simple_setsize or truncate_pagecache
553 * should be used instead.
554 */ 574 */
555int vmtruncate(struct inode *inode, loff_t offset) 575int vmtruncate(struct inode *inode, loff_t offset)
556{ 576{
557 int error; 577 int error;
558 578
559 error = simple_setsize(inode, offset); 579 error = inode_newsize_ok(inode, offset);
560 if (error) 580 if (error)
561 return error; 581 return error;
562 582
583 truncate_setsize(inode, offset);
563 if (inode->i_op->truncate) 584 if (inode->i_op->truncate)
564 inode->i_op->truncate(inode); 585 inode->i_op->truncate(inode);
565 586 return 0;
566 return error;
567} 587}
568EXPORT_SYMBOL(vmtruncate); 588EXPORT_SYMBOL(vmtruncate);
diff --git a/mm/util.c b/mm/util.c
index f5712e8964be..4735ea481816 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -225,15 +225,10 @@ char *strndup_user(const char __user *s, long n)
225 if (length > n) 225 if (length > n)
226 return ERR_PTR(-EINVAL); 226 return ERR_PTR(-EINVAL);
227 227
228 p = kmalloc(length, GFP_KERNEL); 228 p = memdup_user(s, length);
229 229
230 if (!p) 230 if (IS_ERR(p))
231 return ERR_PTR(-ENOMEM); 231 return p;
232
233 if (copy_from_user(p, s, length)) {
234 kfree(p);
235 return ERR_PTR(-EFAULT);
236 }
237 232
238 p[length - 1] = '\0'; 233 p[length - 1] = '\0';
239 234
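
strndup_user() now delegates the allocate-then-copy-or-fail sequence to memdup_user(), which hands back either the buffer or an encoded error pointer. A hedged userspace analogue of that helper, with errno-encoded pointers standing in for ERR_PTR()/IS_ERR() and memcpy() for copy_from_user():

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Encode small negative errors in the pointer value, like ERR_PTR()/IS_ERR(). */
static void *err_ptr(long err)      { return (void *)err; }
static int   is_err(const void *p)  { return (uintptr_t)p >= (uintptr_t)-4095; }

/* Duplicate 'len' bytes of 'src' into a fresh allocation, or return an error pointer. */
static void *memdup_user(const void *src, size_t len)
{
        void *p = malloc(len);

        if (!p)
                return err_ptr(-ENOMEM);
        memcpy(p, src, len);    /* copy_from_user() can additionally fault -> -EFAULT */
        return p;
}

int main(void)
{
        const char user_buf[] = "hello";
        char *p = memdup_user(user_buf, sizeof(user_buf));

        if (is_err(p))
                return 1;
        p[sizeof(user_buf) - 1] = '\0';    /* strndup_user() terminates the copy */
        printf("%s\n", p);
        free(p);
        return 0;
}
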
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index b7e314b1009f..6b8889da69a6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,6 +31,7 @@
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h> 32#include <asm/shmparam.h>
33 33
34bool vmap_lazy_unmap __read_mostly = true;
34 35
35/*** Page table manipulation functions ***/ 36/*** Page table manipulation functions ***/
36 37
@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void)
502{ 503{
503 unsigned int log; 504 unsigned int log;
504 505
506 if (!vmap_lazy_unmap)
507 return 0;
508
505 log = fls(num_online_cpus()); 509 log = fls(num_online_cpus());
506 510
507 return log * (32UL * 1024 * 1024 / PAGE_SIZE); 511 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
@@ -732,7 +736,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
732 node, gfp_mask); 736 node, gfp_mask);
733 if (unlikely(IS_ERR(va))) { 737 if (unlikely(IS_ERR(va))) {
734 kfree(vb); 738 kfree(vb);
735 return ERR_PTR(PTR_ERR(va)); 739 return ERR_CAST(va);
736 } 740 }
737 741
738 err = radix_tree_preload(gfp_mask); 742 err = radix_tree_preload(gfp_mask);
@@ -2437,8 +2441,11 @@ static int vmalloc_open(struct inode *inode, struct file *file)
2437 unsigned int *ptr = NULL; 2441 unsigned int *ptr = NULL;
2438 int ret; 2442 int ret;
2439 2443
2440 if (NUMA_BUILD) 2444 if (NUMA_BUILD) {
2441 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); 2445 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
2446 if (ptr == NULL)
2447 return -ENOMEM;
2448 }
2442 ret = seq_open(file, &vmalloc_op); 2449 ret = seq_open(file, &vmalloc_op);
2443 if (!ret) { 2450 if (!ret) {
2444 struct seq_file *m = file->private_data; 2451 struct seq_file *m = file->private_data;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b94fe1b3da43..c391c320dbaf 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -48,6 +48,9 @@
48 48
49#include "internal.h" 49#include "internal.h"
50 50
51#define CREATE_TRACE_POINTS
52#include <trace/events/vmscan.h>
53
51struct scan_control { 54struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 55 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 56 unsigned long nr_scanned;
@@ -398,6 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
398 /* synchronous write or broken a_ops? */ 401 /* synchronous write or broken a_ops? */
399 ClearPageReclaim(page); 402 ClearPageReclaim(page);
400 } 403 }
404 trace_mm_vmscan_writepage(page,
405 trace_reclaim_flags(page, sync_writeback));
401 inc_zone_page_state(page, NR_VMSCAN_WRITE); 406 inc_zone_page_state(page, NR_VMSCAN_WRITE);
402 return PAGE_SUCCESS; 407 return PAGE_SUCCESS;
403 } 408 }
@@ -617,6 +622,24 @@ static enum page_references page_check_references(struct page *page,
617 return PAGEREF_RECLAIM; 622 return PAGEREF_RECLAIM;
618} 623}
619 624
625static noinline_for_stack void free_page_list(struct list_head *free_pages)
626{
627 struct pagevec freed_pvec;
628 struct page *page, *tmp;
629
630 pagevec_init(&freed_pvec, 1);
631
632 list_for_each_entry_safe(page, tmp, free_pages, lru) {
633 list_del(&page->lru);
634 if (!pagevec_add(&freed_pvec, page)) {
635 __pagevec_free(&freed_pvec);
636 pagevec_reinit(&freed_pvec);
637 }
638 }
639
640 pagevec_free(&freed_pvec);
641}
642
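
shrink_page_list() now parks every page it frees on a local free_pages list and lets free_page_list() above release them in pagevec-sized bursts at the end. The shape of that pattern as a userspace sketch, with a singly linked list and a fixed-size batch buffer standing in for the LRU list and the pagevec:

#include <stdio.h>
#include <stdlib.h>

#define BATCH 4                          /* stands in for PAGEVEC_SIZE */

struct page { struct page *next; int id; };

/* Release the whole list in batches, like free_page_list()'s pagevec loop. */
static void free_page_list(struct page *head)
{
        struct page *batch[BATCH];
        int n = 0;

        while (head) {
                struct page *page = head;

                head = head->next;
                batch[n++] = page;
                if (n == BATCH) {        /* batch full: flush it */
                        for (int i = 0; i < n; i++)
                                free(batch[i]);
                        printf("flushed %d pages\n", n);
                        n = 0;
                }
        }
        for (int i = 0; i < n; i++)      /* final partial batch */
                free(batch[i]);
        if (n)
                printf("flushed %d pages\n", n);
}

int main(void)
{
        struct page *head = NULL;

        for (int i = 0; i < 10; i++) {   /* pages "reclaimed" by the shrinker */
                struct page *p = malloc(sizeof(*p));
                p->id = i;
                p->next = head;
                head = p;
        }
        free_page_list(head);
        return 0;
}
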
620/* 643/*
621 * shrink_page_list() returns the number of reclaimed pages 644 * shrink_page_list() returns the number of reclaimed pages
622 */ 645 */
@@ -625,13 +648,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
625 enum pageout_io sync_writeback) 648 enum pageout_io sync_writeback)
626{ 649{
627 LIST_HEAD(ret_pages); 650 LIST_HEAD(ret_pages);
628 struct pagevec freed_pvec; 651 LIST_HEAD(free_pages);
629 int pgactivate = 0; 652 int pgactivate = 0;
630 unsigned long nr_reclaimed = 0; 653 unsigned long nr_reclaimed = 0;
631 654
632 cond_resched(); 655 cond_resched();
633 656
634 pagevec_init(&freed_pvec, 1);
635 while (!list_empty(page_list)) { 657 while (!list_empty(page_list)) {
636 enum page_references references; 658 enum page_references references;
637 struct address_space *mapping; 659 struct address_space *mapping;
@@ -806,10 +828,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
806 __clear_page_locked(page); 828 __clear_page_locked(page);
807free_it: 829free_it:
808 nr_reclaimed++; 830 nr_reclaimed++;
809 if (!pagevec_add(&freed_pvec, page)) { 831
810 __pagevec_free(&freed_pvec); 832 /*
811 pagevec_reinit(&freed_pvec); 833 * Is there need to periodically free_page_list? It would
812 } 834 * appear not as the counts should be low
835 */
836 list_add(&page->lru, &free_pages);
813 continue; 837 continue;
814 838
815cull_mlocked: 839cull_mlocked:
@@ -832,9 +856,10 @@ keep:
832 list_add(&page->lru, &ret_pages); 856 list_add(&page->lru, &ret_pages);
833 VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); 857 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
834 } 858 }
859
860 free_page_list(&free_pages);
861
835 list_splice(&ret_pages, page_list); 862 list_splice(&ret_pages, page_list);
836 if (pagevec_count(&freed_pvec))
837 __pagevec_free(&freed_pvec);
838 count_vm_events(PGACTIVATE, pgactivate); 863 count_vm_events(PGACTIVATE, pgactivate);
839 return nr_reclaimed; 864 return nr_reclaimed;
840} 865}
@@ -916,6 +941,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
916 unsigned long *scanned, int order, int mode, int file) 941 unsigned long *scanned, int order, int mode, int file)
917{ 942{
918 unsigned long nr_taken = 0; 943 unsigned long nr_taken = 0;
944 unsigned long nr_lumpy_taken = 0;
945 unsigned long nr_lumpy_dirty = 0;
946 unsigned long nr_lumpy_failed = 0;
919 unsigned long scan; 947 unsigned long scan;
920 948
921 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { 949 for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
@@ -993,12 +1021,25 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
993 list_move(&cursor_page->lru, dst); 1021 list_move(&cursor_page->lru, dst);
994 mem_cgroup_del_lru(cursor_page); 1022 mem_cgroup_del_lru(cursor_page);
995 nr_taken++; 1023 nr_taken++;
1024 nr_lumpy_taken++;
1025 if (PageDirty(cursor_page))
1026 nr_lumpy_dirty++;
996 scan++; 1027 scan++;
1028 } else {
1029 if (mode == ISOLATE_BOTH &&
1030 page_count(cursor_page))
1031 nr_lumpy_failed++;
997 } 1032 }
998 } 1033 }
999 } 1034 }
1000 1035
1001 *scanned = scan; 1036 *scanned = scan;
1037
1038 trace_mm_vmscan_lru_isolate(order,
1039 nr_to_scan, scan,
1040 nr_taken,
1041 nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed,
1042 mode);
1002 return nr_taken; 1043 return nr_taken;
1003} 1044}
1004 1045
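
The isolate_lru_pages() hunk above accumulates lumpy-reclaim statistics during the scan (pages taken through order expansion, how many of those were dirty, how many contiguous blocks failed) and emits a single trace event once the scan is done. A hedged userspace sketch of that aggregate-then-report pattern follows; the page numbering, is_dirty(), can_isolate() and report_isolation() are invented for illustration only.

/*
 * Sketch of "aggregate during the scan, report once at the end".
 * Everything here is a stand-in, not kernel code.
 */
#include <stdio.h>

struct isolate_stats {
	unsigned long taken;
	unsigned long lumpy_taken;
	unsigned long lumpy_dirty;
	unsigned long lumpy_failed;
};

static int is_dirty(int page)    { return page % 3 == 0; }  /* assumption */
static int can_isolate(int page) { return page % 5 != 0; }  /* assumption */

/* Stand-in for the single tracepoint fired after the whole scan. */
static void report_isolation(const struct isolate_stats *st)
{
	printf("taken=%lu lumpy_taken=%lu lumpy_dirty=%lu lumpy_failed=%lu\n",
	       st->taken, st->lumpy_taken, st->lumpy_dirty, st->lumpy_failed);
}

int main(void)
{
	struct isolate_stats st = { 0 };

	/* Base pages 0..9, each with 3 "neighbouring" lumpy candidates. */
	for (int base = 0; base < 10; base++) {
		st.taken++;
		for (int n = 1; n <= 3; n++) {
			int cursor = base * 4 + n;

			if (can_isolate(cursor)) {
				st.taken++;
				st.lumpy_taken++;
				if (is_dirty(cursor))
					st.lumpy_dirty++;
			} else {
				st.lumpy_failed++;
			}
		}
	}
	report_isolation(&st);	/* one summary event, not one per page */
	return 0;
}
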
@@ -1035,7 +1076,8 @@ static unsigned long clear_active_flags(struct list_head *page_list,
1035 ClearPageActive(page); 1076 ClearPageActive(page);
1036 nr_active++; 1077 nr_active++;
1037 } 1078 }
1038 count[lru]++; 1079 if (count)
1080 count[lru]++;
1039 } 1081 }
1040 1082
1041 return nr_active; 1083 return nr_active;
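
clear_active_flags() now tolerates a NULL count array, so the synchronous-retry path can reuse it without maintaining per-LRU counts it does not need. The sketch below shows the same optional-output pattern in isolation; scan_items(), classify() and the item list are assumptions for the example only.

/*
 * Optional-output pattern: the per-class counters are only maintained
 * when the caller passes a non-NULL array.
 */
#include <stdio.h>
#include <stddef.h>

#define NR_CLASSES 4

static int classify(int item) { return item % NR_CLASSES; } /* assumption */

static unsigned long scan_items(const int *items, size_t n,
				unsigned int *count /* may be NULL */)
{
	unsigned long nr_special = 0;

	for (size_t i = 0; i < n; i++) {
		if (items[i] % 2)		/* stand-in for PageActive() */
			nr_special++;
		if (count)			/* optional bookkeeping */
			count[classify(items[i])]++;
	}
	return nr_special;
}

int main(void)
{
	int items[] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	unsigned int count[NR_CLASSES] = { 0 };

	/* First pass wants the per-class breakdown... */
	printf("special=%lu\n", scan_items(items, 8, count));
	/* ...the retry pass only needs the total. */
	printf("special=%lu\n", scan_items(items, 8, NULL));
	return 0;
}
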
@@ -1112,174 +1154,212 @@ static int too_many_isolated(struct zone *zone, int file,
1112} 1154}
1113 1155
1114/* 1156/*
1115 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1157 * TODO: Try merging with migration's version of putback_lru_pages
1116 * of reclaimed pages
1117 */ 1158 */
1118static unsigned long shrink_inactive_list(unsigned long max_scan, 1159static noinline_for_stack void
1119 struct zone *zone, struct scan_control *sc, 1160putback_lru_pages(struct zone *zone, struct scan_control *sc,
1120 int priority, int file) 1161 unsigned long nr_anon, unsigned long nr_file,
1162 struct list_head *page_list)
1121{ 1163{
1122 LIST_HEAD(page_list); 1164 struct page *page;
1123 struct pagevec pvec; 1165 struct pagevec pvec;
1124 unsigned long nr_scanned = 0;
1125 unsigned long nr_reclaimed = 0;
1126 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); 1166 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1127 1167
1128 while (unlikely(too_many_isolated(zone, file, sc))) { 1168 pagevec_init(&pvec, 1);
1129 congestion_wait(BLK_RW_ASYNC, HZ/10);
1130 1169
1131 /* We are about to die and free our memory. Return now. */ 1170 /*
1132 if (fatal_signal_pending(current)) 1171 * Put back any unfreeable pages.
1133 return SWAP_CLUSTER_MAX; 1172 */
1173 spin_lock(&zone->lru_lock);
1174 while (!list_empty(page_list)) {
1175 int lru;
1176 page = lru_to_page(page_list);
1177 VM_BUG_ON(PageLRU(page));
1178 list_del(&page->lru);
1179 if (unlikely(!page_evictable(page, NULL))) {
1180 spin_unlock_irq(&zone->lru_lock);
1181 putback_lru_page(page);
1182 spin_lock_irq(&zone->lru_lock);
1183 continue;
1184 }
1185 SetPageLRU(page);
1186 lru = page_lru(page);
1187 add_page_to_lru_list(zone, page, lru);
1188 if (is_active_lru(lru)) {
1189 int file = is_file_lru(lru);
1190 reclaim_stat->recent_rotated[file]++;
1191 }
1192 if (!pagevec_add(&pvec, page)) {
1193 spin_unlock_irq(&zone->lru_lock);
1194 __pagevec_release(&pvec);
1195 spin_lock_irq(&zone->lru_lock);
1196 }
1134 } 1197 }
1198 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1199 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1200
1201 spin_unlock_irq(&zone->lru_lock);
1202 pagevec_release(&pvec);
1203}
1135 1204
1205static noinline_for_stack void update_isolated_counts(struct zone *zone,
1206 struct scan_control *sc,
1207 unsigned long *nr_anon,
1208 unsigned long *nr_file,
1209 struct list_head *isolated_list)
1210{
1211 unsigned long nr_active;
1212 unsigned int count[NR_LRU_LISTS] = { 0, };
1213 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1136 1214
1137 pagevec_init(&pvec, 1); 1215 nr_active = clear_active_flags(isolated_list, count);
1216 __count_vm_events(PGDEACTIVATE, nr_active);
1138 1217
1139 lru_add_drain(); 1218 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
1140 spin_lock_irq(&zone->lru_lock); 1219 -count[LRU_ACTIVE_FILE]);
1141 do { 1220 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
1142 struct page *page; 1221 -count[LRU_INACTIVE_FILE]);
1143 unsigned long nr_taken; 1222 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1144 unsigned long nr_scan; 1223 -count[LRU_ACTIVE_ANON]);
1145 unsigned long nr_freed; 1224 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1146 unsigned long nr_active; 1225 -count[LRU_INACTIVE_ANON]);
1147 unsigned int count[NR_LRU_LISTS] = { 0, };
1148 int mode = sc->lumpy_reclaim_mode ? ISOLATE_BOTH : ISOLATE_INACTIVE;
1149 unsigned long nr_anon;
1150 unsigned long nr_file;
1151 1226
1152 if (scanning_global_lru(sc)) { 1227 *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
1153 nr_taken = isolate_pages_global(SWAP_CLUSTER_MAX, 1228 *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
1154 &page_list, &nr_scan, 1229 __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon);
1155 sc->order, mode, 1230 __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file);
1156 zone, 0, file);
1157 zone->pages_scanned += nr_scan;
1158 if (current_is_kswapd())
1159 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1160 nr_scan);
1161 else
1162 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1163 nr_scan);
1164 } else {
1165 nr_taken = mem_cgroup_isolate_pages(SWAP_CLUSTER_MAX,
1166 &page_list, &nr_scan,
1167 sc->order, mode,
1168 zone, sc->mem_cgroup,
1169 0, file);
1170 /*
1171 * mem_cgroup_isolate_pages() keeps track of
1172 * scanned pages on its own.
1173 */
1174 }
1175 1231
1176 if (nr_taken == 0) 1232 reclaim_stat->recent_scanned[0] += *nr_anon;
1177 goto done; 1233 reclaim_stat->recent_scanned[1] += *nr_file;
1234}
1178 1235
1179 nr_active = clear_active_flags(&page_list, count); 1236/*
1180 __count_vm_events(PGDEACTIVATE, nr_active); 1237 * Returns true if the caller should wait to clean dirty/writeback pages.
1238 *
1239 * If we are direct reclaiming for contiguous pages and we do not reclaim
1240 * everything in the list, try again and wait for writeback IO to complete.
1241 * This will stall high-order allocations noticeably. Only do that when we really
1242 * need to free the pages under high memory pressure.
1243 */
1244static inline bool should_reclaim_stall(unsigned long nr_taken,
1245 unsigned long nr_freed,
1246 int priority,
1247 struct scan_control *sc)
1248{
1249 int lumpy_stall_priority;
1181 1250
1182 __mod_zone_page_state(zone, NR_ACTIVE_FILE, 1251 /* kswapd should not stall on sync IO */
1183 -count[LRU_ACTIVE_FILE]); 1252 if (current_is_kswapd())
1184 __mod_zone_page_state(zone, NR_INACTIVE_FILE, 1253 return false;
1185 -count[LRU_INACTIVE_FILE]);
1186 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1187 -count[LRU_ACTIVE_ANON]);
1188 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1189 -count[LRU_INACTIVE_ANON]);
1190 1254
1191 nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; 1255 /* Only stall on lumpy reclaim */
1192 nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; 1256 if (!sc->lumpy_reclaim_mode)
1193 __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); 1257 return false;
1194 __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
1195 1258
1196 reclaim_stat->recent_scanned[0] += nr_anon; 1259 /* If we have reclaimed everything on the isolated list, no stall */
1197 reclaim_stat->recent_scanned[1] += nr_file; 1260 if (nr_freed == nr_taken)
1261 return false;
1198 1262
1199 spin_unlock_irq(&zone->lru_lock); 1263 /*
1264 * For high-order allocations, there are two stall thresholds.
1265 * High-cost allocations stall immediately where as lower
1266 * order allocations such as stacks require the scanning
1267 * priority to be much higher before stalling.
1268 */
1269 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1270 lumpy_stall_priority = DEF_PRIORITY;
1271 else
1272 lumpy_stall_priority = DEF_PRIORITY / 3;
1200 1273
1201 nr_scanned += nr_scan; 1274 return priority <= lumpy_stall_priority;
1202 nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); 1275}
1203 1276
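
should_reclaim_stall() above encodes two stall thresholds: allocations above PAGE_ALLOC_COSTLY_ORDER may stall on the very first pass, while cheaper orders stall only once the scanning priority has fallen to DEF_PRIORITY / 3 or below. The standalone sketch below reproduces that decision using the kernel's usual constants of the era (DEF_PRIORITY = 12, PAGE_ALLOC_COSTLY_ORDER = 3); everything else in it is a simplified model, not kernel code.

/*
 * Standalone model of the stall decision in should_reclaim_stall().
 */
#include <stdio.h>
#include <stdbool.h>

#define DEF_PRIORITY		12
#define PAGE_ALLOC_COSTLY_ORDER	3

static bool should_stall(unsigned long nr_taken, unsigned long nr_freed,
			 int priority, int order,
			 bool lumpy_reclaim, bool is_kswapd)
{
	int lumpy_stall_priority;

	if (is_kswapd)			/* kswapd never does sync stalls */
		return false;
	if (!lumpy_reclaim)		/* only lumpy reclaim stalls */
		return false;
	if (nr_freed == nr_taken)	/* reclaimed everything: no need */
		return false;

	/* Costly orders stall right away, cheaper orders only under pressure. */
	if (order > PAGE_ALLOC_COSTLY_ORDER)
		lumpy_stall_priority = DEF_PRIORITY;
	else
		lumpy_stall_priority = DEF_PRIORITY / 3;

	return priority <= lumpy_stall_priority;
}

int main(void)
{
	/* order-9 (huge page): stalls even on the first pass (priority 12). */
	printf("%d\n", should_stall(32, 10, 12, 9, true, false));
	/* order-2 stack: no stall at priority 12... */
	printf("%d\n", should_stall(32, 10, 12, 2, true, false));
	/* ...but stalls once priority has dropped to 4 or below. */
	printf("%d\n", should_stall(32, 10, 4, 2, true, false));
	return 0;
}
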
1277/*
1278 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
1279 * of reclaimed pages
1280 */
1281static noinline_for_stack unsigned long
1282shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
1283 struct scan_control *sc, int priority, int file)
1284{
1285 LIST_HEAD(page_list);
1286 unsigned long nr_scanned;
1287 unsigned long nr_reclaimed = 0;
1288 unsigned long nr_taken;
1289 unsigned long nr_active;
1290 unsigned long nr_anon;
1291 unsigned long nr_file;
1292
1293 while (unlikely(too_many_isolated(zone, file, sc))) {
1294 congestion_wait(BLK_RW_ASYNC, HZ/10);
1295
1296 /* We are about to die and free our memory. Return now. */
1297 if (fatal_signal_pending(current))
1298 return SWAP_CLUSTER_MAX;
1299 }
1300
1301
1302 lru_add_drain();
1303 spin_lock_irq(&zone->lru_lock);
1304
1305 if (scanning_global_lru(sc)) {
1306 nr_taken = isolate_pages_global(nr_to_scan,
1307 &page_list, &nr_scanned, sc->order,
1308 sc->lumpy_reclaim_mode ?
1309 ISOLATE_BOTH : ISOLATE_INACTIVE,
1310 zone, 0, file);
1311 zone->pages_scanned += nr_scanned;
1312 if (current_is_kswapd())
1313 __count_zone_vm_events(PGSCAN_KSWAPD, zone,
1314 nr_scanned);
1315 else
1316 __count_zone_vm_events(PGSCAN_DIRECT, zone,
1317 nr_scanned);
1318 } else {
1319 nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
1320 &page_list, &nr_scanned, sc->order,
1321 sc->lumpy_reclaim_mode ?
1322 ISOLATE_BOTH : ISOLATE_INACTIVE,
1323 zone, sc->mem_cgroup,
1324 0, file);
1204 /* 1325 /*
1205 * If we are direct reclaiming for contiguous pages and we do 1326 * mem_cgroup_isolate_pages() keeps track of
1206 * not reclaim everything in the list, try again and wait 1327 * scanned pages on its own.
1207 * for IO to complete. This will stall high-order allocations
1208 * but that should be acceptable to the caller
1209 */ 1328 */
1210 if (nr_freed < nr_taken && !current_is_kswapd() && 1329 }
1211 sc->lumpy_reclaim_mode) {
1212 congestion_wait(BLK_RW_ASYNC, HZ/10);
1213 1330
1214 /* 1331 if (nr_taken == 0) {
1215 * The attempt at page out may have made some 1332 spin_unlock_irq(&zone->lru_lock);
1216 * of the pages active, mark them inactive again. 1333 return 0;
1217 */ 1334 }
1218 nr_active = clear_active_flags(&page_list, count);
1219 count_vm_events(PGDEACTIVATE, nr_active);
1220 1335
1221 nr_freed += shrink_page_list(&page_list, sc, 1336 update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list);
1222 PAGEOUT_IO_SYNC);
1223 }
1224 1337
1225 nr_reclaimed += nr_freed; 1338 spin_unlock_irq(&zone->lru_lock);
1226 1339
1227 local_irq_disable(); 1340 nr_reclaimed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
1228 if (current_is_kswapd()) 1341
1229 __count_vm_events(KSWAPD_STEAL, nr_freed); 1342 /* Check if we should synchronously wait for writeback */
1230 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1343 if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
1344 congestion_wait(BLK_RW_ASYNC, HZ/10);
1231 1345
1232 spin_lock(&zone->lru_lock);
1233 /* 1346 /*
1234 * Put back any unfreeable pages. 1347 * The attempt at page out may have made some
1348 * of the pages active, mark them inactive again.
1235 */ 1349 */
1236 while (!list_empty(&page_list)) { 1350 nr_active = clear_active_flags(&page_list, NULL);
1237 int lru; 1351 count_vm_events(PGDEACTIVATE, nr_active);
1238 page = lru_to_page(&page_list);
1239 VM_BUG_ON(PageLRU(page));
1240 list_del(&page->lru);
1241 if (unlikely(!page_evictable(page, NULL))) {
1242 spin_unlock_irq(&zone->lru_lock);
1243 putback_lru_page(page);
1244 spin_lock_irq(&zone->lru_lock);
1245 continue;
1246 }
1247 SetPageLRU(page);
1248 lru = page_lru(page);
1249 add_page_to_lru_list(zone, page, lru);
1250 if (is_active_lru(lru)) {
1251 int file = is_file_lru(lru);
1252 reclaim_stat->recent_rotated[file]++;
1253 }
1254 if (!pagevec_add(&pvec, page)) {
1255 spin_unlock_irq(&zone->lru_lock);
1256 __pagevec_release(&pvec);
1257 spin_lock_irq(&zone->lru_lock);
1258 }
1259 }
1260 __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon);
1261 __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file);
1262 1352
1263 } while (nr_scanned < max_scan); 1353 nr_reclaimed += shrink_page_list(&page_list, sc, PAGEOUT_IO_SYNC);
1354 }
1264 1355
1265done: 1356 local_irq_disable();
1266 spin_unlock_irq(&zone->lru_lock); 1357 if (current_is_kswapd())
1267 pagevec_release(&pvec); 1358 __count_vm_events(KSWAPD_STEAL, nr_reclaimed);
1268 return nr_reclaimed; 1359 __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed);
1269}
1270 1360
1271/* 1361 putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
1272 * We are about to scan this zone at a certain priority level. If that priority 1362 return nr_reclaimed;
1273 * level is smaller (ie: more urgent) than the previous priority, then note
1274 * that priority level within the zone. This is done so that when the next
1275 * process comes in to scan this zone, it will immediately start out at this
1276 * priority level rather than having to build up its own scanning priority.
1277 * Here, this priority affects only the reclaim-mapped threshold.
1278 */
1279static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1280{
1281 if (priority < zone->prev_priority)
1282 zone->prev_priority = priority;
1283} 1363}
1284 1364
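
Taken together, the rewritten shrink_inactive_list() reduces to: isolate a batch, update the isolated counters, reclaim asynchronously, optionally stall and retry synchronously, account the steals, then put back whatever survived. A compressed userspace outline of that flow is sketched below; every helper in it is a stub standing in for the kernel function named in the comment, not its real signature.

/*
 * Condensed outline of the refactored shrink_inactive_list() flow.
 * All helpers are stubs for illustration.
 */
#include <stdio.h>
#include <stdbool.h>

static unsigned long isolate(void)              { return 32; }
static unsigned long reclaim_async(void)        { return 20; }
static unsigned long reclaim_sync(void)         { return 8; }
static bool stall(unsigned long t, unsigned long f) { return f < t; }
static void putback(void)                       { puts("putback leftovers"); }

int main(void)
{
	unsigned long taken, freed;

	taken = isolate();		/* isolate_pages_global()/memcg variant */
	if (!taken)
		return 0;
	freed = reclaim_async();	/* shrink_page_list(..., PAGEOUT_IO_ASYNC) */
	if (stall(taken, freed))	/* should_reclaim_stall() */
		freed += reclaim_sync(); /* wait, then shrink_page_list(..., SYNC) */
	putback();			/* putback_lru_pages() */
	printf("reclaimed %lu of %lu\n", freed, taken);
	return 0;
}
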
1285/* 1365/*
@@ -1583,6 +1663,13 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1583 } 1663 }
1584 1664
1585 /* 1665 /*
1666 * With swappiness at 100, anonymous and file have the same priority.
1667 * This scanning priority is essentially the inverse of IO cost.
1668 */
1669 anon_prio = sc->swappiness;
1670 file_prio = 200 - sc->swappiness;
1671
1672 /*
1586 * OK, so we have swap space and a fair amount of page cache 1673 * OK, so we have swap space and a fair amount of page cache
1587 * pages. We use the recently rotated / recently scanned 1674 * pages. We use the recently rotated / recently scanned
1588 * ratios to determine how valuable each cache is. 1675 * ratios to determine how valuable each cache is.
@@ -1593,28 +1680,18 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1593 * 1680 *
1594 * anon in [0], file in [1] 1681 * anon in [0], file in [1]
1595 */ 1682 */
1683 spin_lock_irq(&zone->lru_lock);
1596 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { 1684 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1597 spin_lock_irq(&zone->lru_lock);
1598 reclaim_stat->recent_scanned[0] /= 2; 1685 reclaim_stat->recent_scanned[0] /= 2;
1599 reclaim_stat->recent_rotated[0] /= 2; 1686 reclaim_stat->recent_rotated[0] /= 2;
1600 spin_unlock_irq(&zone->lru_lock);
1601 } 1687 }
1602 1688
1603 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { 1689 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1604 spin_lock_irq(&zone->lru_lock);
1605 reclaim_stat->recent_scanned[1] /= 2; 1690 reclaim_stat->recent_scanned[1] /= 2;
1606 reclaim_stat->recent_rotated[1] /= 2; 1691 reclaim_stat->recent_rotated[1] /= 2;
1607 spin_unlock_irq(&zone->lru_lock);
1608 } 1692 }
1609 1693
1610 /* 1694 /*
1611 * With swappiness at 100, anonymous and file have the same priority.
1612 * This scanning priority is essentially the inverse of IO cost.
1613 */
1614 anon_prio = sc->swappiness;
1615 file_prio = 200 - sc->swappiness;
1616
1617 /*
1618 * The amount of pressure on anon vs file pages is inversely 1695 * The amount of pressure on anon vs file pages is inversely
1619 * proportional to the fraction of recently scanned pages on 1696 * proportional to the fraction of recently scanned pages on
1620 * each list that were recently referenced and in active use. 1697 * each list that were recently referenced and in active use.
@@ -1624,6 +1701,7 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
1624 1701
1625 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); 1702 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1626 fp /= reclaim_stat->recent_rotated[1] + 1; 1703 fp /= reclaim_stat->recent_rotated[1] + 1;
1704 spin_unlock_irq(&zone->lru_lock);
1627 1705
1628 fraction[0] = ap; 1706 fraction[0] = ap;
1629 fraction[1] = fp; 1707 fraction[1] = fp;
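
get_scan_count() now takes the lru_lock once around both decay checks and the ap/fp computation. The weighting itself is unchanged: anon_prio and file_prio always sum to 200, and each side is scaled by the ratio of recently scanned to recently rotated pages. A worked example with swappiness = 60 and made-up reclaim_stat samples:

/*
 * Worked example of the ap/fp weighting; the recent_* numbers are
 * invented, only the formula mirrors the diff above.
 */
#include <stdio.h>

int main(void)
{
	unsigned int swappiness = 60;			/* default vm.swappiness */
	unsigned long anon_prio = swappiness;		/* 60 */
	unsigned long file_prio = 200 - swappiness;	/* 140 */

	/* Hypothetical reclaim_stat samples for one zone: [0]=anon [1]=file. */
	unsigned long recent_scanned[2] = { 1000, 4000 };
	unsigned long recent_rotated[2] = {  800,  500 };

	unsigned long ap = (anon_prio + 1) * (recent_scanned[0] + 1) /
			   (recent_rotated[0] + 1);
	unsigned long fp = (file_prio + 1) * (recent_scanned[1] + 1) /
			   (recent_rotated[1] + 1);

	/* Scan pressure is split in the ratio ap : fp. */
	printf("anon weight %lu, file weight %lu -> file gets %lu%% of the scan\n",
	       ap, fp, 100 * fp / (ap + fp));
	return 0;
}
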
@@ -1729,13 +1807,12 @@ static void shrink_zone(int priority, struct zone *zone,
1729static bool shrink_zones(int priority, struct zonelist *zonelist, 1807static bool shrink_zones(int priority, struct zonelist *zonelist,
1730 struct scan_control *sc) 1808 struct scan_control *sc)
1731{ 1809{
1732 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1733 struct zoneref *z; 1810 struct zoneref *z;
1734 struct zone *zone; 1811 struct zone *zone;
1735 bool all_unreclaimable = true; 1812 bool all_unreclaimable = true;
1736 1813
1737 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, 1814 for_each_zone_zonelist_nodemask(zone, z, zonelist,
1738 sc->nodemask) { 1815 gfp_zone(sc->gfp_mask), sc->nodemask) {
1739 if (!populated_zone(zone)) 1816 if (!populated_zone(zone))
1740 continue; 1817 continue;
1741 /* 1818 /*
@@ -1745,17 +1822,8 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
1745 if (scanning_global_lru(sc)) { 1822 if (scanning_global_lru(sc)) {
1746 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1823 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1747 continue; 1824 continue;
1748 note_zone_scanning_priority(zone, priority);
1749
1750 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 1825 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
1751 continue; /* Let kswapd poll it */ 1826 continue; /* Let kswapd poll it */
1752 } else {
1753 /*
1754 * Ignore cpuset limitation here. We just want to reduce
1755 * # of used pages by us regardless of memory shortage.
1756 */
1757 mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
1758 priority);
1759 } 1827 }
1760 1828
1761 shrink_zone(priority, zone, sc); 1829 shrink_zone(priority, zone, sc);
@@ -1787,10 +1855,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1787 bool all_unreclaimable; 1855 bool all_unreclaimable;
1788 unsigned long total_scanned = 0; 1856 unsigned long total_scanned = 0;
1789 struct reclaim_state *reclaim_state = current->reclaim_state; 1857 struct reclaim_state *reclaim_state = current->reclaim_state;
1790 unsigned long lru_pages = 0;
1791 struct zoneref *z; 1858 struct zoneref *z;
1792 struct zone *zone; 1859 struct zone *zone;
1793 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1794 unsigned long writeback_threshold; 1860 unsigned long writeback_threshold;
1795 1861
1796 get_mems_allowed(); 1862 get_mems_allowed();
@@ -1798,18 +1864,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1798 1864
1799 if (scanning_global_lru(sc)) 1865 if (scanning_global_lru(sc))
1800 count_vm_event(ALLOCSTALL); 1866 count_vm_event(ALLOCSTALL);
1801 /*
1802 * mem_cgroup will not do shrink_slab.
1803 */
1804 if (scanning_global_lru(sc)) {
1805 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1806
1807 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1808 continue;
1809
1810 lru_pages += zone_reclaimable_pages(zone);
1811 }
1812 }
1813 1867
1814 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 1868 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
1815 sc->nr_scanned = 0; 1869 sc->nr_scanned = 0;
@@ -1821,6 +1875,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1821 * over limit cgroups 1875 * over limit cgroups
1822 */ 1876 */
1823 if (scanning_global_lru(sc)) { 1877 if (scanning_global_lru(sc)) {
1878 unsigned long lru_pages = 0;
1879 for_each_zone_zonelist(zone, z, zonelist,
1880 gfp_zone(sc->gfp_mask)) {
1881 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1882 continue;
1883
1884 lru_pages += zone_reclaimable_pages(zone);
1885 }
1886
1824 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1887 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1825 if (reclaim_state) { 1888 if (reclaim_state) {
1826 sc->nr_reclaimed += reclaim_state->reclaimed_slab; 1889 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1861,17 +1924,6 @@ out:
1861 if (priority < 0) 1924 if (priority < 0)
1862 priority = 0; 1925 priority = 0;
1863 1926
1864 if (scanning_global_lru(sc)) {
1865 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1866
1867 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1868 continue;
1869
1870 zone->prev_priority = priority;
1871 }
1872 } else
1873 mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
1874
1875 delayacct_freepages_end(); 1927 delayacct_freepages_end();
1876 put_mems_allowed(); 1928 put_mems_allowed();
1877 1929
@@ -1888,6 +1940,7 @@ out:
1888unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 1940unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1889 gfp_t gfp_mask, nodemask_t *nodemask) 1941 gfp_t gfp_mask, nodemask_t *nodemask)
1890{ 1942{
1943 unsigned long nr_reclaimed;
1891 struct scan_control sc = { 1944 struct scan_control sc = {
1892 .gfp_mask = gfp_mask, 1945 .gfp_mask = gfp_mask,
1893 .may_writepage = !laptop_mode, 1946 .may_writepage = !laptop_mode,
@@ -1900,7 +1953,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1900 .nodemask = nodemask, 1953 .nodemask = nodemask,
1901 }; 1954 };
1902 1955
1903 return do_try_to_free_pages(zonelist, &sc); 1956 trace_mm_vmscan_direct_reclaim_begin(order,
1957 sc.may_writepage,
1958 gfp_mask);
1959
1960 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
1961
1962 trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
1963
1964 return nr_reclaimed;
1904} 1965}
1905 1966
1906#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1967#ifdef CONFIG_CGROUP_MEM_RES_CTLR
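
try_to_free_pages() now stores the result in a local so the end tracepoint can report it; the call is simply bracketed by begin/end hooks. A trivial sketch of that wrapping, with printf stand-ins rather than the real tracepoints:

/*
 * trace_begin()/trace_end() are printf stand-ins; do_reclaim() stands in
 * for do_try_to_free_pages().
 */
#include <stdio.h>

static void trace_begin(int order, int may_writepage, unsigned int gfp)
{
	printf("reclaim begin: order=%d may_writepage=%d gfp=%#x\n",
	       order, may_writepage, gfp);
}

static void trace_end(unsigned long nr_reclaimed)
{
	printf("reclaim end: nr_reclaimed=%lu\n", nr_reclaimed);
}

static unsigned long do_reclaim(void) { return 42; }	/* stand-in */

int main(void)
{
	unsigned long nr_reclaimed;

	trace_begin(0, 1, 0xd0);
	nr_reclaimed = do_reclaim();	/* capture so the end event can use it */
	trace_end(nr_reclaimed);
	return 0;
}
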
@@ -1908,9 +1969,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1908unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, 1969unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1909 gfp_t gfp_mask, bool noswap, 1970 gfp_t gfp_mask, bool noswap,
1910 unsigned int swappiness, 1971 unsigned int swappiness,
1911 struct zone *zone, int nid) 1972 struct zone *zone)
1912{ 1973{
1913 struct scan_control sc = { 1974 struct scan_control sc = {
1975 .nr_to_reclaim = SWAP_CLUSTER_MAX,
1914 .may_writepage = !laptop_mode, 1976 .may_writepage = !laptop_mode,
1915 .may_unmap = 1, 1977 .may_unmap = 1,
1916 .may_swap = !noswap, 1978 .may_swap = !noswap,
@@ -1918,13 +1980,13 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1918 .order = 0, 1980 .order = 0,
1919 .mem_cgroup = mem, 1981 .mem_cgroup = mem,
1920 }; 1982 };
1921 nodemask_t nm = nodemask_of_node(nid);
1922
1923 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1983 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1924 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1984 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1925 sc.nodemask = &nm; 1985
1926 sc.nr_reclaimed = 0; 1986 trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
1927 sc.nr_scanned = 0; 1987 sc.may_writepage,
1988 sc.gfp_mask);
1989
1928 /* 1990 /*
1929 * NOTE: Although we can get the priority field, using it 1991 * NOTE: Although we can get the priority field, using it
1930 * here is not a good idea, since it limits the pages we can scan. 1992 * here is not a good idea, since it limits the pages we can scan.
@@ -1933,6 +1995,9 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
1933 * the priority and make it zero. 1995 * the priority and make it zero.
1934 */ 1996 */
1935 shrink_zone(0, zone, &sc); 1997 shrink_zone(0, zone, &sc);
1998
1999 trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
2000
1936 return sc.nr_reclaimed; 2001 return sc.nr_reclaimed;
1937} 2002}
1938 2003
@@ -1942,6 +2007,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1942 unsigned int swappiness) 2007 unsigned int swappiness)
1943{ 2008{
1944 struct zonelist *zonelist; 2009 struct zonelist *zonelist;
2010 unsigned long nr_reclaimed;
1945 struct scan_control sc = { 2011 struct scan_control sc = {
1946 .may_writepage = !laptop_mode, 2012 .may_writepage = !laptop_mode,
1947 .may_unmap = 1, 2013 .may_unmap = 1,
@@ -1956,7 +2022,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1956 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 2022 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1957 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 2023 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1958 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 2024 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
1959 return do_try_to_free_pages(zonelist, &sc); 2025
2026 trace_mm_vmscan_memcg_reclaim_begin(0,
2027 sc.may_writepage,
2028 sc.gfp_mask);
2029
2030 nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
2031
2032 trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
2033
2034 return nr_reclaimed;
1960} 2035}
1961#endif 2036#endif
1962 2037
@@ -2028,22 +2103,12 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
2028 .order = order, 2103 .order = order,
2029 .mem_cgroup = NULL, 2104 .mem_cgroup = NULL,
2030 }; 2105 };
2031 /*
2032 * temp_priority is used to remember the scanning priority at which
2033 * this zone was successfully refilled to
2034 * free_pages == high_wmark_pages(zone).
2035 */
2036 int temp_priority[MAX_NR_ZONES];
2037
2038loop_again: 2106loop_again:
2039 total_scanned = 0; 2107 total_scanned = 0;
2040 sc.nr_reclaimed = 0; 2108 sc.nr_reclaimed = 0;
2041 sc.may_writepage = !laptop_mode; 2109 sc.may_writepage = !laptop_mode;
2042 count_vm_event(PAGEOUTRUN); 2110 count_vm_event(PAGEOUTRUN);
2043 2111
2044 for (i = 0; i < pgdat->nr_zones; i++)
2045 temp_priority[i] = DEF_PRIORITY;
2046
2047 for (priority = DEF_PRIORITY; priority >= 0; priority--) { 2112 for (priority = DEF_PRIORITY; priority >= 0; priority--) {
2048 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ 2113 int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
2049 unsigned long lru_pages = 0; 2114 unsigned long lru_pages = 0;
@@ -2103,7 +2168,6 @@ loop_again:
2103 for (i = 0; i <= end_zone; i++) { 2168 for (i = 0; i <= end_zone; i++) {
2104 struct zone *zone = pgdat->node_zones + i; 2169 struct zone *zone = pgdat->node_zones + i;
2105 int nr_slab; 2170 int nr_slab;
2106 int nid, zid;
2107 2171
2108 if (!populated_zone(zone)) 2172 if (!populated_zone(zone))
2109 continue; 2173 continue;
@@ -2111,18 +2175,14 @@ loop_again:
2111 if (zone->all_unreclaimable && priority != DEF_PRIORITY) 2175 if (zone->all_unreclaimable && priority != DEF_PRIORITY)
2112 continue; 2176 continue;
2113 2177
2114 temp_priority[i] = priority;
2115 sc.nr_scanned = 0; 2178 sc.nr_scanned = 0;
2116 note_zone_scanning_priority(zone, priority);
2117 2179
2118 nid = pgdat->node_id;
2119 zid = zone_idx(zone);
2120 /* 2180 /*
2121 * Call soft limit reclaim before calling shrink_zone. 2181 * Call soft limit reclaim before calling shrink_zone.
2122 * For now we ignore the return value 2182 * For now we ignore the return value
2123 */ 2183 */
2124 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, 2184 mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
2125 nid, zid); 2185
2126 /* 2186 /*
2127 * We put equal pressure on every zone, unless one 2187 * We put equal pressure on every zone, unless one
2128 * zone has way too many pages free already. 2188 * zone has way too many pages free already.
@@ -2186,16 +2246,6 @@ loop_again:
2186 break; 2246 break;
2187 } 2247 }
2188out: 2248out:
2189 /*
2190 * Note within each zone the priority level at which this zone was
2191 * brought into a happy state. So that the next thread which scans this
2192 * zone will start out at that priority level.
2193 */
2194 for (i = 0; i < pgdat->nr_zones; i++) {
2195 struct zone *zone = pgdat->node_zones + i;
2196
2197 zone->prev_priority = temp_priority[i];
2198 }
2199 if (!all_zones_ok) { 2249 if (!all_zones_ok) {
2200 cond_resched(); 2250 cond_resched();
2201 2251
@@ -2299,9 +2349,10 @@ static int kswapd(void *p)
2299 * premature sleep. If not, then go fully 2349 * premature sleep. If not, then go fully
2300 * to sleep until explicitly woken up 2350 * to sleep until explicitly woken up
2301 */ 2351 */
2302 if (!sleeping_prematurely(pgdat, order, remaining)) 2352 if (!sleeping_prematurely(pgdat, order, remaining)) {
2353 trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
2303 schedule(); 2354 schedule();
2304 else { 2355 } else {
2305 if (remaining) 2356 if (remaining)
2306 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); 2357 count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
2307 else 2358 else
@@ -2321,8 +2372,10 @@ static int kswapd(void *p)
2321 * We can speed up thawing tasks if we don't call balance_pgdat 2372 * We can speed up thawing tasks if we don't call balance_pgdat
2322 * after returning from the refrigerator 2373 * after returning from the refrigerator
2323 */ 2374 */
2324 if (!ret) 2375 if (!ret) {
2376 trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
2325 balance_pgdat(pgdat, order); 2377 balance_pgdat(pgdat, order);
2378 }
2326 } 2379 }
2327 return 0; 2380 return 0;
2328} 2381}
@@ -2342,6 +2395,7 @@ void wakeup_kswapd(struct zone *zone, int order)
2342 return; 2395 return;
2343 if (pgdat->kswapd_max_order < order) 2396 if (pgdat->kswapd_max_order < order)
2344 pgdat->kswapd_max_order = order; 2397 pgdat->kswapd_max_order = order;
2398 trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
2345 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 2399 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
2346 return; 2400 return;
2347 if (!waitqueue_active(&pgdat->kswapd_wait)) 2401 if (!waitqueue_active(&pgdat->kswapd_wait))
@@ -2590,9 +2644,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2590 .swappiness = vm_swappiness, 2644 .swappiness = vm_swappiness,
2591 .order = order, 2645 .order = order,
2592 }; 2646 };
2593 unsigned long slab_reclaimable; 2647 unsigned long nr_slab_pages0, nr_slab_pages1;
2594 2648
2595 disable_swap_token();
2596 cond_resched(); 2649 cond_resched();
2597 /* 2650 /*
2598 * We need to be able to allocate from the reserves for RECLAIM_SWAP 2651 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -2611,14 +2664,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2611 */ 2664 */
2612 priority = ZONE_RECLAIM_PRIORITY; 2665 priority = ZONE_RECLAIM_PRIORITY;
2613 do { 2666 do {
2614 note_zone_scanning_priority(zone, priority);
2615 shrink_zone(priority, zone, &sc); 2667 shrink_zone(priority, zone, &sc);
2616 priority--; 2668 priority--;
2617 } while (priority >= 0 && sc.nr_reclaimed < nr_pages); 2669 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2618 } 2670 }
2619 2671
2620 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2672 nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2621 if (slab_reclaimable > zone->min_slab_pages) { 2673 if (nr_slab_pages0 > zone->min_slab_pages) {
2622 /* 2674 /*
2623 * shrink_slab() does not currently allow us to determine how 2675 * shrink_slab() does not currently allow us to determine how
2624 * many pages were freed in this zone. So we take the current 2676 * many pages were freed in this zone. So we take the current
@@ -2629,17 +2681,27 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2629 * Note that shrink_slab will free memory on all zones and may 2681 * Note that shrink_slab will free memory on all zones and may
2630 * take a long time. 2682 * take a long time.
2631 */ 2683 */
2632 while (shrink_slab(sc.nr_scanned, gfp_mask, order) && 2684 for (;;) {
2633 zone_page_state(zone, NR_SLAB_RECLAIMABLE) > 2685 unsigned long lru_pages = zone_reclaimable_pages(zone);
2634 slab_reclaimable - nr_pages) 2686
2635 ; 2687 /* No reclaimable slab or very low memory pressure */
2688 if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages))
2689 break;
2690
2691 /* Freed enough memory */
2692 nr_slab_pages1 = zone_page_state(zone,
2693 NR_SLAB_RECLAIMABLE);
2694 if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
2695 break;
2696 }
2636 2697
2637 /* 2698 /*
2638 * Update nr_reclaimed by the number of slab pages we 2699 * Update nr_reclaimed by the number of slab pages we
2639 * reclaimed from this zone. 2700 * reclaimed from this zone.
2640 */ 2701 */
2641 sc.nr_reclaimed += slab_reclaimable - 2702 nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2642 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2703 if (nr_slab_pages1 < nr_slab_pages0)
2704 sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
2643 } 2705 }
2644 2706
2645 p->reclaim_state = NULL; 2707 p->reclaim_state = NULL;
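
The reworked __zone_reclaim() slab path keeps calling shrink_slab() until it either reports no progress or the reclaimable-slab count has dropped by at least nr_pages from its starting value, then credits the difference to nr_reclaimed. A userspace model of that loop follows; slab_pages() and do_shrink() are stand-ins for zone_page_state(zone, NR_SLAB_RECLAIMABLE) and shrink_slab().

/*
 * Model of the retry loop: stop on no progress or once enough has been
 * freed relative to the starting count. Numbers are illustrative.
 */
#include <stdio.h>

static unsigned long slab = 1000;		/* reclaimable slab pages */

static unsigned long slab_pages(void) { return slab; }
static unsigned long do_shrink(void)		/* frees up to 64 pages */
{
	unsigned long freed = slab >= 64 ? 64 : slab;
	slab -= freed;
	return freed;
}

int main(void)
{
	unsigned long nr_pages = 256;		/* reclaim target */
	unsigned long pages0 = slab_pages(), pages1;
	unsigned long reclaimed = 0;

	for (;;) {
		if (!do_shrink())		/* no progress: stop */
			break;
		pages1 = slab_pages();
		if (pages1 + nr_pages <= pages0) /* freed enough: stop */
			break;
	}
	pages1 = slab_pages();
	if (pages1 < pages0)
		reclaimed = pages0 - pages1;
	printf("slab pages reclaimed: %lu\n", reclaimed);
	return 0;
}
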
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7759941d4e77..f389168f9a83 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -22,14 +22,14 @@
22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 22DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
23EXPORT_PER_CPU_SYMBOL(vm_event_states); 23EXPORT_PER_CPU_SYMBOL(vm_event_states);
24 24
25static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) 25static void sum_vm_events(unsigned long *ret)
26{ 26{
27 int cpu; 27 int cpu;
28 int i; 28 int i;
29 29
30 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); 30 memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
31 31
32 for_each_cpu(cpu, cpumask) { 32 for_each_online_cpu(cpu) {
33 struct vm_event_state *this = &per_cpu(vm_event_states, cpu); 33 struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
34 34
35 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) 35 for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
@@ -45,7 +45,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
45void all_vm_events(unsigned long *ret) 45void all_vm_events(unsigned long *ret)
46{ 46{
47 get_online_cpus(); 47 get_online_cpus();
48 sum_vm_events(ret, cpu_online_mask); 48 sum_vm_events(ret);
49 put_online_cpus(); 49 put_online_cpus();
50} 50}
51EXPORT_SYMBOL_GPL(all_vm_events); 51EXPORT_SYMBOL_GPL(all_vm_events);
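
sum_vm_events() drops its cpumask argument and walks the online CPUs directly; all_vm_events() still brackets the walk with get_online_cpus()/put_online_cpus(). A small userspace analogue of summing per-CPU counters over an online mask, with the array sizes and the mask purely illustrative:

/*
 * Sum per-CPU event counters for online CPUs only. per_cpu_events and
 * online[] are assumptions for the example.
 */
#include <stdio.h>
#include <string.h>

#define NR_CPUS   4
#define NR_EVENTS 3

static unsigned long per_cpu_events[NR_CPUS][NR_EVENTS] = {
	{ 1, 2, 3 }, { 4, 5, 6 }, { 7, 8, 9 }, { 10, 11, 12 },
};
static int online[NR_CPUS] = { 1, 1, 0, 1 };	/* cpu 2 is offline */

static void sum_events(unsigned long *ret)
{
	memset(ret, 0, NR_EVENTS * sizeof(*ret));
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!online[cpu])		/* for_each_online_cpu() analogue */
			continue;
		for (int i = 0; i < NR_EVENTS; i++)
			ret[i] += per_cpu_events[cpu][i];
	}
}

int main(void)
{
	unsigned long totals[NR_EVENTS];

	sum_events(totals);
	for (int i = 0; i < NR_EVENTS; i++)
		printf("event %d: %lu\n", i, totals[i]);
	return 0;
}
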
@@ -853,11 +853,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
853 } 853 }
854 seq_printf(m, 854 seq_printf(m,
855 "\n all_unreclaimable: %u" 855 "\n all_unreclaimable: %u"
856 "\n prev_priority: %i"
857 "\n start_pfn: %lu" 856 "\n start_pfn: %lu"
858 "\n inactive_ratio: %u", 857 "\n inactive_ratio: %u",
859 zone->all_unreclaimable, 858 zone->all_unreclaimable,
860 zone->prev_priority,
861 zone->zone_start_pfn, 859 zone->zone_start_pfn,
862 zone->inactive_ratio); 860 zone->inactive_ratio);
863 seq_putc(m, '\n'); 861 seq_putc(m, '\n');