Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 462
1 file changed, 267 insertions(+), 195 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 20a8193a7af8..3eed583895a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,10 +47,13 @@
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
+#include <linux/oom.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
 
+#include <trace/events/vmscan.h>
+
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
 struct mem_cgroup *root_mem_cgroup __read_mostly;
@@ -211,8 +214,6 @@ struct mem_cgroup {
 	 */
 	spinlock_t reclaim_param_lock;
 
-	int	prev_priority;	/* for recording reclaim priority */
-
 	/*
 	 * While reclaiming in a hierarchy, we cache the last child we
 	 * reclaimed from.
@@ -268,6 +269,7 @@ enum move_type {
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
+	spinlock_t	  lock; /* for from, to, moving_task */
 	struct mem_cgroup *from;
 	struct mem_cgroup *to;
 	unsigned long precharge;
@@ -276,6 +278,7 @@ static struct move_charge_struct {
 	struct task_struct *moving_task;	/* a task moving charges */
 	wait_queue_head_t waitq;		/* a waitq for other context */
 } mc = {
+	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
 	int ret;
 	struct mem_cgroup *curr = NULL;
+	struct task_struct *p;
 
-	task_lock(task);
-	rcu_read_lock();
-	curr = try_get_mem_cgroup_from_mm(task->mm);
-	rcu_read_unlock();
-	task_unlock(task);
+	p = find_lock_task_mm(task);
+	if (!p)
+		return 0;
+	curr = try_get_mem_cgroup_from_mm(p->mm);
+	task_unlock(p);
 	if (!curr)
 		return 0;
 	/*
@@ -858,35 +862,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	return ret;
 }
 
-/*
- * prev_priority control...this will be used in memory reclaim path.
- */
-int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
-{
-	int prev_priority;
-
-	spin_lock(&mem->reclaim_param_lock);
-	prev_priority = mem->prev_priority;
-	spin_unlock(&mem->reclaim_param_lock);
-
-	return prev_priority;
-}
-
-void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-	spin_lock(&mem->reclaim_param_lock);
-	if (priority < mem->prev_priority)
-		mem->prev_priority = priority;
-	spin_unlock(&mem->reclaim_param_lock);
-}
-
-void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
-{
-	spin_lock(&mem->reclaim_param_lock);
-	mem->prev_priority = priority;
-	spin_unlock(&mem->reclaim_param_lock);
-}
-
 static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
 {
 	unsigned long active;
@@ -944,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 				       struct zone *zone,
 				       enum lru_list lru)
 {
-	int nid = zone->zone_pgdat->node_id;
+	int nid = zone_to_nid(zone);
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
@@ -954,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
 						      struct zone *zone)
 {
-	int nid = zone->zone_pgdat->node_id;
+	int nid = zone_to_nid(zone);
 	int zid = zone_idx(zone);
 	struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
 
@@ -999,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	LIST_HEAD(pc_list);
 	struct list_head *src;
 	struct page_cgroup *pc, *tmp;
-	int nid = z->zone_pgdat->node_id;
+	int nid = zone_to_nid(z);
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
 	int lru = LRU_FILE * file + active;
@@ -1038,6 +1013,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	}
 
 	*scanned = scan;
+
+	trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken,
+				      0, 0, 0, mode);
+
 	return nr_taken;
 }
 
@@ -1072,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
+/* A routine for testing mem is not under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+	struct mem_cgroup *from;
+	struct mem_cgroup *to;
+	bool ret = false;
+	/*
+	 * Unlike task_move routines, we access mc.to, mc.from not under
+	 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+	 */
+	spin_lock(&mc.lock);
+	from = mc.from;
+	to = mc.to;
+	if (!from)
+		goto unlock;
+	if (from == mem || to == mem
+	    || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+	    || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
+		ret = true;
+unlock:
+	spin_unlock(&mc.lock);
+	return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+	if (mc.moving_task && current != mc.moving_task) {
+		if (mem_cgroup_under_move(mem)) {
+			DEFINE_WAIT(wait);
+			prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+			/* moving charge context might have finished. */
+			if (mc.moving_task)
+				schedule();
+			finish_wait(&mc.waitq, &wait);
+			return true;
+		}
+	}
+	return false;
+}
+
 static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
 {
 	int *val = data;
@@ -1158,6 +1178,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
 }
 
 /*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+	u64 limit;
+	u64 memsw;
+
+	limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
+			total_swap_pages;
+	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+	/*
+	 * If memsw is finite and limits the amount of swap space available
+	 * to this memcg, return that limit.
+	 */
+	return min(limit, memsw);
+}
+
+/*
  * Visit the first child (need not be the first child as per the ordering
  * of the cgroup list, since we track last_scanned_child) of @mem and use
  * that to reclaim free pages from.
@@ -1262,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		/* we use swappiness of local cgroup */
 		if (check_soft)
 			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
-				noswap, get_swappiness(victim), zone,
-				zone->zone_pgdat->node_id);
+				noswap, get_swappiness(victim), zone);
 		else
 			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
 						noswap, get_swappiness(victim));
@@ -1370,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
 
 static void memcg_oom_recover(struct mem_cgroup *mem)
 {
-	if (atomic_read(&mem->oom_lock))
+	if (mem && atomic_read(&mem->oom_lock))
 		memcg_wakeup_oom(mem);
 }
 
@@ -1582,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+	CHARGE_OK,		/* success */
+	CHARGE_RETRY,		/* need to retry but retry is not bad */
+	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
+	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
+	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+				int csize, bool oom_check)
+{
+	struct mem_cgroup *mem_over_limit;
+	struct res_counter *fail_res;
+	unsigned long flags = 0;
+	int ret;
+
+	ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+	if (likely(!ret)) {
+		if (!do_swap_account)
+			return CHARGE_OK;
+		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+		if (likely(!ret))
+			return CHARGE_OK;
+
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+	} else
+		mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+	if (csize > PAGE_SIZE) /* change csize and retry */
+		return CHARGE_RETRY;
+
+	if (!(gfp_mask & __GFP_WAIT))
+		return CHARGE_WOULDBLOCK;
+
+	ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+					gfp_mask, flags);
+	/*
+	 * try_to_free_mem_cgroup_pages() might not give us a full
+	 * picture of reclaim. Some pages are reclaimed and might be
+	 * moved to swap cache or just unmapped from the cgroup.
+	 * Check the limit again to see if the reclaim reduced the
+	 * current usage of the cgroup before giving up
+	 */
+	if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/*
+	 * At task move, charge accounts can be doubly counted. So, it's
+	 * better to wait until the end of task_move if something is going on.
+	 */
+	if (mem_cgroup_wait_acct_move(mem_over_limit))
+		return CHARGE_RETRY;
+
+	/* If we don't need to call oom-killer at el, return immediately */
+	if (!oom_check)
+		return CHARGE_NOMEM;
+	/* check OOM */
+	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+		return CHARGE_OOM_DIE;
+
+	return CHARGE_RETRY;
+}
+
 /*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
 {
-	struct mem_cgroup *mem, *mem_over_limit;
-	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res;
+	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+	struct mem_cgroup *mem = NULL;
+	int ret;
 	int csize = CHARGE_SIZE;
 
 	/*
@@ -1609,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	 * thread group leader migrates. It's possible that mm is not
 	 * set, if so charge the init_mm (happens for pagecache usage).
 	 */
-	mem = *memcg;
-	if (likely(!mem)) {
-		mem = try_get_mem_cgroup_from_mm(mm);
-		*memcg = mem;
-	} else {
-		css_get(&mem->css);
-	}
-	if (unlikely(!mem))
-		return 0;
-
-	VM_BUG_ON(css_is_removed(&mem->css));
-	if (mem_cgroup_is_root(mem))
-		goto done;
-
-	while (1) {
-		int ret = 0;
-		unsigned long flags = 0;
-
+	if (!*memcg && !mm)
+		goto bypass;
+again:
+	if (*memcg) { /* css should be a valid one */
+		mem = *memcg;
+		VM_BUG_ON(css_is_removed(&mem->css));
+		if (mem_cgroup_is_root(mem))
+			goto done;
 		if (consume_stock(mem))
 			goto done;
+		css_get(&mem->css);
+	} else {
+		struct task_struct *p;
 
-		ret = res_counter_charge(&mem->res, csize, &fail_res);
-		if (likely(!ret)) {
-			if (!do_swap_account)
-				break;
-			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
-			if (likely(!ret))
-				break;
-			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, csize);
-			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									memsw);
-		} else
-			/* mem counter fails */
-			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
-									res);
-
-		/* reduce request size and retry */
-		if (csize > PAGE_SIZE) {
-			csize = PAGE_SIZE;
-			continue;
-		}
-		if (!(gfp_mask & __GFP_WAIT))
-			goto nomem;
-
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
-						gfp_mask, flags);
-		if (ret)
-			continue;
-
+		rcu_read_lock();
+		p = rcu_dereference(mm->owner);
+		VM_BUG_ON(!p);
 		/*
-		 * try_to_free_mem_cgroup_pages() might not give us a full
-		 * picture of reclaim. Some pages are reclaimed and might be
-		 * moved to swap cache or just unmapped from the cgroup.
-		 * Check the limit again to see if the reclaim reduced the
-		 * current usage of the cgroup before giving up
-		 *
+		 * because we don't have task_lock(), "p" can exit while
+		 * we're here. In that case, "mem" can point to root
+		 * cgroup but never be NULL. (and task_struct itself is freed
+		 * by RCU, cgroup itself is RCU safe.) Then, we have small
+		 * risk here to get wrong cgroup. But such kind of mis-account
+		 * by race always happens because we don't have cgroup_mutex().
+		 * It's overkill and we allow that small race, here.
 		 */
-		if (mem_cgroup_check_under_limit(mem_over_limit))
-			continue;
-
-		/* try to avoid oom while someone is moving charge */
-		if (mc.moving_task && current != mc.moving_task) {
-			struct mem_cgroup *from, *to;
-			bool do_continue = false;
+		mem = mem_cgroup_from_task(p);
+		VM_BUG_ON(!mem);
+		if (mem_cgroup_is_root(mem)) {
+			rcu_read_unlock();
+			goto done;
+		}
+		if (consume_stock(mem)) {
 			/*
-			 * There is a small race that "from" or "to" can be
-			 * freed by rmdir, so we use css_tryget().
+			 * It seems dagerous to access memcg without css_get().
+			 * But considering how consume_stok works, it's not
+			 * necessary. If consume_stock success, some charges
+			 * from this memcg are cached on this cpu. So, we
+			 * don't need to call css_get()/css_tryget() before
+			 * calling consume_stock().
 			 */
-			from = mc.from;
-			to = mc.to;
-			if (from && css_tryget(&from->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&from->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (from == mem_over_limit);
-				css_put(&from->css);
-			}
-			if (!do_continue && to && css_tryget(&to->css)) {
-				if (mem_over_limit->use_hierarchy)
-					do_continue = css_is_ancestor(
-							&to->css,
-							&mem_over_limit->css);
-				else
-					do_continue = (to == mem_over_limit);
-				css_put(&to->css);
-			}
-			if (do_continue) {
-				DEFINE_WAIT(wait);
-				prepare_to_wait(&mc.waitq, &wait,
-						TASK_INTERRUPTIBLE);
-				/* moving charge context might have finished. */
-				if (mc.moving_task)
-					schedule();
-				finish_wait(&mc.waitq, &wait);
-				continue;
-			}
+			rcu_read_unlock();
+			goto done;
+		}
+		/* after here, we may be blocked. we need to get refcnt */
+		if (!css_tryget(&mem->css)) {
+			rcu_read_unlock();
+			goto again;
+		}
+		rcu_read_unlock();
+	}
+
+	do {
+		bool oom_check;
+
+		/* If killed, bypass charge */
+		if (fatal_signal_pending(current)) {
+			css_put(&mem->css);
+			goto bypass;
+		}
+
+		oom_check = false;
+		if (oom && !nr_oom_retries) {
+			oom_check = true;
+			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
 		}
 
-		if (!nr_retries--) {
-			if (!oom)
+		ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
+
+		switch (ret) {
+		case CHARGE_OK:
+			break;
+		case CHARGE_RETRY: /* not in OOM situation but retry */
+			csize = PAGE_SIZE;
+			css_put(&mem->css);
+			mem = NULL;
+			goto again;
+		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+			css_put(&mem->css);
+			goto nomem;
+		case CHARGE_NOMEM: /* OOM routine works */
+			if (!oom) {
+				css_put(&mem->css);
 				goto nomem;
-			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
-				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-				continue;
 			}
-			/* When we reach here, current task is dying .*/
+			/* If oom, we never return -ENOMEM */
+			nr_oom_retries--;
+			break;
+		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
 			css_put(&mem->css);
 			goto bypass;
 		}
-	}
+	} while (ret != CHARGE_OK);
+
 	if (csize > PAGE_SIZE)
 		refill_stock(mem, csize - PAGE_SIZE);
+	css_put(&mem->css);
 done:
+	*memcg = mem;
 	return 0;
 nomem:
-	css_put(&mem->css);
+	*memcg = NULL;
 	return -ENOMEM;
 bypass:
 	*memcg = NULL;
@@ -1747,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
 		res_counter_uncharge(&mem->res, PAGE_SIZE * count);
 		if (do_swap_account)
 			res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_put(&mem->css, (int)count);
 	}
-	/* we don't need css_put for root */
 }
 
 static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1979,10 +2061,9 @@ out:
  * < 0 if the cgroup is over its limit
  */
 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask, enum charge_type ctype,
-				struct mem_cgroup *memcg)
+				gfp_t gfp_mask, enum charge_type ctype)
 {
-	struct mem_cgroup *mem;
+	struct mem_cgroup *mem = NULL;
 	struct page_cgroup *pc;
 	int ret;
 
@@ -1992,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 		return 0;
 	prefetchw(pc);
 
-	mem = memcg;
 	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
 	if (ret || !mem)
 		return ret;
@@ -2020,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
 	if (unlikely(!mm))
 		mm = &init_mm;
 	return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+				MEM_CGROUP_CHARGE_TYPE_MAPPED);
 }
 
 static void
@@ -2030,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 				gfp_t gfp_mask)
 {
-	struct mem_cgroup *mem = NULL;
 	int ret;
 
 	if (mem_cgroup_disabled())
@@ -2051,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 	if (!(gfp_mask & __GFP_WAIT)) {
 		struct page_cgroup *pc;
 
-
 		pc = lookup_page_cgroup(page);
 		if (!pc)
 			return 0;
@@ -2063,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
 		unlock_page_cgroup(pc);
 	}
 
-	if (unlikely(!mm && !mem))
+	if (unlikely(!mm))
 		mm = &init_mm;
 
 	if (page_is_file_cache(page))
 		return mem_cgroup_charge_common(page, mm, gfp_mask,
-				MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+				MEM_CGROUP_CHARGE_TYPE_CACHE);
 
 	/* shmem */
 	if (PageSwapCache(page)) {
+		struct mem_cgroup *mem = NULL;
+
 		ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
 		if (!ret)
 			__mem_cgroup_commit_charge_swapin(page, mem,
 					MEM_CGROUP_CHARGE_TYPE_SHMEM);
 	} else
 		ret = mem_cgroup_charge_common(page, mm, gfp_mask,
-					MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+					MEM_CGROUP_CHARGE_TYPE_SHMEM);
 
 	return ret;
 }
@@ -2114,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 		goto charge_cur_mm;
 	*ptr = mem;
 	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
-	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
@@ -2245,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 {
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
-	struct mem_cgroup_per_zone *mz;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -2285,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem))
-		__do_uncharge(mem, ctype);
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -2299,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	 * special functions.
 	 */
 
-	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
-
+	/*
+	 * even after unlock, we have mem->res.usage here and this memcg
+	 * will never be freed.
+	 */
 	memcg_check_events(mem, page);
-	/* at swapout, this memcg will be accessed to record to swap */
-	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		css_put(&mem->css);
+	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+		mem_cgroup_swap_statistics(mem, true);
+		mem_cgroup_get(mem);
+	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 
 	return mem;
 
@@ -2392,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
 
 	memcg = __mem_cgroup_uncharge_common(page, ctype);
 
-	/* record memcg information */
-	if (do_swap_account && swapout && memcg) {
+	/*
+	 * record memcg information, if swapout && memcg != NULL,
+	 * mem_cgroup_get() was called in uncharge().
+	 */
+	if (do_swap_account && swapout && memcg)
 		swap_cgroup_record(ent, css_id(&memcg->css));
-		mem_cgroup_get(memcg);
-	}
-	if (swapout && memcg)
-		css_put(&memcg->css);
 }
 #endif
 
@@ -2476,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 			 */
 			if (!mem_cgroup_is_root(to))
 				res_counter_uncharge(&to->res, PAGE_SIZE);
-			css_put(&to->css);
 		}
 		return 0;
 	}
@@ -2611,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
 	ClearPageCgroupMigration(pc);
 	unlock_page_cgroup(pc);
 
-	if (unused != oldpage)
-		pc = lookup_page_cgroup(unused);
 	__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
 
-	pc = lookup_page_cgroup(used);
 	/*
 	 * If a page is a file cache, radix-tree replacement is very atomic
 	 * and we can skip this check. When it was an Anon page, its mapcount
@@ -2791,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
-					    gfp_t gfp_mask, int nid,
-					    int zid)
+					    gfp_t gfp_mask)
 {
 	unsigned long nr_reclaimed = 0;
 	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2804,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	if (order > 0)
 		return 0;
 
-	mctz = soft_limit_tree_node_zone(nid, zid);
+	mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
 	/*
 	 * This loop can run a while, specially if mem_cgroup's continuously
 	 * keep exceeding their soft limit and putting the system under
@@ -3759,8 +3832,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
 	return 0;
 }
 
-/*
- */
 static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	struct cftype *cft, u64 val)
 {
@@ -4180,9 +4251,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 			goto one_by_one;
 		}
 		mc.precharge += count;
-		VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
-		WARN_ON_ONCE(count > INT_MAX);
-		__css_get(&mem->css, (int)count);
 		return ret;
 	}
 one_by_one:
@@ -4400,11 +4468,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
 
 static void mem_cgroup_clear_mc(void)
 {
+	struct mem_cgroup *from = mc.from;
+	struct mem_cgroup *to = mc.to;
+
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
 		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
-		memcg_oom_recover(mc.to);
 	}
 	/*
 	 * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4413,11 +4483,9 @@ static void mem_cgroup_clear_mc(void)
 	if (mc.moved_charge) {
 		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
-		memcg_oom_recover(mc.from);
 	}
 	/* we must fixup refcnts and charges */
 	if (mc.moved_swap) {
-		WARN_ON_ONCE(mc.moved_swap > INT_MAX);
 		/* uncharge swap account from the old cgroup */
 		if (!mem_cgroup_is_root(mc.from))
 			res_counter_uncharge(&mc.from->memsw,
@@ -4431,16 +4499,18 @@ static void mem_cgroup_clear_mc(void)
 			 */
 			res_counter_uncharge(&mc.to->res,
 						PAGE_SIZE * mc.moved_swap);
-			VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
-			__css_put(&mc.to->css, mc.moved_swap);
 		}
 		/* we've already done mem_cgroup_get(mc.to) */
 
 		mc.moved_swap = 0;
 	}
+	spin_lock(&mc.lock);
 	mc.from = NULL;
 	mc.to = NULL;
 	mc.moving_task = NULL;
+	spin_unlock(&mc.lock);
+	memcg_oom_recover(from);
+	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
 }
 
@@ -4469,12 +4539,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
 			mc.precharge = 0;
 			mc.moved_charge = 0;
 			mc.moved_swap = 0;
 			mc.moving_task = current;
+			spin_unlock(&mc.lock);
 
 			ret = mem_cgroup_precharge_mc(mm);
 			if (ret)