author    Ingo Molnar <mingo@kernel.org>    2015-03-04 00:35:43 -0500
committer Ingo Molnar <mingo@kernel.org>    2015-03-04 00:35:43 -0500
commit    d2c032e3dc58137a7261a7824d3acce435db1d66 (patch)
tree      7eea1c7c6103eefe879f07472eec99b3c41eb792 /mm/memcontrol.c
parent    7e8e385aaf6ed5b64b5d9108081cfcdcdd021b78 (diff)
parent    13a7a6ac0a11197edcd0f756a035f472b42cdf8b (diff)
Merge tag 'v4.0-rc2' into x86/asm, to refresh the tree
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  1073
1 file changed, 580 insertions(+), 493 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2f6893c2f01b..9fe07692eaad 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
72#define MEM_CGROUP_RECLAIM_RETRIES 5 72#define MEM_CGROUP_RECLAIM_RETRIES 5
73static struct mem_cgroup *root_mem_cgroup __read_mostly; 73static struct mem_cgroup *root_mem_cgroup __read_mostly;
74 74
75/* Whether the swap controller is active */
75#ifdef CONFIG_MEMCG_SWAP 76#ifdef CONFIG_MEMCG_SWAP
76/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
77int do_swap_account __read_mostly; 77int do_swap_account __read_mostly;
78
79/* for remember boot option*/
80#ifdef CONFIG_MEMCG_SWAP_ENABLED
81static int really_do_swap_account __initdata = 1;
82#else
83static int really_do_swap_account __initdata;
84#endif
85
86#else 78#else
87#define do_swap_account 0 79#define do_swap_account 0
88#endif 80#endif
89 81
90
91static const char * const mem_cgroup_stat_names[] = { 82static const char * const mem_cgroup_stat_names[] = {
92 "cache", 83 "cache",
93 "rss", 84 "rss",
@@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = {
97 "swap", 88 "swap",
98}; 89};
99 90
100enum mem_cgroup_events_index {
101 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
102 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
103 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
104 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
105 MEM_CGROUP_EVENTS_NSTATS,
106};
107
108static const char * const mem_cgroup_events_names[] = { 91static const char * const mem_cgroup_events_names[] = {
109 "pgpgin", 92 "pgpgin",
110 "pgpgout", 93 "pgpgout",
@@ -138,7 +121,7 @@ enum mem_cgroup_events_target {
138 121
139struct mem_cgroup_stat_cpu { 122struct mem_cgroup_stat_cpu {
140 long count[MEM_CGROUP_STAT_NSTATS]; 123 long count[MEM_CGROUP_STAT_NSTATS];
141 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 124 unsigned long events[MEMCG_NR_EVENTS];
142 unsigned long nr_page_events; 125 unsigned long nr_page_events;
143 unsigned long targets[MEM_CGROUP_NTARGETS]; 126 unsigned long targets[MEM_CGROUP_NTARGETS];
144}; 127};
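For context: the enum deleted above moves to include/linux/memcontrol.h rather than going away, and in v4.0 it also grows the default-hierarchy events that later hunks start counting, which is why the per-cpu array above is now sized by MEMCG_NR_EVENTS. Roughly, as a sketch from the v4.0 header (not quoted from this patch):

/* include/linux/memcontrol.h (v4.0, approximate) */
enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
	/* default hierarchy events */
	MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS,
	MEMCG_HIGH,
	MEMCG_MAX,
	MEMCG_OOM,
	MEMCG_NR_EVENTS,
};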
@@ -284,6 +267,10 @@ struct mem_cgroup {
284 struct page_counter memsw; 267 struct page_counter memsw;
285 struct page_counter kmem; 268 struct page_counter kmem;
286 269
270 /* Normal memory consumption range */
271 unsigned long low;
272 unsigned long high;
273
287 unsigned long soft_limit; 274 unsigned long soft_limit;
288 275
289 /* vmpressure notifications */ 276 /* vmpressure notifications */
@@ -325,9 +312,11 @@ struct mem_cgroup {
325 /* 312 /*
326 * set > 0 if pages under this cgroup are moving to other cgroup. 313 * set > 0 if pages under this cgroup are moving to other cgroup.
327 */ 314 */
328 atomic_t moving_account; 315 atomic_t moving_account;
329 /* taken only while moving_account > 0 */ 316 /* taken only while moving_account > 0 */
330 spinlock_t move_lock; 317 spinlock_t move_lock;
318 struct task_struct *move_lock_task;
319 unsigned long move_lock_flags;
331 /* 320 /*
332 * percpu counter. 321 * percpu counter.
333 */ 322 */
@@ -343,11 +332,10 @@ struct mem_cgroup {
343 struct cg_proto tcp_mem; 332 struct cg_proto tcp_mem;
344#endif 333#endif
345#if defined(CONFIG_MEMCG_KMEM) 334#if defined(CONFIG_MEMCG_KMEM)
346 /* analogous to slab_common's slab_caches list, but per-memcg; 335 /* Index in the kmem_cache->memcg_params.memcg_caches array */
347 * protected by memcg_slab_mutex */
348 struct list_head memcg_slab_caches;
349 /* Index in the kmem_cache->memcg_params->memcg_caches array */
350 int kmemcg_id; 336 int kmemcg_id;
337 bool kmem_acct_activated;
338 bool kmem_acct_active;
351#endif 339#endif
352 340
353 int last_scanned_node; 341 int last_scanned_node;
@@ -366,29 +354,26 @@ struct mem_cgroup {
366}; 354};
367 355
368#ifdef CONFIG_MEMCG_KMEM 356#ifdef CONFIG_MEMCG_KMEM
369static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 357bool memcg_kmem_is_active(struct mem_cgroup *memcg)
370{ 358{
371 return memcg->kmemcg_id >= 0; 359 return memcg->kmem_acct_active;
372} 360}
373#endif 361#endif
374 362
375/* Stuffs for move charges at task migration. */ 363/* Stuffs for move charges at task migration. */
376/* 364/*
377 * Types of charges to be moved. "move_charge_at_immitgrate" and 365 * Types of charges to be moved.
378 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
379 */ 366 */
380enum move_type { 367#define MOVE_ANON 0x1U
381 MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ 368#define MOVE_FILE 0x2U
382 MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ 369#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
383 NR_MOVE_TYPE,
384};
385 370
386/* "mc" and its members are protected by cgroup_mutex */ 371/* "mc" and its members are protected by cgroup_mutex */
387static struct move_charge_struct { 372static struct move_charge_struct {
388 spinlock_t lock; /* for from, to */ 373 spinlock_t lock; /* for from, to */
389 struct mem_cgroup *from; 374 struct mem_cgroup *from;
390 struct mem_cgroup *to; 375 struct mem_cgroup *to;
391 unsigned long immigrate_flags; 376 unsigned long flags;
392 unsigned long precharge; 377 unsigned long precharge;
393 unsigned long moved_charge; 378 unsigned long moved_charge;
394 unsigned long moved_swap; 379 unsigned long moved_swap;
@@ -399,16 +384,6 @@ static struct move_charge_struct {
399 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), 384 .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
400}; 385};
401 386
402static bool move_anon(void)
403{
404 return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
405}
406
407static bool move_file(void)
408{
409 return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
410}
411
412/* 387/*
413 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 388 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
414 * limit reclaim to prevent infinite loops, if they ever occur. 389 * limit reclaim to prevent infinite loops, if they ever occur.
@@ -544,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
544} 519}
545EXPORT_SYMBOL(tcp_proto_cgroup); 520EXPORT_SYMBOL(tcp_proto_cgroup);
546 521
547static void disarm_sock_keys(struct mem_cgroup *memcg)
548{
549 if (!memcg_proto_activated(&memcg->tcp_mem))
550 return;
551 static_key_slow_dec(&memcg_socket_limit_enabled);
552}
553#else
554static void disarm_sock_keys(struct mem_cgroup *memcg)
555{
556}
557#endif 522#endif
558 523
559#ifdef CONFIG_MEMCG_KMEM 524#ifdef CONFIG_MEMCG_KMEM
560/* 525/*
561 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 526 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
562 * The main reason for not using cgroup id for this: 527 * The main reason for not using cgroup id for this:
563 * this works better in sparse environments, where we have a lot of memcgs, 528 * this works better in sparse environments, where we have a lot of memcgs,
564 * but only a few kmem-limited. Or also, if we have, for instance, 200 529 * but only a few kmem-limited. Or also, if we have, for instance, 200
565 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 530 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
566 * 200 entry array for that. 531 * 200 entry array for that.
567 * 532 *
568 * The current size of the caches array is stored in 533 * The current size of the caches array is stored in memcg_nr_cache_ids. It
569 * memcg_limited_groups_array_size. It will double each time we have to 534 * will double each time we have to increase it.
570 * increase it.
571 */ 535 */
572static DEFINE_IDA(kmem_limited_groups); 536static DEFINE_IDA(memcg_cache_ida);
573int memcg_limited_groups_array_size; 537int memcg_nr_cache_ids;
538
539/* Protects memcg_nr_cache_ids */
540static DECLARE_RWSEM(memcg_cache_ids_sem);
541
542void memcg_get_cache_ids(void)
543{
544 down_read(&memcg_cache_ids_sem);
545}
546
547void memcg_put_cache_ids(void)
548{
549 up_read(&memcg_cache_ids_sem);
550}
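The new memcg_get_cache_ids()/memcg_put_cache_ids() pair takes memcg_cache_ids_sem for reading, so memcg_nr_cache_ids cannot grow underneath a reader walking the per-memcg cache arrays. A minimal sketch of that reader-side usage (the loop body is illustrative, not taken from this patch):

/* Illustrative reader: pin memcg_nr_cache_ids while scanning per-memcg caches. */
static void scan_memcg_caches(struct kmem_cache *root_cache)
{
	int i;

	memcg_get_cache_ids();		/* down_read(&memcg_cache_ids_sem) */
	for (i = 0; i < memcg_nr_cache_ids; i++) {
		struct kmem_cache *c = cache_from_memcg_idx(root_cache, i);

		if (c)
			;		/* ... inspect the per-memcg cache ... */
	}
	memcg_put_cache_ids();		/* up_read(&memcg_cache_ids_sem) */
}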
574 551
575/* 552/*
576 * MIN_SIZE is different than 1, because we would like to avoid going through 553 * MIN_SIZE is different than 1, because we would like to avoid going through
@@ -596,32 +573,8 @@ int memcg_limited_groups_array_size;
596struct static_key memcg_kmem_enabled_key; 573struct static_key memcg_kmem_enabled_key;
597EXPORT_SYMBOL(memcg_kmem_enabled_key); 574EXPORT_SYMBOL(memcg_kmem_enabled_key);
598 575
599static void memcg_free_cache_id(int id);
600
601static void disarm_kmem_keys(struct mem_cgroup *memcg)
602{
603 if (memcg_kmem_is_active(memcg)) {
604 static_key_slow_dec(&memcg_kmem_enabled_key);
605 memcg_free_cache_id(memcg->kmemcg_id);
606 }
607 /*
608 * This check can't live in kmem destruction function,
609 * since the charges will outlive the cgroup
610 */
611 WARN_ON(page_counter_read(&memcg->kmem));
612}
613#else
614static void disarm_kmem_keys(struct mem_cgroup *memcg)
615{
616}
617#endif /* CONFIG_MEMCG_KMEM */ 576#endif /* CONFIG_MEMCG_KMEM */
618 577
619static void disarm_static_keys(struct mem_cgroup *memcg)
620{
621 disarm_sock_keys(memcg);
622 disarm_kmem_keys(memcg);
623}
624
625static struct mem_cgroup_per_zone * 578static struct mem_cgroup_per_zone *
626mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 579mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
627{ 580{
@@ -1368,6 +1321,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1368 return inactive * inactive_ratio < active; 1321 return inactive * inactive_ratio < active;
1369} 1322}
1370 1323
1324bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
1325{
1326 struct mem_cgroup_per_zone *mz;
1327 struct mem_cgroup *memcg;
1328
1329 if (mem_cgroup_disabled())
1330 return true;
1331
1332 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1333 memcg = mz->memcg;
1334
1335 return !!(memcg->css.flags & CSS_ONLINE);
1336}
1337
1371#define mem_cgroup_from_counter(counter, member) \ 1338#define mem_cgroup_from_counter(counter, member) \
1372 container_of(counter, struct mem_cgroup, member) 1339 container_of(counter, struct mem_cgroup, member)
1373 1340
@@ -1560,7 +1527,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1560 * quickly exit and free its memory. 1527 * quickly exit and free its memory.
1561 */ 1528 */
1562 if (fatal_signal_pending(current) || task_will_free_mem(current)) { 1529 if (fatal_signal_pending(current) || task_will_free_mem(current)) {
1563 set_thread_flag(TIF_MEMDIE); 1530 mark_tsk_oom_victim(current);
1564 return; 1531 return;
1565 } 1532 }
1566 1533
@@ -1934,7 +1901,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
1934 if (!memcg) 1901 if (!memcg)
1935 return false; 1902 return false;
1936 1903
1937 if (!handle) 1904 if (!handle || oom_killer_disabled)
1938 goto cleanup; 1905 goto cleanup;
1939 1906
1940 owait.memcg = memcg; 1907 owait.memcg = memcg;
@@ -1980,34 +1947,33 @@ cleanup:
1980/** 1947/**
1981 * mem_cgroup_begin_page_stat - begin a page state statistics transaction 1948 * mem_cgroup_begin_page_stat - begin a page state statistics transaction
1982 * @page: page that is going to change accounted state 1949 * @page: page that is going to change accounted state
1983 * @locked: &memcg->move_lock slowpath was taken
1984 * @flags: IRQ-state flags for &memcg->move_lock
1985 * 1950 *
1986 * This function must mark the beginning of an accounted page state 1951 * This function must mark the beginning of an accounted page state
1987 * change to prevent double accounting when the page is concurrently 1952 * change to prevent double accounting when the page is concurrently
1988 * being moved to another memcg: 1953 * being moved to another memcg:
1989 * 1954 *
1990 * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags); 1955 * memcg = mem_cgroup_begin_page_stat(page);
1991 * if (TestClearPageState(page)) 1956 * if (TestClearPageState(page))
1992 * mem_cgroup_update_page_stat(memcg, state, -1); 1957 * mem_cgroup_update_page_stat(memcg, state, -1);
1993 * mem_cgroup_end_page_stat(memcg, locked, flags); 1958 * mem_cgroup_end_page_stat(memcg);
1994 *
1995 * The RCU lock is held throughout the transaction. The fast path can
1996 * get away without acquiring the memcg->move_lock (@locked is false)
1997 * because page moving starts with an RCU grace period.
1998 *
1999 * The RCU lock also protects the memcg from being freed when the page
2000 * state that is going to change is the only thing preventing the page
2001 * from being uncharged. E.g. end-writeback clearing PageWriteback(),
2002 * which allows migration to go ahead and uncharge the page before the
2003 * account transaction might be complete.
2004 */ 1959 */
2005struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, 1960struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
2006 bool *locked,
2007 unsigned long *flags)
2008{ 1961{
2009 struct mem_cgroup *memcg; 1962 struct mem_cgroup *memcg;
1963 unsigned long flags;
2010 1964
1965 /*
1966 * The RCU lock is held throughout the transaction. The fast
1967 * path can get away without acquiring the memcg->move_lock
1968 * because page moving starts with an RCU grace period.
1969 *
1970 * The RCU lock also protects the memcg from being freed when
1971 * the page state that is going to change is the only thing
1972 * preventing the page from being uncharged.
1973 * E.g. end-writeback clearing PageWriteback(), which allows
1974 * migration to go ahead and uncharge the page before the
1975 * account transaction might be complete.
1976 */
2011 rcu_read_lock(); 1977 rcu_read_lock();
2012 1978
2013 if (mem_cgroup_disabled()) 1979 if (mem_cgroup_disabled())
@@ -2017,16 +1983,22 @@ again:
2017 if (unlikely(!memcg)) 1983 if (unlikely(!memcg))
2018 return NULL; 1984 return NULL;
2019 1985
2020 *locked = false;
2021 if (atomic_read(&memcg->moving_account) <= 0) 1986 if (atomic_read(&memcg->moving_account) <= 0)
2022 return memcg; 1987 return memcg;
2023 1988
2024 spin_lock_irqsave(&memcg->move_lock, *flags); 1989 spin_lock_irqsave(&memcg->move_lock, flags);
2025 if (memcg != page->mem_cgroup) { 1990 if (memcg != page->mem_cgroup) {
2026 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1991 spin_unlock_irqrestore(&memcg->move_lock, flags);
2027 goto again; 1992 goto again;
2028 } 1993 }
2029 *locked = true; 1994
1995 /*
1996 * When charge migration first begins, we can have locked and
1997 * unlocked page stat updates happening concurrently. Track
1998 * the task who has the lock for mem_cgroup_end_page_stat().
1999 */
2000 memcg->move_lock_task = current;
2001 memcg->move_lock_flags = flags;
2030 2002
2031 return memcg; 2003 return memcg;
2032} 2004}
@@ -2034,14 +2006,17 @@ again:
2034/** 2006/**
2035 * mem_cgroup_end_page_stat - finish a page state statistics transaction 2007 * mem_cgroup_end_page_stat - finish a page state statistics transaction
2036 * @memcg: the memcg that was accounted against 2008 * @memcg: the memcg that was accounted against
2037 * @locked: value received from mem_cgroup_begin_page_stat()
2038 * @flags: value received from mem_cgroup_begin_page_stat()
2039 */ 2009 */
2040void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked, 2010void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
2041 unsigned long *flags)
2042{ 2011{
2043 if (memcg && *locked) 2012 if (memcg && memcg->move_lock_task == current) {
2044 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2013 unsigned long flags = memcg->move_lock_flags;
2014
2015 memcg->move_lock_task = NULL;
2016 memcg->move_lock_flags = 0;
2017
2018 spin_unlock_irqrestore(&memcg->move_lock, flags);
2019 }
2045 2020
2046 rcu_read_unlock(); 2021 rcu_read_unlock();
2047} 2022}
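Spelling out the new calling convention documented in the comment above: callers no longer pass a locked/flags pair, since the lock state now lives in move_lock_task/move_lock_flags inside the memcg. A hedged sketch of a caller (the function name, page flag and stat index below are placeholders, not from this patch):

/* Illustrative page-state accounting transaction under the new API. */
static void account_clear_page_state(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = mem_cgroup_begin_page_stat(page);	/* takes RCU, maybe move_lock */
	if (TestClearPageReferenced(page))		/* placeholder page-state bit */
		mem_cgroup_update_page_stat(memcg, MEM_CGROUP_STAT_RSS, -1); /* placeholder index */
	mem_cgroup_end_page_stat(memcg);	/* drops move_lock if this task took it */
}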
@@ -2134,17 +2109,6 @@ static void drain_local_stock(struct work_struct *dummy)
2134 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2109 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2135} 2110}
2136 2111
2137static void __init memcg_stock_init(void)
2138{
2139 int cpu;
2140
2141 for_each_possible_cpu(cpu) {
2142 struct memcg_stock_pcp *stock =
2143 &per_cpu(memcg_stock, cpu);
2144 INIT_WORK(&stock->work, drain_local_stock);
2145 }
2146}
2147
2148/* 2112/*
2149 * Cache charges(val) to local per_cpu area. 2113 * Cache charges(val) to local per_cpu area.
2150 * This will be consumed by consume_stock() function, later. 2114 * This will be consumed by consume_stock() function, later.
@@ -2294,6 +2258,8 @@ retry:
2294 if (!(gfp_mask & __GFP_WAIT)) 2258 if (!(gfp_mask & __GFP_WAIT))
2295 goto nomem; 2259 goto nomem;
2296 2260
2261 mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
2262
2297 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, 2263 nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2298 gfp_mask, may_swap); 2264 gfp_mask, may_swap);
2299 2265
@@ -2335,6 +2301,8 @@ retry:
2335 if (fatal_signal_pending(current)) 2301 if (fatal_signal_pending(current))
2336 goto bypass; 2302 goto bypass;
2337 2303
2304 mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
2305
2338 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages)); 2306 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
2339nomem: 2307nomem:
2340 if (!(gfp_mask & __GFP_NOFAIL)) 2308 if (!(gfp_mask & __GFP_NOFAIL))
@@ -2346,6 +2314,16 @@ done_restock:
2346 css_get_many(&memcg->css, batch); 2314 css_get_many(&memcg->css, batch);
2347 if (batch > nr_pages) 2315 if (batch > nr_pages)
2348 refill_stock(memcg, batch - nr_pages); 2316 refill_stock(memcg, batch - nr_pages);
2317 /*
2318 * If the hierarchy is above the normal consumption range,
2319 * make the charging task trim their excess contribution.
2320 */
2321 do {
2322 if (page_counter_read(&memcg->memory) <= memcg->high)
2323 continue;
2324 mem_cgroup_events(memcg, MEMCG_HIGH, 1);
2325 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2326 } while ((memcg = parent_mem_cgroup(memcg)));
2349done: 2327done:
2350 return ret; 2328 return ret;
2351} 2329}
@@ -2476,27 +2454,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2476} 2454}
2477 2455
2478#ifdef CONFIG_MEMCG_KMEM 2456#ifdef CONFIG_MEMCG_KMEM
2479/* 2457int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2480 * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or 2458 unsigned long nr_pages)
2481 * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
2482 */
2483static DEFINE_MUTEX(memcg_slab_mutex);
2484
2485/*
2486 * This is a bit cumbersome, but it is rarely used and avoids a backpointer
2487 * in the memcg_cache_params struct.
2488 */
2489static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2490{
2491 struct kmem_cache *cachep;
2492
2493 VM_BUG_ON(p->is_root_cache);
2494 cachep = p->root_cache;
2495 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
2496}
2497
2498static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2499 unsigned long nr_pages)
2500{ 2459{
2501 struct page_counter *counter; 2460 struct page_counter *counter;
2502 int ret = 0; 2461 int ret = 0;
@@ -2533,8 +2492,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
2533 return ret; 2492 return ret;
2534} 2493}
2535 2494
2536static void memcg_uncharge_kmem(struct mem_cgroup *memcg, 2495void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
2537 unsigned long nr_pages)
2538{ 2496{
2539 page_counter_uncharge(&memcg->memory, nr_pages); 2497 page_counter_uncharge(&memcg->memory, nr_pages);
2540 if (do_swap_account) 2498 if (do_swap_account)
@@ -2560,18 +2518,19 @@ static int memcg_alloc_cache_id(void)
2560 int id, size; 2518 int id, size;
2561 int err; 2519 int err;
2562 2520
2563 id = ida_simple_get(&kmem_limited_groups, 2521 id = ida_simple_get(&memcg_cache_ida,
2564 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2522 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2565 if (id < 0) 2523 if (id < 0)
2566 return id; 2524 return id;
2567 2525
2568 if (id < memcg_limited_groups_array_size) 2526 if (id < memcg_nr_cache_ids)
2569 return id; 2527 return id;
2570 2528
2571 /* 2529 /*
2572 * There's no space for the new id in memcg_caches arrays, 2530 * There's no space for the new id in memcg_caches arrays,
2573 * so we have to grow them. 2531 * so we have to grow them.
2574 */ 2532 */
2533 down_write(&memcg_cache_ids_sem);
2575 2534
2576 size = 2 * (id + 1); 2535 size = 2 * (id + 1);
2577 if (size < MEMCG_CACHES_MIN_SIZE) 2536 if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2579,12 +2538,16 @@ static int memcg_alloc_cache_id(void)
2579 else if (size > MEMCG_CACHES_MAX_SIZE) 2538 else if (size > MEMCG_CACHES_MAX_SIZE)
2580 size = MEMCG_CACHES_MAX_SIZE; 2539 size = MEMCG_CACHES_MAX_SIZE;
2581 2540
2582 mutex_lock(&memcg_slab_mutex);
2583 err = memcg_update_all_caches(size); 2541 err = memcg_update_all_caches(size);
2584 mutex_unlock(&memcg_slab_mutex); 2542 if (!err)
2543 err = memcg_update_all_list_lrus(size);
2544 if (!err)
2545 memcg_nr_cache_ids = size;
2546
2547 up_write(&memcg_cache_ids_sem);
2585 2548
2586 if (err) { 2549 if (err) {
2587 ida_simple_remove(&kmem_limited_groups, id); 2550 ida_simple_remove(&memcg_cache_ida, id);
2588 return err; 2551 return err;
2589 } 2552 }
2590 return id; 2553 return id;
@@ -2592,136 +2555,23 @@ static int memcg_alloc_cache_id(void)
2592 2555
2593static void memcg_free_cache_id(int id) 2556static void memcg_free_cache_id(int id)
2594{ 2557{
2595 ida_simple_remove(&kmem_limited_groups, id); 2558 ida_simple_remove(&memcg_cache_ida, id);
2596} 2559}
2597 2560
2598/* 2561struct memcg_kmem_cache_create_work {
2599 * We should update the current array size iff all caches updates succeed. This
2600 * can only be done from the slab side. The slab mutex needs to be held when
2601 * calling this.
2602 */
2603void memcg_update_array_size(int num)
2604{
2605 memcg_limited_groups_array_size = num;
2606}
2607
2608static void memcg_register_cache(struct mem_cgroup *memcg,
2609 struct kmem_cache *root_cache)
2610{
2611 static char memcg_name_buf[NAME_MAX + 1]; /* protected by
2612 memcg_slab_mutex */
2613 struct kmem_cache *cachep;
2614 int id;
2615
2616 lockdep_assert_held(&memcg_slab_mutex);
2617
2618 id = memcg_cache_id(memcg);
2619
2620 /*
2621 * Since per-memcg caches are created asynchronously on first
2622 * allocation (see memcg_kmem_get_cache()), several threads can try to
2623 * create the same cache, but only one of them may succeed.
2624 */
2625 if (cache_from_memcg_idx(root_cache, id))
2626 return;
2627
2628 cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
2629 cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
2630 /*
2631 * If we could not create a memcg cache, do not complain, because
2632 * that's not critical at all as we can always proceed with the root
2633 * cache.
2634 */
2635 if (!cachep)
2636 return;
2637
2638 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
2639
2640 /*
2641 * Since readers won't lock (see cache_from_memcg_idx()), we need a
2642 * barrier here to ensure nobody will see the kmem_cache partially
2643 * initialized.
2644 */
2645 smp_wmb();
2646
2647 BUG_ON(root_cache->memcg_params->memcg_caches[id]);
2648 root_cache->memcg_params->memcg_caches[id] = cachep;
2649}
2650
2651static void memcg_unregister_cache(struct kmem_cache *cachep)
2652{
2653 struct kmem_cache *root_cache;
2654 struct mem_cgroup *memcg;
2655 int id;
2656
2657 lockdep_assert_held(&memcg_slab_mutex);
2658
2659 BUG_ON(is_root_cache(cachep));
2660
2661 root_cache = cachep->memcg_params->root_cache;
2662 memcg = cachep->memcg_params->memcg;
2663 id = memcg_cache_id(memcg);
2664
2665 BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
2666 root_cache->memcg_params->memcg_caches[id] = NULL;
2667
2668 list_del(&cachep->memcg_params->list);
2669
2670 kmem_cache_destroy(cachep);
2671}
2672
2673int __memcg_cleanup_cache_params(struct kmem_cache *s)
2674{
2675 struct kmem_cache *c;
2676 int i, failed = 0;
2677
2678 mutex_lock(&memcg_slab_mutex);
2679 for_each_memcg_cache_index(i) {
2680 c = cache_from_memcg_idx(s, i);
2681 if (!c)
2682 continue;
2683
2684 memcg_unregister_cache(c);
2685
2686 if (cache_from_memcg_idx(s, i))
2687 failed++;
2688 }
2689 mutex_unlock(&memcg_slab_mutex);
2690 return failed;
2691}
2692
2693static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
2694{
2695 struct kmem_cache *cachep;
2696 struct memcg_cache_params *params, *tmp;
2697
2698 if (!memcg_kmem_is_active(memcg))
2699 return;
2700
2701 mutex_lock(&memcg_slab_mutex);
2702 list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
2703 cachep = memcg_params_to_cache(params);
2704 memcg_unregister_cache(cachep);
2705 }
2706 mutex_unlock(&memcg_slab_mutex);
2707}
2708
2709struct memcg_register_cache_work {
2710 struct mem_cgroup *memcg; 2562 struct mem_cgroup *memcg;
2711 struct kmem_cache *cachep; 2563 struct kmem_cache *cachep;
2712 struct work_struct work; 2564 struct work_struct work;
2713}; 2565};
2714 2566
2715static void memcg_register_cache_func(struct work_struct *w) 2567static void memcg_kmem_cache_create_func(struct work_struct *w)
2716{ 2568{
2717 struct memcg_register_cache_work *cw = 2569 struct memcg_kmem_cache_create_work *cw =
2718 container_of(w, struct memcg_register_cache_work, work); 2570 container_of(w, struct memcg_kmem_cache_create_work, work);
2719 struct mem_cgroup *memcg = cw->memcg; 2571 struct mem_cgroup *memcg = cw->memcg;
2720 struct kmem_cache *cachep = cw->cachep; 2572 struct kmem_cache *cachep = cw->cachep;
2721 2573
2722 mutex_lock(&memcg_slab_mutex); 2574 memcg_create_kmem_cache(memcg, cachep);
2723 memcg_register_cache(memcg, cachep);
2724 mutex_unlock(&memcg_slab_mutex);
2725 2575
2726 css_put(&memcg->css); 2576 css_put(&memcg->css);
2727 kfree(cw); 2577 kfree(cw);
@@ -2730,10 +2580,10 @@ static void memcg_register_cache_func(struct work_struct *w)
2730/* 2580/*
2731 * Enqueue the creation of a per-memcg kmem_cache. 2581 * Enqueue the creation of a per-memcg kmem_cache.
2732 */ 2582 */
2733static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, 2583static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2734 struct kmem_cache *cachep) 2584 struct kmem_cache *cachep)
2735{ 2585{
2736 struct memcg_register_cache_work *cw; 2586 struct memcg_kmem_cache_create_work *cw;
2737 2587
2738 cw = kmalloc(sizeof(*cw), GFP_NOWAIT); 2588 cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
2739 if (!cw) 2589 if (!cw)
@@ -2743,18 +2593,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
2743 2593
2744 cw->memcg = memcg; 2594 cw->memcg = memcg;
2745 cw->cachep = cachep; 2595 cw->cachep = cachep;
2596 INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2746 2597
2747 INIT_WORK(&cw->work, memcg_register_cache_func);
2748 schedule_work(&cw->work); 2598 schedule_work(&cw->work);
2749} 2599}
2750 2600
2751static void memcg_schedule_register_cache(struct mem_cgroup *memcg, 2601static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2752 struct kmem_cache *cachep) 2602 struct kmem_cache *cachep)
2753{ 2603{
2754 /* 2604 /*
2755 * We need to stop accounting when we kmalloc, because if the 2605 * We need to stop accounting when we kmalloc, because if the
2756 * corresponding kmalloc cache is not yet created, the first allocation 2606 * corresponding kmalloc cache is not yet created, the first allocation
2757 * in __memcg_schedule_register_cache will recurse. 2607 * in __memcg_schedule_kmem_cache_create will recurse.
2758 * 2608 *
2759 * However, it is better to enclose the whole function. Depending on 2609 * However, it is better to enclose the whole function. Depending on
2760 * the debugging options enabled, INIT_WORK(), for instance, can 2610 * the debugging options enabled, INIT_WORK(), for instance, can
@@ -2763,24 +2613,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
2763 * the safest choice is to do it like this, wrapping the whole function. 2613 * the safest choice is to do it like this, wrapping the whole function.
2764 */ 2614 */
2765 current->memcg_kmem_skip_account = 1; 2615 current->memcg_kmem_skip_account = 1;
2766 __memcg_schedule_register_cache(memcg, cachep); 2616 __memcg_schedule_kmem_cache_create(memcg, cachep);
2767 current->memcg_kmem_skip_account = 0; 2617 current->memcg_kmem_skip_account = 0;
2768} 2618}
2769 2619
2770int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
2771{
2772 unsigned int nr_pages = 1 << order;
2773
2774 return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
2775}
2776
2777void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
2778{
2779 unsigned int nr_pages = 1 << order;
2780
2781 memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
2782}
2783
2784/* 2620/*
2785 * Return the kmem_cache we're supposed to use for a slab allocation. 2621 * Return the kmem_cache we're supposed to use for a slab allocation.
2786 * We try to use the current memcg's version of the cache. 2622 * We try to use the current memcg's version of the cache.
@@ -2798,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2798{ 2634{
2799 struct mem_cgroup *memcg; 2635 struct mem_cgroup *memcg;
2800 struct kmem_cache *memcg_cachep; 2636 struct kmem_cache *memcg_cachep;
2637 int kmemcg_id;
2801 2638
2802 VM_BUG_ON(!cachep->memcg_params); 2639 VM_BUG_ON(!is_root_cache(cachep));
2803 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
2804 2640
2805 if (current->memcg_kmem_skip_account) 2641 if (current->memcg_kmem_skip_account)
2806 return cachep; 2642 return cachep;
2807 2643
2808 memcg = get_mem_cgroup_from_mm(current->mm); 2644 memcg = get_mem_cgroup_from_mm(current->mm);
2809 if (!memcg_kmem_is_active(memcg)) 2645 kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
2646 if (kmemcg_id < 0)
2810 goto out; 2647 goto out;
2811 2648
2812 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 2649 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2813 if (likely(memcg_cachep)) 2650 if (likely(memcg_cachep))
2814 return memcg_cachep; 2651 return memcg_cachep;
2815 2652
@@ -2825,7 +2662,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2825 * could happen with the slab_mutex held. So it's better to 2662 * could happen with the slab_mutex held. So it's better to
2826 * defer everything. 2663 * defer everything.
2827 */ 2664 */
2828 memcg_schedule_register_cache(memcg, cachep); 2665 memcg_schedule_kmem_cache_create(memcg, cachep);
2829out: 2666out:
2830 css_put(&memcg->css); 2667 css_put(&memcg->css);
2831 return cachep; 2668 return cachep;
@@ -2834,7 +2671,7 @@ out:
2834void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2671void __memcg_kmem_put_cache(struct kmem_cache *cachep)
2835{ 2672{
2836 if (!is_root_cache(cachep)) 2673 if (!is_root_cache(cachep))
2837 css_put(&cachep->memcg_params->memcg->css); 2674 css_put(&cachep->memcg_params.memcg->css);
2838} 2675}
2839 2676
2840/* 2677/*
@@ -2899,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
2899 memcg_uncharge_kmem(memcg, 1 << order); 2736 memcg_uncharge_kmem(memcg, 1 << order);
2900 page->mem_cgroup = NULL; 2737 page->mem_cgroup = NULL;
2901} 2738}
2739
2740struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
2741{
2742 struct mem_cgroup *memcg = NULL;
2743 struct kmem_cache *cachep;
2744 struct page *page;
2745
2746 page = virt_to_head_page(ptr);
2747 if (PageSlab(page)) {
2748 cachep = page->slab_cache;
2749 if (!is_root_cache(cachep))
2750 memcg = cachep->memcg_params.memcg;
2751 } else
2752 /* page allocated by alloc_kmem_pages */
2753 memcg = page->mem_cgroup;
2754
2755 return memcg;
2756}
2902#endif /* CONFIG_MEMCG_KMEM */ 2757#endif /* CONFIG_MEMCG_KMEM */
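__mem_cgroup_from_kmem() resolves an arbitrary kmem-accounted pointer back to its owning memcg, whether the object came from a per-memcg slab cache or directly from alloc_kmem_pages(). Its consumer appears to be the per-memcg list_lru code; the helper below is a hypothetical illustration of that kind of lookup, not kernel source:

/*
 * Hypothetical illustration only: resolve a kmem-accounted object to the
 * id of its owning memcg (0 for root or unaccounted objects).
 */
static unsigned short object_memcg_id(void *ptr)
{
	struct mem_cgroup *memcg = __mem_cgroup_from_kmem(ptr);

	return memcg ? mem_cgroup_id(memcg) : 0;
}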
2903 2758
2904#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2759#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3433,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3433 int err = 0; 3288 int err = 0;
3434 int memcg_id; 3289 int memcg_id;
3435 3290
3436 if (memcg_kmem_is_active(memcg)) 3291 BUG_ON(memcg->kmemcg_id >= 0);
3437 return 0; 3292 BUG_ON(memcg->kmem_acct_activated);
3293 BUG_ON(memcg->kmem_acct_active);
3438 3294
3439 /* 3295 /*
3440 * For simplicity, we won't allow this to be disabled. It also can't 3296 * For simplicity, we won't allow this to be disabled. It also can't
@@ -3477,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3477 * patched. 3333 * patched.
3478 */ 3334 */
3479 memcg->kmemcg_id = memcg_id; 3335 memcg->kmemcg_id = memcg_id;
3336 memcg->kmem_acct_activated = true;
3337 memcg->kmem_acct_active = true;
3480out: 3338out:
3481 return err; 3339 return err;
3482} 3340}
@@ -3533,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3533 int ret; 3391 int ret;
3534 3392
3535 buf = strstrip(buf); 3393 buf = strstrip(buf);
3536 ret = page_counter_memparse(buf, &nr_pages); 3394 ret = page_counter_memparse(buf, "-1", &nr_pages);
3537 if (ret) 3395 if (ret)
3538 return ret; 3396 return ret;
3539 3397
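The new second argument to page_counter_memparse() names the token that means "no limit": the legacy files keep accepting "-1", while the cgroup2 files added later in this patch pass "max". A small sketch of the convention, assuming the v4.0 signature:

unsigned long nr_pages;
int err;

err = page_counter_memparse("-1", "-1", &nr_pages);	/* nr_pages = PAGE_COUNTER_MAX */
err = page_counter_memparse("max", "max", &nr_pages);	/* nr_pages = PAGE_COUNTER_MAX */
err = page_counter_memparse("64M", "max", &nr_pages);	/* 64M converted to pages */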
@@ -3609,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3609{ 3467{
3610 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3468 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3611 3469
3612 if (val >= (1 << NR_MOVE_TYPE)) 3470 if (val & ~MOVE_MASK)
3613 return -EINVAL; 3471 return -EINVAL;
3614 3472
3615 /* 3473 /*
@@ -3687,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
3687 struct mem_cgroup *mi; 3545 struct mem_cgroup *mi;
3688 unsigned int i; 3546 unsigned int i;
3689 3547
3548 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
3549 MEM_CGROUP_STAT_NSTATS);
3550 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
3551 MEM_CGROUP_EVENTS_NSTATS);
3690 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 3552 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3691 3553
3692 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 3554 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3901,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3901 unsigned long usage; 3763 unsigned long usage;
3902 int i, size, ret; 3764 int i, size, ret;
3903 3765
3904 ret = page_counter_memparse(args, &threshold); 3766 ret = page_counter_memparse(args, "-1", &threshold);
3905 if (ret) 3767 if (ret)
3906 return ret; 3768 return ret;
3907 3769
@@ -4152,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4152 return mem_cgroup_sockets_init(memcg, ss); 4014 return mem_cgroup_sockets_init(memcg, ss);
4153} 4015}
4154 4016
4017static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
4018{
4019 struct cgroup_subsys_state *css;
4020 struct mem_cgroup *parent, *child;
4021 int kmemcg_id;
4022
4023 if (!memcg->kmem_acct_active)
4024 return;
4025
4026 /*
4027 * Clear the 'active' flag before clearing memcg_caches arrays entries.
4028 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
4029 * guarantees no cache will be created for this cgroup after we are
4030 * done (see memcg_create_kmem_cache()).
4031 */
4032 memcg->kmem_acct_active = false;
4033
4034 memcg_deactivate_kmem_caches(memcg);
4035
4036 kmemcg_id = memcg->kmemcg_id;
4037 BUG_ON(kmemcg_id < 0);
4038
4039 parent = parent_mem_cgroup(memcg);
4040 if (!parent)
4041 parent = root_mem_cgroup;
4042
4043 /*
4044 * Change kmemcg_id of this cgroup and all its descendants to the
4045 * parent's id, and then move all entries from this cgroup's list_lrus
4046 * to ones of the parent. After we have finished, all list_lrus
4047 * corresponding to this cgroup are guaranteed to remain empty. The
4048 * ordering is imposed by list_lru_node->lock taken by
4049 * memcg_drain_all_list_lrus().
4050 */
4051 css_for_each_descendant_pre(css, &memcg->css) {
4052 child = mem_cgroup_from_css(css);
4053 BUG_ON(child->kmemcg_id != kmemcg_id);
4054 child->kmemcg_id = parent->kmemcg_id;
4055 if (!memcg->use_hierarchy)
4056 break;
4057 }
4058 memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
4059
4060 memcg_free_cache_id(kmemcg_id);
4061}
4062
4155static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4063static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4156{ 4064{
4157 memcg_unregister_all_caches(memcg); 4065 if (memcg->kmem_acct_activated) {
4066 memcg_destroy_kmem_caches(memcg);
4067 static_key_slow_dec(&memcg_kmem_enabled_key);
4068 WARN_ON(page_counter_read(&memcg->kmem));
4069 }
4158 mem_cgroup_sockets_destroy(memcg); 4070 mem_cgroup_sockets_destroy(memcg);
4159} 4071}
4160#else 4072#else
@@ -4163,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4163 return 0; 4075 return 0;
4164} 4076}
4165 4077
4078static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
4079{
4080}
4081
4166static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4082static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4167{ 4083{
4168} 4084}
@@ -4391,7 +4307,7 @@ out_kfree:
4391 return ret; 4307 return ret;
4392} 4308}
4393 4309
4394static struct cftype mem_cgroup_files[] = { 4310static struct cftype mem_cgroup_legacy_files[] = {
4395 { 4311 {
4396 .name = "usage_in_bytes", 4312 .name = "usage_in_bytes",
4397 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4313 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
@@ -4502,34 +4418,6 @@ static struct cftype mem_cgroup_files[] = {
4502 { }, /* terminate */ 4418 { }, /* terminate */
4503}; 4419};
4504 4420
4505#ifdef CONFIG_MEMCG_SWAP
4506static struct cftype memsw_cgroup_files[] = {
4507 {
4508 .name = "memsw.usage_in_bytes",
4509 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4510 .read_u64 = mem_cgroup_read_u64,
4511 },
4512 {
4513 .name = "memsw.max_usage_in_bytes",
4514 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4515 .write = mem_cgroup_reset,
4516 .read_u64 = mem_cgroup_read_u64,
4517 },
4518 {
4519 .name = "memsw.limit_in_bytes",
4520 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4521 .write = mem_cgroup_write,
4522 .read_u64 = mem_cgroup_read_u64,
4523 },
4524 {
4525 .name = "memsw.failcnt",
4526 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4527 .write = mem_cgroup_reset,
4528 .read_u64 = mem_cgroup_read_u64,
4529 },
4530 { }, /* terminate */
4531};
4532#endif
4533static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4421static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4534{ 4422{
4535 struct mem_cgroup_per_node *pn; 4423 struct mem_cgroup_per_node *pn;
@@ -4609,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4609 free_mem_cgroup_per_zone_info(memcg, node); 4497 free_mem_cgroup_per_zone_info(memcg, node);
4610 4498
4611 free_percpu(memcg->stat); 4499 free_percpu(memcg->stat);
4612
4613 disarm_static_keys(memcg);
4614 kfree(memcg); 4500 kfree(memcg);
4615} 4501}
4616 4502
@@ -4625,29 +4511,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
4625} 4511}
4626EXPORT_SYMBOL(parent_mem_cgroup); 4512EXPORT_SYMBOL(parent_mem_cgroup);
4627 4513
4628static void __init mem_cgroup_soft_limit_tree_init(void)
4629{
4630 struct mem_cgroup_tree_per_node *rtpn;
4631 struct mem_cgroup_tree_per_zone *rtpz;
4632 int tmp, node, zone;
4633
4634 for_each_node(node) {
4635 tmp = node;
4636 if (!node_state(node, N_NORMAL_MEMORY))
4637 tmp = -1;
4638 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
4639 BUG_ON(!rtpn);
4640
4641 soft_limit_tree.rb_tree_per_node[node] = rtpn;
4642
4643 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4644 rtpz = &rtpn->rb_tree_per_zone[zone];
4645 rtpz->rb_root = RB_ROOT;
4646 spin_lock_init(&rtpz->lock);
4647 }
4648 }
4649}
4650
4651static struct cgroup_subsys_state * __ref 4514static struct cgroup_subsys_state * __ref
4652mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) 4515mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4653{ 4516{
@@ -4667,6 +4530,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4667 if (parent_css == NULL) { 4530 if (parent_css == NULL) {
4668 root_mem_cgroup = memcg; 4531 root_mem_cgroup = memcg;
4669 page_counter_init(&memcg->memory, NULL); 4532 page_counter_init(&memcg->memory, NULL);
4533 memcg->high = PAGE_COUNTER_MAX;
4670 memcg->soft_limit = PAGE_COUNTER_MAX; 4534 memcg->soft_limit = PAGE_COUNTER_MAX;
4671 page_counter_init(&memcg->memsw, NULL); 4535 page_counter_init(&memcg->memsw, NULL);
4672 page_counter_init(&memcg->kmem, NULL); 4536 page_counter_init(&memcg->kmem, NULL);
@@ -4682,7 +4546,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4682 spin_lock_init(&memcg->event_list_lock); 4546 spin_lock_init(&memcg->event_list_lock);
4683#ifdef CONFIG_MEMCG_KMEM 4547#ifdef CONFIG_MEMCG_KMEM
4684 memcg->kmemcg_id = -1; 4548 memcg->kmemcg_id = -1;
4685 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
4686#endif 4549#endif
4687 4550
4688 return &memcg->css; 4551 return &memcg->css;
@@ -4713,6 +4576,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4713 4576
4714 if (parent->use_hierarchy) { 4577 if (parent->use_hierarchy) {
4715 page_counter_init(&memcg->memory, &parent->memory); 4578 page_counter_init(&memcg->memory, &parent->memory);
4579 memcg->high = PAGE_COUNTER_MAX;
4716 memcg->soft_limit = PAGE_COUNTER_MAX; 4580 memcg->soft_limit = PAGE_COUNTER_MAX;
4717 page_counter_init(&memcg->memsw, &parent->memsw); 4581 page_counter_init(&memcg->memsw, &parent->memsw);
4718 page_counter_init(&memcg->kmem, &parent->kmem); 4582 page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4723,6 +4587,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
4723 */ 4587 */
4724 } else { 4588 } else {
4725 page_counter_init(&memcg->memory, NULL); 4589 page_counter_init(&memcg->memory, NULL);
4590 memcg->high = PAGE_COUNTER_MAX;
4726 memcg->soft_limit = PAGE_COUNTER_MAX; 4591 memcg->soft_limit = PAGE_COUNTER_MAX;
4727 page_counter_init(&memcg->memsw, NULL); 4592 page_counter_init(&memcg->memsw, NULL);
4728 page_counter_init(&memcg->kmem, NULL); 4593 page_counter_init(&memcg->kmem, NULL);
@@ -4768,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4768 spin_unlock(&memcg->event_list_lock); 4633 spin_unlock(&memcg->event_list_lock);
4769 4634
4770 vmpressure_cleanup(&memcg->vmpressure); 4635 vmpressure_cleanup(&memcg->vmpressure);
4636
4637 memcg_deactivate_kmem(memcg);
4771} 4638}
4772 4639
4773static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4640static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4798,6 +4665,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4798 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX); 4665 mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
4799 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX); 4666 mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
4800 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX); 4667 memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
4668 memcg->low = 0;
4669 memcg->high = PAGE_COUNTER_MAX;
4801 memcg->soft_limit = PAGE_COUNTER_MAX; 4670 memcg->soft_limit = PAGE_COUNTER_MAX;
4802} 4671}
4803 4672
@@ -4874,12 +4743,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4874 if (!page || !page_mapped(page)) 4743 if (!page || !page_mapped(page))
4875 return NULL; 4744 return NULL;
4876 if (PageAnon(page)) { 4745 if (PageAnon(page)) {
4877 /* we don't move shared anon */ 4746 if (!(mc.flags & MOVE_ANON))
4878 if (!move_anon())
4879 return NULL; 4747 return NULL;
4880 } else if (!move_file()) 4748 } else {
4881 /* we ignore mapcount for file pages */ 4749 if (!(mc.flags & MOVE_FILE))
4882 return NULL; 4750 return NULL;
4751 }
4883 if (!get_page_unless_zero(page)) 4752 if (!get_page_unless_zero(page))
4884 return NULL; 4753 return NULL;
4885 4754
@@ -4893,7 +4762,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4893 struct page *page = NULL; 4762 struct page *page = NULL;
4894 swp_entry_t ent = pte_to_swp_entry(ptent); 4763 swp_entry_t ent = pte_to_swp_entry(ptent);
4895 4764
4896 if (!move_anon() || non_swap_entry(ent)) 4765 if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4897 return NULL; 4766 return NULL;
4898 /* 4767 /*
4899 * Because lookup_swap_cache() updates some statistics counter, 4768 * Because lookup_swap_cache() updates some statistics counter,
@@ -4922,14 +4791,11 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4922 4791
4923 if (!vma->vm_file) /* anonymous vma */ 4792 if (!vma->vm_file) /* anonymous vma */
4924 return NULL; 4793 return NULL;
4925 if (!move_file()) 4794 if (!(mc.flags & MOVE_FILE))
4926 return NULL; 4795 return NULL;
4927 4796
4928 mapping = vma->vm_file->f_mapping; 4797 mapping = vma->vm_file->f_mapping;
4929 if (pte_none(ptent)) 4798 pgoff = linear_page_index(vma, addr);
4930 pgoff = linear_page_index(vma, addr);
4931 else /* pte_file(ptent) is true */
4932 pgoff = pte_to_pgoff(ptent);
4933 4799
4934 /* page is moved even if it's not RSS of this task(page-faulted). */ 4800 /* page is moved even if it's not RSS of this task(page-faulted). */
4935#ifdef CONFIG_SWAP 4801#ifdef CONFIG_SWAP
@@ -4961,7 +4827,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4961 page = mc_handle_present_pte(vma, addr, ptent); 4827 page = mc_handle_present_pte(vma, addr, ptent);
4962 else if (is_swap_pte(ptent)) 4828 else if (is_swap_pte(ptent))
4963 page = mc_handle_swap_pte(vma, addr, ptent, &ent); 4829 page = mc_handle_swap_pte(vma, addr, ptent, &ent);
4964 else if (pte_none(ptent) || pte_file(ptent)) 4830 else if (pte_none(ptent))
4965 page = mc_handle_file_pte(vma, addr, ptent, &ent); 4831 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4966 4832
4967 if (!page && !ent.val) 4833 if (!page && !ent.val)
@@ -5004,7 +4870,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5004 4870
5005 page = pmd_page(pmd); 4871 page = pmd_page(pmd);
5006 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 4872 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5007 if (!move_anon()) 4873 if (!(mc.flags & MOVE_ANON))
5008 return ret; 4874 return ret;
5009 if (page->mem_cgroup == mc.from) { 4875 if (page->mem_cgroup == mc.from) {
5010 ret = MC_TARGET_PAGE; 4876 ret = MC_TARGET_PAGE;
@@ -5027,7 +4893,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5027 unsigned long addr, unsigned long end, 4893 unsigned long addr, unsigned long end,
5028 struct mm_walk *walk) 4894 struct mm_walk *walk)
5029{ 4895{
5030 struct vm_area_struct *vma = walk->private; 4896 struct vm_area_struct *vma = walk->vma;
5031 pte_t *pte; 4897 pte_t *pte;
5032 spinlock_t *ptl; 4898 spinlock_t *ptl;
5033 4899
@@ -5053,20 +4919,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
5053static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) 4919static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5054{ 4920{
5055 unsigned long precharge; 4921 unsigned long precharge;
5056 struct vm_area_struct *vma;
5057 4922
4923 struct mm_walk mem_cgroup_count_precharge_walk = {
4924 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4925 .mm = mm,
4926 };
5058 down_read(&mm->mmap_sem); 4927 down_read(&mm->mmap_sem);
5059 for (vma = mm->mmap; vma; vma = vma->vm_next) { 4928 walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
5060 struct mm_walk mem_cgroup_count_precharge_walk = {
5061 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5062 .mm = mm,
5063 .private = vma,
5064 };
5065 if (is_vm_hugetlb_page(vma))
5066 continue;
5067 walk_page_range(vma->vm_start, vma->vm_end,
5068 &mem_cgroup_count_precharge_walk);
5069 }
5070 up_read(&mm->mmap_sem); 4929 up_read(&mm->mmap_sem);
5071 4930
5072 precharge = mc.precharge; 4931 precharge = mc.precharge;
@@ -5146,15 +5005,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5146 struct task_struct *p = cgroup_taskset_first(tset); 5005 struct task_struct *p = cgroup_taskset_first(tset);
5147 int ret = 0; 5006 int ret = 0;
5148 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5007 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5149 unsigned long move_charge_at_immigrate; 5008 unsigned long move_flags;
5150 5009
5151 /* 5010 /*
5152 * We are now commited to this value whatever it is. Changes in this 5011 * We are now commited to this value whatever it is. Changes in this
5153 * tunable will only affect upcoming migrations, not the current one. 5012 * tunable will only affect upcoming migrations, not the current one.
5154 * So we need to save it, and keep it going. 5013 * So we need to save it, and keep it going.
5155 */ 5014 */
5156 move_charge_at_immigrate = memcg->move_charge_at_immigrate; 5015 move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
5157 if (move_charge_at_immigrate) { 5016 if (move_flags) {
5158 struct mm_struct *mm; 5017 struct mm_struct *mm;
5159 struct mem_cgroup *from = mem_cgroup_from_task(p); 5018 struct mem_cgroup *from = mem_cgroup_from_task(p);
5160 5019
@@ -5174,7 +5033,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
5174 spin_lock(&mc.lock); 5033 spin_lock(&mc.lock);
5175 mc.from = from; 5034 mc.from = from;
5176 mc.to = memcg; 5035 mc.to = memcg;
5177 mc.immigrate_flags = move_charge_at_immigrate; 5036 mc.flags = move_flags;
5178 spin_unlock(&mc.lock); 5037 spin_unlock(&mc.lock);
5179 /* We set mc.moving_task later */ 5038 /* We set mc.moving_task later */
5180 5039
@@ -5199,7 +5058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5199 struct mm_walk *walk) 5058 struct mm_walk *walk)
5200{ 5059{
5201 int ret = 0; 5060 int ret = 0;
5202 struct vm_area_struct *vma = walk->private; 5061 struct vm_area_struct *vma = walk->vma;
5203 pte_t *pte; 5062 pte_t *pte;
5204 spinlock_t *ptl; 5063 spinlock_t *ptl;
5205 enum mc_target_type target_type; 5064 enum mc_target_type target_type;
@@ -5295,7 +5154,10 @@ put: /* get_mctgt_type() gets the page */
5295 5154
5296static void mem_cgroup_move_charge(struct mm_struct *mm) 5155static void mem_cgroup_move_charge(struct mm_struct *mm)
5297{ 5156{
5298 struct vm_area_struct *vma; 5157 struct mm_walk mem_cgroup_move_charge_walk = {
5158 .pmd_entry = mem_cgroup_move_charge_pte_range,
5159 .mm = mm,
5160 };
5299 5161
5300 lru_add_drain_all(); 5162 lru_add_drain_all();
5301 /* 5163 /*
@@ -5318,24 +5180,11 @@ retry:
5318 cond_resched(); 5180 cond_resched();
5319 goto retry; 5181 goto retry;
5320 } 5182 }
5321 for (vma = mm->mmap; vma; vma = vma->vm_next) { 5183 /*
5322 int ret; 5184 * When we have consumed all precharges and failed in doing
5323 struct mm_walk mem_cgroup_move_charge_walk = { 5185 * additional charge, the page walk just aborts.
5324 .pmd_entry = mem_cgroup_move_charge_pte_range, 5186 */
5325 .mm = mm, 5187 walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
5326 .private = vma,
5327 };
5328 if (is_vm_hugetlb_page(vma))
5329 continue;
5330 ret = walk_page_range(vma->vm_start, vma->vm_end,
5331 &mem_cgroup_move_charge_walk);
5332 if (ret)
5333 /*
5334 * means we have consumed all precharges and failed in
5335 * doing additional charge. Just abandon here.
5336 */
5337 break;
5338 }
5339 up_read(&mm->mmap_sem); 5188 up_read(&mm->mmap_sem);
5340 atomic_dec(&mc.from->moving_account); 5189 atomic_dec(&mc.from->moving_account);
5341} 5190}
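Both page walkers above now rely on the same convention: one walk_page_range(0, ~0UL, ...) call over the whole address space, with the current VMA delivered via walk->vma instead of walk->private. A self-contained sketch of that pattern (the callback is illustrative, not from this patch):

/* Illustrative: the mm_walk convention used by the two walkers above. */
static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;	/* provided by the core walker */

	/* ... operate on [addr, end) within vma ... */
	return 0;				/* non-zero aborts the walk */
}

static void example_walk(struct mm_struct *mm)
{
	struct mm_walk walk = {
		.pmd_entry	= example_pmd_entry,
		.mm		= mm,
	};

	down_read(&mm->mmap_sem);
	walk_page_range(0, ~0UL, &walk);	/* visits every VMA in mm */
	up_read(&mm->mmap_sem);
}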
@@ -5386,118 +5235,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5386 mem_cgroup_from_css(root_css)->use_hierarchy = true; 5235 mem_cgroup_from_css(root_css)->use_hierarchy = true;
5387} 5236}
5388 5237
5389struct cgroup_subsys memory_cgrp_subsys = { 5238static u64 memory_current_read(struct cgroup_subsys_state *css,
5390 .css_alloc = mem_cgroup_css_alloc, 5239 struct cftype *cft)
5391 .css_online = mem_cgroup_css_online, 5240{
5392 .css_offline = mem_cgroup_css_offline, 5241 return mem_cgroup_usage(mem_cgroup_from_css(css), false);
5393 .css_free = mem_cgroup_css_free, 5242}
5394 .css_reset = mem_cgroup_css_reset,
5395 .can_attach = mem_cgroup_can_attach,
5396 .cancel_attach = mem_cgroup_cancel_attach,
5397 .attach = mem_cgroup_move_task,
5398 .bind = mem_cgroup_bind,
5399 .legacy_cftypes = mem_cgroup_files,
5400 .early_init = 0,
5401};
5402 5243
5403#ifdef CONFIG_MEMCG_SWAP 5244static int memory_low_show(struct seq_file *m, void *v)
5404static int __init enable_swap_account(char *s)
5405{ 5245{
5406 if (!strcmp(s, "1")) 5246 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5407 really_do_swap_account = 1; 5247 unsigned long low = ACCESS_ONCE(memcg->low);
5408 else if (!strcmp(s, "0")) 5248
5409		really_do_swap_account = 0;
5410	return 1;
5411}
5412__setup("swapaccount=", enable_swap_account);
5413
5414static void __init memsw_file_init(void)
5415{
5416	WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
5417					  memsw_cgroup_files));
5418}
5419
5420static void __init enable_swap_cgroup(void)
5421{
5422	if (!mem_cgroup_disabled() && really_do_swap_account) {
5423		do_swap_account = 1;
5424		memsw_file_init();
5425	}
5426}
5427
5428#else
5429static void __init enable_swap_cgroup(void)
5430{
5431}
5432#endif
5433
5434#ifdef CONFIG_MEMCG_SWAP
5435/**
5436 * mem_cgroup_swapout - transfer a memsw charge to swap
5437 * @page: page whose memsw charge to transfer
5438 * @entry: swap entry to move the charge to
5439 *
5440 * Transfer the memsw charge of @page to @entry.
5441 */
5442void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5443{
5444	struct mem_cgroup *memcg;
5445	unsigned short oldid;
5446
5447	VM_BUG_ON_PAGE(PageLRU(page), page);
5448	VM_BUG_ON_PAGE(page_count(page), page);
5449
5450	if (!do_swap_account)
5451		return;
5452
5453	memcg = page->mem_cgroup;
5454
5455	/* Readahead page, never charged */
5456	if (!memcg)
5457		return;
5458
5459	oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5460	VM_BUG_ON_PAGE(oldid, page);
5461	mem_cgroup_swap_statistics(memcg, true);
5462
5463	page->mem_cgroup = NULL;
5464
5465	if (!mem_cgroup_is_root(memcg))
5466		page_counter_uncharge(&memcg->memory, 1);
5467
5468	/* XXX: caller holds IRQ-safe mapping->tree_lock */
5469	VM_BUG_ON(!irqs_disabled());
5470
5471	mem_cgroup_charge_statistics(memcg, page, -1);
5472	memcg_check_events(memcg, page);
5473}
5474
5475/**
5476 * mem_cgroup_uncharge_swap - uncharge a swap entry
5477 * @entry: swap entry to uncharge
5478 *
5479 * Drop the memsw charge associated with @entry.
5480 */
5481void mem_cgroup_uncharge_swap(swp_entry_t entry)
5482{
5483	struct mem_cgroup *memcg;
5484	unsigned short id;
5485
5486	if (!do_swap_account)
5487		return;
5488
5489	id = swap_cgroup_record(entry, 0);
5490	rcu_read_lock();
5491	memcg = mem_cgroup_lookup(id);
5492	if (memcg) {
5493		if (!mem_cgroup_is_root(memcg))
5494			page_counter_uncharge(&memcg->memsw, 1);
5495		mem_cgroup_swap_statistics(memcg, false);
5496		css_put(&memcg->css);
5497	}
5498	rcu_read_unlock();
5499}
5500#endif
5249	if (low == PAGE_COUNTER_MAX)
5250		seq_puts(m, "max\n");
5251	else
5252		seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5253
5254	return 0;
5255}
5256
5257static ssize_t memory_low_write(struct kernfs_open_file *of,
5258				char *buf, size_t nbytes, loff_t off)
5259{
5260	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5261	unsigned long low;
5262	int err;
5263
5264	buf = strstrip(buf);
5265	err = page_counter_memparse(buf, "max", &low);
5266	if (err)
5267		return err;
5268
5269	memcg->low = low;
5270
5271	return nbytes;
5272}
5273
5274static int memory_high_show(struct seq_file *m, void *v)
5275{
5276	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5277	unsigned long high = ACCESS_ONCE(memcg->high);
5278
5279	if (high == PAGE_COUNTER_MAX)
5280		seq_puts(m, "max\n");
5281	else
5282		seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5283
5284	return 0;
5285}
5286
5287static ssize_t memory_high_write(struct kernfs_open_file *of,
5288				char *buf, size_t nbytes, loff_t off)
5289{
5290	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5291	unsigned long high;
5292	int err;
5293
5294	buf = strstrip(buf);
5295	err = page_counter_memparse(buf, "max", &high);
5296	if (err)
5297		return err;
5298
5299	memcg->high = high;
5300
5301	return nbytes;
5302}
5303
5304static int memory_max_show(struct seq_file *m, void *v)
5305{
5306	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5307	unsigned long max = ACCESS_ONCE(memcg->memory.limit);
5308
5309	if (max == PAGE_COUNTER_MAX)
5310		seq_puts(m, "max\n");
5311	else
5312		seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5313
5314	return 0;
5315}
5316
5317static ssize_t memory_max_write(struct kernfs_open_file *of,
5318				char *buf, size_t nbytes, loff_t off)
5319{
5320	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5321	unsigned long max;
5322	int err;
5323
5324	buf = strstrip(buf);
5325	err = page_counter_memparse(buf, "max", &max);
5326	if (err)
5327		return err;
5328
5329	err = mem_cgroup_resize_limit(memcg, max);
5330	if (err)
5331		return err;
5332
5333	return nbytes;
5334}
5335
5336static int memory_events_show(struct seq_file *m, void *v)
5337{
5338	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5339
5340	seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
5341	seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
5342	seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
5343	seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
5344
5345	return 0;
5346}
5347
5348static struct cftype memory_files[] = {
5349	{
5350		.name = "current",
5351		.read_u64 = memory_current_read,
5352	},
5353	{
5354		.name = "low",
5355		.flags = CFTYPE_NOT_ON_ROOT,
5356		.seq_show = memory_low_show,
5357		.write = memory_low_write,
5358	},
5359	{
5360		.name = "high",
5361		.flags = CFTYPE_NOT_ON_ROOT,
5362		.seq_show = memory_high_show,
5363		.write = memory_high_write,
5364	},
5365	{
5366		.name = "max",
5367		.flags = CFTYPE_NOT_ON_ROOT,
5368		.seq_show = memory_max_show,
5369		.write = memory_max_write,
5370	},
5371	{
5372		.name = "events",
5373		.flags = CFTYPE_NOT_ON_ROOT,
5374		.seq_show = memory_events_show,
5375	},
5376	{ }	/* terminate */
5377};
5378
5379struct cgroup_subsys memory_cgrp_subsys = {
5380	.css_alloc = mem_cgroup_css_alloc,
5381	.css_online = mem_cgroup_css_online,
5382	.css_offline = mem_cgroup_css_offline,
5383	.css_free = mem_cgroup_css_free,
5384	.css_reset = mem_cgroup_css_reset,
5385	.can_attach = mem_cgroup_can_attach,
5386	.cancel_attach = mem_cgroup_cancel_attach,
5387	.attach = mem_cgroup_move_task,
5388	.bind = mem_cgroup_bind,
5389	.dfl_cftypes = memory_files,
5390	.legacy_cftypes = mem_cgroup_legacy_files,
5391	.early_init = 0,
5392};
5393
5394/**
5395 * mem_cgroup_events - count memory events against a cgroup
5396 * @memcg: the memory cgroup
5397 * @idx: the event index
5398 * @nr: the number of events to account for
5399 */
5400void mem_cgroup_events(struct mem_cgroup *memcg,
5401		       enum mem_cgroup_events_index idx,
5402		       unsigned int nr)
5403{
5404	this_cpu_add(memcg->stat->events[idx], nr);
5405}
5406
5407/**
5408 * mem_cgroup_low - check if memory consumption is below the normal range
5409 * @root: the highest ancestor to consider
5410 * @memcg: the memory cgroup to check
5411 *
5412 * Returns %true if memory consumption of @memcg, and that of all
5413 * configurable ancestors up to @root, is below the normal range.
5414 */
5415bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5416{
5417	if (mem_cgroup_disabled())
5418		return false;
5419
5420	/*
5421	 * The toplevel group doesn't have a configurable range, so
5422	 * it's never low when looked at directly, and it is not
5423	 * considered an ancestor when assessing the hierarchy.
5424	 */
5425
5426	if (memcg == root_mem_cgroup)
5427		return false;
5428
5429	if (page_counter_read(&memcg->memory) >= memcg->low)
5430		return false;
5431
5432	while (memcg != root) {
5433		memcg = parent_mem_cgroup(memcg);
5434
5435		if (memcg == root_mem_cgroup)
5436			break;
5437
5438		if (page_counter_read(&memcg->memory) >= memcg->low)
5439			return false;
5440	}
5441	return true;
5442}
5501 5443
5502/** 5444/**
5503 * mem_cgroup_try_charge - try charging a page 5445 * mem_cgroup_try_charge - try charging a page
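The hunk above introduces the unified-hierarchy control files memory.current, memory.low, memory.high, memory.max and memory.events. The writable ones are parsed by page_counter_memparse(), which accepts either a byte count (with an optional K/M/G suffix via memparse()) or the literal string "max". The sketch below is a rough userspace illustration only; the cgroup2 mount point and the child group name are assumptions, not something this patch establishes.

/*
 * Sketch only: exercise the new cgroup2 memory interface files.
 * Assumes cgroup2 is mounted at /sys/fs/cgroup and that a child
 * group named "test" already exists; both are assumptions made
 * for this example, not something the patch provides.
 */
#include <stdio.h>

static int write_file(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	char line[256];
	FILE *f;

	/* protect up to 64M, start reclaim above 128M, hard-cap at 256M */
	write_file("/sys/fs/cgroup/test/memory.low", "64M");
	write_file("/sys/fs/cgroup/test/memory.high", "128M");
	write_file("/sys/fs/cgroup/test/memory.max", "256M");

	/* memory.events prints low/high/max/oom counters, one per line */
	f = fopen("/sys/fs/cgroup/test/memory.events", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}

Note that memory.low and memory.high writes simply record the new thresholds, while memory.max goes through mem_cgroup_resize_limit() and can fail, so a real tool should check that write for errors.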
@@ -5831,10 +5773,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
5831 */ 5773 */
5832static int __init mem_cgroup_init(void) 5774static int __init mem_cgroup_init(void)
5833{ 5775{
5776 int cpu, node;
5777
5834	hotcpu_notifier(memcg_cpu_hotplug_callback, 0); 5778	hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5835	enable_swap_cgroup();
5836	mem_cgroup_soft_limit_tree_init();
5837	memcg_stock_init();
5779
5780	for_each_possible_cpu(cpu)
5781		INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5782 drain_local_stock);
5783
5784 for_each_node(node) {
5785 struct mem_cgroup_tree_per_node *rtpn;
5786 int zone;
5787
5788 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5789 node_online(node) ? node : NUMA_NO_NODE);
5790
5791 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
5792 struct mem_cgroup_tree_per_zone *rtpz;
5793
5794 rtpz = &rtpn->rb_tree_per_zone[zone];
5795 rtpz->rb_root = RB_ROOT;
5796 spin_lock_init(&rtpz->lock);
5797 }
5798 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5799 }
5800
5838 return 0; 5801 return 0;
5839} 5802}
5840subsys_initcall(mem_cgroup_init); 5803subsys_initcall(mem_cgroup_init);
5804
5805#ifdef CONFIG_MEMCG_SWAP
5806/**
5807 * mem_cgroup_swapout - transfer a memsw charge to swap
5808 * @page: page whose memsw charge to transfer
5809 * @entry: swap entry to move the charge to
5810 *
5811 * Transfer the memsw charge of @page to @entry.
5812 */
5813void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5814{
5815 struct mem_cgroup *memcg;
5816 unsigned short oldid;
5817
5818 VM_BUG_ON_PAGE(PageLRU(page), page);
5819 VM_BUG_ON_PAGE(page_count(page), page);
5820
5821 if (!do_swap_account)
5822 return;
5823
5824 memcg = page->mem_cgroup;
5825
5826 /* Readahead page, never charged */
5827 if (!memcg)
5828 return;
5829
5830 oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5831 VM_BUG_ON_PAGE(oldid, page);
5832 mem_cgroup_swap_statistics(memcg, true);
5833
5834 page->mem_cgroup = NULL;
5835
5836 if (!mem_cgroup_is_root(memcg))
5837 page_counter_uncharge(&memcg->memory, 1);
5838
5839 /* XXX: caller holds IRQ-safe mapping->tree_lock */
5840 VM_BUG_ON(!irqs_disabled());
5841
5842 mem_cgroup_charge_statistics(memcg, page, -1);
5843 memcg_check_events(memcg, page);
5844}
5845
5846/**
5847 * mem_cgroup_uncharge_swap - uncharge a swap entry
5848 * @entry: swap entry to uncharge
5849 *
5850 * Drop the memsw charge associated with @entry.
5851 */
5852void mem_cgroup_uncharge_swap(swp_entry_t entry)
5853{
5854 struct mem_cgroup *memcg;
5855 unsigned short id;
5856
5857 if (!do_swap_account)
5858 return;
5859
5860 id = swap_cgroup_record(entry, 0);
5861 rcu_read_lock();
5862 memcg = mem_cgroup_lookup(id);
5863 if (memcg) {
5864 if (!mem_cgroup_is_root(memcg))
5865 page_counter_uncharge(&memcg->memsw, 1);
5866 mem_cgroup_swap_statistics(memcg, false);
5867 css_put(&memcg->css);
5868 }
5869 rcu_read_unlock();
5870}
5871
5872/* Remember the boot option */
5873#ifdef CONFIG_MEMCG_SWAP_ENABLED
5874static int really_do_swap_account __initdata = 1;
5875#else
5876static int really_do_swap_account __initdata;
5877#endif
5878
5879static int __init enable_swap_account(char *s)
5880{
5881 if (!strcmp(s, "1"))
5882 really_do_swap_account = 1;
5883 else if (!strcmp(s, "0"))
5884 really_do_swap_account = 0;
5885 return 1;
5886}
5887__setup("swapaccount=", enable_swap_account);
5888
5889static struct cftype memsw_cgroup_files[] = {
5890 {
5891 .name = "memsw.usage_in_bytes",
5892 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
5893 .read_u64 = mem_cgroup_read_u64,
5894 },
5895 {
5896 .name = "memsw.max_usage_in_bytes",
5897 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
5898 .write = mem_cgroup_reset,
5899 .read_u64 = mem_cgroup_read_u64,
5900 },
5901 {
5902 .name = "memsw.limit_in_bytes",
5903 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
5904 .write = mem_cgroup_write,
5905 .read_u64 = mem_cgroup_read_u64,
5906 },
5907 {
5908 .name = "memsw.failcnt",
5909 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
5910 .write = mem_cgroup_reset,
5911 .read_u64 = mem_cgroup_read_u64,
5912 },
5913 { }, /* terminate */
5914};
5915
5916static int __init mem_cgroup_swap_init(void)
5917{
5918 if (!mem_cgroup_disabled() && really_do_swap_account) {
5919 do_swap_account = 1;
5920 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
5921 memsw_cgroup_files));
5922 }
5923 return 0;
5924}
5925subsys_initcall(mem_cgroup_swap_init);
5926
5927#endif /* CONFIG_MEMCG_SWAP */
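Since mem_cgroup_swap_init() registers the memsw.* files only when swap accounting ends up enabled, their presence under a legacy (v1) memory controller mount is a convenient runtime check. A minimal sketch, assuming the controller is mounted at /sys/fs/cgroup/memory; the mount point is an assumption, not something this patch establishes.

/*
 * Sketch only: detect whether memsw accounting is active by probing
 * for the legacy memory.memsw.usage_in_bytes file.  Adjust the path
 * to wherever the v1 memory controller is mounted.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/fs/cgroup/memory/memory.memsw.usage_in_bytes";
	unsigned long long usage;
	FILE *f = fopen(path, "r");

	if (!f) {
		printf("memsw files absent: swap accounting is disabled\n");
		return 0;
	}
	if (fscanf(f, "%llu", &usage) == 1)
		printf("memory+swap usage: %llu bytes\n", usage);
	fclose(f);
	return 0;
}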