Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 978
1 file changed, 625 insertions(+), 353 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7f1a356153c0..5b6b0039f725 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -45,16 +45,17 @@
45#include <linux/swapops.h> 45#include <linux/swapops.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
47#include <linux/eventfd.h> 47#include <linux/eventfd.h>
48#include <linux/poll.h>
48#include <linux/sort.h> 49#include <linux/sort.h>
49#include <linux/fs.h> 50#include <linux/fs.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
51#include <linux/vmalloc.h>
52#include <linux/vmpressure.h> 52#include <linux/vmpressure.h>
53#include <linux/mm_inline.h> 53#include <linux/mm_inline.h>
54#include <linux/page_cgroup.h> 54#include <linux/page_cgroup.h>
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/oom.h> 56#include <linux/oom.h>
57#include <linux/lockdep.h> 57#include <linux/lockdep.h>
58#include <linux/file.h>
58#include "internal.h" 59#include "internal.h"
59#include <net/sock.h> 60#include <net/sock.h>
60#include <net/ip.h> 61#include <net/ip.h>
@@ -148,7 +149,7 @@ struct mem_cgroup_reclaim_iter {
148 * matches memcg->dead_count of the hierarchy root group. 149 * matches memcg->dead_count of the hierarchy root group.
149 */ 150 */
150 struct mem_cgroup *last_visited; 151 struct mem_cgroup *last_visited;
151 unsigned long last_dead_count; 152 int last_dead_count;
152 153
153 /* scan generation, increased every round-trip */ 154 /* scan generation, increased every round-trip */
154 unsigned int generation; 155 unsigned int generation;
@@ -227,6 +228,46 @@ struct mem_cgroup_eventfd_list {
227 struct eventfd_ctx *eventfd; 228 struct eventfd_ctx *eventfd;
228}; 229};
229 230
231/*
232 * cgroup_event represents events which userspace want to receive.
233 */
234struct mem_cgroup_event {
235 /*
236 * memcg which the event belongs to.
237 */
238 struct mem_cgroup *memcg;
239 /*
240 * eventfd to signal userspace about the event.
241 */
242 struct eventfd_ctx *eventfd;
243 /*
244 * Each of these stored in a list by the cgroup.
245 */
246 struct list_head list;
247 /*
248 * register_event() callback will be used to add new userspace
249 * waiter for changes related to this event. Use eventfd_signal()
250 * on eventfd to send notification to userspace.
251 */
252 int (*register_event)(struct mem_cgroup *memcg,
253 struct eventfd_ctx *eventfd, const char *args);
254 /*
255 * unregister_event() callback will be called when userspace closes
256 * the eventfd or on cgroup removing. This callback must be set,
257 * if you want provide notification functionality.
258 */
259 void (*unregister_event)(struct mem_cgroup *memcg,
260 struct eventfd_ctx *eventfd);
261 /*
262 * All fields below needed to unregister event when
263 * userspace closes eventfd.
264 */
265 poll_table pt;
266 wait_queue_head_t *wqh;
267 wait_queue_t wait;
268 struct work_struct remove;
269};
270
230static void mem_cgroup_threshold(struct mem_cgroup *memcg); 271static void mem_cgroup_threshold(struct mem_cgroup *memcg);
231static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); 272static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
232 273
@@ -331,27 +372,20 @@ struct mem_cgroup {
331 atomic_t numainfo_updating; 372 atomic_t numainfo_updating;
332#endif 373#endif
333 374
375 /* List of events which userspace want to receive */
376 struct list_head event_list;
377 spinlock_t event_list_lock;
378
334 struct mem_cgroup_per_node *nodeinfo[0]; 379 struct mem_cgroup_per_node *nodeinfo[0];
335 /* WARNING: nodeinfo must be the last member here */ 380 /* WARNING: nodeinfo must be the last member here */
336}; 381};
337 382
338static size_t memcg_size(void)
339{
340 return sizeof(struct mem_cgroup) +
341 nr_node_ids * sizeof(struct mem_cgroup_per_node *);
342}
343
344/* internal only representation about the status of kmem accounting. */ 383/* internal only representation about the status of kmem accounting. */
345enum { 384enum {
346 KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ 385 KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
347 KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
348 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ 386 KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
349}; 387};
350 388
351/* We account when limit is on, but only after call sites are patched */
352#define KMEM_ACCOUNTED_MASK \
353 ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
354
355#ifdef CONFIG_MEMCG_KMEM 389#ifdef CONFIG_MEMCG_KMEM
356static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) 390static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
357{ 391{
@@ -363,16 +397,6 @@ static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
363 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); 397 return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
364} 398}
365 399
366static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
367{
368 set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
369}
370
371static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
372{
373 clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
374}
375
376static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) 400static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
377{ 401{
378 /* 402 /*
@@ -490,11 +514,6 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 514 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
491} 515}
492 516
493struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
494{
495 return &mem_cgroup_from_css(css)->vmpressure;
496}
497
498static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 517static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
499{ 518{
500 return (memcg == root_mem_cgroup); 519 return (memcg == root_mem_cgroup);
@@ -1098,16 +1117,22 @@ skip_node:
1098 * skipped and we should continue the tree walk. 1117 * skipped and we should continue the tree walk.
1099 * last_visited css is safe to use because it is 1118 * last_visited css is safe to use because it is
1100 * protected by css_get and the tree walk is rcu safe. 1119 * protected by css_get and the tree walk is rcu safe.
1120 *
1121 * We do not take a reference on the root of the tree walk
1122 * because we might race with the root removal when it would
1123 * be the only node in the iterated hierarchy and mem_cgroup_iter
1124 * would end up in an endless loop because it expects that at
1125 * least one valid node will be returned. Root cannot disappear
1126 * because caller of the iterator should hold it already so
1127 * skipping css reference should be safe.
1101 */ 1128 */
1102 if (next_css) { 1129 if (next_css) {
1103 struct mem_cgroup *mem = mem_cgroup_from_css(next_css); 1130 if ((next_css == &root->css) ||
1131 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css)))
1132 return mem_cgroup_from_css(next_css);
1104 1133
1105 if (css_tryget(&mem->css)) 1134 prev_css = next_css;
1106 return mem; 1135 goto skip_node;
1107 else {
1108 prev_css = next_css;
1109 goto skip_node;
1110 }
1111 } 1136 }
1112 1137
1113 return NULL; 1138 return NULL;
@@ -1141,7 +1166,15 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1141 if (iter->last_dead_count == *sequence) { 1166 if (iter->last_dead_count == *sequence) {
1142 smp_rmb(); 1167 smp_rmb();
1143 position = iter->last_visited; 1168 position = iter->last_visited;
1144 if (position && !css_tryget(&position->css)) 1169
1170 /*
1171 * We cannot take a reference to root because we might race
1172 * with root removal and returning NULL would end up in
1173 * an endless loop on the iterator user level when root
1174 * would be returned all the time.
1175 */
1176 if (position && position != root &&
1177 !css_tryget(&position->css))
1145 position = NULL; 1178 position = NULL;
1146 } 1179 }
1147 return position; 1180 return position;
@@ -1150,9 +1183,11 @@ mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter,
1150static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, 1183static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter,
1151 struct mem_cgroup *last_visited, 1184 struct mem_cgroup *last_visited,
1152 struct mem_cgroup *new_position, 1185 struct mem_cgroup *new_position,
1186 struct mem_cgroup *root,
1153 int sequence) 1187 int sequence)
1154{ 1188{
1155 if (last_visited) 1189 /* root reference counting symmetric to mem_cgroup_iter_load */
1190 if (last_visited && last_visited != root)
1156 css_put(&last_visited->css); 1191 css_put(&last_visited->css);
1157 /* 1192 /*
1158 * We store the sequence count from the time @last_visited was 1193 * We store the sequence count from the time @last_visited was
@@ -1227,7 +1262,8 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
1227 memcg = __mem_cgroup_iter_next(root, last_visited); 1262 memcg = __mem_cgroup_iter_next(root, last_visited);
1228 1263
1229 if (reclaim) { 1264 if (reclaim) {
1230 mem_cgroup_iter_update(iter, last_visited, memcg, seq); 1265 mem_cgroup_iter_update(iter, last_visited, memcg, root,
1266 seq);
1231 1267
1232 if (!memcg) 1268 if (!memcg)
1233 iter->generation++; 1269 iter->generation++;
@@ -1647,13 +1683,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
1647 */ 1683 */
1648void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1684void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1649{ 1685{
1650 struct cgroup *task_cgrp;
1651 struct cgroup *mem_cgrp;
1652 /* 1686 /*
1653 * Need a buffer in BSS, can't rely on allocations. The code relies 1687 * protects memcg_name and makes sure that parallel ooms do not
1654 * on the assumption that OOM is serialized for memory controller. 1688 * interleave
1655 * If this assumption is broken, revisit this code.
1656 */ 1689 */
1690 static DEFINE_MUTEX(oom_info_lock);
1691 struct cgroup *task_cgrp;
1692 struct cgroup *mem_cgrp;
1657 static char memcg_name[PATH_MAX]; 1693 static char memcg_name[PATH_MAX];
1658 int ret; 1694 int ret;
1659 struct mem_cgroup *iter; 1695 struct mem_cgroup *iter;
@@ -1662,6 +1698,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1662 if (!p) 1698 if (!p)
1663 return; 1699 return;
1664 1700
1701 mutex_lock(&oom_info_lock);
1665 rcu_read_lock(); 1702 rcu_read_lock();
1666 1703
1667 mem_cgrp = memcg->css.cgroup; 1704 mem_cgrp = memcg->css.cgroup;
@@ -1730,6 +1767,7 @@ done:
1730 1767
1731 pr_cont("\n"); 1768 pr_cont("\n");
1732 } 1769 }
1770 mutex_unlock(&oom_info_lock);
1733} 1771}
1734 1772
1735/* 1773/*
@@ -1822,13 +1860,18 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1822 break; 1860 break;
1823 }; 1861 };
1824 points = oom_badness(task, memcg, NULL, totalpages); 1862 points = oom_badness(task, memcg, NULL, totalpages);
1825 if (points > chosen_points) { 1863 if (!points || points < chosen_points)
1826 if (chosen) 1864 continue;
1827 put_task_struct(chosen); 1865 /* Prefer thread group leaders for display purposes */
1828 chosen = task; 1866 if (points == chosen_points &&
1829 chosen_points = points; 1867 thread_group_leader(chosen))
1830 get_task_struct(chosen); 1868 continue;
1831 } 1869
1870 if (chosen)
1871 put_task_struct(chosen);
1872 chosen = task;
1873 chosen_points = points;
1874 get_task_struct(chosen);
1832 } 1875 }
1833 css_task_iter_end(&it); 1876 css_task_iter_end(&it);
1834 } 1877 }
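
To see the effect of the reworked selection loop: a candidate with zero badness or a lower score than the current choice is skipped, and on a tie an already-chosen thread-group leader is kept. Below is a hedged, self-contained toy model of just that rule, using made-up task names and scores in place of real oom_badness() results; it is an illustration, not kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for a task; "points" plays the role of oom_badness(). */
struct task {
	const char *comm;
	unsigned long points;
	bool group_leader;
};

int main(void)
{
	struct task tasks[] = {
		{ "leader",   100, true  },	/* thread group leader */
		{ "worker-a", 100, false },	/* ties the leader, leader is kept */
		{ "worker-b",  90, false },	/* lower score, skipped */
		{ "idle",       0, false },	/* zero badness, skipped */
	};
	struct task *chosen = NULL;
	unsigned long chosen_points = 0;
	size_t i;

	for (i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++) {
		struct task *task = &tasks[i];

		if (!task->points || task->points < chosen_points)
			continue;
		/* Prefer thread group leaders for display purposes */
		if (chosen && task->points == chosen_points &&
		    chosen->group_leader)
			continue;
		chosen = task;
		chosen_points = task->points;
	}

	printf("victim: %s (%lu points)\n",
	       chosen ? chosen->comm : "none", chosen_points);
	return 0;
}
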
@@ -2861,7 +2904,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2861 unsigned short id; 2904 unsigned short id;
2862 swp_entry_t ent; 2905 swp_entry_t ent;
2863 2906
2864 VM_BUG_ON(!PageLocked(page)); 2907 VM_BUG_ON_PAGE(!PageLocked(page), page);
2865 2908
2866 pc = lookup_page_cgroup(page); 2909 pc = lookup_page_cgroup(page);
2867 lock_page_cgroup(pc); 2910 lock_page_cgroup(pc);
@@ -2895,7 +2938,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2895 bool anon; 2938 bool anon;
2896 2939
2897 lock_page_cgroup(pc); 2940 lock_page_cgroup(pc);
2898 VM_BUG_ON(PageCgroupUsed(pc)); 2941 VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
2899 /* 2942 /*
2900 * we don't need page_cgroup_lock about tail pages, becase they are not 2943 * we don't need page_cgroup_lock about tail pages, becase they are not
2901 * accessed by any other context at this point. 2944 * accessed by any other context at this point.
@@ -2930,7 +2973,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2930 if (lrucare) { 2973 if (lrucare) {
2931 if (was_on_lru) { 2974 if (was_on_lru) {
2932 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2975 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2933 VM_BUG_ON(PageLRU(page)); 2976 VM_BUG_ON_PAGE(PageLRU(page), page);
2934 SetPageLRU(page); 2977 SetPageLRU(page);
2935 add_page_to_lru_list(page, lruvec, page_lru(page)); 2978 add_page_to_lru_list(page, lruvec, page_lru(page));
2936 } 2979 }
@@ -2956,10 +2999,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2956static DEFINE_MUTEX(set_limit_mutex); 2999static DEFINE_MUTEX(set_limit_mutex);
2957 3000
2958#ifdef CONFIG_MEMCG_KMEM 3001#ifdef CONFIG_MEMCG_KMEM
3002static DEFINE_MUTEX(activate_kmem_mutex);
3003
2959static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 3004static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
2960{ 3005{
2961 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 3006 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
2962 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 3007 memcg_kmem_is_active(memcg);
2963} 3008}
2964 3009
2965/* 3010/*
@@ -2976,10 +3021,9 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
2976} 3021}
2977 3022
2978#ifdef CONFIG_SLABINFO 3023#ifdef CONFIG_SLABINFO
2979static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, 3024static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v)
2980 struct cftype *cft, struct seq_file *m)
2981{ 3025{
2982 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3026 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
2983 struct memcg_cache_params *params; 3027 struct memcg_cache_params *params;
2984 3028
2985 if (!memcg_can_account_kmem(memcg)) 3029 if (!memcg_can_account_kmem(memcg))
@@ -3059,16 +3103,6 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
3059 css_put(&memcg->css); 3103 css_put(&memcg->css);
3060} 3104}
3061 3105
3062void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
3063{
3064 if (!memcg)
3065 return;
3066
3067 mutex_lock(&memcg->slab_caches_mutex);
3068 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
3069 mutex_unlock(&memcg->slab_caches_mutex);
3070}
3071
3072/* 3106/*
3073 * helper for acessing a memcg's index. It will be used as an index in the 3107 * helper for acessing a memcg's index. It will be used as an index in the
3074 * child cache array in kmem_cache, and also to derive its name. This function 3108 * child cache array in kmem_cache, and also to derive its name. This function
@@ -3079,43 +3113,6 @@ int memcg_cache_id(struct mem_cgroup *memcg)
3079 return memcg ? memcg->kmemcg_id : -1; 3113 return memcg ? memcg->kmemcg_id : -1;
3080} 3114}
3081 3115
3082/*
3083 * This ends up being protected by the set_limit mutex, during normal
3084 * operation, because that is its main call site.
3085 *
3086 * But when we create a new cache, we can call this as well if its parent
3087 * is kmem-limited. That will have to hold set_limit_mutex as well.
3088 */
3089int memcg_update_cache_sizes(struct mem_cgroup *memcg)
3090{
3091 int num, ret;
3092
3093 num = ida_simple_get(&kmem_limited_groups,
3094 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
3095 if (num < 0)
3096 return num;
3097 /*
3098 * After this point, kmem_accounted (that we test atomically in
3099 * the beginning of this conditional), is no longer 0. This
3100 * guarantees only one process will set the following boolean
3101 * to true. We don't need test_and_set because we're protected
3102 * by the set_limit_mutex anyway.
3103 */
3104 memcg_kmem_set_activated(memcg);
3105
3106 ret = memcg_update_all_caches(num+1);
3107 if (ret) {
3108 ida_simple_remove(&kmem_limited_groups, num);
3109 memcg_kmem_clear_activated(memcg);
3110 return ret;
3111 }
3112
3113 memcg->kmemcg_id = num;
3114 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
3115 mutex_init(&memcg->slab_caches_mutex);
3116 return 0;
3117}
3118
3119static size_t memcg_caches_array_size(int num_groups) 3116static size_t memcg_caches_array_size(int num_groups)
3120{ 3117{
3121 ssize_t size; 3118 ssize_t size;
@@ -3152,18 +3149,17 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3152 3149
3153 if (num_groups > memcg_limited_groups_array_size) { 3150 if (num_groups > memcg_limited_groups_array_size) {
3154 int i; 3151 int i;
3152 struct memcg_cache_params *new_params;
3155 ssize_t size = memcg_caches_array_size(num_groups); 3153 ssize_t size = memcg_caches_array_size(num_groups);
3156 3154
3157 size *= sizeof(void *); 3155 size *= sizeof(void *);
3158 size += offsetof(struct memcg_cache_params, memcg_caches); 3156 size += offsetof(struct memcg_cache_params, memcg_caches);
3159 3157
3160 s->memcg_params = kzalloc(size, GFP_KERNEL); 3158 new_params = kzalloc(size, GFP_KERNEL);
3161 if (!s->memcg_params) { 3159 if (!new_params)
3162 s->memcg_params = cur_params;
3163 return -ENOMEM; 3160 return -ENOMEM;
3164 }
3165 3161
3166 s->memcg_params->is_root_cache = true; 3162 new_params->is_root_cache = true;
3167 3163
3168 /* 3164 /*
3169 * There is the chance it will be bigger than 3165 * There is the chance it will be bigger than
@@ -3177,7 +3173,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3177 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3173 for (i = 0; i < memcg_limited_groups_array_size; i++) {
3178 if (!cur_params->memcg_caches[i]) 3174 if (!cur_params->memcg_caches[i])
3179 continue; 3175 continue;
3180 s->memcg_params->memcg_caches[i] = 3176 new_params->memcg_caches[i] =
3181 cur_params->memcg_caches[i]; 3177 cur_params->memcg_caches[i];
3182 } 3178 }
3183 3179
@@ -3190,13 +3186,15 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
3190 * bigger than the others. And all updates will reset this 3186 * bigger than the others. And all updates will reset this
3191 * anyway. 3187 * anyway.
3192 */ 3188 */
3193 kfree(cur_params); 3189 rcu_assign_pointer(s->memcg_params, new_params);
3190 if (cur_params)
3191 kfree_rcu(cur_params, rcu_head);
3194 } 3192 }
3195 return 0; 3193 return 0;
3196} 3194}
3197 3195
3198int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3196int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s,
3199 struct kmem_cache *root_cache) 3197 struct kmem_cache *root_cache)
3200{ 3198{
3201 size_t size; 3199 size_t size;
3202 3200
@@ -3224,35 +3222,85 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
3224 return 0; 3222 return 0;
3225} 3223}
3226 3224
3227void memcg_release_cache(struct kmem_cache *s) 3225void memcg_free_cache_params(struct kmem_cache *s)
3226{
3227 kfree(s->memcg_params);
3228}
3229
3230void memcg_register_cache(struct kmem_cache *s)
3228{ 3231{
3229 struct kmem_cache *root; 3232 struct kmem_cache *root;
3230 struct mem_cgroup *memcg; 3233 struct mem_cgroup *memcg;
3231 int id; 3234 int id;
3232 3235
3233 /* 3236 if (is_root_cache(s))
3234 * This happens, for instance, when a root cache goes away before we
3235 * add any memcg.
3236 */
3237 if (!s->memcg_params)
3238 return; 3237 return;
3239 3238
3240 if (s->memcg_params->is_root_cache) 3239 /*
3241 goto out; 3240 * Holding the slab_mutex assures nobody will touch the memcg_caches
3241 * array while we are modifying it.
3242 */
3243 lockdep_assert_held(&slab_mutex);
3242 3244
3245 root = s->memcg_params->root_cache;
3243 memcg = s->memcg_params->memcg; 3246 memcg = s->memcg_params->memcg;
3244 id = memcg_cache_id(memcg); 3247 id = memcg_cache_id(memcg);
3248
3249 css_get(&memcg->css);
3250
3251
3252 /*
3253 * Since readers won't lock (see cache_from_memcg_idx()), we need a
3254 * barrier here to ensure nobody will see the kmem_cache partially
3255 * initialized.
3256 */
3257 smp_wmb();
3258
3259 /*
3260 * Initialize the pointer to this cache in its parent's memcg_params
3261 * before adding it to the memcg_slab_caches list, otherwise we can
3262 * fail to convert memcg_params_to_cache() while traversing the list.
3263 */
3264 VM_BUG_ON(root->memcg_params->memcg_caches[id]);
3265 root->memcg_params->memcg_caches[id] = s;
3266
3267 mutex_lock(&memcg->slab_caches_mutex);
3268 list_add(&s->memcg_params->list, &memcg->memcg_slab_caches);
3269 mutex_unlock(&memcg->slab_caches_mutex);
3270}
3271
3272void memcg_unregister_cache(struct kmem_cache *s)
3273{
3274 struct kmem_cache *root;
3275 struct mem_cgroup *memcg;
3276 int id;
3277
3278 if (is_root_cache(s))
3279 return;
3280
3281 /*
3282 * Holding the slab_mutex assures nobody will touch the memcg_caches
3283 * array while we are modifying it.
3284 */
3285 lockdep_assert_held(&slab_mutex);
3245 3286
3246 root = s->memcg_params->root_cache; 3287 root = s->memcg_params->root_cache;
3247 root->memcg_params->memcg_caches[id] = NULL; 3288 memcg = s->memcg_params->memcg;
3289 id = memcg_cache_id(memcg);
3248 3290
3249 mutex_lock(&memcg->slab_caches_mutex); 3291 mutex_lock(&memcg->slab_caches_mutex);
3250 list_del(&s->memcg_params->list); 3292 list_del(&s->memcg_params->list);
3251 mutex_unlock(&memcg->slab_caches_mutex); 3293 mutex_unlock(&memcg->slab_caches_mutex);
3252 3294
3295 /*
3296 * Clear the pointer to this cache in its parent's memcg_params only
3297 * after removing it from the memcg_slab_caches list, otherwise we can
3298 * fail to convert memcg_params_to_cache() while traversing the list.
3299 */
3300 VM_BUG_ON(!root->memcg_params->memcg_caches[id]);
3301 root->memcg_params->memcg_caches[id] = NULL;
3302
3253 css_put(&memcg->css); 3303 css_put(&memcg->css);
3254out:
3255 kfree(s->memcg_params);
3256} 3304}
3257 3305
3258/* 3306/*
@@ -3311,11 +3359,9 @@ static void kmem_cache_destroy_work_func(struct work_struct *w)
3311 * So if we aren't down to zero, we'll just schedule a worker and try 3359 * So if we aren't down to zero, we'll just schedule a worker and try
3312 * again 3360 * again
3313 */ 3361 */
3314 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { 3362 if (atomic_read(&cachep->memcg_params->nr_pages) != 0)
3315 kmem_cache_shrink(cachep); 3363 kmem_cache_shrink(cachep);
3316 if (atomic_read(&cachep->memcg_params->nr_pages) == 0) 3364 else
3317 return;
3318 } else
3319 kmem_cache_destroy(cachep); 3365 kmem_cache_destroy(cachep);
3320} 3366}
3321 3367
@@ -3351,27 +3397,16 @@ void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3351 schedule_work(&cachep->memcg_params->destroy); 3397 schedule_work(&cachep->memcg_params->destroy);
3352} 3398}
3353 3399
3354/* 3400static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3355 * This lock protects updaters, not readers. We want readers to be as fast as 3401 struct kmem_cache *s)
3356 * they can, and they will either see NULL or a valid cache value. Our model
3357 * allow them to see NULL, in which case the root memcg will be selected.
3358 *
3359 * We need this lock because multiple allocations to the same cache from a non
3360 * will span more than one worker. Only one of them can create the cache.
3361 */
3362static DEFINE_MUTEX(memcg_cache_mutex);
3363
3364/*
3365 * Called with memcg_cache_mutex held
3366 */
3367static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3368 struct kmem_cache *s)
3369{ 3402{
3370 struct kmem_cache *new; 3403 struct kmem_cache *new = NULL;
3371 static char *tmp_name = NULL; 3404 static char *tmp_name = NULL;
3405 static DEFINE_MUTEX(mutex); /* protects tmp_name */
3372 3406
3373 lockdep_assert_held(&memcg_cache_mutex); 3407 BUG_ON(!memcg_can_account_kmem(memcg));
3374 3408
3409 mutex_lock(&mutex);
3375 /* 3410 /*
3376 * kmem_cache_create_memcg duplicates the given name and 3411 * kmem_cache_create_memcg duplicates the given name and
3377 * cgroup_name for this name requires RCU context. 3412 * cgroup_name for this name requires RCU context.
@@ -3381,7 +3416,7 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3381 if (!tmp_name) { 3416 if (!tmp_name) {
3382 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); 3417 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
3383 if (!tmp_name) 3418 if (!tmp_name)
3384 return NULL; 3419 goto out;
3385 } 3420 }
3386 3421
3387 rcu_read_lock(); 3422 rcu_read_lock();
@@ -3391,48 +3426,13 @@ static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3391 3426
3392 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, 3427 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
3393 (s->flags & ~SLAB_PANIC), s->ctor, s); 3428 (s->flags & ~SLAB_PANIC), s->ctor, s);
3394
3395 if (new) 3429 if (new)
3396 new->allocflags |= __GFP_KMEMCG; 3430 new->allocflags |= __GFP_KMEMCG;
3397 3431 else
3398 return new; 3432 new = s;
3399}
3400
3401static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
3402 struct kmem_cache *cachep)
3403{
3404 struct kmem_cache *new_cachep;
3405 int idx;
3406
3407 BUG_ON(!memcg_can_account_kmem(memcg));
3408
3409 idx = memcg_cache_id(memcg);
3410
3411 mutex_lock(&memcg_cache_mutex);
3412 new_cachep = cache_from_memcg_idx(cachep, idx);
3413 if (new_cachep) {
3414 css_put(&memcg->css);
3415 goto out;
3416 }
3417
3418 new_cachep = kmem_cache_dup(memcg, cachep);
3419 if (new_cachep == NULL) {
3420 new_cachep = cachep;
3421 css_put(&memcg->css);
3422 goto out;
3423 }
3424
3425 atomic_set(&new_cachep->memcg_params->nr_pages , 0);
3426
3427 cachep->memcg_params->memcg_caches[idx] = new_cachep;
3428 /*
3429 * the readers won't lock, make sure everybody sees the updated value,
3430 * so they won't put stuff in the queue again for no reason
3431 */
3432 wmb();
3433out: 3433out:
3434 mutex_unlock(&memcg_cache_mutex); 3434 mutex_unlock(&mutex);
3435 return new_cachep; 3435 return new;
3436} 3436}
3437 3437
3438void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 3438void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
@@ -3452,9 +3452,10 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3452 * 3452 *
3453 * Still, we don't want anyone else freeing memcg_caches under our 3453 * Still, we don't want anyone else freeing memcg_caches under our
3454 * noses, which can happen if a new memcg comes to life. As usual, 3454 * noses, which can happen if a new memcg comes to life. As usual,
3455 * we'll take the set_limit_mutex to protect ourselves against this. 3455 * we'll take the activate_kmem_mutex to protect ourselves against
3456 * this.
3456 */ 3457 */
3457 mutex_lock(&set_limit_mutex); 3458 mutex_lock(&activate_kmem_mutex);
3458 for_each_memcg_cache_index(i) { 3459 for_each_memcg_cache_index(i) {
3459 c = cache_from_memcg_idx(s, i); 3460 c = cache_from_memcg_idx(s, i);
3460 if (!c) 3461 if (!c)
@@ -3477,7 +3478,7 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
3477 cancel_work_sync(&c->memcg_params->destroy); 3478 cancel_work_sync(&c->memcg_params->destroy);
3478 kmem_cache_destroy(c); 3479 kmem_cache_destroy(c);
3479 } 3480 }
3480 mutex_unlock(&set_limit_mutex); 3481 mutex_unlock(&activate_kmem_mutex);
3481} 3482}
3482 3483
3483struct create_work { 3484struct create_work {
@@ -3509,6 +3510,7 @@ static void memcg_create_cache_work_func(struct work_struct *w)
3509 3510
3510 cw = container_of(w, struct create_work, work); 3511 cw = container_of(w, struct create_work, work);
3511 memcg_create_kmem_cache(cw->memcg, cw->cachep); 3512 memcg_create_kmem_cache(cw->memcg, cw->cachep);
3513 css_put(&cw->memcg->css);
3512 kfree(cw); 3514 kfree(cw);
3513} 3515}
3514 3516
@@ -3568,7 +3570,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3568 gfp_t gfp) 3570 gfp_t gfp)
3569{ 3571{
3570 struct mem_cgroup *memcg; 3572 struct mem_cgroup *memcg;
3571 int idx; 3573 struct kmem_cache *memcg_cachep;
3572 3574
3573 VM_BUG_ON(!cachep->memcg_params); 3575 VM_BUG_ON(!cachep->memcg_params);
3574 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3576 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
@@ -3582,15 +3584,9 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
3582 if (!memcg_can_account_kmem(memcg)) 3584 if (!memcg_can_account_kmem(memcg))
3583 goto out; 3585 goto out;
3584 3586
3585 idx = memcg_cache_id(memcg); 3587 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
3586 3588 if (likely(memcg_cachep)) {
3587 /* 3589 cachep = memcg_cachep;
3588 * barrier to mare sure we're always seeing the up to date value. The
3589 * code updating memcg_caches will issue a write barrier to match this.
3590 */
3591 read_barrier_depends();
3592 if (likely(cache_from_memcg_idx(cachep, idx))) {
3593 cachep = cache_from_memcg_idx(cachep, idx);
3594 goto out; 3590 goto out;
3595 } 3591 }
3596 3592
@@ -3744,7 +3740,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
3744 if (!memcg) 3740 if (!memcg)
3745 return; 3741 return;
3746 3742
3747 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3743 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
3748 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3744 memcg_uncharge_kmem(memcg, PAGE_SIZE << order);
3749} 3745}
3750#else 3746#else
@@ -3823,7 +3819,7 @@ static int mem_cgroup_move_account(struct page *page,
3823 bool anon = PageAnon(page); 3819 bool anon = PageAnon(page);
3824 3820
3825 VM_BUG_ON(from == to); 3821 VM_BUG_ON(from == to);
3826 VM_BUG_ON(PageLRU(page)); 3822 VM_BUG_ON_PAGE(PageLRU(page), page);
3827 /* 3823 /*
3828 * The page is isolated from LRU. So, collapse function 3824 * The page is isolated from LRU. So, collapse function
3829 * will not handle this page. But page splitting can happen. 3825 * will not handle this page. But page splitting can happen.
@@ -3916,7 +3912,7 @@ static int mem_cgroup_move_parent(struct page *page,
3916 parent = root_mem_cgroup; 3912 parent = root_mem_cgroup;
3917 3913
3918 if (nr_pages > 1) { 3914 if (nr_pages > 1) {
3919 VM_BUG_ON(!PageTransHuge(page)); 3915 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3920 flags = compound_lock_irqsave(page); 3916 flags = compound_lock_irqsave(page);
3921 } 3917 }
3922 3918
@@ -3950,7 +3946,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3950 3946
3951 if (PageTransHuge(page)) { 3947 if (PageTransHuge(page)) {
3952 nr_pages <<= compound_order(page); 3948 nr_pages <<= compound_order(page);
3953 VM_BUG_ON(!PageTransHuge(page)); 3949 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
3954 /* 3950 /*
3955 * Never OOM-kill a process for a huge page. The 3951 * Never OOM-kill a process for a huge page. The
3956 * fault handler will fall back to regular pages. 3952 * fault handler will fall back to regular pages.
@@ -3970,8 +3966,8 @@ int mem_cgroup_newpage_charge(struct page *page,
3970{ 3966{
3971 if (mem_cgroup_disabled()) 3967 if (mem_cgroup_disabled())
3972 return 0; 3968 return 0;
3973 VM_BUG_ON(page_mapped(page)); 3969 VM_BUG_ON_PAGE(page_mapped(page), page);
3974 VM_BUG_ON(page->mapping && !PageAnon(page)); 3970 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
3975 VM_BUG_ON(!mm); 3971 VM_BUG_ON(!mm);
3976 return mem_cgroup_charge_common(page, mm, gfp_mask, 3972 return mem_cgroup_charge_common(page, mm, gfp_mask,
3977 MEM_CGROUP_CHARGE_TYPE_ANON); 3973 MEM_CGROUP_CHARGE_TYPE_ANON);
@@ -4175,7 +4171,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
4175 4171
4176 if (PageTransHuge(page)) { 4172 if (PageTransHuge(page)) {
4177 nr_pages <<= compound_order(page); 4173 nr_pages <<= compound_order(page);
4178 VM_BUG_ON(!PageTransHuge(page)); 4174 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
4179 } 4175 }
4180 /* 4176 /*
4181 * Check if our page_cgroup is valid 4177 * Check if our page_cgroup is valid
@@ -4267,7 +4263,7 @@ void mem_cgroup_uncharge_page(struct page *page)
4267 /* early check. */ 4263 /* early check. */
4268 if (page_mapped(page)) 4264 if (page_mapped(page))
4269 return; 4265 return;
4270 VM_BUG_ON(page->mapping && !PageAnon(page)); 4266 VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
4271 /* 4267 /*
4272 * If the page is in swap cache, uncharge should be deferred 4268 * If the page is in swap cache, uncharge should be deferred
4273 * to the swap path, which also properly accounts swap usage 4269 * to the swap path, which also properly accounts swap usage
@@ -4287,8 +4283,8 @@ void mem_cgroup_uncharge_page(struct page *page)
4287 4283
4288void mem_cgroup_uncharge_cache_page(struct page *page) 4284void mem_cgroup_uncharge_cache_page(struct page *page)
4289{ 4285{
4290 VM_BUG_ON(page_mapped(page)); 4286 VM_BUG_ON_PAGE(page_mapped(page), page);
4291 VM_BUG_ON(page->mapping); 4287 VM_BUG_ON_PAGE(page->mapping, page);
4292 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); 4288 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
4293} 4289}
4294 4290
@@ -5112,14 +5108,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
5112 return val << PAGE_SHIFT; 5108 return val << PAGE_SHIFT;
5113} 5109}
5114 5110
5115static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, 5111static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
5116 struct cftype *cft, struct file *file, 5112 struct cftype *cft)
5117 char __user *buf, size_t nbytes, loff_t *ppos)
5118{ 5113{
5119 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5114 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5120 char str[64];
5121 u64 val; 5115 u64 val;
5122 int name, len; 5116 int name;
5123 enum res_type type; 5117 enum res_type type;
5124 5118
5125 type = MEMFILE_TYPE(cft->private); 5119 type = MEMFILE_TYPE(cft->private);
@@ -5145,15 +5139,26 @@ static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css,
5145 BUG(); 5139 BUG();
5146 } 5140 }
5147 5141
5148 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5142 return val;
5149 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
5150} 5143}
5151 5144
5152static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5153{
5154 int ret = -EINVAL;
5155#ifdef CONFIG_MEMCG_KMEM 5145#ifdef CONFIG_MEMCG_KMEM
5156 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5146/* should be called with activate_kmem_mutex held */
5147static int __memcg_activate_kmem(struct mem_cgroup *memcg,
5148 unsigned long long limit)
5149{
5150 int err = 0;
5151 int memcg_id;
5152
5153 if (memcg_kmem_is_active(memcg))
5154 return 0;
5155
5156 /*
5157 * We are going to allocate memory for data shared by all memory
5158 * cgroups so let's stop accounting here.
5159 */
5160 memcg_stop_kmem_account();
5161
5157 /* 5162 /*
5158 * For simplicity, we won't allow this to be disabled. It also can't 5163 * For simplicity, we won't allow this to be disabled. It also can't
5159 * be changed if the cgroup has children already, or if tasks had 5164 * be changed if the cgroup has children already, or if tasks had
@@ -5167,72 +5172,101 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val)
5167 * of course permitted. 5172 * of course permitted.
5168 */ 5173 */
5169 mutex_lock(&memcg_create_mutex); 5174 mutex_lock(&memcg_create_mutex);
5170 mutex_lock(&set_limit_mutex); 5175 if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
5171 if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { 5176 err = -EBUSY;
5172 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { 5177 mutex_unlock(&memcg_create_mutex);
5173 ret = -EBUSY; 5178 if (err)
5174 goto out; 5179 goto out;
5175 }
5176 ret = res_counter_set_limit(&memcg->kmem, val);
5177 VM_BUG_ON(ret);
5178 5180
5179 ret = memcg_update_cache_sizes(memcg); 5181 memcg_id = ida_simple_get(&kmem_limited_groups,
5180 if (ret) { 5182 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
5181 res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); 5183 if (memcg_id < 0) {
5182 goto out; 5184 err = memcg_id;
5183 } 5185 goto out;
5184 static_key_slow_inc(&memcg_kmem_enabled_key); 5186 }
5185 /* 5187
5186 * setting the active bit after the inc will guarantee no one 5188 /*
5187 * starts accounting before all call sites are patched 5189 * Make sure we have enough space for this cgroup in each root cache's
5188 */ 5190 * memcg_params.
5189 memcg_kmem_set_active(memcg); 5191 */
5190 } else 5192 err = memcg_update_all_caches(memcg_id + 1);
5191 ret = res_counter_set_limit(&memcg->kmem, val); 5193 if (err)
5194 goto out_rmid;
5195
5196 memcg->kmemcg_id = memcg_id;
5197 INIT_LIST_HEAD(&memcg->memcg_slab_caches);
5198 mutex_init(&memcg->slab_caches_mutex);
5199
5200 /*
5201 * We couldn't have accounted to this cgroup, because it hasn't got the
5202 * active bit set yet, so this should succeed.
5203 */
5204 err = res_counter_set_limit(&memcg->kmem, limit);
5205 VM_BUG_ON(err);
5206
5207 static_key_slow_inc(&memcg_kmem_enabled_key);
5208 /*
5209 * Setting the active bit after enabling static branching will
5210 * guarantee no one starts accounting before all call sites are
5211 * patched.
5212 */
5213 memcg_kmem_set_active(memcg);
5192out: 5214out:
5193 mutex_unlock(&set_limit_mutex); 5215 memcg_resume_kmem_account();
5194 mutex_unlock(&memcg_create_mutex); 5216 return err;
5195#endif 5217
5218out_rmid:
5219 ida_simple_remove(&kmem_limited_groups, memcg_id);
5220 goto out;
5221}
5222
5223static int memcg_activate_kmem(struct mem_cgroup *memcg,
5224 unsigned long long limit)
5225{
5226 int ret;
5227
5228 mutex_lock(&activate_kmem_mutex);
5229 ret = __memcg_activate_kmem(memcg, limit);
5230 mutex_unlock(&activate_kmem_mutex);
5231 return ret;
5232}
5233
5234static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5235 unsigned long long val)
5236{
5237 int ret;
5238
5239 if (!memcg_kmem_is_active(memcg))
5240 ret = memcg_activate_kmem(memcg, val);
5241 else
5242 ret = res_counter_set_limit(&memcg->kmem, val);
5196 return ret; 5243 return ret;
5197} 5244}
5198 5245
5199#ifdef CONFIG_MEMCG_KMEM
5200static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5246static int memcg_propagate_kmem(struct mem_cgroup *memcg)
5201{ 5247{
5202 int ret = 0; 5248 int ret = 0;
5203 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5249 struct mem_cgroup *parent = parent_mem_cgroup(memcg);
5204 if (!parent)
5205 goto out;
5206 5250
5207 memcg->kmem_account_flags = parent->kmem_account_flags; 5251 if (!parent)
5208 /* 5252 return 0;
5209 * When that happen, we need to disable the static branch only on those
5210 * memcgs that enabled it. To achieve this, we would be forced to
5211 * complicate the code by keeping track of which memcgs were the ones
5212 * that actually enabled limits, and which ones got it from its
5213 * parents.
5214 *
5215 * It is a lot simpler just to do static_key_slow_inc() on every child
5216 * that is accounted.
5217 */
5218 if (!memcg_kmem_is_active(memcg))
5219 goto out;
5220 5253
5254 mutex_lock(&activate_kmem_mutex);
5221 /* 5255 /*
5222 * __mem_cgroup_free() will issue static_key_slow_dec() because this 5256 * If the parent cgroup is not kmem-active now, it cannot be activated
5223 * memcg is active already. If the later initialization fails then the 5257 * after this point, because it has at least one child already.
5224 * cgroup core triggers the cleanup so we do not have to do it here.
5225 */ 5258 */
5226 static_key_slow_inc(&memcg_kmem_enabled_key); 5259 if (memcg_kmem_is_active(parent))
5227 5260 ret = __memcg_activate_kmem(memcg, RES_COUNTER_MAX);
5228 mutex_lock(&set_limit_mutex); 5261 mutex_unlock(&activate_kmem_mutex);
5229 memcg_stop_kmem_account();
5230 ret = memcg_update_cache_sizes(memcg);
5231 memcg_resume_kmem_account();
5232 mutex_unlock(&set_limit_mutex);
5233out:
5234 return ret; 5262 return ret;
5235} 5263}
5264#else
5265static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
5266 unsigned long long val)
5267{
5268 return -EINVAL;
5269}
5236#endif /* CONFIG_MEMCG_KMEM */ 5270#endif /* CONFIG_MEMCG_KMEM */
5237 5271
5238/* 5272/*
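
The activation sequence above is driven entirely by the first write to memory.kmem.limit_in_bytes: mem_cgroup_write() calls memcg_update_kmem_limit(), which takes activate_kmem_mutex and runs __memcg_activate_kmem() once. A minimal userspace sketch of triggering it follows; the cgroup v1 mount point and group name ("/sys/fs/cgroup/memory/example") are hypothetical and would need to match the actual setup.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical paths; adjust to the real mount point and group name. */
	const char *limit_file =
		"/sys/fs/cgroup/memory/example/memory.kmem.limit_in_bytes";
	const char *limit = "67108864\n";	/* 64M; the first write activates kmem accounting */
	int fd = open(limit_file, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, limit, strlen(limit)) != (ssize_t)strlen(limit)) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
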
@@ -5266,7 +5300,7 @@ static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft,
5266 else if (type == _MEMSWAP) 5300 else if (type == _MEMSWAP)
5267 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5301 ret = mem_cgroup_resize_memsw_limit(memcg, val);
5268 else if (type == _KMEM) 5302 else if (type == _KMEM)
5269 ret = memcg_update_kmem_limit(css, val); 5303 ret = memcg_update_kmem_limit(memcg, val);
5270 else 5304 else
5271 return -EINVAL; 5305 return -EINVAL;
5272 break; 5306 break;
@@ -5383,8 +5417,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
5383#endif 5417#endif
5384 5418
5385#ifdef CONFIG_NUMA 5419#ifdef CONFIG_NUMA
5386static int memcg_numa_stat_show(struct cgroup_subsys_state *css, 5420static int memcg_numa_stat_show(struct seq_file *m, void *v)
5387 struct cftype *cft, struct seq_file *m)
5388{ 5421{
5389 struct numa_stat { 5422 struct numa_stat {
5390 const char *name; 5423 const char *name;
@@ -5400,7 +5433,7 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
5400 const struct numa_stat *stat; 5433 const struct numa_stat *stat;
5401 int nid; 5434 int nid;
5402 unsigned long nr; 5435 unsigned long nr;
5403 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5436 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5404 5437
5405 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5438 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
5406 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 5439 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -5439,10 +5472,9 @@ static inline void mem_cgroup_lru_names_not_uptodate(void)
5439 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5472 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
5440} 5473}
5441 5474
5442static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, 5475static int memcg_stat_show(struct seq_file *m, void *v)
5443 struct seq_file *m)
5444{ 5476{
5445 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5477 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5446 struct mem_cgroup *mi; 5478 struct mem_cgroup *mi;
5447 unsigned int i; 5479 unsigned int i;
5448 5480
@@ -5651,13 +5683,11 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
5651 mem_cgroup_oom_notify_cb(iter); 5683 mem_cgroup_oom_notify_cb(iter);
5652} 5684}
5653 5685
5654static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5686static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5655 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5687 struct eventfd_ctx *eventfd, const char *args, enum res_type type)
5656{ 5688{
5657 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5658 struct mem_cgroup_thresholds *thresholds; 5689 struct mem_cgroup_thresholds *thresholds;
5659 struct mem_cgroup_threshold_ary *new; 5690 struct mem_cgroup_threshold_ary *new;
5660 enum res_type type = MEMFILE_TYPE(cft->private);
5661 u64 threshold, usage; 5691 u64 threshold, usage;
5662 int i, size, ret; 5692 int i, size, ret;
5663 5693
@@ -5734,13 +5764,23 @@ unlock:
5734 return ret; 5764 return ret;
5735} 5765}
5736 5766
5737static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5767static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
5738 struct cftype *cft, struct eventfd_ctx *eventfd) 5768 struct eventfd_ctx *eventfd, const char *args)
5769{
5770 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
5771}
5772
5773static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
5774 struct eventfd_ctx *eventfd, const char *args)
5775{
5776 return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
5777}
5778
5779static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5780 struct eventfd_ctx *eventfd, enum res_type type)
5739{ 5781{
5740 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5741 struct mem_cgroup_thresholds *thresholds; 5782 struct mem_cgroup_thresholds *thresholds;
5742 struct mem_cgroup_threshold_ary *new; 5783 struct mem_cgroup_threshold_ary *new;
5743 enum res_type type = MEMFILE_TYPE(cft->private);
5744 u64 usage; 5784 u64 usage;
5745 int i, j, size; 5785 int i, j, size;
5746 5786
@@ -5813,14 +5853,23 @@ unlock:
5813 mutex_unlock(&memcg->thresholds_lock); 5853 mutex_unlock(&memcg->thresholds_lock);
5814} 5854}
5815 5855
5816static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5856static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5817 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5857 struct eventfd_ctx *eventfd)
5858{
5859 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
5860}
5861
5862static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
5863 struct eventfd_ctx *eventfd)
5864{
5865 return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
5866}
5867
5868static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
5869 struct eventfd_ctx *eventfd, const char *args)
5818{ 5870{
5819 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5820 struct mem_cgroup_eventfd_list *event; 5871 struct mem_cgroup_eventfd_list *event;
5821 enum res_type type = MEMFILE_TYPE(cft->private);
5822 5872
5823 BUG_ON(type != _OOM_TYPE);
5824 event = kmalloc(sizeof(*event), GFP_KERNEL); 5873 event = kmalloc(sizeof(*event), GFP_KERNEL);
5825 if (!event) 5874 if (!event)
5826 return -ENOMEM; 5875 return -ENOMEM;
@@ -5838,14 +5887,10 @@ static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
5838 return 0; 5887 return 0;
5839} 5888}
5840 5889
5841static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, 5890static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
5842 struct cftype *cft, struct eventfd_ctx *eventfd) 5891 struct eventfd_ctx *eventfd)
5843{ 5892{
5844 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5845 struct mem_cgroup_eventfd_list *ev, *tmp; 5893 struct mem_cgroup_eventfd_list *ev, *tmp;
5846 enum res_type type = MEMFILE_TYPE(cft->private);
5847
5848 BUG_ON(type != _OOM_TYPE);
5849 5894
5850 spin_lock(&memcg_oom_lock); 5895 spin_lock(&memcg_oom_lock);
5851 5896
@@ -5859,17 +5904,12 @@ static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5859 spin_unlock(&memcg_oom_lock); 5904 spin_unlock(&memcg_oom_lock);
5860} 5905}
5861 5906
5862static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, 5907static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
5863 struct cftype *cft, struct cgroup_map_cb *cb)
5864{ 5908{
5865 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5909 struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
5866 5910
5867 cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); 5911 seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
5868 5912 seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
5869 if (atomic_read(&memcg->under_oom))
5870 cb->fill(cb, "under_oom", 1);
5871 else
5872 cb->fill(cb, "under_oom", 0);
5873 return 0; 5913 return 0;
5874} 5914}
5875 5915
@@ -5962,41 +6002,261 @@ static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5962} 6002}
5963#endif 6003#endif
5964 6004
6005/*
6006 * DO NOT USE IN NEW FILES.
6007 *
6008 * "cgroup.event_control" implementation.
6009 *
6010 * This is way over-engineered. It tries to support fully configurable
6011 * events for each user. Such level of flexibility is completely
6012 * unnecessary especially in the light of the planned unified hierarchy.
6013 *
6014 * Please deprecate this and replace with something simpler if at all
6015 * possible.
6016 */
6017
6018/*
6019 * Unregister event and free resources.
6020 *
6021 * Gets called from workqueue.
6022 */
6023static void memcg_event_remove(struct work_struct *work)
6024{
6025 struct mem_cgroup_event *event =
6026 container_of(work, struct mem_cgroup_event, remove);
6027 struct mem_cgroup *memcg = event->memcg;
6028
6029 remove_wait_queue(event->wqh, &event->wait);
6030
6031 event->unregister_event(memcg, event->eventfd);
6032
6033 /* Notify userspace the event is going away. */
6034 eventfd_signal(event->eventfd, 1);
6035
6036 eventfd_ctx_put(event->eventfd);
6037 kfree(event);
6038 css_put(&memcg->css);
6039}
6040
6041/*
6042 * Gets called on POLLHUP on eventfd when user closes it.
6043 *
6044 * Called with wqh->lock held and interrupts disabled.
6045 */
6046static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
6047 int sync, void *key)
6048{
6049 struct mem_cgroup_event *event =
6050 container_of(wait, struct mem_cgroup_event, wait);
6051 struct mem_cgroup *memcg = event->memcg;
6052 unsigned long flags = (unsigned long)key;
6053
6054 if (flags & POLLHUP) {
6055 /*
6056 * If the event has been detached at cgroup removal, we
6057 * can simply return knowing the other side will cleanup
6058 * for us.
6059 *
6060 * We can't race against event freeing since the other
6061 * side will require wqh->lock via remove_wait_queue(),
6062 * which we hold.
6063 */
6064 spin_lock(&memcg->event_list_lock);
6065 if (!list_empty(&event->list)) {
6066 list_del_init(&event->list);
6067 /*
6068 * We are in atomic context, but cgroup_event_remove()
6069 * may sleep, so we have to call it in workqueue.
6070 */
6071 schedule_work(&event->remove);
6072 }
6073 spin_unlock(&memcg->event_list_lock);
6074 }
6075
6076 return 0;
6077}
6078
6079static void memcg_event_ptable_queue_proc(struct file *file,
6080 wait_queue_head_t *wqh, poll_table *pt)
6081{
6082 struct mem_cgroup_event *event =
6083 container_of(pt, struct mem_cgroup_event, pt);
6084
6085 event->wqh = wqh;
6086 add_wait_queue(wqh, &event->wait);
6087}
6088
6089/*
6090 * DO NOT USE IN NEW FILES.
6091 *
6092 * Parse input and register new cgroup event handler.
6093 *
6094 * Input must be in format '<event_fd> <control_fd> <args>'.
6095 * Interpretation of args is defined by control file implementation.
6096 */
6097static int memcg_write_event_control(struct cgroup_subsys_state *css,
6098 struct cftype *cft, const char *buffer)
6099{
6100 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6101 struct mem_cgroup_event *event;
6102 struct cgroup_subsys_state *cfile_css;
6103 unsigned int efd, cfd;
6104 struct fd efile;
6105 struct fd cfile;
6106 const char *name;
6107 char *endp;
6108 int ret;
6109
6110 efd = simple_strtoul(buffer, &endp, 10);
6111 if (*endp != ' ')
6112 return -EINVAL;
6113 buffer = endp + 1;
6114
6115 cfd = simple_strtoul(buffer, &endp, 10);
6116 if ((*endp != ' ') && (*endp != '\0'))
6117 return -EINVAL;
6118 buffer = endp + 1;
6119
6120 event = kzalloc(sizeof(*event), GFP_KERNEL);
6121 if (!event)
6122 return -ENOMEM;
6123
6124 event->memcg = memcg;
6125 INIT_LIST_HEAD(&event->list);
6126 init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
6127 init_waitqueue_func_entry(&event->wait, memcg_event_wake);
6128 INIT_WORK(&event->remove, memcg_event_remove);
6129
6130 efile = fdget(efd);
6131 if (!efile.file) {
6132 ret = -EBADF;
6133 goto out_kfree;
6134 }
6135
6136 event->eventfd = eventfd_ctx_fileget(efile.file);
6137 if (IS_ERR(event->eventfd)) {
6138 ret = PTR_ERR(event->eventfd);
6139 goto out_put_efile;
6140 }
6141
6142 cfile = fdget(cfd);
6143 if (!cfile.file) {
6144 ret = -EBADF;
6145 goto out_put_eventfd;
6146 }
6147
6148 /* the process need read permission on control file */
6149 /* AV: shouldn't we check that it's been opened for read instead? */
6150 ret = inode_permission(file_inode(cfile.file), MAY_READ);
6151 if (ret < 0)
6152 goto out_put_cfile;
6153
6154 /*
6155 * Determine the event callbacks and set them in @event. This used
6156 * to be done via struct cftype but cgroup core no longer knows
6157 * about these events. The following is crude but the whole thing
6158 * is for compatibility anyway.
6159 *
6160 * DO NOT ADD NEW FILES.
6161 */
6162 name = cfile.file->f_dentry->d_name.name;
6163
6164 if (!strcmp(name, "memory.usage_in_bytes")) {
6165 event->register_event = mem_cgroup_usage_register_event;
6166 event->unregister_event = mem_cgroup_usage_unregister_event;
6167 } else if (!strcmp(name, "memory.oom_control")) {
6168 event->register_event = mem_cgroup_oom_register_event;
6169 event->unregister_event = mem_cgroup_oom_unregister_event;
6170 } else if (!strcmp(name, "memory.pressure_level")) {
6171 event->register_event = vmpressure_register_event;
6172 event->unregister_event = vmpressure_unregister_event;
6173 } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
6174 event->register_event = memsw_cgroup_usage_register_event;
6175 event->unregister_event = memsw_cgroup_usage_unregister_event;
6176 } else {
6177 ret = -EINVAL;
6178 goto out_put_cfile;
6179 }
6180
6181 /*
6182 * Verify @cfile should belong to @css. Also, remaining events are
6183 * automatically removed on cgroup destruction but the removal is
6184 * asynchronous, so take an extra ref on @css.
6185 */
6186 rcu_read_lock();
6187
6188 ret = -EINVAL;
6189 cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
6190 &mem_cgroup_subsys);
6191 if (cfile_css == css && css_tryget(css))
6192 ret = 0;
6193
6194 rcu_read_unlock();
6195 if (ret)
6196 goto out_put_cfile;
6197
6198 ret = event->register_event(memcg, event->eventfd, buffer);
6199 if (ret)
6200 goto out_put_css;
6201
6202 efile.file->f_op->poll(efile.file, &event->pt);
6203
6204 spin_lock(&memcg->event_list_lock);
6205 list_add(&event->list, &memcg->event_list);
6206 spin_unlock(&memcg->event_list_lock);
6207
6208 fdput(cfile);
6209 fdput(efile);
6210
6211 return 0;
6212
6213out_put_css:
6214 css_put(css);
6215out_put_cfile:
6216 fdput(cfile);
6217out_put_eventfd:
6218 eventfd_ctx_put(event->eventfd);
6219out_put_efile:
6220 fdput(efile);
6221out_kfree:
6222 kfree(event);
6223
6224 return ret;
6225}
6226
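
Taken together with the callbacks selected above, the legacy flow from userspace is: create an eventfd, open the control file of interest, and write "<event_fd> <control_fd> <args>" to cgroup.event_control; the kernel then signals the eventfd via eventfd_signal(). Below is a hedged, self-contained sketch for OOM notifications, assuming a hypothetical v1 memory cgroup directory passed as argv[1]; for memory.usage_in_bytes the args field would instead carry a usage threshold in bytes.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char path[4096], line[64];
	uint64_t count;
	int efd, cfd, ctlfd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <memcg directory>\n", argv[0]);
		return 1;
	}

	efd = eventfd(0, 0);

	snprintf(path, sizeof(path), "%s/memory.oom_control", argv[1]);
	cfd = open(path, O_RDONLY);

	snprintf(path, sizeof(path), "%s/cgroup.event_control", argv[1]);
	ctlfd = open(path, O_WRONLY);

	if (efd < 0 || cfd < 0 || ctlfd < 0) {
		perror("setup");
		return 1;
	}

	/* "<event_fd> <control_fd> <args>"; oom_control takes no args. */
	snprintf(line, sizeof(line), "%d %d", efd, cfd);
	if (write(ctlfd, line, strlen(line)) < 0) {
		perror("register");
		return 1;
	}

	/* Blocks until the kernel calls eventfd_signal() on an OOM event. */
	if (read(efd, &count, sizeof(count)) == sizeof(count))
		printf("memcg OOM notification received (count=%llu)\n",
		       (unsigned long long)count);
	return 0;
}
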
5965static struct cftype mem_cgroup_files[] = { 6227static struct cftype mem_cgroup_files[] = {
5966 { 6228 {
5967 .name = "usage_in_bytes", 6229 .name = "usage_in_bytes",
5968 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 6230 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
5969 .read = mem_cgroup_read, 6231 .read_u64 = mem_cgroup_read_u64,
5970 .register_event = mem_cgroup_usage_register_event,
5971 .unregister_event = mem_cgroup_usage_unregister_event,
5972 }, 6232 },
5973 { 6233 {
5974 .name = "max_usage_in_bytes", 6234 .name = "max_usage_in_bytes",
5975 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 6235 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
5976 .trigger = mem_cgroup_reset, 6236 .trigger = mem_cgroup_reset,
5977 .read = mem_cgroup_read, 6237 .read_u64 = mem_cgroup_read_u64,
5978 }, 6238 },
5979 { 6239 {
5980 .name = "limit_in_bytes", 6240 .name = "limit_in_bytes",
5981 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 6241 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
5982 .write_string = mem_cgroup_write, 6242 .write_string = mem_cgroup_write,
5983 .read = mem_cgroup_read, 6243 .read_u64 = mem_cgroup_read_u64,
5984 }, 6244 },
5985 { 6245 {
5986 .name = "soft_limit_in_bytes", 6246 .name = "soft_limit_in_bytes",
5987 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6247 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
5988 .write_string = mem_cgroup_write, 6248 .write_string = mem_cgroup_write,
5989 .read = mem_cgroup_read, 6249 .read_u64 = mem_cgroup_read_u64,
5990 }, 6250 },
5991 { 6251 {
5992 .name = "failcnt", 6252 .name = "failcnt",
5993 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6253 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
5994 .trigger = mem_cgroup_reset, 6254 .trigger = mem_cgroup_reset,
5995 .read = mem_cgroup_read, 6255 .read_u64 = mem_cgroup_read_u64,
5996 }, 6256 },
5997 { 6257 {
5998 .name = "stat", 6258 .name = "stat",
5999 .read_seq_string = memcg_stat_show, 6259 .seq_show = memcg_stat_show,
6000 }, 6260 },
6001 { 6261 {
6002 .name = "force_empty", 6262 .name = "force_empty",
@@ -6009,6 +6269,12 @@ static struct cftype mem_cgroup_files[] = {
6009 .read_u64 = mem_cgroup_hierarchy_read, 6269 .read_u64 = mem_cgroup_hierarchy_read,
6010 }, 6270 },
6011 { 6271 {
6272 .name = "cgroup.event_control", /* XXX: for compat */
6273 .write_string = memcg_write_event_control,
6274 .flags = CFTYPE_NO_PREFIX,
6275 .mode = S_IWUGO,
6276 },
6277 {
6012 .name = "swappiness", 6278 .name = "swappiness",
6013 .read_u64 = mem_cgroup_swappiness_read, 6279 .read_u64 = mem_cgroup_swappiness_read,
6014 .write_u64 = mem_cgroup_swappiness_write, 6280 .write_u64 = mem_cgroup_swappiness_write,
@@ -6020,21 +6286,17 @@ static struct cftype mem_cgroup_files[] = {
6020 }, 6286 },
6021 { 6287 {
6022 .name = "oom_control", 6288 .name = "oom_control",
6023 .read_map = mem_cgroup_oom_control_read, 6289 .seq_show = mem_cgroup_oom_control_read,
6024 .write_u64 = mem_cgroup_oom_control_write, 6290 .write_u64 = mem_cgroup_oom_control_write,
6025 .register_event = mem_cgroup_oom_register_event,
6026 .unregister_event = mem_cgroup_oom_unregister_event,
6027 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6291 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
6028 }, 6292 },
6029 { 6293 {
6030 .name = "pressure_level", 6294 .name = "pressure_level",
6031 .register_event = vmpressure_register_event,
6032 .unregister_event = vmpressure_unregister_event,
6033 }, 6295 },
6034#ifdef CONFIG_NUMA 6296#ifdef CONFIG_NUMA
6035 { 6297 {
6036 .name = "numa_stat", 6298 .name = "numa_stat",
6037 .read_seq_string = memcg_numa_stat_show, 6299 .seq_show = memcg_numa_stat_show,
6038 }, 6300 },
6039#endif 6301#endif
6040#ifdef CONFIG_MEMCG_KMEM 6302#ifdef CONFIG_MEMCG_KMEM
@@ -6042,29 +6304,29 @@ static struct cftype mem_cgroup_files[] = {
6042 .name = "kmem.limit_in_bytes", 6304 .name = "kmem.limit_in_bytes",
6043 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6305 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
6044 .write_string = mem_cgroup_write, 6306 .write_string = mem_cgroup_write,
6045 .read = mem_cgroup_read, 6307 .read_u64 = mem_cgroup_read_u64,
6046 }, 6308 },
6047 { 6309 {
6048 .name = "kmem.usage_in_bytes", 6310 .name = "kmem.usage_in_bytes",
6049 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6311 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
6050 .read = mem_cgroup_read, 6312 .read_u64 = mem_cgroup_read_u64,
6051 }, 6313 },
6052 { 6314 {
6053 .name = "kmem.failcnt", 6315 .name = "kmem.failcnt",
6054 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6316 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
6055 .trigger = mem_cgroup_reset, 6317 .trigger = mem_cgroup_reset,
6056 .read = mem_cgroup_read, 6318 .read_u64 = mem_cgroup_read_u64,
6057 }, 6319 },
6058 { 6320 {
6059 .name = "kmem.max_usage_in_bytes", 6321 .name = "kmem.max_usage_in_bytes",
6060 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6322 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
6061 .trigger = mem_cgroup_reset, 6323 .trigger = mem_cgroup_reset,
6062 .read = mem_cgroup_read, 6324 .read_u64 = mem_cgroup_read_u64,
6063 }, 6325 },
6064#ifdef CONFIG_SLABINFO 6326#ifdef CONFIG_SLABINFO
6065 { 6327 {
6066 .name = "kmem.slabinfo", 6328 .name = "kmem.slabinfo",
6067 .read_seq_string = mem_cgroup_slabinfo_read, 6329 .seq_show = mem_cgroup_slabinfo_read,
6068 }, 6330 },
6069#endif 6331#endif
6070#endif 6332#endif
@@ -6076,27 +6338,25 @@ static struct cftype memsw_cgroup_files[] = {
6076 { 6338 {
6077 .name = "memsw.usage_in_bytes", 6339 .name = "memsw.usage_in_bytes",
6078 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6340 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6079 .read = mem_cgroup_read, 6341 .read_u64 = mem_cgroup_read_u64,
6080 .register_event = mem_cgroup_usage_register_event,
6081 .unregister_event = mem_cgroup_usage_unregister_event,
6082 }, 6342 },
6083 { 6343 {
6084 .name = "memsw.max_usage_in_bytes", 6344 .name = "memsw.max_usage_in_bytes",
6085 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6345 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6086 .trigger = mem_cgroup_reset, 6346 .trigger = mem_cgroup_reset,
6087 .read = mem_cgroup_read, 6347 .read_u64 = mem_cgroup_read_u64,
6088 }, 6348 },
6089 { 6349 {
6090 .name = "memsw.limit_in_bytes", 6350 .name = "memsw.limit_in_bytes",
6091 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6351 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6092 .write_string = mem_cgroup_write, 6352 .write_string = mem_cgroup_write,
6093 .read = mem_cgroup_read, 6353 .read_u64 = mem_cgroup_read_u64,
6094 }, 6354 },
6095 { 6355 {
6096 .name = "memsw.failcnt", 6356 .name = "memsw.failcnt",
6097 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6357 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6098 .trigger = mem_cgroup_reset, 6358 .trigger = mem_cgroup_reset,
6099 .read = mem_cgroup_read, 6359 .read_u64 = mem_cgroup_read_u64,
6100 }, 6360 },
6101 { }, /* terminate */ 6361 { }, /* terminate */
6102}; 6362};
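Both tables above drop the per-file .register_event/.unregister_event hooks (those now flow through cgroup.event_control) and switch their read methods from the untyped .read / .read_map / .read_seq_string callbacks to the typed .read_u64 and .seq_show ones. The sketch below shows the general shape a .read_u64 handler for these res_counter-backed files takes; the MEMFILE_TYPE()/MEMFILE_ATTR() helpers and mem_cgroup_from_css() are assumed from earlier in memcontrol.c, and the patch's actual mem_cgroup_read_u64() additionally special-cases RES_USAGE, so this is illustrative rather than the exact body:

/*
 * Illustrative only: shape of a read_u64 handler backing the
 * *_in_bytes / failcnt entries above. Simplified; the real handler
 * also computes hierarchical usage for RES_USAGE.
 */
static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
			       struct cftype *cft)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
	int name = MEMFILE_ATTR(cft->private);

	switch (MEMFILE_TYPE(cft->private)) {
	case _MEM:
		return res_counter_read_u64(&memcg->res, name);
	case _MEMSWAP:
		return res_counter_read_u64(&memcg->memsw, name);
	case _KMEM:
		return res_counter_read_u64(&memcg->kmem, name);
	default:
		BUG();
	}
}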
@@ -6139,14 +6399,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
6139static struct mem_cgroup *mem_cgroup_alloc(void) 6399static struct mem_cgroup *mem_cgroup_alloc(void)
6140{ 6400{
6141 struct mem_cgroup *memcg; 6401 struct mem_cgroup *memcg;
6142 size_t size = memcg_size(); 6402 size_t size;
6143 6403
6144 /* Can be very big if nr_node_ids is very big */ 6404 size = sizeof(struct mem_cgroup);
6145 if (size < PAGE_SIZE) 6405 size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
6146 memcg = kzalloc(size, GFP_KERNEL);
6147 else
6148 memcg = vzalloc(size);
6149 6406
6407 memcg = kzalloc(size, GFP_KERNEL);
6150 if (!memcg) 6408 if (!memcg)
6151 return NULL; 6409 return NULL;
6152 6410
@@ -6157,10 +6415,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
6157 return memcg; 6415 return memcg;
6158 6416
6159out_free: 6417out_free:
6160 if (size < PAGE_SIZE) 6418 kfree(memcg);
6161 kfree(memcg);
6162 else
6163 vfree(memcg);
6164 return NULL; 6419 return NULL;
6165} 6420}
6166 6421
@@ -6178,7 +6433,6 @@ out_free:
6178static void __mem_cgroup_free(struct mem_cgroup *memcg) 6433static void __mem_cgroup_free(struct mem_cgroup *memcg)
6179{ 6434{
6180 int node; 6435 int node;
6181 size_t size = memcg_size();
6182 6436
6183 mem_cgroup_remove_from_trees(memcg); 6437 mem_cgroup_remove_from_trees(memcg);
6184 6438
@@ -6199,10 +6453,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
6199 * the cgroup_lock. 6453 * the cgroup_lock.
6200 */ 6454 */
6201 disarm_static_keys(memcg); 6455 disarm_static_keys(memcg);
6202 if (size < PAGE_SIZE) 6456 kfree(memcg);
6203 kfree(memcg);
6204 else
6205 vfree(memcg);
6206} 6457}
6207 6458
6208/* 6459/*
@@ -6268,6 +6519,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6268 mutex_init(&memcg->thresholds_lock); 6519 mutex_init(&memcg->thresholds_lock);
6269 spin_lock_init(&memcg->move_lock); 6520 spin_lock_init(&memcg->move_lock);
6270 vmpressure_init(&memcg->vmpressure); 6521 vmpressure_init(&memcg->vmpressure);
6522 INIT_LIST_HEAD(&memcg->event_list);
6523 spin_lock_init(&memcg->event_list_lock);
6271 6524
6272 return &memcg->css; 6525 return &memcg->css;
6273 6526
@@ -6281,7 +6534,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6281{ 6534{
6282 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6535 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6283 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); 6536 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6284 int error = 0;
6285 6537
6286 if (css->cgroup->id > MEM_CGROUP_ID_MAX) 6538 if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6287 return -ENOSPC; 6539 return -ENOSPC;
@@ -6316,10 +6568,9 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
6316 if (parent != root_mem_cgroup) 6568 if (parent != root_mem_cgroup)
6317 mem_cgroup_subsys.broken_hierarchy = true; 6569 mem_cgroup_subsys.broken_hierarchy = true;
6318 } 6570 }
6319
6320 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
6321 mutex_unlock(&memcg_create_mutex); 6571 mutex_unlock(&memcg_create_mutex);
6322 return error; 6572
6573 return memcg_init_kmem(memcg, &mem_cgroup_subsys);
6323} 6574}
6324 6575
6325/* 6576/*
@@ -6343,11 +6594,32 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg)
6343static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6594static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
6344{ 6595{
6345 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6596 struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6597 struct mem_cgroup_event *event, *tmp;
6598 struct cgroup_subsys_state *iter;
6599
6600 /*
6601 * Unregister events and notify userspace.
6602	 * Notify userspace about cgroup removal only after rmdir of the cgroup
6603	 * directory, to avoid a race between userspace and the kernel.
6604 */
6605 spin_lock(&memcg->event_list_lock);
6606 list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
6607 list_del_init(&event->list);
6608 schedule_work(&event->remove);
6609 }
6610 spin_unlock(&memcg->event_list_lock);
6346 6611
6347 kmem_cgroup_css_offline(memcg); 6612 kmem_cgroup_css_offline(memcg);
6348 6613
6349 mem_cgroup_invalidate_reclaim_iterators(memcg); 6614 mem_cgroup_invalidate_reclaim_iterators(memcg);
6350 mem_cgroup_reparent_charges(memcg); 6615
6616 /*
6617 * This requires that offlining is serialized. Right now that is
6618 * guaranteed because css_killed_work_fn() holds the cgroup_mutex.
6619 */
6620 css_for_each_descendant_post(iter, css)
6621 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter));
6622
6351 mem_cgroup_destroy_all_caches(memcg); 6623 mem_cgroup_destroy_all_caches(memcg);
6352 vmpressure_cleanup(&memcg->vmpressure); 6624 vmpressure_cleanup(&memcg->vmpressure);
6353} 6625}
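Each event detached in the offline path has its &event->remove work scheduled rather than being torn down under event_list_lock, since the unregister callbacks may sleep. A hedged sketch of the teardown that work item is expected to perform, following the struct mem_cgroup_event fields introduced by this patch (the handler itself is defined earlier in the patch, outside this excerpt, so the exact body here is an assumption):

/*
 * Assumed shape of the per-event remove work: detach from the
 * eventfd's wait queue, run the unregister callback, signal userspace
 * once so it can notice the teardown, then drop the references taken
 * at registration time.
 */
static void memcg_event_remove(struct work_struct *work)
{
	struct mem_cgroup_event *event =
		container_of(work, struct mem_cgroup_event, remove);
	struct mem_cgroup *memcg = event->memcg;

	remove_wait_queue(event->wqh, &event->wait);

	event->unregister_event(memcg, event->eventfd);

	/* Notify userspace that the event is going away. */
	eventfd_signal(event->eventfd, 1);

	eventfd_ctx_put(event->eventfd);
	kfree(event);
	css_put(&memcg->css);
}

The descendant walk that replaces the single mem_cgroup_reparent_charges() call reparents charges bottom-up (post-order), which is why the comment above insists that offlining stay serialized.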
@@ -6615,7 +6887,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
6615 enum mc_target_type ret = MC_TARGET_NONE; 6887 enum mc_target_type ret = MC_TARGET_NONE;
6616 6888
6617 page = pmd_page(pmd); 6889 page = pmd_page(pmd);
6618 VM_BUG_ON(!page || !PageHead(page)); 6890 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
6619 if (!move_anon()) 6891 if (!move_anon())
6620 return ret; 6892 return ret;
6621 pc = lookup_page_cgroup(page); 6893 pc = lookup_page_cgroup(page);