Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 13
-rw-r--r-- | mm/hugetlb.c | 11
-rw-r--r-- | mm/hugetlb_cgroup.c | 19
-rw-r--r-- | mm/kmemleak.c | 3
-rw-r--r-- | mm/memcontrol.c | 1242
-rw-r--r-- | mm/memory_hotplug.c | 18
-rw-r--r-- | mm/mprotect.c | 30
-rw-r--r-- | mm/page_alloc.c | 38
-rw-r--r-- | mm/slab.c | 94
-rw-r--r-- | mm/slab.h | 137
-rw-r--r-- | mm/slab_common.c | 118
-rw-r--r-- | mm/slob.c | 2
-rw-r--r-- | mm/slub.c | 150
-rw-r--r-- | mm/vmscan.c | 14
14 files changed, 1740 insertions, 149 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 71259e052ce8..278e3ab1f169 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -149,7 +149,18 @@ config MOVABLE_NODE | |||
149 | depends on NO_BOOTMEM | 149 | depends on NO_BOOTMEM |
150 | depends on X86_64 | 150 | depends on X86_64 |
151 | depends on NUMA | 151 | depends on NUMA |
152 | depends on BROKEN | 152 | default n |
153 | help | ||
154 | Allow a node to have only movable memory. Pages used by the kernel, | ||
155 | such as direct mapping pages cannot be migrated. So the corresponding | ||
156 | memory device cannot be hotplugged. This option allows users to | ||
157 | online all the memory of a node as movable memory so that the whole | ||
158 | node can be hotplugged. Users who don't use the memory hotplug | ||
159 | feature are fine with this option on since they don't online memory | ||
160 | as movable. | ||
161 | |||
162 | Say Y here if you want to hotplug a whole node. | ||
163 | Say N here if you want kernel to use memory on all nodes evenly. | ||
153 | 164 | ||
154 | # eventually, we can have this option just 'select SPARSEMEM' | 165 | # eventually, we can have this option just 'select SPARSEMEM' |
155 | config MEMORY_HOTPLUG | 166 | config MEMORY_HOTPLUG |
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e5318c7793ae..4f3ea0b1e57c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1906,14 +1906,12 @@ static int __init hugetlb_init(void) | |||
1906 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; | 1906 | default_hstate.max_huge_pages = default_hstate_max_huge_pages; |
1907 | 1907 | ||
1908 | hugetlb_init_hstates(); | 1908 | hugetlb_init_hstates(); |
1909 | |||
1910 | gather_bootmem_prealloc(); | 1909 | gather_bootmem_prealloc(); |
1911 | |||
1912 | report_hugepages(); | 1910 | report_hugepages(); |
1913 | 1911 | ||
1914 | hugetlb_sysfs_init(); | 1912 | hugetlb_sysfs_init(); |
1915 | |||
1916 | hugetlb_register_all_nodes(); | 1913 | hugetlb_register_all_nodes(); |
1914 | hugetlb_cgroup_file_init(); | ||
1917 | 1915 | ||
1918 | return 0; | 1916 | return 0; |
1919 | } | 1917 | } |
@@ -1943,13 +1941,6 @@ void __init hugetlb_add_hstate(unsigned order) | |||
1943 | h->next_nid_to_free = first_node(node_states[N_MEMORY]); | 1941 | h->next_nid_to_free = first_node(node_states[N_MEMORY]); |
1944 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", | 1942 | snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", |
1945 | huge_page_size(h)/1024); | 1943 | huge_page_size(h)/1024); |
1946 | /* | ||
1947 | * Add cgroup control files only if the huge page consists | ||
1948 | * of more than two normal pages. This is because we use | ||
1949 | * page[2].lru.next for storing cgoup details. | ||
1950 | */ | ||
1951 | if (order >= HUGETLB_CGROUP_MIN_ORDER) | ||
1952 | hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); | ||
1953 | 1944 | ||
1954 | parsed_hstate = h; | 1945 | parsed_hstate = h; |
1955 | } | 1946 | } |
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index b5bde7a5c017..9cea7de22ffb 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -333,7 +333,7 @@ static char *mem_fmt(char *buf, int size, unsigned long hsize) | |||
333 | return buf; | 333 | return buf; |
334 | } | 334 | } |
335 | 335 | ||
336 | int __init hugetlb_cgroup_file_init(int idx) | 336 | static void __init __hugetlb_cgroup_file_init(int idx) |
337 | { | 337 | { |
338 | char buf[32]; | 338 | char buf[32]; |
339 | struct cftype *cft; | 339 | struct cftype *cft; |
@@ -375,7 +375,22 @@ int __init hugetlb_cgroup_file_init(int idx) | |||
375 | 375 | ||
376 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); | 376 | WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); |
377 | 377 | ||
378 | return 0; | 378 | return; |
379 | } | ||
380 | |||
381 | void __init hugetlb_cgroup_file_init(void) | ||
382 | { | ||
383 | struct hstate *h; | ||
384 | |||
385 | for_each_hstate(h) { | ||
386 | /* | ||
387 | * Add cgroup control files only if the huge page consists | ||
388 | * of more than two normal pages. This is because we use | ||
389 | * page[2].lru.next for storing cgroup details. | ||
390 | */ | ||
391 | if (huge_page_order(h) >= HUGETLB_CGROUP_MIN_ORDER) | ||
392 | __hugetlb_cgroup_file_init(hstate_index(h)); | ||
393 | } | ||
379 | } | 394 | } |
380 | 395 | ||
381 | /* | 396 | /* |
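Note: the HUGETLB_CGROUP_MIN_ORDER check kept in the loop above exists because hugetlb_cgroup stashes its pointer in the third sub-page of the compound page, so only huge pages of order 2 or more have room for it. A rough sketch of the header-side helpers this relies on (paraphrased from include/linux/hugetlb_cgroup.h of this era; treat the exact bodies as an approximation):

/* smallest compound page that still has page[2] free for cgroup data */
#define HUGETLB_CGROUP_MIN_ORDER	2

static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page)
{
	VM_BUG_ON(!PageHuge(page));

	if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER)
		return NULL;
	return (struct hugetlb_cgroup *)page[2].lru.next;
}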
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index a217cc544060..752a705c77c2 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1556,7 +1556,8 @@ static int dump_str_object_info(const char *str) | |||
1556 | struct kmemleak_object *object; | 1556 | struct kmemleak_object *object; |
1557 | unsigned long addr; | 1557 | unsigned long addr; |
1558 | 1558 | ||
1559 | addr= simple_strtoul(str, NULL, 0); | 1559 | if (kstrtoul(str, 0, &addr)) |
1560 | return -EINVAL; | ||
1560 | object = find_and_get_object(addr, 0); | 1561 | object = find_and_get_object(addr, 0); |
1561 | if (!object) { | 1562 | if (!object) { |
1562 | pr_info("Unknown object at 0x%08lx\n", addr); | 1563 | pr_info("Unknown object at 0x%08lx\n", addr); |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bbfac5063ca8..f3009b4bae51 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -10,6 +10,10 @@ | |||
10 | * Copyright (C) 2009 Nokia Corporation | 10 | * Copyright (C) 2009 Nokia Corporation |
11 | * Author: Kirill A. Shutemov | 11 | * Author: Kirill A. Shutemov |
12 | * | 12 | * |
13 | * Kernel Memory Controller | ||
14 | * Copyright (C) 2012 Parallels Inc. and Google Inc. | ||
15 | * Authors: Glauber Costa and Suleiman Souhlal | ||
16 | * | ||
13 | * This program is free software; you can redistribute it and/or modify | 17 | * This program is free software; you can redistribute it and/or modify |
14 | * it under the terms of the GNU General Public License as published by | 18 | * it under the terms of the GNU General Public License as published by |
15 | * the Free Software Foundation; either version 2 of the License, or | 19 | * the Free Software Foundation; either version 2 of the License, or |
@@ -268,6 +272,10 @@ struct mem_cgroup { | |||
268 | }; | 272 | }; |
269 | 273 | ||
270 | /* | 274 | /* |
275 | * the counter to account for kernel memory usage. | ||
276 | */ | ||
277 | struct res_counter kmem; | ||
278 | /* | ||
271 | * Per cgroup active and inactive list, similar to the | 279 | * Per cgroup active and inactive list, similar to the |
272 | * per zone LRU lists. | 280 | * per zone LRU lists. |
273 | */ | 281 | */ |
@@ -282,6 +290,7 @@ struct mem_cgroup { | |||
282 | * Should the accounting and control be hierarchical, per subtree? | 290 | * Should the accounting and control be hierarchical, per subtree? |
283 | */ | 291 | */ |
284 | bool use_hierarchy; | 292 | bool use_hierarchy; |
293 | unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */ | ||
285 | 294 | ||
286 | bool oom_lock; | 295 | bool oom_lock; |
287 | atomic_t under_oom; | 296 | atomic_t under_oom; |
@@ -332,8 +341,61 @@ struct mem_cgroup { | |||
332 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) | 341 | #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) |
333 | struct tcp_memcontrol tcp_mem; | 342 | struct tcp_memcontrol tcp_mem; |
334 | #endif | 343 | #endif |
344 | #if defined(CONFIG_MEMCG_KMEM) | ||
345 | /* analogous to slab_common's slab_caches list. per-memcg */ | ||
346 | struct list_head memcg_slab_caches; | ||
347 | /* Not a spinlock, we can take a lot of time walking the list */ | ||
348 | struct mutex slab_caches_mutex; | ||
349 | /* Index in the kmem_cache->memcg_params->memcg_caches array */ | ||
350 | int kmemcg_id; | ||
351 | #endif | ||
335 | }; | 352 | }; |
336 | 353 | ||
354 | /* internal only representation about the status of kmem accounting. */ | ||
355 | enum { | ||
356 | KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */ | ||
357 | KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */ | ||
358 | KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */ | ||
359 | }; | ||
360 | |||
361 | /* We account when limit is on, but only after call sites are patched */ | ||
362 | #define KMEM_ACCOUNTED_MASK \ | ||
363 | ((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED)) | ||
364 | |||
365 | #ifdef CONFIG_MEMCG_KMEM | ||
366 | static inline void memcg_kmem_set_active(struct mem_cgroup *memcg) | ||
367 | { | ||
368 | set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | ||
369 | } | ||
370 | |||
371 | static bool memcg_kmem_is_active(struct mem_cgroup *memcg) | ||
372 | { | ||
373 | return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags); | ||
374 | } | ||
375 | |||
376 | static void memcg_kmem_set_activated(struct mem_cgroup *memcg) | ||
377 | { | ||
378 | set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); | ||
379 | } | ||
380 | |||
381 | static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) | ||
382 | { | ||
383 | clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags); | ||
384 | } | ||
385 | |||
386 | static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) | ||
387 | { | ||
388 | if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) | ||
389 | set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); | ||
390 | } | ||
391 | |||
392 | static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg) | ||
393 | { | ||
394 | return test_and_clear_bit(KMEM_ACCOUNTED_DEAD, | ||
395 | &memcg->kmem_account_flags); | ||
396 | } | ||
397 | #endif | ||
398 | |||
337 | /* Stuffs for move charges at task migration. */ | 399 | /* Stuffs for move charges at task migration. */ |
338 | /* | 400 | /* |
339 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a | 401 | * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a |
@@ -388,9 +450,13 @@ enum charge_type { | |||
388 | }; | 450 | }; |
389 | 451 | ||
390 | /* for encoding cft->private value on file */ | 452 | /* for encoding cft->private value on file */ |
391 | #define _MEM (0) | 453 | enum res_type { |
392 | #define _MEMSWAP (1) | 454 | _MEM, |
393 | #define _OOM_TYPE (2) | 455 | _MEMSWAP, |
456 | _OOM_TYPE, | ||
457 | _KMEM, | ||
458 | }; | ||
459 | |||
394 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) | 460 | #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) |
395 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) | 461 | #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) |
396 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 462 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
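Note: the new _KMEM entry reuses the existing cft->private encoding, with the resource type in the upper 16 bits and the RES_* attribute in the lower 16. An illustrative round-trip (the actual kmem cftype entries are added later in this patch and are not shown in this excerpt):

/* illustration only: how a kmem limit file's private value would round-trip */
int priv = MEMFILE_PRIVATE(_KMEM, RES_LIMIT);	/* (_KMEM << 16) | RES_LIMIT */

BUG_ON(MEMFILE_TYPE(priv) != _KMEM);		/* upper 16 bits */
BUG_ON(MEMFILE_ATTR(priv) != RES_LIMIT);	/* lower 16 bits */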
@@ -487,6 +553,75 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) | |||
487 | } | 553 | } |
488 | #endif | 554 | #endif |
489 | 555 | ||
556 | #ifdef CONFIG_MEMCG_KMEM | ||
557 | /* | ||
558 | * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. | ||
559 | * There are two main reasons for not using the css_id for this: | ||
560 | * 1) this works better in sparse environments, where we have a lot of memcgs, | ||
561 | * but only a few kmem-limited. Or also, if we have, for instance, 200 | ||
562 | * memcgs, and none but the 200th is kmem-limited, we'd have to have a | ||
563 | * 200 entry array for that. | ||
564 | * | ||
565 | * 2) In order not to violate the cgroup API, we would like to do all memory | ||
566 | * allocation in ->create(). At that point, we haven't yet allocated the | ||
567 | * css_id. Having a separate index prevents us from messing with the cgroup | ||
568 | * core for this | ||
569 | * | ||
570 | * The current size of the caches array is stored in | ||
571 | * memcg_limited_groups_array_size. It will double each time we have to | ||
572 | * increase it. | ||
573 | */ | ||
574 | static DEFINE_IDA(kmem_limited_groups); | ||
575 | int memcg_limited_groups_array_size; | ||
576 | |||
577 | /* | ||
578 | * MIN_SIZE is different than 1, because we would like to avoid going through | ||
579 | * the alloc/free process all the time. In a small machine, 4 kmem-limited | ||
580 | * cgroups is a reasonable guess. In the future, it could be a parameter or | ||
581 | * tunable, but that is strictly not necessary. | ||
582 | * | ||
583 | * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get | ||
584 | * this constant directly from cgroup, but it is understandable that this is | ||
585 | * better kept as an internal representation in cgroup.c. In any case, the | ||
586 | * css_id space is not getting any smaller, and we don't have to necessarily | ||
587 | * increase ours as well if it increases. | ||
588 | */ | ||
589 | #define MEMCG_CACHES_MIN_SIZE 4 | ||
590 | #define MEMCG_CACHES_MAX_SIZE 65535 | ||
591 | |||
592 | /* | ||
593 | * A lot of the calls to the cache allocation functions are expected to be | ||
594 | * inlined by the compiler. Since the calls to memcg_kmem_get_cache are | ||
595 | * conditional to this static branch, we'll have to allow modules that does | ||
596 | * kmem_cache_alloc and the such to see this symbol as well | ||
597 | */ | ||
598 | struct static_key memcg_kmem_enabled_key; | ||
599 | EXPORT_SYMBOL(memcg_kmem_enabled_key); | ||
600 | |||
601 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
602 | { | ||
603 | if (memcg_kmem_is_active(memcg)) { | ||
604 | static_key_slow_dec(&memcg_kmem_enabled_key); | ||
605 | ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id); | ||
606 | } | ||
607 | /* | ||
608 | * This check can't live in kmem destruction function, | ||
609 | * since the charges will outlive the cgroup | ||
610 | */ | ||
611 | WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0); | ||
612 | } | ||
613 | #else | ||
614 | static void disarm_kmem_keys(struct mem_cgroup *memcg) | ||
615 | { | ||
616 | } | ||
617 | #endif /* CONFIG_MEMCG_KMEM */ | ||
618 | |||
619 | static void disarm_static_keys(struct mem_cgroup *memcg) | ||
620 | { | ||
621 | disarm_sock_keys(memcg); | ||
622 | disarm_kmem_keys(memcg); | ||
623 | } | ||
624 | |||
490 | static void drain_all_stock_async(struct mem_cgroup *memcg); | 625 | static void drain_all_stock_async(struct mem_cgroup *memcg); |
491 | 626 | ||
492 | static struct mem_cgroup_per_zone * | 627 | static struct mem_cgroup_per_zone * |
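Note: the EXPORT_SYMBOL of memcg_kmem_enabled_key above is what lets the inlined allocator entry points in modules test the key. The header-side counterpart lives in include/linux/memcontrol.h in this series and is essentially a static-branch wrapper; sketched here from the surrounding context, so verify against the actual header:

extern struct static_key memcg_kmem_enabled_key;

static inline bool memcg_kmem_enabled(void)
{
	/* becomes a patched jump only after some cgroup sets a kmem limit */
	return static_key_false(&memcg_kmem_enabled_key);
}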
@@ -1453,6 +1588,10 @@ done: | |||
1453 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, | 1588 | res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, |
1454 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, | 1589 | res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, |
1455 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); | 1590 | res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); |
1591 | printk(KERN_INFO "kmem: usage %llukB, limit %llukB, failcnt %llu\n", | ||
1592 | res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, | ||
1593 | res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, | ||
1594 | res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); | ||
1456 | } | 1595 | } |
1457 | 1596 | ||
1458 | /* | 1597 | /* |
@@ -2060,20 +2199,28 @@ struct memcg_stock_pcp { | |||
2060 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | 2199 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); |
2061 | static DEFINE_MUTEX(percpu_charge_mutex); | 2200 | static DEFINE_MUTEX(percpu_charge_mutex); |
2062 | 2201 | ||
2063 | /* | 2202 | /** |
2064 | * Try to consume stocked charge on this cpu. If success, one page is consumed | 2203 | * consume_stock: Try to consume stocked charge on this cpu. |
2065 | * from local stock and true is returned. If the stock is 0 or charges from a | 2204 | * @memcg: memcg to consume from. |
2066 | * cgroup which is not current target, returns false. This stock will be | 2205 | * @nr_pages: how many pages to charge. |
2067 | * refilled. | 2206 | * |
2207 | * The charges will only happen if @memcg matches the current cpu's memcg | ||
2208 | * stock, and at least @nr_pages are available in that stock. Failure to | ||
2209 | * service an allocation will refill the stock. | ||
2210 | * | ||
2211 | * returns true if successful, false otherwise. | ||
2068 | */ | 2212 | */ |
2069 | static bool consume_stock(struct mem_cgroup *memcg) | 2213 | static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) |
2070 | { | 2214 | { |
2071 | struct memcg_stock_pcp *stock; | 2215 | struct memcg_stock_pcp *stock; |
2072 | bool ret = true; | 2216 | bool ret = true; |
2073 | 2217 | ||
2218 | if (nr_pages > CHARGE_BATCH) | ||
2219 | return false; | ||
2220 | |||
2074 | stock = &get_cpu_var(memcg_stock); | 2221 | stock = &get_cpu_var(memcg_stock); |
2075 | if (memcg == stock->cached && stock->nr_pages) | 2222 | if (memcg == stock->cached && stock->nr_pages >= nr_pages) |
2076 | stock->nr_pages--; | 2223 | stock->nr_pages -= nr_pages; |
2077 | else /* need to call res_counter_charge */ | 2224 | else /* need to call res_counter_charge */ |
2078 | ret = false; | 2225 | ret = false; |
2079 | put_cpu_var(memcg_stock); | 2226 | put_cpu_var(memcg_stock); |
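Note: with the new nr_pages argument, only requests that fit in a per-cpu stock are ever served from it. CHARGE_BATCH is 32 pages in this file (assumed unchanged by this patch), so for example:

/* illustrative values, assuming CHARGE_BATCH == 32 */
consume_stock(memcg, 1);	/* may be served from the per-cpu stock            */
consume_stock(memcg, 32);	/* served only if the stock holds >= 32 pages      */
consume_stock(memcg, 512);	/* e.g. a THP charge: > CHARGE_BATCH, always false */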
@@ -2250,7 +2397,8 @@ enum { | |||
2250 | }; | 2397 | }; |
2251 | 2398 | ||
2252 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | 2399 | static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, |
2253 | unsigned int nr_pages, bool oom_check) | 2400 | unsigned int nr_pages, unsigned int min_pages, |
2401 | bool oom_check) | ||
2254 | { | 2402 | { |
2255 | unsigned long csize = nr_pages * PAGE_SIZE; | 2403 | unsigned long csize = nr_pages * PAGE_SIZE; |
2256 | struct mem_cgroup *mem_over_limit; | 2404 | struct mem_cgroup *mem_over_limit; |
@@ -2273,18 +2421,18 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2273 | } else | 2421 | } else |
2274 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); | 2422 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); |
2275 | /* | 2423 | /* |
2276 | * nr_pages can be either a huge page (HPAGE_PMD_NR), a batch | ||
2277 | * of regular pages (CHARGE_BATCH), or a single regular page (1). | ||
2278 | * | ||
2279 | * Never reclaim on behalf of optional batching, retry with a | 2424 | * Never reclaim on behalf of optional batching, retry with a |
2280 | * single page instead. | 2425 | * single page instead. |
2281 | */ | 2426 | */ |
2282 | if (nr_pages == CHARGE_BATCH) | 2427 | if (nr_pages > min_pages) |
2283 | return CHARGE_RETRY; | 2428 | return CHARGE_RETRY; |
2284 | 2429 | ||
2285 | if (!(gfp_mask & __GFP_WAIT)) | 2430 | if (!(gfp_mask & __GFP_WAIT)) |
2286 | return CHARGE_WOULDBLOCK; | 2431 | return CHARGE_WOULDBLOCK; |
2287 | 2432 | ||
2433 | if (gfp_mask & __GFP_NORETRY) | ||
2434 | return CHARGE_NOMEM; | ||
2435 | |||
2288 | ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); | 2436 | ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); |
2289 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) | 2437 | if (mem_cgroup_margin(mem_over_limit) >= nr_pages) |
2290 | return CHARGE_RETRY; | 2438 | return CHARGE_RETRY; |
@@ -2297,7 +2445,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
2297 | * unlikely to succeed so close to the limit, and we fall back | 2445 | * unlikely to succeed so close to the limit, and we fall back |
2298 | * to regular pages anyway in case of failure. | 2446 | * to regular pages anyway in case of failure. |
2299 | */ | 2447 | */ |
2300 | if (nr_pages == 1 && ret) | 2448 | if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret) |
2301 | return CHARGE_RETRY; | 2449 | return CHARGE_RETRY; |
2302 | 2450 | ||
2303 | /* | 2451 | /* |
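Note: the retry test above used to be nr_pages == 1; it now retries after productive reclaim for any charge up to 1 << PAGE_ALLOC_COSTLY_ORDER pages, which covers the small kmem charges this series adds while still failing huge requests quickly. With the usual PAGE_ALLOC_COSTLY_ORDER of 3:

/* assuming PAGE_ALLOC_COSTLY_ORDER == 3 */
/*   nr_pages <= 8   -> CHARGE_RETRY when reclaim freed something           */
/*   nr_pages == 512 -> (THP) falls through to the OOM / CHARGE_NOMEM path  */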
@@ -2371,7 +2519,7 @@ again: | |||
2371 | memcg = *ptr; | 2519 | memcg = *ptr; |
2372 | if (mem_cgroup_is_root(memcg)) | 2520 | if (mem_cgroup_is_root(memcg)) |
2373 | goto done; | 2521 | goto done; |
2374 | if (nr_pages == 1 && consume_stock(memcg)) | 2522 | if (consume_stock(memcg, nr_pages)) |
2375 | goto done; | 2523 | goto done; |
2376 | css_get(&memcg->css); | 2524 | css_get(&memcg->css); |
2377 | } else { | 2525 | } else { |
@@ -2396,7 +2544,7 @@ again: | |||
2396 | rcu_read_unlock(); | 2544 | rcu_read_unlock(); |
2397 | goto done; | 2545 | goto done; |
2398 | } | 2546 | } |
2399 | if (nr_pages == 1 && consume_stock(memcg)) { | 2547 | if (consume_stock(memcg, nr_pages)) { |
2400 | /* | 2548 | /* |
2401 | * It seems dagerous to access memcg without css_get(). | 2549 | * It seems dagerous to access memcg without css_get(). |
2402 | * But considering how consume_stok works, it's not | 2550 | * But considering how consume_stok works, it's not |
@@ -2431,7 +2579,8 @@ again: | |||
2431 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; | 2579 | nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; |
2432 | } | 2580 | } |
2433 | 2581 | ||
2434 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); | 2582 | ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages, |
2583 | oom_check); | ||
2435 | switch (ret) { | 2584 | switch (ret) { |
2436 | case CHARGE_OK: | 2585 | case CHARGE_OK: |
2437 | break; | 2586 | break; |
@@ -2624,6 +2773,766 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, | |||
2624 | memcg_check_events(memcg, page); | 2773 | memcg_check_events(memcg, page); |
2625 | } | 2774 | } |
2626 | 2775 | ||
2776 | static DEFINE_MUTEX(set_limit_mutex); | ||
2777 | |||
2778 | #ifdef CONFIG_MEMCG_KMEM | ||
2779 | static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) | ||
2780 | { | ||
2781 | return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && | ||
2782 | (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); | ||
2783 | } | ||
2784 | |||
2785 | /* | ||
2786 | * This is a bit cumbersome, but it is rarely used and avoids a backpointer | ||
2787 | * in the memcg_cache_params struct. | ||
2788 | */ | ||
2789 | static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) | ||
2790 | { | ||
2791 | struct kmem_cache *cachep; | ||
2792 | |||
2793 | VM_BUG_ON(p->is_root_cache); | ||
2794 | cachep = p->root_cache; | ||
2795 | return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; | ||
2796 | } | ||
2797 | |||
2798 | #ifdef CONFIG_SLABINFO | ||
2799 | static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, | ||
2800 | struct seq_file *m) | ||
2801 | { | ||
2802 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | ||
2803 | struct memcg_cache_params *params; | ||
2804 | |||
2805 | if (!memcg_can_account_kmem(memcg)) | ||
2806 | return -EIO; | ||
2807 | |||
2808 | print_slabinfo_header(m); | ||
2809 | |||
2810 | mutex_lock(&memcg->slab_caches_mutex); | ||
2811 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) | ||
2812 | cache_show(memcg_params_to_cache(params), m); | ||
2813 | mutex_unlock(&memcg->slab_caches_mutex); | ||
2814 | |||
2815 | return 0; | ||
2816 | } | ||
2817 | #endif | ||
2818 | |||
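/*
 * [Editorial aside, not part of the patch.] mem_cgroup_slabinfo_read() above
 * is exposed as a per-cgroup file. Its registration sits in the memcg cftype
 * array further down the file (not shown in this excerpt); the entry looks
 * roughly like the sketch below -- the field name is assumed from the cftype
 * of this era, so double-check against the full patch:
 *
 *	#ifdef CONFIG_SLABINFO
 *	{
 *		.name = "kmem.slabinfo",
 *		.read_seq_string = mem_cgroup_slabinfo_read,
 *	},
 *	#endif
 */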
2819 | static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) | ||
2820 | { | ||
2821 | struct res_counter *fail_res; | ||
2822 | struct mem_cgroup *_memcg; | ||
2823 | int ret = 0; | ||
2824 | bool may_oom; | ||
2825 | |||
2826 | ret = res_counter_charge(&memcg->kmem, size, &fail_res); | ||
2827 | if (ret) | ||
2828 | return ret; | ||
2829 | |||
2830 | /* | ||
2831 | * Conditions under which we can wait for the oom_killer. Those are | ||
2832 | * the same conditions tested by the core page allocator | ||
2833 | */ | ||
2834 | may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); | ||
2835 | |||
2836 | _memcg = memcg; | ||
2837 | ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, | ||
2838 | &_memcg, may_oom); | ||
2839 | |||
2840 | if (ret == -EINTR) { | ||
2841 | /* | ||
2842 | * __mem_cgroup_try_charge() chosed to bypass to root due to | ||
2843 | * OOM kill or fatal signal. Since our only options are to | ||
2844 | * either fail the allocation or charge it to this cgroup, do | ||
2845 | * it as a temporary condition. But we can't fail. From a | ||
2846 | * kmem/slab perspective, the cache has already been selected, | ||
2847 | * by mem_cgroup_kmem_get_cache(), so it is too late to change | ||
2848 | * our minds. | ||
2849 | * | ||
2850 | * This condition will only trigger if the task entered | ||
2851 | * memcg_charge_kmem in a sane state, but was OOM-killed during | ||
2852 | * __mem_cgroup_try_charge() above. Tasks that were already | ||
2853 | * dying when the allocation triggers should have been already | ||
2854 | * directed to the root cgroup in memcontrol.h | ||
2855 | */ | ||
2856 | res_counter_charge_nofail(&memcg->res, size, &fail_res); | ||
2857 | if (do_swap_account) | ||
2858 | res_counter_charge_nofail(&memcg->memsw, size, | ||
2859 | &fail_res); | ||
2860 | ret = 0; | ||
2861 | } else if (ret) | ||
2862 | res_counter_uncharge(&memcg->kmem, size); | ||
2863 | |||
2864 | return ret; | ||
2865 | } | ||
2866 | |||
2867 | static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) | ||
2868 | { | ||
2869 | res_counter_uncharge(&memcg->res, size); | ||
2870 | if (do_swap_account) | ||
2871 | res_counter_uncharge(&memcg->memsw, size); | ||
2872 | |||
2873 | /* Not down to 0 */ | ||
2874 | if (res_counter_uncharge(&memcg->kmem, size)) | ||
2875 | return; | ||
2876 | |||
2877 | if (memcg_kmem_test_and_clear_dead(memcg)) | ||
2878 | mem_cgroup_put(memcg); | ||
2879 | } | ||
2880 | |||
2881 | void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) | ||
2882 | { | ||
2883 | if (!memcg) | ||
2884 | return; | ||
2885 | |||
2886 | mutex_lock(&memcg->slab_caches_mutex); | ||
2887 | list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); | ||
2888 | mutex_unlock(&memcg->slab_caches_mutex); | ||
2889 | } | ||
2890 | |||
2891 | /* | ||
2892 | * helper for acessing a memcg's index. It will be used as an index in the | ||
2893 | * child cache array in kmem_cache, and also to derive its name. This function | ||
2894 | * will return -1 when this is not a kmem-limited memcg. | ||
2895 | */ | ||
2896 | int memcg_cache_id(struct mem_cgroup *memcg) | ||
2897 | { | ||
2898 | return memcg ? memcg->kmemcg_id : -1; | ||
2899 | } | ||
2900 | |||
2901 | /* | ||
2902 | * This ends up being protected by the set_limit mutex, during normal | ||
2903 | * operation, because that is its main call site. | ||
2904 | * | ||
2905 | * But when we create a new cache, we can call this as well if its parent | ||
2906 | * is kmem-limited. That will have to hold set_limit_mutex as well. | ||
2907 | */ | ||
2908 | int memcg_update_cache_sizes(struct mem_cgroup *memcg) | ||
2909 | { | ||
2910 | int num, ret; | ||
2911 | |||
2912 | num = ida_simple_get(&kmem_limited_groups, | ||
2913 | 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); | ||
2914 | if (num < 0) | ||
2915 | return num; | ||
2916 | /* | ||
2917 | * After this point, kmem_accounted (that we test atomically in | ||
2918 | * the beginning of this conditional), is no longer 0. This | ||
2919 | * guarantees only one process will set the following boolean | ||
2920 | * to true. We don't need test_and_set because we're protected | ||
2921 | * by the set_limit_mutex anyway. | ||
2922 | */ | ||
2923 | memcg_kmem_set_activated(memcg); | ||
2924 | |||
2925 | ret = memcg_update_all_caches(num+1); | ||
2926 | if (ret) { | ||
2927 | ida_simple_remove(&kmem_limited_groups, num); | ||
2928 | memcg_kmem_clear_activated(memcg); | ||
2929 | return ret; | ||
2930 | } | ||
2931 | |||
2932 | memcg->kmemcg_id = num; | ||
2933 | INIT_LIST_HEAD(&memcg->memcg_slab_caches); | ||
2934 | mutex_init(&memcg->slab_caches_mutex); | ||
2935 | return 0; | ||
2936 | } | ||
2937 | |||
2938 | static size_t memcg_caches_array_size(int num_groups) | ||
2939 | { | ||
2940 | ssize_t size; | ||
2941 | if (num_groups <= 0) | ||
2942 | return 0; | ||
2943 | |||
2944 | size = 2 * num_groups; | ||
2945 | if (size < MEMCG_CACHES_MIN_SIZE) | ||
2946 | size = MEMCG_CACHES_MIN_SIZE; | ||
2947 | else if (size > MEMCG_CACHES_MAX_SIZE) | ||
2948 | size = MEMCG_CACHES_MAX_SIZE; | ||
2949 | |||
2950 | return size; | ||
2951 | } | ||
2952 | |||
2953 | /* | ||
2954 | * We should update the current array size iff all caches updates succeed. This | ||
2955 | * can only be done from the slab side. The slab mutex needs to be held when | ||
2956 | * calling this. | ||
2957 | */ | ||
2958 | void memcg_update_array_size(int num) | ||
2959 | { | ||
2960 | if (num > memcg_limited_groups_array_size) | ||
2961 | memcg_limited_groups_array_size = memcg_caches_array_size(num); | ||
2962 | } | ||
2963 | |||
2964 | int memcg_update_cache_size(struct kmem_cache *s, int num_groups) | ||
2965 | { | ||
2966 | struct memcg_cache_params *cur_params = s->memcg_params; | ||
2967 | |||
2968 | VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); | ||
2969 | |||
2970 | if (num_groups > memcg_limited_groups_array_size) { | ||
2971 | int i; | ||
2972 | ssize_t size = memcg_caches_array_size(num_groups); | ||
2973 | |||
2974 | size *= sizeof(void *); | ||
2975 | size += sizeof(struct memcg_cache_params); | ||
2976 | |||
2977 | s->memcg_params = kzalloc(size, GFP_KERNEL); | ||
2978 | if (!s->memcg_params) { | ||
2979 | s->memcg_params = cur_params; | ||
2980 | return -ENOMEM; | ||
2981 | } | ||
2982 | |||
2983 | s->memcg_params->is_root_cache = true; | ||
2984 | |||
2985 | /* | ||
2986 | * There is the chance it will be bigger than | ||
2987 | * memcg_limited_groups_array_size, if we failed an allocation | ||
2988 | * in a cache, in which case all caches updated before it, will | ||
2989 | * have a bigger array. | ||
2990 | * | ||
2991 | * But if that is the case, the data after | ||
2992 | * memcg_limited_groups_array_size is certainly unused | ||
2993 | */ | ||
2994 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | ||
2995 | if (!cur_params->memcg_caches[i]) | ||
2996 | continue; | ||
2997 | s->memcg_params->memcg_caches[i] = | ||
2998 | cur_params->memcg_caches[i]; | ||
2999 | } | ||
3000 | |||
3001 | /* | ||
3002 | * Ideally, we would wait until all caches succeed, and only | ||
3003 | * then free the old one. But this is not worth the extra | ||
3004 | * pointer per-cache we'd have to have for this. | ||
3005 | * | ||
3006 | * It is not a big deal if some caches are left with a size | ||
3007 | * bigger than the others. And all updates will reset this | ||
3008 | * anyway. | ||
3009 | */ | ||
3010 | kfree(cur_params); | ||
3011 | } | ||
3012 | return 0; | ||
3013 | } | ||
3014 | |||
3015 | int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, | ||
3016 | struct kmem_cache *root_cache) | ||
3017 | { | ||
3018 | size_t size = sizeof(struct memcg_cache_params); | ||
3019 | |||
3020 | if (!memcg_kmem_enabled()) | ||
3021 | return 0; | ||
3022 | |||
3023 | if (!memcg) | ||
3024 | size += memcg_limited_groups_array_size * sizeof(void *); | ||
3025 | |||
3026 | s->memcg_params = kzalloc(size, GFP_KERNEL); | ||
3027 | if (!s->memcg_params) | ||
3028 | return -ENOMEM; | ||
3029 | |||
3030 | if (memcg) { | ||
3031 | s->memcg_params->memcg = memcg; | ||
3032 | s->memcg_params->root_cache = root_cache; | ||
3033 | } | ||
3034 | return 0; | ||
3035 | } | ||
3036 | |||
3037 | void memcg_release_cache(struct kmem_cache *s) | ||
3038 | { | ||
3039 | struct kmem_cache *root; | ||
3040 | struct mem_cgroup *memcg; | ||
3041 | int id; | ||
3042 | |||
3043 | /* | ||
3044 | * This happens, for instance, when a root cache goes away before we | ||
3045 | * add any memcg. | ||
3046 | */ | ||
3047 | if (!s->memcg_params) | ||
3048 | return; | ||
3049 | |||
3050 | if (s->memcg_params->is_root_cache) | ||
3051 | goto out; | ||
3052 | |||
3053 | memcg = s->memcg_params->memcg; | ||
3054 | id = memcg_cache_id(memcg); | ||
3055 | |||
3056 | root = s->memcg_params->root_cache; | ||
3057 | root->memcg_params->memcg_caches[id] = NULL; | ||
3058 | mem_cgroup_put(memcg); | ||
3059 | |||
3060 | mutex_lock(&memcg->slab_caches_mutex); | ||
3061 | list_del(&s->memcg_params->list); | ||
3062 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3063 | |||
3064 | out: | ||
3065 | kfree(s->memcg_params); | ||
3066 | } | ||
3067 | |||
3068 | /* | ||
3069 | * During the creation a new cache, we need to disable our accounting mechanism | ||
3070 | * altogether. This is true even if we are not creating, but rather just | ||
3071 | * enqueing new caches to be created. | ||
3072 | * | ||
3073 | * This is because that process will trigger allocations; some visible, like | ||
3074 | * explicit kmallocs to auxiliary data structures, name strings and internal | ||
3075 | * cache structures; some well concealed, like INIT_WORK() that can allocate | ||
3076 | * objects during debug. | ||
3077 | * | ||
3078 | * If any allocation happens during memcg_kmem_get_cache, we will recurse back | ||
3079 | * to it. This may not be a bounded recursion: since the first cache creation | ||
3080 | * failed to complete (waiting on the allocation), we'll just try to create the | ||
3081 | * cache again, failing at the same point. | ||
3082 | * | ||
3083 | * memcg_kmem_get_cache is prepared to abort after seeing a positive count of | ||
3084 | * memcg_kmem_skip_account. So we enclose anything that might allocate memory | ||
3085 | * inside the following two functions. | ||
3086 | */ | ||
3087 | static inline void memcg_stop_kmem_account(void) | ||
3088 | { | ||
3089 | VM_BUG_ON(!current->mm); | ||
3090 | current->memcg_kmem_skip_account++; | ||
3091 | } | ||
3092 | |||
3093 | static inline void memcg_resume_kmem_account(void) | ||
3094 | { | ||
3095 | VM_BUG_ON(!current->mm); | ||
3096 | current->memcg_kmem_skip_account--; | ||
3097 | } | ||
3098 | |||
3099 | static void kmem_cache_destroy_work_func(struct work_struct *w) | ||
3100 | { | ||
3101 | struct kmem_cache *cachep; | ||
3102 | struct memcg_cache_params *p; | ||
3103 | |||
3104 | p = container_of(w, struct memcg_cache_params, destroy); | ||
3105 | |||
3106 | cachep = memcg_params_to_cache(p); | ||
3107 | |||
3108 | /* | ||
3109 | * If we get down to 0 after shrink, we could delete right away. | ||
3110 | * However, memcg_release_pages() already puts us back in the workqueue | ||
3111 | * in that case. If we proceed deleting, we'll get a dangling | ||
3112 | * reference, and removing the object from the workqueue in that case | ||
3113 | * is unnecessary complication. We are not a fast path. | ||
3114 | * | ||
3115 | * Note that this case is fundamentally different from racing with | ||
3116 | * shrink_slab(): if memcg_cgroup_destroy_cache() is called in | ||
3117 | * kmem_cache_shrink, not only we would be reinserting a dead cache | ||
3118 | * into the queue, but doing so from inside the worker racing to | ||
3119 | * destroy it. | ||
3120 | * | ||
3121 | * So if we aren't down to zero, we'll just schedule a worker and try | ||
3122 | * again | ||
3123 | */ | ||
3124 | if (atomic_read(&cachep->memcg_params->nr_pages) != 0) { | ||
3125 | kmem_cache_shrink(cachep); | ||
3126 | if (atomic_read(&cachep->memcg_params->nr_pages) == 0) | ||
3127 | return; | ||
3128 | } else | ||
3129 | kmem_cache_destroy(cachep); | ||
3130 | } | ||
3131 | |||
3132 | void mem_cgroup_destroy_cache(struct kmem_cache *cachep) | ||
3133 | { | ||
3134 | if (!cachep->memcg_params->dead) | ||
3135 | return; | ||
3136 | |||
3137 | /* | ||
3138 | * There are many ways in which we can get here. | ||
3139 | * | ||
3140 | * We can get to a memory-pressure situation while the delayed work is | ||
3141 | * still pending to run. The vmscan shrinkers can then release all | ||
3142 | * cache memory and get us to destruction. If this is the case, we'll | ||
3143 | * be executed twice, which is a bug (the second time will execute over | ||
3144 | * bogus data). In this case, cancelling the work should be fine. | ||
3145 | * | ||
3146 | * But we can also get here from the worker itself, if | ||
3147 | * kmem_cache_shrink is enough to shake all the remaining objects and | ||
3148 | * get the page count to 0. In this case, we'll deadlock if we try to | ||
3149 | * cancel the work (the worker runs with an internal lock held, which | ||
3150 | * is the same lock we would hold for cancel_work_sync().) | ||
3151 | * | ||
3152 | * Since we can't possibly know who got us here, just refrain from | ||
3153 | * running if there is already work pending | ||
3154 | */ | ||
3155 | if (work_pending(&cachep->memcg_params->destroy)) | ||
3156 | return; | ||
3157 | /* | ||
3158 | * We have to defer the actual destroying to a workqueue, because | ||
3159 | * we might currently be in a context that cannot sleep. | ||
3160 | */ | ||
3161 | schedule_work(&cachep->memcg_params->destroy); | ||
3162 | } | ||
3163 | |||
3164 | static char *memcg_cache_name(struct mem_cgroup *memcg, struct kmem_cache *s) | ||
3165 | { | ||
3166 | char *name; | ||
3167 | struct dentry *dentry; | ||
3168 | |||
3169 | rcu_read_lock(); | ||
3170 | dentry = rcu_dereference(memcg->css.cgroup->dentry); | ||
3171 | rcu_read_unlock(); | ||
3172 | |||
3173 | BUG_ON(dentry == NULL); | ||
3174 | |||
3175 | name = kasprintf(GFP_KERNEL, "%s(%d:%s)", s->name, | ||
3176 | memcg_cache_id(memcg), dentry->d_name.name); | ||
3177 | |||
3178 | return name; | ||
3179 | } | ||
3180 | |||
3181 | static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg, | ||
3182 | struct kmem_cache *s) | ||
3183 | { | ||
3184 | char *name; | ||
3185 | struct kmem_cache *new; | ||
3186 | |||
3187 | name = memcg_cache_name(memcg, s); | ||
3188 | if (!name) | ||
3189 | return NULL; | ||
3190 | |||
3191 | new = kmem_cache_create_memcg(memcg, name, s->object_size, s->align, | ||
3192 | (s->flags & ~SLAB_PANIC), s->ctor, s); | ||
3193 | |||
3194 | if (new) | ||
3195 | new->allocflags |= __GFP_KMEMCG; | ||
3196 | |||
3197 | kfree(name); | ||
3198 | return new; | ||
3199 | } | ||
3200 | |||
3201 | /* | ||
3202 | * This lock protects updaters, not readers. We want readers to be as fast as | ||
3203 | * they can, and they will either see NULL or a valid cache value. Our model | ||
3204 | * allow them to see NULL, in which case the root memcg will be selected. | ||
3205 | * | ||
3206 | * We need this lock because multiple allocations to the same cache from a non | ||
3207 | * will span more than one worker. Only one of them can create the cache. | ||
3208 | */ | ||
3209 | static DEFINE_MUTEX(memcg_cache_mutex); | ||
3210 | static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, | ||
3211 | struct kmem_cache *cachep) | ||
3212 | { | ||
3213 | struct kmem_cache *new_cachep; | ||
3214 | int idx; | ||
3215 | |||
3216 | BUG_ON(!memcg_can_account_kmem(memcg)); | ||
3217 | |||
3218 | idx = memcg_cache_id(memcg); | ||
3219 | |||
3220 | mutex_lock(&memcg_cache_mutex); | ||
3221 | new_cachep = cachep->memcg_params->memcg_caches[idx]; | ||
3222 | if (new_cachep) | ||
3223 | goto out; | ||
3224 | |||
3225 | new_cachep = kmem_cache_dup(memcg, cachep); | ||
3226 | if (new_cachep == NULL) { | ||
3227 | new_cachep = cachep; | ||
3228 | goto out; | ||
3229 | } | ||
3230 | |||
3231 | mem_cgroup_get(memcg); | ||
3232 | atomic_set(&new_cachep->memcg_params->nr_pages , 0); | ||
3233 | |||
3234 | cachep->memcg_params->memcg_caches[idx] = new_cachep; | ||
3235 | /* | ||
3236 | * the readers won't lock, make sure everybody sees the updated value, | ||
3237 | * so they won't put stuff in the queue again for no reason | ||
3238 | */ | ||
3239 | wmb(); | ||
3240 | out: | ||
3241 | mutex_unlock(&memcg_cache_mutex); | ||
3242 | return new_cachep; | ||
3243 | } | ||
3244 | |||
3245 | void kmem_cache_destroy_memcg_children(struct kmem_cache *s) | ||
3246 | { | ||
3247 | struct kmem_cache *c; | ||
3248 | int i; | ||
3249 | |||
3250 | if (!s->memcg_params) | ||
3251 | return; | ||
3252 | if (!s->memcg_params->is_root_cache) | ||
3253 | return; | ||
3254 | |||
3255 | /* | ||
3256 | * If the cache is being destroyed, we trust that there is no one else | ||
3257 | * requesting objects from it. Even if there are, the sanity checks in | ||
3258 | * kmem_cache_destroy should caught this ill-case. | ||
3259 | * | ||
3260 | * Still, we don't want anyone else freeing memcg_caches under our | ||
3261 | * noses, which can happen if a new memcg comes to life. As usual, | ||
3262 | * we'll take the set_limit_mutex to protect ourselves against this. | ||
3263 | */ | ||
3264 | mutex_lock(&set_limit_mutex); | ||
3265 | for (i = 0; i < memcg_limited_groups_array_size; i++) { | ||
3266 | c = s->memcg_params->memcg_caches[i]; | ||
3267 | if (!c) | ||
3268 | continue; | ||
3269 | |||
3270 | /* | ||
3271 | * We will now manually delete the caches, so to avoid races | ||
3272 | * we need to cancel all pending destruction workers and | ||
3273 | * proceed with destruction ourselves. | ||
3274 | * | ||
3275 | * kmem_cache_destroy() will call kmem_cache_shrink internally, | ||
3276 | * and that could spawn the workers again: it is likely that | ||
3277 | * the cache still have active pages until this very moment. | ||
3278 | * This would lead us back to mem_cgroup_destroy_cache. | ||
3279 | * | ||
3280 | * But that will not execute at all if the "dead" flag is not | ||
3281 | * set, so flip it down to guarantee we are in control. | ||
3282 | */ | ||
3283 | c->memcg_params->dead = false; | ||
3284 | cancel_work_sync(&c->memcg_params->destroy); | ||
3285 | kmem_cache_destroy(c); | ||
3286 | } | ||
3287 | mutex_unlock(&set_limit_mutex); | ||
3288 | } | ||
3289 | |||
3290 | struct create_work { | ||
3291 | struct mem_cgroup *memcg; | ||
3292 | struct kmem_cache *cachep; | ||
3293 | struct work_struct work; | ||
3294 | }; | ||
3295 | |||
3296 | static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | ||
3297 | { | ||
3298 | struct kmem_cache *cachep; | ||
3299 | struct memcg_cache_params *params; | ||
3300 | |||
3301 | if (!memcg_kmem_is_active(memcg)) | ||
3302 | return; | ||
3303 | |||
3304 | mutex_lock(&memcg->slab_caches_mutex); | ||
3305 | list_for_each_entry(params, &memcg->memcg_slab_caches, list) { | ||
3306 | cachep = memcg_params_to_cache(params); | ||
3307 | cachep->memcg_params->dead = true; | ||
3308 | INIT_WORK(&cachep->memcg_params->destroy, | ||
3309 | kmem_cache_destroy_work_func); | ||
3310 | schedule_work(&cachep->memcg_params->destroy); | ||
3311 | } | ||
3312 | mutex_unlock(&memcg->slab_caches_mutex); | ||
3313 | } | ||
3314 | |||
3315 | static void memcg_create_cache_work_func(struct work_struct *w) | ||
3316 | { | ||
3317 | struct create_work *cw; | ||
3318 | |||
3319 | cw = container_of(w, struct create_work, work); | ||
3320 | memcg_create_kmem_cache(cw->memcg, cw->cachep); | ||
3321 | /* Drop the reference gotten when we enqueued. */ | ||
3322 | css_put(&cw->memcg->css); | ||
3323 | kfree(cw); | ||
3324 | } | ||
3325 | |||
3326 | /* | ||
3327 | * Enqueue the creation of a per-memcg kmem_cache. | ||
3328 | * Called with rcu_read_lock. | ||
3329 | */ | ||
3330 | static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, | ||
3331 | struct kmem_cache *cachep) | ||
3332 | { | ||
3333 | struct create_work *cw; | ||
3334 | |||
3335 | cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); | ||
3336 | if (cw == NULL) | ||
3337 | return; | ||
3338 | |||
3339 | /* The corresponding put will be done in the workqueue. */ | ||
3340 | if (!css_tryget(&memcg->css)) { | ||
3341 | kfree(cw); | ||
3342 | return; | ||
3343 | } | ||
3344 | |||
3345 | cw->memcg = memcg; | ||
3346 | cw->cachep = cachep; | ||
3347 | |||
3348 | INIT_WORK(&cw->work, memcg_create_cache_work_func); | ||
3349 | schedule_work(&cw->work); | ||
3350 | } | ||
3351 | |||
3352 | static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, | ||
3353 | struct kmem_cache *cachep) | ||
3354 | { | ||
3355 | /* | ||
3356 | * We need to stop accounting when we kmalloc, because if the | ||
3357 | * corresponding kmalloc cache is not yet created, the first allocation | ||
3358 | * in __memcg_create_cache_enqueue will recurse. | ||
3359 | * | ||
3360 | * However, it is better to enclose the whole function. Depending on | ||
3361 | * the debugging options enabled, INIT_WORK(), for instance, can | ||
3362 | * trigger an allocation. This too, will make us recurse. Because at | ||
3363 | * this point we can't allow ourselves back into memcg_kmem_get_cache, | ||
3364 | * the safest choice is to do it like this, wrapping the whole function. | ||
3365 | */ | ||
3366 | memcg_stop_kmem_account(); | ||
3367 | __memcg_create_cache_enqueue(memcg, cachep); | ||
3368 | memcg_resume_kmem_account(); | ||
3369 | } | ||
3370 | /* | ||
3371 | * Return the kmem_cache we're supposed to use for a slab allocation. | ||
3372 | * We try to use the current memcg's version of the cache. | ||
3373 | * | ||
3374 | * If the cache does not exist yet, if we are the first user of it, | ||
3375 | * we either create it immediately, if possible, or create it asynchronously | ||
3376 | * in a workqueue. | ||
3377 | * In the latter case, we will let the current allocation go through with | ||
3378 | * the original cache. | ||
3379 | * | ||
3380 | * Can't be called in interrupt context or from kernel threads. | ||
3381 | * This function needs to be called with rcu_read_lock() held. | ||
3382 | */ | ||
3383 | struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, | ||
3384 | gfp_t gfp) | ||
3385 | { | ||
3386 | struct mem_cgroup *memcg; | ||
3387 | int idx; | ||
3388 | |||
3389 | VM_BUG_ON(!cachep->memcg_params); | ||
3390 | VM_BUG_ON(!cachep->memcg_params->is_root_cache); | ||
3391 | |||
3392 | if (!current->mm || current->memcg_kmem_skip_account) | ||
3393 | return cachep; | ||
3394 | |||
3395 | rcu_read_lock(); | ||
3396 | memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); | ||
3397 | rcu_read_unlock(); | ||
3398 | |||
3399 | if (!memcg_can_account_kmem(memcg)) | ||
3400 | return cachep; | ||
3401 | |||
3402 | idx = memcg_cache_id(memcg); | ||
3403 | |||
3404 | /* | ||
3405 | * barrier to mare sure we're always seeing the up to date value. The | ||
3406 | * code updating memcg_caches will issue a write barrier to match this. | ||
3407 | */ | ||
3408 | read_barrier_depends(); | ||
3409 | if (unlikely(cachep->memcg_params->memcg_caches[idx] == NULL)) { | ||
3410 | /* | ||
3411 | * If we are in a safe context (can wait, and not in interrupt | ||
3412 | * context), we could be be predictable and return right away. | ||
3413 | * This would guarantee that the allocation being performed | ||
3414 | * already belongs in the new cache. | ||
3415 | * | ||
3416 | * However, there are some clashes that can arrive from locking. | ||
3417 | * For instance, because we acquire the slab_mutex while doing | ||
3418 | * kmem_cache_dup, this means no further allocation could happen | ||
3419 | * with the slab_mutex held. | ||
3420 | * | ||
3421 | * Also, because cache creation issue get_online_cpus(), this | ||
3422 | * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, | ||
3423 | * that ends up reversed during cpu hotplug. (cpuset allocates | ||
3424 | * a bunch of GFP_KERNEL memory during cpuup). Due to all that, | ||
3425 | * better to defer everything. | ||
3426 | */ | ||
3427 | memcg_create_cache_enqueue(memcg, cachep); | ||
3428 | return cachep; | ||
3429 | } | ||
3430 | |||
3431 | return cachep->memcg_params->memcg_caches[idx]; | ||
3432 | } | ||
3433 | EXPORT_SYMBOL(__memcg_kmem_get_cache); | ||
3434 | |||
3435 | /* | ||
3436 | * We need to verify if the allocation against current->mm->owner's memcg is | ||
3437 | * possible for the given order. But the page is not allocated yet, so we'll | ||
3438 | * need a further commit step to do the final arrangements. | ||
3439 | * | ||
3440 | * It is possible for the task to switch cgroups in this mean time, so at | ||
3441 | * commit time, we can't rely on task conversion any longer. We'll then use | ||
3442 | * the handle argument to return to the caller which cgroup we should commit | ||
3443 | * against. We could also return the memcg directly and avoid the pointer | ||
3444 | * passing, but a boolean return value gives better semantics considering | ||
3445 | * the compiled-out case as well. | ||
3446 | * | ||
3447 | * Returning true means the allocation is possible. | ||
3448 | */ | ||
3449 | bool | ||
3450 | __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) | ||
3451 | { | ||
3452 | struct mem_cgroup *memcg; | ||
3453 | int ret; | ||
3454 | |||
3455 | *_memcg = NULL; | ||
3456 | memcg = try_get_mem_cgroup_from_mm(current->mm); | ||
3457 | |||
3458 | /* | ||
3459 | * very rare case described in mem_cgroup_from_task. Unfortunately there | ||
3460 | * isn't much we can do without complicating this too much, and it would | ||
3461 | * be gfp-dependent anyway. Just let it go | ||
3462 | */ | ||
3463 | if (unlikely(!memcg)) | ||
3464 | return true; | ||
3465 | |||
3466 | if (!memcg_can_account_kmem(memcg)) { | ||
3467 | css_put(&memcg->css); | ||
3468 | return true; | ||
3469 | } | ||
3470 | |||
3471 | ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); | ||
3472 | if (!ret) | ||
3473 | *_memcg = memcg; | ||
3474 | |||
3475 | css_put(&memcg->css); | ||
3476 | return (ret == 0); | ||
3477 | } | ||
3478 | |||
3479 | void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, | ||
3480 | int order) | ||
3481 | { | ||
3482 | struct page_cgroup *pc; | ||
3483 | |||
3484 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | ||
3485 | |||
3486 | /* The page allocation failed. Revert */ | ||
3487 | if (!page) { | ||
3488 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | ||
3489 | return; | ||
3490 | } | ||
3491 | |||
3492 | pc = lookup_page_cgroup(page); | ||
3493 | lock_page_cgroup(pc); | ||
3494 | pc->mem_cgroup = memcg; | ||
3495 | SetPageCgroupUsed(pc); | ||
3496 | unlock_page_cgroup(pc); | ||
3497 | } | ||
3498 | |||
3499 | void __memcg_kmem_uncharge_pages(struct page *page, int order) | ||
3500 | { | ||
3501 | struct mem_cgroup *memcg = NULL; | ||
3502 | struct page_cgroup *pc; | ||
3503 | |||
3504 | |||
3505 | pc = lookup_page_cgroup(page); | ||
3506 | /* | ||
3507 | * Fast unlocked return. Theoretically might have changed, have to | ||
3508 | * check again after locking. | ||
3509 | */ | ||
3510 | if (!PageCgroupUsed(pc)) | ||
3511 | return; | ||
3512 | |||
3513 | lock_page_cgroup(pc); | ||
3514 | if (PageCgroupUsed(pc)) { | ||
3515 | memcg = pc->mem_cgroup; | ||
3516 | ClearPageCgroupUsed(pc); | ||
3517 | } | ||
3518 | unlock_page_cgroup(pc); | ||
3519 | |||
3520 | /* | ||
3521 | * We trust that only if there is a memcg associated with the page, it | ||
3522 | * is a valid allocation | ||
3523 | */ | ||
3524 | if (!memcg) | ||
3525 | return; | ||
3526 | |||
3527 | VM_BUG_ON(mem_cgroup_is_root(memcg)); | ||
3528 | memcg_uncharge_kmem(memcg, PAGE_SIZE << order); | ||
3529 | } | ||
3530 | #else | ||
3531 | static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) | ||
3532 | { | ||
3533 | } | ||
3534 | #endif /* CONFIG_MEMCG_KMEM */ | ||
3535 | |||
2627 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE | 3536 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
2628 | 3537 | ||
2629 | #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) | 3538 | #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) |
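Note: taken together, __memcg_kmem_newpage_charge(), __memcg_kmem_commit_charge() and __memcg_kmem_uncharge_pages() give the page allocator a three-step protocol: reserve against the cgroup, allocate, then bind the page (or revert). A hedged caller-side sketch of how the __GFP_KMEMCG path elsewhere in this series is expected to use the inline wrappers from memcontrol.h (wrapper names assumed from that header, which is not part of this excerpt):

	struct mem_cgroup *memcg = NULL;
	struct page *page;

	/* step 1: charge the cgroup before allocating; bail out if over limit */
	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
		return NULL;

	page = do_the_allocation(gfp_mask, order);	/* placeholder for the real allocator path */

	/* step 2: bind the page_cgroup to memcg, or undo the charge if !page */
	memcg_kmem_commit_charge(page, memcg, order);

	/* step 3 (much later, on free): give the pages back to the counters */
	memcg_kmem_uncharge_pages(page, order);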
@@ -3486,8 +4395,6 @@ void mem_cgroup_print_bad_page(struct page *page) | |||
3486 | } | 4395 | } |
3487 | #endif | 4396 | #endif |
3488 | 4397 | ||
3489 | static DEFINE_MUTEX(set_limit_mutex); | ||
3490 | |||
3491 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | 4398 | static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, |
3492 | unsigned long long val) | 4399 | unsigned long long val) |
3493 | { | 4400 | { |
@@ -3772,6 +4679,7 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, | |||
3772 | static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | 4679 | static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) |
3773 | { | 4680 | { |
3774 | int node, zid; | 4681 | int node, zid; |
4682 | u64 usage; | ||
3775 | 4683 | ||
3776 | do { | 4684 | do { |
3777 | /* This is for making all *used* pages to be on LRU. */ | 4685 | /* This is for making all *used* pages to be on LRU. */ |
@@ -3792,13 +4700,20 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) | |||
3792 | cond_resched(); | 4700 | cond_resched(); |
3793 | 4701 | ||
3794 | /* | 4702 | /* |
4703 | * Kernel memory may not necessarily be trackable to a specific | ||
4704 | * process. So they are not migrated, and therefore we can't | ||
4705 | * expect their value to drop to 0 here. | ||
4706 | * Having res filled up with kmem only is enough. | ||
4707 | * | ||
3795 | * This is a safety check because mem_cgroup_force_empty_list | 4708 | * This is a safety check because mem_cgroup_force_empty_list |
3796 | * could have raced with mem_cgroup_replace_page_cache callers | 4709 | * could have raced with mem_cgroup_replace_page_cache callers |
3797 | * so the lru seemed empty but the page could have been added | 4710 | * so the lru seemed empty but the page could have been added |
3798 | * right after the check. RES_USAGE should be safe as we always | 4711 | * right after the check. RES_USAGE should be safe as we always |
3799 | * charge before adding to the LRU. | 4712 | * charge before adding to the LRU. |
3800 | */ | 4713 | */ |
3801 | } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0); | 4714 | usage = res_counter_read_u64(&memcg->res, RES_USAGE) - |
4715 | res_counter_read_u64(&memcg->kmem, RES_USAGE); | ||
4716 | } while (usage > 0); | ||
3802 | } | 4717 | } |
3803 | 4718 | ||
3804 | /* | 4719 | /* |
@@ -3942,7 +4857,8 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |||
3942 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 4857 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3943 | char str[64]; | 4858 | char str[64]; |
3944 | u64 val; | 4859 | u64 val; |
3945 | int type, name, len; | 4860 | int name, len; |
4861 | enum res_type type; | ||
3946 | 4862 | ||
3947 | type = MEMFILE_TYPE(cft->private); | 4863 | type = MEMFILE_TYPE(cft->private); |
3948 | name = MEMFILE_ATTR(cft->private); | 4864 | name = MEMFILE_ATTR(cft->private); |
@@ -3963,6 +4879,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |||
3963 | else | 4879 | else |
3964 | val = res_counter_read_u64(&memcg->memsw, name); | 4880 | val = res_counter_read_u64(&memcg->memsw, name); |
3965 | break; | 4881 | break; |
4882 | case _KMEM: | ||
4883 | val = res_counter_read_u64(&memcg->kmem, name); | ||
4884 | break; | ||
3966 | default: | 4885 | default: |
3967 | BUG(); | 4886 | BUG(); |
3968 | } | 4887 | } |
@@ -3970,6 +4889,125 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | |||
3970 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); | 4889 | len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); |
3971 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); | 4890 | return simple_read_from_buffer(buf, nbytes, ppos, str, len); |
3972 | } | 4891 | } |
4892 | |||
4893 | static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) | ||
4894 | { | ||
4895 | int ret = -EINVAL; | ||
4896 | #ifdef CONFIG_MEMCG_KMEM | ||
4897 | bool must_inc_static_branch = false; | ||
4898 | |||
4899 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | ||
4900 | /* | ||
4901 | * For simplicity, we won't allow this to be disabled. It also can't | ||
4902 | * be changed if the cgroup has children already, or if tasks had | ||
4903 | * already joined. | ||
4904 | * | ||
4905 | * If tasks join before we set the limit, a person looking at | ||
4906 | * kmem.usage_in_bytes will have no way to determine when it took | ||
4907 | * place, which makes the value quite meaningless. | ||
4908 | * | ||
4909 | * After it first became limited, changes in the value of the limit are | ||
4910 | * of course permitted. | ||
4911 | * | ||
4912 | * Taking the cgroup_lock is really offensive, but it is so far the only | ||
4913 | * way to guarantee that no children will appear. There are plenty of | ||
4914 | * other offenders, and they should all go away. Fine grained locking | ||
4915 | * is probably the way to go here. When we are fully hierarchical, we | ||
4916 | * can also get rid of the use_hierarchy check. | ||
4917 | */ | ||
4918 | cgroup_lock(); | ||
4919 | mutex_lock(&set_limit_mutex); | ||
4920 | if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { | ||
4921 | if (cgroup_task_count(cont) || (memcg->use_hierarchy && | ||
4922 | !list_empty(&cont->children))) { | ||
4923 | ret = -EBUSY; | ||
4924 | goto out; | ||
4925 | } | ||
4926 | ret = res_counter_set_limit(&memcg->kmem, val); | ||
4927 | VM_BUG_ON(ret); | ||
4928 | |||
4929 | ret = memcg_update_cache_sizes(memcg); | ||
4930 | if (ret) { | ||
4931 | res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); | ||
4932 | goto out; | ||
4933 | } | ||
4934 | must_inc_static_branch = true; | ||
4935 | /* | ||
4936 | * kmem charges can outlive the cgroup. In the case of slab | ||
4937 | * pages, for instance, a page may contain objects from various | ||
4938 | * processes, so it is not feasible to migrate them away. We | ||
4939 | * need to reference count the memcg because of that. | ||
4940 | */ | ||
4941 | mem_cgroup_get(memcg); | ||
4942 | } else | ||
4943 | ret = res_counter_set_limit(&memcg->kmem, val); | ||
4944 | out: | ||
4945 | mutex_unlock(&set_limit_mutex); | ||
4946 | cgroup_unlock(); | ||
4947 | |||
4948 | /* | ||
4949 | * We are by now familiar with the fact that we can't inc the static | ||
4950 | * branch inside cgroup_lock. See disarm functions for details. A | ||
4951 | * worker here is overkill, but also wrong: After the limit is set, we | ||
4952 | * must start accounting right away. Since this operation can't fail, | ||
4953 | * we can safely defer it to here - no rollback will be needed. | ||
4954 | * | ||
4955 | * The boolean used to control this is also safe, because | ||
4956 | * KMEM_ACCOUNTED_ACTIVATED guarantees that only one process will be | ||
4957 | * able to set it to true. | ||
4958 | */ | ||
4959 | if (must_inc_static_branch) { | ||
4960 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
4961 | /* | ||
4962 | * setting the active bit after the inc will guarantee no one | ||
4963 | * starts accounting before all call sites are patched | ||
4964 | */ | ||
4965 | memcg_kmem_set_active(memcg); | ||
4966 | } | ||
4967 | |||
4968 | #endif | ||
4969 | return ret; | ||
4970 | } | ||
4971 | |||
4972 | static int memcg_propagate_kmem(struct mem_cgroup *memcg) | ||
4973 | { | ||
4974 | int ret = 0; | ||
4975 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | ||
4976 | if (!parent) | ||
4977 | goto out; | ||
4978 | |||
4979 | memcg->kmem_account_flags = parent->kmem_account_flags; | ||
4980 | #ifdef CONFIG_MEMCG_KMEM | ||
4981 | /* | ||
4982 | * When that happens, we need to disable the static branch only on those | ||
4983 | * memcgs that enabled it. To achieve this, we would be forced to | ||
4984 | * complicate the code by keeping track of which memcgs were the ones | ||
4985 | * that actually enabled limits, and which ones got it from its | ||
4986 | * parents. | ||
4987 | * | ||
4988 | * It is a lot simpler just to do static_key_slow_inc() on every child | ||
4989 | * that is accounted. | ||
4990 | */ | ||
4991 | if (!memcg_kmem_is_active(memcg)) | ||
4992 | goto out; | ||
4993 | |||
4994 | /* | ||
4995 | * destroy(), called if we fail, will issue static_key_slow_inc() and | ||
4996 | * mem_cgroup_put() if kmem is enabled. We have to either call them | ||
4997 | * unconditionally, or clear the KMEM_ACTIVE flag. I personally find | ||
4998 | * this more consistent, since it always leads to the same destroy path | ||
4999 | */ | ||
5000 | mem_cgroup_get(memcg); | ||
5001 | static_key_slow_inc(&memcg_kmem_enabled_key); | ||
5002 | |||
5003 | mutex_lock(&set_limit_mutex); | ||
5004 | ret = memcg_update_cache_sizes(memcg); | ||
5005 | mutex_unlock(&set_limit_mutex); | ||
5006 | #endif | ||
5007 | out: | ||
5008 | return ret; | ||
5009 | } | ||
5010 | |||
3973 | /* | 5011 | /* |
3974 | * The user of this function is... | 5012 | * The user of this function is... |
3975 | * RES_LIMIT. | 5013 | * RES_LIMIT. |
@@ -3978,7 +5016,8 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
3978 | const char *buffer) | 5016 | const char *buffer) |
3979 | { | 5017 | { |
3980 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 5018 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
3981 | int type, name; | 5019 | enum res_type type; |
5020 | int name; | ||
3982 | unsigned long long val; | 5021 | unsigned long long val; |
3983 | int ret; | 5022 | int ret; |
3984 | 5023 | ||
@@ -4000,8 +5039,12 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
4000 | break; | 5039 | break; |
4001 | if (type == _MEM) | 5040 | if (type == _MEM) |
4002 | ret = mem_cgroup_resize_limit(memcg, val); | 5041 | ret = mem_cgroup_resize_limit(memcg, val); |
4003 | else | 5042 | else if (type == _MEMSWAP) |
4004 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 5043 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
5044 | else if (type == _KMEM) | ||
5045 | ret = memcg_update_kmem_limit(cont, val); | ||
5046 | else | ||
5047 | return -EINVAL; | ||
4005 | break; | 5048 | break; |
4006 | case RES_SOFT_LIMIT: | 5049 | case RES_SOFT_LIMIT: |
4007 | ret = res_counter_memparse_write_strategy(buffer, &val); | 5050 | ret = res_counter_memparse_write_strategy(buffer, &val); |
@@ -4054,7 +5097,8 @@ out: | |||
4054 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | 5097 | static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) |
4055 | { | 5098 | { |
4056 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 5099 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
4057 | int type, name; | 5100 | int name; |
5101 | enum res_type type; | ||
4058 | 5102 | ||
4059 | type = MEMFILE_TYPE(event); | 5103 | type = MEMFILE_TYPE(event); |
4060 | name = MEMFILE_ATTR(event); | 5104 | name = MEMFILE_ATTR(event); |
@@ -4066,14 +5110,22 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
4066 | case RES_MAX_USAGE: | 5110 | case RES_MAX_USAGE: |
4067 | if (type == _MEM) | 5111 | if (type == _MEM) |
4068 | res_counter_reset_max(&memcg->res); | 5112 | res_counter_reset_max(&memcg->res); |
4069 | else | 5113 | else if (type == _MEMSWAP) |
4070 | res_counter_reset_max(&memcg->memsw); | 5114 | res_counter_reset_max(&memcg->memsw); |
5115 | else if (type == _KMEM) | ||
5116 | res_counter_reset_max(&memcg->kmem); | ||
5117 | else | ||
5118 | return -EINVAL; | ||
4071 | break; | 5119 | break; |
4072 | case RES_FAILCNT: | 5120 | case RES_FAILCNT: |
4073 | if (type == _MEM) | 5121 | if (type == _MEM) |
4074 | res_counter_reset_failcnt(&memcg->res); | 5122 | res_counter_reset_failcnt(&memcg->res); |
4075 | else | 5123 | else if (type == _MEMSWAP) |
4076 | res_counter_reset_failcnt(&memcg->memsw); | 5124 | res_counter_reset_failcnt(&memcg->memsw); |
5125 | else if (type == _KMEM) | ||
5126 | res_counter_reset_failcnt(&memcg->kmem); | ||
5127 | else | ||
5128 | return -EINVAL; | ||
4077 | break; | 5129 | break; |
4078 | } | 5130 | } |
4079 | 5131 | ||
@@ -4390,7 +5442,7 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | |||
4390 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5442 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4391 | struct mem_cgroup_thresholds *thresholds; | 5443 | struct mem_cgroup_thresholds *thresholds; |
4392 | struct mem_cgroup_threshold_ary *new; | 5444 | struct mem_cgroup_threshold_ary *new; |
4393 | int type = MEMFILE_TYPE(cft->private); | 5445 | enum res_type type = MEMFILE_TYPE(cft->private); |
4394 | u64 threshold, usage; | 5446 | u64 threshold, usage; |
4395 | int i, size, ret; | 5447 | int i, size, ret; |
4396 | 5448 | ||
@@ -4473,7 +5525,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, | |||
4473 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5525 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4474 | struct mem_cgroup_thresholds *thresholds; | 5526 | struct mem_cgroup_thresholds *thresholds; |
4475 | struct mem_cgroup_threshold_ary *new; | 5527 | struct mem_cgroup_threshold_ary *new; |
4476 | int type = MEMFILE_TYPE(cft->private); | 5528 | enum res_type type = MEMFILE_TYPE(cft->private); |
4477 | u64 usage; | 5529 | u64 usage; |
4478 | int i, j, size; | 5530 | int i, j, size; |
4479 | 5531 | ||
@@ -4551,7 +5603,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, | |||
4551 | { | 5603 | { |
4552 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5604 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4553 | struct mem_cgroup_eventfd_list *event; | 5605 | struct mem_cgroup_eventfd_list *event; |
4554 | int type = MEMFILE_TYPE(cft->private); | 5606 | enum res_type type = MEMFILE_TYPE(cft->private); |
4555 | 5607 | ||
4556 | BUG_ON(type != _OOM_TYPE); | 5608 | BUG_ON(type != _OOM_TYPE); |
4557 | event = kmalloc(sizeof(*event), GFP_KERNEL); | 5609 | event = kmalloc(sizeof(*event), GFP_KERNEL); |
@@ -4576,7 +5628,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | |||
4576 | { | 5628 | { |
4577 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 5629 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
4578 | struct mem_cgroup_eventfd_list *ev, *tmp; | 5630 | struct mem_cgroup_eventfd_list *ev, *tmp; |
4579 | int type = MEMFILE_TYPE(cft->private); | 5631 | enum res_type type = MEMFILE_TYPE(cft->private); |
4580 | 5632 | ||
4581 | BUG_ON(type != _OOM_TYPE); | 5633 | BUG_ON(type != _OOM_TYPE); |
4582 | 5634 | ||
@@ -4635,12 +5687,33 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | |||
4635 | #ifdef CONFIG_MEMCG_KMEM | 5687 | #ifdef CONFIG_MEMCG_KMEM |
4636 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 5688 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
4637 | { | 5689 | { |
5690 | int ret; | ||
5691 | |||
5692 | memcg->kmemcg_id = -1; | ||
5693 | ret = memcg_propagate_kmem(memcg); | ||
5694 | if (ret) | ||
5695 | return ret; | ||
5696 | |||
4638 | return mem_cgroup_sockets_init(memcg, ss); | 5697 | return mem_cgroup_sockets_init(memcg, ss); |
4639 | }; | 5698 | }; |
4640 | 5699 | ||
4641 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) | 5700 | static void kmem_cgroup_destroy(struct mem_cgroup *memcg) |
4642 | { | 5701 | { |
4643 | mem_cgroup_sockets_destroy(memcg); | 5702 | mem_cgroup_sockets_destroy(memcg); |
5703 | |||
5704 | memcg_kmem_mark_dead(memcg); | ||
5705 | |||
5706 | if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) | ||
5707 | return; | ||
5708 | |||
5709 | /* | ||
5710 | * Charges already down to 0, undo mem_cgroup_get() done in the charge | ||
5711 | * path here, being careful not to race with memcg_uncharge_kmem: it is | ||
5712 | * possible that the charges went down to 0 between mark_dead and the | ||
5713 | * res_counter read, so in that case, we don't need the put | ||
5714 | */ | ||
5715 | if (memcg_kmem_test_and_clear_dead(memcg)) | ||
5716 | mem_cgroup_put(memcg); | ||
4644 | } | 5717 | } |
4645 | #else | 5718 | #else |
4646 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | 5719 | static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) |
@@ -4749,6 +5822,37 @@ static struct cftype mem_cgroup_files[] = { | |||
4749 | .read = mem_cgroup_read, | 5822 | .read = mem_cgroup_read, |
4750 | }, | 5823 | }, |
4751 | #endif | 5824 | #endif |
5825 | #ifdef CONFIG_MEMCG_KMEM | ||
5826 | { | ||
5827 | .name = "kmem.limit_in_bytes", | ||
5828 | .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), | ||
5829 | .write_string = mem_cgroup_write, | ||
5830 | .read = mem_cgroup_read, | ||
5831 | }, | ||
5832 | { | ||
5833 | .name = "kmem.usage_in_bytes", | ||
5834 | .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), | ||
5835 | .read = mem_cgroup_read, | ||
5836 | }, | ||
5837 | { | ||
5838 | .name = "kmem.failcnt", | ||
5839 | .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), | ||
5840 | .trigger = mem_cgroup_reset, | ||
5841 | .read = mem_cgroup_read, | ||
5842 | }, | ||
5843 | { | ||
5844 | .name = "kmem.max_usage_in_bytes", | ||
5845 | .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), | ||
5846 | .trigger = mem_cgroup_reset, | ||
5847 | .read = mem_cgroup_read, | ||
5848 | }, | ||
5849 | #ifdef CONFIG_SLABINFO | ||
5850 | { | ||
5851 | .name = "kmem.slabinfo", | ||
5852 | .read_seq_string = mem_cgroup_slabinfo_read, | ||
5853 | }, | ||
5854 | #endif | ||
5855 | #endif | ||
4752 | { }, /* terminate */ | 5856 | { }, /* terminate */ |
4753 | }; | 5857 | }; |
4754 | 5858 | ||
@@ -4816,16 +5920,29 @@ out_free: | |||
4816 | } | 5920 | } |
4817 | 5921 | ||
4818 | /* | 5922 | /* |
4819 | * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, | 5923 | * At destroying mem_cgroup, references from swap_cgroup can remain. |
4820 | * but in process context. The work_freeing structure is overlaid | 5924 | * (scanning all at force_empty is too costly...) |
4821 | * on the rcu_freeing structure, which itself is overlaid on memsw. | 5925 | * |
5926 | * Instead of clearing all references at force_empty, we remember | ||
5927 | * the number of reference from swap_cgroup and free mem_cgroup when | ||
5928 | * it goes down to 0. | ||
5929 | * | ||
5930 | * Removal of cgroup itself succeeds regardless of refs from swap. | ||
4822 | */ | 5931 | */ |
4823 | static void free_work(struct work_struct *work) | 5932 | |
5933 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | ||
4824 | { | 5934 | { |
4825 | struct mem_cgroup *memcg; | 5935 | int node; |
4826 | int size = sizeof(struct mem_cgroup); | 5936 | int size = sizeof(struct mem_cgroup); |
4827 | 5937 | ||
4828 | memcg = container_of(work, struct mem_cgroup, work_freeing); | 5938 | mem_cgroup_remove_from_trees(memcg); |
5939 | free_css_id(&mem_cgroup_subsys, &memcg->css); | ||
5940 | |||
5941 | for_each_node(node) | ||
5942 | free_mem_cgroup_per_zone_info(memcg, node); | ||
5943 | |||
5944 | free_percpu(memcg->stat); | ||
5945 | |||
4829 | /* | 5946 | /* |
4830 | * We need to make sure that (at least for now), the jump label | 5947 | * We need to make sure that (at least for now), the jump label |
4831 | * destruction code runs outside of the cgroup lock. This is because | 5948 | * destruction code runs outside of the cgroup lock. This is because |
@@ -4837,45 +5954,34 @@ static void free_work(struct work_struct *work) | |||
4837 | * to move this code around, and make sure it is outside | 5954 | * to move this code around, and make sure it is outside |
4838 | * the cgroup_lock. | 5955 | * the cgroup_lock. |
4839 | */ | 5956 | */ |
4840 | disarm_sock_keys(memcg); | 5957 | disarm_static_keys(memcg); |
4841 | if (size < PAGE_SIZE) | 5958 | if (size < PAGE_SIZE) |
4842 | kfree(memcg); | 5959 | kfree(memcg); |
4843 | else | 5960 | else |
4844 | vfree(memcg); | 5961 | vfree(memcg); |
4845 | } | 5962 | } |
4846 | 5963 | ||
4847 | static void free_rcu(struct rcu_head *rcu_head) | ||
4848 | { | ||
4849 | struct mem_cgroup *memcg; | ||
4850 | |||
4851 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); | ||
4852 | INIT_WORK(&memcg->work_freeing, free_work); | ||
4853 | schedule_work(&memcg->work_freeing); | ||
4854 | } | ||
4855 | 5964 | ||
4856 | /* | 5965 | /* |
4857 | * At destroying mem_cgroup, references from swap_cgroup can remain. | 5966 | * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, |
4858 | * (scanning all at force_empty is too costly...) | 5967 | * but in process context. The work_freeing structure is overlaid |
4859 | * | 5968 | * on the rcu_freeing structure, which itself is overlaid on memsw. |
4860 | * Instead of clearing all references at force_empty, we remember | ||
4861 | * the number of reference from swap_cgroup and free mem_cgroup when | ||
4862 | * it goes down to 0. | ||
4863 | * | ||
4864 | * Removal of cgroup itself succeeds regardless of refs from swap. | ||
4865 | */ | 5969 | */ |
4866 | 5970 | static void free_work(struct work_struct *work) | |
4867 | static void __mem_cgroup_free(struct mem_cgroup *memcg) | ||
4868 | { | 5971 | { |
4869 | int node; | 5972 | struct mem_cgroup *memcg; |
4870 | 5973 | ||
4871 | mem_cgroup_remove_from_trees(memcg); | 5974 | memcg = container_of(work, struct mem_cgroup, work_freeing); |
4872 | free_css_id(&mem_cgroup_subsys, &memcg->css); | 5975 | __mem_cgroup_free(memcg); |
5976 | } | ||
4873 | 5977 | ||
4874 | for_each_node(node) | 5978 | static void free_rcu(struct rcu_head *rcu_head) |
4875 | free_mem_cgroup_per_zone_info(memcg, node); | 5979 | { |
5980 | struct mem_cgroup *memcg; | ||
4876 | 5981 | ||
4877 | free_percpu(memcg->stat); | 5982 | memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); |
4878 | call_rcu(&memcg->rcu_freeing, free_rcu); | 5983 | INIT_WORK(&memcg->work_freeing, free_work); |
5984 | schedule_work(&memcg->work_freeing); | ||
4879 | } | 5985 | } |
4880 | 5986 | ||
4881 | static void mem_cgroup_get(struct mem_cgroup *memcg) | 5987 | static void mem_cgroup_get(struct mem_cgroup *memcg) |
@@ -4887,7 +5993,7 @@ static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) | |||
4887 | { | 5993 | { |
4888 | if (atomic_sub_and_test(count, &memcg->refcnt)) { | 5994 | if (atomic_sub_and_test(count, &memcg->refcnt)) { |
4889 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); | 5995 | struct mem_cgroup *parent = parent_mem_cgroup(memcg); |
4890 | __mem_cgroup_free(memcg); | 5996 | call_rcu(&memcg->rcu_freeing, free_rcu); |
4891 | if (parent) | 5997 | if (parent) |
4892 | mem_cgroup_put(parent); | 5998 | mem_cgroup_put(parent); |
4893 | } | 5999 | } |
@@ -4994,6 +6100,8 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
4994 | if (parent && parent->use_hierarchy) { | 6100 | if (parent && parent->use_hierarchy) { |
4995 | res_counter_init(&memcg->res, &parent->res); | 6101 | res_counter_init(&memcg->res, &parent->res); |
4996 | res_counter_init(&memcg->memsw, &parent->memsw); | 6102 | res_counter_init(&memcg->memsw, &parent->memsw); |
6103 | res_counter_init(&memcg->kmem, &parent->kmem); | ||
6104 | |||
4997 | /* | 6105 | /* |
4998 | * We increment refcnt of the parent to ensure that we can | 6106 | * We increment refcnt of the parent to ensure that we can |
4999 | * safely access it on res_counter_charge/uncharge. | 6107 | * safely access it on res_counter_charge/uncharge. |
@@ -5004,6 +6112,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) | |||
5004 | } else { | 6112 | } else { |
5005 | res_counter_init(&memcg->res, NULL); | 6113 | res_counter_init(&memcg->res, NULL); |
5006 | res_counter_init(&memcg->memsw, NULL); | 6114 | res_counter_init(&memcg->memsw, NULL); |
6115 | res_counter_init(&memcg->kmem, NULL); | ||
5007 | /* | 6116 | /* |
5008 | * Deeper hierachy with use_hierarchy == false doesn't make | 6117 | * Deeper hierachy with use_hierarchy == false doesn't make |
5009 | * much sense so let cgroup subsystem know about this | 6118 | * much sense so let cgroup subsystem know about this |
@@ -5043,6 +6152,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont) | |||
5043 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); | 6152 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); |
5044 | 6153 | ||
5045 | mem_cgroup_reparent_charges(memcg); | 6154 | mem_cgroup_reparent_charges(memcg); |
6155 | mem_cgroup_destroy_all_caches(memcg); | ||
5046 | } | 6156 | } |
5047 | 6157 | ||
5048 | static void mem_cgroup_css_free(struct cgroup *cont) | 6158 | static void mem_cgroup_css_free(struct cgroup *cont) |
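The kmem.* files added above are the whole user-visible surface of this change. As a minimal sketch of the ordering rule enforced by memcg_update_kmem_limit() - the mount point, group name and PID below are invented for the example, not taken from the patch:

	/*
	 * Hypothetical userspace sequence: the kmem limit must be set while
	 * the group is still empty.  Once a task has joined (or children
	 * exist under use_hierarchy), switching the group from unlimited to
	 * limited is rejected with EBUSY; resizing an already-set limit
	 * remains allowed.
	 */
	#include <stdio.h>

	static int write_val(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fprintf(f, "%s\n", val);
		return fclose(f);
	}

	int main(void)
	{
		/* 1) limit kernel memory before any task is attached */
		write_val("/sys/fs/cgroup/memory/g1/memory.kmem.limit_in_bytes",
			  "67108864");	/* 64M, example value */
		/* 2) only then move a task in; its kmem is accounted from here on */
		write_val("/sys/fs/cgroup/memory/g1/tasks", "1234");
		return 0;
	}
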
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 962e353aa86f..d04ed87bfacb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c | |||
@@ -590,18 +590,21 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, | |||
590 | } | 590 | } |
591 | 591 | ||
592 | #ifdef CONFIG_MOVABLE_NODE | 592 | #ifdef CONFIG_MOVABLE_NODE |
593 | /* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ | 593 | /* |
594 | * When CONFIG_MOVABLE_NODE, we permit onlining of a node which doesn't have | ||
595 | * normal memory. | ||
596 | */ | ||
594 | static bool can_online_high_movable(struct zone *zone) | 597 | static bool can_online_high_movable(struct zone *zone) |
595 | { | 598 | { |
596 | return true; | 599 | return true; |
597 | } | 600 | } |
598 | #else /* #ifdef CONFIG_MOVABLE_NODE */ | 601 | #else /* CONFIG_MOVABLE_NODE */ |
599 | /* ensure every online node has NORMAL memory */ | 602 | /* ensure every online node has NORMAL memory */ |
600 | static bool can_online_high_movable(struct zone *zone) | 603 | static bool can_online_high_movable(struct zone *zone) |
601 | { | 604 | { |
602 | return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); | 605 | return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); |
603 | } | 606 | } |
604 | #endif /* #ifdef CONFIG_MOVABLE_NODE */ | 607 | #endif /* CONFIG_MOVABLE_NODE */ |
605 | 608 | ||
606 | /* check which state of node_states will be changed when online memory */ | 609 | /* check which state of node_states will be changed when online memory */ |
607 | static void node_states_check_changes_online(unsigned long nr_pages, | 610 | static void node_states_check_changes_online(unsigned long nr_pages, |
@@ -1112,12 +1115,15 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) | |||
1112 | } | 1115 | } |
1113 | 1116 | ||
1114 | #ifdef CONFIG_MOVABLE_NODE | 1117 | #ifdef CONFIG_MOVABLE_NODE |
1115 | /* when CONFIG_MOVABLE_NODE, we allow online node don't have normal memory */ | 1118 | /* |
1119 | * When CONFIG_MOVABLE_NODE, we permit offlining of a node which doesn't have | ||
1120 | * normal memory. | ||
1121 | */ | ||
1116 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | 1122 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) |
1117 | { | 1123 | { |
1118 | return true; | 1124 | return true; |
1119 | } | 1125 | } |
1120 | #else /* #ifdef CONFIG_MOVABLE_NODE */ | 1126 | #else /* CONFIG_MOVABLE_NODE */ |
1121 | /* ensure the node has NORMAL memory if it is still online */ | 1127 | /* ensure the node has NORMAL memory if it is still online */ |
1122 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | 1128 | static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) |
1123 | { | 1129 | { |
@@ -1141,7 +1147,7 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) | |||
1141 | */ | 1147 | */ |
1142 | return present_pages == 0; | 1148 | return present_pages == 0; |
1143 | } | 1149 | } |
1144 | #endif /* #ifdef CONFIG_MOVABLE_NODE */ | 1150 | #endif /* CONFIG_MOVABLE_NODE */ |
1145 | 1151 | ||
1146 | /* check which state of node_states will be changed when offline memory */ | 1152 | /* check which state of node_states will be changed when offline memory */ |
1147 | static void node_states_check_changes_offline(unsigned long nr_pages, | 1153 | static void node_states_check_changes_offline(unsigned long nr_pages, |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 3dca970367db..94722a4d6b43 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
@@ -114,7 +114,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
114 | 114 | ||
115 | #ifdef CONFIG_NUMA_BALANCING | 115 | #ifdef CONFIG_NUMA_BALANCING |
116 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | 116 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, |
117 | pmd_t *pmd) | 117 | pmd_t *pmd) |
118 | { | 118 | { |
119 | spin_lock(&mm->page_table_lock); | 119 | spin_lock(&mm->page_table_lock); |
120 | set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); | 120 | set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); |
@@ -122,15 +122,15 @@ static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | |||
122 | } | 122 | } |
123 | #else | 123 | #else |
124 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, | 124 | static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, |
125 | pmd_t *pmd) | 125 | pmd_t *pmd) |
126 | { | 126 | { |
127 | BUG(); | 127 | BUG(); |
128 | } | 128 | } |
129 | #endif /* CONFIG_NUMA_BALANCING */ | 129 | #endif /* CONFIG_NUMA_BALANCING */ |
130 | 130 | ||
131 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, | 131 | static inline unsigned long change_pmd_range(struct vm_area_struct *vma, |
132 | unsigned long addr, unsigned long end, pgprot_t newprot, | 132 | pud_t *pud, unsigned long addr, unsigned long end, |
133 | int dirty_accountable, int prot_numa) | 133 | pgprot_t newprot, int dirty_accountable, int prot_numa) |
134 | { | 134 | { |
135 | pmd_t *pmd; | 135 | pmd_t *pmd; |
136 | unsigned long next; | 136 | unsigned long next; |
@@ -143,7 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * | |||
143 | if (pmd_trans_huge(*pmd)) { | 143 | if (pmd_trans_huge(*pmd)) { |
144 | if (next - addr != HPAGE_PMD_SIZE) | 144 | if (next - addr != HPAGE_PMD_SIZE) |
145 | split_huge_page_pmd(vma, addr, pmd); | 145 | split_huge_page_pmd(vma, addr, pmd); |
146 | else if (change_huge_pmd(vma, pmd, addr, newprot, prot_numa)) { | 146 | else if (change_huge_pmd(vma, pmd, addr, newprot, |
147 | prot_numa)) { | ||
147 | pages += HPAGE_PMD_NR; | 148 | pages += HPAGE_PMD_NR; |
148 | continue; | 149 | continue; |
149 | } | 150 | } |
@@ -167,9 +168,9 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t * | |||
167 | return pages; | 168 | return pages; |
168 | } | 169 | } |
169 | 170 | ||
170 | static inline unsigned long change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, | 171 | static inline unsigned long change_pud_range(struct vm_area_struct *vma, |
171 | unsigned long addr, unsigned long end, pgprot_t newprot, | 172 | pgd_t *pgd, unsigned long addr, unsigned long end, |
172 | int dirty_accountable, int prot_numa) | 173 | pgprot_t newprot, int dirty_accountable, int prot_numa) |
173 | { | 174 | { |
174 | pud_t *pud; | 175 | pud_t *pud; |
175 | unsigned long next; | 176 | unsigned long next; |
@@ -304,7 +305,8 @@ success: | |||
304 | dirty_accountable = 1; | 305 | dirty_accountable = 1; |
305 | } | 306 | } |
306 | 307 | ||
307 | change_protection(vma, start, end, vma->vm_page_prot, dirty_accountable, 0); | 308 | change_protection(vma, start, end, vma->vm_page_prot, |
309 | dirty_accountable, 0); | ||
308 | 310 | ||
309 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); | 311 | vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); |
310 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); | 312 | vm_stat_account(mm, newflags, vma->vm_file, nrpages); |
@@ -361,8 +363,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
361 | error = -EINVAL; | 363 | error = -EINVAL; |
362 | if (!(vma->vm_flags & VM_GROWSDOWN)) | 364 | if (!(vma->vm_flags & VM_GROWSDOWN)) |
363 | goto out; | 365 | goto out; |
364 | } | 366 | } else { |
365 | else { | ||
366 | if (vma->vm_start > start) | 367 | if (vma->vm_start > start) |
367 | goto out; | 368 | goto out; |
368 | if (unlikely(grows & PROT_GROWSUP)) { | 369 | if (unlikely(grows & PROT_GROWSUP)) { |
@@ -378,9 +379,10 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, | |||
378 | for (nstart = start ; ; ) { | 379 | for (nstart = start ; ; ) { |
379 | unsigned long newflags; | 380 | unsigned long newflags; |
380 | 381 | ||
381 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ | 382 | /* Here we know that vma->vm_start <= nstart < vma->vm_end. */ |
382 | 383 | ||
383 | newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); | 384 | newflags = vm_flags; |
385 | newflags |= (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC)); | ||
384 | 386 | ||
385 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ | 387 | /* newflags >> 4 shift VM_MAY% in place of VM_% */ |
386 | if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { | 388 | if ((newflags & ~(newflags >> 4)) & (VM_READ | VM_WRITE | VM_EXEC)) { |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d037c8bc1512..2ad2ad168efe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -371,8 +371,7 @@ static int destroy_compound_page(struct page *page, unsigned long order) | |||
371 | int nr_pages = 1 << order; | 371 | int nr_pages = 1 << order; |
372 | int bad = 0; | 372 | int bad = 0; |
373 | 373 | ||
374 | if (unlikely(compound_order(page) != order) || | 374 | if (unlikely(compound_order(page) != order)) { |
375 | unlikely(!PageHead(page))) { | ||
376 | bad_page(page); | 375 | bad_page(page); |
377 | bad++; | 376 | bad++; |
378 | } | 377 | } |
@@ -2613,6 +2612,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2613 | int migratetype = allocflags_to_migratetype(gfp_mask); | 2612 | int migratetype = allocflags_to_migratetype(gfp_mask); |
2614 | unsigned int cpuset_mems_cookie; | 2613 | unsigned int cpuset_mems_cookie; |
2615 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; | 2614 | int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET; |
2615 | struct mem_cgroup *memcg = NULL; | ||
2616 | 2616 | ||
2617 | gfp_mask &= gfp_allowed_mask; | 2617 | gfp_mask &= gfp_allowed_mask; |
2618 | 2618 | ||
@@ -2631,6 +2631,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, | |||
2631 | if (unlikely(!zonelist->_zonerefs->zone)) | 2631 | if (unlikely(!zonelist->_zonerefs->zone)) |
2632 | return NULL; | 2632 | return NULL; |
2633 | 2633 | ||
2634 | /* | ||
2635 | * Will only have any effect when __GFP_KMEMCG is set. This is | ||
2636 | * verified in the (always inline) callee | ||
2637 | */ | ||
2638 | if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) | ||
2639 | return NULL; | ||
2640 | |||
2634 | retry_cpuset: | 2641 | retry_cpuset: |
2635 | cpuset_mems_cookie = get_mems_allowed(); | 2642 | cpuset_mems_cookie = get_mems_allowed(); |
2636 | 2643 | ||
@@ -2666,6 +2673,8 @@ out: | |||
2666 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) | 2673 | if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) |
2667 | goto retry_cpuset; | 2674 | goto retry_cpuset; |
2668 | 2675 | ||
2676 | memcg_kmem_commit_charge(page, memcg, order); | ||
2677 | |||
2669 | return page; | 2678 | return page; |
2670 | } | 2679 | } |
2671 | EXPORT_SYMBOL(__alloc_pages_nodemask); | 2680 | EXPORT_SYMBOL(__alloc_pages_nodemask); |
@@ -2718,6 +2727,31 @@ void free_pages(unsigned long addr, unsigned int order) | |||
2718 | 2727 | ||
2719 | EXPORT_SYMBOL(free_pages); | 2728 | EXPORT_SYMBOL(free_pages); |
2720 | 2729 | ||
2730 | /* | ||
2731 | * __free_memcg_kmem_pages and free_memcg_kmem_pages will free | ||
2732 | * pages allocated with __GFP_KMEMCG. | ||
2733 | * | ||
2734 | * Those pages are accounted to a particular memcg, embedded in the | ||
2735 | * corresponding page_cgroup. To avoid adding a hit in the allocator to search | ||
2736 | * for that information only to find out that it is NULL for users who have no | ||
2737 | * interest in that whatsoever, we provide these functions. | ||
2738 | * | ||
2739 | * The caller knows better which flags it relies on. | ||
2740 | */ | ||
2741 | void __free_memcg_kmem_pages(struct page *page, unsigned int order) | ||
2742 | { | ||
2743 | memcg_kmem_uncharge_pages(page, order); | ||
2744 | __free_pages(page, order); | ||
2745 | } | ||
2746 | |||
2747 | void free_memcg_kmem_pages(unsigned long addr, unsigned int order) | ||
2748 | { | ||
2749 | if (addr != 0) { | ||
2750 | VM_BUG_ON(!virt_addr_valid((void *)addr)); | ||
2751 | __free_memcg_kmem_pages(virt_to_page((void *)addr), order); | ||
2752 | } | ||
2753 | } | ||
2754 | |||
2721 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) | 2755 | static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size) |
2722 | { | 2756 | { |
2723 | if (addr) { | 2757 | if (addr) { |
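A hypothetical in-kernel caller pairing the helpers added above might look as follows; the wrapper names are made up for illustration, and __GFP_KMEMCG itself is introduced earlier in this series:

	#include <linux/gfp.h>
	#include <linux/mm.h>

	/* Allocation side: the extra flag makes __alloc_pages_nodemask()
	 * charge the current memcg via memcg_kmem_newpage_charge() and
	 * memcg_kmem_commit_charge(). */
	static void *example_alloc_tracked(unsigned int order)
	{
		struct page *page = alloc_pages(GFP_KERNEL | __GFP_KMEMCG, order);

		return page ? page_address(page) : NULL;
	}

	/* Free side: must go through the memcg-aware helper so the charge
	 * taken at allocation time is dropped before the pages are freed. */
	static void example_free_tracked(void *addr, unsigned int order)
	{
		free_memcg_kmem_pages((unsigned long)addr, order);
	}
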
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -87,7 +87,6 @@ | |||
87 | */ | 87 | */ |
88 | 88 | ||
89 | #include <linux/slab.h> | 89 | #include <linux/slab.h> |
90 | #include "slab.h" | ||
91 | #include <linux/mm.h> | 90 | #include <linux/mm.h> |
92 | #include <linux/poison.h> | 91 | #include <linux/poison.h> |
93 | #include <linux/swap.h> | 92 | #include <linux/swap.h> |
@@ -128,6 +127,8 @@ | |||
128 | 127 | ||
129 | #include "internal.h" | 128 | #include "internal.h" |
130 | 129 | ||
130 | #include "slab.h" | ||
131 | |||
131 | /* | 132 | /* |
132 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. | 133 | * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. |
133 | * 0 for faster, smaller code (especially in the critical paths). | 134 | * 0 for faster, smaller code (especially in the critical paths). |
@@ -641,6 +642,26 @@ static void init_node_lock_keys(int q) | |||
641 | } | 642 | } |
642 | } | 643 | } |
643 | 644 | ||
645 | static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) | ||
646 | { | ||
647 | struct kmem_list3 *l3; | ||
648 | l3 = cachep->nodelists[q]; | ||
649 | if (!l3) | ||
650 | return; | ||
651 | |||
652 | slab_set_lock_classes(cachep, &on_slab_l3_key, | ||
653 | &on_slab_alc_key, q); | ||
654 | } | ||
655 | |||
656 | static inline void on_slab_lock_classes(struct kmem_cache *cachep) | ||
657 | { | ||
658 | int node; | ||
659 | |||
660 | VM_BUG_ON(OFF_SLAB(cachep)); | ||
661 | for_each_node(node) | ||
662 | on_slab_lock_classes_node(cachep, node); | ||
663 | } | ||
664 | |||
644 | static inline void init_lock_keys(void) | 665 | static inline void init_lock_keys(void) |
645 | { | 666 | { |
646 | int node; | 667 | int node; |
@@ -657,6 +678,14 @@ static inline void init_lock_keys(void) | |||
657 | { | 678 | { |
658 | } | 679 | } |
659 | 680 | ||
681 | static inline void on_slab_lock_classes(struct kmem_cache *cachep) | ||
682 | { | ||
683 | } | ||
684 | |||
685 | static inline void on_slab_lock_classes_node(struct kmem_cache *cachep, int node) | ||
686 | { | ||
687 | } | ||
688 | |||
660 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) | 689 | static void slab_set_debugobj_lock_classes_node(struct kmem_cache *cachep, int node) |
661 | { | 690 | { |
662 | } | 691 | } |
@@ -1385,6 +1414,9 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1385 | free_alien_cache(alien); | 1414 | free_alien_cache(alien); |
1386 | if (cachep->flags & SLAB_DEBUG_OBJECTS) | 1415 | if (cachep->flags & SLAB_DEBUG_OBJECTS) |
1387 | slab_set_debugobj_lock_classes_node(cachep, node); | 1416 | slab_set_debugobj_lock_classes_node(cachep, node); |
1417 | else if (!OFF_SLAB(cachep) && | ||
1418 | !(cachep->flags & SLAB_DESTROY_BY_RCU)) | ||
1419 | on_slab_lock_classes_node(cachep, node); | ||
1388 | } | 1420 | } |
1389 | init_node_lock_keys(node); | 1421 | init_node_lock_keys(node); |
1390 | 1422 | ||
@@ -1863,6 +1895,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) | |||
1863 | if (page->pfmemalloc) | 1895 | if (page->pfmemalloc) |
1864 | SetPageSlabPfmemalloc(page + i); | 1896 | SetPageSlabPfmemalloc(page + i); |
1865 | } | 1897 | } |
1898 | memcg_bind_pages(cachep, cachep->gfporder); | ||
1866 | 1899 | ||
1867 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { | 1900 | if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { |
1868 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); | 1901 | kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); |
@@ -1899,9 +1932,11 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) | |||
1899 | __ClearPageSlab(page); | 1932 | __ClearPageSlab(page); |
1900 | page++; | 1933 | page++; |
1901 | } | 1934 | } |
1935 | |||
1936 | memcg_release_pages(cachep, cachep->gfporder); | ||
1902 | if (current->reclaim_state) | 1937 | if (current->reclaim_state) |
1903 | current->reclaim_state->reclaimed_slab += nr_freed; | 1938 | current->reclaim_state->reclaimed_slab += nr_freed; |
1904 | free_pages((unsigned long)addr, cachep->gfporder); | 1939 | free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); |
1905 | } | 1940 | } |
1906 | 1941 | ||
1907 | static void kmem_rcu_free(struct rcu_head *head) | 1942 | static void kmem_rcu_free(struct rcu_head *head) |
@@ -2489,7 +2524,8 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) | |||
2489 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); | 2524 | WARN_ON_ONCE(flags & SLAB_DESTROY_BY_RCU); |
2490 | 2525 | ||
2491 | slab_set_debugobj_lock_classes(cachep); | 2526 | slab_set_debugobj_lock_classes(cachep); |
2492 | } | 2527 | } else if (!OFF_SLAB(cachep) && !(flags & SLAB_DESTROY_BY_RCU)) |
2528 | on_slab_lock_classes(cachep); | ||
2493 | 2529 | ||
2494 | return 0; | 2530 | return 0; |
2495 | } | 2531 | } |
@@ -3453,6 +3489,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3453 | if (slab_should_failslab(cachep, flags)) | 3489 | if (slab_should_failslab(cachep, flags)) |
3454 | return NULL; | 3490 | return NULL; |
3455 | 3491 | ||
3492 | cachep = memcg_kmem_get_cache(cachep, flags); | ||
3493 | |||
3456 | cache_alloc_debugcheck_before(cachep, flags); | 3494 | cache_alloc_debugcheck_before(cachep, flags); |
3457 | local_irq_save(save_flags); | 3495 | local_irq_save(save_flags); |
3458 | 3496 | ||
@@ -3538,6 +3576,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller) | |||
3538 | if (slab_should_failslab(cachep, flags)) | 3576 | if (slab_should_failslab(cachep, flags)) |
3539 | return NULL; | 3577 | return NULL; |
3540 | 3578 | ||
3579 | cachep = memcg_kmem_get_cache(cachep, flags); | ||
3580 | |||
3541 | cache_alloc_debugcheck_before(cachep, flags); | 3581 | cache_alloc_debugcheck_before(cachep, flags); |
3542 | local_irq_save(save_flags); | 3582 | local_irq_save(save_flags); |
3543 | objp = __do_cache_alloc(cachep, flags); | 3583 | objp = __do_cache_alloc(cachep, flags); |
@@ -3851,6 +3891,9 @@ EXPORT_SYMBOL(__kmalloc); | |||
3851 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) | 3891 | void kmem_cache_free(struct kmem_cache *cachep, void *objp) |
3852 | { | 3892 | { |
3853 | unsigned long flags; | 3893 | unsigned long flags; |
3894 | cachep = cache_from_obj(cachep, objp); | ||
3895 | if (!cachep) | ||
3896 | return; | ||
3854 | 3897 | ||
3855 | local_irq_save(flags); | 3898 | local_irq_save(flags); |
3856 | debug_check_no_locks_freed(objp, cachep->object_size); | 3899 | debug_check_no_locks_freed(objp, cachep->object_size); |
@@ -3998,7 +4041,7 @@ static void do_ccupdate_local(void *info) | |||
3998 | } | 4041 | } |
3999 | 4042 | ||
4000 | /* Always called with the slab_mutex held */ | 4043 | /* Always called with the slab_mutex held */ |
4001 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | 4044 | static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, |
4002 | int batchcount, int shared, gfp_t gfp) | 4045 | int batchcount, int shared, gfp_t gfp) |
4003 | { | 4046 | { |
4004 | struct ccupdate_struct *new; | 4047 | struct ccupdate_struct *new; |
@@ -4041,12 +4084,49 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
4041 | return alloc_kmemlist(cachep, gfp); | 4084 | return alloc_kmemlist(cachep, gfp); |
4042 | } | 4085 | } |
4043 | 4086 | ||
4087 | static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | ||
4088 | int batchcount, int shared, gfp_t gfp) | ||
4089 | { | ||
4090 | int ret; | ||
4091 | struct kmem_cache *c = NULL; | ||
4092 | int i = 0; | ||
4093 | |||
4094 | ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | ||
4095 | |||
4096 | if (slab_state < FULL) | ||
4097 | return ret; | ||
4098 | |||
4099 | if ((ret < 0) || !is_root_cache(cachep)) | ||
4100 | return ret; | ||
4101 | |||
4102 | VM_BUG_ON(!mutex_is_locked(&slab_mutex)); | ||
4103 | for_each_memcg_cache_index(i) { | ||
4104 | c = cache_from_memcg(cachep, i); | ||
4105 | if (c) | ||
4106 | /* return value determined by the parent cache only */ | ||
4107 | __do_tune_cpucache(c, limit, batchcount, shared, gfp); | ||
4108 | } | ||
4109 | |||
4110 | return ret; | ||
4111 | } | ||
4112 | |||
4044 | /* Called with slab_mutex held always */ | 4113 | /* Called with slab_mutex held always */ |
4045 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | 4114 | static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) |
4046 | { | 4115 | { |
4047 | int err; | 4116 | int err; |
4048 | int limit, shared; | 4117 | int limit = 0; |
4118 | int shared = 0; | ||
4119 | int batchcount = 0; | ||
4120 | |||
4121 | if (!is_root_cache(cachep)) { | ||
4122 | struct kmem_cache *root = memcg_root_cache(cachep); | ||
4123 | limit = root->limit; | ||
4124 | shared = root->shared; | ||
4125 | batchcount = root->batchcount; | ||
4126 | } | ||
4049 | 4127 | ||
4128 | if (limit && shared && batchcount) | ||
4129 | goto skip_setup; | ||
4050 | /* | 4130 | /* |
4051 | * The head array serves three purposes: | 4131 | * The head array serves three purposes: |
4052 | * - create a LIFO ordering, i.e. return objects that are cache-warm | 4132 | * - create a LIFO ordering, i.e. return objects that are cache-warm |
@@ -4088,7 +4168,9 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) | |||
4088 | if (limit > 32) | 4168 | if (limit > 32) |
4089 | limit = 32; | 4169 | limit = 32; |
4090 | #endif | 4170 | #endif |
4091 | err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp); | 4171 | batchcount = (limit + 1) / 2; |
4172 | skip_setup: | ||
4173 | err = do_tune_cpucache(cachep, limit, batchcount, shared, gfp); | ||
4092 | if (err) | 4174 | if (err) |
4093 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", | 4175 | printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n", |
4094 | cachep->name, -err); | 4176 | cachep->name, -err); |
diff --git a/mm/slab.h b/mm/slab.h --- a/mm/slab.h +++ b/mm/slab.h | |||
@@ -43,12 +43,15 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, | |||
43 | extern void create_boot_cache(struct kmem_cache *, const char *name, | 43 | extern void create_boot_cache(struct kmem_cache *, const char *name, |
44 | size_t size, unsigned long flags); | 44 | size_t size, unsigned long flags); |
45 | 45 | ||
46 | struct mem_cgroup; | ||
46 | #ifdef CONFIG_SLUB | 47 | #ifdef CONFIG_SLUB |
47 | struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | 48 | struct kmem_cache * |
48 | size_t align, unsigned long flags, void (*ctor)(void *)); | 49 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, |
50 | size_t align, unsigned long flags, void (*ctor)(void *)); | ||
49 | #else | 51 | #else |
50 | static inline struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | 52 | static inline struct kmem_cache * |
51 | size_t align, unsigned long flags, void (*ctor)(void *)) | 53 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, |
54 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
52 | { return NULL; } | 55 | { return NULL; } |
53 | #endif | 56 | #endif |
54 | 57 | ||
@@ -100,4 +103,130 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo); | |||
100 | void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); | 103 | void slabinfo_show_stats(struct seq_file *m, struct kmem_cache *s); |
101 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, | 104 | ssize_t slabinfo_write(struct file *file, const char __user *buffer, |
102 | size_t count, loff_t *ppos); | 105 | size_t count, loff_t *ppos); |
106 | |||
107 | #ifdef CONFIG_MEMCG_KMEM | ||
108 | static inline bool is_root_cache(struct kmem_cache *s) | ||
109 | { | ||
110 | return !s->memcg_params || s->memcg_params->is_root_cache; | ||
111 | } | ||
112 | |||
113 | static inline bool cache_match_memcg(struct kmem_cache *cachep, | ||
114 | struct mem_cgroup *memcg) | ||
115 | { | ||
116 | return (is_root_cache(cachep) && !memcg) || | ||
117 | (cachep->memcg_params->memcg == memcg); | ||
118 | } | ||
119 | |||
120 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | ||
121 | { | ||
122 | if (!is_root_cache(s)) | ||
123 | atomic_add(1 << order, &s->memcg_params->nr_pages); | ||
124 | } | ||
125 | |||
126 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | ||
127 | { | ||
128 | if (is_root_cache(s)) | ||
129 | return; | ||
130 | |||
131 | if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) | ||
132 | mem_cgroup_destroy_cache(s); | ||
133 | } | ||
134 | |||
135 | static inline bool slab_equal_or_root(struct kmem_cache *s, | ||
136 | struct kmem_cache *p) | ||
137 | { | ||
138 | return (p == s) || | ||
139 | (s->memcg_params && (p == s->memcg_params->root_cache)); | ||
140 | } | ||
141 | |||
142 | /* | ||
143 | * We use suffixes to the name in memcg because we can't have caches | ||
144 | * created in the system with the same name. But when we print them | ||
145 | * locally, it is better to refer to them by the base name. | ||
146 | */ | ||
147 | static inline const char *cache_name(struct kmem_cache *s) | ||
148 | { | ||
149 | if (!is_root_cache(s)) | ||
150 | return s->memcg_params->root_cache->name; | ||
151 | return s->name; | ||
152 | } | ||
153 | |||
154 | static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) | ||
155 | { | ||
156 | return s->memcg_params->memcg_caches[idx]; | ||
157 | } | ||
158 | |||
159 | static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | ||
160 | { | ||
161 | if (is_root_cache(s)) | ||
162 | return s; | ||
163 | return s->memcg_params->root_cache; | ||
164 | } | ||
165 | #else | ||
166 | static inline bool is_root_cache(struct kmem_cache *s) | ||
167 | { | ||
168 | return true; | ||
169 | } | ||
170 | |||
171 | static inline bool cache_match_memcg(struct kmem_cache *cachep, | ||
172 | struct mem_cgroup *memcg) | ||
173 | { | ||
174 | return true; | ||
175 | } | ||
176 | |||
177 | static inline void memcg_bind_pages(struct kmem_cache *s, int order) | ||
178 | { | ||
179 | } | ||
180 | |||
181 | static inline void memcg_release_pages(struct kmem_cache *s, int order) | ||
182 | { | ||
183 | } | ||
184 | |||
185 | static inline bool slab_equal_or_root(struct kmem_cache *s, | ||
186 | struct kmem_cache *p) | ||
187 | { | ||
188 | return true; | ||
189 | } | ||
190 | |||
191 | static inline const char *cache_name(struct kmem_cache *s) | ||
192 | { | ||
193 | return s->name; | ||
194 | } | ||
195 | |||
196 | static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) | ||
197 | { | ||
198 | return NULL; | ||
199 | } | ||
200 | |||
201 | static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) | ||
202 | { | ||
203 | return s; | ||
204 | } | ||
205 | #endif | ||
206 | |||
207 | static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) | ||
208 | { | ||
209 | struct kmem_cache *cachep; | ||
210 | struct page *page; | ||
211 | |||
212 | /* | ||
213 | * When kmemcg is not being used, both assignments should return the | ||
214 | * same value, but we don't want to pay the assignment price in that | ||
215 | * case. If it is not compiled in, the compiler should be smart enough | ||
216 | * to not do even the assignment. In that case, slab_equal_or_root | ||
217 | * will also be a constant. | ||
218 | */ | ||
219 | if (!memcg_kmem_enabled() && !unlikely(s->flags & SLAB_DEBUG_FREE)) | ||
220 | return s; | ||
221 | |||
222 | page = virt_to_head_page(x); | ||
223 | cachep = page->slab_cache; | ||
224 | if (slab_equal_or_root(cachep, s)) | ||
225 | return cachep; | ||
226 | |||
227 | pr_err("%s: Wrong slab cache. %s but object is from %s\n", | ||
228 | __FUNCTION__, cachep->name, s->name); | ||
229 | WARN_ON_ONCE(1); | ||
230 | return s; | ||
231 | } | ||
103 | #endif | 232 | #endif |
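Taken together with the memcg_kmem_get_cache() hooks in the slab.c hunks above, cache_from_obj() closes the following round trip; the cache and object names here are invented for illustration:

	#include <linux/slab.h>

	struct foo { int x; };			/* made-up example object */
	static struct kmem_cache *foo_cache;	/* root cache, created elsewhere */

	static struct foo *foo_alloc(void)
	{
		/* The allocator may transparently substitute the current
		 * memcg's child copy of foo_cache via memcg_kmem_get_cache(),
		 * so the object can come from a per-memcg cache. */
		return kmem_cache_alloc(foo_cache, GFP_KERNEL);
	}

	static void foo_free(struct foo *p)
	{
		/* The caller still names the root cache; cache_from_obj()
		 * reads page->slab_cache and redirects the free to the cache
		 * (root or per-memcg child) that actually owns the object. */
		kmem_cache_free(foo_cache, p);
	}
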
diff --git a/mm/slab_common.c b/mm/slab_common.c index a8e76d79ee65..3f3cd97d3fdf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c | |||
@@ -18,6 +18,7 @@ | |||
18 | #include <asm/cacheflush.h> | 18 | #include <asm/cacheflush.h> |
19 | #include <asm/tlbflush.h> | 19 | #include <asm/tlbflush.h> |
20 | #include <asm/page.h> | 20 | #include <asm/page.h> |
21 | #include <linux/memcontrol.h> | ||
21 | 22 | ||
22 | #include "slab.h" | 23 | #include "slab.h" |
23 | 24 | ||
@@ -27,7 +28,8 @@ DEFINE_MUTEX(slab_mutex); | |||
27 | struct kmem_cache *kmem_cache; | 28 | struct kmem_cache *kmem_cache; |
28 | 29 | ||
29 | #ifdef CONFIG_DEBUG_VM | 30 | #ifdef CONFIG_DEBUG_VM |
30 | static int kmem_cache_sanity_check(const char *name, size_t size) | 31 | static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name, |
32 | size_t size) | ||
31 | { | 33 | { |
32 | struct kmem_cache *s = NULL; | 34 | struct kmem_cache *s = NULL; |
33 | 35 | ||
@@ -53,7 +55,13 @@ static int kmem_cache_sanity_check(const char *name, size_t size) | |||
53 | continue; | 55 | continue; |
54 | } | 56 | } |
55 | 57 | ||
56 | if (!strcmp(s->name, name)) { | 58 | /* |
59 | * For simplicity, we won't check this in the list of memcg | ||
60 | * caches. We have control over memcg naming, and if there | ||
61 | * aren't duplicates in the global list, there won't be any | ||
62 | * duplicates in the memcg lists as well. | ||
63 | */ | ||
64 | if (!memcg && !strcmp(s->name, name)) { | ||
57 | pr_err("%s (%s): Cache name already exists.\n", | 65 | pr_err("%s (%s): Cache name already exists.\n", |
58 | __func__, name); | 66 | __func__, name); |
59 | dump_stack(); | 67 | dump_stack(); |
@@ -66,12 +74,41 @@ static int kmem_cache_sanity_check(const char *name, size_t size) | |||
66 | return 0; | 74 | return 0; |
67 | } | 75 | } |
68 | #else | 76 | #else |
69 | static inline int kmem_cache_sanity_check(const char *name, size_t size) | 77 | static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg, |
78 | const char *name, size_t size) | ||
70 | { | 79 | { |
71 | return 0; | 80 | return 0; |
72 | } | 81 | } |
73 | #endif | 82 | #endif |
74 | 83 | ||
84 | #ifdef CONFIG_MEMCG_KMEM | ||
85 | int memcg_update_all_caches(int num_memcgs) | ||
86 | { | ||
87 | struct kmem_cache *s; | ||
88 | int ret = 0; | ||
89 | mutex_lock(&slab_mutex); | ||
90 | |||
91 | list_for_each_entry(s, &slab_caches, list) { | ||
92 | if (!is_root_cache(s)) | ||
93 | continue; | ||
94 | |||
95 | ret = memcg_update_cache_size(s, num_memcgs); | ||
96 | /* | ||
97 | * See comment in memcontrol.c, memcg_update_cache_size: | ||
98 | * Instead of freeing the memory, we'll just leave the caches | ||
99 | * up to this point in an updated state. | ||
100 | */ | ||
101 | if (ret) | ||
102 | goto out; | ||
103 | } | ||
104 | |||
105 | memcg_update_array_size(num_memcgs); | ||
106 | out: | ||
107 | mutex_unlock(&slab_mutex); | ||
108 | return ret; | ||
109 | } | ||
110 | #endif | ||
111 | |||
75 | /* | 112 | /* |
76 | * Figure out what the alignment of the objects will be given a set of | 113 | * Figure out what the alignment of the objects will be given a set of |
77 | * flags, a user specified alignment and the size of the objects. | 114 | * flags, a user specified alignment and the size of the objects. |
@@ -125,8 +162,10 @@ unsigned long calculate_alignment(unsigned long flags, | |||
125 | * as davem. | 162 | * as davem. |
126 | */ | 163 | */ |
127 | 164 | ||
128 | struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, | 165 | struct kmem_cache * |
129 | unsigned long flags, void (*ctor)(void *)) | 166 | kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size, |
167 | size_t align, unsigned long flags, void (*ctor)(void *), | ||
168 | struct kmem_cache *parent_cache) | ||
130 | { | 169 | { |
131 | struct kmem_cache *s = NULL; | 170 | struct kmem_cache *s = NULL; |
132 | int err = 0; | 171 | int err = 0; |
@@ -134,7 +173,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align | |||
134 | get_online_cpus(); | 173 | get_online_cpus(); |
135 | mutex_lock(&slab_mutex); | 174 | mutex_lock(&slab_mutex); |
136 | 175 | ||
137 | if (!kmem_cache_sanity_check(name, size) == 0) | 176 | if (!kmem_cache_sanity_check(memcg, name, size) == 0) |
138 | goto out_locked; | 177 | goto out_locked; |
139 | 178 | ||
140 | /* | 179 | /* |
@@ -145,7 +184,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align | |||
145 | */ | 184 | */ |
146 | flags &= CACHE_CREATE_MASK; | 185 | flags &= CACHE_CREATE_MASK; |
147 | 186 | ||
148 | s = __kmem_cache_alias(name, size, align, flags, ctor); | 187 | s = __kmem_cache_alias(memcg, name, size, align, flags, ctor); |
149 | if (s) | 188 | if (s) |
150 | goto out_locked; | 189 | goto out_locked; |
151 | 190 | ||
@@ -154,6 +193,13 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align | |||
154 | s->object_size = s->size = size; | 193 | s->object_size = s->size = size; |
155 | s->align = calculate_alignment(flags, align, size); | 194 | s->align = calculate_alignment(flags, align, size); |
156 | s->ctor = ctor; | 195 | s->ctor = ctor; |
196 | |||
197 | if (memcg_register_cache(memcg, s, parent_cache)) { | ||
198 | kmem_cache_free(kmem_cache, s); | ||
199 | err = -ENOMEM; | ||
200 | goto out_locked; | ||
201 | } | ||
202 | |||
157 | s->name = kstrdup(name, GFP_KERNEL); | 203 | s->name = kstrdup(name, GFP_KERNEL); |
158 | if (!s->name) { | 204 | if (!s->name) { |
159 | kmem_cache_free(kmem_cache, s); | 205 | kmem_cache_free(kmem_cache, s); |
@@ -163,10 +209,9 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align | |||
163 | 209 | ||
164 | err = __kmem_cache_create(s, flags); | 210 | err = __kmem_cache_create(s, flags); |
165 | if (!err) { | 211 | if (!err) { |
166 | |||
167 | s->refcount = 1; | 212 | s->refcount = 1; |
168 | list_add(&s->list, &slab_caches); | 213 | list_add(&s->list, &slab_caches); |
169 | 214 | memcg_cache_list_add(memcg, s); | |
170 | } else { | 215 | } else { |
171 | kfree(s->name); | 216 | kfree(s->name); |
172 | kmem_cache_free(kmem_cache, s); | 217 | kmem_cache_free(kmem_cache, s); |
@@ -194,10 +239,20 @@ out_locked: | |||
194 | 239 | ||
195 | return s; | 240 | return s; |
196 | } | 241 | } |
242 | |||
243 | struct kmem_cache * | ||
244 | kmem_cache_create(const char *name, size_t size, size_t align, | ||
245 | unsigned long flags, void (*ctor)(void *)) | ||
246 | { | ||
247 | return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL); | ||
248 | } | ||
197 | EXPORT_SYMBOL(kmem_cache_create); | 249 | EXPORT_SYMBOL(kmem_cache_create); |
198 | 250 | ||
199 | void kmem_cache_destroy(struct kmem_cache *s) | 251 | void kmem_cache_destroy(struct kmem_cache *s) |
200 | { | 252 | { |
253 | /* Destroy all the children caches if we aren't a memcg cache */ | ||
254 | kmem_cache_destroy_memcg_children(s); | ||
255 | |||
201 | get_online_cpus(); | 256 | get_online_cpus(); |
202 | mutex_lock(&slab_mutex); | 257 | mutex_lock(&slab_mutex); |
203 | s->refcount--; | 258 | s->refcount--; |
@@ -209,6 +264,7 @@ void kmem_cache_destroy(struct kmem_cache *s) | |||
209 | if (s->flags & SLAB_DESTROY_BY_RCU) | 264 | if (s->flags & SLAB_DESTROY_BY_RCU) |
210 | rcu_barrier(); | 265 | rcu_barrier(); |
211 | 266 | ||
267 | memcg_release_cache(s); | ||
212 | kfree(s->name); | 268 | kfree(s->name); |
213 | kmem_cache_free(kmem_cache, s); | 269 | kmem_cache_free(kmem_cache, s); |
214 | } else { | 270 | } else { |
@@ -267,7 +323,7 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, | |||
267 | 323 | ||
268 | 324 | ||
269 | #ifdef CONFIG_SLABINFO | 325 | #ifdef CONFIG_SLABINFO |
270 | static void print_slabinfo_header(struct seq_file *m) | 326 | void print_slabinfo_header(struct seq_file *m) |
271 | { | 327 | { |
272 | /* | 328 | /* |
273 | * Output format version, so at least we can change it | 329 | * Output format version, so at least we can change it |
@@ -311,16 +367,43 @@ static void s_stop(struct seq_file *m, void *p) | |||
311 | mutex_unlock(&slab_mutex); | 367 | mutex_unlock(&slab_mutex); |
312 | } | 368 | } |
313 | 369 | ||
314 | static int s_show(struct seq_file *m, void *p) | 370 | static void |
371 | memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) | ||
372 | { | ||
373 | struct kmem_cache *c; | ||
374 | struct slabinfo sinfo; | ||
375 | int i; | ||
376 | |||
377 | if (!is_root_cache(s)) | ||
378 | return; | ||
379 | |||
380 | for_each_memcg_cache_index(i) { | ||
381 | c = cache_from_memcg(s, i); | ||
382 | if (!c) | ||
383 | continue; | ||
384 | |||
385 | memset(&sinfo, 0, sizeof(sinfo)); | ||
386 | get_slabinfo(c, &sinfo); | ||
387 | |||
388 | info->active_slabs += sinfo.active_slabs; | ||
389 | info->num_slabs += sinfo.num_slabs; | ||
390 | info->shared_avail += sinfo.shared_avail; | ||
391 | info->active_objs += sinfo.active_objs; | ||
392 | info->num_objs += sinfo.num_objs; | ||
393 | } | ||
394 | } | ||
395 | |||
396 | int cache_show(struct kmem_cache *s, struct seq_file *m) | ||
315 | { | 397 | { |
316 | struct kmem_cache *s = list_entry(p, struct kmem_cache, list); | ||
317 | struct slabinfo sinfo; | 398 | struct slabinfo sinfo; |
318 | 399 | ||
319 | memset(&sinfo, 0, sizeof(sinfo)); | 400 | memset(&sinfo, 0, sizeof(sinfo)); |
320 | get_slabinfo(s, &sinfo); | 401 | get_slabinfo(s, &sinfo); |
321 | 402 | ||
403 | memcg_accumulate_slabinfo(s, &sinfo); | ||
404 | |||
322 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", | 405 | seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d", |
323 | s->name, sinfo.active_objs, sinfo.num_objs, s->size, | 406 | cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size, |
324 | sinfo.objects_per_slab, (1 << sinfo.cache_order)); | 407 | sinfo.objects_per_slab, (1 << sinfo.cache_order)); |
325 | 408 | ||
326 | seq_printf(m, " : tunables %4u %4u %4u", | 409 | seq_printf(m, " : tunables %4u %4u %4u", |
@@ -332,6 +415,15 @@ static int s_show(struct seq_file *m, void *p) | |||
332 | return 0; | 415 | return 0; |
333 | } | 416 | } |
334 | 417 | ||
418 | static int s_show(struct seq_file *m, void *p) | ||
419 | { | ||
420 | struct kmem_cache *s = list_entry(p, struct kmem_cache, list); | ||
421 | |||
422 | if (!is_root_cache(s)) | ||
423 | return 0; | ||
424 | return cache_show(s, m); | ||
425 | } | ||
426 | |||
335 | /* | 427 | /* |
336 | * slabinfo_op - iterator that generates /proc/slabinfo | 428 | * slabinfo_op - iterator that generates /proc/slabinfo |
337 | * | 429 | * |
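The helpers used above - is_root_cache(), cache_from_memcg(), for_each_memcg_cache_index() and cache_name() - are not visible in this hunk; they are presumably introduced by the mm/slab.h and include/linux/memcontrol.h parts of the series. A rough sketch of the shape they are assumed to have, for orientation only (field and struct names may differ from the real patch):

    /* assumed shape only; the real definitions live elsewhere in the series */
    static inline bool is_root_cache(struct kmem_cache *s)
    {
            return !s->memcg_params || s->memcg_params->is_root_cache;
    }

    static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
    {
            if (!s->memcg_params)
                    return NULL;
            return s->memcg_params->memcg_caches[idx];  /* per-memcg clone, or NULL */
    }

    static inline const char *cache_name(struct kmem_cache *s)
    {
            return s->name;
    }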
diff --git a/mm/slob.c b/mm/slob.c --- a/mm/slob.c +++ b/mm/slob.c | |||
@@ -58,7 +58,6 @@ | |||
58 | 58 | ||
59 | #include <linux/kernel.h> | 59 | #include <linux/kernel.h> |
60 | #include <linux/slab.h> | 60 | #include <linux/slab.h> |
61 | #include "slab.h" | ||
62 | 61 | ||
63 | #include <linux/mm.h> | 62 | #include <linux/mm.h> |
64 | #include <linux/swap.h> /* struct reclaim_state */ | 63 | #include <linux/swap.h> /* struct reclaim_state */ |
@@ -73,6 +72,7 @@ | |||
73 | 72 | ||
74 | #include <linux/atomic.h> | 73 | #include <linux/atomic.h> |
75 | 74 | ||
75 | #include "slab.h" | ||
76 | /* | 76 | /* |
77 | * slob_block has a field 'units', which indicates size of block if +ve, | 77 | * slob_block has a field 'units', which indicates size of block if +ve, |
78 | * or offset of next block if -ve (in SLOB_UNITs). | 78 | * or offset of next block if -ve (in SLOB_UNITs). |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -31,6 +31,7 @@ | |||
31 | #include <linux/fault-inject.h> | 31 | #include <linux/fault-inject.h> |
32 | #include <linux/stacktrace.h> | 32 | #include <linux/stacktrace.h> |
33 | #include <linux/prefetch.h> | 33 | #include <linux/prefetch.h> |
34 | #include <linux/memcontrol.h> | ||
34 | 35 | ||
35 | #include <trace/events/kmem.h> | 36 | #include <trace/events/kmem.h> |
36 | 37 | ||
@@ -200,13 +201,14 @@ enum track_item { TRACK_ALLOC, TRACK_FREE }; | |||
200 | static int sysfs_slab_add(struct kmem_cache *); | 201 | static int sysfs_slab_add(struct kmem_cache *); |
201 | static int sysfs_slab_alias(struct kmem_cache *, const char *); | 202 | static int sysfs_slab_alias(struct kmem_cache *, const char *); |
202 | static void sysfs_slab_remove(struct kmem_cache *); | 203 | static void sysfs_slab_remove(struct kmem_cache *); |
203 | 204 | static void memcg_propagate_slab_attrs(struct kmem_cache *s); | |
204 | #else | 205 | #else |
205 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } | 206 | static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } |
206 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) | 207 | static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) |
207 | { return 0; } | 208 | { return 0; } |
208 | static inline void sysfs_slab_remove(struct kmem_cache *s) { } | 209 | static inline void sysfs_slab_remove(struct kmem_cache *s) { } |
209 | 210 | ||
211 | static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } | ||
210 | #endif | 212 | #endif |
211 | 213 | ||
212 | static inline void stat(const struct kmem_cache *s, enum stat_item si) | 214 | static inline void stat(const struct kmem_cache *s, enum stat_item si) |
@@ -1343,6 +1345,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1343 | void *start; | 1345 | void *start; |
1344 | void *last; | 1346 | void *last; |
1345 | void *p; | 1347 | void *p; |
1348 | int order; | ||
1346 | 1349 | ||
1347 | BUG_ON(flags & GFP_SLAB_BUG_MASK); | 1350 | BUG_ON(flags & GFP_SLAB_BUG_MASK); |
1348 | 1351 | ||
@@ -1351,7 +1354,9 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1351 | if (!page) | 1354 | if (!page) |
1352 | goto out; | 1355 | goto out; |
1353 | 1356 | ||
1357 | order = compound_order(page); | ||
1354 | inc_slabs_node(s, page_to_nid(page), page->objects); | 1358 | inc_slabs_node(s, page_to_nid(page), page->objects); |
1359 | memcg_bind_pages(s, order); | ||
1355 | page->slab_cache = s; | 1360 | page->slab_cache = s; |
1356 | __SetPageSlab(page); | 1361 | __SetPageSlab(page); |
1357 | if (page->pfmemalloc) | 1362 | if (page->pfmemalloc) |
@@ -1360,7 +1365,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) | |||
1360 | start = page_address(page); | 1365 | start = page_address(page); |
1361 | 1366 | ||
1362 | if (unlikely(s->flags & SLAB_POISON)) | 1367 | if (unlikely(s->flags & SLAB_POISON)) |
1363 | memset(start, POISON_INUSE, PAGE_SIZE << compound_order(page)); | 1368 | memset(start, POISON_INUSE, PAGE_SIZE << order); |
1364 | 1369 | ||
1365 | last = start; | 1370 | last = start; |
1366 | for_each_object(p, s, start, page->objects) { | 1371 | for_each_object(p, s, start, page->objects) { |
@@ -1401,10 +1406,12 @@ static void __free_slab(struct kmem_cache *s, struct page *page) | |||
1401 | 1406 | ||
1402 | __ClearPageSlabPfmemalloc(page); | 1407 | __ClearPageSlabPfmemalloc(page); |
1403 | __ClearPageSlab(page); | 1408 | __ClearPageSlab(page); |
1409 | |||
1410 | memcg_release_pages(s, order); | ||
1404 | reset_page_mapcount(page); | 1411 | reset_page_mapcount(page); |
1405 | if (current->reclaim_state) | 1412 | if (current->reclaim_state) |
1406 | current->reclaim_state->reclaimed_slab += pages; | 1413 | current->reclaim_state->reclaimed_slab += pages; |
1407 | __free_pages(page, order); | 1414 | __free_memcg_kmem_pages(page, order); |
1408 | } | 1415 | } |
1409 | 1416 | ||
1410 | #define need_reserve_slab_rcu \ | 1417 | #define need_reserve_slab_rcu \ |
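Freed slab pages now go through __free_memcg_kmem_pages() instead of __free_pages(), so the kmem charge taken when the page was allocated on behalf of a memcg is returned. The helper is expected to come from the mm/page_alloc.c part of this series; a sketch of the assumed pairing:

    /* sketch of the assumed behaviour, not a verbatim copy of the helper */
    void __free_memcg_kmem_pages(struct page *page, unsigned int order)
    {
            memcg_kmem_uncharge_pages(page, order); /* drop the kmem charge, if any */
            __free_pages(page, order);
    }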
@@ -2322,6 +2329,7 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, | |||
2322 | if (slab_pre_alloc_hook(s, gfpflags)) | 2329 | if (slab_pre_alloc_hook(s, gfpflags)) |
2323 | return NULL; | 2330 | return NULL; |
2324 | 2331 | ||
2332 | s = memcg_kmem_get_cache(s, gfpflags); | ||
2325 | redo: | 2333 | redo: |
2326 | 2334 | ||
2327 | /* | 2335 | /* |
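memcg_kmem_get_cache() is the hook that redirects the allocation from the root cache to the per-memcg clone when the current task runs in a kmem-accounted cgroup. Conceptually it behaves roughly as below; every helper name other than the kmem_cache/mem_cgroup types is illustrative, not the real memcontrol.c API:

    /* conceptual sketch only */
    struct kmem_cache *memcg_kmem_get_cache_sketch(struct kmem_cache *root,
                                                   gfp_t gfp)
    {
            struct mem_cgroup *memcg = task_memcg(current);         /* hypothetical */

            if (!memcg || !memcg_kmem_is_accounted(memcg))          /* hypothetical */
                    return root;    /* unaccounted: keep using the root cache */

            /* otherwise pick (or lazily create) this cgroup's private clone */
            return memcg_clone_of_cache(root, memcg);               /* hypothetical */
    }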
@@ -2610,19 +2618,10 @@ redo: | |||
2610 | 2618 | ||
2611 | void kmem_cache_free(struct kmem_cache *s, void *x) | 2619 | void kmem_cache_free(struct kmem_cache *s, void *x) |
2612 | { | 2620 | { |
2613 | struct page *page; | 2621 | s = cache_from_obj(s, x); |
2614 | 2622 | if (!s) | |
2615 | page = virt_to_head_page(x); | ||
2616 | |||
2617 | if (kmem_cache_debug(s) && page->slab_cache != s) { | ||
2618 | pr_err("kmem_cache_free: Wrong slab cache. %s but object" | ||
2619 | " is from %s\n", page->slab_cache->name, s->name); | ||
2620 | WARN_ON_ONCE(1); | ||
2621 | return; | 2623 | return; |
2622 | } | 2624 | slab_free(s, virt_to_head_page(x), x, _RET_IP_); |
2623 | |||
2624 | slab_free(s, page, x, _RET_IP_); | ||
2625 | |||
2626 | trace_kmem_cache_free(_RET_IP_, x); | 2625 | trace_kmem_cache_free(_RET_IP_, x); |
2627 | } | 2626 | } |
2628 | EXPORT_SYMBOL(kmem_cache_free); | 2627 | EXPORT_SYMBOL(kmem_cache_free); |
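The object passed to kmem_cache_free() may have been allocated from a memcg clone while the caller still holds a pointer to the root cache, so the open-coded "wrong slab cache" check is replaced by cache_from_obj(), which resolves the cache the object really belongs to. Its assumed shape (the helper itself is expected in mm/slab.h; note the caller above bails out on NULL):

    /* assumed shape only */
    static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
    {
            struct page *page = virt_to_head_page(x);
            struct kmem_cache *cachep = page->slab_cache;

            /* same cache, or a memcg child of the cache the caller passed in */
            if (cachep == s ||
                (!is_root_cache(cachep) && cachep->memcg_params->root_cache == s))
                    return cachep;

            pr_err("%s: Wrong slab cache. %s but object is from %s\n",
                   __func__, s->name, cachep->name);
            WARN_ON_ONCE(1);
            return NULL;
    }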
@@ -3154,8 +3153,19 @@ int __kmem_cache_shutdown(struct kmem_cache *s) | |||
3154 | { | 3153 | { |
3155 | int rc = kmem_cache_close(s); | 3154 | int rc = kmem_cache_close(s); |
3156 | 3155 | ||
3157 | if (!rc) | 3156 | if (!rc) { |
3157 | /* | ||
3158 | * We use the same locking strategy here as around sysfs_slab_add; see | ||
3159 | * __kmem_cache_create. Because this is pretty much the last | ||
3160 | * operation we do and the lock will be released shortly after | ||
3161 | * that in slab_common.c, we could just move sysfs_slab_remove | ||
3162 | * to a later point in common code. We should do that when we | ||
3163 | * have a common sysfs framework for all allocators. | ||
3164 | */ | ||
3165 | mutex_unlock(&slab_mutex); | ||
3158 | sysfs_slab_remove(s); | 3166 | sysfs_slab_remove(s); |
3167 | mutex_lock(&slab_mutex); | ||
3168 | } | ||
3159 | 3169 | ||
3160 | return rc; | 3170 | return rc; |
3161 | } | 3171 | } |
@@ -3292,7 +3302,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) | |||
3292 | struct page *page; | 3302 | struct page *page; |
3293 | void *ptr = NULL; | 3303 | void *ptr = NULL; |
3294 | 3304 | ||
3295 | flags |= __GFP_COMP | __GFP_NOTRACK; | 3305 | flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; |
3296 | page = alloc_pages_node(node, flags, get_order(size)); | 3306 | page = alloc_pages_node(node, flags, get_order(size)); |
3297 | if (page) | 3307 | if (page) |
3298 | ptr = page_address(page); | 3308 | ptr = page_address(page); |
@@ -3398,7 +3408,7 @@ void kfree(const void *x) | |||
3398 | if (unlikely(!PageSlab(page))) { | 3408 | if (unlikely(!PageSlab(page))) { |
3399 | BUG_ON(!PageCompound(page)); | 3409 | BUG_ON(!PageCompound(page)); |
3400 | kmemleak_free(x); | 3410 | kmemleak_free(x); |
3401 | __free_pages(page, compound_order(page)); | 3411 | __free_memcg_kmem_pages(page, compound_order(page)); |
3402 | return; | 3412 | return; |
3403 | } | 3413 | } |
3404 | slab_free(page->slab_cache, page, object, _RET_IP_); | 3414 | slab_free(page->slab_cache, page, object, _RET_IP_); |
@@ -3786,7 +3796,7 @@ static int slab_unmergeable(struct kmem_cache *s) | |||
3786 | return 0; | 3796 | return 0; |
3787 | } | 3797 | } |
3788 | 3798 | ||
3789 | static struct kmem_cache *find_mergeable(size_t size, | 3799 | static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, |
3790 | size_t align, unsigned long flags, const char *name, | 3800 | size_t align, unsigned long flags, const char *name, |
3791 | void (*ctor)(void *)) | 3801 | void (*ctor)(void *)) |
3792 | { | 3802 | { |
@@ -3822,17 +3832,21 @@ static struct kmem_cache *find_mergeable(size_t size, | |||
3822 | if (s->size - size >= sizeof(void *)) | 3832 | if (s->size - size >= sizeof(void *)) |
3823 | continue; | 3833 | continue; |
3824 | 3834 | ||
3835 | if (!cache_match_memcg(s, memcg)) | ||
3836 | continue; | ||
3837 | |||
3825 | return s; | 3838 | return s; |
3826 | } | 3839 | } |
3827 | return NULL; | 3840 | return NULL; |
3828 | } | 3841 | } |
3829 | 3842 | ||
3830 | struct kmem_cache *__kmem_cache_alias(const char *name, size_t size, | 3843 | struct kmem_cache * |
3831 | size_t align, unsigned long flags, void (*ctor)(void *)) | 3844 | __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, |
3845 | size_t align, unsigned long flags, void (*ctor)(void *)) | ||
3832 | { | 3846 | { |
3833 | struct kmem_cache *s; | 3847 | struct kmem_cache *s; |
3834 | 3848 | ||
3835 | s = find_mergeable(size, align, flags, name, ctor); | 3849 | s = find_mergeable(memcg, size, align, flags, name, ctor); |
3836 | if (s) { | 3850 | if (s) { |
3837 | s->refcount++; | 3851 | s->refcount++; |
3838 | /* | 3852 | /* |
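Slab merging must now also agree on the owning memcg; otherwise a root cache and a per-memcg clone of compatible size could be merged and objects of different cgroups would share pages, defeating the accounting. The cache_match_memcg() test is assumed to look roughly like:

    /* assumed shape: a root cache matches "no memcg", a child matches its owner */
    static inline bool cache_match_memcg(struct kmem_cache *cachep,
                                         struct mem_cgroup *memcg)
    {
            if (is_root_cache(cachep))
                    return memcg == NULL;
            return cachep->memcg_params->memcg == memcg;
    }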
@@ -3863,6 +3877,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) | |||
3863 | if (slab_state <= UP) | 3877 | if (slab_state <= UP) |
3864 | return 0; | 3878 | return 0; |
3865 | 3879 | ||
3880 | memcg_propagate_slab_attrs(s); | ||
3866 | mutex_unlock(&slab_mutex); | 3881 | mutex_unlock(&slab_mutex); |
3867 | err = sysfs_slab_add(s); | 3882 | err = sysfs_slab_add(s); |
3868 | mutex_lock(&slab_mutex); | 3883 | mutex_lock(&slab_mutex); |
@@ -5096,10 +5111,95 @@ static ssize_t slab_attr_store(struct kobject *kobj, | |||
5096 | return -EIO; | 5111 | return -EIO; |
5097 | 5112 | ||
5098 | err = attribute->store(s, buf, len); | 5113 | err = attribute->store(s, buf, len); |
5114 | #ifdef CONFIG_MEMCG_KMEM | ||
5115 | if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { | ||
5116 | int i; | ||
5099 | 5117 | ||
5118 | mutex_lock(&slab_mutex); | ||
5119 | if (s->max_attr_size < len) | ||
5120 | s->max_attr_size = len; | ||
5121 | |||
5122 | /* | ||
5123 | * This is a best effort propagation, so this function's return | ||
5124 | * value will be determined by the parent cache only. This is | ||
5125 | * basically because not all attributes will have well | ||
5126 | * defined semantics for rollbacks - most of the actions will | ||
5127 | * have permanent effects. | ||
5128 | * | ||
5129 | * Returning the error value of any of the children that fail | ||
5130 | * is not 100% well defined, in the sense that users seeing the | ||
5131 | * error code won't be able to know anything about the state of | ||
5132 | * the cache. | ||
5133 | * | ||
5134 | * Only returning the error code for the parent cache at least | ||
5135 | * has well defined semantics. The cache being written to | ||
5136 | * directly either failed or succeeded, in which case we loop | ||
5137 | * through the descendants with best-effort propagation. | ||
5138 | */ | ||
5139 | for_each_memcg_cache_index(i) { | ||
5140 | struct kmem_cache *c = cache_from_memcg(s, i); | ||
5141 | if (c) | ||
5142 | attribute->store(c, buf, len); | ||
5143 | } | ||
5144 | mutex_unlock(&slab_mutex); | ||
5145 | } | ||
5146 | #endif | ||
5100 | return err; | 5147 | return err; |
5101 | } | 5148 | } |
5102 | 5149 | ||
5150 | static void memcg_propagate_slab_attrs(struct kmem_cache *s) | ||
5151 | { | ||
5152 | #ifdef CONFIG_MEMCG_KMEM | ||
5153 | int i; | ||
5154 | char *buffer = NULL; | ||
5155 | |||
5156 | if (!is_root_cache(s)) | ||
5157 | return; | ||
5158 | |||
5159 | /* | ||
5160 | * This means this cache had no attribute written. Therefore, there is | ||
5161 | * no point in copying default values around. | ||
5162 | */ | ||
5163 | if (!s->max_attr_size) | ||
5164 | return; | ||
5165 | |||
5166 | for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { | ||
5167 | char mbuf[64]; | ||
5168 | char *buf; | ||
5169 | struct slab_attribute *attr = to_slab_attr(slab_attrs[i]); | ||
5170 | |||
5171 | if (!attr || !attr->store || !attr->show) | ||
5172 | continue; | ||
5173 | |||
5174 | /* | ||
5175 | * It is really bad that we have to allocate here, so we will | ||
5176 | * do it only as a fallback. If we actually allocate, though, | ||
5177 | * we can just use the allocated buffer until the end. | ||
5178 | * | ||
5179 | * Most of the slub attributes will tend to be very small in | ||
5180 | * size, but sysfs allows buffers up to a page, so page-sized | ||
5181 | * values can theoretically happen. | ||
5182 | */ | ||
5183 | if (buffer) | ||
5184 | buf = buffer; | ||
5185 | else if (s->max_attr_size < ARRAY_SIZE(mbuf)) | ||
5186 | buf = mbuf; | ||
5187 | else { | ||
5188 | buffer = (char *) get_zeroed_page(GFP_KERNEL); | ||
5189 | if (WARN_ON(!buffer)) | ||
5190 | continue; | ||
5191 | buf = buffer; | ||
5192 | } | ||
5193 | |||
5194 | attr->show(s->memcg_params->root_cache, buf); | ||
5195 | attr->store(s, buf, strlen(buf)); | ||
5196 | } | ||
5197 | |||
5198 | if (buffer) | ||
5199 | free_page((unsigned long)buffer); | ||
5200 | #endif | ||
5201 | } | ||
5202 | |||
5103 | static const struct sysfs_ops slab_sysfs_ops = { | 5203 | static const struct sysfs_ops slab_sysfs_ops = { |
5104 | .show = slab_attr_show, | 5204 | .show = slab_attr_show, |
5105 | .store = slab_attr_store, | 5205 | .store = slab_attr_store, |
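Taken together, the two additions above mean that (to give an illustrative example) writing 1 to /sys/kernel/slab/dentry/trace on the root dentry cache is replayed, best effort, into every live per-memcg child, and a child created later starts by replaying the root's current attribute values through memcg_propagate_slab_attrs() (attr->show on the root, attr->store on the new child), provided some attribute was ever written to the root, per the max_attr_size check.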
@@ -5156,6 +5256,12 @@ static char *create_unique_id(struct kmem_cache *s) | |||
5156 | if (p != name + 1) | 5256 | if (p != name + 1) |
5157 | *p++ = '-'; | 5257 | *p++ = '-'; |
5158 | p += sprintf(p, "%07d", s->size); | 5258 | p += sprintf(p, "%07d", s->size); |
5259 | |||
5260 | #ifdef CONFIG_MEMCG_KMEM | ||
5261 | if (!is_root_cache(s)) | ||
5262 | p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); | ||
5263 | #endif | ||
5264 | |||
5159 | BUG_ON(p > name + ID_STR_LENGTH - 1); | 5265 | BUG_ON(p > name + ID_STR_LENGTH - 1); |
5160 | return name; | 5266 | return name; |
5161 | } | 5267 | } |
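With the extra "-%08d" component, the sysfs alias id for a memcg child cache also encodes the memcg_cache_id() of its owner, so an id that used to look like ":t-0000256" would become something like ":t-0000256-00000003" (digits purely illustrative); root caches keep their old ids, and the BUG_ON that follows still guards against overflowing ID_STR_LENGTH.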
diff --git a/mm/vmscan.c b/mm/vmscan.c index 7f3096137b8a..828530e2794a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
@@ -1177,7 +1177,11 @@ int isolate_lru_page(struct page *page) | |||
1177 | } | 1177 | } |
1178 | 1178 | ||
1179 | /* | 1179 | /* |
1180 | * Are there way too many processes in the direct reclaim path already? | 1180 | * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and |
1181 | * then get rescheduled. When a massive number of tasks are doing page | ||
1182 | * allocation, such sleeping direct reclaimers may keep piling up on each CPU; | ||
1183 | * the LRU list will then shrink and be scanned faster than necessary, leading to | ||
1184 | * unnecessary swapping, thrashing and OOM. | ||
1181 | */ | 1185 | */ |
1182 | static int too_many_isolated(struct zone *zone, int file, | 1186 | static int too_many_isolated(struct zone *zone, int file, |
1183 | struct scan_control *sc) | 1187 | struct scan_control *sc) |
@@ -1198,6 +1202,14 @@ static int too_many_isolated(struct zone *zone, int file, | |||
1198 | isolated = zone_page_state(zone, NR_ISOLATED_ANON); | 1202 | isolated = zone_page_state(zone, NR_ISOLATED_ANON); |
1199 | } | 1203 | } |
1200 | 1204 | ||
1205 | /* | ||
1206 | * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they | ||
1207 | * won't get blocked by normal direct-reclaimers, forming a circular | ||
1208 | * deadlock. | ||
1209 | */ | ||
1210 | if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) | ||
1211 | inactive >>= 3; | ||
1212 | |||
1201 | return isolated > inactive; | 1213 | return isolated > inactive; |
1202 | } | 1214 | } |
1203 | 1215 | ||
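As a worked example of the new throttling check: with 8000 inactive pages on the LRU, a normal direct reclaimer (both __GFP_IO and __GFP_FS set, so the GFP_IOFS test matches) is treated as "too many isolated" once more than 8000 >> 3 = 1000 pages are isolated, whereas a GFP_NOFS or GFP_NOIO reclaimer keeps the full 8000-page threshold and so is not blocked behind the already-throttled ones.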