author		Linus Torvalds <torvalds@linux-foundation.org>	2015-02-12 21:54:28 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-12 21:54:28 -0500
commit		818099574b04c5301eacbbcd441022b353a65466 (patch)
tree		77b3645b375105cb0389df2b4ea5ffa90329f7f8 /mm
parent		802ea9d8645d33d24b7b4cd4537c14f3e698bde0 (diff)
parent		6016daed58ee482a2f7684e93342e89139cf4419 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge third set of updates from Andrew Morton:

 - the rest of MM

   [ This includes getting rid of the numa hinting bits, in favor of
     just generic protnone logic.  Yay.  - Linus ]

 - core kernel

 - procfs

 - some of lib/ (lots of lib/ material this time)

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (104 commits)
  lib/lcm.c: replace include
  lib/percpu_ida.c: remove redundant includes
  lib/strncpy_from_user.c: replace module.h include
  lib/stmp_device.c: replace module.h include
  lib/sort.c: move include inside #if 0
  lib/show_mem.c: remove redundant include
  lib/radix-tree.c: change to simpler include
  lib/plist.c: remove redundant include
  lib/nlattr.c: remove redundant include
  lib/kobject_uevent.c: remove redundant include
  lib/llist.c: remove redundant include
  lib/md5.c: simplify include
  lib/list_sort.c: rearrange includes
  lib/genalloc.c: remove redundant include
  lib/idr.c: remove redundant include
  lib/halfmd4.c: simplify includes
  lib/dynamic_queue_limits.c: simplify includes
  lib/sort.c: use simpler includes
  lib/interval_tree.c: simplify includes
  hexdump: make it return number of bytes placed in buffer
  ...
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig	10
-rw-r--r--	mm/compaction.c	23
-rw-r--r--	mm/gup.c	10
-rw-r--r--	mm/huge_memory.c	50
-rw-r--r--	mm/internal.h	6
-rw-r--r--	mm/list_lru.c	467
-rw-r--r--	mm/memcontrol.c	188
-rw-r--r--	mm/memory-failure.c	13
-rw-r--r--	mm/memory.c	20
-rw-r--r--	mm/mempolicy.c	2
-rw-r--r--	mm/migrate.c	8
-rw-r--r--	mm/mm_init.c	4
-rw-r--r--	mm/mprotect.c	48
-rw-r--r--	mm/page_alloc.c	19
-rw-r--r--	mm/pgtable-generic.c	2
-rw-r--r--	mm/slab.c	17
-rw-r--r--	mm/slab.h	67
-rw-r--r--	mm/slab_common.c	197
-rw-r--r--	mm/slob.c	2
-rw-r--r--	mm/slub.c	117
-rw-r--r--	mm/vmscan.c	85
-rw-r--r--	mm/workingset.c	9
-rw-r--r--	mm/zbud.c	3
-rw-r--r--	mm/zpool.c	6
-rw-r--r--	mm/zsmalloc.c	239
-rw-r--r--	mm/zswap.c	5
26 files changed, 1217 insertions(+), 400 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4395b12869c8..de5239c152f9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -602,6 +602,16 @@ config PGTABLE_MAPPING
602 You can check speed with zsmalloc benchmark: 602 You can check speed with zsmalloc benchmark:
603 https://github.com/spartacus06/zsmapbench 603 https://github.com/spartacus06/zsmapbench
604 604
605config ZSMALLOC_STAT
606 bool "Export zsmalloc statistics"
607 depends on ZSMALLOC
608 select DEBUG_FS
609 help
610 This option enables code in the zsmalloc to collect various
611 statistics about whats happening in zsmalloc and exports that
612 information to userspace via debugfs.
613 If unsure, say N.
614
605config GENERIC_EARLY_IOREMAP 615config GENERIC_EARLY_IOREMAP
606 bool 616 bool
607 617
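For context on the ZSMALLOC_STAT help text above, the following is roughly the debugfs pattern such a statistics export follows. It is a generic, hedged sketch, not zsmalloc's actual implementation; the directory name, file name and counter are invented for illustration.

#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/seq_file.h>

/* Invented counter; the real zsmalloc statistics are richer. */
static unsigned long example_pages_allocated;

static int example_stats_show(struct seq_file *s, void *unused)
{
        seq_printf(s, "pages_allocated: %lu\n", example_pages_allocated);
        return 0;
}

static int example_stats_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_stats_show, inode->i_private);
}

static const struct file_operations example_stats_fops = {
        .open           = example_stats_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static int __init example_stats_init(void)
{
        struct dentry *dir;

        /* Hypothetical debugfs directory; not zsmalloc's real layout. */
        dir = debugfs_create_dir("zsmalloc-example", NULL);
        debugfs_create_file("stats", 0444, dir, NULL, &example_stats_fops);
        return 0;
}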
diff --git a/mm/compaction.c b/mm/compaction.c
index b68736c8a1ce..d50d6de6f1b6 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -490,6 +490,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
490 490
491 /* If a page was split, advance to the end of it */ 491 /* If a page was split, advance to the end of it */
492 if (isolated) { 492 if (isolated) {
493 cc->nr_freepages += isolated;
494 if (!strict &&
495 cc->nr_migratepages <= cc->nr_freepages) {
496 blockpfn += isolated;
497 break;
498 }
499
493 blockpfn += isolated - 1; 500 blockpfn += isolated - 1;
494 cursor += isolated - 1; 501 cursor += isolated - 1;
495 continue; 502 continue;
@@ -899,7 +906,6 @@ static void isolate_freepages(struct compact_control *cc)
899 unsigned long isolate_start_pfn; /* exact pfn we start at */ 906 unsigned long isolate_start_pfn; /* exact pfn we start at */
900 unsigned long block_end_pfn; /* end of current pageblock */ 907 unsigned long block_end_pfn; /* end of current pageblock */
901 unsigned long low_pfn; /* lowest pfn scanner is able to scan */ 908 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
902 int nr_freepages = cc->nr_freepages;
903 struct list_head *freelist = &cc->freepages; 909 struct list_head *freelist = &cc->freepages;
904 910
905 /* 911 /*
@@ -924,11 +930,11 @@ static void isolate_freepages(struct compact_control *cc)
924 * pages on cc->migratepages. We stop searching if the migrate 930 * pages on cc->migratepages. We stop searching if the migrate
925 * and free page scanners meet or enough free pages are isolated. 931 * and free page scanners meet or enough free pages are isolated.
926 */ 932 */
927 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 933 for (; block_start_pfn >= low_pfn &&
934 cc->nr_migratepages > cc->nr_freepages;
928 block_end_pfn = block_start_pfn, 935 block_end_pfn = block_start_pfn,
929 block_start_pfn -= pageblock_nr_pages, 936 block_start_pfn -= pageblock_nr_pages,
930 isolate_start_pfn = block_start_pfn) { 937 isolate_start_pfn = block_start_pfn) {
931 unsigned long isolated;
932 938
933 /* 939 /*
934 * This can iterate a massively long zone without finding any 940 * This can iterate a massively long zone without finding any
@@ -953,9 +959,8 @@ static void isolate_freepages(struct compact_control *cc)
953 continue; 959 continue;
954 960
955 /* Found a block suitable for isolating free pages from. */ 961 /* Found a block suitable for isolating free pages from. */
956 isolated = isolate_freepages_block(cc, &isolate_start_pfn, 962 isolate_freepages_block(cc, &isolate_start_pfn,
957 block_end_pfn, freelist, false); 963 block_end_pfn, freelist, false);
958 nr_freepages += isolated;
959 964
960 /* 965 /*
961 * Remember where the free scanner should restart next time, 966 * Remember where the free scanner should restart next time,
@@ -987,8 +992,6 @@ static void isolate_freepages(struct compact_control *cc)
987 */ 992 */
988 if (block_start_pfn < low_pfn) 993 if (block_start_pfn < low_pfn)
989 cc->free_pfn = cc->migrate_pfn; 994 cc->free_pfn = cc->migrate_pfn;
990
991 cc->nr_freepages = nr_freepages;
992} 995}
993 996
994/* 997/*
@@ -1100,8 +1103,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1100 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, 1103 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
1101 isolate_mode); 1104 isolate_mode);
1102 1105
1103 if (!low_pfn || cc->contended) 1106 if (!low_pfn || cc->contended) {
1107 acct_isolated(zone, cc);
1104 return ISOLATE_ABORT; 1108 return ISOLATE_ABORT;
1109 }
1105 1110
1106 /* 1111 /*
1107 * Either we isolated something and proceed with migration. Or 1112 * Either we isolated something and proceed with migration. Or
@@ -1173,7 +1178,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
1173 return COMPACT_PARTIAL; 1178 return COMPACT_PARTIAL;
1174 1179
1175 /* Job done if allocation would set block type */ 1180 /* Job done if allocation would set block type */
1176 if (cc->order >= pageblock_order && area->nr_free) 1181 if (order >= pageblock_order && area->nr_free)
1177 return COMPACT_PARTIAL; 1182 return COMPACT_PARTIAL;
1178 } 1183 }
1179 1184
diff --git a/mm/gup.c b/mm/gup.c
index c2da1163986a..51bf0b06ca7b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -64,7 +64,7 @@ retry:
64 migration_entry_wait(mm, pmd, address); 64 migration_entry_wait(mm, pmd, address);
65 goto retry; 65 goto retry;
66 } 66 }
67 if ((flags & FOLL_NUMA) && pte_numa(pte)) 67 if ((flags & FOLL_NUMA) && pte_protnone(pte))
68 goto no_page; 68 goto no_page;
69 if ((flags & FOLL_WRITE) && !pte_write(pte)) { 69 if ((flags & FOLL_WRITE) && !pte_write(pte)) {
70 pte_unmap_unlock(ptep, ptl); 70 pte_unmap_unlock(ptep, ptl);
@@ -184,7 +184,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
184 return page; 184 return page;
185 return no_page_table(vma, flags); 185 return no_page_table(vma, flags);
186 } 186 }
187 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 187 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
188 return no_page_table(vma, flags); 188 return no_page_table(vma, flags);
189 if (pmd_trans_huge(*pmd)) { 189 if (pmd_trans_huge(*pmd)) {
190 if (flags & FOLL_SPLIT) { 190 if (flags & FOLL_SPLIT) {
@@ -906,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
906 906
907 /* 907 /*
908 * Similar to the PMD case below, NUMA hinting must take slow 908 * Similar to the PMD case below, NUMA hinting must take slow
909 * path 909 * path using the pte_protnone check.
910 */ 910 */
911 if (!pte_present(pte) || pte_special(pte) || 911 if (!pte_present(pte) || pte_special(pte) ||
912 pte_numa(pte) || (write && !pte_write(pte))) 912 pte_protnone(pte) || (write && !pte_write(pte)))
913 goto pte_unmap; 913 goto pte_unmap;
914 914
915 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 915 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1104,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
1104 * slowpath for accounting purposes and so that they 1104 * slowpath for accounting purposes and so that they
1105 * can be serialised against THP migration. 1105 * can be serialised against THP migration.
1106 */ 1106 */
1107 if (pmd_numa(pmd)) 1107 if (pmd_protnone(pmd))
1108 return 0; 1108 return 0;
1109 1109
1110 if (!gup_huge_pmd(pmd, pmdp, addr, next, write, 1110 if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
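The common thread in these gup.c hunks is that NUMA hinting faults are now detected with the generic pte_protnone()/pmd_protnone() helpers instead of the old pte_numa()/pmd_numa() bits. A minimal stand-alone illustration of that convention follows; the helper below is invented for this example and is not kernel code.

#include <linux/mm.h>

/*
 * Invented helper: decide whether a GUP-style walker must fall back to
 * the slow path so the NUMA hinting fault can be handled there.  After
 * this series a hinting-protected entry is simply a PROT_NONE-style
 * entry, so pte_protnone() is the whole test.
 */
static bool example_needs_numa_slowpath(pte_t pte, unsigned int gup_flags)
{
        if (!pte_present(pte))
                return false;
        return (gup_flags & FOLL_NUMA) && pte_protnone(pte);
}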
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cb7be110cad3..fc00c8cb5a82 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1211,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1211 return ERR_PTR(-EFAULT); 1211 return ERR_PTR(-EFAULT);
1212 1212
1213 /* Full NUMA hinting faults to serialise migration in fault paths */ 1213 /* Full NUMA hinting faults to serialise migration in fault paths */
1214 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 1214 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
1215 goto out; 1215 goto out;
1216 1216
1217 page = pmd_page(*pmd); 1217 page = pmd_page(*pmd);
@@ -1262,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1262 bool migrated = false; 1262 bool migrated = false;
1263 int flags = 0; 1263 int flags = 0;
1264 1264
1265 /* A PROT_NONE fault should not end up here */
1266 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
1267
1265 ptl = pmd_lock(mm, pmdp); 1268 ptl = pmd_lock(mm, pmdp);
1266 if (unlikely(!pmd_same(pmd, *pmdp))) 1269 if (unlikely(!pmd_same(pmd, *pmdp)))
1267 goto out_unlock; 1270 goto out_unlock;
@@ -1272,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1272 * check_same as the page may no longer be mapped. 1275 * check_same as the page may no longer be mapped.
1273 */ 1276 */
1274 if (unlikely(pmd_trans_migrating(*pmdp))) { 1277 if (unlikely(pmd_trans_migrating(*pmdp))) {
1278 page = pmd_page(*pmdp);
1275 spin_unlock(ptl); 1279 spin_unlock(ptl);
1276 wait_migrate_huge_page(vma->anon_vma, pmdp); 1280 wait_on_page_locked(page);
1277 goto out; 1281 goto out;
1278 } 1282 }
1279 1283
@@ -1341,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1341 1345
1342 /* 1346 /*
1343 * Migrate the THP to the requested node, returns with page unlocked 1347 * Migrate the THP to the requested node, returns with page unlocked
1344 * and pmd_numa cleared. 1348 * and access rights restored.
1345 */ 1349 */
1346 spin_unlock(ptl); 1350 spin_unlock(ptl);
1347 migrated = migrate_misplaced_transhuge_page(mm, vma, 1351 migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1354,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1354 goto out; 1358 goto out;
1355clear_pmdnuma: 1359clear_pmdnuma:
1356 BUG_ON(!PageLocked(page)); 1360 BUG_ON(!PageLocked(page));
1357 pmd = pmd_mknonnuma(pmd); 1361 pmd = pmd_modify(pmd, vma->vm_page_prot);
1358 set_pmd_at(mm, haddr, pmdp, pmd); 1362 set_pmd_at(mm, haddr, pmdp, pmd);
1359 VM_BUG_ON(pmd_numa(*pmdp));
1360 update_mmu_cache_pmd(vma, addr, pmdp); 1363 update_mmu_cache_pmd(vma, addr, pmdp);
1361 unlock_page(page); 1364 unlock_page(page);
1362out_unlock: 1365out_unlock:
@@ -1479,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1479 1482
1480 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1483 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1481 pmd_t entry; 1484 pmd_t entry;
1482 ret = 1; 1485
1483 if (!prot_numa) { 1486 /*
1487 * Avoid trapping faults against the zero page. The read-only
1488 * data is likely to be read-cached on the local CPU and
1489 * local/remote hits to the zero page are not interesting.
1490 */
1491 if (prot_numa && is_huge_zero_pmd(*pmd)) {
1492 spin_unlock(ptl);
1493 return 0;
1494 }
1495
1496 if (!prot_numa || !pmd_protnone(*pmd)) {
1497 ret = 1;
1484 entry = pmdp_get_and_clear_notify(mm, addr, pmd); 1498 entry = pmdp_get_and_clear_notify(mm, addr, pmd);
1485 if (pmd_numa(entry))
1486 entry = pmd_mknonnuma(entry);
1487 entry = pmd_modify(entry, newprot); 1499 entry = pmd_modify(entry, newprot);
1488 ret = HPAGE_PMD_NR; 1500 ret = HPAGE_PMD_NR;
1489 set_pmd_at(mm, addr, pmd, entry); 1501 set_pmd_at(mm, addr, pmd, entry);
1490 BUG_ON(pmd_write(entry)); 1502 BUG_ON(pmd_write(entry));
1491 } else {
1492 struct page *page = pmd_page(*pmd);
1493
1494 /*
1495 * Do not trap faults against the zero page. The
1496 * read-only data is likely to be read-cached on the
1497 * local CPU cache and it is less useful to know about
1498 * local vs remote hits on the zero page.
1499 */
1500 if (!is_huge_zero_page(page) &&
1501 !pmd_numa(*pmd)) {
1502 pmdp_set_numa(mm, addr, pmd);
1503 ret = HPAGE_PMD_NR;
1504 }
1505 } 1503 }
1506 spin_unlock(ptl); 1504 spin_unlock(ptl);
1507 } 1505 }
@@ -1766,9 +1764,9 @@ static int __split_huge_page_map(struct page *page,
1766 pte_t *pte, entry; 1764 pte_t *pte, entry;
1767 BUG_ON(PageCompound(page+i)); 1765 BUG_ON(PageCompound(page+i));
1768 /* 1766 /*
1769 * Note that pmd_numa is not transferred deliberately 1767 * Note that NUMA hinting access restrictions are not
1770 * to avoid any possibility that pte_numa leaks to 1768 * transferred to avoid any possibility of altering
1771 * a PROT_NONE VMA by accident. 1769 * permissions across VMAs.
1772 */ 1770 */
1773 entry = mk_pte(page + i, vma->vm_page_prot); 1771 entry = mk_pte(page + i, vma->vm_page_prot);
1774 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1772 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
diff --git a/mm/internal.h b/mm/internal.h
index c4d6c9b43491..a96da5b0029d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -351,8 +351,10 @@ extern int mminit_loglevel;
351#define mminit_dprintk(level, prefix, fmt, arg...) \ 351#define mminit_dprintk(level, prefix, fmt, arg...) \
352do { \ 352do { \
353 if (level < mminit_loglevel) { \ 353 if (level < mminit_loglevel) { \
354 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ 354 if (level <= MMINIT_WARNING) \
355 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ 355 printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
356 else \
357 printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
356 } \ 358 } \
357} while (0) 359} while (0)
358 360
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f1a0db194173..909eca2c820e 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -9,18 +9,100 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/list_lru.h> 10#include <linux/list_lru.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/mutex.h>
13#include <linux/memcontrol.h>
14
15#ifdef CONFIG_MEMCG_KMEM
16static LIST_HEAD(list_lrus);
17static DEFINE_MUTEX(list_lrus_mutex);
18
19static void list_lru_register(struct list_lru *lru)
20{
21 mutex_lock(&list_lrus_mutex);
22 list_add(&lru->list, &list_lrus);
23 mutex_unlock(&list_lrus_mutex);
24}
25
26static void list_lru_unregister(struct list_lru *lru)
27{
28 mutex_lock(&list_lrus_mutex);
29 list_del(&lru->list);
30 mutex_unlock(&list_lrus_mutex);
31}
32#else
33static void list_lru_register(struct list_lru *lru)
34{
35}
36
37static void list_lru_unregister(struct list_lru *lru)
38{
39}
40#endif /* CONFIG_MEMCG_KMEM */
41
42#ifdef CONFIG_MEMCG_KMEM
43static inline bool list_lru_memcg_aware(struct list_lru *lru)
44{
45 return !!lru->node[0].memcg_lrus;
46}
47
48static inline struct list_lru_one *
49list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
50{
51 /*
52 * The lock protects the array of per cgroup lists from relocation
53 * (see memcg_update_list_lru_node).
54 */
55 lockdep_assert_held(&nlru->lock);
56 if (nlru->memcg_lrus && idx >= 0)
57 return nlru->memcg_lrus->lru[idx];
58
59 return &nlru->lru;
60}
61
62static inline struct list_lru_one *
63list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
64{
65 struct mem_cgroup *memcg;
66
67 if (!nlru->memcg_lrus)
68 return &nlru->lru;
69
70 memcg = mem_cgroup_from_kmem(ptr);
71 if (!memcg)
72 return &nlru->lru;
73
74 return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
75}
76#else
77static inline bool list_lru_memcg_aware(struct list_lru *lru)
78{
79 return false;
80}
81
82static inline struct list_lru_one *
83list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
84{
85 return &nlru->lru;
86}
87
88static inline struct list_lru_one *
89list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
90{
91 return &nlru->lru;
92}
93#endif /* CONFIG_MEMCG_KMEM */
12 94
13bool list_lru_add(struct list_lru *lru, struct list_head *item) 95bool list_lru_add(struct list_lru *lru, struct list_head *item)
14{ 96{
15 int nid = page_to_nid(virt_to_page(item)); 97 int nid = page_to_nid(virt_to_page(item));
16 struct list_lru_node *nlru = &lru->node[nid]; 98 struct list_lru_node *nlru = &lru->node[nid];
99 struct list_lru_one *l;
17 100
18 spin_lock(&nlru->lock); 101 spin_lock(&nlru->lock);
19 WARN_ON_ONCE(nlru->nr_items < 0); 102 l = list_lru_from_kmem(nlru, item);
20 if (list_empty(item)) { 103 if (list_empty(item)) {
21 list_add_tail(item, &nlru->list); 104 list_add_tail(item, &l->list);
22 if (nlru->nr_items++ == 0) 105 l->nr_items++;
23 node_set(nid, lru->active_nodes);
24 spin_unlock(&nlru->lock); 106 spin_unlock(&nlru->lock);
25 return true; 107 return true;
26 } 108 }
@@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
33{ 115{
34 int nid = page_to_nid(virt_to_page(item)); 116 int nid = page_to_nid(virt_to_page(item));
35 struct list_lru_node *nlru = &lru->node[nid]; 117 struct list_lru_node *nlru = &lru->node[nid];
118 struct list_lru_one *l;
36 119
37 spin_lock(&nlru->lock); 120 spin_lock(&nlru->lock);
121 l = list_lru_from_kmem(nlru, item);
38 if (!list_empty(item)) { 122 if (!list_empty(item)) {
39 list_del_init(item); 123 list_del_init(item);
40 if (--nlru->nr_items == 0) 124 l->nr_items--;
41 node_clear(nid, lru->active_nodes);
42 WARN_ON_ONCE(nlru->nr_items < 0);
43 spin_unlock(&nlru->lock); 125 spin_unlock(&nlru->lock);
44 return true; 126 return true;
45 } 127 }
@@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
48} 130}
49EXPORT_SYMBOL_GPL(list_lru_del); 131EXPORT_SYMBOL_GPL(list_lru_del);
50 132
51unsigned long 133void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
52list_lru_count_node(struct list_lru *lru, int nid) 134{
135 list_del_init(item);
136 list->nr_items--;
137}
138EXPORT_SYMBOL_GPL(list_lru_isolate);
139
140void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
141 struct list_head *head)
142{
143 list_move(item, head);
144 list->nr_items--;
145}
146EXPORT_SYMBOL_GPL(list_lru_isolate_move);
147
148static unsigned long __list_lru_count_one(struct list_lru *lru,
149 int nid, int memcg_idx)
53{ 150{
54 unsigned long count = 0;
55 struct list_lru_node *nlru = &lru->node[nid]; 151 struct list_lru_node *nlru = &lru->node[nid];
152 struct list_lru_one *l;
153 unsigned long count;
56 154
57 spin_lock(&nlru->lock); 155 spin_lock(&nlru->lock);
58 WARN_ON_ONCE(nlru->nr_items < 0); 156 l = list_lru_from_memcg_idx(nlru, memcg_idx);
59 count += nlru->nr_items; 157 count = l->nr_items;
60 spin_unlock(&nlru->lock); 158 spin_unlock(&nlru->lock);
61 159
62 return count; 160 return count;
63} 161}
162
163unsigned long list_lru_count_one(struct list_lru *lru,
164 int nid, struct mem_cgroup *memcg)
165{
166 return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
167}
168EXPORT_SYMBOL_GPL(list_lru_count_one);
169
170unsigned long list_lru_count_node(struct list_lru *lru, int nid)
171{
172 long count = 0;
173 int memcg_idx;
174
175 count += __list_lru_count_one(lru, nid, -1);
176 if (list_lru_memcg_aware(lru)) {
177 for_each_memcg_cache_index(memcg_idx)
178 count += __list_lru_count_one(lru, nid, memcg_idx);
179 }
180 return count;
181}
64EXPORT_SYMBOL_GPL(list_lru_count_node); 182EXPORT_SYMBOL_GPL(list_lru_count_node);
65 183
66unsigned long 184static unsigned long
67list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, 185__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
68 void *cb_arg, unsigned long *nr_to_walk) 186 list_lru_walk_cb isolate, void *cb_arg,
187 unsigned long *nr_to_walk)
69{ 188{
70 189
71 struct list_lru_node *nlru = &lru->node[nid]; 190 struct list_lru_node *nlru = &lru->node[nid];
191 struct list_lru_one *l;
72 struct list_head *item, *n; 192 struct list_head *item, *n;
73 unsigned long isolated = 0; 193 unsigned long isolated = 0;
74 194
75 spin_lock(&nlru->lock); 195 spin_lock(&nlru->lock);
196 l = list_lru_from_memcg_idx(nlru, memcg_idx);
76restart: 197restart:
77 list_for_each_safe(item, n, &nlru->list) { 198 list_for_each_safe(item, n, &l->list) {
78 enum lru_status ret; 199 enum lru_status ret;
79 200
80 /* 201 /*
@@ -85,14 +206,11 @@ restart:
85 break; 206 break;
86 --*nr_to_walk; 207 --*nr_to_walk;
87 208
88 ret = isolate(item, &nlru->lock, cb_arg); 209 ret = isolate(item, l, &nlru->lock, cb_arg);
89 switch (ret) { 210 switch (ret) {
90 case LRU_REMOVED_RETRY: 211 case LRU_REMOVED_RETRY:
91 assert_spin_locked(&nlru->lock); 212 assert_spin_locked(&nlru->lock);
92 case LRU_REMOVED: 213 case LRU_REMOVED:
93 if (--nlru->nr_items == 0)
94 node_clear(nid, lru->active_nodes);
95 WARN_ON_ONCE(nlru->nr_items < 0);
96 isolated++; 214 isolated++;
97 /* 215 /*
98 * If the lru lock has been dropped, our list 216 * If the lru lock has been dropped, our list
@@ -103,7 +221,7 @@ restart:
103 goto restart; 221 goto restart;
104 break; 222 break;
105 case LRU_ROTATE: 223 case LRU_ROTATE:
106 list_move_tail(item, &nlru->list); 224 list_move_tail(item, &l->list);
107 break; 225 break;
108 case LRU_SKIP: 226 case LRU_SKIP:
109 break; 227 break;
@@ -122,31 +240,322 @@ restart:
122 spin_unlock(&nlru->lock); 240 spin_unlock(&nlru->lock);
123 return isolated; 241 return isolated;
124} 242}
243
244unsigned long
245list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
246 list_lru_walk_cb isolate, void *cb_arg,
247 unsigned long *nr_to_walk)
248{
249 return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
250 isolate, cb_arg, nr_to_walk);
251}
252EXPORT_SYMBOL_GPL(list_lru_walk_one);
253
254unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
255 list_lru_walk_cb isolate, void *cb_arg,
256 unsigned long *nr_to_walk)
257{
258 long isolated = 0;
259 int memcg_idx;
260
261 isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
262 nr_to_walk);
263 if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
264 for_each_memcg_cache_index(memcg_idx) {
265 isolated += __list_lru_walk_one(lru, nid, memcg_idx,
266 isolate, cb_arg, nr_to_walk);
267 if (*nr_to_walk <= 0)
268 break;
269 }
270 }
271 return isolated;
272}
125EXPORT_SYMBOL_GPL(list_lru_walk_node); 273EXPORT_SYMBOL_GPL(list_lru_walk_node);
126 274
127int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) 275static void init_one_lru(struct list_lru_one *l)
276{
277 INIT_LIST_HEAD(&l->list);
278 l->nr_items = 0;
279}
280
281#ifdef CONFIG_MEMCG_KMEM
282static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
283 int begin, int end)
284{
285 int i;
286
287 for (i = begin; i < end; i++)
288 kfree(memcg_lrus->lru[i]);
289}
290
291static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
292 int begin, int end)
293{
294 int i;
295
296 for (i = begin; i < end; i++) {
297 struct list_lru_one *l;
298
299 l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
300 if (!l)
301 goto fail;
302
303 init_one_lru(l);
304 memcg_lrus->lru[i] = l;
305 }
306 return 0;
307fail:
308 __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1);
309 return -ENOMEM;
310}
311
312static int memcg_init_list_lru_node(struct list_lru_node *nlru)
313{
314 int size = memcg_nr_cache_ids;
315
316 nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
317 if (!nlru->memcg_lrus)
318 return -ENOMEM;
319
320 if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
321 kfree(nlru->memcg_lrus);
322 return -ENOMEM;
323 }
324
325 return 0;
326}
327
328static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
329{
330 __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
331 kfree(nlru->memcg_lrus);
332}
333
334static int memcg_update_list_lru_node(struct list_lru_node *nlru,
335 int old_size, int new_size)
336{
337 struct list_lru_memcg *old, *new;
338
339 BUG_ON(old_size > new_size);
340
341 old = nlru->memcg_lrus;
342 new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
343 if (!new)
344 return -ENOMEM;
345
346 if (__memcg_init_list_lru_node(new, old_size, new_size)) {
347 kfree(new);
348 return -ENOMEM;
349 }
350
351 memcpy(new, old, old_size * sizeof(void *));
352
353 /*
354 * The lock guarantees that we won't race with a reader
355 * (see list_lru_from_memcg_idx).
356 *
357 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
358 * we have to use IRQ-safe primitives here to avoid deadlock.
359 */
360 spin_lock_irq(&nlru->lock);
361 nlru->memcg_lrus = new;
362 spin_unlock_irq(&nlru->lock);
363
364 kfree(old);
365 return 0;
366}
367
368static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
369 int old_size, int new_size)
370{
371 /* do not bother shrinking the array back to the old size, because we
372 * cannot handle allocation failures here */
373 __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
374}
375
376static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
377{
378 int i;
379
380 for (i = 0; i < nr_node_ids; i++) {
381 if (!memcg_aware)
382 lru->node[i].memcg_lrus = NULL;
383 else if (memcg_init_list_lru_node(&lru->node[i]))
384 goto fail;
385 }
386 return 0;
387fail:
388 for (i = i - 1; i >= 0; i--)
389 memcg_destroy_list_lru_node(&lru->node[i]);
390 return -ENOMEM;
391}
392
393static void memcg_destroy_list_lru(struct list_lru *lru)
394{
395 int i;
396
397 if (!list_lru_memcg_aware(lru))
398 return;
399
400 for (i = 0; i < nr_node_ids; i++)
401 memcg_destroy_list_lru_node(&lru->node[i]);
402}
403
404static int memcg_update_list_lru(struct list_lru *lru,
405 int old_size, int new_size)
406{
407 int i;
408
409 if (!list_lru_memcg_aware(lru))
410 return 0;
411
412 for (i = 0; i < nr_node_ids; i++) {
413 if (memcg_update_list_lru_node(&lru->node[i],
414 old_size, new_size))
415 goto fail;
416 }
417 return 0;
418fail:
419 for (i = i - 1; i >= 0; i--)
420 memcg_cancel_update_list_lru_node(&lru->node[i],
421 old_size, new_size);
422 return -ENOMEM;
423}
424
425static void memcg_cancel_update_list_lru(struct list_lru *lru,
426 int old_size, int new_size)
427{
428 int i;
429
430 if (!list_lru_memcg_aware(lru))
431 return;
432
433 for (i = 0; i < nr_node_ids; i++)
434 memcg_cancel_update_list_lru_node(&lru->node[i],
435 old_size, new_size);
436}
437
438int memcg_update_all_list_lrus(int new_size)
439{
440 int ret = 0;
441 struct list_lru *lru;
442 int old_size = memcg_nr_cache_ids;
443
444 mutex_lock(&list_lrus_mutex);
445 list_for_each_entry(lru, &list_lrus, list) {
446 ret = memcg_update_list_lru(lru, old_size, new_size);
447 if (ret)
448 goto fail;
449 }
450out:
451 mutex_unlock(&list_lrus_mutex);
452 return ret;
453fail:
454 list_for_each_entry_continue_reverse(lru, &list_lrus, list)
455 memcg_cancel_update_list_lru(lru, old_size, new_size);
456 goto out;
457}
458
459static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
460 int src_idx, int dst_idx)
461{
462 struct list_lru_one *src, *dst;
463
464 /*
465 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
466 * we have to use IRQ-safe primitives here to avoid deadlock.
467 */
468 spin_lock_irq(&nlru->lock);
469
470 src = list_lru_from_memcg_idx(nlru, src_idx);
471 dst = list_lru_from_memcg_idx(nlru, dst_idx);
472
473 list_splice_init(&src->list, &dst->list);
474 dst->nr_items += src->nr_items;
475 src->nr_items = 0;
476
477 spin_unlock_irq(&nlru->lock);
478}
479
480static void memcg_drain_list_lru(struct list_lru *lru,
481 int src_idx, int dst_idx)
482{
483 int i;
484
485 if (!list_lru_memcg_aware(lru))
486 return;
487
488 for (i = 0; i < nr_node_ids; i++)
489 memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
490}
491
492void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
493{
494 struct list_lru *lru;
495
496 mutex_lock(&list_lrus_mutex);
497 list_for_each_entry(lru, &list_lrus, list)
498 memcg_drain_list_lru(lru, src_idx, dst_idx);
499 mutex_unlock(&list_lrus_mutex);
500}
501#else
502static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
503{
504 return 0;
505}
506
507static void memcg_destroy_list_lru(struct list_lru *lru)
508{
509}
510#endif /* CONFIG_MEMCG_KMEM */
511
512int __list_lru_init(struct list_lru *lru, bool memcg_aware,
513 struct lock_class_key *key)
128{ 514{
129 int i; 515 int i;
130 size_t size = sizeof(*lru->node) * nr_node_ids; 516 size_t size = sizeof(*lru->node) * nr_node_ids;
517 int err = -ENOMEM;
518
519 memcg_get_cache_ids();
131 520
132 lru->node = kzalloc(size, GFP_KERNEL); 521 lru->node = kzalloc(size, GFP_KERNEL);
133 if (!lru->node) 522 if (!lru->node)
134 return -ENOMEM; 523 goto out;
135 524
136 nodes_clear(lru->active_nodes);
137 for (i = 0; i < nr_node_ids; i++) { 525 for (i = 0; i < nr_node_ids; i++) {
138 spin_lock_init(&lru->node[i].lock); 526 spin_lock_init(&lru->node[i].lock);
139 if (key) 527 if (key)
140 lockdep_set_class(&lru->node[i].lock, key); 528 lockdep_set_class(&lru->node[i].lock, key);
141 INIT_LIST_HEAD(&lru->node[i].list); 529 init_one_lru(&lru->node[i].lru);
142 lru->node[i].nr_items = 0;
143 } 530 }
144 return 0; 531
532 err = memcg_init_list_lru(lru, memcg_aware);
533 if (err) {
534 kfree(lru->node);
535 goto out;
536 }
537
538 list_lru_register(lru);
539out:
540 memcg_put_cache_ids();
541 return err;
145} 542}
146EXPORT_SYMBOL_GPL(list_lru_init_key); 543EXPORT_SYMBOL_GPL(__list_lru_init);
147 544
148void list_lru_destroy(struct list_lru *lru) 545void list_lru_destroy(struct list_lru *lru)
149{ 546{
547 /* Already destroyed or not yet initialized? */
548 if (!lru->node)
549 return;
550
551 memcg_get_cache_ids();
552
553 list_lru_unregister(lru);
554
555 memcg_destroy_list_lru(lru);
150 kfree(lru->node); 556 kfree(lru->node);
557 lru->node = NULL;
558
559 memcg_put_cache_ids();
151} 560}
152EXPORT_SYMBOL_GPL(list_lru_destroy); 561EXPORT_SYMBOL_GPL(list_lru_destroy);
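The list_lru rework above changes the walk API: isolate callbacks now receive the struct list_lru_one they are walking and use list_lru_isolate()/list_lru_isolate_move() so the per-list nr_items count stays coherent, and per-memcg walking goes through list_lru_walk_one(). The sketch below shows a hypothetical caller of that interface; the object type, callback and scan function are invented for illustration.

#include <linux/list_lru.h>

/* Invented object type kept on a memcg-aware list_lru. */
struct example_obj {
        struct list_head        lru;
};

static enum lru_status example_isolate(struct list_head *item,
                                       struct list_lru_one *list,
                                       spinlock_t *lock, void *arg)
{
        struct example_obj *obj = container_of(item, struct example_obj, lru);
        struct list_head *dispose = arg;

        /*
         * The callback now gets the per-node/per-memcg list so that
         * list_lru_isolate_move() can decrement its nr_items counter.
         */
        list_lru_isolate_move(list, &obj->lru, dispose);
        return LRU_REMOVED;
}

static unsigned long example_scan(struct list_lru *lru, int nid,
                                  struct mem_cgroup *memcg,
                                  unsigned long nr_to_walk)
{
        LIST_HEAD(dispose);
        unsigned long freed;

        /* Walk only the list that belongs to @memcg on node @nid. */
        freed = list_lru_walk_one(lru, nid, memcg, example_isolate,
                                  &dispose, &nr_to_walk);

        /* Free everything on @dispose here, outside the lru lock. */
        return freed;
}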
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 095c1f96fbec..d18d3a6e7337 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -332,8 +332,10 @@ struct mem_cgroup {
332 struct cg_proto tcp_mem; 332 struct cg_proto tcp_mem;
333#endif 333#endif
334#if defined(CONFIG_MEMCG_KMEM) 334#if defined(CONFIG_MEMCG_KMEM)
335 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 335 /* Index in the kmem_cache->memcg_params.memcg_caches array */
336 int kmemcg_id; 336 int kmemcg_id;
337 bool kmem_acct_activated;
338 bool kmem_acct_active;
337#endif 339#endif
338 340
339 int last_scanned_node; 341 int last_scanned_node;
@@ -352,9 +354,9 @@ struct mem_cgroup {
352}; 354};
353 355
354#ifdef CONFIG_MEMCG_KMEM 356#ifdef CONFIG_MEMCG_KMEM
355static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 357bool memcg_kmem_is_active(struct mem_cgroup *memcg)
356{ 358{
357 return memcg->kmemcg_id >= 0; 359 return memcg->kmem_acct_active;
358} 360}
359#endif 361#endif
360 362
@@ -517,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
517} 519}
518EXPORT_SYMBOL(tcp_proto_cgroup); 520EXPORT_SYMBOL(tcp_proto_cgroup);
519 521
520static void disarm_sock_keys(struct mem_cgroup *memcg)
521{
522 if (!memcg_proto_activated(&memcg->tcp_mem))
523 return;
524 static_key_slow_dec(&memcg_socket_limit_enabled);
525}
526#else
527static void disarm_sock_keys(struct mem_cgroup *memcg)
528{
529}
530#endif 522#endif
531 523
532#ifdef CONFIG_MEMCG_KMEM 524#ifdef CONFIG_MEMCG_KMEM
533/* 525/*
534 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 526 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
535 * The main reason for not using cgroup id for this: 527 * The main reason for not using cgroup id for this:
536 * this works better in sparse environments, where we have a lot of memcgs, 528 * this works better in sparse environments, where we have a lot of memcgs,
537 * but only a few kmem-limited. Or also, if we have, for instance, 200 529 * but only a few kmem-limited. Or also, if we have, for instance, 200
538 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 530 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
539 * 200 entry array for that. 531 * 200 entry array for that.
540 * 532 *
541 * The current size of the caches array is stored in 533 * The current size of the caches array is stored in memcg_nr_cache_ids. It
542 * memcg_limited_groups_array_size. It will double each time we have to 534 * will double each time we have to increase it.
543 * increase it.
544 */ 535 */
545static DEFINE_IDA(kmem_limited_groups); 536static DEFINE_IDA(memcg_cache_ida);
546int memcg_limited_groups_array_size; 537int memcg_nr_cache_ids;
538
539/* Protects memcg_nr_cache_ids */
540static DECLARE_RWSEM(memcg_cache_ids_sem);
541
542void memcg_get_cache_ids(void)
543{
544 down_read(&memcg_cache_ids_sem);
545}
546
547void memcg_put_cache_ids(void)
548{
549 up_read(&memcg_cache_ids_sem);
550}
547 551
548/* 552/*
549 * MIN_SIZE is different than 1, because we would like to avoid going through 553 * MIN_SIZE is different than 1, because we would like to avoid going through
@@ -569,32 +573,8 @@ int memcg_limited_groups_array_size;
569struct static_key memcg_kmem_enabled_key; 573struct static_key memcg_kmem_enabled_key;
570EXPORT_SYMBOL(memcg_kmem_enabled_key); 574EXPORT_SYMBOL(memcg_kmem_enabled_key);
571 575
572static void memcg_free_cache_id(int id);
573
574static void disarm_kmem_keys(struct mem_cgroup *memcg)
575{
576 if (memcg_kmem_is_active(memcg)) {
577 static_key_slow_dec(&memcg_kmem_enabled_key);
578 memcg_free_cache_id(memcg->kmemcg_id);
579 }
580 /*
581 * This check can't live in kmem destruction function,
582 * since the charges will outlive the cgroup
583 */
584 WARN_ON(page_counter_read(&memcg->kmem));
585}
586#else
587static void disarm_kmem_keys(struct mem_cgroup *memcg)
588{
589}
590#endif /* CONFIG_MEMCG_KMEM */ 576#endif /* CONFIG_MEMCG_KMEM */
591 577
592static void disarm_static_keys(struct mem_cgroup *memcg)
593{
594 disarm_sock_keys(memcg);
595 disarm_kmem_keys(memcg);
596}
597
598static struct mem_cgroup_per_zone * 578static struct mem_cgroup_per_zone *
599mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 579mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
600{ 580{
@@ -2538,18 +2518,19 @@ static int memcg_alloc_cache_id(void)
2538 int id, size; 2518 int id, size;
2539 int err; 2519 int err;
2540 2520
2541 id = ida_simple_get(&kmem_limited_groups, 2521 id = ida_simple_get(&memcg_cache_ida,
2542 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2522 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2543 if (id < 0) 2523 if (id < 0)
2544 return id; 2524 return id;
2545 2525
2546 if (id < memcg_limited_groups_array_size) 2526 if (id < memcg_nr_cache_ids)
2547 return id; 2527 return id;
2548 2528
2549 /* 2529 /*
2550 * There's no space for the new id in memcg_caches arrays, 2530 * There's no space for the new id in memcg_caches arrays,
2551 * so we have to grow them. 2531 * so we have to grow them.
2552 */ 2532 */
2533 down_write(&memcg_cache_ids_sem);
2553 2534
2554 size = 2 * (id + 1); 2535 size = 2 * (id + 1);
2555 if (size < MEMCG_CACHES_MIN_SIZE) 2536 if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2558,8 +2539,15 @@ static int memcg_alloc_cache_id(void)
2558 size = MEMCG_CACHES_MAX_SIZE; 2539 size = MEMCG_CACHES_MAX_SIZE;
2559 2540
2560 err = memcg_update_all_caches(size); 2541 err = memcg_update_all_caches(size);
2542 if (!err)
2543 err = memcg_update_all_list_lrus(size);
2544 if (!err)
2545 memcg_nr_cache_ids = size;
2546
2547 up_write(&memcg_cache_ids_sem);
2548
2561 if (err) { 2549 if (err) {
2562 ida_simple_remove(&kmem_limited_groups, id); 2550 ida_simple_remove(&memcg_cache_ida, id);
2563 return err; 2551 return err;
2564 } 2552 }
2565 return id; 2553 return id;
@@ -2567,17 +2555,7 @@ static int memcg_alloc_cache_id(void)
2567 2555
2568static void memcg_free_cache_id(int id) 2556static void memcg_free_cache_id(int id)
2569{ 2557{
2570 ida_simple_remove(&kmem_limited_groups, id); 2558 ida_simple_remove(&memcg_cache_ida, id);
2571}
2572
2573/*
2574 * We should update the current array size iff all caches updates succeed. This
2575 * can only be done from the slab side. The slab mutex needs to be held when
2576 * calling this.
2577 */
2578void memcg_update_array_size(int num)
2579{
2580 memcg_limited_groups_array_size = num;
2581} 2559}
2582 2560
2583struct memcg_kmem_cache_create_work { 2561struct memcg_kmem_cache_create_work {
@@ -2656,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2656{ 2634{
2657 struct mem_cgroup *memcg; 2635 struct mem_cgroup *memcg;
2658 struct kmem_cache *memcg_cachep; 2636 struct kmem_cache *memcg_cachep;
2637 int kmemcg_id;
2659 2638
2660 VM_BUG_ON(!cachep->memcg_params); 2639 VM_BUG_ON(!is_root_cache(cachep));
2661 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
2662 2640
2663 if (current->memcg_kmem_skip_account) 2641 if (current->memcg_kmem_skip_account)
2664 return cachep; 2642 return cachep;
2665 2643
2666 memcg = get_mem_cgroup_from_mm(current->mm); 2644 memcg = get_mem_cgroup_from_mm(current->mm);
2667 if (!memcg_kmem_is_active(memcg)) 2645 kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
2646 if (kmemcg_id < 0)
2668 goto out; 2647 goto out;
2669 2648
2670 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 2649 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2671 if (likely(memcg_cachep)) 2650 if (likely(memcg_cachep))
2672 return memcg_cachep; 2651 return memcg_cachep;
2673 2652
@@ -2692,7 +2671,7 @@ out:
2692void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2671void __memcg_kmem_put_cache(struct kmem_cache *cachep)
2693{ 2672{
2694 if (!is_root_cache(cachep)) 2673 if (!is_root_cache(cachep))
2695 css_put(&cachep->memcg_params->memcg->css); 2674 css_put(&cachep->memcg_params.memcg->css);
2696} 2675}
2697 2676
2698/* 2677/*
@@ -2757,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
2757 memcg_uncharge_kmem(memcg, 1 << order); 2736 memcg_uncharge_kmem(memcg, 1 << order);
2758 page->mem_cgroup = NULL; 2737 page->mem_cgroup = NULL;
2759} 2738}
2739
2740struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
2741{
2742 struct mem_cgroup *memcg = NULL;
2743 struct kmem_cache *cachep;
2744 struct page *page;
2745
2746 page = virt_to_head_page(ptr);
2747 if (PageSlab(page)) {
2748 cachep = page->slab_cache;
2749 if (!is_root_cache(cachep))
2750 memcg = cachep->memcg_params.memcg;
2751 } else
2752 /* page allocated by alloc_kmem_pages */
2753 memcg = page->mem_cgroup;
2754
2755 return memcg;
2756}
2760#endif /* CONFIG_MEMCG_KMEM */ 2757#endif /* CONFIG_MEMCG_KMEM */
2761 2758
2762#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2759#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3291,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3291 int err = 0; 3288 int err = 0;
3292 int memcg_id; 3289 int memcg_id;
3293 3290
3294 if (memcg_kmem_is_active(memcg)) 3291 BUG_ON(memcg->kmemcg_id >= 0);
3295 return 0; 3292 BUG_ON(memcg->kmem_acct_activated);
3293 BUG_ON(memcg->kmem_acct_active);
3296 3294
3297 /* 3295 /*
3298 * For simplicity, we won't allow this to be disabled. It also can't 3296 * For simplicity, we won't allow this to be disabled. It also can't
@@ -3335,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3335 * patched. 3333 * patched.
3336 */ 3334 */
3337 memcg->kmemcg_id = memcg_id; 3335 memcg->kmemcg_id = memcg_id;
3336 memcg->kmem_acct_activated = true;
3337 memcg->kmem_acct_active = true;
3338out: 3338out:
3339 return err; 3339 return err;
3340} 3340}
@@ -4014,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4014 return mem_cgroup_sockets_init(memcg, ss); 4014 return mem_cgroup_sockets_init(memcg, ss);
4015} 4015}
4016 4016
4017static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
4018{
4019 struct cgroup_subsys_state *css;
4020 struct mem_cgroup *parent, *child;
4021 int kmemcg_id;
4022
4023 if (!memcg->kmem_acct_active)
4024 return;
4025
4026 /*
4027 * Clear the 'active' flag before clearing memcg_caches arrays entries.
4028 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
4029 * guarantees no cache will be created for this cgroup after we are
4030 * done (see memcg_create_kmem_cache()).
4031 */
4032 memcg->kmem_acct_active = false;
4033
4034 memcg_deactivate_kmem_caches(memcg);
4035
4036 kmemcg_id = memcg->kmemcg_id;
4037 BUG_ON(kmemcg_id < 0);
4038
4039 parent = parent_mem_cgroup(memcg);
4040 if (!parent)
4041 parent = root_mem_cgroup;
4042
4043 /*
4044 * Change kmemcg_id of this cgroup and all its descendants to the
4045 * parent's id, and then move all entries from this cgroup's list_lrus
4046 * to ones of the parent. After we have finished, all list_lrus
4047 * corresponding to this cgroup are guaranteed to remain empty. The
4048 * ordering is imposed by list_lru_node->lock taken by
4049 * memcg_drain_all_list_lrus().
4050 */
4051 css_for_each_descendant_pre(css, &memcg->css) {
4052 child = mem_cgroup_from_css(css);
4053 BUG_ON(child->kmemcg_id != kmemcg_id);
4054 child->kmemcg_id = parent->kmemcg_id;
4055 if (!memcg->use_hierarchy)
4056 break;
4057 }
4058 memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
4059
4060 memcg_free_cache_id(kmemcg_id);
4061}
4062
4017static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4063static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4018{ 4064{
4019 memcg_destroy_kmem_caches(memcg); 4065 if (memcg->kmem_acct_activated) {
4066 memcg_destroy_kmem_caches(memcg);
4067 static_key_slow_dec(&memcg_kmem_enabled_key);
4068 WARN_ON(page_counter_read(&memcg->kmem));
4069 }
4020 mem_cgroup_sockets_destroy(memcg); 4070 mem_cgroup_sockets_destroy(memcg);
4021} 4071}
4022#else 4072#else
@@ -4025,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4025 return 0; 4075 return 0;
4026} 4076}
4027 4077
4078static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
4079{
4080}
4081
4028static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4082static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4029{ 4083{
4030} 4084}
@@ -4443,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4443 free_mem_cgroup_per_zone_info(memcg, node); 4497 free_mem_cgroup_per_zone_info(memcg, node);
4444 4498
4445 free_percpu(memcg->stat); 4499 free_percpu(memcg->stat);
4446
4447 disarm_static_keys(memcg);
4448 kfree(memcg); 4500 kfree(memcg);
4449} 4501}
4450 4502
@@ -4581,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4581 spin_unlock(&memcg->event_list_lock); 4633 spin_unlock(&memcg->event_list_lock);
4582 4634
4583 vmpressure_cleanup(&memcg->vmpressure); 4635 vmpressure_cleanup(&memcg->vmpressure);
4636
4637 memcg_deactivate_kmem(memcg);
4584} 4638}
4585 4639
4586static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4640static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
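With this patch memcg_nr_cache_ids is protected by memcg_cache_ids_sem: memcg_alloc_cache_id() grows the id space under the write lock, while readers such as __list_lru_init() bracket their use of the current size with memcg_get_cache_ids()/memcg_put_cache_ids(). Below is a minimal sketch of that read-side protocol, with an invented structure standing in for the real users; it assumes the CONFIG_MEMCG_KMEM declarations from <linux/memcontrol.h>.

#include <linux/memcontrol.h>
#include <linux/slab.h>

/* Invented structure sized by the per-memcg cache id space. */
struct example_cache {
        void **per_memcg;       /* one slot per kmem-active memcg */
};

static int example_init_per_memcg(struct example_cache *cache)
{
        int err = 0;

        memcg_get_cache_ids();  /* down_read(&memcg_cache_ids_sem) */

        /*
         * The array length must be read and used under the read lock; a
         * resize (memcg_alloc_cache_id() taking the write side) cannot
         * run until memcg_put_cache_ids() below.  A real user would also
         * register @cache here so later resizes can grow the array, the
         * way list_lru_register() does in the list_lru.c hunk above.
         */
        cache->per_memcg = kcalloc(memcg_nr_cache_ids, sizeof(void *),
                                   GFP_KERNEL);
        if (!cache->per_memcg)
                err = -ENOMEM;

        memcg_put_cache_ids();  /* up_read(&memcg_cache_ids_sem) */
        return err;
}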
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index feb803bf3443..d487f8dc6d39 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -242,15 +242,8 @@ void shake_page(struct page *p, int access)
242 * Only call shrink_node_slabs here (which would also shrink 242 * Only call shrink_node_slabs here (which would also shrink
243 * other caches) if access is not potentially fatal. 243 * other caches) if access is not potentially fatal.
244 */ 244 */
245 if (access) { 245 if (access)
246 int nr; 246 drop_slab_node(page_to_nid(p));
247 int nid = page_to_nid(p);
248 do {
249 nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
250 if (page_count(p) == 1)
251 break;
252 } while (nr > 10);
253 }
254} 247}
255EXPORT_SYMBOL_GPL(shake_page); 248EXPORT_SYMBOL_GPL(shake_page);
256 249
@@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags)
1654 * setting PG_hwpoison. 1647 * setting PG_hwpoison.
1655 */ 1648 */
1656 if (!is_free_buddy_page(page)) 1649 if (!is_free_buddy_page(page))
1657 lru_add_drain_all();
1658 if (!is_free_buddy_page(page))
1659 drain_all_pages(page_zone(page)); 1650 drain_all_pages(page_zone(page));
1660 SetPageHWPoison(page); 1651 SetPageHWPoison(page);
1661 if (!is_free_buddy_page(page)) 1652 if (!is_free_buddy_page(page))
diff --git a/mm/memory.c b/mm/memory.c
index bbe6a73a899d..99275325f303 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3013,14 +3013,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3013 bool migrated = false; 3013 bool migrated = false;
3014 int flags = 0; 3014 int flags = 0;
3015 3015
3016 /* A PROT_NONE fault should not end up here */
3017 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
3018
3016 /* 3019 /*
3017 * The "pte" at this point cannot be used safely without 3020 * The "pte" at this point cannot be used safely without
3018 * validation through pte_unmap_same(). It's of NUMA type but 3021 * validation through pte_unmap_same(). It's of NUMA type but
3019 * the pfn may be screwed if the read is non atomic. 3022 * the pfn may be screwed if the read is non atomic.
3020 * 3023 *
3021 * ptep_modify_prot_start is not called as this is clearing 3024 * We can safely just do a "set_pte_at()", because the old
3022 * the _PAGE_NUMA bit and it is not really expected that there 3025 * page table entry is not accessible, so there would be no
3023 * would be concurrent hardware modifications to the PTE. 3026 * concurrent hardware modifications to the PTE.
3024 */ 3027 */
3025 ptl = pte_lockptr(mm, pmd); 3028 ptl = pte_lockptr(mm, pmd);
3026 spin_lock(ptl); 3029 spin_lock(ptl);
@@ -3029,7 +3032,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3029 goto out; 3032 goto out;
3030 } 3033 }
3031 3034
3032 pte = pte_mknonnuma(pte); 3035 /* Make it present again */
3036 pte = pte_modify(pte, vma->vm_page_prot);
3037 pte = pte_mkyoung(pte);
3033 set_pte_at(mm, addr, ptep, pte); 3038 set_pte_at(mm, addr, ptep, pte);
3034 update_mmu_cache(vma, addr, ptep); 3039 update_mmu_cache(vma, addr, ptep);
3035 3040
@@ -3038,7 +3043,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3038 pte_unmap_unlock(ptep, ptl); 3043 pte_unmap_unlock(ptep, ptl);
3039 return 0; 3044 return 0;
3040 } 3045 }
3041 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3042 3046
3043 /* 3047 /*
3044 * Avoid grouping on DSO/COW pages in specific and RO pages 3048 * Avoid grouping on DSO/COW pages in specific and RO pages
@@ -3124,7 +3128,7 @@ static int handle_pte_fault(struct mm_struct *mm,
3124 pte, pmd, flags, entry); 3128 pte, pmd, flags, entry);
3125 } 3129 }
3126 3130
3127 if (pte_numa(entry)) 3131 if (pte_protnone(entry))
3128 return do_numa_page(mm, vma, address, entry, pte, pmd); 3132 return do_numa_page(mm, vma, address, entry, pte, pmd);
3129 3133
3130 ptl = pte_lockptr(mm, pmd); 3134 ptl = pte_lockptr(mm, pmd);
@@ -3202,7 +3206,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3202 if (pmd_trans_splitting(orig_pmd)) 3206 if (pmd_trans_splitting(orig_pmd))
3203 return 0; 3207 return 0;
3204 3208
3205 if (pmd_numa(orig_pmd)) 3209 if (pmd_protnone(orig_pmd))
3206 return do_huge_pmd_numa_page(mm, vma, address, 3210 return do_huge_pmd_numa_page(mm, vma, address,
3207 orig_pmd, pmd); 3211 orig_pmd, pmd);
3208 3212
@@ -3458,7 +3462,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3458 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3462 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3459 return -EINVAL; 3463 return -EINVAL;
3460 3464
3461 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); 3465 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
3462 if (write) 3466 if (write)
3463 memcpy_toio(maddr + offset, buf, len); 3467 memcpy_toio(maddr + offset, buf, len);
3464 else 3468 else
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f1bd23803576..c75f4dcec808 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -569,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
569{ 569{
570 int nr_updated; 570 int nr_updated;
571 571
572 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); 572 nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
573 if (nr_updated) 573 if (nr_updated)
574 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 574 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
575 575
diff --git a/mm/migrate.c b/mm/migrate.c
index f98067e5d353..85e042686031 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1654,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd)
1654 return PageLocked(page); 1654 return PageLocked(page);
1655} 1655}
1656 1656
1657void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
1658{
1659 struct page *page = pmd_page(*pmd);
1660 wait_on_page_locked(page);
1661}
1662
1663/* 1657/*
1664 * Attempt to migrate a misplaced page to the specified destination 1658 * Attempt to migrate a misplaced page to the specified destination
1665 * node. Caller is expected to have an elevated reference count on 1659 * node. Caller is expected to have an elevated reference count on
@@ -1853,7 +1847,7 @@ out_fail:
1853out_dropref: 1847out_dropref:
1854 ptl = pmd_lock(mm, pmd); 1848 ptl = pmd_lock(mm, pmd);
1855 if (pmd_same(*pmd, entry)) { 1849 if (pmd_same(*pmd, entry)) {
1856 entry = pmd_mknonnuma(entry); 1850 entry = pmd_modify(entry, vma->vm_page_prot);
1857 set_pmd_at(mm, mmun_start, pmd, entry); 1851 set_pmd_at(mm, mmun_start, pmd, entry);
1858 update_mmu_cache_pmd(vma, address, &entry); 1852 update_mmu_cache_pmd(vma, address, &entry);
1859 } 1853 }
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4074caf9936b..5f420f7fafa1 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -14,14 +14,14 @@
14#include "internal.h" 14#include "internal.h"
15 15
16#ifdef CONFIG_DEBUG_MEMORY_INIT 16#ifdef CONFIG_DEBUG_MEMORY_INIT
17int mminit_loglevel; 17int __meminitdata mminit_loglevel;
18 18
19#ifndef SECTIONS_SHIFT 19#ifndef SECTIONS_SHIFT
20#define SECTIONS_SHIFT 0 20#define SECTIONS_SHIFT 0
21#endif 21#endif
22 22
23/* The zonelists are simply reported, validation is manual. */ 23/* The zonelists are simply reported, validation is manual. */
24void mminit_verify_zonelist(void) 24void __init mminit_verify_zonelist(void)
25{ 25{
26 int nid; 26 int nid;
27 27
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 33121662f08b..44727811bf4c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -75,36 +75,34 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
75 oldpte = *pte; 75 oldpte = *pte;
76 if (pte_present(oldpte)) { 76 if (pte_present(oldpte)) {
77 pte_t ptent; 77 pte_t ptent;
78 bool updated = false;
79 78
80 if (!prot_numa) { 79 /*
81 ptent = ptep_modify_prot_start(mm, addr, pte); 80 * Avoid trapping faults against the zero or KSM
82 if (pte_numa(ptent)) 81 * pages. See similar comment in change_huge_pmd.
83 ptent = pte_mknonnuma(ptent); 82 */
84 ptent = pte_modify(ptent, newprot); 83 if (prot_numa) {
85 /*
86 * Avoid taking write faults for pages we
87 * know to be dirty.
88 */
89 if (dirty_accountable && pte_dirty(ptent) &&
90 (pte_soft_dirty(ptent) ||
91 !(vma->vm_flags & VM_SOFTDIRTY)))
92 ptent = pte_mkwrite(ptent);
93 ptep_modify_prot_commit(mm, addr, pte, ptent);
94 updated = true;
95 } else {
96 struct page *page; 84 struct page *page;
97 85
98 page = vm_normal_page(vma, addr, oldpte); 86 page = vm_normal_page(vma, addr, oldpte);
99 if (page && !PageKsm(page)) { 87 if (!page || PageKsm(page))
100 if (!pte_numa(oldpte)) { 88 continue;
101 ptep_set_numa(mm, addr, pte); 89
102 updated = true; 90 /* Avoid TLB flush if possible */
103 } 91 if (pte_protnone(oldpte))
104 } 92 continue;
105 } 93 }
106 if (updated) 94
107 pages++; 95 ptent = ptep_modify_prot_start(mm, addr, pte);
96 ptent = pte_modify(ptent, newprot);
97
98 /* Avoid taking write faults for known dirty pages */
99 if (dirty_accountable && pte_dirty(ptent) &&
100 (pte_soft_dirty(ptent) ||
101 !(vma->vm_flags & VM_SOFTDIRTY))) {
102 ptent = pte_mkwrite(ptent);
103 }
104 ptep_modify_prot_commit(mm, addr, pte, ptent);
105 pages++;
108 } else if (IS_ENABLED(CONFIG_MIGRATION)) { 106 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
109 swp_entry_t entry = pte_to_swp_entry(oldpte); 107 swp_entry_t entry = pte_to_swp_entry(oldpte);
110 108
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d52ab18fe0d..cb4758263f6b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -172,7 +172,7 @@ static void __free_pages_ok(struct page *page, unsigned int order);
172 * 1G machine -> (16M dma, 784M normal, 224M high) 172 * 1G machine -> (16M dma, 784M normal, 224M high)
173 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 173 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
174 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 174 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
175 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 175 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
176 * 176 *
177 * TBD: should special case ZONE_DMA32 machines here - in those we normally 177 * TBD: should special case ZONE_DMA32 machines here - in those we normally
178 * don't need any ZONE_NORMAL reservation 178 * don't need any ZONE_NORMAL reservation
@@ -3871,18 +3871,29 @@ static int __build_all_zonelists(void *data)
3871 return 0; 3871 return 0;
3872} 3872}
3873 3873
3874static noinline void __init
3875build_all_zonelists_init(void)
3876{
3877 __build_all_zonelists(NULL);
3878 mminit_verify_zonelist();
3879 cpuset_init_current_mems_allowed();
3880}
3881
3874/* 3882/*
3875 * Called with zonelists_mutex held always 3883 * Called with zonelists_mutex held always
3876 * unless system_state == SYSTEM_BOOTING. 3884 * unless system_state == SYSTEM_BOOTING.
3885 *
3886 * __ref due to (1) call of __meminit annotated setup_zone_pageset
3887 * [we're only called with non-NULL zone through __meminit paths] and
3888 * (2) call of __init annotated helper build_all_zonelists_init
3889 * [protected by SYSTEM_BOOTING].
3877 */ 3890 */
3878void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3891void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3879{ 3892{
3880 set_zonelist_order(); 3893 set_zonelist_order();
3881 3894
3882 if (system_state == SYSTEM_BOOTING) { 3895 if (system_state == SYSTEM_BOOTING) {
3883 __build_all_zonelists(NULL); 3896 build_all_zonelists_init();
3884 mminit_verify_zonelist();
3885 cpuset_init_current_mems_allowed();
3886 } else { 3897 } else {
3887#ifdef CONFIG_MEMORY_HOTPLUG 3898#ifdef CONFIG_MEMORY_HOTPLUG
3888 if (zone) 3899 if (zone)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index dfb79e028ecb..c25f94b33811 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
193 pmd_t *pmdp) 193 pmd_t *pmdp)
194{ 194{
195 pmd_t entry = *pmdp; 195 pmd_t entry = *pmdp;
196 if (pmd_numa(entry))
197 entry = pmd_mknonnuma(entry);
198 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); 196 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
199 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 197 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
200} 198}
diff --git a/mm/slab.c b/mm/slab.c
index 65b5dcb6f671..c4b89eaf4c96 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2382,7 +2382,7 @@ out:
2382 return nr_freed; 2382 return nr_freed;
2383} 2383}
2384 2384
2385int __kmem_cache_shrink(struct kmem_cache *cachep) 2385int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
2386{ 2386{
2387 int ret = 0; 2387 int ret = 0;
2388 int node; 2388 int node;
@@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
2404{ 2404{
2405 int i; 2405 int i;
2406 struct kmem_cache_node *n; 2406 struct kmem_cache_node *n;
2407 int rc = __kmem_cache_shrink(cachep); 2407 int rc = __kmem_cache_shrink(cachep, false);
2408 2408
2409 if (rc) 2409 if (rc)
2410 return rc; 2410 return rc;
@@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3708 int batchcount, int shared, gfp_t gfp) 3708 int batchcount, int shared, gfp_t gfp)
3709{ 3709{
3710 int ret; 3710 int ret;
3711 struct kmem_cache *c = NULL; 3711 struct kmem_cache *c;
3712 int i = 0;
3713 3712
3714 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3713 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3715 3714
@@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3719 if ((ret < 0) || !is_root_cache(cachep)) 3718 if ((ret < 0) || !is_root_cache(cachep))
3720 return ret; 3719 return ret;
3721 3720
3722 VM_BUG_ON(!mutex_is_locked(&slab_mutex)); 3721 lockdep_assert_held(&slab_mutex);
3723 for_each_memcg_cache_index(i) { 3722 for_each_memcg_cache(c, cachep) {
3724 c = cache_from_memcg_idx(cachep, i); 3723 /* return value determined by the root cache only */
3725 if (c) 3724 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
3726 /* return value determined by the parent cache only */
3727 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
3728 } 3725 }
3729 3726
3730 return ret; 3727 return ret;
diff --git a/mm/slab.h b/mm/slab.h
index 90430d6f665e..4c3ac12dd644 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
86extern void create_boot_cache(struct kmem_cache *, const char *name, 86extern void create_boot_cache(struct kmem_cache *, const char *name,
87 size_t size, unsigned long flags); 87 size_t size, unsigned long flags);
88 88
89struct mem_cgroup;
90
91int slab_unmergeable(struct kmem_cache *s); 89int slab_unmergeable(struct kmem_cache *s);
92struct kmem_cache *find_mergeable(size_t size, size_t align, 90struct kmem_cache *find_mergeable(size_t size, size_t align,
93 unsigned long flags, const char *name, void (*ctor)(void *)); 91 unsigned long flags, const char *name, void (*ctor)(void *));
@@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
140#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) 138#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
141 139
142int __kmem_cache_shutdown(struct kmem_cache *); 140int __kmem_cache_shutdown(struct kmem_cache *);
143int __kmem_cache_shrink(struct kmem_cache *); 141int __kmem_cache_shrink(struct kmem_cache *, bool);
144void slab_kmem_cache_release(struct kmem_cache *); 142void slab_kmem_cache_release(struct kmem_cache *);
145 143
146struct seq_file; 144struct seq_file;
@@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
165 size_t count, loff_t *ppos); 163 size_t count, loff_t *ppos);
166 164
167#ifdef CONFIG_MEMCG_KMEM 165#ifdef CONFIG_MEMCG_KMEM
166/*
167 * Iterate over all memcg caches of the given root cache. The caller must hold
168 * slab_mutex.
169 */
170#define for_each_memcg_cache(iter, root) \
171 list_for_each_entry(iter, &(root)->memcg_params.list, \
172 memcg_params.list)
173
174#define for_each_memcg_cache_safe(iter, tmp, root) \
175 list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
176 memcg_params.list)
177
168static inline bool is_root_cache(struct kmem_cache *s) 178static inline bool is_root_cache(struct kmem_cache *s)
169{ 179{
170 return !s->memcg_params || s->memcg_params->is_root_cache; 180 return s->memcg_params.is_root_cache;
171} 181}
172 182
173static inline bool slab_equal_or_root(struct kmem_cache *s, 183static inline bool slab_equal_or_root(struct kmem_cache *s,
174 struct kmem_cache *p) 184 struct kmem_cache *p)
175{ 185{
176 return (p == s) || 186 return p == s || p == s->memcg_params.root_cache;
177 (s->memcg_params && (p == s->memcg_params->root_cache));
178} 187}
179 188
180/* 189/*
@@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s,
185static inline const char *cache_name(struct kmem_cache *s) 194static inline const char *cache_name(struct kmem_cache *s)
186{ 195{
187 if (!is_root_cache(s)) 196 if (!is_root_cache(s))
188 return s->memcg_params->root_cache->name; 197 s = s->memcg_params.root_cache;
189 return s->name; 198 return s->name;
190} 199}
191 200
192/* 201/*
193 * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. 202 * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
194 * That said the caller must assure the memcg's cache won't go away. Since once 203 * That said the caller must assure the memcg's cache won't go away by either
195 * created a memcg's cache is destroyed only along with the root cache, it is 204 * taking a css reference to the owner cgroup, or holding the slab_mutex.
196 * true if we are going to allocate from the cache or hold a reference to the
197 * root cache by other means. Otherwise, we should hold either the slab_mutex
198 * or the memcg's slab_caches_mutex while calling this function and accessing
199 * the returned value.
200 */ 205 */
201static inline struct kmem_cache * 206static inline struct kmem_cache *
202cache_from_memcg_idx(struct kmem_cache *s, int idx) 207cache_from_memcg_idx(struct kmem_cache *s, int idx)
203{ 208{
204 struct kmem_cache *cachep; 209 struct kmem_cache *cachep;
205 struct memcg_cache_params *params; 210 struct memcg_cache_array *arr;
206
207 if (!s->memcg_params)
208 return NULL;
209 211
210 rcu_read_lock(); 212 rcu_read_lock();
211 params = rcu_dereference(s->memcg_params); 213 arr = rcu_dereference(s->memcg_params.memcg_caches);
212 214
213 /* 215 /*
214 * Make sure we will access the up-to-date value. The code updating 216 * Make sure we will access the up-to-date value. The code updating
215 * memcg_caches issues a write barrier to match this (see 217 * memcg_caches issues a write barrier to match this (see
216 * memcg_register_cache()). 218 * memcg_create_kmem_cache()).
217 */ 219 */
218 cachep = lockless_dereference(params->memcg_caches[idx]); 220 cachep = lockless_dereference(arr->entries[idx]);
219 rcu_read_unlock(); 221 rcu_read_unlock();
220 222
221 return cachep; 223 return cachep;
@@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
225{ 227{
226 if (is_root_cache(s)) 228 if (is_root_cache(s))
227 return s; 229 return s;
228 return s->memcg_params->root_cache; 230 return s->memcg_params.root_cache;
229} 231}
230 232
231static __always_inline int memcg_charge_slab(struct kmem_cache *s, 233static __always_inline int memcg_charge_slab(struct kmem_cache *s,
@@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s,
235 return 0; 237 return 0;
236 if (is_root_cache(s)) 238 if (is_root_cache(s))
237 return 0; 239 return 0;
238 return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); 240 return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order);
239} 241}
240 242
241static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) 243static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
@@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
244 return; 246 return;
245 if (is_root_cache(s)) 247 if (is_root_cache(s))
246 return; 248 return;
247 memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); 249 memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
248} 250}
249#else 251
252extern void slab_init_memcg_params(struct kmem_cache *);
253
254#else /* !CONFIG_MEMCG_KMEM */
255
256#define for_each_memcg_cache(iter, root) \
257 for ((void)(iter), (void)(root); 0; )
258#define for_each_memcg_cache_safe(iter, tmp, root) \
259 for ((void)(iter), (void)(tmp), (void)(root); 0; )
260
250static inline bool is_root_cache(struct kmem_cache *s) 261static inline bool is_root_cache(struct kmem_cache *s)
251{ 262{
252 return true; 263 return true;
@@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
282static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) 293static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
283{ 294{
284} 295}
285#endif 296
297static inline void slab_init_memcg_params(struct kmem_cache *s)
298{
299}
300#endif /* CONFIG_MEMCG_KMEM */
286 301
287static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) 302static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
288{ 303{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6e1e4cf65836..1a1cc89acaa3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
106#endif 106#endif
107 107
108#ifdef CONFIG_MEMCG_KMEM 108#ifdef CONFIG_MEMCG_KMEM
109static int memcg_alloc_cache_params(struct mem_cgroup *memcg, 109void slab_init_memcg_params(struct kmem_cache *s)
110 struct kmem_cache *s, struct kmem_cache *root_cache)
111{ 110{
112 size_t size; 111 s->memcg_params.is_root_cache = true;
112 INIT_LIST_HEAD(&s->memcg_params.list);
113 RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
114}
115
116static int init_memcg_params(struct kmem_cache *s,
117 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
118{
119 struct memcg_cache_array *arr;
113 120
114 if (!memcg_kmem_enabled()) 121 if (memcg) {
122 s->memcg_params.is_root_cache = false;
123 s->memcg_params.memcg = memcg;
124 s->memcg_params.root_cache = root_cache;
115 return 0; 125 return 0;
126 }
116 127
117 if (!memcg) { 128 slab_init_memcg_params(s);
118 size = offsetof(struct memcg_cache_params, memcg_caches);
119 size += memcg_limited_groups_array_size * sizeof(void *);
120 } else
121 size = sizeof(struct memcg_cache_params);
122 129
123 s->memcg_params = kzalloc(size, GFP_KERNEL); 130 if (!memcg_nr_cache_ids)
124 if (!s->memcg_params) 131 return 0;
125 return -ENOMEM;
126 132
127 if (memcg) { 133 arr = kzalloc(sizeof(struct memcg_cache_array) +
128 s->memcg_params->memcg = memcg; 134 memcg_nr_cache_ids * sizeof(void *),
129 s->memcg_params->root_cache = root_cache; 135 GFP_KERNEL);
130 } else 136 if (!arr)
131 s->memcg_params->is_root_cache = true; 137 return -ENOMEM;
132 138
139 RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
133 return 0; 140 return 0;
134} 141}
135 142
136static void memcg_free_cache_params(struct kmem_cache *s) 143static void destroy_memcg_params(struct kmem_cache *s)
137{ 144{
138 kfree(s->memcg_params); 145 if (is_root_cache(s))
146 kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
139} 147}
140 148
141static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) 149static int update_memcg_params(struct kmem_cache *s, int new_array_size)
142{ 150{
143 int size; 151 struct memcg_cache_array *old, *new;
144 struct memcg_cache_params *new_params, *cur_params;
145
146 BUG_ON(!is_root_cache(s));
147 152
148 size = offsetof(struct memcg_cache_params, memcg_caches); 153 if (!is_root_cache(s))
149 size += num_memcgs * sizeof(void *); 154 return 0;
150 155
151 new_params = kzalloc(size, GFP_KERNEL); 156 new = kzalloc(sizeof(struct memcg_cache_array) +
152 if (!new_params) 157 new_array_size * sizeof(void *), GFP_KERNEL);
158 if (!new)
153 return -ENOMEM; 159 return -ENOMEM;
154 160
155 cur_params = s->memcg_params; 161 old = rcu_dereference_protected(s->memcg_params.memcg_caches,
156 memcpy(new_params->memcg_caches, cur_params->memcg_caches, 162 lockdep_is_held(&slab_mutex));
157 memcg_limited_groups_array_size * sizeof(void *)); 163 if (old)
158 164 memcpy(new->entries, old->entries,
159 new_params->is_root_cache = true; 165 memcg_nr_cache_ids * sizeof(void *));
160
161 rcu_assign_pointer(s->memcg_params, new_params);
162 if (cur_params)
163 kfree_rcu(cur_params, rcu_head);
164 166
167 rcu_assign_pointer(s->memcg_params.memcg_caches, new);
168 if (old)
169 kfree_rcu(old, rcu);
165 return 0; 170 return 0;
166} 171}
167 172
@@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs)
169{ 174{
170 struct kmem_cache *s; 175 struct kmem_cache *s;
171 int ret = 0; 176 int ret = 0;
172 mutex_lock(&slab_mutex);
173 177
178 mutex_lock(&slab_mutex);
174 list_for_each_entry(s, &slab_caches, list) { 179 list_for_each_entry(s, &slab_caches, list) {
175 if (!is_root_cache(s)) 180 ret = update_memcg_params(s, num_memcgs);
176 continue;
177
178 ret = memcg_update_cache_params(s, num_memcgs);
179 /* 181 /*
180 * Instead of freeing the memory, we'll just leave the caches 182 * Instead of freeing the memory, we'll just leave the caches
181 * up to this point in an updated state. 183 * up to this point in an updated state.
182 */ 184 */
183 if (ret) 185 if (ret)
184 goto out; 186 break;
185 } 187 }
186
187 memcg_update_array_size(num_memcgs);
188out:
189 mutex_unlock(&slab_mutex); 188 mutex_unlock(&slab_mutex);
190 return ret; 189 return ret;
191} 190}
192#else 191#else
193static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, 192static inline int init_memcg_params(struct kmem_cache *s,
194 struct kmem_cache *s, struct kmem_cache *root_cache) 193 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
195{ 194{
196 return 0; 195 return 0;
197} 196}
198 197
199static inline void memcg_free_cache_params(struct kmem_cache *s) 198static inline void destroy_memcg_params(struct kmem_cache *s)
200{ 199{
201} 200}
202#endif /* CONFIG_MEMCG_KMEM */ 201#endif /* CONFIG_MEMCG_KMEM */
@@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
314 s->align = align; 313 s->align = align;
315 s->ctor = ctor; 314 s->ctor = ctor;
316 315
317 err = memcg_alloc_cache_params(memcg, s, root_cache); 316 err = init_memcg_params(s, memcg, root_cache);
318 if (err) 317 if (err)
319 goto out_free_cache; 318 goto out_free_cache;
320 319
@@ -330,7 +329,7 @@ out:
330 return s; 329 return s;
331 330
332out_free_cache: 331out_free_cache:
333 memcg_free_cache_params(s); 332 destroy_memcg_params(s);
334 kmem_cache_free(kmem_cache, s); 333 kmem_cache_free(kmem_cache, s);
335 goto out; 334 goto out;
336} 335}
@@ -369,6 +368,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
369 368
370 get_online_cpus(); 369 get_online_cpus();
371 get_online_mems(); 370 get_online_mems();
371 memcg_get_cache_ids();
372 372
373 mutex_lock(&slab_mutex); 373 mutex_lock(&slab_mutex);
374 374
@@ -407,6 +407,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
407out_unlock: 407out_unlock:
408 mutex_unlock(&slab_mutex); 408 mutex_unlock(&slab_mutex);
409 409
410 memcg_put_cache_ids();
410 put_online_mems(); 411 put_online_mems();
411 put_online_cpus(); 412 put_online_cpus();
412 413
@@ -439,13 +440,8 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s,
439 *need_rcu_barrier = true; 440 *need_rcu_barrier = true;
440 441
441#ifdef CONFIG_MEMCG_KMEM 442#ifdef CONFIG_MEMCG_KMEM
442 if (!is_root_cache(s)) { 443 if (!is_root_cache(s))
443 struct kmem_cache *root_cache = s->memcg_params->root_cache; 444 list_del(&s->memcg_params.list);
444 int memcg_id = memcg_cache_id(s->memcg_params->memcg);
445
446 BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s);
447 root_cache->memcg_params->memcg_caches[memcg_id] = NULL;
448 }
449#endif 445#endif
450 list_move(&s->list, release); 446 list_move(&s->list, release);
451 return 0; 447 return 0;
@@ -482,9 +478,11 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
482 struct kmem_cache *root_cache) 478 struct kmem_cache *root_cache)
483{ 479{
484 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ 480 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
485 int memcg_id = memcg_cache_id(memcg); 481 struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
482 struct memcg_cache_array *arr;
486 struct kmem_cache *s = NULL; 483 struct kmem_cache *s = NULL;
487 char *cache_name; 484 char *cache_name;
485 int idx;
488 486
489 get_online_cpus(); 487 get_online_cpus();
490 get_online_mems(); 488 get_online_mems();
@@ -492,17 +490,27 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
492 mutex_lock(&slab_mutex); 490 mutex_lock(&slab_mutex);
493 491
494 /* 492 /*
493 * The memory cgroup could have been deactivated while the cache
494 * creation work was pending.
495 */
496 if (!memcg_kmem_is_active(memcg))
497 goto out_unlock;
498
499 idx = memcg_cache_id(memcg);
500 arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
501 lockdep_is_held(&slab_mutex));
502
503 /*
495 * Since per-memcg caches are created asynchronously on first 504 * Since per-memcg caches are created asynchronously on first
496 * allocation (see memcg_kmem_get_cache()), several threads can try to 505 * allocation (see memcg_kmem_get_cache()), several threads can try to
497 * create the same cache, but only one of them may succeed. 506 * create the same cache, but only one of them may succeed.
498 */ 507 */
499 if (cache_from_memcg_idx(root_cache, memcg_id)) 508 if (arr->entries[idx])
500 goto out_unlock; 509 goto out_unlock;
501 510
502 cgroup_name(mem_cgroup_css(memcg)->cgroup, 511 cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
503 memcg_name_buf, sizeof(memcg_name_buf));
504 cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, 512 cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
505 memcg_cache_id(memcg), memcg_name_buf); 513 css->id, memcg_name_buf);
506 if (!cache_name) 514 if (!cache_name)
507 goto out_unlock; 515 goto out_unlock;
508 516
@@ -520,13 +528,15 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
520 goto out_unlock; 528 goto out_unlock;
521 } 529 }
522 530
531 list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
532
523 /* 533 /*
524 * Since readers won't lock (see cache_from_memcg_idx()), we need a 534 * Since readers won't lock (see cache_from_memcg_idx()), we need a
525 * barrier here to ensure nobody will see the kmem_cache partially 535 * barrier here to ensure nobody will see the kmem_cache partially
526 * initialized. 536 * initialized.
527 */ 537 */
528 smp_wmb(); 538 smp_wmb();
529 root_cache->memcg_params->memcg_caches[memcg_id] = s; 539 arr->entries[idx] = s;
530 540
531out_unlock: 541out_unlock:
532 mutex_unlock(&slab_mutex); 542 mutex_unlock(&slab_mutex);
@@ -535,6 +545,37 @@ out_unlock:
535 put_online_cpus(); 545 put_online_cpus();
536} 546}
537 547
548void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
549{
550 int idx;
551 struct memcg_cache_array *arr;
552 struct kmem_cache *s, *c;
553
554 idx = memcg_cache_id(memcg);
555
556 get_online_cpus();
557 get_online_mems();
558
559 mutex_lock(&slab_mutex);
560 list_for_each_entry(s, &slab_caches, list) {
561 if (!is_root_cache(s))
562 continue;
563
564 arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
565 lockdep_is_held(&slab_mutex));
566 c = arr->entries[idx];
567 if (!c)
568 continue;
569
570 __kmem_cache_shrink(c, true);
571 arr->entries[idx] = NULL;
572 }
573 mutex_unlock(&slab_mutex);
574
575 put_online_mems();
576 put_online_cpus();
577}
578
538void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) 579void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
539{ 580{
540 LIST_HEAD(release); 581 LIST_HEAD(release);
@@ -546,7 +587,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
546 587
547 mutex_lock(&slab_mutex); 588 mutex_lock(&slab_mutex);
548 list_for_each_entry_safe(s, s2, &slab_caches, list) { 589 list_for_each_entry_safe(s, s2, &slab_caches, list) {
549 if (is_root_cache(s) || s->memcg_params->memcg != memcg) 590 if (is_root_cache(s) || s->memcg_params.memcg != memcg)
550 continue; 591 continue;
551 /* 592 /*
552 * The cgroup is about to be freed and therefore has no charges 593 * The cgroup is about to be freed and therefore has no charges
@@ -565,18 +606,20 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
565 606
566void slab_kmem_cache_release(struct kmem_cache *s) 607void slab_kmem_cache_release(struct kmem_cache *s)
567{ 608{
568 memcg_free_cache_params(s); 609 destroy_memcg_params(s);
569 kfree(s->name); 610 kfree(s->name);
570 kmem_cache_free(kmem_cache, s); 611 kmem_cache_free(kmem_cache, s);
571} 612}
572 613
573void kmem_cache_destroy(struct kmem_cache *s) 614void kmem_cache_destroy(struct kmem_cache *s)
574{ 615{
575 int i; 616 struct kmem_cache *c, *c2;
576 LIST_HEAD(release); 617 LIST_HEAD(release);
577 bool need_rcu_barrier = false; 618 bool need_rcu_barrier = false;
578 bool busy = false; 619 bool busy = false;
579 620
621 BUG_ON(!is_root_cache(s));
622
580 get_online_cpus(); 623 get_online_cpus();
581 get_online_mems(); 624 get_online_mems();
582 625
@@ -586,10 +629,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
586 if (s->refcount) 629 if (s->refcount)
587 goto out_unlock; 630 goto out_unlock;
588 631
589 for_each_memcg_cache_index(i) { 632 for_each_memcg_cache_safe(c, c2, s) {
590 struct kmem_cache *c = cache_from_memcg_idx(s, i); 633 if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
591
592 if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
593 busy = true; 634 busy = true;
594 } 635 }
595 636
@@ -619,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
619 660
620 get_online_cpus(); 661 get_online_cpus();
621 get_online_mems(); 662 get_online_mems();
622 ret = __kmem_cache_shrink(cachep); 663 ret = __kmem_cache_shrink(cachep, false);
623 put_online_mems(); 664 put_online_mems();
624 put_online_cpus(); 665 put_online_cpus();
625 return ret; 666 return ret;
@@ -641,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
641 s->name = name; 682 s->name = name;
642 s->size = s->object_size = size; 683 s->size = s->object_size = size;
643 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); 684 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
685
686 slab_init_memcg_params(s);
687
644 err = __kmem_cache_create(s, flags); 688 err = __kmem_cache_create(s, flags);
645 689
646 if (err) 690 if (err)
@@ -920,16 +964,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
920{ 964{
921 struct kmem_cache *c; 965 struct kmem_cache *c;
922 struct slabinfo sinfo; 966 struct slabinfo sinfo;
923 int i;
924 967
925 if (!is_root_cache(s)) 968 if (!is_root_cache(s))
926 return; 969 return;
927 970
928 for_each_memcg_cache_index(i) { 971 for_each_memcg_cache(c, s) {
929 c = cache_from_memcg_idx(s, i);
930 if (!c)
931 continue;
932
933 memset(&sinfo, 0, sizeof(sinfo)); 972 memset(&sinfo, 0, sizeof(sinfo));
934 get_slabinfo(c, &sinfo); 973 get_slabinfo(c, &sinfo);
935 974
@@ -981,7 +1020,7 @@ int memcg_slab_show(struct seq_file *m, void *p)
981 1020
982 if (p == slab_caches.next) 1021 if (p == slab_caches.next)
983 print_slabinfo_header(m); 1022 print_slabinfo_header(m);
984 if (!is_root_cache(s) && s->memcg_params->memcg == memcg) 1023 if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
985 cache_show(s, m); 1024 cache_show(s, m);
986 return 0; 1025 return 0;
987} 1026}
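
update_memcg_params() above is an instance of the usual RCU "copy and publish" resize: allocate a larger array, copy the old entries while holding slab_mutex, rcu_assign_pointer() the new array in, and kfree_rcu() the old one so lockless readers such as cache_from_memcg_idx() never see a torn array. Stripped of the slab specifics, the idiom looks roughly like this (struct and function names are illustrative, not kernel APIs):

struct ptr_array {
        struct rcu_head rcu;
        void *entries[];
};

static int resize_published_array(struct ptr_array __rcu **slot,
                                  size_t old_n, size_t new_n,
                                  struct mutex *update_lock)
{
        struct ptr_array *old, *new;

        new = kzalloc(sizeof(*new) + new_n * sizeof(void *), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        old = rcu_dereference_protected(*slot, lockdep_is_held(update_lock));
        if (old)
                memcpy(new->entries, old->entries, old_n * sizeof(void *));

        rcu_assign_pointer(*slot, new);         /* publish to readers */
        if (old)
                kfree_rcu(old, rcu);            /* free after a grace period */
        return 0;
}
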
diff --git a/mm/slob.c b/mm/slob.c
index 96a86206a26b..94a7fede6d48 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
618 return 0; 618 return 0;
619} 619}
620 620
621int __kmem_cache_shrink(struct kmem_cache *d) 621int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
622{ 622{
623 return 0; 623 return 0;
624} 624}
diff --git a/mm/slub.c b/mm/slub.c
index 8b8508adf9c2..06cdb1829dc9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2007,6 +2007,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2007 int pages; 2007 int pages;
2008 int pobjects; 2008 int pobjects;
2009 2009
2010 preempt_disable();
2010 do { 2011 do {
2011 pages = 0; 2012 pages = 0;
2012 pobjects = 0; 2013 pobjects = 0;
@@ -2040,6 +2041,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2040 2041
2041 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) 2042 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2042 != oldpage); 2043 != oldpage);
2044 if (unlikely(!s->cpu_partial)) {
2045 unsigned long flags;
2046
2047 local_irq_save(flags);
2048 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2049 local_irq_restore(flags);
2050 }
2051 preempt_enable();
2043#endif 2052#endif
2044} 2053}
2045 2054
@@ -3358,69 +3367,92 @@ void kfree(const void *x)
3358} 3367}
3359EXPORT_SYMBOL(kfree); 3368EXPORT_SYMBOL(kfree);
3360 3369
3370#define SHRINK_PROMOTE_MAX 32
3371
3361/* 3372/*
3362 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 3373 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
3363 * the remaining slabs by the number of items in use. The slabs with the 3374 * up most to the head of the partial lists. New allocations will then
3364 * most items in use come first. New allocations will then fill those up 3375 * fill those up and thus they can be removed from the partial lists.
3365 * and thus they can be removed from the partial lists.
3366 * 3376 *
3367 * The slabs with the least items are placed last. This results in them 3377 * The slabs with the least items are placed last. This results in them
3368 * being allocated from last increasing the chance that the last objects 3378 * being allocated from last increasing the chance that the last objects
3369 * are freed in them. 3379 * are freed in them.
3370 */ 3380 */
3371int __kmem_cache_shrink(struct kmem_cache *s) 3381int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
3372{ 3382{
3373 int node; 3383 int node;
3374 int i; 3384 int i;
3375 struct kmem_cache_node *n; 3385 struct kmem_cache_node *n;
3376 struct page *page; 3386 struct page *page;
3377 struct page *t; 3387 struct page *t;
3378 int objects = oo_objects(s->max); 3388 struct list_head discard;
3379 struct list_head *slabs_by_inuse = 3389 struct list_head promote[SHRINK_PROMOTE_MAX];
3380 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
3381 unsigned long flags; 3390 unsigned long flags;
3391 int ret = 0;
3382 3392
3383 if (!slabs_by_inuse) 3393 if (deactivate) {
3384 return -ENOMEM; 3394 /*
3395 * Disable empty slabs caching. Used to avoid pinning offline
3396 * memory cgroups by kmem pages that can be freed.
3397 */
3398 s->cpu_partial = 0;
3399 s->min_partial = 0;
3400
3401 /*
3402 * s->cpu_partial is checked locklessly (see put_cpu_partial),
3403 * so we have to make sure the change is visible.
3404 */
3405 kick_all_cpus_sync();
3406 }
3385 3407
3386 flush_all(s); 3408 flush_all(s);
3387 for_each_kmem_cache_node(s, node, n) { 3409 for_each_kmem_cache_node(s, node, n) {
3388 if (!n->nr_partial) 3410 INIT_LIST_HEAD(&discard);
3389 continue; 3411 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
3390 3412 INIT_LIST_HEAD(promote + i);
3391 for (i = 0; i < objects; i++)
3392 INIT_LIST_HEAD(slabs_by_inuse + i);
3393 3413
3394 spin_lock_irqsave(&n->list_lock, flags); 3414 spin_lock_irqsave(&n->list_lock, flags);
3395 3415
3396 /* 3416 /*
3397 * Build lists indexed by the items in use in each slab. 3417 * Build lists of slabs to discard or promote.
3398 * 3418 *
3399 * Note that concurrent frees may occur while we hold the 3419 * Note that concurrent frees may occur while we hold the
3400 * list_lock. page->inuse here is the upper limit. 3420 * list_lock. page->inuse here is the upper limit.
3401 */ 3421 */
3402 list_for_each_entry_safe(page, t, &n->partial, lru) { 3422 list_for_each_entry_safe(page, t, &n->partial, lru) {
3403 list_move(&page->lru, slabs_by_inuse + page->inuse); 3423 int free = page->objects - page->inuse;
3404 if (!page->inuse) 3424
3425 /* Do not reread page->inuse */
3426 barrier();
3427
3428 /* We do not keep full slabs on the list */
3429 BUG_ON(free <= 0);
3430
3431 if (free == page->objects) {
3432 list_move(&page->lru, &discard);
3405 n->nr_partial--; 3433 n->nr_partial--;
3434 } else if (free <= SHRINK_PROMOTE_MAX)
3435 list_move(&page->lru, promote + free - 1);
3406 } 3436 }
3407 3437
3408 /* 3438 /*
3409 * Rebuild the partial list with the slabs filled up most 3439 * Promote the slabs filled up most to the head of the
3410 * first and the least used slabs at the end. 3440 * partial list.
3411 */ 3441 */
3412 for (i = objects - 1; i > 0; i--) 3442 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
3413 list_splice(slabs_by_inuse + i, n->partial.prev); 3443 list_splice(promote + i, &n->partial);
3414 3444
3415 spin_unlock_irqrestore(&n->list_lock, flags); 3445 spin_unlock_irqrestore(&n->list_lock, flags);
3416 3446
3417 /* Release empty slabs */ 3447 /* Release empty slabs */
3418 list_for_each_entry_safe(page, t, slabs_by_inuse, lru) 3448 list_for_each_entry_safe(page, t, &discard, lru)
3419 discard_slab(s, page); 3449 discard_slab(s, page);
3450
3451 if (slabs_node(s, node))
3452 ret = 1;
3420 } 3453 }
3421 3454
3422 kfree(slabs_by_inuse); 3455 return ret;
3423 return 0;
3424} 3456}
3425 3457
3426static int slab_mem_going_offline_callback(void *arg) 3458static int slab_mem_going_offline_callback(void *arg)
@@ -3429,7 +3461,7 @@ static int slab_mem_going_offline_callback(void *arg)
3429 3461
3430 mutex_lock(&slab_mutex); 3462 mutex_lock(&slab_mutex);
3431 list_for_each_entry(s, &slab_caches, list) 3463 list_for_each_entry(s, &slab_caches, list)
3432 __kmem_cache_shrink(s); 3464 __kmem_cache_shrink(s, false);
3433 mutex_unlock(&slab_mutex); 3465 mutex_unlock(&slab_mutex);
3434 3466
3435 return 0; 3467 return 0;
@@ -3577,6 +3609,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3577 p->slab_cache = s; 3609 p->slab_cache = s;
3578#endif 3610#endif
3579 } 3611 }
3612 slab_init_memcg_params(s);
3580 list_add(&s->list, &slab_caches); 3613 list_add(&s->list, &slab_caches);
3581 return s; 3614 return s;
3582} 3615}
@@ -3635,13 +3668,10 @@ struct kmem_cache *
3635__kmem_cache_alias(const char *name, size_t size, size_t align, 3668__kmem_cache_alias(const char *name, size_t size, size_t align,
3636 unsigned long flags, void (*ctor)(void *)) 3669 unsigned long flags, void (*ctor)(void *))
3637{ 3670{
3638 struct kmem_cache *s; 3671 struct kmem_cache *s, *c;
3639 3672
3640 s = find_mergeable(size, align, flags, name, ctor); 3673 s = find_mergeable(size, align, flags, name, ctor);
3641 if (s) { 3674 if (s) {
3642 int i;
3643 struct kmem_cache *c;
3644
3645 s->refcount++; 3675 s->refcount++;
3646 3676
3647 /* 3677 /*
@@ -3651,10 +3681,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
3651 s->object_size = max(s->object_size, (int)size); 3681 s->object_size = max(s->object_size, (int)size);
3652 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3682 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3653 3683
3654 for_each_memcg_cache_index(i) { 3684 for_each_memcg_cache(c, s) {
3655 c = cache_from_memcg_idx(s, i);
3656 if (!c)
3657 continue;
3658 c->object_size = s->object_size; 3685 c->object_size = s->object_size;
3659 c->inuse = max_t(int, c->inuse, 3686 c->inuse = max_t(int, c->inuse,
3660 ALIGN(size, sizeof(void *))); 3687 ALIGN(size, sizeof(void *)));
@@ -4691,12 +4718,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4691static ssize_t shrink_store(struct kmem_cache *s, 4718static ssize_t shrink_store(struct kmem_cache *s,
4692 const char *buf, size_t length) 4719 const char *buf, size_t length)
4693{ 4720{
4694 if (buf[0] == '1') { 4721 if (buf[0] == '1')
4695 int rc = kmem_cache_shrink(s); 4722 kmem_cache_shrink(s);
4696 4723 else
4697 if (rc)
4698 return rc;
4699 } else
4700 return -EINVAL; 4724 return -EINVAL;
4701 return length; 4725 return length;
4702} 4726}
@@ -4920,7 +4944,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
4920 err = attribute->store(s, buf, len); 4944 err = attribute->store(s, buf, len);
4921#ifdef CONFIG_MEMCG_KMEM 4945#ifdef CONFIG_MEMCG_KMEM
4922 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { 4946 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
4923 int i; 4947 struct kmem_cache *c;
4924 4948
4925 mutex_lock(&slab_mutex); 4949 mutex_lock(&slab_mutex);
4926 if (s->max_attr_size < len) 4950 if (s->max_attr_size < len)
@@ -4943,11 +4967,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
4943 * directly either failed or succeeded, in which case we loop 4967 * directly either failed or succeeded, in which case we loop
4944 * through the descendants with best-effort propagation. 4968 * through the descendants with best-effort propagation.
4945 */ 4969 */
4946 for_each_memcg_cache_index(i) { 4970 for_each_memcg_cache(c, s)
4947 struct kmem_cache *c = cache_from_memcg_idx(s, i); 4971 attribute->store(c, buf, len);
4948 if (c)
4949 attribute->store(c, buf, len);
4950 }
4951 mutex_unlock(&slab_mutex); 4972 mutex_unlock(&slab_mutex);
4952 } 4973 }
4953#endif 4974#endif
@@ -4964,7 +4985,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
4964 if (is_root_cache(s)) 4985 if (is_root_cache(s))
4965 return; 4986 return;
4966 4987
4967 root_cache = s->memcg_params->root_cache; 4988 root_cache = s->memcg_params.root_cache;
4968 4989
4969 /* 4990 /*
4970 * This mean this cache had no attribute written. Therefore, no point 4991 * This mean this cache had no attribute written. Therefore, no point
@@ -5044,7 +5065,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
5044{ 5065{
5045#ifdef CONFIG_MEMCG_KMEM 5066#ifdef CONFIG_MEMCG_KMEM
5046 if (!is_root_cache(s)) 5067 if (!is_root_cache(s))
5047 return s->memcg_params->root_cache->memcg_kset; 5068 return s->memcg_params.root_cache->memcg_kset;
5048#endif 5069#endif
5049 return slab_kset; 5070 return slab_kset;
5050} 5071}
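
The rewritten __kmem_cache_shrink() above no longer kmalloc()s oo_objects(s->max) list heads per node; it keeps one discard list plus SHRINK_PROMOTE_MAX on-stack promote buckets indexed by the number of free objects, then splices the buckets back from the highest index down so the fullest slabs land at the head of n->partial. A small userspace illustration of just the bucket selection (shrink_bucket() is an illustrative helper, not kernel code, and 64 objects per slab is an arbitrary example):

#include <stdio.h>

#define SHRINK_PROMOTE_MAX 32   /* same constant as in the hunk above */

/*
 * Returns -1 for "discard" (slab completely empty), an index into
 * promote[] for nearly-full slabs, or SHRINK_PROMOTE_MAX meaning
 * "leave the slab where it is on the partial list".
 */
static int shrink_bucket(int objects, int inuse)
{
        int free = objects - inuse;

        if (free == objects)
                return -1;                      /* empty: discard_slab() */
        if (free <= SHRINK_PROMOTE_MAX)
                return free - 1;                /* promote[free - 1] */
        return SHRINK_PROMOTE_MAX;              /* too sparse to promote */
}

int main(void)
{
        int inuse[] = { 0, 63, 60, 10 };        /* sample occupancy values */

        for (unsigned int i = 0; i < sizeof(inuse) / sizeof(inuse[0]); i++)
                printf("inuse=%2d -> bucket %d\n", inuse[i],
                       shrink_bucket(64, inuse[i]));
        return 0;
}

Because promote[0] holds the slabs with a single free object and is spliced last, those end up first on the partial list and are refilled before the emptier ones.
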
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 224dd298fdcd..5e8eadd71bac 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -232,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker);
232 232
233#define SHRINK_BATCH 128 233#define SHRINK_BATCH 128
234 234
235static unsigned long shrink_slabs(struct shrink_control *shrinkctl, 235static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
236 struct shrinker *shrinker, 236 struct shrinker *shrinker,
237 unsigned long nr_scanned, 237 unsigned long nr_scanned,
238 unsigned long nr_eligible) 238 unsigned long nr_eligible)
239{ 239{
240 unsigned long freed = 0; 240 unsigned long freed = 0;
241 unsigned long long delta; 241 unsigned long long delta;
@@ -344,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
344} 344}
345 345
346/** 346/**
347 * shrink_node_slabs - shrink slab caches of a given node 347 * shrink_slab - shrink slab caches
348 * @gfp_mask: allocation context 348 * @gfp_mask: allocation context
349 * @nid: node whose slab caches to target 349 * @nid: node whose slab caches to target
350 * @memcg: memory cgroup whose slab caches to target
350 * @nr_scanned: pressure numerator 351 * @nr_scanned: pressure numerator
351 * @nr_eligible: pressure denominator 352 * @nr_eligible: pressure denominator
352 * 353 *
@@ -355,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
355 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, 356 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
356 * unaware shrinkers will receive a node id of 0 instead. 357 * unaware shrinkers will receive a node id of 0 instead.
357 * 358 *
359 * @memcg specifies the memory cgroup to target. If it is not NULL,
360 * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
361 * objects from the memory cgroup specified. Otherwise all shrinkers
362 * are called, and memcg aware shrinkers are supposed to scan the
363 * global list then.
364 *
358 * @nr_scanned and @nr_eligible form a ratio that indicate how much of 365 * @nr_scanned and @nr_eligible form a ratio that indicate how much of
359 * the available objects should be scanned. Page reclaim for example 366 * the available objects should be scanned. Page reclaim for example
360 * passes the number of pages scanned and the number of pages on the 367 * passes the number of pages scanned and the number of pages on the
@@ -365,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
365 * 372 *
366 * Returns the number of reclaimed slab objects. 373 * Returns the number of reclaimed slab objects.
367 */ 374 */
368unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, 375static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
369 unsigned long nr_scanned, 376 struct mem_cgroup *memcg,
370 unsigned long nr_eligible) 377 unsigned long nr_scanned,
378 unsigned long nr_eligible)
371{ 379{
372 struct shrinker *shrinker; 380 struct shrinker *shrinker;
373 unsigned long freed = 0; 381 unsigned long freed = 0;
374 382
383 if (memcg && !memcg_kmem_is_active(memcg))
384 return 0;
385
375 if (nr_scanned == 0) 386 if (nr_scanned == 0)
376 nr_scanned = SWAP_CLUSTER_MAX; 387 nr_scanned = SWAP_CLUSTER_MAX;
377 388
@@ -390,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
390 struct shrink_control sc = { 401 struct shrink_control sc = {
391 .gfp_mask = gfp_mask, 402 .gfp_mask = gfp_mask,
392 .nid = nid, 403 .nid = nid,
404 .memcg = memcg,
393 }; 405 };
394 406
407 if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
408 continue;
409
395 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 410 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
396 sc.nid = 0; 411 sc.nid = 0;
397 412
398 freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); 413 freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
399 } 414 }
400 415
401 up_read(&shrinker_rwsem); 416 up_read(&shrinker_rwsem);
@@ -404,6 +419,29 @@ out:
404 return freed; 419 return freed;
405} 420}
406 421
422void drop_slab_node(int nid)
423{
424 unsigned long freed;
425
426 do {
427 struct mem_cgroup *memcg = NULL;
428
429 freed = 0;
430 do {
431 freed += shrink_slab(GFP_KERNEL, nid, memcg,
432 1000, 1000);
433 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
434 } while (freed > 10);
435}
436
437void drop_slab(void)
438{
439 int nid;
440
441 for_each_online_node(nid)
442 drop_slab_node(nid);
443}
444
407static inline int is_page_cache_freeable(struct page *page) 445static inline int is_page_cache_freeable(struct page *page)
408{ 446{
409 /* 447 /*
@@ -2276,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
2276static bool shrink_zone(struct zone *zone, struct scan_control *sc, 2314static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2277 bool is_classzone) 2315 bool is_classzone)
2278{ 2316{
2317 struct reclaim_state *reclaim_state = current->reclaim_state;
2279 unsigned long nr_reclaimed, nr_scanned; 2318 unsigned long nr_reclaimed, nr_scanned;
2280 bool reclaimable = false; 2319 bool reclaimable = false;
2281 2320
@@ -2294,6 +2333,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2294 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2333 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2295 do { 2334 do {
2296 unsigned long lru_pages; 2335 unsigned long lru_pages;
2336 unsigned long scanned;
2297 struct lruvec *lruvec; 2337 struct lruvec *lruvec;
2298 int swappiness; 2338 int swappiness;
2299 2339
@@ -2305,10 +2345,16 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2305 2345
2306 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2346 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2307 swappiness = mem_cgroup_swappiness(memcg); 2347 swappiness = mem_cgroup_swappiness(memcg);
2348 scanned = sc->nr_scanned;
2308 2349
2309 shrink_lruvec(lruvec, swappiness, sc, &lru_pages); 2350 shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
2310 zone_lru_pages += lru_pages; 2351 zone_lru_pages += lru_pages;
2311 2352
2353 if (memcg && is_classzone)
2354 shrink_slab(sc->gfp_mask, zone_to_nid(zone),
2355 memcg, sc->nr_scanned - scanned,
2356 lru_pages);
2357
2312 /* 2358 /*
2313 * Direct reclaim and kswapd have to scan all memory 2359 * Direct reclaim and kswapd have to scan all memory
2314 * cgroups to fulfill the overall scan target for the 2360 * cgroups to fulfill the overall scan target for the
@@ -2330,19 +2376,14 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2330 * Shrink the slab caches in the same proportion that 2376 * Shrink the slab caches in the same proportion that
2331 * the eligible LRU pages were scanned. 2377 * the eligible LRU pages were scanned.
2332 */ 2378 */
2333 if (global_reclaim(sc) && is_classzone) { 2379 if (global_reclaim(sc) && is_classzone)
2334 struct reclaim_state *reclaim_state; 2380 shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
2335 2381 sc->nr_scanned - nr_scanned,
2336 shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), 2382 zone_lru_pages);
2337 sc->nr_scanned - nr_scanned, 2383
2338 zone_lru_pages); 2384 if (reclaim_state) {
2339 2385 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2340 reclaim_state = current->reclaim_state; 2386 reclaim_state->reclaimed_slab = 0;
2341 if (reclaim_state) {
2342 sc->nr_reclaimed +=
2343 reclaim_state->reclaimed_slab;
2344 reclaim_state->reclaimed_slab = 0;
2345 }
2346 } 2387 }
2347 2388
2348 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2389 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
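
shrink_zone() now calls shrink_slab() once per (zone, memcg) pair, and shrink_slab() skips any shrinker that did not declare SHRINKER_MEMCG_AWARE, passing the target cgroup in sc->memcg. The list_lru_shrink_*() helpers used by the workingset conversion below take the whole shrink_control, so they can route a walk to the right per-node (and, for memcg-aware users, per-memcg) sublist. A rough sketch of what a memcg-aware shrinker built on top of this looks like (my_lru, my_isolate and the my_* callbacks are illustrative; only the flags, the shrink_control fields and the list_lru_shrink_* helpers come from this series):

static struct list_lru my_lru;                  /* illustrative object LRU */

static enum lru_status my_isolate(struct list_head *item,
                                  struct list_lru_one *lru,
                                  spinlock_t *lru_lock, void *arg)
{
        list_lru_isolate(lru, item);
        /* a real shrinker would now tear down the object behind 'item' */
        return LRU_REMOVED;
}

static unsigned long my_count(struct shrinker *sh, struct shrink_control *sc)
{
        /* sc->nid and sc->memcg pick the per-node, per-memcg sublist */
        return list_lru_shrink_count(&my_lru, sc);
}

static unsigned long my_scan(struct shrinker *sh, struct shrink_control *sc)
{
        return list_lru_shrink_walk(&my_lru, sc, my_isolate, NULL);
}

static struct shrinker my_shrinker = {
        .count_objects  = my_count,
        .scan_objects   = my_scan,
        .seeks          = DEFAULT_SEEKS,
        .flags          = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};

The list_lru itself would be initialized with list_lru_init_memcg() (also added in this series) and the shrinker registered with register_shrinker() as before.
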
diff --git a/mm/workingset.c b/mm/workingset.c
index f7216fa7da27..aa017133744b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
275 275
276 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ 276 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
277 local_irq_disable(); 277 local_irq_disable();
278 shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); 278 shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
279 local_irq_enable(); 279 local_irq_enable();
280 280
281 pages = node_present_pages(sc->nid); 281 pages = node_present_pages(sc->nid);
@@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
302} 302}
303 303
304static enum lru_status shadow_lru_isolate(struct list_head *item, 304static enum lru_status shadow_lru_isolate(struct list_head *item,
305 struct list_lru_one *lru,
305 spinlock_t *lru_lock, 306 spinlock_t *lru_lock,
306 void *arg) 307 void *arg)
307{ 308{
@@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
332 goto out; 333 goto out;
333 } 334 }
334 335
335 list_del_init(item); 336 list_lru_isolate(lru, item);
336 spin_unlock(lru_lock); 337 spin_unlock(lru_lock);
337 338
338 /* 339 /*
@@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
376 377
377 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ 378 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
378 local_irq_disable(); 379 local_irq_disable();
379 ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, 380 ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
380 shadow_lru_isolate, NULL, &sc->nr_to_scan); 381 shadow_lru_isolate, NULL);
381 local_irq_enable(); 382 local_irq_enable();
382 return ret; 383 return ret;
383} 384}
diff --git a/mm/zbud.c b/mm/zbud.c
index 4e387bea702e..2ee4e4520493 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = {
130 .evict = zbud_zpool_evict 130 .evict = zbud_zpool_evict
131}; 131};
132 132
133static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) 133static void *zbud_zpool_create(char *name, gfp_t gfp,
134 struct zpool_ops *zpool_ops)
134{ 135{
135 return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); 136 return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
136} 137}
diff --git a/mm/zpool.c b/mm/zpool.c
index 739cdf0d183a..bacdab6e47de 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
129/** 129/**
130 * zpool_create_pool() - Create a new zpool 130 * zpool_create_pool() - Create a new zpool
131 * @type The type of the zpool to create (e.g. zbud, zsmalloc) 131 * @type The type of the zpool to create (e.g. zbud, zsmalloc)
132 * @name The name of the zpool (e.g. zram0, zswap)
132 * @gfp The GFP flags to use when allocating the pool. 133 * @gfp The GFP flags to use when allocating the pool.
133 * @ops The optional ops callback. 134 * @ops The optional ops callback.
134 * 135 *
@@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
140 * 141 *
141 * Returns: New zpool on success, NULL on failure. 142 * Returns: New zpool on success, NULL on failure.
142 */ 143 */
143struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) 144struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
145 struct zpool_ops *ops)
144{ 146{
145 struct zpool_driver *driver; 147 struct zpool_driver *driver;
146 struct zpool *zpool; 148 struct zpool *zpool;
@@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
168 170
169 zpool->type = driver->type; 171 zpool->type = driver->type;
170 zpool->driver = driver; 172 zpool->driver = driver;
171 zpool->pool = driver->create(gfp, ops); 173 zpool->pool = driver->create(name, gfp, ops);
172 zpool->ops = ops; 174 zpool->ops = ops;
173 175
174 if (!zpool->pool) { 176 if (!zpool->pool) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b72403927aa4..0dec1fa5f656 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -91,6 +91,7 @@
91#include <linux/hardirq.h> 91#include <linux/hardirq.h>
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/types.h> 93#include <linux/types.h>
94#include <linux/debugfs.h>
94#include <linux/zsmalloc.h> 95#include <linux/zsmalloc.h>
95#include <linux/zpool.h> 96#include <linux/zpool.h>
96 97
@@ -168,6 +169,22 @@ enum fullness_group {
168 ZS_FULL 169 ZS_FULL
169}; 170};
170 171
172enum zs_stat_type {
173 OBJ_ALLOCATED,
174 OBJ_USED,
175 NR_ZS_STAT_TYPE,
176};
177
178#ifdef CONFIG_ZSMALLOC_STAT
179
180static struct dentry *zs_stat_root;
181
182struct zs_size_stat {
183 unsigned long objs[NR_ZS_STAT_TYPE];
184};
185
186#endif
187
171/* 188/*
172 * number of size_classes 189 * number of size_classes
173 */ 190 */
@@ -200,6 +217,10 @@ struct size_class {
200 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 217 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
201 int pages_per_zspage; 218 int pages_per_zspage;
202 219
220#ifdef CONFIG_ZSMALLOC_STAT
221 struct zs_size_stat stats;
222#endif
223
203 spinlock_t lock; 224 spinlock_t lock;
204 225
205 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; 226 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
@@ -217,10 +238,16 @@ struct link_free {
217}; 238};
218 239
219struct zs_pool { 240struct zs_pool {
241 char *name;
242
220 struct size_class **size_class; 243 struct size_class **size_class;
221 244
222 gfp_t flags; /* allocation flags used when growing pool */ 245 gfp_t flags; /* allocation flags used when growing pool */
223 atomic_long_t pages_allocated; 246 atomic_long_t pages_allocated;
247
248#ifdef CONFIG_ZSMALLOC_STAT
249 struct dentry *stat_dentry;
250#endif
224}; 251};
225 252
226/* 253/*
@@ -246,9 +273,9 @@ struct mapping_area {
246 273
247#ifdef CONFIG_ZPOOL 274#ifdef CONFIG_ZPOOL
248 275
249static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) 276static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
250{ 277{
251 return zs_create_pool(gfp); 278 return zs_create_pool(name, gfp);
252} 279}
253 280
254static void zs_zpool_destroy(void *pool) 281static void zs_zpool_destroy(void *pool)
@@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
942 return true; 969 return true;
943} 970}
944 971
972#ifdef CONFIG_ZSMALLOC_STAT
973
974static inline void zs_stat_inc(struct size_class *class,
975 enum zs_stat_type type, unsigned long cnt)
976{
977 class->stats.objs[type] += cnt;
978}
979
980static inline void zs_stat_dec(struct size_class *class,
981 enum zs_stat_type type, unsigned long cnt)
982{
983 class->stats.objs[type] -= cnt;
984}
985
986static inline unsigned long zs_stat_get(struct size_class *class,
987 enum zs_stat_type type)
988{
989 return class->stats.objs[type];
990}
991
992static int __init zs_stat_init(void)
993{
994 if (!debugfs_initialized())
995 return -ENODEV;
996
997 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
998 if (!zs_stat_root)
999 return -ENOMEM;
1000
1001 return 0;
1002}
1003
1004static void __exit zs_stat_exit(void)
1005{
1006 debugfs_remove_recursive(zs_stat_root);
1007}
1008
1009static int zs_stats_size_show(struct seq_file *s, void *v)
1010{
1011 int i;
1012 struct zs_pool *pool = s->private;
1013 struct size_class *class;
1014 int objs_per_zspage;
1015 unsigned long obj_allocated, obj_used, pages_used;
1016 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
1017
1018 seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
1019 "obj_allocated", "obj_used", "pages_used");
1020
1021 for (i = 0; i < zs_size_classes; i++) {
1022 class = pool->size_class[i];
1023
1024 if (class->index != i)
1025 continue;
1026
1027 spin_lock(&class->lock);
1028 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
1029 obj_used = zs_stat_get(class, OBJ_USED);
1030 spin_unlock(&class->lock);
1031
1032 objs_per_zspage = get_maxobj_per_zspage(class->size,
1033 class->pages_per_zspage);
1034 pages_used = obj_allocated / objs_per_zspage *
1035 class->pages_per_zspage;
1036
1037 seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
1038 class->size, obj_allocated, obj_used, pages_used);
1039
1040 total_objs += obj_allocated;
1041 total_used_objs += obj_used;
1042 total_pages += pages_used;
1043 }
1044
1045 seq_puts(s, "\n");
1046 seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
1047 total_objs, total_used_objs, total_pages);
1048
1049 return 0;
1050}
1051
1052static int zs_stats_size_open(struct inode *inode, struct file *file)
1053{
1054 return single_open(file, zs_stats_size_show, inode->i_private);
1055}
1056
1057static const struct file_operations zs_stat_size_ops = {
1058 .open = zs_stats_size_open,
1059 .read = seq_read,
1060 .llseek = seq_lseek,
1061 .release = single_release,
1062};
1063
1064static int zs_pool_stat_create(char *name, struct zs_pool *pool)
1065{
1066 struct dentry *entry;
1067
1068 if (!zs_stat_root)
1069 return -ENODEV;
1070
1071 entry = debugfs_create_dir(name, zs_stat_root);
1072 if (!entry) {
1073 pr_warn("debugfs dir <%s> creation failed\n", name);
1074 return -ENOMEM;
1075 }
1076 pool->stat_dentry = entry;
1077
1078 entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
1079 pool->stat_dentry, pool, &zs_stat_size_ops);
1080 if (!entry) {
1081 pr_warn("%s: debugfs file entry <%s> creation failed\n",
1082 name, "obj_in_classes");
1083 return -ENOMEM;
1084 }
1085
1086 return 0;
1087}
1088
1089static void zs_pool_stat_destroy(struct zs_pool *pool)
1090{
1091 debugfs_remove_recursive(pool->stat_dentry);
1092}
1093
1094#else /* CONFIG_ZSMALLOC_STAT */
1095
1096static inline void zs_stat_inc(struct size_class *class,
1097 enum zs_stat_type type, unsigned long cnt)
1098{
1099}
1100
1101static inline void zs_stat_dec(struct size_class *class,
1102 enum zs_stat_type type, unsigned long cnt)
1103{
1104}
1105
1106static inline unsigned long zs_stat_get(struct size_class *class,
1107 enum zs_stat_type type)
1108{
1109 return 0;
1110}
1111
1112static int __init zs_stat_init(void)
1113{
1114 return 0;
1115}
1116
1117static void __exit zs_stat_exit(void)
1118{
1119}
1120
1121static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
1122{
1123 return 0;
1124}
1125
1126static inline void zs_pool_stat_destroy(struct zs_pool *pool)
1127{
1128}
1129
1130#endif
1131
945unsigned long zs_get_total_pages(struct zs_pool *pool) 1132unsigned long zs_get_total_pages(struct zs_pool *pool)
946{ 1133{
947 return atomic_long_read(&pool->pages_allocated); 1134 return atomic_long_read(&pool->pages_allocated);
@@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1074 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1261 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
1075 atomic_long_add(class->pages_per_zspage, 1262 atomic_long_add(class->pages_per_zspage,
1076 &pool->pages_allocated); 1263 &pool->pages_allocated);
1264
1077 spin_lock(&class->lock); 1265 spin_lock(&class->lock);
1266 zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1267 class->size, class->pages_per_zspage));
1078 } 1268 }
1079 1269
1080 obj = (unsigned long)first_page->freelist; 1270 obj = (unsigned long)first_page->freelist;
@@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1088 kunmap_atomic(vaddr); 1278 kunmap_atomic(vaddr);
1089 1279
1090 first_page->inuse++; 1280 first_page->inuse++;
1281 zs_stat_inc(class, OBJ_USED, 1);
1091 /* Now move the zspage to another fullness group, if required */ 1282 /* Now move the zspage to another fullness group, if required */
1092 fix_fullness_group(pool, first_page); 1283 fix_fullness_group(pool, first_page);
1093 spin_unlock(&class->lock); 1284 spin_unlock(&class->lock);
@@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1128 1319
1129 first_page->inuse--; 1320 first_page->inuse--;
1130 fullness = fix_fullness_group(pool, first_page); 1321 fullness = fix_fullness_group(pool, first_page);
1322
1323 zs_stat_dec(class, OBJ_USED, 1);
1324 if (fullness == ZS_EMPTY)
1325 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1326 class->size, class->pages_per_zspage));
1327
1131 spin_unlock(&class->lock); 1328 spin_unlock(&class->lock);
1132 1329
1133 if (fullness == ZS_EMPTY) { 1330 if (fullness == ZS_EMPTY) {
@@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free);
1148 * On success, a pointer to the newly created pool is returned, 1345 * On success, a pointer to the newly created pool is returned,
1149 * otherwise NULL. 1346 * otherwise NULL.
1150 */ 1347 */
1151struct zs_pool *zs_create_pool(gfp_t flags) 1348struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1152{ 1349{
1153 int i; 1350 int i;
1154 struct zs_pool *pool; 1351 struct zs_pool *pool;
@@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags)
1158 if (!pool) 1355 if (!pool)
1159 return NULL; 1356 return NULL;
1160 1357
1358 pool->name = kstrdup(name, GFP_KERNEL);
1359 if (!pool->name) {
1360 kfree(pool);
1361 return NULL;
1362 }
1363
1161 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 1364 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
1162 GFP_KERNEL); 1365 GFP_KERNEL);
1163 if (!pool->size_class) { 1366 if (!pool->size_class) {
1367 kfree(pool->name);
1164 kfree(pool); 1368 kfree(pool);
1165 return NULL; 1369 return NULL;
1166 } 1370 }
@@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags)
1210 1414
1211 pool->flags = flags; 1415 pool->flags = flags;
1212 1416
1417 if (zs_pool_stat_create(name, pool))
1418 goto err;
1419
1213 return pool; 1420 return pool;
1214 1421
1215err: 1422err:
@@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool)
1222{ 1429{
1223 int i; 1430 int i;
1224 1431
1432 zs_pool_stat_destroy(pool);
1433
1225 for (i = 0; i < zs_size_classes; i++) { 1434 for (i = 0; i < zs_size_classes; i++) {
1226 int fg; 1435 int fg;
1227 struct size_class *class = pool->size_class[i]; 1436 struct size_class *class = pool->size_class[i];
@@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1242 } 1451 }
1243 1452
1244 kfree(pool->size_class); 1453 kfree(pool->size_class);
1454 kfree(pool->name);
1245 kfree(pool); 1455 kfree(pool);
1246} 1456}
1247EXPORT_SYMBOL_GPL(zs_destroy_pool); 1457EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -1250,17 +1460,30 @@ static int __init zs_init(void)
1250{ 1460{
1251 int ret = zs_register_cpu_notifier(); 1461 int ret = zs_register_cpu_notifier();
1252 1462
1253 if (ret) { 1463 if (ret)
1254 zs_unregister_cpu_notifier(); 1464 goto notifier_fail;
1255 return ret;
1256 }
1257 1465
1258 init_zs_size_classes(); 1466 init_zs_size_classes();
1259 1467
1260#ifdef CONFIG_ZPOOL 1468#ifdef CONFIG_ZPOOL
1261 zpool_register_driver(&zs_zpool_driver); 1469 zpool_register_driver(&zs_zpool_driver);
1262#endif 1470#endif
1471
1472 ret = zs_stat_init();
1473 if (ret) {
1474 pr_err("zs stat initialization failed\n");
1475 goto stat_fail;
1476 }
1263 return 0; 1477 return 0;
1478
1479stat_fail:
1480#ifdef CONFIG_ZPOOL
1481 zpool_unregister_driver(&zs_zpool_driver);
1482#endif
1483notifier_fail:
1484 zs_unregister_cpu_notifier();
1485
1486 return ret;
1264} 1487}
1265 1488
1266static void __exit zs_exit(void) 1489static void __exit zs_exit(void)
@@ -1269,6 +1492,8 @@ static void __exit zs_exit(void)
1269 zpool_unregister_driver(&zs_zpool_driver); 1492 zpool_unregister_driver(&zs_zpool_driver);
1270#endif 1493#endif
1271 zs_unregister_cpu_notifier(); 1494 zs_unregister_cpu_notifier();
1495
1496 zs_stat_exit();
1272} 1497}
1273 1498
1274module_init(zs_init); 1499module_init(zs_init);
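
zs_create_pool() now takes a name which, with CONFIG_ZSMALLOC_STAT, becomes a per-pool directory under the zsmalloc debugfs root holding the obj_in_classes file generated by zs_stats_size_show() above. A minimal usage sketch against the new signature (the demo function, pool name and GFP flags are illustrative; a caller like zswap goes through the zpool layer instead, as the next hunk shows):

static int zs_named_pool_demo(void)
{
        struct zs_pool *pool;
        unsigned long handle;

        /* "demo0" shows up as /sys/kernel/debug/zsmalloc/demo0/ */
        pool = zs_create_pool("demo0", GFP_KERNEL);
        if (!pool)
                return -ENOMEM;

        handle = zs_malloc(pool, 128);  /* OBJ_USED++; OBJ_ALLOCATED grows
                                         * whenever a fresh zspage is built */
        if (handle)
                zs_free(pool, handle);

        zs_destroy_pool(pool);          /* also removes the debugfs dir */
        return 0;
}
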
diff --git a/mm/zswap.c b/mm/zswap.c
index 0cfce9bc51e4..4249e82ff934 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -906,11 +906,12 @@ static int __init init_zswap(void)
906 906
907 pr_info("loading zswap\n"); 907 pr_info("loading zswap\n");
908 908
909 zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); 909 zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
910 &zswap_zpool_ops);
910 if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { 911 if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
911 pr_info("%s zpool not available\n", zswap_zpool_type); 912 pr_info("%s zpool not available\n", zswap_zpool_type);
912 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 913 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
913 zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, 914 zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
914 &zswap_zpool_ops); 915 &zswap_zpool_ops);
915 } 916 }
916 if (!zswap_pool) { 917 if (!zswap_pool) {