author		Linus Torvalds <torvalds@linux-foundation.org>	2015-02-12 21:54:28 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-02-12 21:54:28 -0500
commit		818099574b04c5301eacbbcd441022b353a65466 (patch)
tree		77b3645b375105cb0389df2b4ea5ffa90329f7f8 /mm
parent		802ea9d8645d33d24b7b4cd4537c14f3e698bde0 (diff)
parent		6016daed58ee482a2f7684e93342e89139cf4419 (diff)
Merge branch 'akpm' (patches from Andrew)
Merge third set of updates from Andrew Morton:

 - the rest of MM

   [ This includes getting rid of the numa hinting bits, in favor of
     just generic protnone logic.  Yay.  - Linus ]

 - core kernel

 - procfs

 - some of lib/ (lots of lib/ material this time)

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (104 commits)
  lib/lcm.c: replace include
  lib/percpu_ida.c: remove redundant includes
  lib/strncpy_from_user.c: replace module.h include
  lib/stmp_device.c: replace module.h include
  lib/sort.c: move include inside #if 0
  lib/show_mem.c: remove redundant include
  lib/radix-tree.c: change to simpler include
  lib/plist.c: remove redundant include
  lib/nlattr.c: remove redundant include
  lib/kobject_uevent.c: remove redundant include
  lib/llist.c: remove redundant include
  lib/md5.c: simplify include
  lib/list_sort.c: rearrange includes
  lib/genalloc.c: remove redundant include
  lib/idr.c: remove redundant include
  lib/halfmd4.c: simplify includes
  lib/dynamic_queue_limits.c: simplify includes
  lib/sort.c: use simpler includes
  lib/interval_tree.c: simplify includes
  hexdump: make it return number of bytes placed in buffer
  ...
Diffstat (limited to 'mm')
-rw-r--r--	mm/Kconfig	10
-rw-r--r--	mm/compaction.c	23
-rw-r--r--	mm/gup.c	10
-rw-r--r--	mm/huge_memory.c	50
-rw-r--r--	mm/internal.h	6
-rw-r--r--	mm/list_lru.c	467
-rw-r--r--	mm/memcontrol.c	188
-rw-r--r--	mm/memory-failure.c	13
-rw-r--r--	mm/memory.c	20
-rw-r--r--	mm/mempolicy.c	2
-rw-r--r--	mm/migrate.c	8
-rw-r--r--	mm/mm_init.c	4
-rw-r--r--	mm/mprotect.c	48
-rw-r--r--	mm/page_alloc.c	19
-rw-r--r--	mm/pgtable-generic.c	2
-rw-r--r--	mm/slab.c	17
-rw-r--r--	mm/slab.h	67
-rw-r--r--	mm/slab_common.c	197
-rw-r--r--	mm/slob.c	2
-rw-r--r--	mm/slub.c	117
-rw-r--r--	mm/vmscan.c	85
-rw-r--r--	mm/workingset.c	9
-rw-r--r--	mm/zbud.c	3
-rw-r--r--	mm/zpool.c	6
-rw-r--r--	mm/zsmalloc.c	239
-rw-r--r--	mm/zswap.c	5
26 files changed, 1217 insertions(+), 400 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index 4395b12869c8..de5239c152f9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -602,6 +602,16 @@ config PGTABLE_MAPPING
602 You can check speed with zsmalloc benchmark: 602 You can check speed with zsmalloc benchmark:
603 https://github.com/spartacus06/zsmapbench 603 https://github.com/spartacus06/zsmapbench
604 604
605config ZSMALLOC_STAT
606 bool "Export zsmalloc statistics"
607 depends on ZSMALLOC
608 select DEBUG_FS
609 help
610 This option enables code in the zsmalloc to collect various
611 statistics about whats happening in zsmalloc and exports that
612 information to userspace via debugfs.
613 If unsure, say N.
614
605config GENERIC_EARLY_IOREMAP 615config GENERIC_EARLY_IOREMAP
606 bool 616 bool
607 617
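For context on the ZSMALLOC_STAT help text above, the following is roughly the debugfs pattern such a statistics export follows. It is a generic, hedged sketch, not zsmalloc's actual implementation; the directory name, file name and counter are invented for illustration.

#include <linux/debugfs.h>
#include <linux/init.h>
#include <linux/seq_file.h>

/* Invented counter; the real zsmalloc statistics are richer. */
static unsigned long example_pages_allocated;

static int example_stats_show(struct seq_file *s, void *unused)
{
        seq_printf(s, "pages_allocated: %lu\n", example_pages_allocated);
        return 0;
}

static int example_stats_open(struct inode *inode, struct file *file)
{
        return single_open(file, example_stats_show, inode->i_private);
}

static const struct file_operations example_stats_fops = {
        .open           = example_stats_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};

static int __init example_stats_init(void)
{
        struct dentry *dir;

        /* Hypothetical debugfs directory; not zsmalloc's real layout. */
        dir = debugfs_create_dir("zsmalloc-example", NULL);
        debugfs_create_file("stats", 0444, dir, NULL, &example_stats_fops);
        return 0;
}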
diff --git a/mm/compaction.c b/mm/compaction.c
index b68736c8a1ce..d50d6de6f1b6 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -490,6 +490,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
490 490
491 /* If a page was split, advance to the end of it */ 491 /* If a page was split, advance to the end of it */
492 if (isolated) { 492 if (isolated) {
493 cc->nr_freepages += isolated;
494 if (!strict &&
495 cc->nr_migratepages <= cc->nr_freepages) {
496 blockpfn += isolated;
497 break;
498 }
499
493 blockpfn += isolated - 1; 500 blockpfn += isolated - 1;
494 cursor += isolated - 1; 501 cursor += isolated - 1;
495 continue; 502 continue;
@@ -899,7 +906,6 @@ static void isolate_freepages(struct compact_control *cc)
899 unsigned long isolate_start_pfn; /* exact pfn we start at */ 906 unsigned long isolate_start_pfn; /* exact pfn we start at */
900 unsigned long block_end_pfn; /* end of current pageblock */ 907 unsigned long block_end_pfn; /* end of current pageblock */
901 unsigned long low_pfn; /* lowest pfn scanner is able to scan */ 908 unsigned long low_pfn; /* lowest pfn scanner is able to scan */
902 int nr_freepages = cc->nr_freepages;
903 struct list_head *freelist = &cc->freepages; 909 struct list_head *freelist = &cc->freepages;
904 910
905 /* 911 /*
@@ -924,11 +930,11 @@ static void isolate_freepages(struct compact_control *cc)
924 * pages on cc->migratepages. We stop searching if the migrate 930 * pages on cc->migratepages. We stop searching if the migrate
925 * and free page scanners meet or enough free pages are isolated. 931 * and free page scanners meet or enough free pages are isolated.
926 */ 932 */
927 for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; 933 for (; block_start_pfn >= low_pfn &&
934 cc->nr_migratepages > cc->nr_freepages;
928 block_end_pfn = block_start_pfn, 935 block_end_pfn = block_start_pfn,
929 block_start_pfn -= pageblock_nr_pages, 936 block_start_pfn -= pageblock_nr_pages,
930 isolate_start_pfn = block_start_pfn) { 937 isolate_start_pfn = block_start_pfn) {
931 unsigned long isolated;
932 938
933 /* 939 /*
934 * This can iterate a massively long zone without finding any 940 * This can iterate a massively long zone without finding any
@@ -953,9 +959,8 @@ static void isolate_freepages(struct compact_control *cc)
953 continue; 959 continue;
954 960
955 /* Found a block suitable for isolating free pages from. */ 961 /* Found a block suitable for isolating free pages from. */
956 isolated = isolate_freepages_block(cc, &isolate_start_pfn, 962 isolate_freepages_block(cc, &isolate_start_pfn,
957 block_end_pfn, freelist, false); 963 block_end_pfn, freelist, false);
958 nr_freepages += isolated;
959 964
960 /* 965 /*
961 * Remember where the free scanner should restart next time, 966 * Remember where the free scanner should restart next time,
@@ -987,8 +992,6 @@ static void isolate_freepages(struct compact_control *cc)
987 */ 992 */
988 if (block_start_pfn < low_pfn) 993 if (block_start_pfn < low_pfn)
989 cc->free_pfn = cc->migrate_pfn; 994 cc->free_pfn = cc->migrate_pfn;
990
991 cc->nr_freepages = nr_freepages;
992} 995}
993 996
994/* 997/*
@@ -1100,8 +1103,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
1100 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn, 1103 low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
1101 isolate_mode); 1104 isolate_mode);
1102 1105
1103 if (!low_pfn || cc->contended) 1106 if (!low_pfn || cc->contended) {
1107 acct_isolated(zone, cc);
1104 return ISOLATE_ABORT; 1108 return ISOLATE_ABORT;
1109 }
1105 1110
1106 /* 1111 /*
1107 * Either we isolated something and proceed with migration. Or 1112 * Either we isolated something and proceed with migration. Or
@@ -1173,7 +1178,7 @@ static int __compact_finished(struct zone *zone, struct compact_control *cc,
1173 return COMPACT_PARTIAL; 1178 return COMPACT_PARTIAL;
1174 1179
1175 /* Job done if allocation would set block type */ 1180 /* Job done if allocation would set block type */
1176 if (cc->order >= pageblock_order && area->nr_free) 1181 if (order >= pageblock_order && area->nr_free)
1177 return COMPACT_PARTIAL; 1182 return COMPACT_PARTIAL;
1178 } 1183 }
1179 1184
diff --git a/mm/gup.c b/mm/gup.c
index c2da1163986a..51bf0b06ca7b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -64,7 +64,7 @@ retry:
64 migration_entry_wait(mm, pmd, address); 64 migration_entry_wait(mm, pmd, address);
65 goto retry; 65 goto retry;
66 } 66 }
67 if ((flags & FOLL_NUMA) && pte_numa(pte)) 67 if ((flags & FOLL_NUMA) && pte_protnone(pte))
68 goto no_page; 68 goto no_page;
69 if ((flags & FOLL_WRITE) && !pte_write(pte)) { 69 if ((flags & FOLL_WRITE) && !pte_write(pte)) {
70 pte_unmap_unlock(ptep, ptl); 70 pte_unmap_unlock(ptep, ptl);
@@ -184,7 +184,7 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
184 return page; 184 return page;
185 return no_page_table(vma, flags); 185 return no_page_table(vma, flags);
186 } 186 }
187 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 187 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
188 return no_page_table(vma, flags); 188 return no_page_table(vma, flags);
189 if (pmd_trans_huge(*pmd)) { 189 if (pmd_trans_huge(*pmd)) {
190 if (flags & FOLL_SPLIT) { 190 if (flags & FOLL_SPLIT) {
@@ -906,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
906 906
907 /* 907 /*
908 * Similar to the PMD case below, NUMA hinting must take slow 908 * Similar to the PMD case below, NUMA hinting must take slow
909 * path 909 * path using the pte_protnone check.
910 */ 910 */
911 if (!pte_present(pte) || pte_special(pte) || 911 if (!pte_present(pte) || pte_special(pte) ||
912 pte_numa(pte) || (write && !pte_write(pte))) 912 pte_protnone(pte) || (write && !pte_write(pte)))
913 goto pte_unmap; 913 goto pte_unmap;
914 914
915 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 915 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -1104,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
1104 * slowpath for accounting purposes and so that they 1104 * slowpath for accounting purposes and so that they
1105 * can be serialised against THP migration. 1105 * can be serialised against THP migration.
1106 */ 1106 */
1107 if (pmd_numa(pmd)) 1107 if (pmd_protnone(pmd))
1108 return 0; 1108 return 0;
1109 1109
1110 if (!gup_huge_pmd(pmd, pmdp, addr, next, write, 1110 if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
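The common thread in these gup.c hunks is that NUMA hinting faults are now detected with the generic pte_protnone()/pmd_protnone() helpers instead of the old pte_numa()/pmd_numa() bits. A minimal stand-alone illustration of that convention follows; the helper below is invented for this example and is not kernel code.

#include <linux/mm.h>

/*
 * Invented helper: decide whether a GUP-style walker must fall back to
 * the slow path so the NUMA hinting fault can be handled there.  After
 * this series a hinting-protected entry is simply a PROT_NONE-style
 * entry, so pte_protnone() is the whole test.
 */
static bool example_needs_numa_slowpath(pte_t pte, unsigned int gup_flags)
{
        if (!pte_present(pte))
                return false;
        return (gup_flags & FOLL_NUMA) && pte_protnone(pte);
}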
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index cb7be110cad3..fc00c8cb5a82 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1211,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
1211 return ERR_PTR(-EFAULT); 1211 return ERR_PTR(-EFAULT);
1212 1212
1213 /* Full NUMA hinting faults to serialise migration in fault paths */ 1213 /* Full NUMA hinting faults to serialise migration in fault paths */
1214 if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) 1214 if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
1215 goto out; 1215 goto out;
1216 1216
1217 page = pmd_page(*pmd); 1217 page = pmd_page(*pmd);
@@ -1262,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1262 bool migrated = false; 1262 bool migrated = false;
1263 int flags = 0; 1263 int flags = 0;
1264 1264
1265 /* A PROT_NONE fault should not end up here */
1266 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
1267
1265 ptl = pmd_lock(mm, pmdp); 1268 ptl = pmd_lock(mm, pmdp);
1266 if (unlikely(!pmd_same(pmd, *pmdp))) 1269 if (unlikely(!pmd_same(pmd, *pmdp)))
1267 goto out_unlock; 1270 goto out_unlock;
@@ -1272,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1272 * check_same as the page may no longer be mapped. 1275 * check_same as the page may no longer be mapped.
1273 */ 1276 */
1274 if (unlikely(pmd_trans_migrating(*pmdp))) { 1277 if (unlikely(pmd_trans_migrating(*pmdp))) {
1278 page = pmd_page(*pmdp);
1275 spin_unlock(ptl); 1279 spin_unlock(ptl);
1276 wait_migrate_huge_page(vma->anon_vma, pmdp); 1280 wait_on_page_locked(page);
1277 goto out; 1281 goto out;
1278 } 1282 }
1279 1283
@@ -1341,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1341 1345
1342 /* 1346 /*
1343 * Migrate the THP to the requested node, returns with page unlocked 1347 * Migrate the THP to the requested node, returns with page unlocked
1344 * and pmd_numa cleared. 1348 * and access rights restored.
1345 */ 1349 */
1346 spin_unlock(ptl); 1350 spin_unlock(ptl);
1347 migrated = migrate_misplaced_transhuge_page(mm, vma, 1351 migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1354,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
1354 goto out; 1358 goto out;
1355clear_pmdnuma: 1359clear_pmdnuma:
1356 BUG_ON(!PageLocked(page)); 1360 BUG_ON(!PageLocked(page));
1357 pmd = pmd_mknonnuma(pmd); 1361 pmd = pmd_modify(pmd, vma->vm_page_prot);
1358 set_pmd_at(mm, haddr, pmdp, pmd); 1362 set_pmd_at(mm, haddr, pmdp, pmd);
1359 VM_BUG_ON(pmd_numa(*pmdp));
1360 update_mmu_cache_pmd(vma, addr, pmdp); 1363 update_mmu_cache_pmd(vma, addr, pmdp);
1361 unlock_page(page); 1364 unlock_page(page);
1362out_unlock: 1365out_unlock:
@@ -1479,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
1479 1482
1480 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { 1483 if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1481 pmd_t entry; 1484 pmd_t entry;
1482 ret = 1; 1485
1483 if (!prot_numa) { 1486 /*
1487 * Avoid trapping faults against the zero page. The read-only
1488 * data is likely to be read-cached on the local CPU and
1489 * local/remote hits to the zero page are not interesting.
1490 */
1491 if (prot_numa && is_huge_zero_pmd(*pmd)) {
1492 spin_unlock(ptl);
1493 return 0;
1494 }
1495
1496 if (!prot_numa || !pmd_protnone(*pmd)) {
1497 ret = 1;
1484 entry = pmdp_get_and_clear_notify(mm, addr, pmd); 1498 entry = pmdp_get_and_clear_notify(mm, addr, pmd);
1485 if (pmd_numa(entry))
1486 entry = pmd_mknonnuma(entry);
1487 entry = pmd_modify(entry, newprot); 1499 entry = pmd_modify(entry, newprot);
1488 ret = HPAGE_PMD_NR; 1500 ret = HPAGE_PMD_NR;
1489 set_pmd_at(mm, addr, pmd, entry); 1501 set_pmd_at(mm, addr, pmd, entry);
1490 BUG_ON(pmd_write(entry)); 1502 BUG_ON(pmd_write(entry));
1491 } else {
1492 struct page *page = pmd_page(*pmd);
1493
1494 /*
1495 * Do not trap faults against the zero page. The
1496 * read-only data is likely to be read-cached on the
1497 * local CPU cache and it is less useful to know about
1498 * local vs remote hits on the zero page.
1499 */
1500 if (!is_huge_zero_page(page) &&
1501 !pmd_numa(*pmd)) {
1502 pmdp_set_numa(mm, addr, pmd);
1503 ret = HPAGE_PMD_NR;
1504 }
1505 } 1503 }
1506 spin_unlock(ptl); 1504 spin_unlock(ptl);
1507 } 1505 }
@@ -1766,9 +1764,9 @@ static int __split_huge_page_map(struct page *page,
1766 pte_t *pte, entry; 1764 pte_t *pte, entry;
1767 BUG_ON(PageCompound(page+i)); 1765 BUG_ON(PageCompound(page+i));
1768 /* 1766 /*
1769 * Note that pmd_numa is not transferred deliberately 1767 * Note that NUMA hinting access restrictions are not
1770 * to avoid any possibility that pte_numa leaks to 1768 * transferred to avoid any possibility of altering
1771 * a PROT_NONE VMA by accident. 1769 * permissions across VMAs.
1772 */ 1770 */
1773 entry = mk_pte(page + i, vma->vm_page_prot); 1771 entry = mk_pte(page + i, vma->vm_page_prot);
1774 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 1772 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
diff --git a/mm/internal.h b/mm/internal.h
index c4d6c9b43491..a96da5b0029d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -351,8 +351,10 @@ extern int mminit_loglevel;
351#define mminit_dprintk(level, prefix, fmt, arg...) \ 351#define mminit_dprintk(level, prefix, fmt, arg...) \
352do { \ 352do { \
353 if (level < mminit_loglevel) { \ 353 if (level < mminit_loglevel) { \
354 printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \ 354 if (level <= MMINIT_WARNING) \
355 printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \ 355 printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
356 else \
357 printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
356 } \ 358 } \
357} while (0) 359} while (0)
358 360
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f1a0db194173..909eca2c820e 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -9,18 +9,100 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/list_lru.h> 10#include <linux/list_lru.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/mutex.h>
13#include <linux/memcontrol.h>
14
15#ifdef CONFIG_MEMCG_KMEM
16static LIST_HEAD(list_lrus);
17static DEFINE_MUTEX(list_lrus_mutex);
18
19static void list_lru_register(struct list_lru *lru)
20{
21 mutex_lock(&list_lrus_mutex);
22 list_add(&lru->list, &list_lrus);
23 mutex_unlock(&list_lrus_mutex);
24}
25
26static void list_lru_unregister(struct list_lru *lru)
27{
28 mutex_lock(&list_lrus_mutex);
29 list_del(&lru->list);
30 mutex_unlock(&list_lrus_mutex);
31}
32#else
33static void list_lru_register(struct list_lru *lru)
34{
35}
36
37static void list_lru_unregister(struct list_lru *lru)
38{
39}
40#endif /* CONFIG_MEMCG_KMEM */
41
42#ifdef CONFIG_MEMCG_KMEM
43static inline bool list_lru_memcg_aware(struct list_lru *lru)
44{
45 return !!lru->node[0].memcg_lrus;
46}
47
48static inline struct list_lru_one *
49list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
50{
51 /*
52 * The lock protects the array of per cgroup lists from relocation
53 * (see memcg_update_list_lru_node).
54 */
55 lockdep_assert_held(&nlru->lock);
56 if (nlru->memcg_lrus && idx >= 0)
57 return nlru->memcg_lrus->lru[idx];
58
59 return &nlru->lru;
60}
61
62static inline struct list_lru_one *
63list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
64{
65 struct mem_cgroup *memcg;
66
67 if (!nlru->memcg_lrus)
68 return &nlru->lru;
69
70 memcg = mem_cgroup_from_kmem(ptr);
71 if (!memcg)
72 return &nlru->lru;
73
74 return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
75}
76#else
77static inline bool list_lru_memcg_aware(struct list_lru *lru)
78{
79 return false;
80}
81
82static inline struct list_lru_one *
83list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
84{
85 return &nlru->lru;
86}
87
88static inline struct list_lru_one *
89list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
90{
91 return &nlru->lru;
92}
93#endif /* CONFIG_MEMCG_KMEM */
12 94
13bool list_lru_add(struct list_lru *lru, struct list_head *item) 95bool list_lru_add(struct list_lru *lru, struct list_head *item)
14{ 96{
15 int nid = page_to_nid(virt_to_page(item)); 97 int nid = page_to_nid(virt_to_page(item));
16 struct list_lru_node *nlru = &lru->node[nid]; 98 struct list_lru_node *nlru = &lru->node[nid];
99 struct list_lru_one *l;
17 100
18 spin_lock(&nlru->lock); 101 spin_lock(&nlru->lock);
19 WARN_ON_ONCE(nlru->nr_items < 0); 102 l = list_lru_from_kmem(nlru, item);
20 if (list_empty(item)) { 103 if (list_empty(item)) {
21 list_add_tail(item, &nlru->list); 104 list_add_tail(item, &l->list);
22 if (nlru->nr_items++ == 0) 105 l->nr_items++;
23 node_set(nid, lru->active_nodes);
24 spin_unlock(&nlru->lock); 106 spin_unlock(&nlru->lock);
25 return true; 107 return true;
26 } 108 }
@@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
33{ 115{
34 int nid = page_to_nid(virt_to_page(item)); 116 int nid = page_to_nid(virt_to_page(item));
35 struct list_lru_node *nlru = &lru->node[nid]; 117 struct list_lru_node *nlru = &lru->node[nid];
118 struct list_lru_one *l;
36 119
37 spin_lock(&nlru->lock); 120 spin_lock(&nlru->lock);
121 l = list_lru_from_kmem(nlru, item);
38 if (!list_empty(item)) { 122 if (!list_empty(item)) {
39 list_del_init(item); 123 list_del_init(item);
40 if (--nlru->nr_items == 0) 124 l->nr_items--;
41 node_clear(nid, lru->active_nodes);
42 WARN_ON_ONCE(nlru->nr_items < 0);
43 spin_unlock(&nlru->lock); 125 spin_unlock(&nlru->lock);
44 return true; 126 return true;
45 } 127 }
@@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
48} 130}
49EXPORT_SYMBOL_GPL(list_lru_del); 131EXPORT_SYMBOL_GPL(list_lru_del);
50 132
51unsigned long 133void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
52list_lru_count_node(struct list_lru *lru, int nid) 134{
135 list_del_init(item);
136 list->nr_items--;
137}
138EXPORT_SYMBOL_GPL(list_lru_isolate);
139
140void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
141 struct list_head *head)
142{
143 list_move(item, head);
144 list->nr_items--;
145}
146EXPORT_SYMBOL_GPL(list_lru_isolate_move);
147
148static unsigned long __list_lru_count_one(struct list_lru *lru,
149 int nid, int memcg_idx)
53{ 150{
54 unsigned long count = 0;
55 struct list_lru_node *nlru = &lru->node[nid]; 151 struct list_lru_node *nlru = &lru->node[nid];
152 struct list_lru_one *l;
153 unsigned long count;
56 154
57 spin_lock(&nlru->lock); 155 spin_lock(&nlru->lock);
58 WARN_ON_ONCE(nlru->nr_items < 0); 156 l = list_lru_from_memcg_idx(nlru, memcg_idx);
59 count += nlru->nr_items; 157 count = l->nr_items;
60 spin_unlock(&nlru->lock); 158 spin_unlock(&nlru->lock);
61 159
62 return count; 160 return count;
63} 161}
162
163unsigned long list_lru_count_one(struct list_lru *lru,
164 int nid, struct mem_cgroup *memcg)
165{
166 return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
167}
168EXPORT_SYMBOL_GPL(list_lru_count_one);
169
170unsigned long list_lru_count_node(struct list_lru *lru, int nid)
171{
172 long count = 0;
173 int memcg_idx;
174
175 count += __list_lru_count_one(lru, nid, -1);
176 if (list_lru_memcg_aware(lru)) {
177 for_each_memcg_cache_index(memcg_idx)
178 count += __list_lru_count_one(lru, nid, memcg_idx);
179 }
180 return count;
181}
64EXPORT_SYMBOL_GPL(list_lru_count_node); 182EXPORT_SYMBOL_GPL(list_lru_count_node);
65 183
66unsigned long 184static unsigned long
67list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, 185__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
68 void *cb_arg, unsigned long *nr_to_walk) 186 list_lru_walk_cb isolate, void *cb_arg,
187 unsigned long *nr_to_walk)
69{ 188{
70 189
71 struct list_lru_node *nlru = &lru->node[nid]; 190 struct list_lru_node *nlru = &lru->node[nid];
191 struct list_lru_one *l;
72 struct list_head *item, *n; 192 struct list_head *item, *n;
73 unsigned long isolated = 0; 193 unsigned long isolated = 0;
74 194
75 spin_lock(&nlru->lock); 195 spin_lock(&nlru->lock);
196 l = list_lru_from_memcg_idx(nlru, memcg_idx);
76restart: 197restart:
77 list_for_each_safe(item, n, &nlru->list) { 198 list_for_each_safe(item, n, &l->list) {
78 enum lru_status ret; 199 enum lru_status ret;
79 200
80 /* 201 /*
@@ -85,14 +206,11 @@ restart:
85 break; 206 break;
86 --*nr_to_walk; 207 --*nr_to_walk;
87 208
88 ret = isolate(item, &nlru->lock, cb_arg); 209 ret = isolate(item, l, &nlru->lock, cb_arg);
89 switch (ret) { 210 switch (ret) {
90 case LRU_REMOVED_RETRY: 211 case LRU_REMOVED_RETRY:
91 assert_spin_locked(&nlru->lock); 212 assert_spin_locked(&nlru->lock);
92 case LRU_REMOVED: 213 case LRU_REMOVED:
93 if (--nlru->nr_items == 0)
94 node_clear(nid, lru->active_nodes);
95 WARN_ON_ONCE(nlru->nr_items < 0);
96 isolated++; 214 isolated++;
97 /* 215 /*
98 * If the lru lock has been dropped, our list 216 * If the lru lock has been dropped, our list
@@ -103,7 +221,7 @@ restart:
103 goto restart; 221 goto restart;
104 break; 222 break;
105 case LRU_ROTATE: 223 case LRU_ROTATE:
106 list_move_tail(item, &nlru->list); 224 list_move_tail(item, &l->list);
107 break; 225 break;
108 case LRU_SKIP: 226 case LRU_SKIP:
109 break; 227 break;
@@ -122,31 +240,322 @@ restart:
122 spin_unlock(&nlru->lock); 240 spin_unlock(&nlru->lock);
123 return isolated; 241 return isolated;
124} 242}
243
244unsigned long
245list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
246 list_lru_walk_cb isolate, void *cb_arg,
247 unsigned long *nr_to_walk)
248{
249 return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
250 isolate, cb_arg, nr_to_walk);
251}
252EXPORT_SYMBOL_GPL(list_lru_walk_one);
253
254unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
255 list_lru_walk_cb isolate, void *cb_arg,
256 unsigned long *nr_to_walk)
257{
258 long isolated = 0;
259 int memcg_idx;
260
261 isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
262 nr_to_walk);
263 if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
264 for_each_memcg_cache_index(memcg_idx) {
265 isolated += __list_lru_walk_one(lru, nid, memcg_idx,
266 isolate, cb_arg, nr_to_walk);
267 if (*nr_to_walk <= 0)
268 break;
269 }
270 }
271 return isolated;
272}
125EXPORT_SYMBOL_GPL(list_lru_walk_node); 273EXPORT_SYMBOL_GPL(list_lru_walk_node);
126 274
127int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key) 275static void init_one_lru(struct list_lru_one *l)
276{
277 INIT_LIST_HEAD(&l->list);
278 l->nr_items = 0;
279}
280
281#ifdef CONFIG_MEMCG_KMEM
282static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
283 int begin, int end)
284{
285 int i;
286
287 for (i = begin; i < end; i++)
288 kfree(memcg_lrus->lru[i]);
289}
290
291static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
292 int begin, int end)
293{
294 int i;
295
296 for (i = begin; i < end; i++) {
297 struct list_lru_one *l;
298
299 l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
300 if (!l)
301 goto fail;
302
303 init_one_lru(l);
304 memcg_lrus->lru[i] = l;
305 }
306 return 0;
307fail:
308 __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1);
309 return -ENOMEM;
310}
311
312static int memcg_init_list_lru_node(struct list_lru_node *nlru)
313{
314 int size = memcg_nr_cache_ids;
315
316 nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
317 if (!nlru->memcg_lrus)
318 return -ENOMEM;
319
320 if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
321 kfree(nlru->memcg_lrus);
322 return -ENOMEM;
323 }
324
325 return 0;
326}
327
328static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
329{
330 __memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
331 kfree(nlru->memcg_lrus);
332}
333
334static int memcg_update_list_lru_node(struct list_lru_node *nlru,
335 int old_size, int new_size)
336{
337 struct list_lru_memcg *old, *new;
338
339 BUG_ON(old_size > new_size);
340
341 old = nlru->memcg_lrus;
342 new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
343 if (!new)
344 return -ENOMEM;
345
346 if (__memcg_init_list_lru_node(new, old_size, new_size)) {
347 kfree(new);
348 return -ENOMEM;
349 }
350
351 memcpy(new, old, old_size * sizeof(void *));
352
353 /*
354 * The lock guarantees that we won't race with a reader
355 * (see list_lru_from_memcg_idx).
356 *
357 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
358 * we have to use IRQ-safe primitives here to avoid deadlock.
359 */
360 spin_lock_irq(&nlru->lock);
361 nlru->memcg_lrus = new;
362 spin_unlock_irq(&nlru->lock);
363
364 kfree(old);
365 return 0;
366}
367
368static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
369 int old_size, int new_size)
370{
371 /* do not bother shrinking the array back to the old size, because we
372 * cannot handle allocation failures here */
373 __memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
374}
375
376static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
377{
378 int i;
379
380 for (i = 0; i < nr_node_ids; i++) {
381 if (!memcg_aware)
382 lru->node[i].memcg_lrus = NULL;
383 else if (memcg_init_list_lru_node(&lru->node[i]))
384 goto fail;
385 }
386 return 0;
387fail:
388 for (i = i - 1; i >= 0; i--)
389 memcg_destroy_list_lru_node(&lru->node[i]);
390 return -ENOMEM;
391}
392
393static void memcg_destroy_list_lru(struct list_lru *lru)
394{
395 int i;
396
397 if (!list_lru_memcg_aware(lru))
398 return;
399
400 for (i = 0; i < nr_node_ids; i++)
401 memcg_destroy_list_lru_node(&lru->node[i]);
402}
403
404static int memcg_update_list_lru(struct list_lru *lru,
405 int old_size, int new_size)
406{
407 int i;
408
409 if (!list_lru_memcg_aware(lru))
410 return 0;
411
412 for (i = 0; i < nr_node_ids; i++) {
413 if (memcg_update_list_lru_node(&lru->node[i],
414 old_size, new_size))
415 goto fail;
416 }
417 return 0;
418fail:
419 for (i = i - 1; i >= 0; i--)
420 memcg_cancel_update_list_lru_node(&lru->node[i],
421 old_size, new_size);
422 return -ENOMEM;
423}
424
425static void memcg_cancel_update_list_lru(struct list_lru *lru,
426 int old_size, int new_size)
427{
428 int i;
429
430 if (!list_lru_memcg_aware(lru))
431 return;
432
433 for (i = 0; i < nr_node_ids; i++)
434 memcg_cancel_update_list_lru_node(&lru->node[i],
435 old_size, new_size);
436}
437
438int memcg_update_all_list_lrus(int new_size)
439{
440 int ret = 0;
441 struct list_lru *lru;
442 int old_size = memcg_nr_cache_ids;
443
444 mutex_lock(&list_lrus_mutex);
445 list_for_each_entry(lru, &list_lrus, list) {
446 ret = memcg_update_list_lru(lru, old_size, new_size);
447 if (ret)
448 goto fail;
449 }
450out:
451 mutex_unlock(&list_lrus_mutex);
452 return ret;
453fail:
454 list_for_each_entry_continue_reverse(lru, &list_lrus, list)
455 memcg_cancel_update_list_lru(lru, old_size, new_size);
456 goto out;
457}
458
459static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
460 int src_idx, int dst_idx)
461{
462 struct list_lru_one *src, *dst;
463
464 /*
465 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
466 * we have to use IRQ-safe primitives here to avoid deadlock.
467 */
468 spin_lock_irq(&nlru->lock);
469
470 src = list_lru_from_memcg_idx(nlru, src_idx);
471 dst = list_lru_from_memcg_idx(nlru, dst_idx);
472
473 list_splice_init(&src->list, &dst->list);
474 dst->nr_items += src->nr_items;
475 src->nr_items = 0;
476
477 spin_unlock_irq(&nlru->lock);
478}
479
480static void memcg_drain_list_lru(struct list_lru *lru,
481 int src_idx, int dst_idx)
482{
483 int i;
484
485 if (!list_lru_memcg_aware(lru))
486 return;
487
488 for (i = 0; i < nr_node_ids; i++)
489 memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
490}
491
492void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
493{
494 struct list_lru *lru;
495
496 mutex_lock(&list_lrus_mutex);
497 list_for_each_entry(lru, &list_lrus, list)
498 memcg_drain_list_lru(lru, src_idx, dst_idx);
499 mutex_unlock(&list_lrus_mutex);
500}
501#else
502static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
503{
504 return 0;
505}
506
507static void memcg_destroy_list_lru(struct list_lru *lru)
508{
509}
510#endif /* CONFIG_MEMCG_KMEM */
511
512int __list_lru_init(struct list_lru *lru, bool memcg_aware,
513 struct lock_class_key *key)
128{ 514{
129 int i; 515 int i;
130 size_t size = sizeof(*lru->node) * nr_node_ids; 516 size_t size = sizeof(*lru->node) * nr_node_ids;
517 int err = -ENOMEM;
518
519 memcg_get_cache_ids();
131 520
132 lru->node = kzalloc(size, GFP_KERNEL); 521 lru->node = kzalloc(size, GFP_KERNEL);
133 if (!lru->node) 522 if (!lru->node)
134 return -ENOMEM; 523 goto out;
135 524
136 nodes_clear(lru->active_nodes);
137 for (i = 0; i < nr_node_ids; i++) { 525 for (i = 0; i < nr_node_ids; i++) {
138 spin_lock_init(&lru->node[i].lock); 526 spin_lock_init(&lru->node[i].lock);
139 if (key) 527 if (key)
140 lockdep_set_class(&lru->node[i].lock, key); 528 lockdep_set_class(&lru->node[i].lock, key);
141 INIT_LIST_HEAD(&lru->node[i].list); 529 init_one_lru(&lru->node[i].lru);
142 lru->node[i].nr_items = 0;
143 } 530 }
144 return 0; 531
532 err = memcg_init_list_lru(lru, memcg_aware);
533 if (err) {
534 kfree(lru->node);
535 goto out;
536 }
537
538 list_lru_register(lru);
539out:
540 memcg_put_cache_ids();
541 return err;
145} 542}
146EXPORT_SYMBOL_GPL(list_lru_init_key); 543EXPORT_SYMBOL_GPL(__list_lru_init);
147 544
148void list_lru_destroy(struct list_lru *lru) 545void list_lru_destroy(struct list_lru *lru)
149{ 546{
547 /* Already destroyed or not yet initialized? */
548 if (!lru->node)
549 return;
550
551 memcg_get_cache_ids();
552
553 list_lru_unregister(lru);
554
555 memcg_destroy_list_lru(lru);
150 kfree(lru->node); 556 kfree(lru->node);
557 lru->node = NULL;
558
559 memcg_put_cache_ids();
151} 560}
152EXPORT_SYMBOL_GPL(list_lru_destroy); 561EXPORT_SYMBOL_GPL(list_lru_destroy);
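The list_lru rework above changes the walk API: isolate callbacks now receive the struct list_lru_one they are walking and use list_lru_isolate()/list_lru_isolate_move() so the per-list nr_items count stays coherent, and per-memcg walking goes through list_lru_walk_one(). The sketch below shows a hypothetical caller of that interface; the object type, callback and scan function are invented for illustration.

#include <linux/list_lru.h>

/* Invented object type kept on a memcg-aware list_lru. */
struct example_obj {
        struct list_head        lru;
};

static enum lru_status example_isolate(struct list_head *item,
                                       struct list_lru_one *list,
                                       spinlock_t *lock, void *arg)
{
        struct example_obj *obj = container_of(item, struct example_obj, lru);
        struct list_head *dispose = arg;

        /*
         * The callback now gets the per-node/per-memcg list so that
         * list_lru_isolate_move() can decrement its nr_items counter.
         */
        list_lru_isolate_move(list, &obj->lru, dispose);
        return LRU_REMOVED;
}

static unsigned long example_scan(struct list_lru *lru, int nid,
                                  struct mem_cgroup *memcg,
                                  unsigned long nr_to_walk)
{
        LIST_HEAD(dispose);
        unsigned long freed;

        /* Walk only the list that belongs to @memcg on node @nid. */
        freed = list_lru_walk_one(lru, nid, memcg, example_isolate,
                                  &dispose, &nr_to_walk);

        /* Free everything on @dispose here, outside the lru lock. */
        return freed;
}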
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 095c1f96fbec..d18d3a6e7337 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -332,8 +332,10 @@ struct mem_cgroup {
332 struct cg_proto tcp_mem; 332 struct cg_proto tcp_mem;
333#endif 333#endif
334#if defined(CONFIG_MEMCG_KMEM) 334#if defined(CONFIG_MEMCG_KMEM)
335 /* Index in the kmem_cache->memcg_params->memcg_caches array */ 335 /* Index in the kmem_cache->memcg_params.memcg_caches array */
336 int kmemcg_id; 336 int kmemcg_id;
337 bool kmem_acct_activated;
338 bool kmem_acct_active;
337#endif 339#endif
338 340
339 int last_scanned_node; 341 int last_scanned_node;
@@ -352,9 +354,9 @@ struct mem_cgroup {
352}; 354};
353 355
354#ifdef CONFIG_MEMCG_KMEM 356#ifdef CONFIG_MEMCG_KMEM
355static bool memcg_kmem_is_active(struct mem_cgroup *memcg) 357bool memcg_kmem_is_active(struct mem_cgroup *memcg)
356{ 358{
357 return memcg->kmemcg_id >= 0; 359 return memcg->kmem_acct_active;
358} 360}
359#endif 361#endif
360 362
@@ -517,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
517} 519}
518EXPORT_SYMBOL(tcp_proto_cgroup); 520EXPORT_SYMBOL(tcp_proto_cgroup);
519 521
520static void disarm_sock_keys(struct mem_cgroup *memcg)
521{
522 if (!memcg_proto_activated(&memcg->tcp_mem))
523 return;
524 static_key_slow_dec(&memcg_socket_limit_enabled);
525}
526#else
527static void disarm_sock_keys(struct mem_cgroup *memcg)
528{
529}
530#endif 522#endif
531 523
532#ifdef CONFIG_MEMCG_KMEM 524#ifdef CONFIG_MEMCG_KMEM
533/* 525/*
534 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 526 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
535 * The main reason for not using cgroup id for this: 527 * The main reason for not using cgroup id for this:
536 * this works better in sparse environments, where we have a lot of memcgs, 528 * this works better in sparse environments, where we have a lot of memcgs,
537 * but only a few kmem-limited. Or also, if we have, for instance, 200 529 * but only a few kmem-limited. Or also, if we have, for instance, 200
538 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 530 * memcgs, and none but the 200th is kmem-limited, we'd have to have a
539 * 200 entry array for that. 531 * 200 entry array for that.
540 * 532 *
541 * The current size of the caches array is stored in 533 * The current size of the caches array is stored in memcg_nr_cache_ids. It
542 * memcg_limited_groups_array_size. It will double each time we have to 534 * will double each time we have to increase it.
543 * increase it.
544 */ 535 */
545static DEFINE_IDA(kmem_limited_groups); 536static DEFINE_IDA(memcg_cache_ida);
546int memcg_limited_groups_array_size; 537int memcg_nr_cache_ids;
538
539/* Protects memcg_nr_cache_ids */
540static DECLARE_RWSEM(memcg_cache_ids_sem);
541
542void memcg_get_cache_ids(void)
543{
544 down_read(&memcg_cache_ids_sem);
545}
546
547void memcg_put_cache_ids(void)
548{
549 up_read(&memcg_cache_ids_sem);
550}
547 551
548/* 552/*
549 * MIN_SIZE is different than 1, because we would like to avoid going through 553 * MIN_SIZE is different than 1, because we would like to avoid going through
@@ -569,32 +573,8 @@ int memcg_limited_groups_array_size;
569struct static_key memcg_kmem_enabled_key; 573struct static_key memcg_kmem_enabled_key;
570EXPORT_SYMBOL(memcg_kmem_enabled_key); 574EXPORT_SYMBOL(memcg_kmem_enabled_key);
571 575
572static void memcg_free_cache_id(int id);
573
574static void disarm_kmem_keys(struct mem_cgroup *memcg)
575{
576 if (memcg_kmem_is_active(memcg)) {
577 static_key_slow_dec(&memcg_kmem_enabled_key);
578 memcg_free_cache_id(memcg->kmemcg_id);
579 }
580 /*
581 * This check can't live in kmem destruction function,
582 * since the charges will outlive the cgroup
583 */
584 WARN_ON(page_counter_read(&memcg->kmem));
585}
586#else
587static void disarm_kmem_keys(struct mem_cgroup *memcg)
588{
589}
590#endif /* CONFIG_MEMCG_KMEM */ 576#endif /* CONFIG_MEMCG_KMEM */
591 577
592static void disarm_static_keys(struct mem_cgroup *memcg)
593{
594 disarm_sock_keys(memcg);
595 disarm_kmem_keys(memcg);
596}
597
598static struct mem_cgroup_per_zone * 578static struct mem_cgroup_per_zone *
599mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone) 579mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
600{ 580{
@@ -2538,18 +2518,19 @@ static int memcg_alloc_cache_id(void)
2538 int id, size; 2518 int id, size;
2539 int err; 2519 int err;
2540 2520
2541 id = ida_simple_get(&kmem_limited_groups, 2521 id = ida_simple_get(&memcg_cache_ida,
2542 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 2522 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2543 if (id < 0) 2523 if (id < 0)
2544 return id; 2524 return id;
2545 2525
2546 if (id < memcg_limited_groups_array_size) 2526 if (id < memcg_nr_cache_ids)
2547 return id; 2527 return id;
2548 2528
2549 /* 2529 /*
2550 * There's no space for the new id in memcg_caches arrays, 2530 * There's no space for the new id in memcg_caches arrays,
2551 * so we have to grow them. 2531 * so we have to grow them.
2552 */ 2532 */
2533 down_write(&memcg_cache_ids_sem);
2553 2534
2554 size = 2 * (id + 1); 2535 size = 2 * (id + 1);
2555 if (size < MEMCG_CACHES_MIN_SIZE) 2536 if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2558,8 +2539,15 @@ static int memcg_alloc_cache_id(void)
2558 size = MEMCG_CACHES_MAX_SIZE; 2539 size = MEMCG_CACHES_MAX_SIZE;
2559 2540
2560 err = memcg_update_all_caches(size); 2541 err = memcg_update_all_caches(size);
2542 if (!err)
2543 err = memcg_update_all_list_lrus(size);
2544 if (!err)
2545 memcg_nr_cache_ids = size;
2546
2547 up_write(&memcg_cache_ids_sem);
2548
2561 if (err) { 2549 if (err) {
2562 ida_simple_remove(&kmem_limited_groups, id); 2550 ida_simple_remove(&memcg_cache_ida, id);
2563 return err; 2551 return err;
2564 } 2552 }
2565 return id; 2553 return id;
@@ -2567,17 +2555,7 @@ static int memcg_alloc_cache_id(void)
2567 2555
2568static void memcg_free_cache_id(int id) 2556static void memcg_free_cache_id(int id)
2569{ 2557{
2570 ida_simple_remove(&kmem_limited_groups, id); 2558 ida_simple_remove(&memcg_cache_ida, id);
2571}
2572
2573/*
2574 * We should update the current array size iff all caches updates succeed. This
2575 * can only be done from the slab side. The slab mutex needs to be held when
2576 * calling this.
2577 */
2578void memcg_update_array_size(int num)
2579{
2580 memcg_limited_groups_array_size = num;
2581} 2559}
2582 2560
2583struct memcg_kmem_cache_create_work { 2561struct memcg_kmem_cache_create_work {
@@ -2656,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
2656{ 2634{
2657 struct mem_cgroup *memcg; 2635 struct mem_cgroup *memcg;
2658 struct kmem_cache *memcg_cachep; 2636 struct kmem_cache *memcg_cachep;
2637 int kmemcg_id;
2659 2638
2660 VM_BUG_ON(!cachep->memcg_params); 2639 VM_BUG_ON(!is_root_cache(cachep));
2661 VM_BUG_ON(!cachep->memcg_params->is_root_cache);
2662 2640
2663 if (current->memcg_kmem_skip_account) 2641 if (current->memcg_kmem_skip_account)
2664 return cachep; 2642 return cachep;
2665 2643
2666 memcg = get_mem_cgroup_from_mm(current->mm); 2644 memcg = get_mem_cgroup_from_mm(current->mm);
2667 if (!memcg_kmem_is_active(memcg)) 2645 kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
2646 if (kmemcg_id < 0)
2668 goto out; 2647 goto out;
2669 2648
2670 memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg)); 2649 memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2671 if (likely(memcg_cachep)) 2650 if (likely(memcg_cachep))
2672 return memcg_cachep; 2651 return memcg_cachep;
2673 2652
@@ -2692,7 +2671,7 @@ out:
2692void __memcg_kmem_put_cache(struct kmem_cache *cachep) 2671void __memcg_kmem_put_cache(struct kmem_cache *cachep)
2693{ 2672{
2694 if (!is_root_cache(cachep)) 2673 if (!is_root_cache(cachep))
2695 css_put(&cachep->memcg_params->memcg->css); 2674 css_put(&cachep->memcg_params.memcg->css);
2696} 2675}
2697 2676
2698/* 2677/*
@@ -2757,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
2757 memcg_uncharge_kmem(memcg, 1 << order); 2736 memcg_uncharge_kmem(memcg, 1 << order);
2758 page->mem_cgroup = NULL; 2737 page->mem_cgroup = NULL;
2759} 2738}
2739
2740struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
2741{
2742 struct mem_cgroup *memcg = NULL;
2743 struct kmem_cache *cachep;
2744 struct page *page;
2745
2746 page = virt_to_head_page(ptr);
2747 if (PageSlab(page)) {
2748 cachep = page->slab_cache;
2749 if (!is_root_cache(cachep))
2750 memcg = cachep->memcg_params.memcg;
2751 } else
2752 /* page allocated by alloc_kmem_pages */
2753 memcg = page->mem_cgroup;
2754
2755 return memcg;
2756}
2760#endif /* CONFIG_MEMCG_KMEM */ 2757#endif /* CONFIG_MEMCG_KMEM */
2761 2758
2762#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2759#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3291,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3291 int err = 0; 3288 int err = 0;
3292 int memcg_id; 3289 int memcg_id;
3293 3290
3294 if (memcg_kmem_is_active(memcg)) 3291 BUG_ON(memcg->kmemcg_id >= 0);
3295 return 0; 3292 BUG_ON(memcg->kmem_acct_activated);
3293 BUG_ON(memcg->kmem_acct_active);
3296 3294
3297 /* 3295 /*
3298 * For simplicity, we won't allow this to be disabled. It also can't 3296 * For simplicity, we won't allow this to be disabled. It also can't
@@ -3335,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
3335 * patched. 3333 * patched.
3336 */ 3334 */
3337 memcg->kmemcg_id = memcg_id; 3335 memcg->kmemcg_id = memcg_id;
3336 memcg->kmem_acct_activated = true;
3337 memcg->kmem_acct_active = true;
3338out: 3338out:
3339 return err; 3339 return err;
3340} 3340}
@@ -4014,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4014 return mem_cgroup_sockets_init(memcg, ss); 4014 return mem_cgroup_sockets_init(memcg, ss);
4015} 4015}
4016 4016
4017static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
4018{
4019 struct cgroup_subsys_state *css;
4020 struct mem_cgroup *parent, *child;
4021 int kmemcg_id;
4022
4023 if (!memcg->kmem_acct_active)
4024 return;
4025
4026 /*
4027 * Clear the 'active' flag before clearing memcg_caches arrays entries.
4028 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
4029 * guarantees no cache will be created for this cgroup after we are
4030 * done (see memcg_create_kmem_cache()).
4031 */
4032 memcg->kmem_acct_active = false;
4033
4034 memcg_deactivate_kmem_caches(memcg);
4035
4036 kmemcg_id = memcg->kmemcg_id;
4037 BUG_ON(kmemcg_id < 0);
4038
4039 parent = parent_mem_cgroup(memcg);
4040 if (!parent)
4041 parent = root_mem_cgroup;
4042
4043 /*
4044 * Change kmemcg_id of this cgroup and all its descendants to the
4045 * parent's id, and then move all entries from this cgroup's list_lrus
4046 * to ones of the parent. After we have finished, all list_lrus
4047 * corresponding to this cgroup are guaranteed to remain empty. The
4048 * ordering is imposed by list_lru_node->lock taken by
4049 * memcg_drain_all_list_lrus().
4050 */
4051 css_for_each_descendant_pre(css, &memcg->css) {
4052 child = mem_cgroup_from_css(css);
4053 BUG_ON(child->kmemcg_id != kmemcg_id);
4054 child->kmemcg_id = parent->kmemcg_id;
4055 if (!memcg->use_hierarchy)
4056 break;
4057 }
4058 memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
4059
4060 memcg_free_cache_id(kmemcg_id);
4061}
4062
4017static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4063static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4018{ 4064{
4019 memcg_destroy_kmem_caches(memcg); 4065 if (memcg->kmem_acct_activated) {
4066 memcg_destroy_kmem_caches(memcg);
4067 static_key_slow_dec(&memcg_kmem_enabled_key);
4068 WARN_ON(page_counter_read(&memcg->kmem));
4069 }
4020 mem_cgroup_sockets_destroy(memcg); 4070 mem_cgroup_sockets_destroy(memcg);
4021} 4071}
4022#else 4072#else
@@ -4025,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4025 return 0; 4075 return 0;
4026} 4076}
4027 4077
4078static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
4079{
4080}
4081
4028static void memcg_destroy_kmem(struct mem_cgroup *memcg) 4082static void memcg_destroy_kmem(struct mem_cgroup *memcg)
4029{ 4083{
4030} 4084}
@@ -4443,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4443 free_mem_cgroup_per_zone_info(memcg, node); 4497 free_mem_cgroup_per_zone_info(memcg, node);
4444 4498
4445 free_percpu(memcg->stat); 4499 free_percpu(memcg->stat);
4446
4447 disarm_static_keys(memcg);
4448 kfree(memcg); 4500 kfree(memcg);
4449} 4501}
4450 4502
@@ -4581,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4581 spin_unlock(&memcg->event_list_lock); 4633 spin_unlock(&memcg->event_list_lock);
4582 4634
4583 vmpressure_cleanup(&memcg->vmpressure); 4635 vmpressure_cleanup(&memcg->vmpressure);
4636
4637 memcg_deactivate_kmem(memcg);
4584} 4638}
4585 4639
4586static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 4640static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
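With this patch memcg_nr_cache_ids is protected by memcg_cache_ids_sem: memcg_alloc_cache_id() grows the id space under the write lock, while readers such as __list_lru_init() bracket their use of the current size with memcg_get_cache_ids()/memcg_put_cache_ids(). Below is a minimal sketch of that read-side protocol, with an invented structure standing in for the real users; it assumes the CONFIG_MEMCG_KMEM declarations from <linux/memcontrol.h>.

#include <linux/memcontrol.h>
#include <linux/slab.h>

/* Invented structure sized by the per-memcg cache id space. */
struct example_cache {
        void **per_memcg;       /* one slot per kmem-active memcg */
};

static int example_init_per_memcg(struct example_cache *cache)
{
        int err = 0;

        memcg_get_cache_ids();  /* down_read(&memcg_cache_ids_sem) */

        /*
         * The array length must be read and used under the read lock; a
         * resize (memcg_alloc_cache_id() taking the write side) cannot
         * run until memcg_put_cache_ids() below.  A real user would also
         * register @cache here so later resizes can grow the array, the
         * way list_lru_register() does in the list_lru.c hunk above.
         */
        cache->per_memcg = kcalloc(memcg_nr_cache_ids, sizeof(void *),
                                   GFP_KERNEL);
        if (!cache->per_memcg)
                err = -ENOMEM;

        memcg_put_cache_ids();  /* up_read(&memcg_cache_ids_sem) */
        return err;
}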
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index feb803bf3443..d487f8dc6d39 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -242,15 +242,8 @@ void shake_page(struct page *p, int access)
242 * Only call shrink_node_slabs here (which would also shrink 242 * Only call shrink_node_slabs here (which would also shrink
243 * other caches) if access is not potentially fatal. 243 * other caches) if access is not potentially fatal.
244 */ 244 */
245 if (access) { 245 if (access)
246 int nr; 246 drop_slab_node(page_to_nid(p));
247 int nid = page_to_nid(p);
248 do {
249 nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
250 if (page_count(p) == 1)
251 break;
252 } while (nr > 10);
253 }
254} 247}
255EXPORT_SYMBOL_GPL(shake_page); 248EXPORT_SYMBOL_GPL(shake_page);
256 249
@@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags)
1654 * setting PG_hwpoison. 1647 * setting PG_hwpoison.
1655 */ 1648 */
1656 if (!is_free_buddy_page(page)) 1649 if (!is_free_buddy_page(page))
1657 lru_add_drain_all();
1658 if (!is_free_buddy_page(page))
1659 drain_all_pages(page_zone(page)); 1650 drain_all_pages(page_zone(page));
1660 SetPageHWPoison(page); 1651 SetPageHWPoison(page);
1661 if (!is_free_buddy_page(page)) 1652 if (!is_free_buddy_page(page))
diff --git a/mm/memory.c b/mm/memory.c
index bbe6a73a899d..99275325f303 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3013,14 +3013,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3013 bool migrated = false; 3013 bool migrated = false;
3014 int flags = 0; 3014 int flags = 0;
3015 3015
3016 /* A PROT_NONE fault should not end up here */
3017 BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
3018
3016 /* 3019 /*
3017 * The "pte" at this point cannot be used safely without 3020 * The "pte" at this point cannot be used safely without
3018 * validation through pte_unmap_same(). It's of NUMA type but 3021 * validation through pte_unmap_same(). It's of NUMA type but
3019 * the pfn may be screwed if the read is non atomic. 3022 * the pfn may be screwed if the read is non atomic.
3020 * 3023 *
3021 * ptep_modify_prot_start is not called as this is clearing 3024 * We can safely just do a "set_pte_at()", because the old
3022 * the _PAGE_NUMA bit and it is not really expected that there 3025 * page table entry is not accessible, so there would be no
3023 * would be concurrent hardware modifications to the PTE. 3026 * concurrent hardware modifications to the PTE.
3024 */ 3027 */
3025 ptl = pte_lockptr(mm, pmd); 3028 ptl = pte_lockptr(mm, pmd);
3026 spin_lock(ptl); 3029 spin_lock(ptl);
@@ -3029,7 +3032,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3029 goto out; 3032 goto out;
3030 } 3033 }
3031 3034
3032 pte = pte_mknonnuma(pte); 3035 /* Make it present again */
3036 pte = pte_modify(pte, vma->vm_page_prot);
3037 pte = pte_mkyoung(pte);
3033 set_pte_at(mm, addr, ptep, pte); 3038 set_pte_at(mm, addr, ptep, pte);
3034 update_mmu_cache(vma, addr, ptep); 3039 update_mmu_cache(vma, addr, ptep);
3035 3040
@@ -3038,7 +3043,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
3038 pte_unmap_unlock(ptep, ptl); 3043 pte_unmap_unlock(ptep, ptl);
3039 return 0; 3044 return 0;
3040 } 3045 }
3041 BUG_ON(is_zero_pfn(page_to_pfn(page)));
3042 3046
3043 /* 3047 /*
3044 * Avoid grouping on DSO/COW pages in specific and RO pages 3048 * Avoid grouping on DSO/COW pages in specific and RO pages
@@ -3124,7 +3128,7 @@ static int handle_pte_fault(struct mm_struct *mm,
3124 pte, pmd, flags, entry); 3128 pte, pmd, flags, entry);
3125 } 3129 }
3126 3130
3127 if (pte_numa(entry)) 3131 if (pte_protnone(entry))
3128 return do_numa_page(mm, vma, address, entry, pte, pmd); 3132 return do_numa_page(mm, vma, address, entry, pte, pmd);
3129 3133
3130 ptl = pte_lockptr(mm, pmd); 3134 ptl = pte_lockptr(mm, pmd);
@@ -3202,7 +3206,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
3202 if (pmd_trans_splitting(orig_pmd)) 3206 if (pmd_trans_splitting(orig_pmd))
3203 return 0; 3207 return 0;
3204 3208
3205 if (pmd_numa(orig_pmd)) 3209 if (pmd_protnone(orig_pmd))
3206 return do_huge_pmd_numa_page(mm, vma, address, 3210 return do_huge_pmd_numa_page(mm, vma, address,
3207 orig_pmd, pmd); 3211 orig_pmd, pmd);
3208 3212
@@ -3458,7 +3462,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
3458 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3462 if (follow_phys(vma, addr, write, &prot, &phys_addr))
3459 return -EINVAL; 3463 return -EINVAL;
3460 3464
3461 maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot); 3465 maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
3462 if (write) 3466 if (write)
3463 memcpy_toio(maddr + offset, buf, len); 3467 memcpy_toio(maddr + offset, buf, len);
3464 else 3468 else
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f1bd23803576..c75f4dcec808 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -569,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
569{ 569{
570 int nr_updated; 570 int nr_updated;
571 571
572 nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1); 572 nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
573 if (nr_updated) 573 if (nr_updated)
574 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); 574 count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
575 575
diff --git a/mm/migrate.c b/mm/migrate.c
index f98067e5d353..85e042686031 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1654,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd)
1654 return PageLocked(page); 1654 return PageLocked(page);
1655} 1655}
1656 1656
1657void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
1658{
1659 struct page *page = pmd_page(*pmd);
1660 wait_on_page_locked(page);
1661}
1662
1663/* 1657/*
1664 * Attempt to migrate a misplaced page to the specified destination 1658 * Attempt to migrate a misplaced page to the specified destination
1665 * node. Caller is expected to have an elevated reference count on 1659 * node. Caller is expected to have an elevated reference count on
@@ -1853,7 +1847,7 @@ out_fail:
1853out_dropref: 1847out_dropref:
1854 ptl = pmd_lock(mm, pmd); 1848 ptl = pmd_lock(mm, pmd);
1855 if (pmd_same(*pmd, entry)) { 1849 if (pmd_same(*pmd, entry)) {
1856 entry = pmd_mknonnuma(entry); 1850 entry = pmd_modify(entry, vma->vm_page_prot);
1857 set_pmd_at(mm, mmun_start, pmd, entry); 1851 set_pmd_at(mm, mmun_start, pmd, entry);
1858 update_mmu_cache_pmd(vma, address, &entry); 1852 update_mmu_cache_pmd(vma, address, &entry);
1859 } 1853 }
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4074caf9936b..5f420f7fafa1 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -14,14 +14,14 @@
14#include "internal.h" 14#include "internal.h"
15 15
16#ifdef CONFIG_DEBUG_MEMORY_INIT 16#ifdef CONFIG_DEBUG_MEMORY_INIT
17int mminit_loglevel; 17int __meminitdata mminit_loglevel;
18 18
19#ifndef SECTIONS_SHIFT 19#ifndef SECTIONS_SHIFT
20#define SECTIONS_SHIFT 0 20#define SECTIONS_SHIFT 0
21#endif 21#endif
22 22
23/* The zonelists are simply reported, validation is manual. */ 23/* The zonelists are simply reported, validation is manual. */
24void mminit_verify_zonelist(void) 24void __init mminit_verify_zonelist(void)
25{ 25{
26 int nid; 26 int nid;
27 27
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 33121662f08b..44727811bf4c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -75,36 +75,34 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
75 oldpte = *pte; 75 oldpte = *pte;
76 if (pte_present(oldpte)) { 76 if (pte_present(oldpte)) {
77 pte_t ptent; 77 pte_t ptent;
78 bool updated = false;
79 78
80 if (!prot_numa) { 79 /*
81 ptent = ptep_modify_prot_start(mm, addr, pte); 80 * Avoid trapping faults against the zero or KSM
82 if (pte_numa(ptent)) 81 * pages. See similar comment in change_huge_pmd.
83 ptent = pte_mknonnuma(ptent); 82 */
84 ptent = pte_modify(ptent, newprot); 83 if (prot_numa) {
85 /*
86 * Avoid taking write faults for pages we
87 * know to be dirty.
88 */
89 if (dirty_accountable && pte_dirty(ptent) &&
90 (pte_soft_dirty(ptent) ||
91 !(vma->vm_flags & VM_SOFTDIRTY)))
92 ptent = pte_mkwrite(ptent);
93 ptep_modify_prot_commit(mm, addr, pte, ptent);
94 updated = true;
95 } else {
96 struct page *page; 84 struct page *page;
97 85
98 page = vm_normal_page(vma, addr, oldpte); 86 page = vm_normal_page(vma, addr, oldpte);
99 if (page && !PageKsm(page)) { 87 if (!page || PageKsm(page))
100 if (!pte_numa(oldpte)) { 88 continue;
101 ptep_set_numa(mm, addr, pte); 89
102 updated = true; 90 /* Avoid TLB flush if possible */
103 } 91 if (pte_protnone(oldpte))
104 } 92 continue;
105 } 93 }
106 if (updated) 94
107 pages++; 95 ptent = ptep_modify_prot_start(mm, addr, pte);
96 ptent = pte_modify(ptent, newprot);
97
98 /* Avoid taking write faults for known dirty pages */
99 if (dirty_accountable && pte_dirty(ptent) &&
100 (pte_soft_dirty(ptent) ||
101 !(vma->vm_flags & VM_SOFTDIRTY))) {
102 ptent = pte_mkwrite(ptent);
103 }
104 ptep_modify_prot_commit(mm, addr, pte, ptent);
105 pages++;
108 } else if (IS_ENABLED(CONFIG_MIGRATION)) { 106 } else if (IS_ENABLED(CONFIG_MIGRATION)) {
109 swp_entry_t entry = pte_to_swp_entry(oldpte); 107 swp_entry_t entry = pte_to_swp_entry(oldpte);
110 108
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8d52ab18fe0d..cb4758263f6b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -172,7 +172,7 @@ static void __free_pages_ok(struct page *page, unsigned int order);
172 * 1G machine -> (16M dma, 784M normal, 224M high) 172 * 1G machine -> (16M dma, 784M normal, 224M high)
173 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA 173 * NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
174 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL 174 * HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
175 * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA 175 * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
176 * 176 *
177 * TBD: should special case ZONE_DMA32 machines here - in those we normally 177 * TBD: should special case ZONE_DMA32 machines here - in those we normally
178 * don't need any ZONE_NORMAL reservation 178 * don't need any ZONE_NORMAL reservation
@@ -3871,18 +3871,29 @@ static int __build_all_zonelists(void *data)
3871 return 0; 3871 return 0;
3872} 3872}
3873 3873
3874static noinline void __init
3875build_all_zonelists_init(void)
3876{
3877 __build_all_zonelists(NULL);
3878 mminit_verify_zonelist();
3879 cpuset_init_current_mems_allowed();
3880}
3881
3874/* 3882/*
3875 * Called with zonelists_mutex held always 3883 * Called with zonelists_mutex held always
3876 * unless system_state == SYSTEM_BOOTING. 3884 * unless system_state == SYSTEM_BOOTING.
3885 *
3886 * __ref due to (1) call of __meminit annotated setup_zone_pageset
3887 * [we're only called with non-NULL zone through __meminit paths] and
3888 * (2) call of __init annotated helper build_all_zonelists_init
3889 * [protected by SYSTEM_BOOTING].
3877 */ 3890 */
3878void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) 3891void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
3879{ 3892{
3880 set_zonelist_order(); 3893 set_zonelist_order();
3881 3894
3882 if (system_state == SYSTEM_BOOTING) { 3895 if (system_state == SYSTEM_BOOTING) {
3883 __build_all_zonelists(NULL); 3896 build_all_zonelists_init();
3884 mminit_verify_zonelist();
3885 cpuset_init_current_mems_allowed();
3886 } else { 3897 } else {
3887#ifdef CONFIG_MEMORY_HOTPLUG 3898#ifdef CONFIG_MEMORY_HOTPLUG
3888 if (zone) 3899 if (zone)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index dfb79e028ecb..c25f94b33811 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
193 pmd_t *pmdp) 193 pmd_t *pmdp)
194{ 194{
195 pmd_t entry = *pmdp; 195 pmd_t entry = *pmdp;
196 if (pmd_numa(entry))
197 entry = pmd_mknonnuma(entry);
198 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); 196 set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
199 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); 197 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
200} 198}
diff --git a/mm/slab.c b/mm/slab.c
index 65b5dcb6f671..c4b89eaf4c96 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2382,7 +2382,7 @@ out:
2382 return nr_freed; 2382 return nr_freed;
2383} 2383}
2384 2384
2385int __kmem_cache_shrink(struct kmem_cache *cachep) 2385int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
2386{ 2386{
2387 int ret = 0; 2387 int ret = 0;
2388 int node; 2388 int node;
@@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
2404{ 2404{
2405 int i; 2405 int i;
2406 struct kmem_cache_node *n; 2406 struct kmem_cache_node *n;
2407 int rc = __kmem_cache_shrink(cachep); 2407 int rc = __kmem_cache_shrink(cachep, false);
2408 2408
2409 if (rc) 2409 if (rc)
2410 return rc; 2410 return rc;
@@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3708 int batchcount, int shared, gfp_t gfp) 3708 int batchcount, int shared, gfp_t gfp)
3709{ 3709{
3710 int ret; 3710 int ret;
3711 struct kmem_cache *c = NULL; 3711 struct kmem_cache *c;
3712 int i = 0;
3713 3712
3714 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp); 3713 ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
3715 3714
@@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
3719 if ((ret < 0) || !is_root_cache(cachep)) 3718 if ((ret < 0) || !is_root_cache(cachep))
3720 return ret; 3719 return ret;
3721 3720
3722 VM_BUG_ON(!mutex_is_locked(&slab_mutex)); 3721 lockdep_assert_held(&slab_mutex);
3723 for_each_memcg_cache_index(i) { 3722 for_each_memcg_cache(c, cachep) {
3724 c = cache_from_memcg_idx(cachep, i); 3723 /* return value determined by the root cache only */
3725 if (c) 3724 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
3726 /* return value determined by the parent cache only */
3727 __do_tune_cpucache(c, limit, batchcount, shared, gfp);
3728 } 3725 }
3729 3726
3730 return ret; 3727 return ret;
diff --git a/mm/slab.h b/mm/slab.h
index 90430d6f665e..4c3ac12dd644 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
86extern void create_boot_cache(struct kmem_cache *, const char *name, 86extern void create_boot_cache(struct kmem_cache *, const char *name,
87 size_t size, unsigned long flags); 87 size_t size, unsigned long flags);
88 88
89struct mem_cgroup;
90
91int slab_unmergeable(struct kmem_cache *s); 89int slab_unmergeable(struct kmem_cache *s);
92struct kmem_cache *find_mergeable(size_t size, size_t align, 90struct kmem_cache *find_mergeable(size_t size, size_t align,
93 unsigned long flags, const char *name, void (*ctor)(void *)); 91 unsigned long flags, const char *name, void (*ctor)(void *));
@@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
140#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) 138#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
141 139
142int __kmem_cache_shutdown(struct kmem_cache *); 140int __kmem_cache_shutdown(struct kmem_cache *);
143int __kmem_cache_shrink(struct kmem_cache *); 141int __kmem_cache_shrink(struct kmem_cache *, bool);
144void slab_kmem_cache_release(struct kmem_cache *); 142void slab_kmem_cache_release(struct kmem_cache *);
145 143
146struct seq_file; 144struct seq_file;
@@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
165 size_t count, loff_t *ppos); 163 size_t count, loff_t *ppos);
166 164
167#ifdef CONFIG_MEMCG_KMEM 165#ifdef CONFIG_MEMCG_KMEM
166/*
167 * Iterate over all memcg caches of the given root cache. The caller must hold
168 * slab_mutex.
169 */
170#define for_each_memcg_cache(iter, root) \
171 list_for_each_entry(iter, &(root)->memcg_params.list, \
172 memcg_params.list)
173
174#define for_each_memcg_cache_safe(iter, tmp, root) \
175 list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
176 memcg_params.list)
177
168static inline bool is_root_cache(struct kmem_cache *s) 178static inline bool is_root_cache(struct kmem_cache *s)
169{ 179{
170 return !s->memcg_params || s->memcg_params->is_root_cache; 180 return s->memcg_params.is_root_cache;
171} 181}
172 182
173static inline bool slab_equal_or_root(struct kmem_cache *s, 183static inline bool slab_equal_or_root(struct kmem_cache *s,
174 struct kmem_cache *p) 184 struct kmem_cache *p)
175{ 185{
176 return (p == s) || 186 return p == s || p == s->memcg_params.root_cache;
177 (s->memcg_params && (p == s->memcg_params->root_cache));
178} 187}
179 188
180/* 189/*
@@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s,
185static inline const char *cache_name(struct kmem_cache *s) 194static inline const char *cache_name(struct kmem_cache *s)
186{ 195{
187 if (!is_root_cache(s)) 196 if (!is_root_cache(s))
188 return s->memcg_params->root_cache->name; 197 s = s->memcg_params.root_cache;
189 return s->name; 198 return s->name;
190} 199}
191 200
192/* 201/*
193 * Note, we protect with RCU only the memcg_caches array, not per-memcg caches. 202 * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
194 * That said the caller must assure the memcg's cache won't go away. Since once 203 * That said the caller must assure the memcg's cache won't go away by either
195 * created a memcg's cache is destroyed only along with the root cache, it is 204 * taking a css reference to the owner cgroup, or holding the slab_mutex.
196 * true if we are going to allocate from the cache or hold a reference to the
197 * root cache by other means. Otherwise, we should hold either the slab_mutex
198 * or the memcg's slab_caches_mutex while calling this function and accessing
199 * the returned value.
200 */ 205 */
201static inline struct kmem_cache * 206static inline struct kmem_cache *
202cache_from_memcg_idx(struct kmem_cache *s, int idx) 207cache_from_memcg_idx(struct kmem_cache *s, int idx)
203{ 208{
204 struct kmem_cache *cachep; 209 struct kmem_cache *cachep;
205 struct memcg_cache_params *params; 210 struct memcg_cache_array *arr;
206
207 if (!s->memcg_params)
208 return NULL;
209 211
210 rcu_read_lock(); 212 rcu_read_lock();
211 params = rcu_dereference(s->memcg_params); 213 arr = rcu_dereference(s->memcg_params.memcg_caches);
212 214
213 /* 215 /*
214 * Make sure we will access the up-to-date value. The code updating 216 * Make sure we will access the up-to-date value. The code updating
215 * memcg_caches issues a write barrier to match this (see 217 * memcg_caches issues a write barrier to match this (see
216 * memcg_register_cache()). 218 * memcg_create_kmem_cache()).
217 */ 219 */
218 cachep = lockless_dereference(params->memcg_caches[idx]); 220 cachep = lockless_dereference(arr->entries[idx]);
219 rcu_read_unlock(); 221 rcu_read_unlock();
220 222
221 return cachep; 223 return cachep;
@@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
225{ 227{
226 if (is_root_cache(s)) 228 if (is_root_cache(s))
227 return s; 229 return s;
228 return s->memcg_params->root_cache; 230 return s->memcg_params.root_cache;
229} 231}
230 232
231static __always_inline int memcg_charge_slab(struct kmem_cache *s, 233static __always_inline int memcg_charge_slab(struct kmem_cache *s,
@@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s,
235 return 0; 237 return 0;
236 if (is_root_cache(s)) 238 if (is_root_cache(s))
237 return 0; 239 return 0;
238 return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order); 240 return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order);
239} 241}
240 242
241static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) 243static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
@@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
244 return; 246 return;
245 if (is_root_cache(s)) 247 if (is_root_cache(s))
246 return; 248 return;
247 memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order); 249 memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
248} 250}
249#else 251
252extern void slab_init_memcg_params(struct kmem_cache *);
253
254#else /* !CONFIG_MEMCG_KMEM */
255
256#define for_each_memcg_cache(iter, root) \
257 for ((void)(iter), (void)(root); 0; )
258#define for_each_memcg_cache_safe(iter, tmp, root) \
259 for ((void)(iter), (void)(tmp), (void)(root); 0; )
260
250static inline bool is_root_cache(struct kmem_cache *s) 261static inline bool is_root_cache(struct kmem_cache *s)
251{ 262{
252 return true; 263 return true;
@@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
282static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) 293static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
283{ 294{
284} 295}
285#endif 296
297static inline void slab_init_memcg_params(struct kmem_cache *s)
298{
299}
300#endif /* CONFIG_MEMCG_KMEM */
286 301
287static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) 302static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
288{ 303{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 6e1e4cf65836..1a1cc89acaa3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
106#endif 106#endif
107 107
108#ifdef CONFIG_MEMCG_KMEM 108#ifdef CONFIG_MEMCG_KMEM
109static int memcg_alloc_cache_params(struct mem_cgroup *memcg, 109void slab_init_memcg_params(struct kmem_cache *s)
110 struct kmem_cache *s, struct kmem_cache *root_cache)
111{ 110{
112 size_t size; 111 s->memcg_params.is_root_cache = true;
112 INIT_LIST_HEAD(&s->memcg_params.list);
113 RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
114}
115
116static int init_memcg_params(struct kmem_cache *s,
117 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
118{
119 struct memcg_cache_array *arr;
113 120
114 if (!memcg_kmem_enabled()) 121 if (memcg) {
122 s->memcg_params.is_root_cache = false;
123 s->memcg_params.memcg = memcg;
124 s->memcg_params.root_cache = root_cache;
115 return 0; 125 return 0;
126 }
116 127
117 if (!memcg) { 128 slab_init_memcg_params(s);
118 size = offsetof(struct memcg_cache_params, memcg_caches);
119 size += memcg_limited_groups_array_size * sizeof(void *);
120 } else
121 size = sizeof(struct memcg_cache_params);
122 129
123 s->memcg_params = kzalloc(size, GFP_KERNEL); 130 if (!memcg_nr_cache_ids)
124 if (!s->memcg_params) 131 return 0;
125 return -ENOMEM;
126 132
127 if (memcg) { 133 arr = kzalloc(sizeof(struct memcg_cache_array) +
128 s->memcg_params->memcg = memcg; 134 memcg_nr_cache_ids * sizeof(void *),
129 s->memcg_params->root_cache = root_cache; 135 GFP_KERNEL);
130 } else 136 if (!arr)
131 s->memcg_params->is_root_cache = true; 137 return -ENOMEM;
132 138
139 RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
133 return 0; 140 return 0;
134} 141}
135 142
136static void memcg_free_cache_params(struct kmem_cache *s) 143static void destroy_memcg_params(struct kmem_cache *s)
137{ 144{
138 kfree(s->memcg_params); 145 if (is_root_cache(s))
146 kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
139} 147}
140 148
141static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs) 149static int update_memcg_params(struct kmem_cache *s, int new_array_size)
142{ 150{
143 int size; 151 struct memcg_cache_array *old, *new;
144 struct memcg_cache_params *new_params, *cur_params;
145
146 BUG_ON(!is_root_cache(s));
147 152
148 size = offsetof(struct memcg_cache_params, memcg_caches); 153 if (!is_root_cache(s))
149 size += num_memcgs * sizeof(void *); 154 return 0;
150 155
151 new_params = kzalloc(size, GFP_KERNEL); 156 new = kzalloc(sizeof(struct memcg_cache_array) +
152 if (!new_params) 157 new_array_size * sizeof(void *), GFP_KERNEL);
158 if (!new)
153 return -ENOMEM; 159 return -ENOMEM;
154 160
155 cur_params = s->memcg_params; 161 old = rcu_dereference_protected(s->memcg_params.memcg_caches,
156 memcpy(new_params->memcg_caches, cur_params->memcg_caches, 162 lockdep_is_held(&slab_mutex));
157 memcg_limited_groups_array_size * sizeof(void *)); 163 if (old)
158 164 memcpy(new->entries, old->entries,
159 new_params->is_root_cache = true; 165 memcg_nr_cache_ids * sizeof(void *));
160
161 rcu_assign_pointer(s->memcg_params, new_params);
162 if (cur_params)
163 kfree_rcu(cur_params, rcu_head);
164 166
167 rcu_assign_pointer(s->memcg_params.memcg_caches, new);
168 if (old)
169 kfree_rcu(old, rcu);
165 return 0; 170 return 0;
166} 171}
167 172
@@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs)
169{ 174{
170 struct kmem_cache *s; 175 struct kmem_cache *s;
171 int ret = 0; 176 int ret = 0;
172 mutex_lock(&slab_mutex);
173 177
178 mutex_lock(&slab_mutex);
174 list_for_each_entry(s, &slab_caches, list) { 179 list_for_each_entry(s, &slab_caches, list) {
175 if (!is_root_cache(s)) 180 ret = update_memcg_params(s, num_memcgs);
176 continue;
177
178 ret = memcg_update_cache_params(s, num_memcgs);
179 /* 181 /*
180 * Instead of freeing the memory, we'll just leave the caches 182 * Instead of freeing the memory, we'll just leave the caches
181 * up to this point in an updated state. 183 * up to this point in an updated state.
182 */ 184 */
183 if (ret) 185 if (ret)
184 goto out; 186 break;
185 } 187 }
186
187 memcg_update_array_size(num_memcgs);
188out:
189 mutex_unlock(&slab_mutex); 188 mutex_unlock(&slab_mutex);
190 return ret; 189 return ret;
191} 190}
192#else 191#else
193static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg, 192static inline int init_memcg_params(struct kmem_cache *s,
194 struct kmem_cache *s, struct kmem_cache *root_cache) 193 struct mem_cgroup *memcg, struct kmem_cache *root_cache)
195{ 194{
196 return 0; 195 return 0;
197} 196}
198 197
199static inline void memcg_free_cache_params(struct kmem_cache *s) 198static inline void destroy_memcg_params(struct kmem_cache *s)
200{ 199{
201} 200}
202#endif /* CONFIG_MEMCG_KMEM */ 201#endif /* CONFIG_MEMCG_KMEM */
@@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
314 s->align = align; 313 s->align = align;
315 s->ctor = ctor; 314 s->ctor = ctor;
316 315
317 err = memcg_alloc_cache_params(memcg, s, root_cache); 316 err = init_memcg_params(s, memcg, root_cache);
318 if (err) 317 if (err)
319 goto out_free_cache; 318 goto out_free_cache;
320 319
@@ -330,7 +329,7 @@ out:
330 return s; 329 return s;
331 330
332out_free_cache: 331out_free_cache:
333 memcg_free_cache_params(s); 332 destroy_memcg_params(s);
334 kmem_cache_free(kmem_cache, s); 333 kmem_cache_free(kmem_cache, s);
335 goto out; 334 goto out;
336} 335}
@@ -369,6 +368,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
369 368
370 get_online_cpus(); 369 get_online_cpus();
371 get_online_mems(); 370 get_online_mems();
371 memcg_get_cache_ids();
372 372
373 mutex_lock(&slab_mutex); 373 mutex_lock(&slab_mutex);
374 374
@@ -407,6 +407,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
407out_unlock: 407out_unlock:
408 mutex_unlock(&slab_mutex); 408 mutex_unlock(&slab_mutex);
409 409
410 memcg_put_cache_ids();
410 put_online_mems(); 411 put_online_mems();
411 put_online_cpus(); 412 put_online_cpus();
412 413
@@ -439,13 +440,8 @@ static int do_kmem_cache_shutdown(struct kmem_cache *s,
439 *need_rcu_barrier = true; 440 *need_rcu_barrier = true;
440 441
441#ifdef CONFIG_MEMCG_KMEM 442#ifdef CONFIG_MEMCG_KMEM
442 if (!is_root_cache(s)) { 443 if (!is_root_cache(s))
443 struct kmem_cache *root_cache = s->memcg_params->root_cache; 444 list_del(&s->memcg_params.list);
444 int memcg_id = memcg_cache_id(s->memcg_params->memcg);
445
446 BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s);
447 root_cache->memcg_params->memcg_caches[memcg_id] = NULL;
448 }
449#endif 445#endif
450 list_move(&s->list, release); 446 list_move(&s->list, release);
451 return 0; 447 return 0;
@@ -482,9 +478,11 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
482 struct kmem_cache *root_cache) 478 struct kmem_cache *root_cache)
483{ 479{
484 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */ 480 static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
485 int memcg_id = memcg_cache_id(memcg); 481 struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
482 struct memcg_cache_array *arr;
486 struct kmem_cache *s = NULL; 483 struct kmem_cache *s = NULL;
487 char *cache_name; 484 char *cache_name;
485 int idx;
488 486
489 get_online_cpus(); 487 get_online_cpus();
490 get_online_mems(); 488 get_online_mems();
@@ -492,17 +490,27 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
492 mutex_lock(&slab_mutex); 490 mutex_lock(&slab_mutex);
493 491
494 /* 492 /*
493 * The memory cgroup could have been deactivated while the cache
494 * creation work was pending.
495 */
496 if (!memcg_kmem_is_active(memcg))
497 goto out_unlock;
498
499 idx = memcg_cache_id(memcg);
500 arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
501 lockdep_is_held(&slab_mutex));
502
503 /*
495 * Since per-memcg caches are created asynchronously on first 504 * Since per-memcg caches are created asynchronously on first
496 * allocation (see memcg_kmem_get_cache()), several threads can try to 505 * allocation (see memcg_kmem_get_cache()), several threads can try to
497 * create the same cache, but only one of them may succeed. 506 * create the same cache, but only one of them may succeed.
498 */ 507 */
499 if (cache_from_memcg_idx(root_cache, memcg_id)) 508 if (arr->entries[idx])
500 goto out_unlock; 509 goto out_unlock;
501 510
502 cgroup_name(mem_cgroup_css(memcg)->cgroup, 511 cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
503 memcg_name_buf, sizeof(memcg_name_buf));
504 cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, 512 cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
505 memcg_cache_id(memcg), memcg_name_buf); 513 css->id, memcg_name_buf);
506 if (!cache_name) 514 if (!cache_name)
507 goto out_unlock; 515 goto out_unlock;
508 516
@@ -520,13 +528,15 @@ void memcg_create_kmem_cache(struct mem_cgroup *memcg,
520 goto out_unlock; 528 goto out_unlock;
521 } 529 }
522 530
531 list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
532
523 /* 533 /*
524 * Since readers won't lock (see cache_from_memcg_idx()), we need a 534 * Since readers won't lock (see cache_from_memcg_idx()), we need a
525 * barrier here to ensure nobody will see the kmem_cache partially 535 * barrier here to ensure nobody will see the kmem_cache partially
526 * initialized. 536 * initialized.
527 */ 537 */
528 smp_wmb(); 538 smp_wmb();
529 root_cache->memcg_params->memcg_caches[memcg_id] = s; 539 arr->entries[idx] = s;
530 540
531out_unlock: 541out_unlock:
532 mutex_unlock(&slab_mutex); 542 mutex_unlock(&slab_mutex);
@@ -535,6 +545,37 @@ out_unlock:
535 put_online_cpus(); 545 put_online_cpus();
536} 546}
537 547
548void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
549{
550 int idx;
551 struct memcg_cache_array *arr;
552 struct kmem_cache *s, *c;
553
554 idx = memcg_cache_id(memcg);
555
556 get_online_cpus();
557 get_online_mems();
558
559 mutex_lock(&slab_mutex);
560 list_for_each_entry(s, &slab_caches, list) {
561 if (!is_root_cache(s))
562 continue;
563
564 arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
565 lockdep_is_held(&slab_mutex));
566 c = arr->entries[idx];
567 if (!c)
568 continue;
569
570 __kmem_cache_shrink(c, true);
571 arr->entries[idx] = NULL;
572 }
573 mutex_unlock(&slab_mutex);
574
575 put_online_mems();
576 put_online_cpus();
577}
578
538void memcg_destroy_kmem_caches(struct mem_cgroup *memcg) 579void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
539{ 580{
540 LIST_HEAD(release); 581 LIST_HEAD(release);
@@ -546,7 +587,7 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
546 587
547 mutex_lock(&slab_mutex); 588 mutex_lock(&slab_mutex);
548 list_for_each_entry_safe(s, s2, &slab_caches, list) { 589 list_for_each_entry_safe(s, s2, &slab_caches, list) {
549 if (is_root_cache(s) || s->memcg_params->memcg != memcg) 590 if (is_root_cache(s) || s->memcg_params.memcg != memcg)
550 continue; 591 continue;
551 /* 592 /*
552 * The cgroup is about to be freed and therefore has no charges 593 * The cgroup is about to be freed and therefore has no charges
@@ -565,18 +606,20 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
565 606
566void slab_kmem_cache_release(struct kmem_cache *s) 607void slab_kmem_cache_release(struct kmem_cache *s)
567{ 608{
568 memcg_free_cache_params(s); 609 destroy_memcg_params(s);
569 kfree(s->name); 610 kfree(s->name);
570 kmem_cache_free(kmem_cache, s); 611 kmem_cache_free(kmem_cache, s);
571} 612}
572 613
573void kmem_cache_destroy(struct kmem_cache *s) 614void kmem_cache_destroy(struct kmem_cache *s)
574{ 615{
575 int i; 616 struct kmem_cache *c, *c2;
576 LIST_HEAD(release); 617 LIST_HEAD(release);
577 bool need_rcu_barrier = false; 618 bool need_rcu_barrier = false;
578 bool busy = false; 619 bool busy = false;
579 620
621 BUG_ON(!is_root_cache(s));
622
580 get_online_cpus(); 623 get_online_cpus();
581 get_online_mems(); 624 get_online_mems();
582 625
@@ -586,10 +629,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
586 if (s->refcount) 629 if (s->refcount)
587 goto out_unlock; 630 goto out_unlock;
588 631
589 for_each_memcg_cache_index(i) { 632 for_each_memcg_cache_safe(c, c2, s) {
590 struct kmem_cache *c = cache_from_memcg_idx(s, i); 633 if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
591
592 if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
593 busy = true; 634 busy = true;
594 } 635 }
595 636
@@ -619,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
619 660
620 get_online_cpus(); 661 get_online_cpus();
621 get_online_mems(); 662 get_online_mems();
622 ret = __kmem_cache_shrink(cachep); 663 ret = __kmem_cache_shrink(cachep, false);
623 put_online_mems(); 664 put_online_mems();
624 put_online_cpus(); 665 put_online_cpus();
625 return ret; 666 return ret;
@@ -641,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
641 s->name = name; 682 s->name = name;
642 s->size = s->object_size = size; 683 s->size = s->object_size = size;
643 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size); 684 s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
685
686 slab_init_memcg_params(s);
687
644 err = __kmem_cache_create(s, flags); 688 err = __kmem_cache_create(s, flags);
645 689
646 if (err) 690 if (err)
@@ -920,16 +964,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
920{ 964{
921 struct kmem_cache *c; 965 struct kmem_cache *c;
922 struct slabinfo sinfo; 966 struct slabinfo sinfo;
923 int i;
924 967
925 if (!is_root_cache(s)) 968 if (!is_root_cache(s))
926 return; 969 return;
927 970
928 for_each_memcg_cache_index(i) { 971 for_each_memcg_cache(c, s) {
929 c = cache_from_memcg_idx(s, i);
930 if (!c)
931 continue;
932
933 memset(&sinfo, 0, sizeof(sinfo)); 972 memset(&sinfo, 0, sizeof(sinfo));
934 get_slabinfo(c, &sinfo); 973 get_slabinfo(c, &sinfo);
935 974
@@ -981,7 +1020,7 @@ int memcg_slab_show(struct seq_file *m, void *p)
981 1020
982 if (p == slab_caches.next) 1021 if (p == slab_caches.next)
983 print_slabinfo_header(m); 1022 print_slabinfo_header(m);
984 if (!is_root_cache(s) && s->memcg_params->memcg == memcg) 1023 if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
985 cache_show(s, m); 1024 cache_show(s, m);
986 return 0; 1025 return 0;
987} 1026}
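
update_memcg_params() above is an instance of the usual RCU "copy and publish" resize: allocate a larger array, copy the old entries while holding slab_mutex, rcu_assign_pointer() the new array in, and kfree_rcu() the old one so lockless readers such as cache_from_memcg_idx() never see a torn array. Stripped of the slab specifics, the idiom looks roughly like this (struct and function names are illustrative, not kernel APIs):

struct ptr_array {
        struct rcu_head rcu;
        void *entries[];
};

static int resize_published_array(struct ptr_array __rcu **slot,
                                  size_t old_n, size_t new_n,
                                  struct mutex *update_lock)
{
        struct ptr_array *old, *new;

        new = kzalloc(sizeof(*new) + new_n * sizeof(void *), GFP_KERNEL);
        if (!new)
                return -ENOMEM;

        old = rcu_dereference_protected(*slot, lockdep_is_held(update_lock));
        if (old)
                memcpy(new->entries, old->entries, old_n * sizeof(void *));

        rcu_assign_pointer(*slot, new);         /* publish to readers */
        if (old)
                kfree_rcu(old, rcu);            /* free after a grace period */
        return 0;
}
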
diff --git a/mm/slob.c b/mm/slob.c
index 96a86206a26b..94a7fede6d48 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
618 return 0; 618 return 0;
619} 619}
620 620
621int __kmem_cache_shrink(struct kmem_cache *d) 621int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
622{ 622{
623 return 0; 623 return 0;
624} 624}
diff --git a/mm/slub.c b/mm/slub.c
index 8b8508adf9c2..06cdb1829dc9 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2007,6 +2007,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2007 int pages; 2007 int pages;
2008 int pobjects; 2008 int pobjects;
2009 2009
2010 preempt_disable();
2010 do { 2011 do {
2011 pages = 0; 2012 pages = 0;
2012 pobjects = 0; 2013 pobjects = 0;
@@ -2040,6 +2041,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
2040 2041
2041 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) 2042 } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
2042 != oldpage); 2043 != oldpage);
2044 if (unlikely(!s->cpu_partial)) {
2045 unsigned long flags;
2046
2047 local_irq_save(flags);
2048 unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
2049 local_irq_restore(flags);
2050 }
2051 preempt_enable();
2043#endif 2052#endif
2044} 2053}
2045 2054
@@ -3358,69 +3367,92 @@ void kfree(const void *x)
3358} 3367}
3359EXPORT_SYMBOL(kfree); 3368EXPORT_SYMBOL(kfree);
3360 3369
3370#define SHRINK_PROMOTE_MAX 32
3371
3361/* 3372/*
3362 * kmem_cache_shrink removes empty slabs from the partial lists and sorts 3373 * kmem_cache_shrink discards empty slabs and promotes the slabs filled
3363 * the remaining slabs by the number of items in use. The slabs with the 3374 * up most to the head of the partial lists. New allocations will then
3364 * most items in use come first. New allocations will then fill those up 3375 * fill those up and thus they can be removed from the partial lists.
3365 * and thus they can be removed from the partial lists.
3366 * 3376 *
3367 * The slabs with the least items are placed last. This results in them 3377 * The slabs with the least items are placed last. This results in them
3368 * being allocated from last increasing the chance that the last objects 3378 * being allocated from last increasing the chance that the last objects
3369 * are freed in them. 3379 * are freed in them.
3370 */ 3380 */
3371int __kmem_cache_shrink(struct kmem_cache *s) 3381int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
3372{ 3382{
3373 int node; 3383 int node;
3374 int i; 3384 int i;
3375 struct kmem_cache_node *n; 3385 struct kmem_cache_node *n;
3376 struct page *page; 3386 struct page *page;
3377 struct page *t; 3387 struct page *t;
3378 int objects = oo_objects(s->max); 3388 struct list_head discard;
3379 struct list_head *slabs_by_inuse = 3389 struct list_head promote[SHRINK_PROMOTE_MAX];
3380 kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
3381 unsigned long flags; 3390 unsigned long flags;
3391 int ret = 0;
3382 3392
3383 if (!slabs_by_inuse) 3393 if (deactivate) {
3384 return -ENOMEM; 3394 /*
3395 * Disable empty slabs caching. Used to avoid pinning offline
3396 * memory cgroups by kmem pages that can be freed.
3397 */
3398 s->cpu_partial = 0;
3399 s->min_partial = 0;
3400
3401 /*
3402 * s->cpu_partial is checked locklessly (see put_cpu_partial),
3403 * so we have to make sure the change is visible.
3404 */
3405 kick_all_cpus_sync();
3406 }
3385 3407
3386 flush_all(s); 3408 flush_all(s);
3387 for_each_kmem_cache_node(s, node, n) { 3409 for_each_kmem_cache_node(s, node, n) {
3388 if (!n->nr_partial) 3410 INIT_LIST_HEAD(&discard);
3389 continue; 3411 for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
3390 3412 INIT_LIST_HEAD(promote + i);
3391 for (i = 0; i < objects; i++)
3392 INIT_LIST_HEAD(slabs_by_inuse + i);
3393 3413
3394 spin_lock_irqsave(&n->list_lock, flags); 3414 spin_lock_irqsave(&n->list_lock, flags);
3395 3415
3396 /* 3416 /*
3397 * Build lists indexed by the items in use in each slab. 3417 * Build lists of slabs to discard or promote.
3398 * 3418 *
3399 * Note that concurrent frees may occur while we hold the 3419 * Note that concurrent frees may occur while we hold the
3400 * list_lock. page->inuse here is the upper limit. 3420 * list_lock. page->inuse here is the upper limit.
3401 */ 3421 */
3402 list_for_each_entry_safe(page, t, &n->partial, lru) { 3422 list_for_each_entry_safe(page, t, &n->partial, lru) {
3403 list_move(&page->lru, slabs_by_inuse + page->inuse); 3423 int free = page->objects - page->inuse;
3404 if (!page->inuse) 3424
3425 /* Do not reread page->inuse */
3426 barrier();
3427
3428 /* We do not keep full slabs on the list */
3429 BUG_ON(free <= 0);
3430
3431 if (free == page->objects) {
3432 list_move(&page->lru, &discard);
3405 n->nr_partial--; 3433 n->nr_partial--;
3434 } else if (free <= SHRINK_PROMOTE_MAX)
3435 list_move(&page->lru, promote + free - 1);
3406 } 3436 }
3407 3437
3408 /* 3438 /*
3409 * Rebuild the partial list with the slabs filled up most 3439 * Promote the slabs filled up most to the head of the
3410 * first and the least used slabs at the end. 3440 * partial list.
3411 */ 3441 */
3412 for (i = objects - 1; i > 0; i--) 3442 for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
3413 list_splice(slabs_by_inuse + i, n->partial.prev); 3443 list_splice(promote + i, &n->partial);
3414 3444
3415 spin_unlock_irqrestore(&n->list_lock, flags); 3445 spin_unlock_irqrestore(&n->list_lock, flags);
3416 3446
3417 /* Release empty slabs */ 3447 /* Release empty slabs */
3418 list_for_each_entry_safe(page, t, slabs_by_inuse, lru) 3448 list_for_each_entry_safe(page, t, &discard, lru)
3419 discard_slab(s, page); 3449 discard_slab(s, page);
3450
3451 if (slabs_node(s, node))
3452 ret = 1;
3420 } 3453 }
3421 3454
3422 kfree(slabs_by_inuse); 3455 return ret;
3423 return 0;
3424} 3456}
3425 3457
3426static int slab_mem_going_offline_callback(void *arg) 3458static int slab_mem_going_offline_callback(void *arg)
@@ -3429,7 +3461,7 @@ static int slab_mem_going_offline_callback(void *arg)
3429 3461
3430 mutex_lock(&slab_mutex); 3462 mutex_lock(&slab_mutex);
3431 list_for_each_entry(s, &slab_caches, list) 3463 list_for_each_entry(s, &slab_caches, list)
3432 __kmem_cache_shrink(s); 3464 __kmem_cache_shrink(s, false);
3433 mutex_unlock(&slab_mutex); 3465 mutex_unlock(&slab_mutex);
3434 3466
3435 return 0; 3467 return 0;
@@ -3577,6 +3609,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
3577 p->slab_cache = s; 3609 p->slab_cache = s;
3578#endif 3610#endif
3579 } 3611 }
3612 slab_init_memcg_params(s);
3580 list_add(&s->list, &slab_caches); 3613 list_add(&s->list, &slab_caches);
3581 return s; 3614 return s;
3582} 3615}
@@ -3635,13 +3668,10 @@ struct kmem_cache *
3635__kmem_cache_alias(const char *name, size_t size, size_t align, 3668__kmem_cache_alias(const char *name, size_t size, size_t align,
3636 unsigned long flags, void (*ctor)(void *)) 3669 unsigned long flags, void (*ctor)(void *))
3637{ 3670{
3638 struct kmem_cache *s; 3671 struct kmem_cache *s, *c;
3639 3672
3640 s = find_mergeable(size, align, flags, name, ctor); 3673 s = find_mergeable(size, align, flags, name, ctor);
3641 if (s) { 3674 if (s) {
3642 int i;
3643 struct kmem_cache *c;
3644
3645 s->refcount++; 3675 s->refcount++;
3646 3676
3647 /* 3677 /*
@@ -3651,10 +3681,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
3651 s->object_size = max(s->object_size, (int)size); 3681 s->object_size = max(s->object_size, (int)size);
3652 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); 3682 s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
3653 3683
3654 for_each_memcg_cache_index(i) { 3684 for_each_memcg_cache(c, s) {
3655 c = cache_from_memcg_idx(s, i);
3656 if (!c)
3657 continue;
3658 c->object_size = s->object_size; 3685 c->object_size = s->object_size;
3659 c->inuse = max_t(int, c->inuse, 3686 c->inuse = max_t(int, c->inuse,
3660 ALIGN(size, sizeof(void *))); 3687 ALIGN(size, sizeof(void *)));
@@ -4691,12 +4718,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
4691static ssize_t shrink_store(struct kmem_cache *s, 4718static ssize_t shrink_store(struct kmem_cache *s,
4692 const char *buf, size_t length) 4719 const char *buf, size_t length)
4693{ 4720{
4694 if (buf[0] == '1') { 4721 if (buf[0] == '1')
4695 int rc = kmem_cache_shrink(s); 4722 kmem_cache_shrink(s);
4696 4723 else
4697 if (rc)
4698 return rc;
4699 } else
4700 return -EINVAL; 4724 return -EINVAL;
4701 return length; 4725 return length;
4702} 4726}
@@ -4920,7 +4944,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
4920 err = attribute->store(s, buf, len); 4944 err = attribute->store(s, buf, len);
4921#ifdef CONFIG_MEMCG_KMEM 4945#ifdef CONFIG_MEMCG_KMEM
4922 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { 4946 if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
4923 int i; 4947 struct kmem_cache *c;
4924 4948
4925 mutex_lock(&slab_mutex); 4949 mutex_lock(&slab_mutex);
4926 if (s->max_attr_size < len) 4950 if (s->max_attr_size < len)
@@ -4943,11 +4967,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
4943 * directly either failed or succeeded, in which case we loop 4967 * directly either failed or succeeded, in which case we loop
4944 * through the descendants with best-effort propagation. 4968 * through the descendants with best-effort propagation.
4945 */ 4969 */
4946 for_each_memcg_cache_index(i) { 4970 for_each_memcg_cache(c, s)
4947 struct kmem_cache *c = cache_from_memcg_idx(s, i); 4971 attribute->store(c, buf, len);
4948 if (c)
4949 attribute->store(c, buf, len);
4950 }
4951 mutex_unlock(&slab_mutex); 4972 mutex_unlock(&slab_mutex);
4952 } 4973 }
4953#endif 4974#endif
@@ -4964,7 +4985,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
4964 if (is_root_cache(s)) 4985 if (is_root_cache(s))
4965 return; 4986 return;
4966 4987
4967 root_cache = s->memcg_params->root_cache; 4988 root_cache = s->memcg_params.root_cache;
4968 4989
4969 /* 4990 /*
4970 * This mean this cache had no attribute written. Therefore, no point 4991 * This mean this cache had no attribute written. Therefore, no point
@@ -5044,7 +5065,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
5044{ 5065{
5045#ifdef CONFIG_MEMCG_KMEM 5066#ifdef CONFIG_MEMCG_KMEM
5046 if (!is_root_cache(s)) 5067 if (!is_root_cache(s))
5047 return s->memcg_params->root_cache->memcg_kset; 5068 return s->memcg_params.root_cache->memcg_kset;
5048#endif 5069#endif
5049 return slab_kset; 5070 return slab_kset;
5050} 5071}
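
The rewritten __kmem_cache_shrink() above no longer kmalloc()s oo_objects(s->max) list heads per node; it keeps one discard list plus SHRINK_PROMOTE_MAX on-stack promote buckets indexed by the number of free objects, then splices the buckets back from the highest index down so the fullest slabs land at the head of n->partial. A small userspace illustration of just the bucket selection (shrink_bucket() is an illustrative helper, not kernel code, and 64 objects per slab is an arbitrary example):

#include <stdio.h>

#define SHRINK_PROMOTE_MAX 32   /* same constant as in the hunk above */

/*
 * Returns -1 for "discard" (slab completely empty), an index into
 * promote[] for nearly-full slabs, or SHRINK_PROMOTE_MAX meaning
 * "leave the slab where it is on the partial list".
 */
static int shrink_bucket(int objects, int inuse)
{
        int free = objects - inuse;

        if (free == objects)
                return -1;                      /* empty: discard_slab() */
        if (free <= SHRINK_PROMOTE_MAX)
                return free - 1;                /* promote[free - 1] */
        return SHRINK_PROMOTE_MAX;              /* too sparse to promote */
}

int main(void)
{
        int inuse[] = { 0, 63, 60, 10 };        /* sample occupancy values */

        for (unsigned int i = 0; i < sizeof(inuse) / sizeof(inuse[0]); i++)
                printf("inuse=%2d -> bucket %d\n", inuse[i],
                       shrink_bucket(64, inuse[i]));
        return 0;
}

Because promote[0] holds the slabs with a single free object and is spliced last, those end up first on the partial list and are refilled before the emptier ones.
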
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 224dd298fdcd..5e8eadd71bac 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -232,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker);
232 232
233#define SHRINK_BATCH 128 233#define SHRINK_BATCH 128
234 234
235static unsigned long shrink_slabs(struct shrink_control *shrinkctl, 235static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
236 struct shrinker *shrinker, 236 struct shrinker *shrinker,
237 unsigned long nr_scanned, 237 unsigned long nr_scanned,
238 unsigned long nr_eligible) 238 unsigned long nr_eligible)
239{ 239{
240 unsigned long freed = 0; 240 unsigned long freed = 0;
241 unsigned long long delta; 241 unsigned long long delta;
@@ -344,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
344} 344}
345 345
346/** 346/**
347 * shrink_node_slabs - shrink slab caches of a given node 347 * shrink_slab - shrink slab caches
348 * @gfp_mask: allocation context 348 * @gfp_mask: allocation context
349 * @nid: node whose slab caches to target 349 * @nid: node whose slab caches to target
350 * @memcg: memory cgroup whose slab caches to target
350 * @nr_scanned: pressure numerator 351 * @nr_scanned: pressure numerator
351 * @nr_eligible: pressure denominator 352 * @nr_eligible: pressure denominator
352 * 353 *
@@ -355,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
355 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, 356 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
356 * unaware shrinkers will receive a node id of 0 instead. 357 * unaware shrinkers will receive a node id of 0 instead.
357 * 358 *
359 * @memcg specifies the memory cgroup to target. If it is not NULL,
360 * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
361 * objects from the memory cgroup specified. Otherwise all shrinkers
362 * are called, and memcg aware shrinkers are supposed to scan the
363 * global list then.
364 *
358 * @nr_scanned and @nr_eligible form a ratio that indicate how much of 365 * @nr_scanned and @nr_eligible form a ratio that indicate how much of
359 * the available objects should be scanned. Page reclaim for example 366 * the available objects should be scanned. Page reclaim for example
360 * passes the number of pages scanned and the number of pages on the 367 * passes the number of pages scanned and the number of pages on the
@@ -365,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
365 * 372 *
366 * Returns the number of reclaimed slab objects. 373 * Returns the number of reclaimed slab objects.
367 */ 374 */
368unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid, 375static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
369 unsigned long nr_scanned, 376 struct mem_cgroup *memcg,
370 unsigned long nr_eligible) 377 unsigned long nr_scanned,
378 unsigned long nr_eligible)
371{ 379{
372 struct shrinker *shrinker; 380 struct shrinker *shrinker;
373 unsigned long freed = 0; 381 unsigned long freed = 0;
374 382
383 if (memcg && !memcg_kmem_is_active(memcg))
384 return 0;
385
375 if (nr_scanned == 0) 386 if (nr_scanned == 0)
376 nr_scanned = SWAP_CLUSTER_MAX; 387 nr_scanned = SWAP_CLUSTER_MAX;
377 388
@@ -390,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
390 struct shrink_control sc = { 401 struct shrink_control sc = {
391 .gfp_mask = gfp_mask, 402 .gfp_mask = gfp_mask,
392 .nid = nid, 403 .nid = nid,
404 .memcg = memcg,
393 }; 405 };
394 406
407 if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
408 continue;
409
395 if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) 410 if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
396 sc.nid = 0; 411 sc.nid = 0;
397 412
398 freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible); 413 freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
399 } 414 }
400 415
401 up_read(&shrinker_rwsem); 416 up_read(&shrinker_rwsem);
@@ -404,6 +419,29 @@ out:
404 return freed; 419 return freed;
405} 420}
406 421
422void drop_slab_node(int nid)
423{
424 unsigned long freed;
425
426 do {
427 struct mem_cgroup *memcg = NULL;
428
429 freed = 0;
430 do {
431 freed += shrink_slab(GFP_KERNEL, nid, memcg,
432 1000, 1000);
433 } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
434 } while (freed > 10);
435}
436
437void drop_slab(void)
438{
439 int nid;
440
441 for_each_online_node(nid)
442 drop_slab_node(nid);
443}
444
407static inline int is_page_cache_freeable(struct page *page) 445static inline int is_page_cache_freeable(struct page *page)
408{ 446{
409 /* 447 /*
@@ -2276,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
2276static bool shrink_zone(struct zone *zone, struct scan_control *sc, 2314static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2277 bool is_classzone) 2315 bool is_classzone)
2278{ 2316{
2317 struct reclaim_state *reclaim_state = current->reclaim_state;
2279 unsigned long nr_reclaimed, nr_scanned; 2318 unsigned long nr_reclaimed, nr_scanned;
2280 bool reclaimable = false; 2319 bool reclaimable = false;
2281 2320
@@ -2294,6 +2333,7 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2294 memcg = mem_cgroup_iter(root, NULL, &reclaim); 2333 memcg = mem_cgroup_iter(root, NULL, &reclaim);
2295 do { 2334 do {
2296 unsigned long lru_pages; 2335 unsigned long lru_pages;
2336 unsigned long scanned;
2297 struct lruvec *lruvec; 2337 struct lruvec *lruvec;
2298 int swappiness; 2338 int swappiness;
2299 2339
@@ -2305,10 +2345,16 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2305 2345
2306 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 2346 lruvec = mem_cgroup_zone_lruvec(zone, memcg);
2307 swappiness = mem_cgroup_swappiness(memcg); 2347 swappiness = mem_cgroup_swappiness(memcg);
2348 scanned = sc->nr_scanned;
2308 2349
2309 shrink_lruvec(lruvec, swappiness, sc, &lru_pages); 2350 shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
2310 zone_lru_pages += lru_pages; 2351 zone_lru_pages += lru_pages;
2311 2352
2353 if (memcg && is_classzone)
2354 shrink_slab(sc->gfp_mask, zone_to_nid(zone),
2355 memcg, sc->nr_scanned - scanned,
2356 lru_pages);
2357
2312 /* 2358 /*
2313 * Direct reclaim and kswapd have to scan all memory 2359 * Direct reclaim and kswapd have to scan all memory
2314 * cgroups to fulfill the overall scan target for the 2360 * cgroups to fulfill the overall scan target for the
@@ -2330,19 +2376,14 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
2330 * Shrink the slab caches in the same proportion that 2376 * Shrink the slab caches in the same proportion that
2331 * the eligible LRU pages were scanned. 2377 * the eligible LRU pages were scanned.
2332 */ 2378 */
2333 if (global_reclaim(sc) && is_classzone) { 2379 if (global_reclaim(sc) && is_classzone)
2334 struct reclaim_state *reclaim_state; 2380 shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
2335 2381 sc->nr_scanned - nr_scanned,
2336 shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone), 2382 zone_lru_pages);
2337 sc->nr_scanned - nr_scanned, 2383
2338 zone_lru_pages); 2384 if (reclaim_state) {
2339 2385 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2340 reclaim_state = current->reclaim_state; 2386 reclaim_state->reclaimed_slab = 0;
2341 if (reclaim_state) {
2342 sc->nr_reclaimed +=
2343 reclaim_state->reclaimed_slab;
2344 reclaim_state->reclaimed_slab = 0;
2345 }
2346 } 2387 }
2347 2388
2348 vmpressure(sc->gfp_mask, sc->target_mem_cgroup, 2389 vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
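
shrink_zone() now calls shrink_slab() once per (zone, memcg) pair, and shrink_slab() skips any shrinker that did not declare SHRINKER_MEMCG_AWARE, passing the target cgroup in sc->memcg. The list_lru_shrink_*() helpers used by the workingset conversion below take the whole shrink_control, so they can route a walk to the right per-node (and, for memcg-aware users, per-memcg) sublist. A rough sketch of what a memcg-aware shrinker built on top of this looks like (my_lru, my_isolate and the my_* callbacks are illustrative; only the flags, the shrink_control fields and the list_lru_shrink_* helpers come from this series):

static struct list_lru my_lru;                  /* illustrative object LRU */

static enum lru_status my_isolate(struct list_head *item,
                                  struct list_lru_one *lru,
                                  spinlock_t *lru_lock, void *arg)
{
        list_lru_isolate(lru, item);
        /* a real shrinker would now tear down the object behind 'item' */
        return LRU_REMOVED;
}

static unsigned long my_count(struct shrinker *sh, struct shrink_control *sc)
{
        /* sc->nid and sc->memcg pick the per-node, per-memcg sublist */
        return list_lru_shrink_count(&my_lru, sc);
}

static unsigned long my_scan(struct shrinker *sh, struct shrink_control *sc)
{
        return list_lru_shrink_walk(&my_lru, sc, my_isolate, NULL);
}

static struct shrinker my_shrinker = {
        .count_objects  = my_count,
        .scan_objects   = my_scan,
        .seeks          = DEFAULT_SEEKS,
        .flags          = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};

The list_lru itself would be initialized with list_lru_init_memcg() (also added in this series) and the shrinker registered with register_shrinker() as before.
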
diff --git a/mm/workingset.c b/mm/workingset.c
index f7216fa7da27..aa017133744b 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
275 275
276 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ 276 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
277 local_irq_disable(); 277 local_irq_disable();
278 shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid); 278 shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
279 local_irq_enable(); 279 local_irq_enable();
280 280
281 pages = node_present_pages(sc->nid); 281 pages = node_present_pages(sc->nid);
@@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
302} 302}
303 303
304static enum lru_status shadow_lru_isolate(struct list_head *item, 304static enum lru_status shadow_lru_isolate(struct list_head *item,
305 struct list_lru_one *lru,
305 spinlock_t *lru_lock, 306 spinlock_t *lru_lock,
306 void *arg) 307 void *arg)
307{ 308{
@@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
332 goto out; 333 goto out;
333 } 334 }
334 335
335 list_del_init(item); 336 list_lru_isolate(lru, item);
336 spin_unlock(lru_lock); 337 spin_unlock(lru_lock);
337 338
338 /* 339 /*
@@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
376 377
377 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ 378 /* list_lru lock nests inside IRQ-safe mapping->tree_lock */
378 local_irq_disable(); 379 local_irq_disable();
379 ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid, 380 ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
380 shadow_lru_isolate, NULL, &sc->nr_to_scan); 381 shadow_lru_isolate, NULL);
381 local_irq_enable(); 382 local_irq_enable();
382 return ret; 383 return ret;
383} 384}
diff --git a/mm/zbud.c b/mm/zbud.c
index 4e387bea702e..2ee4e4520493 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = {
130 .evict = zbud_zpool_evict 130 .evict = zbud_zpool_evict
131}; 131};
132 132
133static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) 133static void *zbud_zpool_create(char *name, gfp_t gfp,
134 struct zpool_ops *zpool_ops)
134{ 135{
135 return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL); 136 return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
136} 137}
diff --git a/mm/zpool.c b/mm/zpool.c
index 739cdf0d183a..bacdab6e47de 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
129/** 129/**
130 * zpool_create_pool() - Create a new zpool 130 * zpool_create_pool() - Create a new zpool
131 * @type The type of the zpool to create (e.g. zbud, zsmalloc) 131 * @type The type of the zpool to create (e.g. zbud, zsmalloc)
132 * @name The name of the zpool (e.g. zram0, zswap)
132 * @gfp The GFP flags to use when allocating the pool. 133 * @gfp The GFP flags to use when allocating the pool.
133 * @ops The optional ops callback. 134 * @ops The optional ops callback.
134 * 135 *
@@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
140 * 141 *
141 * Returns: New zpool on success, NULL on failure. 142 * Returns: New zpool on success, NULL on failure.
142 */ 143 */
143struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops) 144struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
145 struct zpool_ops *ops)
144{ 146{
145 struct zpool_driver *driver; 147 struct zpool_driver *driver;
146 struct zpool *zpool; 148 struct zpool *zpool;
@@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
168 170
169 zpool->type = driver->type; 171 zpool->type = driver->type;
170 zpool->driver = driver; 172 zpool->driver = driver;
171 zpool->pool = driver->create(gfp, ops); 173 zpool->pool = driver->create(name, gfp, ops);
172 zpool->ops = ops; 174 zpool->ops = ops;
173 175
174 if (!zpool->pool) { 176 if (!zpool->pool) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b72403927aa4..0dec1fa5f656 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -91,6 +91,7 @@
91#include <linux/hardirq.h> 91#include <linux/hardirq.h>
92#include <linux/spinlock.h> 92#include <linux/spinlock.h>
93#include <linux/types.h> 93#include <linux/types.h>
94#include <linux/debugfs.h>
94#include <linux/zsmalloc.h> 95#include <linux/zsmalloc.h>
95#include <linux/zpool.h> 96#include <linux/zpool.h>
96 97
@@ -168,6 +169,22 @@ enum fullness_group {
168 ZS_FULL 169 ZS_FULL
169}; 170};
170 171
172enum zs_stat_type {
173 OBJ_ALLOCATED,
174 OBJ_USED,
175 NR_ZS_STAT_TYPE,
176};
177
178#ifdef CONFIG_ZSMALLOC_STAT
179
180static struct dentry *zs_stat_root;
181
182struct zs_size_stat {
183 unsigned long objs[NR_ZS_STAT_TYPE];
184};
185
186#endif
187
171/* 188/*
172 * number of size_classes 189 * number of size_classes
173 */ 190 */
@@ -200,6 +217,10 @@ struct size_class {
200 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */ 217 /* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
201 int pages_per_zspage; 218 int pages_per_zspage;
202 219
220#ifdef CONFIG_ZSMALLOC_STAT
221 struct zs_size_stat stats;
222#endif
223
203 spinlock_t lock; 224 spinlock_t lock;
204 225
205 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS]; 226 struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
@@ -217,10 +238,16 @@ struct link_free {
217}; 238};
218 239
219struct zs_pool { 240struct zs_pool {
241 char *name;
242
220 struct size_class **size_class; 243 struct size_class **size_class;
221 244
222 gfp_t flags; /* allocation flags used when growing pool */ 245 gfp_t flags; /* allocation flags used when growing pool */
223 atomic_long_t pages_allocated; 246 atomic_long_t pages_allocated;
247
248#ifdef CONFIG_ZSMALLOC_STAT
249 struct dentry *stat_dentry;
250#endif
224}; 251};
225 252
226/* 253/*
@@ -246,9 +273,9 @@ struct mapping_area {
246 273
247#ifdef CONFIG_ZPOOL 274#ifdef CONFIG_ZPOOL
248 275
249static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops) 276static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
250{ 277{
251 return zs_create_pool(gfp); 278 return zs_create_pool(name, gfp);
252} 279}
253 280
254static void zs_zpool_destroy(void *pool) 281static void zs_zpool_destroy(void *pool)
@@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
942 return true; 969 return true;
943} 970}
944 971
972#ifdef CONFIG_ZSMALLOC_STAT
973
974static inline void zs_stat_inc(struct size_class *class,
975 enum zs_stat_type type, unsigned long cnt)
976{
977 class->stats.objs[type] += cnt;
978}
979
980static inline void zs_stat_dec(struct size_class *class,
981 enum zs_stat_type type, unsigned long cnt)
982{
983 class->stats.objs[type] -= cnt;
984}
985
986static inline unsigned long zs_stat_get(struct size_class *class,
987 enum zs_stat_type type)
988{
989 return class->stats.objs[type];
990}
991
992static int __init zs_stat_init(void)
993{
994 if (!debugfs_initialized())
995 return -ENODEV;
996
997 zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
998 if (!zs_stat_root)
999 return -ENOMEM;
1000
1001 return 0;
1002}
1003
1004static void __exit zs_stat_exit(void)
1005{
1006 debugfs_remove_recursive(zs_stat_root);
1007}
1008
1009static int zs_stats_size_show(struct seq_file *s, void *v)
1010{
1011 int i;
1012 struct zs_pool *pool = s->private;
1013 struct size_class *class;
1014 int objs_per_zspage;
1015 unsigned long obj_allocated, obj_used, pages_used;
1016 unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
1017
1018 seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
1019 "obj_allocated", "obj_used", "pages_used");
1020
1021 for (i = 0; i < zs_size_classes; i++) {
1022 class = pool->size_class[i];
1023
1024 if (class->index != i)
1025 continue;
1026
1027 spin_lock(&class->lock);
1028 obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
1029 obj_used = zs_stat_get(class, OBJ_USED);
1030 spin_unlock(&class->lock);
1031
1032 objs_per_zspage = get_maxobj_per_zspage(class->size,
1033 class->pages_per_zspage);
1034 pages_used = obj_allocated / objs_per_zspage *
1035 class->pages_per_zspage;
1036
1037 seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
1038 class->size, obj_allocated, obj_used, pages_used);
1039
1040 total_objs += obj_allocated;
1041 total_used_objs += obj_used;
1042 total_pages += pages_used;
1043 }
1044
1045 seq_puts(s, "\n");
1046 seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
1047 total_objs, total_used_objs, total_pages);
1048
1049 return 0;
1050}
1051
1052static int zs_stats_size_open(struct inode *inode, struct file *file)
1053{
1054 return single_open(file, zs_stats_size_show, inode->i_private);
1055}
1056
1057static const struct file_operations zs_stat_size_ops = {
1058 .open = zs_stats_size_open,
1059 .read = seq_read,
1060 .llseek = seq_lseek,
1061 .release = single_release,
1062};
1063
1064static int zs_pool_stat_create(char *name, struct zs_pool *pool)
1065{
1066 struct dentry *entry;
1067
1068 if (!zs_stat_root)
1069 return -ENODEV;
1070
1071 entry = debugfs_create_dir(name, zs_stat_root);
1072 if (!entry) {
1073 pr_warn("debugfs dir <%s> creation failed\n", name);
1074 return -ENOMEM;
1075 }
1076 pool->stat_dentry = entry;
1077
1078 entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
1079 pool->stat_dentry, pool, &zs_stat_size_ops);
1080 if (!entry) {
1081 pr_warn("%s: debugfs file entry <%s> creation failed\n",
1082 name, "obj_in_classes");
1083 return -ENOMEM;
1084 }
1085
1086 return 0;
1087}
1088
1089static void zs_pool_stat_destroy(struct zs_pool *pool)
1090{
1091 debugfs_remove_recursive(pool->stat_dentry);
1092}
1093
1094#else /* CONFIG_ZSMALLOC_STAT */
1095
1096static inline void zs_stat_inc(struct size_class *class,
1097 enum zs_stat_type type, unsigned long cnt)
1098{
1099}
1100
1101static inline void zs_stat_dec(struct size_class *class,
1102 enum zs_stat_type type, unsigned long cnt)
1103{
1104}
1105
1106static inline unsigned long zs_stat_get(struct size_class *class,
1107 enum zs_stat_type type)
1108{
1109 return 0;
1110}
1111
1112static int __init zs_stat_init(void)
1113{
1114 return 0;
1115}
1116
1117static void __exit zs_stat_exit(void)
1118{
1119}
1120
1121static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
1122{
1123 return 0;
1124}
1125
1126static inline void zs_pool_stat_destroy(struct zs_pool *pool)
1127{
1128}
1129
1130#endif
1131
945unsigned long zs_get_total_pages(struct zs_pool *pool) 1132unsigned long zs_get_total_pages(struct zs_pool *pool)
946{ 1133{
947 return atomic_long_read(&pool->pages_allocated); 1134 return atomic_long_read(&pool->pages_allocated);
@@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1074 set_zspage_mapping(first_page, class->index, ZS_EMPTY); 1261 set_zspage_mapping(first_page, class->index, ZS_EMPTY);
1075 atomic_long_add(class->pages_per_zspage, 1262 atomic_long_add(class->pages_per_zspage,
1076 &pool->pages_allocated); 1263 &pool->pages_allocated);
1264
1077 spin_lock(&class->lock); 1265 spin_lock(&class->lock);
1266 zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1267 class->size, class->pages_per_zspage));
1078 } 1268 }
1079 1269
1080 obj = (unsigned long)first_page->freelist; 1270 obj = (unsigned long)first_page->freelist;
@@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
1088 kunmap_atomic(vaddr); 1278 kunmap_atomic(vaddr);
1089 1279
1090 first_page->inuse++; 1280 first_page->inuse++;
1281 zs_stat_inc(class, OBJ_USED, 1);
1091 /* Now move the zspage to another fullness group, if required */ 1282 /* Now move the zspage to another fullness group, if required */
1092 fix_fullness_group(pool, first_page); 1283 fix_fullness_group(pool, first_page);
1093 spin_unlock(&class->lock); 1284 spin_unlock(&class->lock);
@@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
1128 1319
1129 first_page->inuse--; 1320 first_page->inuse--;
1130 fullness = fix_fullness_group(pool, first_page); 1321 fullness = fix_fullness_group(pool, first_page);
1322
1323 zs_stat_dec(class, OBJ_USED, 1);
1324 if (fullness == ZS_EMPTY)
1325 zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
1326 class->size, class->pages_per_zspage));
1327
1131 spin_unlock(&class->lock); 1328 spin_unlock(&class->lock);
1132 1329
1133 if (fullness == ZS_EMPTY) { 1330 if (fullness == ZS_EMPTY) {
@@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free);
1148 * On success, a pointer to the newly created pool is returned, 1345 * On success, a pointer to the newly created pool is returned,
1149 * otherwise NULL. 1346 * otherwise NULL.
1150 */ 1347 */
1151struct zs_pool *zs_create_pool(gfp_t flags) 1348struct zs_pool *zs_create_pool(char *name, gfp_t flags)
1152{ 1349{
1153 int i; 1350 int i;
1154 struct zs_pool *pool; 1351 struct zs_pool *pool;
@@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags)
1158 if (!pool) 1355 if (!pool)
1159 return NULL; 1356 return NULL;
1160 1357
1358 pool->name = kstrdup(name, GFP_KERNEL);
1359 if (!pool->name) {
1360 kfree(pool);
1361 return NULL;
1362 }
1363
1161 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *), 1364 pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
1162 GFP_KERNEL); 1365 GFP_KERNEL);
1163 if (!pool->size_class) { 1366 if (!pool->size_class) {
1367 kfree(pool->name);
1164 kfree(pool); 1368 kfree(pool);
1165 return NULL; 1369 return NULL;
1166 } 1370 }
@@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags)
1210 1414
1211 pool->flags = flags; 1415 pool->flags = flags;
1212 1416
1417 if (zs_pool_stat_create(name, pool))
1418 goto err;
1419
1213 return pool; 1420 return pool;
1214 1421
1215err: 1422err:
@@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool)
1222{ 1429{
1223 int i; 1430 int i;
1224 1431
1432 zs_pool_stat_destroy(pool);
1433
1225 for (i = 0; i < zs_size_classes; i++) { 1434 for (i = 0; i < zs_size_classes; i++) {
1226 int fg; 1435 int fg;
1227 struct size_class *class = pool->size_class[i]; 1436 struct size_class *class = pool->size_class[i];
@@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool)
1242 } 1451 }
1243 1452
1244 kfree(pool->size_class); 1453 kfree(pool->size_class);
1454 kfree(pool->name);
1245 kfree(pool); 1455 kfree(pool);
1246} 1456}
1247EXPORT_SYMBOL_GPL(zs_destroy_pool); 1457EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -1250,17 +1460,30 @@ static int __init zs_init(void)
1250{ 1460{
1251 int ret = zs_register_cpu_notifier(); 1461 int ret = zs_register_cpu_notifier();
1252 1462
1253 if (ret) { 1463 if (ret)
1254 zs_unregister_cpu_notifier(); 1464 goto notifier_fail;
1255 return ret;
1256 }
1257 1465
1258 init_zs_size_classes(); 1466 init_zs_size_classes();
1259 1467
1260#ifdef CONFIG_ZPOOL 1468#ifdef CONFIG_ZPOOL
1261 zpool_register_driver(&zs_zpool_driver); 1469 zpool_register_driver(&zs_zpool_driver);
1262#endif 1470#endif
1471
1472 ret = zs_stat_init();
1473 if (ret) {
1474 pr_err("zs stat initialization failed\n");
1475 goto stat_fail;
1476 }
1263 return 0; 1477 return 0;
1478
1479stat_fail:
1480#ifdef CONFIG_ZPOOL
1481 zpool_unregister_driver(&zs_zpool_driver);
1482#endif
1483notifier_fail:
1484 zs_unregister_cpu_notifier();
1485
1486 return ret;
1264} 1487}
1265 1488
1266static void __exit zs_exit(void) 1489static void __exit zs_exit(void)
@@ -1269,6 +1492,8 @@ static void __exit zs_exit(void)
1269 zpool_unregister_driver(&zs_zpool_driver); 1492 zpool_unregister_driver(&zs_zpool_driver);
1270#endif 1493#endif
1271 zs_unregister_cpu_notifier(); 1494 zs_unregister_cpu_notifier();
1495
1496 zs_stat_exit();
1272} 1497}
1273 1498
1274module_init(zs_init); 1499module_init(zs_init);
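
zs_create_pool() now takes a name which, with CONFIG_ZSMALLOC_STAT, becomes a per-pool directory under the zsmalloc debugfs root holding the obj_in_classes file generated by zs_stats_size_show() above. A minimal usage sketch against the new signature (the demo function, pool name and GFP flags are illustrative; a caller like zswap goes through the zpool layer instead, as the next hunk shows):

static int zs_named_pool_demo(void)
{
        struct zs_pool *pool;
        unsigned long handle;

        /* "demo0" shows up as /sys/kernel/debug/zsmalloc/demo0/ */
        pool = zs_create_pool("demo0", GFP_KERNEL);
        if (!pool)
                return -ENOMEM;

        handle = zs_malloc(pool, 128);  /* OBJ_USED++; OBJ_ALLOCATED grows
                                         * whenever a fresh zspage is built */
        if (handle)
                zs_free(pool, handle);

        zs_destroy_pool(pool);          /* also removes the debugfs dir */
        return 0;
}
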
diff --git a/mm/zswap.c b/mm/zswap.c
index 0cfce9bc51e4..4249e82ff934 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -906,11 +906,12 @@ static int __init init_zswap(void)
906 906
907 pr_info("loading zswap\n"); 907 pr_info("loading zswap\n");
908 908
909 zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops); 909 zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
910 &zswap_zpool_ops);
910 if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) { 911 if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
911 pr_info("%s zpool not available\n", zswap_zpool_type); 912 pr_info("%s zpool not available\n", zswap_zpool_type);
912 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT; 913 zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
913 zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, 914 zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
914 &zswap_zpool_ops); 915 &zswap_zpool_ops);
915 } 916 }
916 if (!zswap_pool) { 917 if (!zswap_pool) {