aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Johannes Weiner <hannes@cmpxchg.org> 2014-12-10 18:44:52 -0500
committer: Linus Torvalds <torvalds@linux-foundation.org> 2014-12-10 20:41:09 -0500
commit: 1306a85aed3ec3db98945aafb7dfbe5648a1203c (patch)
tree: 63643e556c64118d963020758faf915325ba613c
parent: 22811c6bc3c764d8935383ad0ddd7a96b45d75dc (diff)
mm: embed the memcg pointer directly into struct page
Memory cgroups used to have 5 per-page pointers. To allow users to disable that amount of overhead during runtime, those pointers were allocated in a separate array, with a translation layer between them and struct page.

There is now only one page pointer remaining: the memcg pointer, that indicates which cgroup the page is associated with when charged. The complexity of runtime allocation and the runtime translation overhead is no longer justified to save that *potential* 0.19% of memory. With CONFIG_SLUB, page->mem_cgroup actually sits in the doubleword padding after the page->private member and doesn't even increase struct page, and then this patch actually saves space. Remaining users that care can still compile their kernels without CONFIG_MEMCG.

     text    data     bss      dec     hex  filename
  8828345 1725264  983040 11536649  b00909  vmlinux.old
  8827425 1725264  966656 11519345  afc571  vmlinux.new

[mhocko@suse.cz: update Documentation/cgroups/memory.txt]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Acked-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: "Kirill A. Shutemov" <kirill@shutemov.name>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: Tejun Heo <tj@kernel.org>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Acked-by: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  Documentation/cgroups/memory.txt | 5
-rw-r--r--  include/linux/memcontrol.h | 6
-rw-r--r--  include/linux/mm_types.h | 5
-rw-r--r--  include/linux/mmzone.h | 12
-rw-r--r--  include/linux/page_cgroup.h | 53
-rw-r--r--  init/main.c | 7
-rw-r--r--  mm/memcontrol.c | 124
-rw-r--r--  mm/page_alloc.c | 2
-rw-r--r--  mm/page_cgroup.c | 319
9 files changed, 46 insertions, 487 deletions
diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt
index 67613ff0270c..46b2b5080317 100644
--- a/Documentation/cgroups/memory.txt
+++ b/Documentation/cgroups/memory.txt
@@ -1,5 +1,10 @@
1Memory Resource Controller 1Memory Resource Controller
2 2
3NOTE: This document is hopelessly outdated and it asks for a complete
4 rewrite. It still contains a useful information so we are keeping it
5 here but make sure to check the current code if you need a deeper
6 understanding.
7
3NOTE: The Memory Resource Controller has generically been referred to as the 8NOTE: The Memory Resource Controller has generically been referred to as the
4 memory controller in this document. Do not confuse memory controller 9 memory controller in this document. Do not confuse memory controller
5 used here with the memory controller that is used in hardware. 10 used here with the memory controller that is used in hardware.
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index de018766be45..c4d080875164 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -25,7 +25,6 @@
25#include <linux/jump_label.h> 25#include <linux/jump_label.h>
26 26
27struct mem_cgroup; 27struct mem_cgroup;
28struct page_cgroup;
29struct page; 28struct page;
30struct mm_struct; 29struct mm_struct;
31struct kmem_cache; 30struct kmem_cache;
@@ -466,8 +465,6 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
466 * memcg_kmem_uncharge_pages: uncharge pages from memcg 465 * memcg_kmem_uncharge_pages: uncharge pages from memcg
467 * @page: pointer to struct page being freed 466 * @page: pointer to struct page being freed
468 * @order: allocation order. 467 * @order: allocation order.
469 *
470 * there is no need to specify memcg here, since it is embedded in page_cgroup
471 */ 468 */
472static inline void 469static inline void
473memcg_kmem_uncharge_pages(struct page *page, int order) 470memcg_kmem_uncharge_pages(struct page *page, int order)
@@ -484,8 +481,7 @@ memcg_kmem_uncharge_pages(struct page *page, int order)
484 * 481 *
485 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or 482 * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
486 * failure of the allocation. if @page is NULL, this function will revert the 483 * failure of the allocation. if @page is NULL, this function will revert the
487 * charges. Otherwise, it will commit the memcg given by @memcg to the 484 * charges. Otherwise, it will commit @page to @memcg.
488 * corresponding page_cgroup.
489 */ 485 */
490static inline void 486static inline void
491memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) 487memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 004e9d17b47e..bf9f57529dcf 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -22,6 +22,7 @@
22#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1)) 22#define AT_VECTOR_SIZE (2*(AT_VECTOR_SIZE_ARCH + AT_VECTOR_SIZE_BASE + 1))
23 23
24struct address_space; 24struct address_space;
25struct mem_cgroup;
25 26
26#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) 27#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
27#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ 28#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \
@@ -167,6 +168,10 @@ struct page {
167 struct page *first_page; /* Compound tail pages */ 168 struct page *first_page; /* Compound tail pages */
168 }; 169 };
169 170
171#ifdef CONFIG_MEMCG
172 struct mem_cgroup *mem_cgroup;
173#endif
174
170 /* 175 /*
171 * On machines where all RAM is mapped into kernel address space, 176 * On machines where all RAM is mapped into kernel address space,
172 * we can simply calculate the virtual address. On machines with 177 * we can simply calculate the virtual address. On machines with
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ffe66e381c04..3879d7664dfc 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -722,9 +722,6 @@ typedef struct pglist_data {
722 int nr_zones; 722 int nr_zones;
723#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ 723#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
724 struct page *node_mem_map; 724 struct page *node_mem_map;
725#ifdef CONFIG_MEMCG
726 struct page_cgroup *node_page_cgroup;
727#endif
728#endif 725#endif
729#ifndef CONFIG_NO_BOOTMEM 726#ifndef CONFIG_NO_BOOTMEM
730 struct bootmem_data *bdata; 727 struct bootmem_data *bdata;
@@ -1078,7 +1075,6 @@ static inline unsigned long early_pfn_to_nid(unsigned long pfn)
1078#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) 1075#define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK)
1079 1076
1080struct page; 1077struct page;
1081struct page_cgroup;
1082struct mem_section { 1078struct mem_section {
1083 /* 1079 /*
1084 * This is, logically, a pointer to an array of struct 1080 * This is, logically, a pointer to an array of struct
@@ -1096,14 +1092,6 @@ struct mem_section {
1096 1092
1097 /* See declaration of similar field in struct zone */ 1093 /* See declaration of similar field in struct zone */
1098 unsigned long *pageblock_flags; 1094 unsigned long *pageblock_flags;
1099#ifdef CONFIG_MEMCG
1100 /*
1101 * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use
1102 * section. (see memcontrol.h/page_cgroup.h about this.)
1103 */
1104 struct page_cgroup *page_cgroup;
1105 unsigned long pad;
1106#endif
1107 /* 1095 /*
1108 * WARNING: mem_section must be a power-of-2 in size for the 1096 * WARNING: mem_section must be a power-of-2 in size for the
1109 * calculation and use of SECTION_ROOT_MASK to make sense. 1097 * calculation and use of SECTION_ROOT_MASK to make sense.
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h
index 1289be6b436c..65be35785c86 100644
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -1,59 +1,6 @@
1#ifndef __LINUX_PAGE_CGROUP_H 1#ifndef __LINUX_PAGE_CGROUP_H
2#define __LINUX_PAGE_CGROUP_H 2#define __LINUX_PAGE_CGROUP_H
3 3
4struct pglist_data;
5
6#ifdef CONFIG_MEMCG
7struct mem_cgroup;
8
9/*
10 * Page Cgroup can be considered as an extended mem_map.
11 * A page_cgroup page is associated with every page descriptor. The
12 * page_cgroup helps us identify information about the cgroup
13 * All page cgroups are allocated at boot or memory hotplug event,
14 * then the page cgroup for pfn always exists.
15 */
16struct page_cgroup {
17 struct mem_cgroup *mem_cgroup;
18};
19
20extern void pgdat_page_cgroup_init(struct pglist_data *pgdat);
21
22#ifdef CONFIG_SPARSEMEM
23static inline void page_cgroup_init_flatmem(void)
24{
25}
26extern void page_cgroup_init(void);
27#else
28extern void page_cgroup_init_flatmem(void);
29static inline void page_cgroup_init(void)
30{
31}
32#endif
33
34struct page_cgroup *lookup_page_cgroup(struct page *page);
35
36#else /* !CONFIG_MEMCG */
37struct page_cgroup;
38
39static inline void pgdat_page_cgroup_init(struct pglist_data *pgdat)
40{
41}
42
43static inline struct page_cgroup *lookup_page_cgroup(struct page *page)
44{
45 return NULL;
46}
47
48static inline void page_cgroup_init(void)
49{
50}
51
52static inline void page_cgroup_init_flatmem(void)
53{
54}
55#endif /* CONFIG_MEMCG */
56
57#include <linux/swap.h> 4#include <linux/swap.h>
58 5
59#ifdef CONFIG_MEMCG_SWAP 6#ifdef CONFIG_MEMCG_SWAP
diff --git a/init/main.c b/init/main.c
index 321d0ceb26d3..d2e4ead4891f 100644
--- a/init/main.c
+++ b/init/main.c
@@ -51,7 +51,6 @@
51#include <linux/mempolicy.h> 51#include <linux/mempolicy.h>
52#include <linux/key.h> 52#include <linux/key.h>
53#include <linux/buffer_head.h> 53#include <linux/buffer_head.h>
54#include <linux/page_cgroup.h>
55#include <linux/debug_locks.h> 54#include <linux/debug_locks.h>
56#include <linux/debugobjects.h> 55#include <linux/debugobjects.h>
57#include <linux/lockdep.h> 56#include <linux/lockdep.h>
@@ -485,11 +484,6 @@ void __init __weak thread_info_cache_init(void)
485 */ 484 */
486static void __init mm_init(void) 485static void __init mm_init(void)
487{ 486{
488 /*
489 * page_cgroup requires contiguous pages,
490 * bigger than MAX_ORDER unless SPARSEMEM.
491 */
492 page_cgroup_init_flatmem();
493 mem_init(); 487 mem_init();
494 kmem_cache_init(); 488 kmem_cache_init();
495 percpu_init_late(); 489 percpu_init_late();
@@ -627,7 +621,6 @@ asmlinkage __visible void __init start_kernel(void)
627 initrd_start = 0; 621 initrd_start = 0;
628 } 622 }
629#endif 623#endif
630 page_cgroup_init();
631 debug_objects_mem_init(); 624 debug_objects_mem_init();
632 kmemleak_init(); 625 kmemleak_init();
633 setup_per_cpu_pageset(); 626 setup_per_cpu_pageset();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 78cb3b05a9fa..b864067791dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1274,7 +1274,6 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1274{ 1274{
1275 struct mem_cgroup_per_zone *mz; 1275 struct mem_cgroup_per_zone *mz;
1276 struct mem_cgroup *memcg; 1276 struct mem_cgroup *memcg;
1277 struct page_cgroup *pc;
1278 struct lruvec *lruvec; 1277 struct lruvec *lruvec;
1279 1278
1280 if (mem_cgroup_disabled()) { 1279 if (mem_cgroup_disabled()) {
@@ -1282,8 +1281,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1282 goto out; 1281 goto out;
1283 } 1282 }
1284 1283
1285 pc = lookup_page_cgroup(page); 1284 memcg = page->mem_cgroup;
1286 memcg = pc->mem_cgroup;
1287 /* 1285 /*
1288 * Swapcache readahead pages are added to the LRU - and 1286 * Swapcache readahead pages are added to the LRU - and
1289 * possibly migrated - before they are charged. 1287 * possibly migrated - before they are charged.
@@ -2020,16 +2018,13 @@ struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
2020 unsigned long *flags) 2018 unsigned long *flags)
2021{ 2019{
2022 struct mem_cgroup *memcg; 2020 struct mem_cgroup *memcg;
2023 struct page_cgroup *pc;
2024 2021
2025 rcu_read_lock(); 2022 rcu_read_lock();
2026 2023
2027 if (mem_cgroup_disabled()) 2024 if (mem_cgroup_disabled())
2028 return NULL; 2025 return NULL;
2029
2030 pc = lookup_page_cgroup(page);
2031again: 2026again:
2032 memcg = pc->mem_cgroup; 2027 memcg = page->mem_cgroup;
2033 if (unlikely(!memcg)) 2028 if (unlikely(!memcg))
2034 return NULL; 2029 return NULL;
2035 2030
@@ -2038,7 +2033,7 @@ again:
2038 return memcg; 2033 return memcg;
2039 2034
2040 spin_lock_irqsave(&memcg->move_lock, *flags); 2035 spin_lock_irqsave(&memcg->move_lock, *flags);
2041 if (memcg != pc->mem_cgroup) { 2036 if (memcg != page->mem_cgroup) {
2042 spin_unlock_irqrestore(&memcg->move_lock, *flags); 2037 spin_unlock_irqrestore(&memcg->move_lock, *flags);
2043 goto again; 2038 goto again;
2044 } 2039 }
@@ -2405,15 +2400,12 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
2405struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2400struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
2406{ 2401{
2407 struct mem_cgroup *memcg; 2402 struct mem_cgroup *memcg;
2408 struct page_cgroup *pc;
2409 unsigned short id; 2403 unsigned short id;
2410 swp_entry_t ent; 2404 swp_entry_t ent;
2411 2405
2412 VM_BUG_ON_PAGE(!PageLocked(page), page); 2406 VM_BUG_ON_PAGE(!PageLocked(page), page);
2413 2407
2414 pc = lookup_page_cgroup(page); 2408 memcg = page->mem_cgroup;
2415 memcg = pc->mem_cgroup;
2416
2417 if (memcg) { 2409 if (memcg) {
2418 if (!css_tryget_online(&memcg->css)) 2410 if (!css_tryget_online(&memcg->css))
2419 memcg = NULL; 2411 memcg = NULL;
@@ -2463,10 +2455,9 @@ static void unlock_page_lru(struct page *page, int isolated)
2463static void commit_charge(struct page *page, struct mem_cgroup *memcg, 2455static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2464 bool lrucare) 2456 bool lrucare)
2465{ 2457{
2466 struct page_cgroup *pc = lookup_page_cgroup(page);
2467 int isolated; 2458 int isolated;
2468 2459
2469 VM_BUG_ON_PAGE(pc->mem_cgroup, page); 2460 VM_BUG_ON_PAGE(page->mem_cgroup, page);
2470 2461
2471 /* 2462 /*
2472 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2463 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
@@ -2477,7 +2468,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2477 2468
2478 /* 2469 /*
2479 * Nobody should be changing or seriously looking at 2470 * Nobody should be changing or seriously looking at
2480 * pc->mem_cgroup at this point: 2471 * page->mem_cgroup at this point:
2481 * 2472 *
2482 * - the page is uncharged 2473 * - the page is uncharged
2483 * 2474 *
@@ -2489,7 +2480,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2489 * - a page cache insertion, a swapin fault, or a migration 2480 * - a page cache insertion, a swapin fault, or a migration
2490 * have the page locked 2481 * have the page locked
2491 */ 2482 */
2492 pc->mem_cgroup = memcg; 2483 page->mem_cgroup = memcg;
2493 2484
2494 if (lrucare) 2485 if (lrucare)
2495 unlock_page_lru(page, isolated); 2486 unlock_page_lru(page, isolated);
@@ -2972,8 +2963,6 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
2972void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 2963void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
2973 int order) 2964 int order)
2974{ 2965{
2975 struct page_cgroup *pc;
2976
2977 VM_BUG_ON(mem_cgroup_is_root(memcg)); 2966 VM_BUG_ON(mem_cgroup_is_root(memcg));
2978 2967
2979 /* The page allocation failed. Revert */ 2968 /* The page allocation failed. Revert */
@@ -2981,14 +2970,12 @@ void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
2981 memcg_uncharge_kmem(memcg, 1 << order); 2970 memcg_uncharge_kmem(memcg, 1 << order);
2982 return; 2971 return;
2983 } 2972 }
2984 pc = lookup_page_cgroup(page); 2973 page->mem_cgroup = memcg;
2985 pc->mem_cgroup = memcg;
2986} 2974}
2987 2975
2988void __memcg_kmem_uncharge_pages(struct page *page, int order) 2976void __memcg_kmem_uncharge_pages(struct page *page, int order)
2989{ 2977{
2990 struct page_cgroup *pc = lookup_page_cgroup(page); 2978 struct mem_cgroup *memcg = page->mem_cgroup;
2991 struct mem_cgroup *memcg = pc->mem_cgroup;
2992 2979
2993 if (!memcg) 2980 if (!memcg)
2994 return; 2981 return;
@@ -2996,7 +2983,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
2996 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); 2983 VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2997 2984
2998 memcg_uncharge_kmem(memcg, 1 << order); 2985 memcg_uncharge_kmem(memcg, 1 << order);
2999 pc->mem_cgroup = NULL; 2986 page->mem_cgroup = NULL;
3000} 2987}
3001#else 2988#else
3002static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) 2989static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
@@ -3014,16 +3001,15 @@ static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
3014 */ 3001 */
3015void mem_cgroup_split_huge_fixup(struct page *head) 3002void mem_cgroup_split_huge_fixup(struct page *head)
3016{ 3003{
3017 struct page_cgroup *pc = lookup_page_cgroup(head);
3018 int i; 3004 int i;
3019 3005
3020 if (mem_cgroup_disabled()) 3006 if (mem_cgroup_disabled())
3021 return; 3007 return;
3022 3008
3023 for (i = 1; i < HPAGE_PMD_NR; i++) 3009 for (i = 1; i < HPAGE_PMD_NR; i++)
3024 pc[i].mem_cgroup = pc[0].mem_cgroup; 3010 head[i].mem_cgroup = head->mem_cgroup;
3025 3011
3026 __this_cpu_sub(pc[0].mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3012 __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
3027 HPAGE_PMD_NR); 3013 HPAGE_PMD_NR);
3028} 3014}
3029#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3015#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3032,7 +3018,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3032 * mem_cgroup_move_account - move account of the page 3018 * mem_cgroup_move_account - move account of the page
3033 * @page: the page 3019 * @page: the page
3034 * @nr_pages: number of regular pages (>1 for huge pages) 3020 * @nr_pages: number of regular pages (>1 for huge pages)
3035 * @pc: page_cgroup of the page.
3036 * @from: mem_cgroup which the page is moved from. 3021 * @from: mem_cgroup which the page is moved from.
3037 * @to: mem_cgroup which the page is moved to. @from != @to. 3022 * @to: mem_cgroup which the page is moved to. @from != @to.
3038 * 3023 *
@@ -3045,7 +3030,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
3045 */ 3030 */
3046static int mem_cgroup_move_account(struct page *page, 3031static int mem_cgroup_move_account(struct page *page,
3047 unsigned int nr_pages, 3032 unsigned int nr_pages,
3048 struct page_cgroup *pc,
3049 struct mem_cgroup *from, 3033 struct mem_cgroup *from,
3050 struct mem_cgroup *to) 3034 struct mem_cgroup *to)
3051{ 3035{
@@ -3065,7 +3049,7 @@ static int mem_cgroup_move_account(struct page *page,
3065 goto out; 3049 goto out;
3066 3050
3067 /* 3051 /*
3068 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup 3052 * Prevent mem_cgroup_migrate() from looking at page->mem_cgroup
3069 * of its source page while we change it: page migration takes 3053 * of its source page while we change it: page migration takes
3070 * both pages off the LRU, but page cache replacement doesn't. 3054 * both pages off the LRU, but page cache replacement doesn't.
3071 */ 3055 */
@@ -3073,7 +3057,7 @@ static int mem_cgroup_move_account(struct page *page,
3073 goto out; 3057 goto out;
3074 3058
3075 ret = -EINVAL; 3059 ret = -EINVAL;
3076 if (pc->mem_cgroup != from) 3060 if (page->mem_cgroup != from)
3077 goto out_unlock; 3061 goto out_unlock;
3078 3062
3079 spin_lock_irqsave(&from->move_lock, flags); 3063 spin_lock_irqsave(&from->move_lock, flags);
@@ -3093,13 +3077,13 @@ static int mem_cgroup_move_account(struct page *page,
3093 } 3077 }
3094 3078
3095 /* 3079 /*
3096 * It is safe to change pc->mem_cgroup here because the page 3080 * It is safe to change page->mem_cgroup here because the page
3097 * is referenced, charged, and isolated - we can't race with 3081 * is referenced, charged, and isolated - we can't race with
3098 * uncharging, charging, migration, or LRU putback. 3082 * uncharging, charging, migration, or LRU putback.
3099 */ 3083 */
3100 3084
3101 /* caller should have done css_get */ 3085 /* caller should have done css_get */
3102 pc->mem_cgroup = to; 3086 page->mem_cgroup = to;
3103 spin_unlock_irqrestore(&from->move_lock, flags); 3087 spin_unlock_irqrestore(&from->move_lock, flags);
3104 3088
3105 ret = 0; 3089 ret = 0;
@@ -3174,36 +3158,17 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3174#endif 3158#endif
3175 3159
3176#ifdef CONFIG_DEBUG_VM 3160#ifdef CONFIG_DEBUG_VM
3177static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
3178{
3179 struct page_cgroup *pc;
3180
3181 pc = lookup_page_cgroup(page);
3182 /*
3183 * Can be NULL while feeding pages into the page allocator for
3184 * the first time, i.e. during boot or memory hotplug;
3185 * or when mem_cgroup_disabled().
3186 */
3187 if (likely(pc) && pc->mem_cgroup)
3188 return pc;
3189 return NULL;
3190}
3191
3192bool mem_cgroup_bad_page_check(struct page *page) 3161bool mem_cgroup_bad_page_check(struct page *page)
3193{ 3162{
3194 if (mem_cgroup_disabled()) 3163 if (mem_cgroup_disabled())
3195 return false; 3164 return false;
3196 3165
3197 return lookup_page_cgroup_used(page) != NULL; 3166 return page->mem_cgroup != NULL;
3198} 3167}
3199 3168
3200void mem_cgroup_print_bad_page(struct page *page) 3169void mem_cgroup_print_bad_page(struct page *page)
3201{ 3170{
3202 struct page_cgroup *pc; 3171 pr_alert("page->mem_cgroup:%p\n", page->mem_cgroup);
3203
3204 pc = lookup_page_cgroup_used(page);
3205 if (pc)
3206 pr_alert("pc:%p pc->mem_cgroup:%p\n", pc, pc->mem_cgroup);
3207} 3172}
3208#endif 3173#endif
3209 3174
@@ -5123,7 +5088,6 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5123 unsigned long addr, pte_t ptent, union mc_target *target) 5088 unsigned long addr, pte_t ptent, union mc_target *target)
5124{ 5089{
5125 struct page *page = NULL; 5090 struct page *page = NULL;
5126 struct page_cgroup *pc;
5127 enum mc_target_type ret = MC_TARGET_NONE; 5091 enum mc_target_type ret = MC_TARGET_NONE;
5128 swp_entry_t ent = { .val = 0 }; 5092 swp_entry_t ent = { .val = 0 };
5129 5093
@@ -5137,13 +5101,12 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
5137 if (!page && !ent.val) 5101 if (!page && !ent.val)
5138 return ret; 5102 return ret;
5139 if (page) { 5103 if (page) {
5140 pc = lookup_page_cgroup(page);
5141 /* 5104 /*
5142 * Do only loose check w/o serialization. 5105 * Do only loose check w/o serialization.
5143 * mem_cgroup_move_account() checks the pc is valid or 5106 * mem_cgroup_move_account() checks the page is valid or
5144 * not under LRU exclusion. 5107 * not under LRU exclusion.
5145 */ 5108 */
5146 if (pc->mem_cgroup == mc.from) { 5109 if (page->mem_cgroup == mc.from) {
5147 ret = MC_TARGET_PAGE; 5110 ret = MC_TARGET_PAGE;
5148 if (target) 5111 if (target)
5149 target->page = page; 5112 target->page = page;
@@ -5171,15 +5134,13 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5171 unsigned long addr, pmd_t pmd, union mc_target *target) 5134 unsigned long addr, pmd_t pmd, union mc_target *target)
5172{ 5135{
5173 struct page *page = NULL; 5136 struct page *page = NULL;
5174 struct page_cgroup *pc;
5175 enum mc_target_type ret = MC_TARGET_NONE; 5137 enum mc_target_type ret = MC_TARGET_NONE;
5176 5138
5177 page = pmd_page(pmd); 5139 page = pmd_page(pmd);
5178 VM_BUG_ON_PAGE(!page || !PageHead(page), page); 5140 VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5179 if (!move_anon()) 5141 if (!move_anon())
5180 return ret; 5142 return ret;
5181 pc = lookup_page_cgroup(page); 5143 if (page->mem_cgroup == mc.from) {
5182 if (pc->mem_cgroup == mc.from) {
5183 ret = MC_TARGET_PAGE; 5144 ret = MC_TARGET_PAGE;
5184 if (target) { 5145 if (target) {
5185 get_page(page); 5146 get_page(page);
@@ -5378,7 +5339,6 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5378 enum mc_target_type target_type; 5339 enum mc_target_type target_type;
5379 union mc_target target; 5340 union mc_target target;
5380 struct page *page; 5341 struct page *page;
5381 struct page_cgroup *pc;
5382 5342
5383 /* 5343 /*
5384 * We don't take compound_lock() here but no race with splitting thp 5344 * We don't take compound_lock() here but no race with splitting thp
@@ -5399,9 +5359,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5399 if (target_type == MC_TARGET_PAGE) { 5359 if (target_type == MC_TARGET_PAGE) {
5400 page = target.page; 5360 page = target.page;
5401 if (!isolate_lru_page(page)) { 5361 if (!isolate_lru_page(page)) {
5402 pc = lookup_page_cgroup(page);
5403 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5362 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5404 pc, mc.from, mc.to)) { 5363 mc.from, mc.to)) {
5405 mc.precharge -= HPAGE_PMD_NR; 5364 mc.precharge -= HPAGE_PMD_NR;
5406 mc.moved_charge += HPAGE_PMD_NR; 5365 mc.moved_charge += HPAGE_PMD_NR;
5407 } 5366 }
@@ -5429,9 +5388,7 @@ retry:
5429 page = target.page; 5388 page = target.page;
5430 if (isolate_lru_page(page)) 5389 if (isolate_lru_page(page))
5431 goto put; 5390 goto put;
5432 pc = lookup_page_cgroup(page); 5391 if (!mem_cgroup_move_account(page, 1, mc.from, mc.to)) {
5433 if (!mem_cgroup_move_account(page, 1, pc,
5434 mc.from, mc.to)) {
5435 mc.precharge--; 5392 mc.precharge--;
5436 /* we uncharge from mc.from later. */ 5393 /* we uncharge from mc.from later. */
5437 mc.moved_charge++; 5394 mc.moved_charge++;
@@ -5619,7 +5576,6 @@ static void __init enable_swap_cgroup(void)
5619void mem_cgroup_swapout(struct page *page, swp_entry_t entry) 5576void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5620{ 5577{
5621 struct mem_cgroup *memcg; 5578 struct mem_cgroup *memcg;
5622 struct page_cgroup *pc;
5623 unsigned short oldid; 5579 unsigned short oldid;
5624 5580
5625 VM_BUG_ON_PAGE(PageLRU(page), page); 5581 VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5628,8 +5584,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5628 if (!do_swap_account) 5584 if (!do_swap_account)
5629 return; 5585 return;
5630 5586
5631 pc = lookup_page_cgroup(page); 5587 memcg = page->mem_cgroup;
5632 memcg = pc->mem_cgroup;
5633 5588
5634 /* Readahead page, never charged */ 5589 /* Readahead page, never charged */
5635 if (!memcg) 5590 if (!memcg)
@@ -5639,7 +5594,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
5639 VM_BUG_ON_PAGE(oldid, page); 5594 VM_BUG_ON_PAGE(oldid, page);
5640 mem_cgroup_swap_statistics(memcg, true); 5595 mem_cgroup_swap_statistics(memcg, true);
5641 5596
5642 pc->mem_cgroup = NULL; 5597 page->mem_cgroup = NULL;
5643 5598
5644 if (!mem_cgroup_is_root(memcg)) 5599 if (!mem_cgroup_is_root(memcg))
5645 page_counter_uncharge(&memcg->memory, 1); 5600 page_counter_uncharge(&memcg->memory, 1);
@@ -5706,7 +5661,6 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5706 goto out; 5661 goto out;
5707 5662
5708 if (PageSwapCache(page)) { 5663 if (PageSwapCache(page)) {
5709 struct page_cgroup *pc = lookup_page_cgroup(page);
5710 /* 5664 /*
5711 * Every swap fault against a single page tries to charge the 5665 * Every swap fault against a single page tries to charge the
5712 * page, bail as early as possible. shmem_unuse() encounters 5666 * page, bail as early as possible. shmem_unuse() encounters
@@ -5714,7 +5668,7 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5714 * the page lock, which serializes swap cache removal, which 5668 * the page lock, which serializes swap cache removal, which
5715 * in turn serializes uncharging. 5669 * in turn serializes uncharging.
5716 */ 5670 */
5717 if (pc->mem_cgroup) 5671 if (page->mem_cgroup)
5718 goto out; 5672 goto out;
5719 } 5673 }
5720 5674
@@ -5867,7 +5821,6 @@ static void uncharge_list(struct list_head *page_list)
5867 next = page_list->next; 5821 next = page_list->next;
5868 do { 5822 do {
5869 unsigned int nr_pages = 1; 5823 unsigned int nr_pages = 1;
5870 struct page_cgroup *pc;
5871 5824
5872 page = list_entry(next, struct page, lru); 5825 page = list_entry(next, struct page, lru);
5873 next = page->lru.next; 5826 next = page->lru.next;
@@ -5875,23 +5828,22 @@ static void uncharge_list(struct list_head *page_list)
5875 VM_BUG_ON_PAGE(PageLRU(page), page); 5828 VM_BUG_ON_PAGE(PageLRU(page), page);
5876 VM_BUG_ON_PAGE(page_count(page), page); 5829 VM_BUG_ON_PAGE(page_count(page), page);
5877 5830
5878 pc = lookup_page_cgroup(page); 5831 if (!page->mem_cgroup)
5879 if (!pc->mem_cgroup)
5880 continue; 5832 continue;
5881 5833
5882 /* 5834 /*
5883 * Nobody should be changing or seriously looking at 5835 * Nobody should be changing or seriously looking at
5884 * pc->mem_cgroup at this point, we have fully 5836 * page->mem_cgroup at this point, we have fully
5885 * exclusive access to the page. 5837 * exclusive access to the page.
5886 */ 5838 */
5887 5839
5888 if (memcg != pc->mem_cgroup) { 5840 if (memcg != page->mem_cgroup) {
5889 if (memcg) { 5841 if (memcg) {
5890 uncharge_batch(memcg, pgpgout, nr_anon, nr_file, 5842 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5891 nr_huge, page); 5843 nr_huge, page);
5892 pgpgout = nr_anon = nr_file = nr_huge = 0; 5844 pgpgout = nr_anon = nr_file = nr_huge = 0;
5893 } 5845 }
5894 memcg = pc->mem_cgroup; 5846 memcg = page->mem_cgroup;
5895 } 5847 }
5896 5848
5897 if (PageTransHuge(page)) { 5849 if (PageTransHuge(page)) {
@@ -5905,7 +5857,7 @@ static void uncharge_list(struct list_head *page_list)
5905 else 5857 else
5906 nr_file += nr_pages; 5858 nr_file += nr_pages;
5907 5859
5908 pc->mem_cgroup = NULL; 5860 page->mem_cgroup = NULL;
5909 5861
5910 pgpgout++; 5862 pgpgout++;
5911 } while (next != page_list); 5863 } while (next != page_list);
@@ -5924,14 +5876,11 @@ static void uncharge_list(struct list_head *page_list)
5924 */ 5876 */
5925void mem_cgroup_uncharge(struct page *page) 5877void mem_cgroup_uncharge(struct page *page)
5926{ 5878{
5927 struct page_cgroup *pc;
5928
5929 if (mem_cgroup_disabled()) 5879 if (mem_cgroup_disabled())
5930 return; 5880 return;
5931 5881
5932 /* Don't touch page->lru of any random page, pre-check: */ 5882 /* Don't touch page->lru of any random page, pre-check: */
5933 pc = lookup_page_cgroup(page); 5883 if (!page->mem_cgroup)
5934 if (!pc->mem_cgroup)
5935 return; 5884 return;
5936 5885
5937 INIT_LIST_HEAD(&page->lru); 5886 INIT_LIST_HEAD(&page->lru);
@@ -5968,7 +5917,6 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
5968 bool lrucare) 5917 bool lrucare)
5969{ 5918{
5970 struct mem_cgroup *memcg; 5919 struct mem_cgroup *memcg;
5971 struct page_cgroup *pc;
5972 int isolated; 5920 int isolated;
5973 5921
5974 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); 5922 VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
@@ -5983,8 +5931,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
5983 return; 5931 return;
5984 5932
5985 /* Page cache replacement: new page already charged? */ 5933 /* Page cache replacement: new page already charged? */
5986 pc = lookup_page_cgroup(newpage); 5934 if (newpage->mem_cgroup)
5987 if (pc->mem_cgroup)
5988 return; 5935 return;
5989 5936
5990 /* 5937 /*
@@ -5993,15 +5940,14 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
5993 * uncharged page when the PFN walker finds a page that 5940 * uncharged page when the PFN walker finds a page that
5994 * reclaim just put back on the LRU but has not released yet. 5941 * reclaim just put back on the LRU but has not released yet.
5995 */ 5942 */
5996 pc = lookup_page_cgroup(oldpage); 5943 memcg = oldpage->mem_cgroup;
5997 memcg = pc->mem_cgroup;
5998 if (!memcg) 5944 if (!memcg)
5999 return; 5945 return;
6000 5946
6001 if (lrucare) 5947 if (lrucare)
6002 lock_page_lru(oldpage, &isolated); 5948 lock_page_lru(oldpage, &isolated);
6003 5949
6004 pc->mem_cgroup = NULL; 5950 oldpage->mem_cgroup = NULL;
6005 5951
6006 if (lrucare) 5952 if (lrucare)
6007 unlock_page_lru(oldpage, isolated); 5953 unlock_page_lru(oldpage, isolated);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 97b6966816e5..22cfdeffbf69 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -48,7 +48,6 @@
48#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
49#include <linux/fault-inject.h> 49#include <linux/fault-inject.h>
50#include <linux/page-isolation.h> 50#include <linux/page-isolation.h>
51#include <linux/page_cgroup.h>
52#include <linux/debugobjects.h> 51#include <linux/debugobjects.h>
53#include <linux/kmemleak.h> 52#include <linux/kmemleak.h>
54#include <linux/compaction.h> 53#include <linux/compaction.h>
@@ -4853,7 +4852,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
4853#endif 4852#endif
4854 init_waitqueue_head(&pgdat->kswapd_wait); 4853 init_waitqueue_head(&pgdat->kswapd_wait);
4855 init_waitqueue_head(&pgdat->pfmemalloc_wait); 4854 init_waitqueue_head(&pgdat->pfmemalloc_wait);
4856 pgdat_page_cgroup_init(pgdat);
4857 4855
4858 for (j = 0; j < MAX_NR_ZONES; j++) { 4856 for (j = 0; j < MAX_NR_ZONES; j++) {
4859 struct zone *zone = pgdat->node_zones + j; 4857 struct zone *zone = pgdat->node_zones + j;
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 5331c2bd85a2..f0f31c1d4d0c 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -1,326 +1,7 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/bit_spinlock.h>
5#include <linux/page_cgroup.h> 2#include <linux/page_cgroup.h>
6#include <linux/hash.h>
7#include <linux/slab.h>
8#include <linux/memory.h>
9#include <linux/vmalloc.h> 3#include <linux/vmalloc.h>
10#include <linux/cgroup.h>
11#include <linux/swapops.h> 4#include <linux/swapops.h>
12#include <linux/kmemleak.h>
13
14static unsigned long total_usage;
15
16#if !defined(CONFIG_SPARSEMEM)
17
18
19void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
20{
21 pgdat->node_page_cgroup = NULL;
22}
23
24struct page_cgroup *lookup_page_cgroup(struct page *page)
25{
26 unsigned long pfn = page_to_pfn(page);
27 unsigned long offset;
28 struct page_cgroup *base;
29
30 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
31#ifdef CONFIG_DEBUG_VM
32 /*
33 * The sanity checks the page allocator does upon freeing a
34 * page can reach here before the page_cgroup arrays are
35 * allocated when feeding a range of pages to the allocator
36 * for the first time during bootup or memory hotplug.
37 */
38 if (unlikely(!base))
39 return NULL;
40#endif
41 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
42 return base + offset;
43}
44
45static int __init alloc_node_page_cgroup(int nid)
46{
47 struct page_cgroup *base;
48 unsigned long table_size;
49 unsigned long nr_pages;
50
51 nr_pages = NODE_DATA(nid)->node_spanned_pages;
52 if (!nr_pages)
53 return 0;
54
55 table_size = sizeof(struct page_cgroup) * nr_pages;
56
57 base = memblock_virt_alloc_try_nid_nopanic(
58 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
59 BOOTMEM_ALLOC_ACCESSIBLE, nid);
60 if (!base)
61 return -ENOMEM;
62 NODE_DATA(nid)->node_page_cgroup = base;
63 total_usage += table_size;
64 return 0;
65}
66
67void __init page_cgroup_init_flatmem(void)
68{
69
70 int nid, fail;
71
72 if (mem_cgroup_disabled())
73 return;
74
75 for_each_online_node(nid) {
76 fail = alloc_node_page_cgroup(nid);
77 if (fail)
78 goto fail;
79 }
80 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
81 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you"
82 " don't want memory cgroups\n");
83 return;
84fail:
85 printk(KERN_CRIT "allocation of page_cgroup failed.\n");
86 printk(KERN_CRIT "please try 'cgroup_disable=memory' boot option\n");
87 panic("Out of memory");
88}
89
90#else /* CONFIG_FLAT_NODE_MEM_MAP */
91
92struct page_cgroup *lookup_page_cgroup(struct page *page)
93{
94 unsigned long pfn = page_to_pfn(page);
95 struct mem_section *section = __pfn_to_section(pfn);
96#ifdef CONFIG_DEBUG_VM
97 /*
98 * The sanity checks the page allocator does upon freeing a
99 * page can reach here before the page_cgroup arrays are
100 * allocated when feeding a range of pages to the allocator
101 * for the first time during bootup or memory hotplug.
102 */
103 if (!section->page_cgroup)
104 return NULL;
105#endif
106 return section->page_cgroup + pfn;
107}
108
109static void *__meminit alloc_page_cgroup(size_t size, int nid)
110{
111 gfp_t flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN;
112 void *addr = NULL;
113
114 addr = alloc_pages_exact_nid(nid, size, flags);
115 if (addr) {
116 kmemleak_alloc(addr, size, 1, flags);
117 return addr;
118 }
119
120 if (node_state(nid, N_HIGH_MEMORY))
121 addr = vzalloc_node(size, nid);
122 else
123 addr = vzalloc(size);
124
125 return addr;
126}
127
128static int __meminit init_section_page_cgroup(unsigned long pfn, int nid)
129{
130 struct mem_section *section;
131 struct page_cgroup *base;
132 unsigned long table_size;
133
134 section = __pfn_to_section(pfn);
135
136 if (section->page_cgroup)
137 return 0;
138
139 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
140 base = alloc_page_cgroup(table_size, nid);
141
142 /*
143 * The value stored in section->page_cgroup is (base - pfn)
144 * and it does not point to the memory block allocated above,
145 * causing kmemleak false positives.
146 */
147 kmemleak_not_leak(base);
148
149 if (!base) {
150 printk(KERN_ERR "page cgroup allocation failure\n");
151 return -ENOMEM;
152 }
153
154 /*
155 * The passed "pfn" may not be aligned to SECTION. For the calculation
156 * we need to apply a mask.
157 */
158 pfn &= PAGE_SECTION_MASK;
159 section->page_cgroup = base - pfn;
160 total_usage += table_size;
161 return 0;
162}
163#ifdef CONFIG_MEMORY_HOTPLUG
164static void free_page_cgroup(void *addr)
165{
166 if (is_vmalloc_addr(addr)) {
167 vfree(addr);
168 } else {
169 struct page *page = virt_to_page(addr);
170 size_t table_size =
171 sizeof(struct page_cgroup) * PAGES_PER_SECTION;
172
173 BUG_ON(PageReserved(page));
174 kmemleak_free(addr);
175 free_pages_exact(addr, table_size);
176 }
177}
178
179static void __free_page_cgroup(unsigned long pfn)
180{
181 struct mem_section *ms;
182 struct page_cgroup *base;
183
184 ms = __pfn_to_section(pfn);
185 if (!ms || !ms->page_cgroup)
186 return;
187 base = ms->page_cgroup + pfn;
188 free_page_cgroup(base);
189 ms->page_cgroup = NULL;
190}
191
192static int __meminit online_page_cgroup(unsigned long start_pfn,
193 unsigned long nr_pages,
194 int nid)
195{
196 unsigned long start, end, pfn;
197 int fail = 0;
198
199 start = SECTION_ALIGN_DOWN(start_pfn);
200 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
201
202 if (nid == -1) {
203 /*
204 * In this case, "nid" already exists and contains valid memory.
205 * "start_pfn" passed to us is a pfn which is an arg for
206 * online__pages(), and start_pfn should exist.
207 */
208 nid = pfn_to_nid(start_pfn);
209 VM_BUG_ON(!node_state(nid, N_ONLINE));
210 }
211
212 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
213 if (!pfn_present(pfn))
214 continue;
215 fail = init_section_page_cgroup(pfn, nid);
216 }
217 if (!fail)
218 return 0;
219
220 /* rollback */
221 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
222 __free_page_cgroup(pfn);
223
224 return -ENOMEM;
225}
226
227static int __meminit offline_page_cgroup(unsigned long start_pfn,
228 unsigned long nr_pages, int nid)
229{
230 unsigned long start, end, pfn;
231
232 start = SECTION_ALIGN_DOWN(start_pfn);
233 end = SECTION_ALIGN_UP(start_pfn + nr_pages);
234
235 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
236 __free_page_cgroup(pfn);
237 return 0;
238
239}
240
241static int __meminit page_cgroup_callback(struct notifier_block *self,
242 unsigned long action, void *arg)
243{
244 struct memory_notify *mn = arg;
245 int ret = 0;
246 switch (action) {
247 case MEM_GOING_ONLINE:
248 ret = online_page_cgroup(mn->start_pfn,
249 mn->nr_pages, mn->status_change_nid);
250 break;
251 case MEM_OFFLINE:
252 offline_page_cgroup(mn->start_pfn,
253 mn->nr_pages, mn->status_change_nid);
254 break;
255 case MEM_CANCEL_ONLINE:
256 offline_page_cgroup(mn->start_pfn,
257 mn->nr_pages, mn->status_change_nid);
258 break;
259 case MEM_GOING_OFFLINE:
260 break;
261 case MEM_ONLINE:
262 case MEM_CANCEL_OFFLINE:
263 break;
264 }
265
266 return notifier_from_errno(ret);
267}
268
269#endif
270
271void __init page_cgroup_init(void)
272{
273 unsigned long pfn;
274 int nid;
275
276 if (mem_cgroup_disabled())
277 return;
278
279 for_each_node_state(nid, N_MEMORY) {
280 unsigned long start_pfn, end_pfn;
281
282 start_pfn = node_start_pfn(nid);
283 end_pfn = node_end_pfn(nid);
284 /*
285 * start_pfn and end_pfn may not be aligned to SECTION and the
286 * page->flags of out of node pages are not initialized. So we
287 * scan [start_pfn, the biggest section's pfn < end_pfn) here.
288 */
289 for (pfn = start_pfn;
290 pfn < end_pfn;
291 pfn = ALIGN(pfn + 1, PAGES_PER_SECTION)) {
292
293 if (!pfn_valid(pfn))
294 continue;
295 /*
296 * Nodes's pfns can be overlapping.
297 * We know some arch can have a nodes layout such as
298 * -------------pfn-------------->
299 * N0 | N1 | N2 | N0 | N1 | N2|....
300 */
301 if (pfn_to_nid(pfn) != nid)
302 continue;
303 if (init_section_page_cgroup(pfn, nid))
304 goto oom;
305 }
306 }
307 hotplug_memory_notifier(page_cgroup_callback, 0);
308 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
309 printk(KERN_INFO "please try 'cgroup_disable=memory' option if you "
310 "don't want memory cgroups\n");
311 return;
312oom:
313 printk(KERN_CRIT "try 'cgroup_disable=memory' boot option\n");
314 panic("Out of memory");
315}
316
317void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
318{
319 return;
320}
321
322#endif
323
324 5
325#ifdef CONFIG_MEMCG_SWAP 6#ifdef CONFIG_MEMCG_SWAP
326 7