Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 761
1 file changed, 328 insertions, 433 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7685d4a0b3ce..f72b5e52451a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,7 +59,7 @@
59 59
60struct cgroup_subsys mem_cgroup_subsys __read_mostly; 60struct cgroup_subsys mem_cgroup_subsys __read_mostly;
61#define MEM_CGROUP_RECLAIM_RETRIES 5 61#define MEM_CGROUP_RECLAIM_RETRIES 5
62struct mem_cgroup *root_mem_cgroup __read_mostly; 62static struct mem_cgroup *root_mem_cgroup __read_mostly;
63 63
64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 64#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 65/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -73,7 +73,7 @@ static int really_do_swap_account __initdata = 0;
73#endif 73#endif
74 74
75#else 75#else
76#define do_swap_account (0) 76#define do_swap_account 0
77#endif 77#endif
78 78
79 79
@@ -88,18 +88,31 @@ enum mem_cgroup_stat_index {
88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ 88 MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */
89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ 89 MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */
90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ 90 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
91 MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
92 MEM_CGROUP_STAT_NSTATS, 91 MEM_CGROUP_STAT_NSTATS,
93}; 92};
94 93
94static const char * const mem_cgroup_stat_names[] = {
95 "cache",
96 "rss",
97 "mapped_file",
98 "swap",
99};
100
95enum mem_cgroup_events_index { 101enum mem_cgroup_events_index {
96 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ 102 MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
97 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ 103 MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
98 MEM_CGROUP_EVENTS_COUNT, /* # of pages paged in/out */
99 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ 104 MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
100 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ 105 MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
101 MEM_CGROUP_EVENTS_NSTATS, 106 MEM_CGROUP_EVENTS_NSTATS,
102}; 107};
108
109static const char * const mem_cgroup_events_names[] = {
110 "pgpgin",
111 "pgpgout",
112 "pgfault",
113 "pgmajfault",
114};
115
103/* 116/*
104 * Per memcg event counter is incremented at every pagein/pageout. With THP, 117 * Per memcg event counter is incremented at every pagein/pageout. With THP,
105 * it will be incremated by the number of pages. This counter is used for 118 * it will be incremated by the number of pages. This counter is used for
@@ -112,13 +125,14 @@ enum mem_cgroup_events_target {
112 MEM_CGROUP_TARGET_NUMAINFO, 125 MEM_CGROUP_TARGET_NUMAINFO,
113 MEM_CGROUP_NTARGETS, 126 MEM_CGROUP_NTARGETS,
114}; 127};
115#define THRESHOLDS_EVENTS_TARGET (128) 128#define THRESHOLDS_EVENTS_TARGET 128
116#define SOFTLIMIT_EVENTS_TARGET (1024) 129#define SOFTLIMIT_EVENTS_TARGET 1024
117#define NUMAINFO_EVENTS_TARGET (1024) 130#define NUMAINFO_EVENTS_TARGET 1024
118 131
119struct mem_cgroup_stat_cpu { 132struct mem_cgroup_stat_cpu {
120 long count[MEM_CGROUP_STAT_NSTATS]; 133 long count[MEM_CGROUP_STAT_NSTATS];
121 unsigned long events[MEM_CGROUP_EVENTS_NSTATS]; 134 unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
135 unsigned long nr_page_events;
122 unsigned long targets[MEM_CGROUP_NTARGETS]; 136 unsigned long targets[MEM_CGROUP_NTARGETS];
123}; 137};
124 138
@@ -138,7 +152,6 @@ struct mem_cgroup_per_zone {
138 152
139 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1]; 153 struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];
140 154
141 struct zone_reclaim_stat reclaim_stat;
142 struct rb_node tree_node; /* RB tree node */ 155 struct rb_node tree_node; /* RB tree node */
143 unsigned long long usage_in_excess;/* Set to the value by which */ 156 unsigned long long usage_in_excess;/* Set to the value by which */
144 /* the soft limit is exceeded*/ 157 /* the soft limit is exceeded*/
@@ -182,7 +195,7 @@ struct mem_cgroup_threshold {
182 195
183/* For threshold */ 196/* For threshold */
184struct mem_cgroup_threshold_ary { 197struct mem_cgroup_threshold_ary {
185 /* An array index points to threshold just below usage. */ 198 /* An array index points to threshold just below or equal to usage. */
186 int current_threshold; 199 int current_threshold;
187 /* Size of entries[] */ 200 /* Size of entries[] */
188 unsigned int size; 201 unsigned int size;
@@ -245,8 +258,8 @@ struct mem_cgroup {
245 */ 258 */
246 struct rcu_head rcu_freeing; 259 struct rcu_head rcu_freeing;
247 /* 260 /*
248 * But when using vfree(), that cannot be done at 261 * We also need some space for a worker in deferred freeing.
249 * interrupt time, so we must then queue the work. 262 * By the time we call it, rcu_freeing is no longer in use.
250 */ 263 */
251 struct work_struct work_freeing; 264 struct work_struct work_freeing;
252 }; 265 };
@@ -305,7 +318,7 @@ struct mem_cgroup {
305 /* 318 /*
306 * percpu counter. 319 * percpu counter.
307 */ 320 */
308 struct mem_cgroup_stat_cpu *stat; 321 struct mem_cgroup_stat_cpu __percpu *stat;
309 /* 322 /*
310 * used when a cpu is offlined or other synchronizations 323 * used when a cpu is offlined or other synchronizations
311 * See mem_cgroup_read_stat(). 324 * See mem_cgroup_read_stat().
@@ -360,8 +373,8 @@ static bool move_file(void)
360 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft 373 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
361 * limit reclaim to prevent infinite loops, if they ever occur. 374 * limit reclaim to prevent infinite loops, if they ever occur.
362 */ 375 */
363#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) 376#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
364#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) 377#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
365 378
366enum charge_type { 379enum charge_type {
367 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 380 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
@@ -377,8 +390,8 @@ enum charge_type {
377#define _MEM (0) 390#define _MEM (0)
378#define _MEMSWAP (1) 391#define _MEMSWAP (1)
379#define _OOM_TYPE (2) 392#define _OOM_TYPE (2)
380#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) 393#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
381#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 394#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
382#define MEMFILE_ATTR(val) ((val) & 0xffff) 395#define MEMFILE_ATTR(val) ((val) & 0xffff)
383/* Used for OOM nofiier */ 396/* Used for OOM nofiier */
384#define OOM_CONTROL (0) 397#define OOM_CONTROL (0)
@@ -404,6 +417,7 @@ void sock_update_memcg(struct sock *sk)
404{ 417{
405 if (mem_cgroup_sockets_enabled) { 418 if (mem_cgroup_sockets_enabled) {
406 struct mem_cgroup *memcg; 419 struct mem_cgroup *memcg;
420 struct cg_proto *cg_proto;
407 421
408 BUG_ON(!sk->sk_prot->proto_cgroup); 422 BUG_ON(!sk->sk_prot->proto_cgroup);
409 423
@@ -423,9 +437,10 @@ void sock_update_memcg(struct sock *sk)
423 437
424 rcu_read_lock(); 438 rcu_read_lock();
425 memcg = mem_cgroup_from_task(current); 439 memcg = mem_cgroup_from_task(current);
426 if (!mem_cgroup_is_root(memcg)) { 440 cg_proto = sk->sk_prot->proto_cgroup(memcg);
441 if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
427 mem_cgroup_get(memcg); 442 mem_cgroup_get(memcg);
428 sk->sk_cgrp = sk->sk_prot->proto_cgroup(memcg); 443 sk->sk_cgrp = cg_proto;
429 } 444 }
430 rcu_read_unlock(); 445 rcu_read_unlock();
431 } 446 }
@@ -454,6 +469,19 @@ EXPORT_SYMBOL(tcp_proto_cgroup);
454#endif /* CONFIG_INET */ 469#endif /* CONFIG_INET */
455#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ 470#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */
456 471
472#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM)
473static void disarm_sock_keys(struct mem_cgroup *memcg)
474{
475 if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
476 return;
477 static_key_slow_dec(&memcg_socket_limit_enabled);
478}
479#else
480static void disarm_sock_keys(struct mem_cgroup *memcg)
481{
482}
483#endif
484
457static void drain_all_stock_async(struct mem_cgroup *memcg); 485static void drain_all_stock_async(struct mem_cgroup *memcg);
458 486
459static struct mem_cgroup_per_zone * 487static struct mem_cgroup_per_zone *
@@ -718,12 +746,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
718 nr_pages = -nr_pages; /* for event */ 746 nr_pages = -nr_pages; /* for event */
719 } 747 }
720 748
721 __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); 749 __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
722 750
723 preempt_enable(); 751 preempt_enable();
724} 752}
725 753
726unsigned long 754unsigned long
755mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
756{
757 struct mem_cgroup_per_zone *mz;
758
759 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
760 return mz->lru_size[lru];
761}
762
763static unsigned long
727mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, 764mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
728 unsigned int lru_mask) 765 unsigned int lru_mask)
729{ 766{
@@ -770,7 +807,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
770{ 807{
771 unsigned long val, next; 808 unsigned long val, next;
772 809
773 val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); 810 val = __this_cpu_read(memcg->stat->nr_page_events);
774 next = __this_cpu_read(memcg->stat->targets[target]); 811 next = __this_cpu_read(memcg->stat->targets[target]);
775 /* from time_after() in jiffies.h */ 812 /* from time_after() in jiffies.h */
776 if ((long)next - (long)val < 0) { 813 if ((long)next - (long)val < 0) {
@@ -1013,7 +1050,7 @@ EXPORT_SYMBOL(mem_cgroup_count_vm_event);
1013/** 1050/**
1014 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg 1051 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
1015 * @zone: zone of the wanted lruvec 1052 * @zone: zone of the wanted lruvec
1016 * @mem: memcg of the wanted lruvec 1053 * @memcg: memcg of the wanted lruvec
1017 * 1054 *
1018 * Returns the lru list vector holding pages for the given @zone and 1055 * Returns the lru list vector holding pages for the given @zone and
1019 * @mem. This can be the global zone lruvec, if the memory controller 1056 * @mem. This can be the global zone lruvec, if the memory controller
@@ -1046,19 +1083,11 @@ struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
1046 */ 1083 */
1047 1084
1048/** 1085/**
1049 * mem_cgroup_lru_add_list - account for adding an lru page and return lruvec 1086 * mem_cgroup_page_lruvec - return lruvec for adding an lru page
1050 * @zone: zone of the page
1051 * @page: the page 1087 * @page: the page
1052 * @lru: current lru 1088 * @zone: zone of the page
1053 *
1054 * This function accounts for @page being added to @lru, and returns
1055 * the lruvec for the given @zone and the memcg @page is charged to.
1056 *
1057 * The callsite is then responsible for physically linking the page to
1058 * the returned lruvec->lists[@lru].
1059 */ 1089 */
1060struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page, 1090struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone)
1061 enum lru_list lru)
1062{ 1091{
1063 struct mem_cgroup_per_zone *mz; 1092 struct mem_cgroup_per_zone *mz;
1064 struct mem_cgroup *memcg; 1093 struct mem_cgroup *memcg;
@@ -1071,7 +1100,7 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1071 memcg = pc->mem_cgroup; 1100 memcg = pc->mem_cgroup;
1072 1101
1073 /* 1102 /*
1074 * Surreptitiously switch any uncharged page to root: 1103 * Surreptitiously switch any uncharged offlist page to root:
1075 * an uncharged page off lru does nothing to secure 1104 * an uncharged page off lru does nothing to secure
1076 * its former mem_cgroup from sudden removal. 1105 * its former mem_cgroup from sudden removal.
1077 * 1106 *
@@ -1079,85 +1108,60 @@ struct lruvec *mem_cgroup_lru_add_list(struct zone *zone, struct page *page,
1079 * under page_cgroup lock: between them, they make all uses 1108 * under page_cgroup lock: between them, they make all uses
1080 * of pc->mem_cgroup safe. 1109 * of pc->mem_cgroup safe.
1081 */ 1110 */
1082 if (!PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1111 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup)
1083 pc->mem_cgroup = memcg = root_mem_cgroup; 1112 pc->mem_cgroup = memcg = root_mem_cgroup;
1084 1113
1085 mz = page_cgroup_zoneinfo(memcg, page); 1114 mz = page_cgroup_zoneinfo(memcg, page);
1086 /* compound_order() is stabilized through lru_lock */
1087 mz->lru_size[lru] += 1 << compound_order(page);
1088 return &mz->lruvec; 1115 return &mz->lruvec;
1089} 1116}
1090 1117
1091/** 1118/**
1092 * mem_cgroup_lru_del_list - account for removing an lru page 1119 * mem_cgroup_update_lru_size - account for adding or removing an lru page
1093 * @page: the page 1120 * @lruvec: mem_cgroup per zone lru vector
1094 * @lru: target lru 1121 * @lru: index of lru list the page is sitting on
1095 * 1122 * @nr_pages: positive when adding or negative when removing
1096 * This function accounts for @page being removed from @lru.
1097 * 1123 *
1098 * The callsite is then responsible for physically unlinking 1124 * This function must be called when a page is added to or removed from an
1099 * @page->lru. 1125 * lru list.
1100 */ 1126 */
1101void mem_cgroup_lru_del_list(struct page *page, enum lru_list lru) 1127void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1128 int nr_pages)
1102{ 1129{
1103 struct mem_cgroup_per_zone *mz; 1130 struct mem_cgroup_per_zone *mz;
1104 struct mem_cgroup *memcg; 1131 unsigned long *lru_size;
1105 struct page_cgroup *pc;
1106 1132
1107 if (mem_cgroup_disabled()) 1133 if (mem_cgroup_disabled())
1108 return; 1134 return;
1109 1135
1110 pc = lookup_page_cgroup(page); 1136 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
1111 memcg = pc->mem_cgroup; 1137 lru_size = mz->lru_size + lru;
1112 VM_BUG_ON(!memcg); 1138 *lru_size += nr_pages;
1113 mz = page_cgroup_zoneinfo(memcg, page); 1139 VM_BUG_ON((long)(*lru_size) < 0);
1114 /* huge page split is done under lru_lock. so, we have no races. */
1115 VM_BUG_ON(mz->lru_size[lru] < (1 << compound_order(page)));
1116 mz->lru_size[lru] -= 1 << compound_order(page);
1117}
1118
1119void mem_cgroup_lru_del(struct page *page)
1120{
1121 mem_cgroup_lru_del_list(page, page_lru(page));
1122}
1123
1124/**
1125 * mem_cgroup_lru_move_lists - account for moving a page between lrus
1126 * @zone: zone of the page
1127 * @page: the page
1128 * @from: current lru
1129 * @to: target lru
1130 *
1131 * This function accounts for @page being moved between the lrus @from
1132 * and @to, and returns the lruvec for the given @zone and the memcg
1133 * @page is charged to.
1134 *
1135 * The callsite is then responsible for physically relinking
1136 * @page->lru to the returned lruvec->lists[@to].
1137 */
1138struct lruvec *mem_cgroup_lru_move_lists(struct zone *zone,
1139 struct page *page,
1140 enum lru_list from,
1141 enum lru_list to)
1142{
1143 /* XXX: Optimize this, especially for @from == @to */
1144 mem_cgroup_lru_del_list(page, from);
1145 return mem_cgroup_lru_add_list(zone, page, to);
1146} 1140}
1147 1141
1148/* 1142/*
1149 * Checks whether given mem is same or in the root_mem_cgroup's 1143 * Checks whether given mem is same or in the root_mem_cgroup's
1150 * hierarchy subtree 1144 * hierarchy subtree
1151 */ 1145 */
1146bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1147 struct mem_cgroup *memcg)
1148{
1149 if (root_memcg == memcg)
1150 return true;
1151 if (!root_memcg->use_hierarchy || !memcg)
1152 return false;
1153 return css_is_ancestor(&memcg->css, &root_memcg->css);
1154}
1155
1152static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1156static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
1153 struct mem_cgroup *memcg) 1157 struct mem_cgroup *memcg)
1154{ 1158{
1155 if (root_memcg != memcg) { 1159 bool ret;
1156 return (root_memcg->use_hierarchy &&
1157 css_is_ancestor(&memcg->css, &root_memcg->css));
1158 }
1159 1160
1160 return true; 1161 rcu_read_lock();
1162 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg);
1163 rcu_read_unlock();
1164 return ret;
1161} 1165}
1162 1166
1163int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) 1167int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
@@ -1195,19 +1199,15 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
1195 return ret; 1199 return ret;
1196} 1200}
1197 1201
1198int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) 1202int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
1199{ 1203{
1200 unsigned long inactive_ratio; 1204 unsigned long inactive_ratio;
1201 int nid = zone_to_nid(zone);
1202 int zid = zone_idx(zone);
1203 unsigned long inactive; 1205 unsigned long inactive;
1204 unsigned long active; 1206 unsigned long active;
1205 unsigned long gb; 1207 unsigned long gb;
1206 1208
1207 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1209 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON);
1208 BIT(LRU_INACTIVE_ANON)); 1210 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON);
1209 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1210 BIT(LRU_ACTIVE_ANON));
1211 1211
1212 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1212 gb = (inactive + active) >> (30 - PAGE_SHIFT);
1213 if (gb) 1213 if (gb)
@@ -1218,55 +1218,23 @@ int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone)
1218 return inactive * inactive_ratio < active; 1218 return inactive * inactive_ratio < active;
1219} 1219}
1220 1220
1221int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) 1221int mem_cgroup_inactive_file_is_low(struct lruvec *lruvec)
1222{ 1222{
1223 unsigned long active; 1223 unsigned long active;
1224 unsigned long inactive; 1224 unsigned long inactive;
1225 int zid = zone_idx(zone);
1226 int nid = zone_to_nid(zone);
1227 1225
1228 inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, 1226 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_FILE);
1229 BIT(LRU_INACTIVE_FILE)); 1227 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_FILE);
1230 active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid,
1231 BIT(LRU_ACTIVE_FILE));
1232 1228
1233 return (active > inactive); 1229 return (active > inactive);
1234} 1230}
1235 1231
1236struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
1237 struct zone *zone)
1238{
1239 int nid = zone_to_nid(zone);
1240 int zid = zone_idx(zone);
1241 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
1242
1243 return &mz->reclaim_stat;
1244}
1245
1246struct zone_reclaim_stat *
1247mem_cgroup_get_reclaim_stat_from_page(struct page *page)
1248{
1249 struct page_cgroup *pc;
1250 struct mem_cgroup_per_zone *mz;
1251
1252 if (mem_cgroup_disabled())
1253 return NULL;
1254
1255 pc = lookup_page_cgroup(page);
1256 if (!PageCgroupUsed(pc))
1257 return NULL;
1258 /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */
1259 smp_rmb();
1260 mz = page_cgroup_zoneinfo(pc->mem_cgroup, page);
1261 return &mz->reclaim_stat;
1262}
1263
1264#define mem_cgroup_from_res_counter(counter, member) \ 1232#define mem_cgroup_from_res_counter(counter, member) \
1265 container_of(counter, struct mem_cgroup, member) 1233 container_of(counter, struct mem_cgroup, member)
1266 1234
1267/** 1235/**
1268 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1236 * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1269 * @mem: the memory cgroup 1237 * @memcg: the memory cgroup
1270 * 1238 *
1271 * Returns the maximum amount of memory @mem can be charged with, in 1239 * Returns the maximum amount of memory @mem can be charged with, in
1272 * pages. 1240 * pages.
@@ -1540,7 +1508,7 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
1540 1508
1541/** 1509/**
1542 * test_mem_cgroup_node_reclaimable 1510 * test_mem_cgroup_node_reclaimable
1543 * @mem: the target memcg 1511 * @memcg: the target memcg
1544 * @nid: the node ID to be checked. 1512 * @nid: the node ID to be checked.
1545 * @noswap : specify true here if the user wants flle only information. 1513 * @noswap : specify true here if the user wants flle only information.
1546 * 1514 *
@@ -1634,7 +1602,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1634 * unused nodes. But scan_nodes is lazily updated and may not cotain 1602 * unused nodes. But scan_nodes is lazily updated and may not cotain
1635 * enough new information. We need to do double check. 1603 * enough new information. We need to do double check.
1636 */ 1604 */
1637bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1605static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1638{ 1606{
1639 int nid; 1607 int nid;
1640 1608
@@ -1669,7 +1637,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1669 return 0; 1637 return 0;
1670} 1638}
1671 1639
1672bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 1640static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1673{ 1641{
1674 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 1642 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap);
1675} 1643}
@@ -1843,7 +1811,8 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
1843/* 1811/*
1844 * try to call OOM killer. returns false if we should exit memory-reclaim loop. 1812 * try to call OOM killer. returns false if we should exit memory-reclaim loop.
1845 */ 1813 */
1846bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 1814static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
1815 int order)
1847{ 1816{
1848 struct oom_wait_info owait; 1817 struct oom_wait_info owait;
1849 bool locked, need_to_kill; 1818 bool locked, need_to_kill;
@@ -1992,7 +1961,7 @@ struct memcg_stock_pcp {
1992 unsigned int nr_pages; 1961 unsigned int nr_pages;
1993 struct work_struct work; 1962 struct work_struct work;
1994 unsigned long flags; 1963 unsigned long flags;
1995#define FLUSHING_CACHED_CHARGE (0) 1964#define FLUSHING_CACHED_CHARGE 0
1996}; 1965};
1997static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 1966static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1998static DEFINE_MUTEX(percpu_charge_mutex); 1967static DEFINE_MUTEX(percpu_charge_mutex);
@@ -2139,7 +2108,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
2139 int i; 2108 int i;
2140 2109
2141 spin_lock(&memcg->pcp_counter_lock); 2110 spin_lock(&memcg->pcp_counter_lock);
2142 for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { 2111 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
2143 long x = per_cpu(memcg->stat->count[i], cpu); 2112 long x = per_cpu(memcg->stat->count[i], cpu);
2144 2113
2145 per_cpu(memcg->stat->count[i], cpu) = 0; 2114 per_cpu(memcg->stat->count[i], cpu) = 0;
@@ -2427,6 +2396,24 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2427} 2396}
2428 2397
2429/* 2398/*
 2399 * Cancel charges in this cgroup...doesn't propagate to parent cgroup.
2400 * This is useful when moving usage to parent cgroup.
2401 */
2402static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2403 unsigned int nr_pages)
2404{
2405 unsigned long bytes = nr_pages * PAGE_SIZE;
2406
2407 if (mem_cgroup_is_root(memcg))
2408 return;
2409
2410 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2411 if (do_swap_account)
2412 res_counter_uncharge_until(&memcg->memsw,
2413 memcg->memsw.parent, bytes);
2414}
2415
2416/*
2430 * A helper function to get mem_cgroup from ID. must be called under 2417 * A helper function to get mem_cgroup from ID. must be called under
2431 * rcu_read_lock(). The caller must check css_is_removed() or some if 2418 * rcu_read_lock(). The caller must check css_is_removed() or some if
2432 * it's concern. (dropping refcnt from swap can be called against removed 2419 * it's concern. (dropping refcnt from swap can be called against removed
@@ -2481,6 +2468,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2481{ 2468{
2482 struct page_cgroup *pc = lookup_page_cgroup(page); 2469 struct page_cgroup *pc = lookup_page_cgroup(page);
2483 struct zone *uninitialized_var(zone); 2470 struct zone *uninitialized_var(zone);
2471 struct lruvec *lruvec;
2484 bool was_on_lru = false; 2472 bool was_on_lru = false;
2485 bool anon; 2473 bool anon;
2486 2474
@@ -2503,8 +2491,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2503 zone = page_zone(page); 2491 zone = page_zone(page);
2504 spin_lock_irq(&zone->lru_lock); 2492 spin_lock_irq(&zone->lru_lock);
2505 if (PageLRU(page)) { 2493 if (PageLRU(page)) {
2494 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2506 ClearPageLRU(page); 2495 ClearPageLRU(page);
2507 del_page_from_lru_list(zone, page, page_lru(page)); 2496 del_page_from_lru_list(page, lruvec, page_lru(page));
2508 was_on_lru = true; 2497 was_on_lru = true;
2509 } 2498 }
2510 } 2499 }
@@ -2522,9 +2511,10 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2522 2511
2523 if (lrucare) { 2512 if (lrucare) {
2524 if (was_on_lru) { 2513 if (was_on_lru) {
2514 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
2525 VM_BUG_ON(PageLRU(page)); 2515 VM_BUG_ON(PageLRU(page));
2526 SetPageLRU(page); 2516 SetPageLRU(page);
2527 add_page_to_lru_list(zone, page, page_lru(page)); 2517 add_page_to_lru_list(page, lruvec, page_lru(page));
2528 } 2518 }
2529 spin_unlock_irq(&zone->lru_lock); 2519 spin_unlock_irq(&zone->lru_lock);
2530 } 2520 }
@@ -2547,7 +2537,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
2547 2537
2548#ifdef CONFIG_TRANSPARENT_HUGEPAGE 2538#ifdef CONFIG_TRANSPARENT_HUGEPAGE
2549 2539
2550#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MIGRATION)) 2540#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
2551/* 2541/*
2552 * Because tail pages are not marked as "used", set it. We're under 2542 * Because tail pages are not marked as "used", set it. We're under
2553 * zone->lru_lock, 'splitting on pmd' and compound_lock. 2543 * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -2578,23 +2568,19 @@ void mem_cgroup_split_huge_fixup(struct page *head)
2578 * @pc: page_cgroup of the page. 2568 * @pc: page_cgroup of the page.
2579 * @from: mem_cgroup which the page is moved from. 2569 * @from: mem_cgroup which the page is moved from.
2580 * @to: mem_cgroup which the page is moved to. @from != @to. 2570 * @to: mem_cgroup which the page is moved to. @from != @to.
2581 * @uncharge: whether we should call uncharge and css_put against @from.
2582 * 2571 *
2583 * The caller must confirm following. 2572 * The caller must confirm following.
2584 * - page is not on LRU (isolate_page() is useful.) 2573 * - page is not on LRU (isolate_page() is useful.)
2585 * - compound_lock is held when nr_pages > 1 2574 * - compound_lock is held when nr_pages > 1
2586 * 2575 *
2587 * This function doesn't do "charge" nor css_get to new cgroup. It should be 2576 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
2588 * done by a caller(__mem_cgroup_try_charge would be useful). If @uncharge is 2577 * from old cgroup.
2589 * true, this function does "uncharge" from old cgroup, but it doesn't if
2590 * @uncharge is false, so a caller should do "uncharge".
2591 */ 2578 */
2592static int mem_cgroup_move_account(struct page *page, 2579static int mem_cgroup_move_account(struct page *page,
2593 unsigned int nr_pages, 2580 unsigned int nr_pages,
2594 struct page_cgroup *pc, 2581 struct page_cgroup *pc,
2595 struct mem_cgroup *from, 2582 struct mem_cgroup *from,
2596 struct mem_cgroup *to, 2583 struct mem_cgroup *to)
2597 bool uncharge)
2598{ 2584{
2599 unsigned long flags; 2585 unsigned long flags;
2600 int ret; 2586 int ret;
@@ -2628,9 +2614,6 @@ static int mem_cgroup_move_account(struct page *page,
2628 preempt_enable(); 2614 preempt_enable();
2629 } 2615 }
2630 mem_cgroup_charge_statistics(from, anon, -nr_pages); 2616 mem_cgroup_charge_statistics(from, anon, -nr_pages);
2631 if (uncharge)
2632 /* This is not "cancel", but cancel_charge does all we need. */
2633 __mem_cgroup_cancel_charge(from, nr_pages);
2634 2617
2635 /* caller should have done css_get */ 2618 /* caller should have done css_get */
2636 pc->mem_cgroup = to; 2619 pc->mem_cgroup = to;
@@ -2664,15 +2647,13 @@ static int mem_cgroup_move_parent(struct page *page,
2664 struct mem_cgroup *child, 2647 struct mem_cgroup *child,
2665 gfp_t gfp_mask) 2648 gfp_t gfp_mask)
2666{ 2649{
2667 struct cgroup *cg = child->css.cgroup;
2668 struct cgroup *pcg = cg->parent;
2669 struct mem_cgroup *parent; 2650 struct mem_cgroup *parent;
2670 unsigned int nr_pages; 2651 unsigned int nr_pages;
2671 unsigned long uninitialized_var(flags); 2652 unsigned long uninitialized_var(flags);
2672 int ret; 2653 int ret;
2673 2654
2674 /* Is ROOT ? */ 2655 /* Is ROOT ? */
2675 if (!pcg) 2656 if (mem_cgroup_is_root(child))
2676 return -EINVAL; 2657 return -EINVAL;
2677 2658
2678 ret = -EBUSY; 2659 ret = -EBUSY;
@@ -2683,21 +2664,23 @@ static int mem_cgroup_move_parent(struct page *page,
2683 2664
2684 nr_pages = hpage_nr_pages(page); 2665 nr_pages = hpage_nr_pages(page);
2685 2666
2686 parent = mem_cgroup_from_cont(pcg); 2667 parent = parent_mem_cgroup(child);
2687 ret = __mem_cgroup_try_charge(NULL, gfp_mask, nr_pages, &parent, false); 2668 /*
2688 if (ret) 2669 * If no parent, move charges to root cgroup.
2689 goto put_back; 2670 */
2671 if (!parent)
2672 parent = root_mem_cgroup;
2690 2673
2691 if (nr_pages > 1) 2674 if (nr_pages > 1)
2692 flags = compound_lock_irqsave(page); 2675 flags = compound_lock_irqsave(page);
2693 2676
2694 ret = mem_cgroup_move_account(page, nr_pages, pc, child, parent, true); 2677 ret = mem_cgroup_move_account(page, nr_pages,
2695 if (ret) 2678 pc, child, parent);
2696 __mem_cgroup_cancel_charge(parent, nr_pages); 2679 if (!ret)
2680 __mem_cgroup_cancel_local_charge(child, nr_pages);
2697 2681
2698 if (nr_pages > 1) 2682 if (nr_pages > 1)
2699 compound_unlock_irqrestore(page, flags); 2683 compound_unlock_irqrestore(page, flags);
2700put_back:
2701 putback_lru_page(page); 2684 putback_lru_page(page);
2702put: 2685put:
2703 put_page(page); 2686 put_page(page);
@@ -2845,24 +2828,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
2845 */ 2828 */
2846 if (do_swap_account && PageSwapCache(page)) { 2829 if (do_swap_account && PageSwapCache(page)) {
2847 swp_entry_t ent = {.val = page_private(page)}; 2830 swp_entry_t ent = {.val = page_private(page)};
2848 struct mem_cgroup *swap_memcg; 2831 mem_cgroup_uncharge_swap(ent);
2849 unsigned short id;
2850
2851 id = swap_cgroup_record(ent, 0);
2852 rcu_read_lock();
2853 swap_memcg = mem_cgroup_lookup(id);
2854 if (swap_memcg) {
2855 /*
2856 * This recorded memcg can be obsolete one. So, avoid
2857 * calling css_tryget
2858 */
2859 if (!mem_cgroup_is_root(swap_memcg))
2860 res_counter_uncharge(&swap_memcg->memsw,
2861 PAGE_SIZE);
2862 mem_cgroup_swap_statistics(swap_memcg, false);
2863 mem_cgroup_put(swap_memcg);
2864 }
2865 rcu_read_unlock();
2866 } 2832 }
2867 /* 2833 /*
2868 * At swapin, we may charge account against cgroup which has no tasks. 2834 * At swapin, we may charge account against cgroup which has no tasks.
@@ -3155,7 +3121,6 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3155 * @entry: swap entry to be moved 3121 * @entry: swap entry to be moved
3156 * @from: mem_cgroup which the entry is moved from 3122 * @from: mem_cgroup which the entry is moved from
3157 * @to: mem_cgroup which the entry is moved to 3123 * @to: mem_cgroup which the entry is moved to
3158 * @need_fixup: whether we should fixup res_counters and refcounts.
3159 * 3124 *
3160 * It succeeds only when the swap_cgroup's record for this entry is the same 3125 * It succeeds only when the swap_cgroup's record for this entry is the same
3161 * as the mem_cgroup's id of @from. 3126 * as the mem_cgroup's id of @from.
@@ -3166,7 +3131,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
3166 * both res and memsw, and called css_get(). 3131 * both res and memsw, and called css_get().
3167 */ 3132 */
3168static int mem_cgroup_move_swap_account(swp_entry_t entry, 3133static int mem_cgroup_move_swap_account(swp_entry_t entry,
3169 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3134 struct mem_cgroup *from, struct mem_cgroup *to)
3170{ 3135{
3171 unsigned short old_id, new_id; 3136 unsigned short old_id, new_id;
3172 3137
@@ -3185,24 +3150,13 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
3185 * swap-in, the refcount of @to might be decreased to 0. 3150 * swap-in, the refcount of @to might be decreased to 0.
3186 */ 3151 */
3187 mem_cgroup_get(to); 3152 mem_cgroup_get(to);
3188 if (need_fixup) {
3189 if (!mem_cgroup_is_root(from))
3190 res_counter_uncharge(&from->memsw, PAGE_SIZE);
3191 mem_cgroup_put(from);
3192 /*
3193 * we charged both to->res and to->memsw, so we should
3194 * uncharge to->res.
3195 */
3196 if (!mem_cgroup_is_root(to))
3197 res_counter_uncharge(&to->res, PAGE_SIZE);
3198 }
3199 return 0; 3153 return 0;
3200 } 3154 }
3201 return -EINVAL; 3155 return -EINVAL;
3202} 3156}
3203#else 3157#else
3204static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 3158static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3205 struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) 3159 struct mem_cgroup *from, struct mem_cgroup *to)
3206{ 3160{
3207 return -EINVAL; 3161 return -EINVAL;
3208} 3162}
@@ -3363,7 +3317,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg,
3363void mem_cgroup_replace_page_cache(struct page *oldpage, 3317void mem_cgroup_replace_page_cache(struct page *oldpage,
3364 struct page *newpage) 3318 struct page *newpage)
3365{ 3319{
3366 struct mem_cgroup *memcg; 3320 struct mem_cgroup *memcg = NULL;
3367 struct page_cgroup *pc; 3321 struct page_cgroup *pc;
3368 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 3322 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
3369 3323
@@ -3373,11 +3327,20 @@ void mem_cgroup_replace_page_cache(struct page *oldpage,
3373 pc = lookup_page_cgroup(oldpage); 3327 pc = lookup_page_cgroup(oldpage);
3374 /* fix accounting on old pages */ 3328 /* fix accounting on old pages */
3375 lock_page_cgroup(pc); 3329 lock_page_cgroup(pc);
3376 memcg = pc->mem_cgroup; 3330 if (PageCgroupUsed(pc)) {
3377 mem_cgroup_charge_statistics(memcg, false, -1); 3331 memcg = pc->mem_cgroup;
3378 ClearPageCgroupUsed(pc); 3332 mem_cgroup_charge_statistics(memcg, false, -1);
3333 ClearPageCgroupUsed(pc);
3334 }
3379 unlock_page_cgroup(pc); 3335 unlock_page_cgroup(pc);
3380 3336
3337 /*
3338 * When called from shmem_replace_page(), in some cases the
3339 * oldpage has already been charged, and in some cases not.
3340 */
3341 if (!memcg)
3342 return;
3343
3381 if (PageSwapBacked(oldpage)) 3344 if (PageSwapBacked(oldpage))
3382 type = MEM_CGROUP_CHARGE_TYPE_SHMEM; 3345 type = MEM_CGROUP_CHARGE_TYPE_SHMEM;
3383 3346
@@ -3793,7 +3756,7 @@ try_to_free:
3793 goto move_account; 3756 goto move_account;
3794} 3757}
3795 3758
3796int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) 3759static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
3797{ 3760{
3798 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true); 3761 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
3799} 3762}
@@ -3873,14 +3836,21 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3873 return val << PAGE_SHIFT; 3836 return val << PAGE_SHIFT;
3874} 3837}
3875 3838
3876static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 3839static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
3840 struct file *file, char __user *buf,
3841 size_t nbytes, loff_t *ppos)
3877{ 3842{
3878 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 3843 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3844 char str[64];
3879 u64 val; 3845 u64 val;
3880 int type, name; 3846 int type, name, len;
3881 3847
3882 type = MEMFILE_TYPE(cft->private); 3848 type = MEMFILE_TYPE(cft->private);
3883 name = MEMFILE_ATTR(cft->private); 3849 name = MEMFILE_ATTR(cft->private);
3850
3851 if (!do_swap_account && type == _MEMSWAP)
3852 return -EOPNOTSUPP;
3853
3884 switch (type) { 3854 switch (type) {
3885 case _MEM: 3855 case _MEM:
3886 if (name == RES_USAGE) 3856 if (name == RES_USAGE)
@@ -3897,7 +3867,9 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
3897 default: 3867 default:
3898 BUG(); 3868 BUG();
3899 } 3869 }
3900 return val; 3870
3871 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val);
3872 return simple_read_from_buffer(buf, nbytes, ppos, str, len);
3901} 3873}
3902/* 3874/*
3903 * The user of this function is... 3875 * The user of this function is...
@@ -3913,6 +3885,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
3913 3885
3914 type = MEMFILE_TYPE(cft->private); 3886 type = MEMFILE_TYPE(cft->private);
3915 name = MEMFILE_ATTR(cft->private); 3887 name = MEMFILE_ATTR(cft->private);
3888
3889 if (!do_swap_account && type == _MEMSWAP)
3890 return -EOPNOTSUPP;
3891
3916 switch (name) { 3892 switch (name) {
3917 case RES_LIMIT: 3893 case RES_LIMIT:
3918 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 3894 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
@@ -3978,12 +3954,15 @@ out:
3978 3954
3979static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 3955static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
3980{ 3956{
3981 struct mem_cgroup *memcg; 3957 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
3982 int type, name; 3958 int type, name;
3983 3959
3984 memcg = mem_cgroup_from_cont(cont);
3985 type = MEMFILE_TYPE(event); 3960 type = MEMFILE_TYPE(event);
3986 name = MEMFILE_ATTR(event); 3961 name = MEMFILE_ATTR(event);
3962
3963 if (!do_swap_account && type == _MEMSWAP)
3964 return -EOPNOTSUPP;
3965
3987 switch (name) { 3966 switch (name) {
3988 case RES_MAX_USAGE: 3967 case RES_MAX_USAGE:
3989 if (type == _MEM) 3968 if (type == _MEM)
@@ -4035,103 +4014,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
4035} 4014}
4036#endif 4015#endif
4037 4016
4038
4039/* For read statistics */
4040enum {
4041 MCS_CACHE,
4042 MCS_RSS,
4043 MCS_FILE_MAPPED,
4044 MCS_PGPGIN,
4045 MCS_PGPGOUT,
4046 MCS_SWAP,
4047 MCS_PGFAULT,
4048 MCS_PGMAJFAULT,
4049 MCS_INACTIVE_ANON,
4050 MCS_ACTIVE_ANON,
4051 MCS_INACTIVE_FILE,
4052 MCS_ACTIVE_FILE,
4053 MCS_UNEVICTABLE,
4054 NR_MCS_STAT,
4055};
4056
4057struct mcs_total_stat {
4058 s64 stat[NR_MCS_STAT];
4059};
4060
4061struct {
4062 char *local_name;
4063 char *total_name;
4064} memcg_stat_strings[NR_MCS_STAT] = {
4065 {"cache", "total_cache"},
4066 {"rss", "total_rss"},
4067 {"mapped_file", "total_mapped_file"},
4068 {"pgpgin", "total_pgpgin"},
4069 {"pgpgout", "total_pgpgout"},
4070 {"swap", "total_swap"},
4071 {"pgfault", "total_pgfault"},
4072 {"pgmajfault", "total_pgmajfault"},
4073 {"inactive_anon", "total_inactive_anon"},
4074 {"active_anon", "total_active_anon"},
4075 {"inactive_file", "total_inactive_file"},
4076 {"active_file", "total_active_file"},
4077 {"unevictable", "total_unevictable"}
4078};
4079
4080
4081static void
4082mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4083{
4084 s64 val;
4085
4086 /* per cpu stat */
4087 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
4088 s->stat[MCS_CACHE] += val * PAGE_SIZE;
4089 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
4090 s->stat[MCS_RSS] += val * PAGE_SIZE;
4091 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
4092 s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
4093 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN);
4094 s->stat[MCS_PGPGIN] += val;
4095 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT);
4096 s->stat[MCS_PGPGOUT] += val;
4097 if (do_swap_account) {
4098 val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT);
4099 s->stat[MCS_SWAP] += val * PAGE_SIZE;
4100 }
4101 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT);
4102 s->stat[MCS_PGFAULT] += val;
4103 val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT);
4104 s->stat[MCS_PGMAJFAULT] += val;
4105
4106 /* per zone stat */
4107 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON));
4108 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
4109 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON));
4110 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
4111 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE));
4112 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
4113 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE));
4114 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
4115 val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
4116 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
4117}
4118
4119static void
4120mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s)
4121{
4122 struct mem_cgroup *iter;
4123
4124 for_each_mem_cgroup_tree(iter, memcg)
4125 mem_cgroup_get_local_stat(iter, s);
4126}
4127
4128#ifdef CONFIG_NUMA 4017#ifdef CONFIG_NUMA
4129static int mem_control_numa_stat_show(struct seq_file *m, void *arg) 4018static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft,
4019 struct seq_file *m)
4130{ 4020{
4131 int nid; 4021 int nid;
4132 unsigned long total_nr, file_nr, anon_nr, unevictable_nr; 4022 unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
4133 unsigned long node_nr; 4023 unsigned long node_nr;
4134 struct cgroup *cont = m->private;
4135 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4024 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4136 4025
4137 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); 4026 total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
@@ -4172,64 +4061,100 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
4172} 4061}
4173#endif /* CONFIG_NUMA */ 4062#endif /* CONFIG_NUMA */
4174 4063
4064static const char * const mem_cgroup_lru_names[] = {
4065 "inactive_anon",
4066 "active_anon",
4067 "inactive_file",
4068 "active_file",
4069 "unevictable",
4070};
4071
4072static inline void mem_cgroup_lru_names_not_uptodate(void)
4073{
4074 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
4075}
4076
4175static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 4077static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
4176 struct cgroup_map_cb *cb) 4078 struct seq_file *m)
4177{ 4079{
4178 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4080 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
4179 struct mcs_total_stat mystat; 4081 struct mem_cgroup *mi;
4180 int i; 4082 unsigned int i;
4181
4182 memset(&mystat, 0, sizeof(mystat));
4183 mem_cgroup_get_local_stat(memcg, &mystat);
4184 4083
4185 4084 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4186 for (i = 0; i < NR_MCS_STAT; i++) { 4085 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
4187 if (i == MCS_SWAP && !do_swap_account)
4188 continue; 4086 continue;
4189 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 4087 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
4088 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
4190 } 4089 }
4191 4090
4091 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
4092 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
4093 mem_cgroup_read_events(memcg, i));
4094
4095 for (i = 0; i < NR_LRU_LISTS; i++)
4096 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
4097 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
4098
4192 /* Hierarchical information */ 4099 /* Hierarchical information */
4193 { 4100 {
4194 unsigned long long limit, memsw_limit; 4101 unsigned long long limit, memsw_limit;
4195 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 4102 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit);
4196 cb->fill(cb, "hierarchical_memory_limit", limit); 4103 seq_printf(m, "hierarchical_memory_limit %llu\n", limit);
4197 if (do_swap_account) 4104 if (do_swap_account)
4198 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 4105 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4106 memsw_limit);
4199 } 4107 }
4200 4108
4201 memset(&mystat, 0, sizeof(mystat)); 4109 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
4202 mem_cgroup_get_total_stat(memcg, &mystat); 4110 long long val = 0;
4203 for (i = 0; i < NR_MCS_STAT; i++) { 4111
4204 if (i == MCS_SWAP && !do_swap_account) 4112 if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account)
4205 continue; 4113 continue;
4206 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 4114 for_each_mem_cgroup_tree(mi, memcg)
4115 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
4116 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
4117 }
4118
4119 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
4120 unsigned long long val = 0;
4121
4122 for_each_mem_cgroup_tree(mi, memcg)
4123 val += mem_cgroup_read_events(mi, i);
4124 seq_printf(m, "total_%s %llu\n",
4125 mem_cgroup_events_names[i], val);
4126 }
4127
4128 for (i = 0; i < NR_LRU_LISTS; i++) {
4129 unsigned long long val = 0;
4130
4131 for_each_mem_cgroup_tree(mi, memcg)
4132 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
4133 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
4207 } 4134 }
4208 4135
4209#ifdef CONFIG_DEBUG_VM 4136#ifdef CONFIG_DEBUG_VM
4210 { 4137 {
4211 int nid, zid; 4138 int nid, zid;
4212 struct mem_cgroup_per_zone *mz; 4139 struct mem_cgroup_per_zone *mz;
4140 struct zone_reclaim_stat *rstat;
4213 unsigned long recent_rotated[2] = {0, 0}; 4141 unsigned long recent_rotated[2] = {0, 0};
4214 unsigned long recent_scanned[2] = {0, 0}; 4142 unsigned long recent_scanned[2] = {0, 0};
4215 4143
4216 for_each_online_node(nid) 4144 for_each_online_node(nid)
4217 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4145 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
4218 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 4146 mz = mem_cgroup_zoneinfo(memcg, nid, zid);
4147 rstat = &mz->lruvec.reclaim_stat;
4219 4148
4220 recent_rotated[0] += 4149 recent_rotated[0] += rstat->recent_rotated[0];
4221 mz->reclaim_stat.recent_rotated[0]; 4150 recent_rotated[1] += rstat->recent_rotated[1];
4222 recent_rotated[1] += 4151 recent_scanned[0] += rstat->recent_scanned[0];
4223 mz->reclaim_stat.recent_rotated[1]; 4152 recent_scanned[1] += rstat->recent_scanned[1];
4224 recent_scanned[0] +=
4225 mz->reclaim_stat.recent_scanned[0];
4226 recent_scanned[1] +=
4227 mz->reclaim_stat.recent_scanned[1];
4228 } 4153 }
4229 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]); 4154 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
4230 cb->fill(cb, "recent_rotated_file", recent_rotated[1]); 4155 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
4231 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]); 4156 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
4232 cb->fill(cb, "recent_scanned_file", recent_scanned[1]); 4157 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
4233 } 4158 }
4234#endif 4159#endif
4235 4160
@@ -4291,7 +4216,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
4291 usage = mem_cgroup_usage(memcg, swap); 4216 usage = mem_cgroup_usage(memcg, swap);
4292 4217
4293 /* 4218 /*
4294 * current_threshold points to threshold just below usage. 4219 * current_threshold points to threshold just below or equal to usage.
4295 * If it's not true, a threshold was crossed after last 4220 * If it's not true, a threshold was crossed after last
4296 * call of __mem_cgroup_threshold(). 4221 * call of __mem_cgroup_threshold().
4297 */ 4222 */
@@ -4417,14 +4342,15 @@ static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
4417 /* Find current threshold */ 4342 /* Find current threshold */
4418 new->current_threshold = -1; 4343 new->current_threshold = -1;
4419 for (i = 0; i < size; i++) { 4344 for (i = 0; i < size; i++) {
4420 if (new->entries[i].threshold < usage) { 4345 if (new->entries[i].threshold <= usage) {
4421 /* 4346 /*
4422 * new->current_threshold will not be used until 4347 * new->current_threshold will not be used until
4423 * rcu_assign_pointer(), so it's safe to increment 4348 * rcu_assign_pointer(), so it's safe to increment
4424 * it here. 4349 * it here.
4425 */ 4350 */
4426 ++new->current_threshold; 4351 ++new->current_threshold;
4427 } 4352 } else
4353 break;
4428 } 4354 }
4429 4355
4430 /* Free old spare buffer and save old primary buffer as spare */ 4356 /* Free old spare buffer and save old primary buffer as spare */
@@ -4493,7 +4419,7 @@ static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp,
4493 continue; 4419 continue;
4494 4420
4495 new->entries[j] = thresholds->primary->entries[i]; 4421 new->entries[j] = thresholds->primary->entries[i];
4496 if (new->entries[j].threshold < usage) { 4422 if (new->entries[j].threshold <= usage) {
4497 /* 4423 /*
4498 * new->current_threshold will not be used 4424 * new->current_threshold will not be used
4499 * until rcu_assign_pointer(), so it's safe to increment 4425 * until rcu_assign_pointer(), so it's safe to increment
@@ -4607,46 +4533,23 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
4607 return 0; 4533 return 0;
4608} 4534}
4609 4535
4610#ifdef CONFIG_NUMA
4611static const struct file_operations mem_control_numa_stat_file_operations = {
4612 .read = seq_read,
4613 .llseek = seq_lseek,
4614 .release = single_release,
4615};
4616
4617static int mem_control_numa_stat_open(struct inode *unused, struct file *file)
4618{
4619 struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
4620
4621 file->f_op = &mem_control_numa_stat_file_operations;
4622 return single_open(file, mem_control_numa_stat_show, cont);
4623}
4624#endif /* CONFIG_NUMA */
4625
4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM 4536#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
4627static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4537static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4628{ 4538{
4629 /* 4539 return mem_cgroup_sockets_init(memcg, ss);
4630 * Part of this would be better living in a separate allocation
4631 * function, leaving us with just the cgroup tree population work.
4632 * We, however, depend on state such as network's proto_list that
4633 * is only initialized after cgroup creation. I found the less
4634 * cumbersome way to deal with it to defer it all to populate time
4635 */
4636 return mem_cgroup_sockets_init(cont, ss);
4637}; 4540};
4638 4541
4639static void kmem_cgroup_destroy(struct cgroup *cont) 4542static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4640{ 4543{
4641 mem_cgroup_sockets_destroy(cont); 4544 mem_cgroup_sockets_destroy(memcg);
4642} 4545}
4643#else 4546#else
4644static int register_kmem_files(struct cgroup *cont, struct cgroup_subsys *ss) 4547static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
4645{ 4548{
4646 return 0; 4549 return 0;
4647} 4550}
4648 4551
4649static void kmem_cgroup_destroy(struct cgroup *cont) 4552static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
4650{ 4553{
4651} 4554}
4652#endif 4555#endif
@@ -4655,7 +4558,7 @@ static struct cftype mem_cgroup_files[] = {
4655 { 4558 {
4656 .name = "usage_in_bytes", 4559 .name = "usage_in_bytes",
4657 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 4560 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4658 .read_u64 = mem_cgroup_read, 4561 .read = mem_cgroup_read,
4659 .register_event = mem_cgroup_usage_register_event, 4562 .register_event = mem_cgroup_usage_register_event,
4660 .unregister_event = mem_cgroup_usage_unregister_event, 4563 .unregister_event = mem_cgroup_usage_unregister_event,
4661 }, 4564 },
@@ -4663,29 +4566,29 @@ static struct cftype mem_cgroup_files[] = {
4663 .name = "max_usage_in_bytes", 4566 .name = "max_usage_in_bytes",
4664 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 4567 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4665 .trigger = mem_cgroup_reset, 4568 .trigger = mem_cgroup_reset,
4666 .read_u64 = mem_cgroup_read, 4569 .read = mem_cgroup_read,
4667 }, 4570 },
4668 { 4571 {
4669 .name = "limit_in_bytes", 4572 .name = "limit_in_bytes",
4670 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 4573 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4671 .write_string = mem_cgroup_write, 4574 .write_string = mem_cgroup_write,
4672 .read_u64 = mem_cgroup_read, 4575 .read = mem_cgroup_read,
4673 }, 4576 },
4674 { 4577 {
4675 .name = "soft_limit_in_bytes", 4578 .name = "soft_limit_in_bytes",
4676 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 4579 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4677 .write_string = mem_cgroup_write, 4580 .write_string = mem_cgroup_write,
4678 .read_u64 = mem_cgroup_read, 4581 .read = mem_cgroup_read,
4679 }, 4582 },
4680 { 4583 {
4681 .name = "failcnt", 4584 .name = "failcnt",
4682 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 4585 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4683 .trigger = mem_cgroup_reset, 4586 .trigger = mem_cgroup_reset,
4684 .read_u64 = mem_cgroup_read, 4587 .read = mem_cgroup_read,
4685 }, 4588 },
4686 { 4589 {
4687 .name = "stat", 4590 .name = "stat",
4688 .read_map = mem_control_stat_show, 4591 .read_seq_string = mem_control_stat_show,
4689 }, 4592 },
4690 { 4593 {
4691 .name = "force_empty", 4594 .name = "force_empty",
@@ -4717,18 +4620,14 @@ static struct cftype mem_cgroup_files[] = {
4717#ifdef CONFIG_NUMA 4620#ifdef CONFIG_NUMA
4718 { 4621 {
4719 .name = "numa_stat", 4622 .name = "numa_stat",
4720 .open = mem_control_numa_stat_open, 4623 .read_seq_string = mem_control_numa_stat_show,
4721 .mode = S_IRUGO,
4722 }, 4624 },
4723#endif 4625#endif
4724};
4725
4726#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 4626#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
4727static struct cftype memsw_cgroup_files[] = {
4728 { 4627 {
4729 .name = "memsw.usage_in_bytes", 4628 .name = "memsw.usage_in_bytes",
4730 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 4629 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
4731 .read_u64 = mem_cgroup_read, 4630 .read = mem_cgroup_read,
4732 .register_event = mem_cgroup_usage_register_event, 4631 .register_event = mem_cgroup_usage_register_event,
4733 .unregister_event = mem_cgroup_usage_unregister_event, 4632 .unregister_event = mem_cgroup_usage_unregister_event,
4734 }, 4633 },
@@ -4736,41 +4635,28 @@ static struct cftype memsw_cgroup_files[] = {
4736 .name = "memsw.max_usage_in_bytes", 4635 .name = "memsw.max_usage_in_bytes",
4737 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 4636 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
4738 .trigger = mem_cgroup_reset, 4637 .trigger = mem_cgroup_reset,
4739 .read_u64 = mem_cgroup_read, 4638 .read = mem_cgroup_read,
4740 }, 4639 },
4741 { 4640 {
4742 .name = "memsw.limit_in_bytes", 4641 .name = "memsw.limit_in_bytes",
4743 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 4642 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
4744 .write_string = mem_cgroup_write, 4643 .write_string = mem_cgroup_write,
4745 .read_u64 = mem_cgroup_read, 4644 .read = mem_cgroup_read,
4746 }, 4645 },
4747 { 4646 {
4748 .name = "memsw.failcnt", 4647 .name = "memsw.failcnt",
4749 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 4648 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
4750 .trigger = mem_cgroup_reset, 4649 .trigger = mem_cgroup_reset,
4751 .read_u64 = mem_cgroup_read, 4650 .read = mem_cgroup_read,
4752 }, 4651 },
4753};
4754
4755static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4756{
4757 if (!do_swap_account)
4758 return 0;
4759 return cgroup_add_files(cont, ss, memsw_cgroup_files,
4760 ARRAY_SIZE(memsw_cgroup_files));
4761};
4762#else
4763static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
4764{
4765 return 0;
4766}
4767#endif 4652#endif
4653 { }, /* terminate */
4654};
4768 4655
4769static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 4656static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4770{ 4657{
4771 struct mem_cgroup_per_node *pn; 4658 struct mem_cgroup_per_node *pn;
4772 struct mem_cgroup_per_zone *mz; 4659 struct mem_cgroup_per_zone *mz;
4773 enum lru_list lru;
4774 int zone, tmp = node; 4660 int zone, tmp = node;
4775 /* 4661 /*
4776 * This routine is called against possible nodes. 4662 * This routine is called against possible nodes.
@@ -4788,8 +4674,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
4788 4674
4789 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 4675 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
4790 mz = &pn->zoneinfo[zone]; 4676 mz = &pn->zoneinfo[zone];
4791 for_each_lru(lru) 4677 lruvec_init(&mz->lruvec, &NODE_DATA(node)->node_zones[zone]);
4792 INIT_LIST_HEAD(&mz->lruvec.lists[lru]);
4793 mz->usage_in_excess = 0; 4678 mz->usage_in_excess = 0;
4794 mz->on_tree = false; 4679 mz->on_tree = false;
4795 mz->memcg = memcg; 4680 mz->memcg = memcg;
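In the hunk above, the per-LRU INIT_LIST_HEAD() loop is collapsed into one lruvec_init() call that is also handed the owning zone. A rough userspace sketch, with simplified demo_* types instead of the kernel's lruvec and list_head, of the list initialization the helper takes over:

#include <assert.h>

struct demo_list_head {
	struct demo_list_head *next, *prev;
};

static void demo_list_init(struct demo_list_head *head)
{
	head->next = head;
	head->prev = head;
}

enum { DEMO_LRU_INACTIVE_ANON, DEMO_LRU_ACTIVE_FILE, DEMO_NR_LRU };

struct demo_lruvec {
	struct demo_list_head lists[DEMO_NR_LRU];
};

/* stand-in for lruvec_init(): every per-LRU list starts out empty */
static void demo_lruvec_init(struct demo_lruvec *lruvec)
{
	int lru;

	for (lru = 0; lru < DEMO_NR_LRU; lru++)
		demo_list_init(&lruvec->lists[lru]);
}

int main(void)
{
	struct demo_lruvec v;

	demo_lruvec_init(&v);
	assert(v.lists[DEMO_LRU_ACTIVE_FILE].next == &v.lists[DEMO_LRU_ACTIVE_FILE]);
	return 0;
}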
@@ -4832,23 +4717,40 @@ out_free:
4832} 4717}
4833 4718
4834/* 4719/*
4835 * Helpers for freeing a vzalloc()ed mem_cgroup by RCU, 4720 * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
4836 * but in process context. The work_freeing structure is overlaid 4721 * but in process context. The work_freeing structure is overlaid
4837 * on the rcu_freeing structure, which itself is overlaid on memsw. 4722 * on the rcu_freeing structure, which itself is overlaid on memsw.
4838 */ 4723 */
4839static void vfree_work(struct work_struct *work) 4724static void free_work(struct work_struct *work)
4840{ 4725{
4841 struct mem_cgroup *memcg; 4726 struct mem_cgroup *memcg;
4727 int size = sizeof(struct mem_cgroup);
4842 4728
4843 memcg = container_of(work, struct mem_cgroup, work_freeing); 4729 memcg = container_of(work, struct mem_cgroup, work_freeing);
4844 vfree(memcg); 4730 /*
4731 * We need to make sure that (at least for now), the jump label
4732 * destruction code runs outside of the cgroup lock. This is because
4733 * get_online_cpus(), which is called from the static_branch update,
4734 * can't be called inside the cgroup_lock. cpusets are the ones
4735 * enforcing this dependency, so if they ever change, we might as well.
4736 *
4737 * schedule_work() will guarantee this happens. Be careful if you need
4738 * to move this code around, and make sure it is outside
4739 * the cgroup_lock.
4740 */
4741 disarm_sock_keys(memcg);
4742 if (size < PAGE_SIZE)
4743 kfree(memcg);
4744 else
4745 vfree(memcg);
4845} 4746}
4846static void vfree_rcu(struct rcu_head *rcu_head) 4747
4748static void free_rcu(struct rcu_head *rcu_head)
4847{ 4749{
4848 struct mem_cgroup *memcg; 4750 struct mem_cgroup *memcg;
4849 4751
4850 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); 4752 memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
4851 INIT_WORK(&memcg->work_freeing, vfree_work); 4753 INIT_WORK(&memcg->work_freeing, free_work);
4852 schedule_work(&memcg->work_freeing); 4754 schedule_work(&memcg->work_freeing);
4853} 4755}
4854 4756
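free_work() recovers the mem_cgroup from its embedded work_struct with container_of() and then frees it with kfree() or vfree() to match the kmalloc()/vzalloc() split mentioned in the updated comment; free_rcu() only bounces that free into process context so that disarm_sock_keys() never runs under cgroup_lock. A compressed userspace sketch of the container_of()-plus-matching-free pattern; the demo_* types and the 4096 constant standing in for PAGE_SIZE are illustrative only:

#include <stddef.h>
#include <stdlib.h>

#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_work {
	void (*func)(struct demo_work *work);
};

struct demo_memcg {
	char stats[64];
	struct demo_work work_freeing;	/* embedded, like memcg->work_freeing */
};

static void demo_free_work(struct demo_work *work)
{
	struct demo_memcg *memcg =
		demo_container_of(work, struct demo_memcg, work_freeing);

	if (sizeof(*memcg) < 4096)	/* stand-in for the PAGE_SIZE check */
		free(memcg);		/* kfree() path for small objects */
	else
		free(memcg);		/* vfree() path for page-spanning ones */
}

int main(void)
{
	struct demo_memcg *memcg = calloc(1, sizeof(*memcg));

	if (!memcg)
		return 1;
	memcg->work_freeing.func = demo_free_work;
	/* stand-in for schedule_work(): run the deferred free "later" */
	memcg->work_freeing.func(&memcg->work_freeing);
	return 0;
}

The kernel version does the same member-to-object recovery, just with the free deferred twice: an RCU grace period first, then the workqueue.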
@@ -4874,10 +4776,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
4874 free_mem_cgroup_per_zone_info(memcg, node); 4776 free_mem_cgroup_per_zone_info(memcg, node);
4875 4777
4876 free_percpu(memcg->stat); 4778 free_percpu(memcg->stat);
4877 if (sizeof(struct mem_cgroup) < PAGE_SIZE) 4779 call_rcu(&memcg->rcu_freeing, free_rcu);
4878 kfree_rcu(memcg, rcu_freeing);
4879 else
4880 call_rcu(&memcg->rcu_freeing, vfree_rcu);
4881} 4780}
4882 4781
4883static void mem_cgroup_get(struct mem_cgroup *memcg) 4782static void mem_cgroup_get(struct mem_cgroup *memcg)
@@ -5016,6 +4915,17 @@ mem_cgroup_create(struct cgroup *cont)
5016 memcg->move_charge_at_immigrate = 0; 4915 memcg->move_charge_at_immigrate = 0;
5017 mutex_init(&memcg->thresholds_lock); 4916 mutex_init(&memcg->thresholds_lock);
5018 spin_lock_init(&memcg->move_lock); 4917 spin_lock_init(&memcg->move_lock);
4918
4919 error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
4920 if (error) {
4921 /*
4922 * We call put now because our (and parent's) refcnts
4923 * are already in place. mem_cgroup_put() will internally
4924 * call __mem_cgroup_free, so return directly
4925 */
4926 mem_cgroup_put(memcg);
4927 return ERR_PTR(error);
4928 }
5019 return &memcg->css; 4929 return &memcg->css;
5020free_out: 4930free_out:
5021 __mem_cgroup_free(memcg); 4931 __mem_cgroup_free(memcg);
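The new memcg_init_kmem() call site runs after the memcg and its parent already hold references, so its error path drops a reference with mem_cgroup_put() rather than jumping to free_out, and the final put does the actual teardown, exactly as the added comment explains. A small userspace sketch of that put-on-error convention, using invented demo_* helpers rather than the kernel's refcounting API:

#include <stdio.h>
#include <stdlib.h>

struct demo_obj {
	int refcnt;
};

static void demo_get(struct demo_obj *obj)
{
	obj->refcnt++;
}

static void demo_put(struct demo_obj *obj)
{
	if (--obj->refcnt == 0) {
		printf("last reference dropped, freeing\n");
		free(obj);
	}
}

static int demo_init_extra(struct demo_obj *obj)
{
	(void)obj;
	return -1;	/* pretend the optional init step failed */
}

int main(void)
{
	struct demo_obj *obj = calloc(1, sizeof(*obj));

	if (!obj)
		return 1;
	demo_get(obj);			/* creation takes the first reference */
	if (demo_init_extra(obj)) {
		demo_put(obj);		/* error path: put, do not free directly */
		return 1;
	}
	demo_put(obj);
	return 0;
}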
@@ -5033,28 +4943,11 @@ static void mem_cgroup_destroy(struct cgroup *cont)
5033{ 4943{
5034 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 4944 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
5035 4945
5036 kmem_cgroup_destroy(cont); 4946 kmem_cgroup_destroy(memcg);
5037 4947
5038 mem_cgroup_put(memcg); 4948 mem_cgroup_put(memcg);
5039} 4949}
5040 4950
5041static int mem_cgroup_populate(struct cgroup_subsys *ss,
5042 struct cgroup *cont)
5043{
5044 int ret;
5045
5046 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
5047 ARRAY_SIZE(mem_cgroup_files));
5048
5049 if (!ret)
5050 ret = register_memsw_files(cont, ss);
5051
5052 if (!ret)
5053 ret = register_kmem_files(cont, ss);
5054
5055 return ret;
5056}
5057
5058#ifdef CONFIG_MMU 4951#ifdef CONFIG_MMU
5059/* Handlers for move charge at task migration. */ 4952/* Handlers for move charge at task migration. */
5060#define PRECHARGE_COUNT_AT_ONCE 256 4953#define PRECHARGE_COUNT_AT_ONCE 256
@@ -5147,7 +5040,7 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5147 return NULL; 5040 return NULL;
5148 if (PageAnon(page)) { 5041 if (PageAnon(page)) {
5149 /* we don't move shared anon */ 5042 /* we don't move shared anon */
5150 if (!move_anon() || page_mapcount(page) > 2) 5043 if (!move_anon())
5151 return NULL; 5044 return NULL;
5152 } else if (!move_file()) 5045 } else if (!move_file())
5153 /* we ignore mapcount for file pages */ 5046 /* we ignore mapcount for file pages */
@@ -5158,32 +5051,37 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5158 return page; 5051 return page;
5159} 5052}
5160 5053
5054#ifdef CONFIG_SWAP
5161static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, 5055static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5162 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5056 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5163{ 5057{
5164 int usage_count;
5165 struct page *page = NULL; 5058 struct page *page = NULL;
5166 swp_entry_t ent = pte_to_swp_entry(ptent); 5059 swp_entry_t ent = pte_to_swp_entry(ptent);
5167 5060
5168 if (!move_anon() || non_swap_entry(ent)) 5061 if (!move_anon() || non_swap_entry(ent))
5169 return NULL; 5062 return NULL;
5170 usage_count = mem_cgroup_count_swap_user(ent, &page); 5063 /*
5171 if (usage_count > 1) { /* we don't move shared anon */ 5064 * Because lookup_swap_cache() updates some statistics counter,
5172 if (page) 5065 * we call find_get_page() with swapper_space directly.
5173 put_page(page); 5066 */
5174 return NULL; 5067 page = find_get_page(&swapper_space, ent.val);
5175 }
5176 if (do_swap_account) 5068 if (do_swap_account)
5177 entry->val = ent.val; 5069 entry->val = ent.val;
5178 5070
5179 return page; 5071 return page;
5180} 5072}
5073#else
5074static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5075 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5076{
5077 return NULL;
5078}
5079#endif
5181 5080
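mc_handle_swap_pte() now exists only under CONFIG_SWAP, with a NULL-returning stub otherwise, and the lookup goes through find_get_page() on swapper_space so that, as the added comment notes, lookup_swap_cache()'s statistics are left alone. A tiny sketch of the compile-out-with-stub pattern, with an invented DEMO_CONFIG_SWAP switch and dummy lookup logic:

#include <stdio.h>

#define DEMO_CONFIG_SWAP 1	/* flip to 0 to mimic a !CONFIG_SWAP build */

#if DEMO_CONFIG_SWAP
static const char *demo_handle_swap(unsigned long slot)
{
	/* the real code looks the entry up in the swap cache without
	 * touching the lookup statistics */
	return slot == 42 ? "page found in swap cache" : NULL;
}
#else
static const char *demo_handle_swap(unsigned long slot)
{
	(void)slot;
	return NULL;	/* swap support compiled out: nothing to move */
}
#endif

int main(void)
{
	const char *res = demo_handle_swap(42);

	printf("%s\n", res ? res : "not found");
	return 0;
}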
5182static struct page *mc_handle_file_pte(struct vm_area_struct *vma, 5081static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5183 unsigned long addr, pte_t ptent, swp_entry_t *entry) 5082 unsigned long addr, pte_t ptent, swp_entry_t *entry)
5184{ 5083{
5185 struct page *page = NULL; 5084 struct page *page = NULL;
5186 struct inode *inode;
5187 struct address_space *mapping; 5085 struct address_space *mapping;
5188 pgoff_t pgoff; 5086 pgoff_t pgoff;
5189 5087
@@ -5192,7 +5090,6 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5192 if (!move_file()) 5090 if (!move_file())
5193 return NULL; 5091 return NULL;
5194 5092
5195 inode = vma->vm_file->f_path.dentry->d_inode;
5196 mapping = vma->vm_file->f_mapping; 5093 mapping = vma->vm_file->f_mapping;
5197 if (pte_none(ptent)) 5094 if (pte_none(ptent))
5198 pgoff = linear_page_index(vma, addr); 5095 pgoff = linear_page_index(vma, addr);
@@ -5491,8 +5388,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5491 if (!isolate_lru_page(page)) { 5388 if (!isolate_lru_page(page)) {
5492 pc = lookup_page_cgroup(page); 5389 pc = lookup_page_cgroup(page);
5493 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR, 5390 if (!mem_cgroup_move_account(page, HPAGE_PMD_NR,
5494 pc, mc.from, mc.to, 5391 pc, mc.from, mc.to)) {
5495 false)) {
5496 mc.precharge -= HPAGE_PMD_NR; 5392 mc.precharge -= HPAGE_PMD_NR;
5497 mc.moved_charge += HPAGE_PMD_NR; 5393 mc.moved_charge += HPAGE_PMD_NR;
5498 } 5394 }
@@ -5522,7 +5418,7 @@ retry:
5522 goto put; 5418 goto put;
5523 pc = lookup_page_cgroup(page); 5419 pc = lookup_page_cgroup(page);
5524 if (!mem_cgroup_move_account(page, 1, pc, 5420 if (!mem_cgroup_move_account(page, 1, pc,
5525 mc.from, mc.to, false)) { 5421 mc.from, mc.to)) {
5526 mc.precharge--; 5422 mc.precharge--;
5527 /* we uncharge from mc.from later. */ 5423 /* we uncharge from mc.from later. */
5528 mc.moved_charge++; 5424 mc.moved_charge++;
@@ -5533,8 +5429,7 @@ put: /* get_mctgt_type() gets the page */
5533 break; 5429 break;
5534 case MC_TARGET_SWAP: 5430 case MC_TARGET_SWAP:
5535 ent = target.ent; 5431 ent = target.ent;
5536 if (!mem_cgroup_move_swap_account(ent, 5432 if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5537 mc.from, mc.to, false)) {
5538 mc.precharge--; 5433 mc.precharge--;
5539 /* we fixup refcnts and charges later. */ 5434 /* we fixup refcnts and charges later. */
5540 mc.moved_swap++; 5435 mc.moved_swap++;
@@ -5610,7 +5505,6 @@ static void mem_cgroup_move_task(struct cgroup *cont,
5610 if (mm) { 5505 if (mm) {
5611 if (mc.to) 5506 if (mc.to)
5612 mem_cgroup_move_charge(mm); 5507 mem_cgroup_move_charge(mm);
5613 put_swap_token(mm);
5614 mmput(mm); 5508 mmput(mm);
5615 } 5509 }
5616 if (mc.to) 5510 if (mc.to)
@@ -5638,12 +5532,13 @@ struct cgroup_subsys mem_cgroup_subsys = {
5638 .create = mem_cgroup_create, 5532 .create = mem_cgroup_create,
5639 .pre_destroy = mem_cgroup_pre_destroy, 5533 .pre_destroy = mem_cgroup_pre_destroy,
5640 .destroy = mem_cgroup_destroy, 5534 .destroy = mem_cgroup_destroy,
5641 .populate = mem_cgroup_populate,
5642 .can_attach = mem_cgroup_can_attach, 5535 .can_attach = mem_cgroup_can_attach,
5643 .cancel_attach = mem_cgroup_cancel_attach, 5536 .cancel_attach = mem_cgroup_cancel_attach,
5644 .attach = mem_cgroup_move_task, 5537 .attach = mem_cgroup_move_task,
5538 .base_cftypes = mem_cgroup_files,
5645 .early_init = 0, 5539 .early_init = 0,
5646 .use_id = 1, 5540 .use_id = 1,
5541 .__DEPRECATED_clear_css_refs = true,
5647}; 5542};
5648 5543
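With mem_cgroup_populate() gone, the subsystem descriptor simply points the cgroup core at the terminated file table via .base_cftypes, and callbacks the subsystem no longer provides are left out of the initializer. A minimal userspace sketch of such a descriptor; the demo_* names and the core-checks-before-calling behaviour are assumptions for illustration, not the real cgroup_subsys contract:

#include <stdio.h>

struct demo_cftype {
	const char *name;			/* NULL name terminates the table */
};

static const struct demo_cftype demo_base_files[] = {
	{ .name = "stat" },
	{ .name = "failcnt" },
	{ .name = NULL },
};

struct demo_subsys {
	const char *name;
	void (*populate)(void);			/* optional; left NULL here, as in the patch */
	const struct demo_cftype *base_cftypes;	/* core registers these on its own */
};

static const struct demo_subsys demo_memory_subsys = {
	.name		= "memory",
	.base_cftypes	= demo_base_files,
};

static void demo_core_online(const struct demo_subsys *ss)
{
	const struct demo_cftype *cft;

	for (cft = ss->base_cftypes; cft->name; cft++)
		printf("%s: created control file %s\n", ss->name, cft->name);
	if (ss->populate)			/* only invoke callbacks the subsystem set */
		ss->populate();
}

int main(void)
{
	demo_core_online(&demo_memory_subsys);
	return 0;
}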
5649#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 5544#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP