author		Balbir Singh <balbir@linux.vnet.ibm.com>	2009-09-23 18:56:37 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-09-24 10:20:59 -0400
commit		f64c3f54940d6929a2b6dcffaab942bd62be2e66 (patch)
tree		7b3587700b08639970580be6c87f36df80ca8c74
parent		296c81d89f4f14269f7346f81442910158c0a83a (diff)
memory controller: soft limit organize cgroups
Organize cgroups over soft limit in a RB-Tree

Introduce an RB-Tree for storing memory cgroups that are over their soft
limit.  The overall goal is to

1. Add a memory cgroup to the RB-Tree when the soft limit is exceeded.
   We are careful about updates, updates take place only after a
   particular time interval has passed
2. We remove the node from the RB-Tree when the usage goes below the
   soft limit

The next set of patches will exploit the RB-Tree to get the group that is
over its soft limit by the largest amount and reclaim from it, when we
face memory contention.

[hugh.dickins@tiscali.co.uk: CONFIG_CGROUP_MEM_RES_CTLR=y CONFIG_PREEMPT=y fails to boot]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	include/linux/res_counter.h	  6
-rw-r--r--	kernel/res_counter.c		 18
-rw-r--r--	mm/memcontrol.c			300
3 files changed, 277 insertions, 47 deletions
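Before the patch itself, here is a minimal userspace sketch of the ordering idea the changelog describes: cgroups that exceed their soft limit are kept sorted by how far they exceed it, so the worst offender can be picked first under memory contention. This is illustrative only, not kernel code; it mirrors the descent used by mem_cgroup_insert_exceeded() below, but substitutes a plain unbalanced binary search tree and a made-up "struct group" for the kernel's rbtree and mem_cgroup_per_zone.

/*
 * Illustrative sketch (not kernel code): order over-soft-limit groups by
 * their excess; the rightmost node is the largest offender. The types and
 * sample values are invented for this example.
 */
#include <stdio.h>
#include <stdlib.h>

struct group {
	const char *name;
	unsigned long long usage_in_excess;	/* usage - soft limit */
	struct group *left, *right;
};

/* Same descent as the rbtree insert: smaller excess left, ties go right. */
static void insert_exceeded(struct group **root, struct group *g)
{
	while (*root) {
		if (g->usage_in_excess < (*root)->usage_in_excess)
			root = &(*root)->left;
		else
			root = &(*root)->right;
	}
	*root = g;
}

/* The group over its soft limit by the largest amount is the rightmost node. */
static struct group *largest_excess(struct group *root)
{
	if (!root)
		return NULL;
	while (root->right)
		root = root->right;
	return root;
}

int main(void)
{
	struct group a = { "A", 4096, NULL, NULL };
	struct group b = { "B", 1 << 20, NULL, NULL };
	struct group c = { "C", 512, NULL, NULL };
	struct group *root = NULL;

	insert_exceeded(&root, &a);
	insert_exceeded(&root, &b);
	insert_exceeded(&root, &c);

	printf("reclaim first from %s (%llu bytes over)\n",
	       largest_excess(root)->name,
	       largest_excess(root)->usage_in_excess);
	return 0;
}

This prints "reclaim first from B", the group with the largest excess; the kernel patch additionally rebalances the tree (rb_insert_color) and keys it per node and per zone.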
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index fcb9884df618..731af71cddc9 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -114,7 +114,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
 int __must_check res_counter_charge_locked(struct res_counter *counter,
 		unsigned long val);
 int __must_check res_counter_charge(struct res_counter *counter,
-		unsigned long val, struct res_counter **limit_fail_at);
+		unsigned long val, struct res_counter **limit_fail_at,
+		struct res_counter **soft_limit_at);
 
 /*
  * uncharge - tell that some portion of the resource is released
@@ -127,7 +128,8 @@ int __must_check res_counter_charge(struct res_counter *counter,
  */
 
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
-void res_counter_uncharge(struct res_counter *counter, unsigned long val);
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+				bool *was_soft_limit_excess);
 
 static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
 {
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..88faec23e833 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -37,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 }
 
 int res_counter_charge(struct res_counter *counter, unsigned long val,
-			struct res_counter **limit_fail_at)
+			struct res_counter **limit_fail_at,
+			struct res_counter **soft_limit_fail_at)
 {
 	int ret;
 	unsigned long flags;
 	struct res_counter *c, *u;
 
 	*limit_fail_at = NULL;
+	if (soft_limit_fail_at)
+		*soft_limit_fail_at = NULL;
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
 		ret = res_counter_charge_locked(c, val);
+		/*
+		 * With soft limits, we return the highest ancestor
+		 * that exceeds its soft limit
+		 */
+		if (soft_limit_fail_at &&
+			!res_counter_soft_limit_check_locked(c))
+			*soft_limit_fail_at = c;
 		spin_unlock(&c->lock);
 		if (ret < 0) {
 			*limit_fail_at = c;
@@ -75,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
 	counter->usage -= val;
 }
 
-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+				bool *was_soft_limit_excess)
 {
 	unsigned long flags;
 	struct res_counter *c;
@@ -83,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
+		if (was_soft_limit_excess)
+			*was_soft_limit_excess =
+				!res_counter_soft_limit_check_locked(c);
 		res_counter_uncharge_locked(c, val);
 		spin_unlock(&c->lock);
 	}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4ad3e6be045d..0ed325943cd1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/limits.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -54,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #endif
 
 static DEFINE_MUTEX(memcg_tasklist);	/* can be hold under cgroup_mutex */
+#define SOFTLIMIT_EVENTS_THRESH (1000)
 
 /*
  * Statistics for memory cgroup.
@@ -67,6 +69,7 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_MAPPED_FILE,	/* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
+	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
 
 	MEM_CGROUP_STAT_NSTATS,
 };
@@ -79,6 +82,20 @@ struct mem_cgroup_stat {
 	struct mem_cgroup_stat_cpu cpustat[0];
 };
 
+static inline void
+__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
+				enum mem_cgroup_stat_index idx)
+{
+	stat->count[idx] = 0;
+}
+
+static inline s64
+__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
+				enum mem_cgroup_stat_index idx)
+{
+	return stat->count[idx];
+}
+
 /*
  * For accounting under irq disable, no need for increment preempt count.
  */
@@ -118,6 +135,10 @@ struct mem_cgroup_per_zone {
 	unsigned long		count[NR_LRU_LISTS];
 
 	struct zone_reclaim_stat reclaim_stat;
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -131,6 +152,26 @@ struct mem_cgroup_lru_info {
 };
 
 /*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
+/*
  * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -215,6 +256,150 @@ static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 
+static struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+static struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem = pc->mem_cgroup;
+	int nid = page_cgroup_nid(pc);
+	int zid = page_cgroup_zid(pc);
+
+	if (!mem)
+		return NULL;
+
+	return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+	spin_lock(&mctz->lock);
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+	spin_unlock(&mctz->lock);
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+	spin_unlock(&mctz->lock);
+}
+
+static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
+{
+	bool ret = false;
+	int cpu;
+	s64 val;
+	struct mem_cgroup_stat_cpu *cpustat;
+
+	cpu = get_cpu();
+	cpustat = &mem->stat.cpustat[cpu];
+	val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
+	if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
+		__mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
+		ret = true;
+	}
+	put_cpu();
+	return ret;
+}
+
+static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+{
+	unsigned long long prev_usage_in_excess, new_usage_in_excess;
+	bool updated_tree = false;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * We do updates in lazy mode, mem's are removed
+	 * lazily from the per-zone, per-node rb tree
+	 */
+	prev_usage_in_excess = mz->usage_in_excess;
+
+	new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+	if (prev_usage_in_excess) {
+		mem_cgroup_remove_exceeded(mem, mz, mctz);
+		updated_tree = true;
+	}
+	if (!new_usage_in_excess)
+		goto done;
+	mem_cgroup_insert_exceeded(mem, mz, mctz);
+
+done:
+	if (updated_tree) {
+		spin_lock(&mctz->lock);
+		mz->usage_in_excess = new_usage_in_excess;
+		spin_unlock(&mctz->lock);
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node_state(node, N_POSSIBLE) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(mem, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(mem, mz, mctz);
+		}
+	}
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
 					 bool charge)
@@ -236,28 +421,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	else
 		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
 	put_cpu();
 }
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
-{
-	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
-}
-
-static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
-{
-	struct mem_cgroup *mem = pc->mem_cgroup;
-	int nid = page_cgroup_nid(pc);
-	int zid = page_cgroup_zid(pc);
-
-	if (!mem)
-		return NULL;
-
-	return mem_cgroup_zoneinfo(mem, nid, zid);
-}
-
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
@@ -972,11 +1139,11 @@ done:
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			gfp_t gfp_mask, struct mem_cgroup **memcg,
-			bool oom)
+			bool oom, struct page *page)
 {
-	struct mem_cgroup *mem, *mem_over_limit;
+	struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res;
+	struct res_counter *fail_res, *soft_fail_res = NULL;
 
 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 		/* Don't account this! */
@@ -1006,16 +1173,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		int ret;
 		bool noswap = false;
 
-		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
+						&soft_fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
 				break;
 			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-							&fail_res);
+							&fail_res, NULL);
 			if (likely(!ret))
 				break;
 			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
 			noswap = true;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									memsw);
@@ -1053,13 +1221,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			goto nomem;
 		}
 	}
+	/*
+	 * Insert just the ancestor, we should trickle down to the correct
+	 * cgroup for reclaim, since the other nodes will be below their
+	 * soft limit
+	 */
+	if (soft_fail_res) {
+		mem_over_soft_limit =
+			mem_cgroup_from_res_counter(soft_fail_res, res);
+		if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
+			mem_cgroup_update_tree(mem_over_soft_limit, page);
+	}
 	return 0;
 nomem:
 	css_put(&mem->css);
 	return -ENOMEM;
 }
 
-
 /*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1126,9 +1304,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
 		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
 		css_put(&mem->css);
 		return;
 	}
@@ -1205,7 +1383,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	if (pc->mem_cgroup != from)
 		goto out;
 
-	res_counter_uncharge(&from->res, PAGE_SIZE);
+	res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
@@ -1225,7 +1403,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	}
 
 	if (do_swap_account)
-		res_counter_uncharge(&from->memsw, PAGE_SIZE);
+		res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
 	css_put(&from->css);
 
 	css_get(&to->css);
@@ -1265,7 +1443,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	parent = mem_cgroup_from_cont(pcg);
 
 
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
 	if (ret || !parent)
 		return ret;
 
@@ -1295,9 +1473,9 @@ uncharge:
 	/* drop extra refcnt by try_charge() */
 	css_put(&parent->css);
 	/* uncharge if move fails */
-	res_counter_uncharge(&parent->res, PAGE_SIZE);
+	res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
 	if (do_swap_account)
-		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+		res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
 	return ret;
 }
 
@@ -1322,7 +1500,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	prefetchw(pc);
 
 	mem = memcg;
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
 	if (ret || !mem)
 		return ret;
 
@@ -1441,14 +1619,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 	if (!mem)
 		goto charge_cur_mm;
 	*ptr = mem;
-	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
 	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
 	if (unlikely(!mm))
 		mm = &init_mm;
-	return __mem_cgroup_try_charge(mm, mask, ptr, true);
+	return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
 }
 
 static void
@@ -1486,7 +1664,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 		 * This recorded memcg can be obsolete one. So, avoid
 		 * calling css_tryget
 		 */
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
 		mem_cgroup_put(memcg);
 	}
 	rcu_read_unlock();
@@ -1511,9 +1689,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 		return;
 	if (!mem)
 		return;
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
 	if (do_swap_account)
-		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
 	css_put(&mem->css);
 }
 
@@ -1527,6 +1705,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
+	bool soft_limit_excess = false;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1565,9 +1744,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
 	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -1581,6 +1760,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
 
+	if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+		mem_cgroup_update_tree(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		css_put(&mem->css);
@@ -1656,7 +1837,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
 		 * We uncharge this because swap is freed.
 		 * This memcg can be obsolete one. We avoid calling css_tryget
 		 */
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
 		mem_cgroup_put(memcg);
 	}
 	rcu_read_unlock();
@@ -1685,7 +1866,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 	unlock_page_cgroup(pc);
 
 	if (mem) {
-		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+						page);
 		css_put(&mem->css);
 	}
 	*ptr = mem;
@@ -2194,6 +2376,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 		res_counter_reset_failcnt(&mem->memsw);
 		break;
 	}
+
 	return 0;
 }
 
@@ -2489,6 +2672,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 		mz = &pn->zoneinfo[zone];
 		for_each_lru(l)
 			INIT_LIST_HEAD(&mz->lists[l]);
+		mz->usage_in_excess = 0;
 	}
 	return 0;
 }
@@ -2534,6 +2718,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
 {
 	int node;
 
+	mem_cgroup_remove_from_trees(mem);
 	free_css_id(&mem_cgroup_subsys, &mem->css);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -2582,6 +2767,31 @@ static void __init enable_swap_cgroup(void)
 }
 #endif
 
+static int mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node_state(node, N_POSSIBLE) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		if (!rtpn)
+			return 1;
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+	return 0;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
@@ -2596,11 +2806,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	for_each_node_state(node, N_POSSIBLE)
 		if (alloc_mem_cgroup_per_zone_info(mem, node))
 			goto free_out;
+
 	/* root ? */
 	if (cont->parent == NULL) {
 		enable_swap_cgroup();
 		parent = NULL;
 		root_mem_cgroup = mem;
+		if (mem_cgroup_soft_limit_tree_init())
+			goto free_out;
+
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
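The changelog's "updates take place only after a particular time interval has passed" is implemented above as an event threshold rather than wall-clock time: mem_cgroup_soft_limit_check() only reports true once more than SOFTLIMIT_EVENTS_THRESH page-in/page-out events have accumulated on the current CPU. The following is a minimal userspace sketch of that throttling, not the kernel code; the single global counter and the loop stand in for the per-cpu MEM_CGROUP_STAT_EVENTS statistic and the charge path, and the counts are made up.

/*
 * Illustrative sketch (not kernel code): only permit an RB-tree update
 * once enough charge/uncharge events have accumulated, then reset the
 * counter, as mem_cgroup_soft_limit_check() does in the patch.
 */
#include <stdbool.h>
#include <stdio.h>

#define SOFTLIMIT_EVENTS_THRESH (1000)

static long events;	/* stand-in for the per-cpu MEM_CGROUP_STAT_EVENTS counter */

static bool soft_limit_check(void)
{
	if (events > SOFTLIMIT_EVENTS_THRESH) {
		events = 0;	/* reset, like __mem_cgroup_stat_reset_safe() */
		return true;	/* caller may now update the RB-Tree */
	}
	return false;
}

int main(void)
{
	long updates = 0;

	for (long i = 0; i < 5000; i++) {	/* pretend each iteration is a page charge */
		events++;
		if (soft_limit_check())
			updates++;
	}
	printf("%ld events -> %ld tree updates\n", 5000L, updates);
	return 0;
}

Run as-is this reports 4 tree updates for 5000 simulated charges, which is the point of the threshold: the RB-Tree is touched occasionally rather than on every charge or uncharge.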