Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  737
1 file changed, 653 insertions, 84 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9b10d8753784..e2b98a6875c0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/limits.h> 30#include <linux/limits.h>
31#include <linux/mutex.h> 31#include <linux/mutex.h>
32#include <linux/rbtree.h>
32#include <linux/slab.h> 33#include <linux/slab.h>
33#include <linux/swap.h> 34#include <linux/swap.h>
34#include <linux/spinlock.h> 35#include <linux/spinlock.h>
@@ -43,6 +44,7 @@
43 44
44struct cgroup_subsys mem_cgroup_subsys __read_mostly; 45struct cgroup_subsys mem_cgroup_subsys __read_mostly;
45#define MEM_CGROUP_RECLAIM_RETRIES 5 46#define MEM_CGROUP_RECLAIM_RETRIES 5
47struct mem_cgroup *root_mem_cgroup __read_mostly;
46 48
47#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 49#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
48/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ 50/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
53#endif 55#endif
54 56
55static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ 57static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
58#define SOFTLIMIT_EVENTS_THRESH (1000)
56 59
57/* 60/*
58 * Statistics for memory cgroup. 61 * Statistics for memory cgroup.
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index {
66 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ 69 MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */
67 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ 70 MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */
68 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ 71 MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */
72 MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
73 MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
69 74
70 MEM_CGROUP_STAT_NSTATS, 75 MEM_CGROUP_STAT_NSTATS,
71}; 76};
@@ -78,6 +83,20 @@ struct mem_cgroup_stat {
78 struct mem_cgroup_stat_cpu cpustat[0]; 83 struct mem_cgroup_stat_cpu cpustat[0];
79}; 84};
80 85
86static inline void
87__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
88 enum mem_cgroup_stat_index idx)
89{
90 stat->count[idx] = 0;
91}
92
93static inline s64
94__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
95 enum mem_cgroup_stat_index idx)
96{
97 return stat->count[idx];
98}
99
81/* 100/*
82 * For accounting under irq disable, no need for increment preempt count. 101 * For accounting under irq disable, no need for increment preempt count.
83 */ 102 */
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone {
117 unsigned long count[NR_LRU_LISTS]; 136 unsigned long count[NR_LRU_LISTS];
118 137
119 struct zone_reclaim_stat reclaim_stat; 138 struct zone_reclaim_stat reclaim_stat;
139 struct rb_node tree_node; /* RB tree node */
140 unsigned long long usage_in_excess;/* Set to the value by which */
141 /* the soft limit is exceeded*/
142 bool on_tree;
143 struct mem_cgroup *mem; /* Back pointer, we cannot */
144 /* use container_of */
120}; 145};
121/* Macro for accessing counter */ 146/* Macro for accessing counter */
122#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 147#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info {
130}; 155};
131 156
132/* 157/*
158 * Cgroups above their limits are maintained in an RB-tree, independent of
159 * their hierarchy representation
160 */
161
162struct mem_cgroup_tree_per_zone {
163 struct rb_root rb_root;
164 spinlock_t lock;
165};
166
167struct mem_cgroup_tree_per_node {
168 struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
169};
170
171struct mem_cgroup_tree {
172 struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
173};
174
175static struct mem_cgroup_tree soft_limit_tree __read_mostly;
176
177/*
133 * The memory controller data structure. The memory controller controls both 178 * The memory controller data structure. The memory controller controls both
134 * page cache and RSS per cgroup. We would eventually like to provide 179 * page cache and RSS per cgroup. We would eventually like to provide
135 * statistics based on the statistics developed by Rik Van Riel for clock-pro, 180 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -186,6 +231,13 @@ struct mem_cgroup {
186 struct mem_cgroup_stat stat; 231 struct mem_cgroup_stat stat;
187}; 232};
188 233
234/*
235 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
236 * limit reclaim to prevent infinite loops, if they ever occur.
237 */
238#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
239#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2)
240
189enum charge_type { 241enum charge_type {
190 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 242 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
191 MEM_CGROUP_CHARGE_TYPE_MAPPED, 243 MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -200,13 +252,8 @@ enum charge_type {
200#define PCGF_CACHE (1UL << PCG_CACHE) 252#define PCGF_CACHE (1UL << PCG_CACHE)
201#define PCGF_USED (1UL << PCG_USED) 253#define PCGF_USED (1UL << PCG_USED)
202#define PCGF_LOCK (1UL << PCG_LOCK) 254#define PCGF_LOCK (1UL << PCG_LOCK)
203static const unsigned long
204pcg_default_flags[NR_CHARGE_TYPE] = {
205 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
206 PCGF_USED | PCGF_LOCK, /* Anon */
207 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
208 0, /* FORCE */
209};
255/* Not used, but added here for completeness */
256#define PCGF_ACCT (1UL << PCG_ACCT)
210 257
211/* for encoding cft->private value on file */ 258/* for encoding cft->private value on file */
212#define _MEM (0) 259#define _MEM (0)
@@ -215,15 +262,241 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
215#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) 262#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
216#define MEMFILE_ATTR(val) ((val) & 0xffff) 263#define MEMFILE_ATTR(val) ((val) & 0xffff)
217 264
265/*
266 * Reclaim flags for mem_cgroup_hierarchical_reclaim
267 */
268#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
269#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
270#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
271#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
272#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
273#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
274
218static void mem_cgroup_get(struct mem_cgroup *mem); 275static void mem_cgroup_get(struct mem_cgroup *mem);
219static void mem_cgroup_put(struct mem_cgroup *mem); 276static void mem_cgroup_put(struct mem_cgroup *mem);
220static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); 277static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
221 278
279static struct mem_cgroup_per_zone *
280mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
281{
282 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
283}
284
285static struct mem_cgroup_per_zone *
286page_cgroup_zoneinfo(struct page_cgroup *pc)
287{
288 struct mem_cgroup *mem = pc->mem_cgroup;
289 int nid = page_cgroup_nid(pc);
290 int zid = page_cgroup_zid(pc);
291
292 if (!mem)
293 return NULL;
294
295 return mem_cgroup_zoneinfo(mem, nid, zid);
296}
297
298static struct mem_cgroup_tree_per_zone *
299soft_limit_tree_node_zone(int nid, int zid)
300{
301 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
302}
303
304static struct mem_cgroup_tree_per_zone *
305soft_limit_tree_from_page(struct page *page)
306{
307 int nid = page_to_nid(page);
308 int zid = page_zonenum(page);
309
310 return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
311}
312
313static void
314__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
315 struct mem_cgroup_per_zone *mz,
316 struct mem_cgroup_tree_per_zone *mctz)
317{
318 struct rb_node **p = &mctz->rb_root.rb_node;
319 struct rb_node *parent = NULL;
320 struct mem_cgroup_per_zone *mz_node;
321
322 if (mz->on_tree)
323 return;
324
325 mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
326 while (*p) {
327 parent = *p;
328 mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
329 tree_node);
330 if (mz->usage_in_excess < mz_node->usage_in_excess)
331 p = &(*p)->rb_left;
332 /*
333 * We can't tell apart mem cgroups that exceed their soft
334 * limit by the same amount; ties go to the right
335 */
336 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
337 p = &(*p)->rb_right;
338 }
339 rb_link_node(&mz->tree_node, parent, p);
340 rb_insert_color(&mz->tree_node, &mctz->rb_root);
341 mz->on_tree = true;
342}
343
344static void
345__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
346 struct mem_cgroup_per_zone *mz,
347 struct mem_cgroup_tree_per_zone *mctz)
348{
349 if (!mz->on_tree)
350 return;
351 rb_erase(&mz->tree_node, &mctz->rb_root);
352 mz->on_tree = false;
353}
354
355static void
356mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
357 struct mem_cgroup_per_zone *mz,
358 struct mem_cgroup_tree_per_zone *mctz)
359{
360 spin_lock(&mctz->lock);
361 __mem_cgroup_insert_exceeded(mem, mz, mctz);
362 spin_unlock(&mctz->lock);
363}
364
365static void
366mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
367 struct mem_cgroup_per_zone *mz,
368 struct mem_cgroup_tree_per_zone *mctz)
369{
370 spin_lock(&mctz->lock);
371 __mem_cgroup_remove_exceeded(mem, mz, mctz);
372 spin_unlock(&mctz->lock);
373}
374
375static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
376{
377 bool ret = false;
378 int cpu;
379 s64 val;
380 struct mem_cgroup_stat_cpu *cpustat;
381
382 cpu = get_cpu();
383 cpustat = &mem->stat.cpustat[cpu];
384 val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
385 if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
386 __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
387 ret = true;
388 }
389 put_cpu();
390 return ret;
391}
392
393static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
394{
395 unsigned long long prev_usage_in_excess, new_usage_in_excess;
396 bool updated_tree = false;
397 struct mem_cgroup_per_zone *mz;
398 struct mem_cgroup_tree_per_zone *mctz;
399
400 mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
401 mctz = soft_limit_tree_from_page(page);
402
403 /*
404 * We do updates in lazy mode: mem cgroups are removed
405 * lazily from the per-zone, per-node rb tree
406 */
407 prev_usage_in_excess = mz->usage_in_excess;
408
409 new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
410 if (prev_usage_in_excess) {
411 mem_cgroup_remove_exceeded(mem, mz, mctz);
412 updated_tree = true;
413 }
414 if (!new_usage_in_excess)
415 goto done;
416 mem_cgroup_insert_exceeded(mem, mz, mctz);
417
418done:
419 if (updated_tree) {
420 spin_lock(&mctz->lock);
421 mz->usage_in_excess = new_usage_in_excess;
422 spin_unlock(&mctz->lock);
423 }
424}
425
426static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
427{
428 int node, zone;
429 struct mem_cgroup_per_zone *mz;
430 struct mem_cgroup_tree_per_zone *mctz;
431
432 for_each_node_state(node, N_POSSIBLE) {
433 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
434 mz = mem_cgroup_zoneinfo(mem, node, zone);
435 mctz = soft_limit_tree_node_zone(node, zone);
436 mem_cgroup_remove_exceeded(mem, mz, mctz);
437 }
438 }
439}
440
441static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
442{
443 return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
444}
445
446static struct mem_cgroup_per_zone *
447__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
448{
449 struct rb_node *rightmost = NULL;
450 struct mem_cgroup_per_zone *mz = NULL;
451
452retry:
453 rightmost = rb_last(&mctz->rb_root);
454 if (!rightmost)
455 goto done; /* Nothing to reclaim from */
456
457 mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
458 /*
459 * Remove the node now but someone else can add it back,
460 * we will add it back at the end of reclaim to its correct
461 * position in the tree.
462 */
463 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
464 if (!res_counter_soft_limit_excess(&mz->mem->res) ||
465 !css_tryget(&mz->mem->css))
466 goto retry;
467done:
468 return mz;
469}
470
471static struct mem_cgroup_per_zone *
472mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
473{
474 struct mem_cgroup_per_zone *mz;
475
476 spin_lock(&mctz->lock);
477 mz = __mem_cgroup_largest_soft_limit_node(mctz);
478 spin_unlock(&mctz->lock);
479 return mz;
480}
481
482static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
483 bool charge)
484{
485 int val = (charge) ? 1 : -1;
486 struct mem_cgroup_stat *stat = &mem->stat;
487 struct mem_cgroup_stat_cpu *cpustat;
488 int cpu = get_cpu();
489
490 cpustat = &stat->cpustat[cpu];
491 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
492 put_cpu();
493}
494
222static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 495static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
223 struct page_cgroup *pc, 496 struct page_cgroup *pc,
224 bool charge) 497 bool charge)
225{ 498{
226 int val = (charge)? 1 : -1;
499 int val = (charge) ? 1 : -1;
227 struct mem_cgroup_stat *stat = &mem->stat; 500 struct mem_cgroup_stat *stat = &mem->stat;
228 struct mem_cgroup_stat_cpu *cpustat; 501 struct mem_cgroup_stat_cpu *cpustat;
229 int cpu = get_cpu(); 502 int cpu = get_cpu();
@@ -240,28 +513,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
240 else 513 else
241 __mem_cgroup_stat_add_safe(cpustat, 514 __mem_cgroup_stat_add_safe(cpustat,
242 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 515 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
516 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
243 put_cpu(); 517 put_cpu();
244} 518}
245 519
246static struct mem_cgroup_per_zone *
247mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
248{
249 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
250}
251
252static struct mem_cgroup_per_zone *
253page_cgroup_zoneinfo(struct page_cgroup *pc)
254{
255 struct mem_cgroup *mem = pc->mem_cgroup;
256 int nid = page_cgroup_nid(pc);
257 int zid = page_cgroup_zid(pc);
258
259 if (!mem)
260 return NULL;
261
262 return mem_cgroup_zoneinfo(mem, nid, zid);
263}
264
265static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, 520static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
266 enum lru_list idx) 521 enum lru_list idx)
267{ 522{
@@ -354,6 +609,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
354 return ret; 609 return ret;
355} 610}
356 611
612static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
613{
614 return (mem == root_mem_cgroup);
615}
616
357/* 617/*
358 * Following LRU functions are allowed to be used without PCG_LOCK. 618 * Following LRU functions are allowed to be used without PCG_LOCK.
359 * Operations are called by routine of global LRU independently from memcg. 619 * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +631,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
371void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) 631void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
372{ 632{
373 struct page_cgroup *pc; 633 struct page_cgroup *pc;
374 struct mem_cgroup *mem;
375 struct mem_cgroup_per_zone *mz; 634 struct mem_cgroup_per_zone *mz;
376 635
377 if (mem_cgroup_disabled()) 636 if (mem_cgroup_disabled())
378 return; 637 return;
379 pc = lookup_page_cgroup(page); 638 pc = lookup_page_cgroup(page);
380 /* can happen while we handle swapcache. */ 639 /* can happen while we handle swapcache. */
381 if (list_empty(&pc->lru) || !pc->mem_cgroup)
640 if (!TestClearPageCgroupAcctLRU(pc))
382 return; 641 return;
642 VM_BUG_ON(!pc->mem_cgroup);
383 /* 643 /*
384 * We don't check PCG_USED bit. It's cleared when the "page" is finally 644 * We don't check PCG_USED bit. It's cleared when the "page" is finally
385 * removed from global LRU. 645 * removed from global LRU.
386 */ 646 */
387 mz = page_cgroup_zoneinfo(pc); 647 mz = page_cgroup_zoneinfo(pc);
388 mem = pc->mem_cgroup;
389 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 648 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
649 if (mem_cgroup_is_root(pc->mem_cgroup))
650 return;
651 VM_BUG_ON(list_empty(&pc->lru));
390 list_del_init(&pc->lru); 652 list_del_init(&pc->lru);
391 return; 653 return;
392} 654}
@@ -410,8 +672,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
410 * For making pc->mem_cgroup visible, insert smp_rmb() here. 672 * For making pc->mem_cgroup visible, insert smp_rmb() here.
411 */ 673 */
412 smp_rmb(); 674 smp_rmb();
413 /* unused page is not rotated. */
414 if (!PageCgroupUsed(pc))
675 /* unused or root page is not rotated. */
676 if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
415 return; 677 return;
416 mz = page_cgroup_zoneinfo(pc); 678 mz = page_cgroup_zoneinfo(pc);
417 list_move(&pc->lru, &mz->lists[lru]); 679 list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +687,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
425 if (mem_cgroup_disabled()) 687 if (mem_cgroup_disabled())
426 return; 688 return;
427 pc = lookup_page_cgroup(page); 689 pc = lookup_page_cgroup(page);
690 VM_BUG_ON(PageCgroupAcctLRU(pc));
428 /* 691 /*
429 * Used bit is set without atomic ops but after smp_wmb(). 692 * Used bit is set without atomic ops but after smp_wmb().
430 * For making pc->mem_cgroup visible, insert smp_rmb() here. 693 * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +698,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
435 698
436 mz = page_cgroup_zoneinfo(pc); 699 mz = page_cgroup_zoneinfo(pc);
437 MEM_CGROUP_ZSTAT(mz, lru) += 1; 700 MEM_CGROUP_ZSTAT(mz, lru) += 1;
701 SetPageCgroupAcctLRU(pc);
702 if (mem_cgroup_is_root(pc->mem_cgroup))
703 return;
438 list_add(&pc->lru, &mz->lists[lru]); 704 list_add(&pc->lru, &mz->lists[lru]);
439} 705}
440 706
@@ -469,7 +735,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
469 735
470 spin_lock_irqsave(&zone->lru_lock, flags); 736 spin_lock_irqsave(&zone->lru_lock, flags);
471 /* link when the page is linked to LRU but page_cgroup isn't */ 737 /* link when the page is linked to LRU but page_cgroup isn't */
472 if (PageLRU(page) && list_empty(&pc->lru))
738 if (PageLRU(page) && !PageCgroupAcctLRU(pc))
473 mem_cgroup_add_lru_list(page, page_lru(page)); 739 mem_cgroup_add_lru_list(page, page_lru(page));
474 spin_unlock_irqrestore(&zone->lru_lock, flags); 740 spin_unlock_irqrestore(&zone->lru_lock, flags);
475} 741}
@@ -855,28 +1121,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
855 * If shrink==true, for avoiding to free too much, this returns immedieately. 1121 * If shrink==true, for avoiding to free too much, this returns immedieately.
856 */ 1122 */
857static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 1123static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
858 gfp_t gfp_mask, bool noswap, bool shrink)
1124 struct zone *zone,
1125 gfp_t gfp_mask,
1126 unsigned long reclaim_options)
859{ 1127{
860 struct mem_cgroup *victim; 1128 struct mem_cgroup *victim;
861 int ret, total = 0; 1129 int ret, total = 0;
862 int loop = 0; 1130 int loop = 0;
1131 bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
1132 bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
1133 bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
1134 unsigned long excess = mem_cgroup_get_excess(root_mem);
863 1135
864 /* If memsw_is_minimum==1, swap-out is of-no-use. */ 1136 /* If memsw_is_minimum==1, swap-out is of-no-use. */
865 if (root_mem->memsw_is_minimum) 1137 if (root_mem->memsw_is_minimum)
866 noswap = true; 1138 noswap = true;
867 1139
868 while (loop < 2) {
1140 while (1) {
869 victim = mem_cgroup_select_victim(root_mem); 1141 victim = mem_cgroup_select_victim(root_mem);
870 if (victim == root_mem)
1142 if (victim == root_mem) {
871 loop++; 1143 loop++;
1144 if (loop >= 2) {
1145 /*
1146 * If we have not been able to reclaim
1147 * anything, it might be because there are
1148 * no reclaimable pages under this hierarchy
1149 */
1150 if (!check_soft || !total) {
1151 css_put(&victim->css);
1152 break;
1153 }
1154 /*
1155 * We want to do more targeted reclaim.
1156 * excess >> 2 is not too excessive, so we do not
1157 * reclaim too much, nor too small, so we do not keep
1158 * coming back to reclaim from this cgroup
1159 */
1160 if (total >= (excess >> 2) ||
1161 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
1162 css_put(&victim->css);
1163 break;
1164 }
1165 }
1166 }
872 if (!mem_cgroup_local_usage(&victim->stat)) { 1167 if (!mem_cgroup_local_usage(&victim->stat)) {
873 /* this cgroup's local usage == 0 */ 1168 /* this cgroup's local usage == 0 */
874 css_put(&victim->css); 1169 css_put(&victim->css);
875 continue; 1170 continue;
876 } 1171 }
877 /* we use swappiness of local cgroup */ 1172 /* we use swappiness of local cgroup */
878 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
879 get_swappiness(victim));
1173 if (check_soft)
1174 ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
1175 noswap, get_swappiness(victim), zone,
1176 zone->zone_pgdat->node_id);
1177 else
1178 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
1179 noswap, get_swappiness(victim));
880 css_put(&victim->css); 1180 css_put(&victim->css);
881 /* 1181 /*
882 * At shrinking usage, we can't check we should stop here or 1182 * At shrinking usage, we can't check we should stop here or
@@ -886,7 +1186,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
886 if (shrink) 1186 if (shrink)
887 return ret; 1187 return ret;
888 total += ret; 1188 total += ret;
889 if (mem_cgroup_check_under_limit(root_mem))
1189 if (check_soft) {
1190 if (res_counter_check_under_soft_limit(&root_mem->res))
1191 return total;
1192 } else if (mem_cgroup_check_under_limit(root_mem))
890 return 1 + total; 1193 return 1 + total;
891 } 1194 }
892 return total; 1195 return total;
@@ -965,11 +1268,11 @@ done:
965 */ 1268 */
966static int __mem_cgroup_try_charge(struct mm_struct *mm, 1269static int __mem_cgroup_try_charge(struct mm_struct *mm,
967 gfp_t gfp_mask, struct mem_cgroup **memcg, 1270 gfp_t gfp_mask, struct mem_cgroup **memcg,
968 bool oom)
1271 bool oom, struct page *page)
969{ 1272{
970 struct mem_cgroup *mem, *mem_over_limit;
1273 struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
971 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 1274 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
972 struct res_counter *fail_res;
1275 struct res_counter *fail_res, *soft_fail_res = NULL;
973 1276
974 if (unlikely(test_thread_flag(TIF_MEMDIE))) { 1277 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
975 /* Don't account this! */ 1278 /* Don't account this! */
@@ -996,20 +1299,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
996 VM_BUG_ON(css_is_removed(&mem->css)); 1299 VM_BUG_ON(css_is_removed(&mem->css));
997 1300
998 while (1) { 1301 while (1) {
999 int ret;
1000 bool noswap = false;
1302 int ret = 0;
1303 unsigned long flags = 0;
1001 1304
1002 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
1305 if (mem_cgroup_is_root(mem))
1306 goto done;
1307 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
1308 &soft_fail_res);
1003 if (likely(!ret)) { 1309 if (likely(!ret)) {
1004 if (!do_swap_account) 1310 if (!do_swap_account)
1005 break; 1311 break;
1006 ret = res_counter_charge(&mem->memsw, PAGE_SIZE, 1312 ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
1007 &fail_res);
1313 &fail_res, NULL);
1008 if (likely(!ret)) 1314 if (likely(!ret))
1009 break; 1315 break;
1010 /* mem+swap counter fails */ 1316 /* mem+swap counter fails */
1011 res_counter_uncharge(&mem->res, PAGE_SIZE);
1012 noswap = true;
1317 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1318 flags |= MEM_CGROUP_RECLAIM_NOSWAP;
1013 mem_over_limit = mem_cgroup_from_res_counter(fail_res, 1319 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
1014 memsw); 1320 memsw);
1015 } else 1321 } else
@@ -1020,8 +1326,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1020 if (!(gfp_mask & __GFP_WAIT)) 1326 if (!(gfp_mask & __GFP_WAIT))
1021 goto nomem; 1327 goto nomem;
1022 1328
1023 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
1024 noswap, false);
1329 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
1330 gfp_mask, flags);
1025 if (ret) 1331 if (ret)
1026 continue; 1332 continue;
1027 1333
@@ -1046,13 +1352,24 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
1046 goto nomem; 1352 goto nomem;
1047 } 1353 }
1048 } 1354 }
1355 /*
1356 * Insert just the ancestor; we should trickle down to the correct
1357 * cgroup for reclaim, since the other nodes will be below their
1358 * soft limit
1359 */
1360 if (soft_fail_res) {
1361 mem_over_soft_limit =
1362 mem_cgroup_from_res_counter(soft_fail_res, res);
1363 if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
1364 mem_cgroup_update_tree(mem_over_soft_limit, page);
1365 }
1366done:
1049 return 0; 1367 return 0;
1050nomem: 1368nomem:
1051 css_put(&mem->css); 1369 css_put(&mem->css);
1052 return -ENOMEM; 1370 return -ENOMEM;
1053} 1371}
1054 1372
1055
1056/* 1373/*
1057 * A helper function to get mem_cgroup from ID. must be called under 1374 * A helper function to get mem_cgroup from ID. must be called under
1058 * rcu_read_lock(). The caller must check css_is_removed() or some if 1375 * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1119,15 +1436,38 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
1119 lock_page_cgroup(pc); 1436 lock_page_cgroup(pc);
1120 if (unlikely(PageCgroupUsed(pc))) { 1437 if (unlikely(PageCgroupUsed(pc))) {
1121 unlock_page_cgroup(pc); 1438 unlock_page_cgroup(pc);
1122 res_counter_uncharge(&mem->res, PAGE_SIZE);
1123 if (do_swap_account)
1124 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1439 if (!mem_cgroup_is_root(mem)) {
1440 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1441 if (do_swap_account)
1442 res_counter_uncharge(&mem->memsw, PAGE_SIZE,
1443 NULL);
1444 }
1125 css_put(&mem->css); 1445 css_put(&mem->css);
1126 return; 1446 return;
1127 } 1447 }
1448
1128 pc->mem_cgroup = mem; 1449 pc->mem_cgroup = mem;
1450 /*
1451 * We access a page_cgroup asynchronously without lock_page_cgroup().
1452 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
1453 * is accessed after testing USED bit. To make pc->mem_cgroup visible
1454 * before USED bit, we need memory barrier here.
1455 * See mem_cgroup_add_lru_list(), etc.
1456 */
1129 smp_wmb(); 1457 smp_wmb();
1130 pc->flags = pcg_default_flags[ctype];
1458 switch (ctype) {
1459 case MEM_CGROUP_CHARGE_TYPE_CACHE:
1460 case MEM_CGROUP_CHARGE_TYPE_SHMEM:
1461 SetPageCgroupCache(pc);
1462 SetPageCgroupUsed(pc);
1463 break;
1464 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1465 ClearPageCgroupCache(pc);
1466 SetPageCgroupUsed(pc);
1467 break;
1468 default:
1469 break;
1470 }
1131 1471
1132 mem_cgroup_charge_statistics(mem, pc, true); 1472 mem_cgroup_charge_statistics(mem, pc, true);
1133 1473
@@ -1178,7 +1518,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1178 if (pc->mem_cgroup != from) 1518 if (pc->mem_cgroup != from)
1179 goto out; 1519 goto out;
1180 1520
1181 res_counter_uncharge(&from->res, PAGE_SIZE);
1521 if (!mem_cgroup_is_root(from))
1522 res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
1182 mem_cgroup_charge_statistics(from, pc, false); 1523 mem_cgroup_charge_statistics(from, pc, false);
1183 1524
1184 page = pc->page; 1525 page = pc->page;
@@ -1197,8 +1538,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
1197 1); 1538 1);
1198 } 1539 }
1199 1540
1200 if (do_swap_account)
1201 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1541 if (do_swap_account && !mem_cgroup_is_root(from))
1542 res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
1202 css_put(&from->css); 1543 css_put(&from->css);
1203 1544
1204 css_get(&to->css); 1545 css_get(&to->css);
@@ -1238,7 +1579,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
1238 parent = mem_cgroup_from_cont(pcg); 1579 parent = mem_cgroup_from_cont(pcg);
1239 1580
1240 1581
1241 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1582 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
1242 if (ret || !parent) 1583 if (ret || !parent)
1243 return ret; 1584 return ret;
1244 1585
@@ -1268,9 +1609,11 @@ uncharge:
1268 /* drop extra refcnt by try_charge() */ 1609 /* drop extra refcnt by try_charge() */
1269 css_put(&parent->css); 1610 css_put(&parent->css);
1270 /* uncharge if move fails */ 1611 /* uncharge if move fails */
1271 res_counter_uncharge(&parent->res, PAGE_SIZE);
1272 if (do_swap_account)
1273 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1612 if (!mem_cgroup_is_root(parent)) {
1613 res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
1614 if (do_swap_account)
1615 res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
1616 }
1274 return ret; 1617 return ret;
1275} 1618}
1276 1619
@@ -1295,7 +1638,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1295 prefetchw(pc); 1638 prefetchw(pc);
1296 1639
1297 mem = memcg; 1640 mem = memcg;
1298 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1641 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
1299 if (ret || !mem) 1642 if (ret || !mem)
1300 return ret; 1643 return ret;
1301 1644
@@ -1414,14 +1757,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1414 if (!mem) 1757 if (!mem)
1415 goto charge_cur_mm; 1758 goto charge_cur_mm;
1416 *ptr = mem; 1759 *ptr = mem;
1417 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1760 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
1418 /* drop extra refcnt from tryget */ 1761 /* drop extra refcnt from tryget */
1419 css_put(&mem->css); 1762 css_put(&mem->css);
1420 return ret; 1763 return ret;
1421charge_cur_mm: 1764charge_cur_mm:
1422 if (unlikely(!mm)) 1765 if (unlikely(!mm))
1423 mm = &init_mm; 1766 mm = &init_mm;
1424 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1767 return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
1425} 1768}
1426 1769
1427static void 1770static void
@@ -1459,7 +1802,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1459 * This recorded memcg can be obsolete one. So, avoid 1802 * This recorded memcg can be obsolete one. So, avoid
1460 * calling css_tryget 1803 * calling css_tryget
1461 */ 1804 */
1462 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1805 if (!mem_cgroup_is_root(memcg))
1806 res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
1807 NULL);
1808 mem_cgroup_swap_statistics(memcg, false);
1463 mem_cgroup_put(memcg); 1809 mem_cgroup_put(memcg);
1464 } 1810 }
1465 rcu_read_unlock(); 1811 rcu_read_unlock();
@@ -1484,9 +1830,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1484 return; 1830 return;
1485 if (!mem) 1831 if (!mem)
1486 return; 1832 return;
1487 res_counter_uncharge(&mem->res, PAGE_SIZE);
1488 if (do_swap_account)
1489 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1833 if (!mem_cgroup_is_root(mem)) {
1834 res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
1835 if (do_swap_account)
1836 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
1837 }
1490 css_put(&mem->css); 1838 css_put(&mem->css);
1491} 1839}
1492 1840
@@ -1500,6 +1848,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1500 struct page_cgroup *pc; 1848 struct page_cgroup *pc;
1501 struct mem_cgroup *mem = NULL; 1849 struct mem_cgroup *mem = NULL;
1502 struct mem_cgroup_per_zone *mz; 1850 struct mem_cgroup_per_zone *mz;
1851 bool soft_limit_excess = false;
1503 1852
1504 if (mem_cgroup_disabled()) 1853 if (mem_cgroup_disabled())
1505 return NULL; 1854 return NULL;
@@ -1538,9 +1887,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1538 break; 1887 break;
1539 } 1888 }
1540 1889
1541 res_counter_uncharge(&mem->res, PAGE_SIZE);
1542 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1543 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1890 if (!mem_cgroup_is_root(mem)) {
1891 res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
1892 if (do_swap_account &&
1893 (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1894 res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
1895 }
1896 if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1897 mem_cgroup_swap_statistics(mem, true);
1544 mem_cgroup_charge_statistics(mem, pc, false); 1898 mem_cgroup_charge_statistics(mem, pc, false);
1545 1899
1546 ClearPageCgroupUsed(pc); 1900 ClearPageCgroupUsed(pc);
@@ -1554,6 +1908,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1554 mz = page_cgroup_zoneinfo(pc); 1908 mz = page_cgroup_zoneinfo(pc);
1555 unlock_page_cgroup(pc); 1909 unlock_page_cgroup(pc);
1556 1910
1911 if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
1912 mem_cgroup_update_tree(mem, page);
1557 /* at swapout, this memcg will be accessed to record to swap */ 1913 /* at swapout, this memcg will be accessed to record to swap */
1558 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 1914 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1559 css_put(&mem->css); 1915 css_put(&mem->css);
@@ -1629,7 +1985,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
1629 * We uncharge this because swap is freed. 1985 * We uncharge this because swap is freed.
1630 * This memcg can be obsolete one. We avoid calling css_tryget 1986 * This memcg can be obsolete one. We avoid calling css_tryget
1631 */ 1987 */
1632 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1988 if (!mem_cgroup_is_root(memcg))
1989 res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
1990 mem_cgroup_swap_statistics(memcg, false);
1633 mem_cgroup_put(memcg); 1991 mem_cgroup_put(memcg);
1634 } 1992 }
1635 rcu_read_unlock(); 1993 rcu_read_unlock();
@@ -1658,7 +2016,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
1658 unlock_page_cgroup(pc); 2016 unlock_page_cgroup(pc);
1659 2017
1660 if (mem) { 2018 if (mem) {
1661 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
2019 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
2020 page);
1662 css_put(&mem->css); 2021 css_put(&mem->css);
1663 } 2022 }
1664 *ptr = mem; 2023 *ptr = mem;
@@ -1798,8 +2157,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1798 if (!ret) 2157 if (!ret)
1799 break; 2158 break;
1800 2159
1801 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
1802 false, true);
2160 progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
2161 GFP_KERNEL,
2162 MEM_CGROUP_RECLAIM_SHRINK);
1803 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 2163 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1804 /* Usage is reduced ? */ 2164 /* Usage is reduced ? */
1805 if (curusage >= oldusage) 2165 if (curusage >= oldusage)
@@ -1851,7 +2211,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1851 if (!ret) 2211 if (!ret)
1852 break; 2212 break;
1853 2213
1854 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
2214 mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
2215 MEM_CGROUP_RECLAIM_NOSWAP |
2216 MEM_CGROUP_RECLAIM_SHRINK);
1855 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 2217 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1856 /* Usage is reduced ? */ 2218 /* Usage is reduced ? */
1857 if (curusage >= oldusage) 2219 if (curusage >= oldusage)
@@ -1862,6 +2224,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1862 return ret; 2224 return ret;
1863} 2225}
1864 2226
2227unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
2228 gfp_t gfp_mask, int nid,
2229 int zid)
2230{
2231 unsigned long nr_reclaimed = 0;
2232 struct mem_cgroup_per_zone *mz, *next_mz = NULL;
2233 unsigned long reclaimed;
2234 int loop = 0;
2235 struct mem_cgroup_tree_per_zone *mctz;
2236
2237 if (order > 0)
2238 return 0;
2239
2240 mctz = soft_limit_tree_node_zone(nid, zid);
2241 /*
2242 * This loop can run for a while, especially if mem_cgroups continuously
2243 * keep exceeding their soft limit and putting the system under
2244 * pressure
2245 */
2246 do {
2247 if (next_mz)
2248 mz = next_mz;
2249 else
2250 mz = mem_cgroup_largest_soft_limit_node(mctz);
2251 if (!mz)
2252 break;
2253
2254 reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
2255 gfp_mask,
2256 MEM_CGROUP_RECLAIM_SOFT);
2257 nr_reclaimed += reclaimed;
2258 spin_lock(&mctz->lock);
2259
2260 /*
2261 * If we failed to reclaim anything from this memory cgroup
2262 * it is time to move on to the next cgroup
2263 */
2264 next_mz = NULL;
2265 if (!reclaimed) {
2266 do {
2267 /*
2268 * Loop until we find yet another one.
2269 *
2270 * By the time we get the soft_limit lock
2271 * again, someone might have added the
2272 * group back on the RB tree. Iterate to
2273 * make sure we get a different mem.
2274 * mem_cgroup_largest_soft_limit_node returns
2275 * NULL if no other cgroup is present on
2276 * the tree
2277 */
2278 next_mz =
2279 __mem_cgroup_largest_soft_limit_node(mctz);
2280 if (next_mz == mz) {
2281 css_put(&next_mz->mem->css);
2282 next_mz = NULL;
2283 } else /* next_mz == NULL or other memcg */
2284 break;
2285 } while (1);
2286 }
2287 mz->usage_in_excess =
2288 res_counter_soft_limit_excess(&mz->mem->res);
2289 __mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
2290 /*
2291 * One school of thought says that we should not add
2292 * back the node to the tree if reclaim returns 0.
2293 * But our reclaim could return 0 simply because, due
2294 * to priority, we are exposing a smaller subset of
2295 * memory to reclaim from. Consider this as a longer
2296 * term TODO.
2297 */
2298 if (mz->usage_in_excess)
2299 __mem_cgroup_insert_exceeded(mz->mem, mz, mctz);
2300 spin_unlock(&mctz->lock);
2301 css_put(&mz->mem->css);
2302 loop++;
2303 /*
2304 * Could not reclaim anything and there are no more
2305 * mem cgroups to try or we seem to be looping without
2306 * reclaiming anything.
2307 */
2308 if (!nr_reclaimed &&
2309 (next_mz == NULL ||
2310 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2311 break;
2312 } while (!nr_reclaimed);
2313 if (next_mz)
2314 css_put(&next_mz->mem->css);
2315 return nr_reclaimed;
2316}
2317
1865/* 2318/*
1866 * This routine traverse page_cgroup in given list and drop them all. 2319 * This routine traverse page_cgroup in given list and drop them all.
1867 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 2320 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
@@ -2046,20 +2499,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
2046 return retval; 2499 return retval;
2047} 2500}
2048 2501
2502struct mem_cgroup_idx_data {
2503 s64 val;
2504 enum mem_cgroup_stat_index idx;
2505};
2506
2507static int
2508mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
2509{
2510 struct mem_cgroup_idx_data *d = data;
2511 d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
2512 return 0;
2513}
2514
2515static void
2516mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
2517 enum mem_cgroup_stat_index idx, s64 *val)
2518{
2519 struct mem_cgroup_idx_data d;
2520 d.idx = idx;
2521 d.val = 0;
2522 mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
2523 *val = d.val;
2524}
2525
2049static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 2526static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
2050{ 2527{
2051 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2528 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2052 u64 val = 0;
2529 u64 idx_val, val;
2053 int type, name; 2530 int type, name;
2054 2531
2055 type = MEMFILE_TYPE(cft->private); 2532 type = MEMFILE_TYPE(cft->private);
2056 name = MEMFILE_ATTR(cft->private); 2533 name = MEMFILE_ATTR(cft->private);
2057 switch (type) { 2534 switch (type) {
2058 case _MEM: 2535 case _MEM:
2059 val = res_counter_read_u64(&mem->res, name);
2536 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2537 mem_cgroup_get_recursive_idx_stat(mem,
2538 MEM_CGROUP_STAT_CACHE, &idx_val);
2539 val = idx_val;
2540 mem_cgroup_get_recursive_idx_stat(mem,
2541 MEM_CGROUP_STAT_RSS, &idx_val);
2542 val += idx_val;
2543 val <<= PAGE_SHIFT;
2544 } else
2545 val = res_counter_read_u64(&mem->res, name);
2060 break; 2546 break;
2061 case _MEMSWAP: 2547 case _MEMSWAP:
2062 val = res_counter_read_u64(&mem->memsw, name);
2548 if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
2549 mem_cgroup_get_recursive_idx_stat(mem,
2550 MEM_CGROUP_STAT_CACHE, &idx_val);
2551 val = idx_val;
2552 mem_cgroup_get_recursive_idx_stat(mem,
2553 MEM_CGROUP_STAT_RSS, &idx_val);
2554 val += idx_val;
2555 mem_cgroup_get_recursive_idx_stat(mem,
2556 MEM_CGROUP_STAT_SWAPOUT, &idx_val);
2557 val <<= PAGE_SHIFT;
2558 } else
2559 val = res_counter_read_u64(&mem->memsw, name);
2063 break; 2560 break;
2064 default: 2561 default:
2065 BUG(); 2562 BUG();
@@ -2083,6 +2580,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2083 name = MEMFILE_ATTR(cft->private); 2580 name = MEMFILE_ATTR(cft->private);
2084 switch (name) { 2581 switch (name) {
2085 case RES_LIMIT: 2582 case RES_LIMIT:
2583 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2584 ret = -EINVAL;
2585 break;
2586 }
2086 /* This function does all necessary parse...reuse it */ 2587 /* This function does all necessary parse...reuse it */
2087 ret = res_counter_memparse_write_strategy(buffer, &val); 2588 ret = res_counter_memparse_write_strategy(buffer, &val);
2088 if (ret) 2589 if (ret)
@@ -2092,6 +2593,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
2092 else 2593 else
2093 ret = mem_cgroup_resize_memsw_limit(memcg, val); 2594 ret = mem_cgroup_resize_memsw_limit(memcg, val);
2094 break; 2595 break;
2596 case RES_SOFT_LIMIT:
2597 ret = res_counter_memparse_write_strategy(buffer, &val);
2598 if (ret)
2599 break;
2600 /*
2601 * For memsw, soft limits are hard to implement in terms
2602 * of semantics; for now, we support soft limits for
2603 * memory control without swap
2604 */
2605 if (type == _MEM)
2606 ret = res_counter_set_soft_limit(&memcg->res, val);
2607 else
2608 ret = -EINVAL;
2609 break;
2095 default: 2610 default:
2096 ret = -EINVAL; /* should be BUG() ? */ 2611 ret = -EINVAL; /* should be BUG() ? */
2097 break; 2612 break;
@@ -2149,6 +2664,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
2149 res_counter_reset_failcnt(&mem->memsw); 2664 res_counter_reset_failcnt(&mem->memsw);
2150 break; 2665 break;
2151 } 2666 }
2667
2152 return 0; 2668 return 0;
2153} 2669}
2154 2670
@@ -2160,6 +2676,7 @@ enum {
2160 MCS_MAPPED_FILE, 2676 MCS_MAPPED_FILE,
2161 MCS_PGPGIN, 2677 MCS_PGPGIN,
2162 MCS_PGPGOUT, 2678 MCS_PGPGOUT,
2679 MCS_SWAP,
2163 MCS_INACTIVE_ANON, 2680 MCS_INACTIVE_ANON,
2164 MCS_ACTIVE_ANON, 2681 MCS_ACTIVE_ANON,
2165 MCS_INACTIVE_FILE, 2682 MCS_INACTIVE_FILE,
@@ -2181,6 +2698,7 @@ struct {
2181 {"mapped_file", "total_mapped_file"}, 2698 {"mapped_file", "total_mapped_file"},
2182 {"pgpgin", "total_pgpgin"}, 2699 {"pgpgin", "total_pgpgin"},
2183 {"pgpgout", "total_pgpgout"}, 2700 {"pgpgout", "total_pgpgout"},
2701 {"swap", "total_swap"},
2184 {"inactive_anon", "total_inactive_anon"}, 2702 {"inactive_anon", "total_inactive_anon"},
2185 {"active_anon", "total_active_anon"}, 2703 {"active_anon", "total_active_anon"},
2186 {"inactive_file", "total_inactive_file"}, 2704 {"inactive_file", "total_inactive_file"},
@@ -2205,6 +2723,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2205 s->stat[MCS_PGPGIN] += val; 2723 s->stat[MCS_PGPGIN] += val;
2206 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); 2724 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2207 s->stat[MCS_PGPGOUT] += val; 2725 s->stat[MCS_PGPGOUT] += val;
2726 if (do_swap_account) {
2727 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
2728 s->stat[MCS_SWAP] += val * PAGE_SIZE;
2729 }
2208 2730
2209 /* per zone stat */ 2731 /* per zone stat */
2210 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); 2732 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -2236,8 +2758,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2236 memset(&mystat, 0, sizeof(mystat)); 2758 memset(&mystat, 0, sizeof(mystat));
2237 mem_cgroup_get_local_stat(mem_cont, &mystat); 2759 mem_cgroup_get_local_stat(mem_cont, &mystat);
2238 2760
2239 for (i = 0; i < NR_MCS_STAT; i++)
2761 for (i = 0; i < NR_MCS_STAT; i++) {
2762 if (i == MCS_SWAP && !do_swap_account)
2763 continue;
2240 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); 2764 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
2765 }
2241 2766
2242 /* Hierarchical information */ 2767 /* Hierarchical information */
2243 { 2768 {
@@ -2250,9 +2775,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
2250 2775
2251 memset(&mystat, 0, sizeof(mystat)); 2776 memset(&mystat, 0, sizeof(mystat));
2252 mem_cgroup_get_total_stat(mem_cont, &mystat); 2777 mem_cgroup_get_total_stat(mem_cont, &mystat);
2253 for (i = 0; i < NR_MCS_STAT; i++)
2778 for (i = 0; i < NR_MCS_STAT; i++) {
2779 if (i == MCS_SWAP && !do_swap_account)
2780 continue;
2254 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); 2781 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2255
2782 }
2256 2783
2257#ifdef CONFIG_DEBUG_VM 2784#ifdef CONFIG_DEBUG_VM
2258 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2785 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
@@ -2345,6 +2872,12 @@ static struct cftype mem_cgroup_files[] = {
2345 .read_u64 = mem_cgroup_read, 2872 .read_u64 = mem_cgroup_read,
2346 }, 2873 },
2347 { 2874 {
2875 .name = "soft_limit_in_bytes",
2876 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
2877 .write_string = mem_cgroup_write,
2878 .read_u64 = mem_cgroup_read,
2879 },
2880 {
2348 .name = "failcnt", 2881 .name = "failcnt",
2349 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 2882 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
2350 .trigger = mem_cgroup_reset, 2883 .trigger = mem_cgroup_reset,
@@ -2438,6 +2971,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
2438 mz = &pn->zoneinfo[zone]; 2971 mz = &pn->zoneinfo[zone];
2439 for_each_lru(l) 2972 for_each_lru(l)
2440 INIT_LIST_HEAD(&mz->lists[l]); 2973 INIT_LIST_HEAD(&mz->lists[l]);
2974 mz->usage_in_excess = 0;
2975 mz->on_tree = false;
2976 mz->mem = mem;
2441 } 2977 }
2442 return 0; 2978 return 0;
2443} 2979}
@@ -2483,6 +3019,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
2483{ 3019{
2484 int node; 3020 int node;
2485 3021
3022 mem_cgroup_remove_from_trees(mem);
2486 free_css_id(&mem_cgroup_subsys, &mem->css); 3023 free_css_id(&mem_cgroup_subsys, &mem->css);
2487 3024
2488 for_each_node_state(node, N_POSSIBLE) 3025 for_each_node_state(node, N_POSSIBLE)
@@ -2531,6 +3068,31 @@ static void __init enable_swap_cgroup(void)
2531} 3068}
2532#endif 3069#endif
2533 3070
3071static int mem_cgroup_soft_limit_tree_init(void)
3072{
3073 struct mem_cgroup_tree_per_node *rtpn;
3074 struct mem_cgroup_tree_per_zone *rtpz;
3075 int tmp, node, zone;
3076
3077 for_each_node_state(node, N_POSSIBLE) {
3078 tmp = node;
3079 if (!node_state(node, N_NORMAL_MEMORY))
3080 tmp = -1;
3081 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
3082 if (!rtpn)
3083 return 1;
3084
3085 soft_limit_tree.rb_tree_per_node[node] = rtpn;
3086
3087 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
3088 rtpz = &rtpn->rb_tree_per_zone[zone];
3089 rtpz->rb_root = RB_ROOT;
3090 spin_lock_init(&rtpz->lock);
3091 }
3092 }
3093 return 0;
3094}
3095
2534static struct cgroup_subsys_state * __ref 3096static struct cgroup_subsys_state * __ref
2535mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 3097mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2536{ 3098{
@@ -2545,10 +3107,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2545 for_each_node_state(node, N_POSSIBLE) 3107 for_each_node_state(node, N_POSSIBLE)
2546 if (alloc_mem_cgroup_per_zone_info(mem, node)) 3108 if (alloc_mem_cgroup_per_zone_info(mem, node))
2547 goto free_out; 3109 goto free_out;
3110
2548 /* root ? */ 3111 /* root ? */
2549 if (cont->parent == NULL) { 3112 if (cont->parent == NULL) {
2550 enable_swap_cgroup(); 3113 enable_swap_cgroup();
2551 parent = NULL; 3114 parent = NULL;
3115 root_mem_cgroup = mem;
3116 if (mem_cgroup_soft_limit_tree_init())
3117 goto free_out;
3118
2552 } else { 3119 } else {
2553 parent = mem_cgroup_from_cont(cont->parent); 3120 parent = mem_cgroup_from_cont(cont->parent);
2554 mem->use_hierarchy = parent->use_hierarchy; 3121 mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +3144,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2577 return &mem->css; 3144 return &mem->css;
2578free_out: 3145free_out:
2579 __mem_cgroup_free(mem); 3146 __mem_cgroup_free(mem);
3147 root_mem_cgroup = NULL;
2580 return ERR_PTR(error); 3148 return ERR_PTR(error);
2581} 3149}
2582 3150
@@ -2612,7 +3180,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
2612static void mem_cgroup_move_task(struct cgroup_subsys *ss, 3180static void mem_cgroup_move_task(struct cgroup_subsys *ss,
2613 struct cgroup *cont, 3181 struct cgroup *cont,
2614 struct cgroup *old_cont, 3182 struct cgroup *old_cont,
2615 struct task_struct *p)
3183 struct task_struct *p,
3184 bool threadgroup)
2616{ 3185{
2617 mutex_lock(&memcg_tasklist); 3186 mutex_lock(&memcg_tasklist);
2618 /* 3187 /*