path: root/mm/memcontrol.c
Diffstat (limited to 'mm/memcontrol.c')
 mm/memcontrol.c | 714 +++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 635 insertions(+), 79 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fd4529d86de5..f99f5991d6bb 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/limits.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -43,6 +44,7 @@
 
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES	5
+struct mem_cgroup *root_mem_cgroup __read_mostly;
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
 /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #endif
 
 static DEFINE_MUTEX(memcg_tasklist);	/* can be hold under cgroup_mutex */
+#define SOFTLIMIT_EVENTS_THRESH (1000)
 
 /*
  * Statistics for memory cgroup.
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
+	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
+	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
 
 	MEM_CGROUP_STAT_NSTATS,
 };
@@ -78,6 +83,20 @@ struct mem_cgroup_stat {
 	struct mem_cgroup_stat_cpu cpustat[0];
 };
 
+static inline void
+__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
+				enum mem_cgroup_stat_index idx)
+{
+	stat->count[idx] = 0;
+}
+
+static inline s64
+__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
+				enum mem_cgroup_stat_index idx)
+{
+	return stat->count[idx];
+}
+
 /*
  * For accounting under irq disable, no need for increment preempt count.
  */
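Note on the two helpers just added: they are deliberately plain, non-atomic accesses. Every caller pins itself to a single CPU with get_cpu()/put_cpu() before touching that CPU's private slot, which is the whole "safe" in the names. A minimal userspace sketch of the same contract (hypothetical names, simplified types; not the kernel API):

#include <stdio.h>

#define NSTATS 8

struct stat_cpu {
        long long count[NSTATS];        /* one slot per statistics index */
};

/* mirrors __mem_cgroup_stat_read_local(): plain load of this CPU's slot */
static long long stat_read_local(struct stat_cpu *stat, int idx)
{
        return stat->count[idx];
}

/* mirrors __mem_cgroup_stat_reset_safe(): plain store; the caller must
 * stay pinned to one CPU, which the kernel ensures via get_cpu()/put_cpu() */
static void stat_reset_safe(struct stat_cpu *stat, int idx)
{
        stat->count[idx] = 0;
}

int main(void)
{
        struct stat_cpu cpu0 = { .count = { [3] = 1200 } };

        printf("events before reset: %lld\n", stat_read_local(&cpu0, 3));
        stat_reset_safe(&cpu0, 3);
        printf("events after reset:  %lld\n", stat_read_local(&cpu0, 3));
        return 0;
}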
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone {
 	unsigned long		count[NR_LRU_LISTS];
 
 	struct zone_reclaim_stat reclaim_stat;
+	struct rb_node		tree_node;	/* RB tree node */
+	unsigned long long	usage_in_excess;/* Set to the value by which */
+						/* the soft limit is exceeded*/
+	bool			on_tree;
+	struct mem_cgroup	*mem;		/* Back pointer, we cannot */
+						/* use container_of	   */
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info {
 };
 
 /*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
+/*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
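The soft_limit_tree introduced above is a two-level array: one entry per NUMA node, each holding one RB-tree (with its own spinlock) per zone, so updates contend only within a single zone. A small self-contained sketch of that lookup shape (stand-in types; the array sizes are illustrative, not the kernel's):

#define MAX_NUMNODES    4       /* illustrative */
#define MAX_NR_ZONES    3       /* illustrative */

struct tree_per_zone {
        int payload;            /* stands in for rb_root + spinlock */
};

struct tree_per_node {
        struct tree_per_zone per_zone[MAX_NR_ZONES];
};

static struct tree_per_node *per_node[MAX_NUMNODES];

/* mirrors soft_limit_tree_node_zone(): one tree (and one lock) per
 * (node, zone) pair, so tree updates stay zone-local */
static struct tree_per_zone *tree_node_zone(int nid, int zid)
{
        return &per_node[nid]->per_zone[zid];
}

int main(void)
{
        static struct tree_per_node node0;

        per_node[0] = &node0;
        return tree_node_zone(0, 2) == &node0.per_zone[2] ? 0 : 1;
}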
@@ -186,6 +231,13 @@ struct mem_cgroup {
 	struct mem_cgroup_stat stat;
 };
 
+/*
+ * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
+ * limit reclaim to prevent infinite loops, if they ever occur.
+ */
+#define MEM_CGROUP_MAX_RECLAIM_LOOPS		(100)
+#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS	(2)
+
 enum charge_type {
 	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
 	MEM_CGROUP_CHARGE_TYPE_MAPPED,
@@ -200,13 +252,8 @@ enum charge_type {
 #define PCGF_CACHE	(1UL << PCG_CACHE)
 #define PCGF_USED	(1UL << PCG_USED)
 #define PCGF_LOCK	(1UL << PCG_LOCK)
-static const unsigned long
-pcg_default_flags[NR_CHARGE_TYPE] = {
-	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
-	PCGF_USED | PCGF_LOCK, /* Anon */
-	PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
-	0, /* FORCE */
-};
+/* Not used, but added here for completeness */
+#define PCGF_ACCT	(1UL << PCG_ACCT)
 
 /* for encoding cft->private value on file */
 #define _MEM			(0)
@@ -215,15 +262,237 @@ pcg_default_flags[NR_CHARGE_TYPE] = {
 #define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
 #define MEMFILE_ATTR(val)	((val) & 0xffff)
 
+/*
+ * Reclaim flags for mem_cgroup_hierarchical_reclaim
+ */
+#define MEM_CGROUP_RECLAIM_NOSWAP_BIT	0x0
+#define MEM_CGROUP_RECLAIM_NOSWAP	(1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
+#define MEM_CGROUP_RECLAIM_SHRINK_BIT	0x1
+#define MEM_CGROUP_RECLAIM_SHRINK	(1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
+#define MEM_CGROUP_RECLAIM_SOFT_BIT	0x2
+#define MEM_CGROUP_RECLAIM_SOFT	(1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
+
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 
+static struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+static struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem = pc->mem_cgroup;
+	int nid = page_cgroup_nid(pc);
+	int zid = page_cgroup_zid(pc);
+
+	if (!mem)
+		return NULL;
+
+	return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+
+	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+__mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz,
+				unsigned long long new_usage_in_excess)
+{
+	struct rb_node **p = &mctz->rb_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct mem_cgroup_per_zone *mz_node;
+
+	if (mz->on_tree)
+		return;
+
+	mz->usage_in_excess = new_usage_in_excess;
+	if (!mz->usage_in_excess)
+		return;
+	while (*p) {
+		parent = *p;
+		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+					tree_node);
+		if (mz->usage_in_excess < mz_node->usage_in_excess)
+			p = &(*p)->rb_left;
+		/*
+		 * We can't avoid mem cgroups that are over their soft
+		 * limit by the same amount
+		 */
+		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&mz->tree_node, parent, p);
+	rb_insert_color(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = true;
+}
+
+static void
+__mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	if (!mz->on_tree)
+		return;
+	rb_erase(&mz->tree_node, &mctz->rb_root);
+	mz->on_tree = false;
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+				struct mem_cgroup_per_zone *mz,
+				struct mem_cgroup_tree_per_zone *mctz)
+{
+	spin_lock(&mctz->lock);
+	__mem_cgroup_remove_exceeded(mem, mz, mctz);
+	spin_unlock(&mctz->lock);
+}
+
+static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
+{
+	bool ret = false;
+	int cpu;
+	s64 val;
+	struct mem_cgroup_stat_cpu *cpustat;
+
+	cpu = get_cpu();
+	cpustat = &mem->stat.cpustat[cpu];
+	val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
+	if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
+		__mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
+		ret = true;
+	}
+	put_cpu();
+	return ret;
+}
+
+static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+{
+	unsigned long long excess;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
+	mctz = soft_limit_tree_from_page(page);
+
+	/*
+	 * Necessary to update all ancestors when hierarchy is used,
+	 * because their event counter is not touched.
+	 */
+	for (; mem; mem = parent_mem_cgroup(mem)) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		excess = res_counter_soft_limit_excess(&mem->res);
+		/*
+		 * We have to update the tree if mz is on the RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(mem, mz, mctz);
+			/*
+			 * Insert again. mz->usage_in_excess will be updated.
+			 * If excess is 0, no tree ops.
+			 */
+			__mem_cgroup_insert_exceeded(mem, mz, mctz, excess);
+			spin_unlock(&mctz->lock);
+		}
+	}
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+{
+	int node, zone;
+	struct mem_cgroup_per_zone *mz;
+	struct mem_cgroup_tree_per_zone *mctz;
+
+	for_each_node_state(node, N_POSSIBLE) {
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			mz = mem_cgroup_zoneinfo(mem, node, zone);
+			mctz = soft_limit_tree_node_zone(node, zone);
+			mem_cgroup_remove_exceeded(mem, mz, mctz);
+		}
+	}
+}
+
+static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem)
+{
+	return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT;
+}
+
+static struct mem_cgroup_per_zone *
+__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct rb_node *rightmost = NULL;
+	struct mem_cgroup_per_zone *mz;
+
+retry:
+	mz = NULL;
+	rightmost = rb_last(&mctz->rb_root);
+	if (!rightmost)
+		goto done;		/* Nothing to reclaim from */
+
+	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
+	/*
+	 * Remove the node now but someone else can add it back,
+	 * we will add it back at the end of reclaim to its correct
+	 * position in the tree.
+	 */
+	__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+	if (!res_counter_soft_limit_excess(&mz->mem->res) ||
+		!css_tryget(&mz->mem->css))
+		goto retry;
+done:
+	return mz;
+}
+
+static struct mem_cgroup_per_zone *
+mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
+{
+	struct mem_cgroup_per_zone *mz;
+
+	spin_lock(&mctz->lock);
+	mz = __mem_cgroup_largest_soft_limit_node(mctz);
+	spin_unlock(&mctz->lock);
+	return mz;
+}
+
+static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
+					 bool charge)
+{
+	int val = (charge) ? 1 : -1;
+	struct mem_cgroup_stat *stat = &mem->stat;
+	struct mem_cgroup_stat_cpu *cpustat;
+	int cpu = get_cpu();
+
+	cpustat = &stat->cpustat[cpu];
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
+	put_cpu();
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 					 struct page_cgroup *pc,
 					 bool charge)
 {
-	int val = (charge)? 1 : -1;
+	int val = (charge) ? 1 : -1;
 	struct mem_cgroup_stat *stat = &mem->stat;
 	struct mem_cgroup_stat_cpu *cpustat;
 	int cpu = get_cpu();
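Most of the machinery above exists to keep per-page accounting cheap: the charge/uncharge path only bumps MEM_CGROUP_STAT_EVENTS, and mem_cgroup_soft_limit_check() reports true (triggering the RB-tree update) only when the current CPU's counter crosses SOFTLIMIT_EVENTS_THRESH, resetting it afterwards. A self-contained sketch of that ratelimit, with one plain counter standing in for the per-cpu slot:

#include <stdbool.h>
#include <stdio.h>

#define SOFTLIMIT_EVENTS_THRESH 1000

static long events;     /* stand-in for one CPU's MEM_CGROUP_STAT_EVENTS */

/* count the event; report "do the expensive tree update" only once per
 * threshold crossing, then start counting again from zero */
static bool soft_limit_check(void)
{
        if (++events > SOFTLIMIT_EVENTS_THRESH) {
                events = 0;
                return true;
        }
        return false;
}

int main(void)
{
        long updates = 0;

        for (long i = 0; i < 10000; i++)        /* 10000 page events... */
                if (soft_limit_check())
                        updates++;
        printf("tree updates: %ld\n", updates); /* ...only a handful fire */
        return 0;
}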
@@ -240,28 +509,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	else
 		__mem_cgroup_stat_add_safe(cpustat,
 				MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
 	put_cpu();
 }
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
-{
-	return &mem->info.nodeinfo[nid]->zoneinfo[zid];
-}
-
-static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
-{
-	struct mem_cgroup *mem = pc->mem_cgroup;
-	int nid = page_cgroup_nid(pc);
-	int zid = page_cgroup_zid(pc);
-
-	if (!mem)
-		return NULL;
-
-	return mem_cgroup_zoneinfo(mem, nid, zid);
-}
-
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
@@ -354,6 +605,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
 	return ret;
 }
 
+static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
+{
+	return (mem == root_mem_cgroup);
+}
+
 /*
  * Following LRU functions are allowed to be used without PCG_LOCK.
  * Operations are called by routine of global LRU independently from memcg.
@@ -371,22 +627,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
 void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
 {
 	struct page_cgroup *pc;
-	struct mem_cgroup *mem;
 	struct mem_cgroup_per_zone *mz;
 
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(page);
 	/* can happen while we handle swapcache. */
-	if (list_empty(&pc->lru) || !pc->mem_cgroup)
+	if (!TestClearPageCgroupAcctLRU(pc))
 		return;
+	VM_BUG_ON(!pc->mem_cgroup);
 	/*
 	 * We don't check PCG_USED bit. It's cleared when the "page" is finally
 	 * removed from global LRU.
 	 */
 	mz = page_cgroup_zoneinfo(pc);
-	mem = pc->mem_cgroup;
 	MEM_CGROUP_ZSTAT(mz, lru) -= 1;
+	if (mem_cgroup_is_root(pc->mem_cgroup))
+		return;
+	VM_BUG_ON(list_empty(&pc->lru));
 	list_del_init(&pc->lru);
 	return;
 }
@@ -410,8 +668,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
 	 */
 	smp_rmb();
-	/* unused page is not rotated. */
-	if (!PageCgroupUsed(pc))
+	/* unused or root page is not rotated. */
+	if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup))
 		return;
 	mz = page_cgroup_zoneinfo(pc);
 	list_move(&pc->lru, &mz->lists[lru]);
@@ -425,6 +683,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 	if (mem_cgroup_disabled())
 		return;
 	pc = lookup_page_cgroup(page);
+	VM_BUG_ON(PageCgroupAcctLRU(pc));
 	/*
 	 * Used bit is set without atomic ops but after smp_wmb().
 	 * For making pc->mem_cgroup visible, insert smp_rmb() here.
@@ -435,6 +694,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
 
 	mz = page_cgroup_zoneinfo(pc);
 	MEM_CGROUP_ZSTAT(mz, lru) += 1;
+	SetPageCgroupAcctLRU(pc);
+	if (mem_cgroup_is_root(pc->mem_cgroup))
+		return;
 	list_add(&pc->lru, &mz->lists[lru]);
 }
 
@@ -469,7 +731,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
 
 	spin_lock_irqsave(&zone->lru_lock, flags);
 	/* link when the page is linked to LRU but page_cgroup isn't */
-	if (PageLRU(page) && list_empty(&pc->lru))
+	if (PageLRU(page) && !PageCgroupAcctLRU(pc))
 		mem_cgroup_add_lru_list(page, page_lru(page));
 	spin_unlock_irqrestore(&zone->lru_lock, flags);
 }
@@ -648,7 +910,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 	int nid = z->zone_pgdat->node_id;
 	int zid = zone_idx(z);
 	struct mem_cgroup_per_zone *mz;
-	int lru = LRU_FILE * !!file + !!active;
+	int lru = LRU_FILE * file + active;
 	int ret;
 
 	BUG_ON(!mem_cont);
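The simplified lru index relies on enum lru_list being laid out in (anon, file) x (inactive, active) order, so the pair (file, active) maps straight to an index once both values are known to be 0 or 1, which is why the defensive !! can be dropped here. A sketch of the arithmetic (enum trimmed to the four lists involved; assumes already-normalized flags):

#include <stdio.h>

/* layout mirrors the relevant prefix of the kernel's enum lru_list:
 * anon pair first, then file pair, inactive before active */
enum lru_list {
        LRU_INACTIVE_ANON,      /* 0: !file, !active */
        LRU_ACTIVE_ANON,        /* 1: !file,  active */
        LRU_INACTIVE_FILE,      /* 2:  file, !active */
        LRU_ACTIVE_FILE,        /* 3:  file,  active */
};
#define LRU_FILE        2

int main(void)
{
        for (int file = 0; file <= 1; file++)
                for (int active = 0; active <= 1; active++)
                        /* same arithmetic as the patched isolate_pages() */
                        printf("file=%d active=%d -> lru=%d\n",
                               file, active, LRU_FILE * file + active);
        return 0;
}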
@@ -855,28 +1117,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
  * If shrink==true, for avoiding to free too much, this returns immedieately.
  */
 static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
-						gfp_t gfp_mask, bool noswap, bool shrink)
+						struct zone *zone,
+						gfp_t gfp_mask,
+						unsigned long reclaim_options)
 {
 	struct mem_cgroup *victim;
 	int ret, total = 0;
 	int loop = 0;
+	bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
+	bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
+	bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
+	unsigned long excess = mem_cgroup_get_excess(root_mem);
 
 	/* If memsw_is_minimum==1, swap-out is of-no-use. */
 	if (root_mem->memsw_is_minimum)
 		noswap = true;
 
-	while (loop < 2) {
+	while (1) {
 		victim = mem_cgroup_select_victim(root_mem);
-		if (victim == root_mem)
+		if (victim == root_mem) {
 			loop++;
+			if (loop >= 2) {
+				/*
+				 * If we have not been able to reclaim
+				 * anything, it might be because there are
+				 * no reclaimable pages under this hierarchy.
+				 */
+				if (!check_soft || !total) {
+					css_put(&victim->css);
+					break;
+				}
+				/*
+				 * We want to do more targeted reclaim.
+				 * excess >> 2 is not too excessive, so we
+				 * don't reclaim too much, nor too little,
+				 * so we don't keep coming back to reclaim
+				 * from this cgroup.
+				 */
+				if (total >= (excess >> 2) ||
+					(loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) {
+					css_put(&victim->css);
+					break;
+				}
+			}
+		}
 		if (!mem_cgroup_local_usage(&victim->stat)) {
 			/* this cgroup's local usage == 0 */
 			css_put(&victim->css);
 			continue;
 		}
 		/* we use swappiness of local cgroup */
-		ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
-						   get_swappiness(victim));
+		if (check_soft)
+			ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
+				noswap, get_swappiness(victim), zone,
+				zone->zone_pgdat->node_id);
+		else
+			ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
+						noswap, get_swappiness(victim));
 		css_put(&victim->css);
 		/*
 		 * At shrinking usage, we can't check we should stop here or
@@ -886,7 +1182,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		if (shrink)
 			return ret;
 		total += ret;
-		if (mem_cgroup_check_under_limit(root_mem))
+		if (check_soft) {
+			if (res_counter_check_under_soft_limit(&root_mem->res))
+				return total;
+		} else if (mem_cgroup_check_under_limit(root_mem))
 			return 1 + total;
 	}
 	return total;
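The reworked loop changes its exit policy by mode: hard-limit reclaim still gives up after visiting the whole hierarchy twice, while soft-limit reclaim (MEM_CGROUP_RECLAIM_SOFT) keeps going until it has reclaimed a quarter of the pages in excess or trips MEM_CGROUP_MAX_RECLAIM_LOOPS. A hedged distillation of that exit test (hypothetical helper, not a function in the patch):

#include <stdbool.h>
#include <stdio.h>

#define MEM_CGROUP_MAX_RECLAIM_LOOPS    (100)

/* after two full hierarchy walks: hard-limit mode (!check_soft) stops
 * unconditionally; soft-limit mode stops on zero progress, on having
 * reclaimed a quarter of the excess, or on hitting the loop cap */
static bool should_stop(bool check_soft, int loop,
                        unsigned long total, unsigned long excess)
{
        if (loop < 2)
                return false;
        if (!check_soft || !total)
                return true;
        return total >= (excess >> 2) || loop > MEM_CGROUP_MAX_RECLAIM_LOOPS;
}

int main(void)
{
        /* soft mode, some progress, little of the excess reclaimed yet */
        printf("%d\n", should_stop(true, 5, 10, 1024));         /* 0: go on */
        /* soft mode, a quarter of the excess reclaimed */
        printf("%d\n", should_stop(true, 5, 256, 1024));        /* 1: stop */
        return 0;
}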
@@ -965,7 +1264,7 @@ done:
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			gfp_t gfp_mask, struct mem_cgroup **memcg,
-			bool oom)
+			bool oom, struct page *page)
 {
 	struct mem_cgroup *mem, *mem_over_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -996,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	VM_BUG_ON(css_is_removed(&mem->css));
 
 	while (1) {
-		int ret;
-		bool noswap = false;
+		int ret = 0;
+		unsigned long flags = 0;
 
+		if (mem_cgroup_is_root(mem))
+			goto done;
 		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
@@ -1009,7 +1310,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 				break;
 			/* mem+swap counter fails */
 			res_counter_uncharge(&mem->res, PAGE_SIZE);
-			noswap = true;
+			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									memsw);
 		} else
@@ -1020,8 +1321,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
-		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
-							noswap, false);
+		ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+							gfp_mask, flags);
 		if (ret)
 			continue;
 
@@ -1046,13 +1347,19 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			goto nomem;
 		}
 	}
+	/*
+	 * Insert the ancestor (and the ancestor's ancestors) into the
+	 * softlimit RB-tree if they exceed the softlimit.
+	 */
+	if (mem_cgroup_soft_limit_check(mem))
+		mem_cgroup_update_tree(mem, page);
+done:
 	return 0;
 nomem:
 	css_put(&mem->css);
 	return -ENOMEM;
 }
 
-
 /*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1119,15 +1426,37 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+		if (!mem_cgroup_is_root(mem)) {
+			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			if (do_swap_account)
+				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+		}
 		css_put(&mem->css);
 		return;
 	}
+
 	pc->mem_cgroup = mem;
+	/*
+	 * We access a page_cgroup asynchronously without lock_page_cgroup().
+	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
+	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
+	 * before USED bit, we need memory barrier here.
+	 * See mem_cgroup_add_lru_list(), etc.
+	 */
 	smp_wmb();
-	pc->flags = pcg_default_flags[ctype];
+	switch (ctype) {
+	case MEM_CGROUP_CHARGE_TYPE_CACHE:
+	case MEM_CGROUP_CHARGE_TYPE_SHMEM:
+		SetPageCgroupCache(pc);
+		SetPageCgroupUsed(pc);
+		break;
+	case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+		ClearPageCgroupCache(pc);
+		SetPageCgroupUsed(pc);
+		break;
+	default:
+		break;
+	}
 
 	mem_cgroup_charge_statistics(mem, pc, true);
 
@@ -1178,7 +1507,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	if (pc->mem_cgroup != from)
 		goto out;
 
-	res_counter_uncharge(&from->res, PAGE_SIZE);
+	if (!mem_cgroup_is_root(from))
+		res_counter_uncharge(&from->res, PAGE_SIZE);
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
@@ -1197,7 +1527,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 						1);
 	}
 
-	if (do_swap_account)
+	if (do_swap_account && !mem_cgroup_is_root(from))
 		res_counter_uncharge(&from->memsw, PAGE_SIZE);
 	css_put(&from->css);
 
@@ -1238,7 +1568,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	parent = mem_cgroup_from_cont(pcg);
 
 
-	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
 	if (ret || !parent)
 		return ret;
 
@@ -1268,9 +1598,11 @@ uncharge:
 	/* drop extra refcnt by try_charge() */
 	css_put(&parent->css);
 	/* uncharge if move fails */
-	res_counter_uncharge(&parent->res, PAGE_SIZE);
-	if (do_swap_account)
-		res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+	if (!mem_cgroup_is_root(parent)) {
+		res_counter_uncharge(&parent->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+	}
 	return ret;
 }
 
@@ -1295,7 +1627,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
 	prefetchw(pc);
 
 	mem = memcg;
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+	ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
 	if (ret || !mem)
 		return ret;
 
@@ -1414,14 +1746,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
 	if (!mem)
 		goto charge_cur_mm;
 	*ptr = mem;
-	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+	ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
 	/* drop extra refcnt from tryget */
 	css_put(&mem->css);
 	return ret;
 charge_cur_mm:
 	if (unlikely(!mm))
 		mm = &init_mm;
-	return __mem_cgroup_try_charge(mm, mask, ptr, true);
+	return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
 }
 
 static void
@@ -1459,7 +1791,9 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 		 * This recorded memcg can be obsolete one. So, avoid
 		 * calling css_tryget
 		 */
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		if (!mem_cgroup_is_root(memcg))
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		mem_cgroup_swap_statistics(memcg, false);
 		mem_cgroup_put(memcg);
 	}
 	rcu_read_unlock();
@@ -1484,9 +1818,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 		return;
 	if (!mem)
 		return;
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
-	if (do_swap_account)
-		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	if (!mem_cgroup_is_root(mem)) {
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	}
 	css_put(&mem->css);
 }
 
@@ -1538,9 +1874,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	res_counter_uncharge(&mem->res, PAGE_SIZE);
-	if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	if (!mem_cgroup_is_root(mem)) {
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		if (do_swap_account &&
+			(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	}
+	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
 
 	ClearPageCgroupUsed(pc);
@@ -1554,6 +1895,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
 
+	if (mem_cgroup_soft_limit_check(mem))
+		mem_cgroup_update_tree(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		css_put(&mem->css);
@@ -1629,7 +1972,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
 		 * We uncharge this because swap is freed.
 		 * This memcg can be obsolete one. We avoid calling css_tryget
 		 */
-		res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		if (!mem_cgroup_is_root(memcg))
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+		mem_cgroup_swap_statistics(memcg, false);
 		mem_cgroup_put(memcg);
 	}
 	rcu_read_unlock();
@@ -1658,7 +2003,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
 	unlock_page_cgroup(pc);
 
 	if (mem) {
-		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+		ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+						page);
 		css_put(&mem->css);
 	}
 	*ptr = mem;
@@ -1798,8 +2144,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
-							false, true);
+		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
+							GFP_KERNEL,
+						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -1851,7 +2198,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
+		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
+						MEM_CGROUP_RECLAIM_NOSWAP |
+						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
 		/* Usage is reduced ? */
 		if (curusage >= oldusage)
@@ -1862,6 +2211,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
 	return ret;
 }
 
+unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
+						gfp_t gfp_mask, int nid,
+						int zid)
+{
+	unsigned long nr_reclaimed = 0;
+	struct mem_cgroup_per_zone *mz, *next_mz = NULL;
+	unsigned long reclaimed;
+	int loop = 0;
+	struct mem_cgroup_tree_per_zone *mctz;
+	unsigned long long excess;
+
+	if (order > 0)
+		return 0;
+
+	mctz = soft_limit_tree_node_zone(nid, zid);
+	/*
+	 * This loop can run for a while, especially if mem_cgroups
+	 * continuously keep exceeding their soft limit and putting the
+	 * system under pressure
+	 */
+	do {
+		if (next_mz)
+			mz = next_mz;
+		else
+			mz = mem_cgroup_largest_soft_limit_node(mctz);
+		if (!mz)
+			break;
+
+		reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
+						gfp_mask,
+						MEM_CGROUP_RECLAIM_SOFT);
+		nr_reclaimed += reclaimed;
+		spin_lock(&mctz->lock);
+
+		/*
+		 * If we failed to reclaim anything from this memory cgroup
+		 * it is time to move on to the next cgroup
+		 */
+		next_mz = NULL;
+		if (!reclaimed) {
+			do {
+				/*
+				 * Loop until we find yet another one.
+				 *
+				 * By the time we get the soft_limit lock
+				 * again, someone might have added the
+				 * group back on the RB tree. Iterate to
+				 * make sure we get a different mem.
+				 * mem_cgroup_largest_soft_limit_node returns
+				 * NULL if no other cgroup is present on
+				 * the tree
+				 */
+				next_mz =
+				__mem_cgroup_largest_soft_limit_node(mctz);
+				if (next_mz == mz) {
+					css_put(&next_mz->mem->css);
+					next_mz = NULL;
+				} else /* next_mz == NULL or other memcg */
+					break;
+			} while (1);
+		}
+		__mem_cgroup_remove_exceeded(mz->mem, mz, mctz);
+		excess = res_counter_soft_limit_excess(&mz->mem->res);
+		/*
+		 * One school of thought says that we should not add
+		 * back the node to the tree if reclaim returns 0.
+		 * But our reclaim could return 0, simply because, due
+		 * to priority, we are exposing a smaller subset of
+		 * memory to reclaim from. Consider this as a longer
+		 * term TODO.
+		 */
+		/* If excess == 0, no tree ops */
+		__mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess);
+		spin_unlock(&mctz->lock);
+		css_put(&mz->mem->css);
+		loop++;
+		/*
+		 * Could not reclaim anything and there are no more
+		 * mem cgroups to try or we seem to be looping without
+		 * reclaiming anything.
+		 */
+		if (!nr_reclaimed &&
+			(next_mz == NULL ||
+			loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
+			break;
+	} while (!nr_reclaimed);
+	if (next_mz)
+		css_put(&next_mz->mem->css);
+	return nr_reclaimed;
+}
+
 /*
  * This routine traverse page_cgroup in given list and drop them all.
  * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
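mem_cgroup_soft_limit_reclaim() always asks the per-zone tree for its rightmost node first, i.e. the group with the largest usage_in_excess, then reinserts it with a freshly computed excess so the ordering self-corrects. A userspace stand-in for that pick-the-worst-offender loop, with a sorted array playing the role of the RB-tree:

#include <stdio.h>

/* victims are keyed by usage_in_excess; rb_last() hands back the
 * rightmost (largest-excess) group, so reclaim always starts with
 * the worst offender */
struct victim {
        const char *name;
        unsigned long long usage_in_excess;
};

int main(void)
{
        /* kept sorted ascending, as the RB-tree would be */
        struct victim tree[] = {
                { "groupA",   16 },
                { "groupB",  512 },
                { "groupC", 4096 },
        };
        int n = 3;

        /* equivalent of repeated mem_cgroup_largest_soft_limit_node() */
        while (n > 0) {
                struct victim *v = &tree[n - 1];        /* "rb_last()" */
                printf("reclaim from %s (excess=%llu pages)\n",
                       v->name, v->usage_in_excess);
                n--;    /* "__mem_cgroup_remove_exceeded()" */
        }
        return 0;
}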
@@ -2046,20 +2486,65 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 	return retval;
 }
 
+struct mem_cgroup_idx_data {
+	s64 val;
+	enum mem_cgroup_stat_index idx;
+};
+
+static int
+mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
+{
+	struct mem_cgroup_idx_data *d = data;
+	d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
+	return 0;
+}
+
+static void
+mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
+				enum mem_cgroup_stat_index idx, s64 *val)
+{
+	struct mem_cgroup_idx_data d;
+	d.idx = idx;
+	d.val = 0;
+	mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
+	*val = d.val;
+}
+
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-	u64 val = 0;
+	u64 idx_val, val;
 	int type, name;
 
 	type = MEMFILE_TYPE(cft->private);
 	name = MEMFILE_ATTR(cft->private);
 	switch (type) {
 	case _MEM:
-		val = res_counter_read_u64(&mem->res, name);
+		if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+			mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_CACHE, &idx_val);
+			val = idx_val;
+			mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_RSS, &idx_val);
+			val += idx_val;
+			val <<= PAGE_SHIFT;
+		} else
+			val = res_counter_read_u64(&mem->res, name);
 		break;
 	case _MEMSWAP:
-		val = res_counter_read_u64(&mem->memsw, name);
+		if (name == RES_USAGE && mem_cgroup_is_root(mem)) {
+			mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_CACHE, &idx_val);
+			val = idx_val;
+			mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_RSS, &idx_val);
+			val += idx_val;
+			mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_SWAPOUT, &idx_val);
+			val += idx_val;
+			val <<= PAGE_SHIFT;
+		} else
+			val = res_counter_read_u64(&mem->memsw, name);
 		break;
 	default:
 		BUG();
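Since the root cgroup is no longer charged through its res_counter, reading usage for root now sums the hierarchical per-cpu statistics and converts pages to bytes; for memsw the swapped-out pages are added as well. A sketch of that conversion (illustrative PAGE_SHIFT; simplified signature):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12   /* illustrative; the real value is per-arch */

/* mirrors the new root-cgroup read path: usage is rebuilt from the
 * hierarchical CACHE + RSS page counts (+ SWAPOUT for memsw), then
 * converted from pages to bytes with a shift */
static uint64_t root_usage_bytes(int64_t cache, int64_t rss, int64_t swap)
{
        return (uint64_t)(cache + rss + swap) << PAGE_SHIFT;
}

int main(void)
{
        /* e.g. 300 cache pages + 200 rss pages -> 500 pages in bytes */
        printf("%llu\n",
               (unsigned long long)root_usage_bytes(300, 200, 0));
        return 0;
}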
@@ -2083,6 +2567,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 	name = MEMFILE_ATTR(cft->private);
 	switch (name) {
 	case RES_LIMIT:
+		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+			ret = -EINVAL;
+			break;
+		}
 		/* This function does all necessary parse...reuse it */
 		ret = res_counter_memparse_write_strategy(buffer, &val);
 		if (ret)
@@ -2092,6 +2580,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 		else
 			ret = mem_cgroup_resize_memsw_limit(memcg, val);
 		break;
+	case RES_SOFT_LIMIT:
+		ret = res_counter_memparse_write_strategy(buffer, &val);
+		if (ret)
+			break;
+		/*
+		 * For memsw, soft limits are hard to implement in terms
+		 * of semantics; for now, we only support soft limits on
+		 * memory control without swap.
+		 */
+		if (type == _MEM)
+			ret = res_counter_set_soft_limit(&memcg->res, val);
+		else
+			ret = -EINVAL;
+		break;
 	default:
 		ret = -EINVAL; /* should be BUG() ? */
 		break;
@@ -2149,6 +2651,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 		res_counter_reset_failcnt(&mem->memsw);
 		break;
 	}
+
 	return 0;
 }
 
@@ -2160,6 +2663,7 @@ enum {
 	MCS_MAPPED_FILE,
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
+	MCS_SWAP,
 	MCS_INACTIVE_ANON,
 	MCS_ACTIVE_ANON,
 	MCS_INACTIVE_FILE,
@@ -2181,6 +2685,7 @@ struct {
 	{"mapped_file", "total_mapped_file"},
 	{"pgpgin", "total_pgpgin"},
 	{"pgpgout", "total_pgpgout"},
+	{"swap", "total_swap"},
 	{"inactive_anon", "total_inactive_anon"},
 	{"active_anon", "total_active_anon"},
 	{"inactive_file", "total_inactive_file"},
@@ -2205,6 +2710,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 	s->stat[MCS_PGPGIN] += val;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
 	s->stat[MCS_PGPGOUT] += val;
+	if (do_swap_account) {
+		val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
+		s->stat[MCS_SWAP] += val * PAGE_SIZE;
+	}
 
 	/* per zone stat */
 	val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
@@ -2236,8 +2745,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 	memset(&mystat, 0, sizeof(mystat));
 	mem_cgroup_get_local_stat(mem_cont, &mystat);
 
-	for (i = 0; i < NR_MCS_STAT; i++)
+	for (i = 0; i < NR_MCS_STAT; i++) {
+		if (i == MCS_SWAP && !do_swap_account)
+			continue;
 		cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
+	}
 
 	/* Hierarchical information */
 	{
@@ -2250,9 +2762,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
 
 		memset(&mystat, 0, sizeof(mystat));
 		mem_cgroup_get_total_stat(mem_cont, &mystat);
-		for (i = 0; i < NR_MCS_STAT; i++)
+		for (i = 0; i < NR_MCS_STAT; i++) {
+			if (i == MCS_SWAP && !do_swap_account)
+				continue;
 			cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
-
+		}
 
 #ifdef CONFIG_DEBUG_VM
 	cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
@@ -2345,6 +2859,12 @@ static struct cftype mem_cgroup_files[] = {
 		.read_u64 = mem_cgroup_read,
 	},
 	{
+		.name = "soft_limit_in_bytes",
+		.private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
+		.write_string = mem_cgroup_write,
+		.read_u64 = mem_cgroup_read,
+	},
+	{
 		.name = "failcnt",
 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
@@ -2438,6 +2958,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
 		mz = &pn->zoneinfo[zone];
 		for_each_lru(l)
 			INIT_LIST_HEAD(&mz->lists[l]);
+		mz->usage_in_excess = 0;
+		mz->on_tree = false;
+		mz->mem = mem;
 	}
 	return 0;
 }
@@ -2483,6 +3006,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
 {
 	int node;
 
+	mem_cgroup_remove_from_trees(mem);
 	free_css_id(&mem_cgroup_subsys, &mem->css);
 
 	for_each_node_state(node, N_POSSIBLE)
@@ -2531,6 +3055,31 @@ static void __init enable_swap_cgroup(void)
 }
 #endif
 
+static int mem_cgroup_soft_limit_tree_init(void)
+{
+	struct mem_cgroup_tree_per_node *rtpn;
+	struct mem_cgroup_tree_per_zone *rtpz;
+	int tmp, node, zone;
+
+	for_each_node_state(node, N_POSSIBLE) {
+		tmp = node;
+		if (!node_state(node, N_NORMAL_MEMORY))
+			tmp = -1;
+		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+		if (!rtpn)
+			return 1;
+
+		soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+			rtpz = &rtpn->rb_tree_per_zone[zone];
+			rtpz->rb_root = RB_ROOT;
+			spin_lock_init(&rtpz->lock);
+		}
+	}
+	return 0;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
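mem_cgroup_soft_limit_tree_init() above places each node's tree array on that node's own memory when the node has normal memory, and otherwise passes -1 to kzalloc_node() to mean "no placement preference". A userspace sketch of that placement decision (has_normal_memory() and alloc_on_node() are hypothetical stand-ins):

#include <stdio.h>
#include <stdlib.h>

/* pretend only node 0 has normal memory, like a memoryless-node setup */
static int has_normal_memory(int node)
{
        return node == 0;
}

/* stands in for kzalloc_node(); userspace calloc has no node affinity */
static void *alloc_on_node(size_t size, int node)
{
        (void)node;
        return calloc(1, size);
}

int main(void)
{
        for (int node = 0; node < 2; node++) {
                int tmp = has_normal_memory(node) ? node : -1;
                void *p = alloc_on_node(128, tmp);

                printf("node %d allocated with preference %d\n", node, tmp);
                free(p);
        }
        return 0;
}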
@@ -2545,10 +3094,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	for_each_node_state(node, N_POSSIBLE)
 		if (alloc_mem_cgroup_per_zone_info(mem, node))
 			goto free_out;
+
 	/* root ? */
 	if (cont->parent == NULL) {
 		enable_swap_cgroup();
 		parent = NULL;
+		root_mem_cgroup = mem;
+		if (mem_cgroup_soft_limit_tree_init())
+			goto free_out;
+
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
@@ -2577,6 +3131,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 	return &mem->css;
 free_out:
 	__mem_cgroup_free(mem);
+	root_mem_cgroup = NULL;
 	return ERR_PTR(error);
 }
 
@@ -2612,7 +3167,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss,
 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct cgroup *cont,
 				struct cgroup *old_cont,
-				struct task_struct *p)
+				struct task_struct *p,
+				bool threadgroup)
 {
 	mutex_lock(&memcg_tasklist);
 	/*