Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 714 |
1 file changed, 635 insertions, 79 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fd4529d86de5..f99f5991d6bb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/rcupdate.h> | 29 | #include <linux/rcupdate.h> |
30 | #include <linux/limits.h> | 30 | #include <linux/limits.h> |
31 | #include <linux/mutex.h> | 31 | #include <linux/mutex.h> |
32 | #include <linux/rbtree.h> | ||
32 | #include <linux/slab.h> | 33 | #include <linux/slab.h> |
33 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
34 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
@@ -43,6 +44,7 @@ | |||
43 | 44 | ||
44 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 45 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
45 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 46 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
47 | struct mem_cgroup *root_mem_cgroup __read_mostly; | ||
46 | 48 | ||
47 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 49 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
48 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 50 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
53 | #endif | 55 | #endif |
54 | 56 | ||
55 | static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */ | 57 | static DEFINE_MUTEX(memcg_tasklist); /* can be held under cgroup_mutex */ |
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | ||
56 | 59 | ||
57 | /* | 60 | /* |
58 | * Statistics for memory cgroup. | 61 | * Statistics for memory cgroup. |
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index { | |||
66 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ |
67 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
68 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | ||
69 | 74 | ||
70 | MEM_CGROUP_STAT_NSTATS, | 75 | MEM_CGROUP_STAT_NSTATS, |
71 | }; | 76 | }; |
@@ -78,6 +83,20 @@ struct mem_cgroup_stat { | |||
78 | struct mem_cgroup_stat_cpu cpustat[0]; | 83 | struct mem_cgroup_stat_cpu cpustat[0]; |
79 | }; | 84 | }; |
80 | 85 | ||
86 | static inline void | ||
87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
88 | enum mem_cgroup_stat_index idx) | ||
89 | { | ||
90 | stat->count[idx] = 0; | ||
91 | } | ||
92 | |||
93 | static inline s64 | ||
94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
95 | enum mem_cgroup_stat_index idx) | ||
96 | { | ||
97 | return stat->count[idx]; | ||
98 | } | ||
99 | |||
81 | /* | 100 | /* |
82 | * For accounting under irq disable, no need for increment preempt count. | 101 | * For accounting under irq disable, no need for increment preempt count. |
83 | */ | 102 | */ |
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone { | |||
117 | unsigned long count[NR_LRU_LISTS]; | 136 | unsigned long count[NR_LRU_LISTS]; |
118 | 137 | ||
119 | struct zone_reclaim_stat reclaim_stat; | 138 | struct zone_reclaim_stat reclaim_stat; |
139 | struct rb_node tree_node; /* RB tree node */ | ||
140 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
141 | /* the soft limit is exceeded*/ | ||
142 | bool on_tree; | ||
143 | struct mem_cgroup *mem; /* Back pointer, we cannot */ | ||
144 | /* use container_of */ | ||
120 | }; | 145 | }; |
121 | /* Macro for accessing counter */ | 146 | /* Macro for accessing counter */ |
122 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 147 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info { | |||
130 | }; | 155 | }; |
131 | 156 | ||
132 | /* | 157 | /* |
158 | * Cgroups above their limits are maintained in a RB-Tree, independent of | ||
159 | * their hierarchy representation | ||
160 | */ | ||
161 | |||
162 | struct mem_cgroup_tree_per_zone { | ||
163 | struct rb_root rb_root; | ||
164 | spinlock_t lock; | ||
165 | }; | ||
166 | |||
167 | struct mem_cgroup_tree_per_node { | ||
168 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
169 | }; | ||
170 | |||
171 | struct mem_cgroup_tree { | ||
172 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
173 | }; | ||
174 | |||
175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
176 | |||
177 | /* | ||
133 | * The memory controller data structure. The memory controller controls both | 178 | * The memory controller data structure. The memory controller controls both |
134 | * page cache and RSS per cgroup. We would eventually like to provide | 179 | * page cache and RSS per cgroup. We would eventually like to provide |
135 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 180 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
@@ -186,6 +231,13 @@ struct mem_cgroup { | |||
186 | struct mem_cgroup_stat stat; | 231 | struct mem_cgroup_stat stat; |
187 | }; | 232 | }; |
188 | 233 | ||
234 | /* | ||
235 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | ||
236 | * limit reclaim to prevent infinite loops, if they ever occur. | ||
237 | */ | ||
238 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) | ||
239 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) | ||
240 | |||
189 | enum charge_type { | 241 | enum charge_type { |
190 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 242 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
191 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 243 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
@@ -200,13 +252,8 @@ enum charge_type { | |||
200 | #define PCGF_CACHE (1UL << PCG_CACHE) | 252 | #define PCGF_CACHE (1UL << PCG_CACHE) |
201 | #define PCGF_USED (1UL << PCG_USED) | 253 | #define PCGF_USED (1UL << PCG_USED) |
202 | #define PCGF_LOCK (1UL << PCG_LOCK) | 254 | #define PCGF_LOCK (1UL << PCG_LOCK) |
203 | static const unsigned long | 255 | /* Not used, but added here for completeness */ |
204 | pcg_default_flags[NR_CHARGE_TYPE] = { | 256 | #define PCGF_ACCT (1UL << PCG_ACCT) |
205 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ | ||
206 | PCGF_USED | PCGF_LOCK, /* Anon */ | ||
207 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | ||
208 | 0, /* FORCE */ | ||
209 | }; | ||
210 | 257 | ||
211 | /* for encoding cft->private value on file */ | 258 | /* for encoding cft->private value on file */ |
212 | #define _MEM (0) | 259 | #define _MEM (0) |
@@ -215,15 +262,237 @@ pcg_default_flags[NR_CHARGE_TYPE] = { | |||
215 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 262 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
216 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 263 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
217 | 264 | ||
265 | /* | ||
266 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | ||
267 | */ | ||
268 | #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 | ||
269 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) | ||
270 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | ||
271 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | ||
272 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 | ||
273 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) | ||
274 | |||
218 | static void mem_cgroup_get(struct mem_cgroup *mem); | 275 | static void mem_cgroup_get(struct mem_cgroup *mem); |
219 | static void mem_cgroup_put(struct mem_cgroup *mem); | 276 | static void mem_cgroup_put(struct mem_cgroup *mem); |
220 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
221 | 278 | ||
279 | static struct mem_cgroup_per_zone * | ||
280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
281 | { | ||
282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
283 | } | ||
284 | |||
285 | static struct mem_cgroup_per_zone * | ||
286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
287 | { | ||
288 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
289 | int nid = page_cgroup_nid(pc); | ||
290 | int zid = page_cgroup_zid(pc); | ||
291 | |||
292 | if (!mem) | ||
293 | return NULL; | ||
294 | |||
295 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
296 | } | ||
297 | |||
298 | static struct mem_cgroup_tree_per_zone * | ||
299 | soft_limit_tree_node_zone(int nid, int zid) | ||
300 | { | ||
301 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
302 | } | ||
303 | |||
304 | static struct mem_cgroup_tree_per_zone * | ||
305 | soft_limit_tree_from_page(struct page *page) | ||
306 | { | ||
307 | int nid = page_to_nid(page); | ||
308 | int zid = page_zonenum(page); | ||
309 | |||
310 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
311 | } | ||
312 | |||
313 | static void | ||
314 | __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | ||
315 | struct mem_cgroup_per_zone *mz, | ||
316 | struct mem_cgroup_tree_per_zone *mctz, | ||
317 | unsigned long long new_usage_in_excess) | ||
318 | { | ||
319 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
320 | struct rb_node *parent = NULL; | ||
321 | struct mem_cgroup_per_zone *mz_node; | ||
322 | |||
323 | if (mz->on_tree) | ||
324 | return; | ||
325 | |||
326 | mz->usage_in_excess = new_usage_in_excess; | ||
327 | if (!mz->usage_in_excess) | ||
328 | return; | ||
329 | while (*p) { | ||
330 | parent = *p; | ||
331 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
332 | tree_node); | ||
333 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
334 | p = &(*p)->rb_left; | ||
335 | /* | ||
336 | * Several mem cgroups can exceed their soft limit by exactly | ||
337 | * the same amount; ties go to the right so they all stay on the tree. | ||
338 | */ | ||
339 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
340 | p = &(*p)->rb_right; | ||
341 | } | ||
342 | rb_link_node(&mz->tree_node, parent, p); | ||
343 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
344 | mz->on_tree = true; | ||
345 | } | ||
346 | |||
347 | static void | ||
348 | __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | ||
349 | struct mem_cgroup_per_zone *mz, | ||
350 | struct mem_cgroup_tree_per_zone *mctz) | ||
351 | { | ||
352 | if (!mz->on_tree) | ||
353 | return; | ||
354 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
355 | mz->on_tree = false; | ||
356 | } | ||
357 | |||
358 | static void | ||
359 | mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | ||
360 | struct mem_cgroup_per_zone *mz, | ||
361 | struct mem_cgroup_tree_per_zone *mctz) | ||
362 | { | ||
363 | spin_lock(&mctz->lock); | ||
364 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
365 | spin_unlock(&mctz->lock); | ||
366 | } | ||
367 | |||
368 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
369 | { | ||
370 | bool ret = false; | ||
371 | int cpu; | ||
372 | s64 val; | ||
373 | struct mem_cgroup_stat_cpu *cpustat; | ||
374 | |||
375 | cpu = get_cpu(); | ||
376 | cpustat = &mem->stat.cpustat[cpu]; | ||
377 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
378 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
379 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
380 | ret = true; | ||
381 | } | ||
382 | put_cpu(); | ||
383 | return ret; | ||
384 | } | ||
385 | |||
386 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | ||
387 | { | ||
388 | unsigned long long excess; | ||
389 | struct mem_cgroup_per_zone *mz; | ||
390 | struct mem_cgroup_tree_per_zone *mctz; | ||
391 | int nid = page_to_nid(page); | ||
392 | int zid = page_zonenum(page); | ||
393 | mctz = soft_limit_tree_from_page(page); | ||
394 | |||
395 | /* | ||
396 | * Necessary to update all ancestors when hierarchy is used, | ||
397 | * because their event counters are not touched. | ||
398 | */ | ||
399 | for (; mem; mem = parent_mem_cgroup(mem)) { | ||
400 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
401 | excess = res_counter_soft_limit_excess(&mem->res); | ||
402 | /* | ||
403 | * We have to update the tree if mz is on RB-tree or | ||
404 | * mem is over its soft limit. | ||
405 | */ | ||
406 | if (excess || mz->on_tree) { | ||
407 | spin_lock(&mctz->lock); | ||
408 | /* if on-tree, remove it */ | ||
409 | if (mz->on_tree) | ||
410 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
411 | /* | ||
412 | * Insert again. mz->usage_in_excess will be updated. | ||
413 | * If excess is 0, no tree ops. | ||
414 | */ | ||
415 | __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); | ||
416 | spin_unlock(&mctz->lock); | ||
417 | } | ||
418 | } | ||
419 | } | ||
420 | |||
421 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | ||
422 | { | ||
423 | int node, zone; | ||
424 | struct mem_cgroup_per_zone *mz; | ||
425 | struct mem_cgroup_tree_per_zone *mctz; | ||
426 | |||
427 | for_each_node_state(node, N_POSSIBLE) { | ||
428 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
429 | mz = mem_cgroup_zoneinfo(mem, node, zone); | ||
430 | mctz = soft_limit_tree_node_zone(node, zone); | ||
431 | mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
432 | } | ||
433 | } | ||
434 | } | ||
435 | |||
436 | static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) | ||
437 | { | ||
438 | return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; | ||
439 | } | ||
440 | |||
441 | static struct mem_cgroup_per_zone * | ||
442 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
443 | { | ||
444 | struct rb_node *rightmost = NULL; | ||
445 | struct mem_cgroup_per_zone *mz; | ||
446 | |||
447 | retry: | ||
448 | mz = NULL; | ||
449 | rightmost = rb_last(&mctz->rb_root); | ||
450 | if (!rightmost) | ||
451 | goto done; /* Nothing to reclaim from */ | ||
452 | |||
453 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
454 | /* | ||
455 | * Remove the node now but someone else can add it back, | ||
456 | * we will add it back at the end of reclaim to its correct | ||
457 | * position in the tree. | ||
458 | */ | ||
459 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | ||
460 | if (!res_counter_soft_limit_excess(&mz->mem->res) || | ||
461 | !css_tryget(&mz->mem->css)) | ||
462 | goto retry; | ||
463 | done: | ||
464 | return mz; | ||
465 | } | ||
466 | |||
467 | static struct mem_cgroup_per_zone * | ||
468 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
469 | { | ||
470 | struct mem_cgroup_per_zone *mz; | ||
471 | |||
472 | spin_lock(&mctz->lock); | ||
473 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
474 | spin_unlock(&mctz->lock); | ||
475 | return mz; | ||
476 | } | ||
477 | |||
478 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | ||
479 | bool charge) | ||
480 | { | ||
481 | int val = (charge) ? 1 : -1; | ||
482 | struct mem_cgroup_stat *stat = &mem->stat; | ||
483 | struct mem_cgroup_stat_cpu *cpustat; | ||
484 | int cpu = get_cpu(); | ||
485 | |||
486 | cpustat = &stat->cpustat[cpu]; | ||
487 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
488 | put_cpu(); | ||
489 | } | ||
490 | |||
222 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 491 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
223 | struct page_cgroup *pc, | 492 | struct page_cgroup *pc, |
224 | bool charge) | 493 | bool charge) |
225 | { | 494 | { |
226 | int val = (charge)? 1 : -1; | 495 | int val = (charge) ? 1 : -1; |
227 | struct mem_cgroup_stat *stat = &mem->stat; | 496 | struct mem_cgroup_stat *stat = &mem->stat; |
228 | struct mem_cgroup_stat_cpu *cpustat; | 497 | struct mem_cgroup_stat_cpu *cpustat; |
229 | int cpu = get_cpu(); | 498 | int cpu = get_cpu(); |
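The machinery added above keeps one RB-tree per (node, zone) pair, ordered by usage_in_excess, and rate-limits how often it is touched: the charge/uncharge path (next hunk) bumps MEM_CGROUP_STAT_EVENTS, and only when a CPU's counter crosses SOFTLIMIT_EVENTS_THRESH does mem_cgroup_soft_limit_check() ask for a tree update. A minimal userspace sketch of that rate-limiting idea, with a single counter standing in for the per-cpu one (illustration only, not kernel code):

	#include <stdbool.h>
	#include <stdio.h>

	#define SOFTLIMIT_EVENTS_THRESH (1000)

	static long events;

	static bool soft_limit_check(void)
	{
		events++;		/* done in mem_cgroup_charge_statistics() */
		if (events > SOFTLIMIT_EVENTS_THRESH) {
			events = 0;	/* __mem_cgroup_stat_reset_safe() */
			return true;	/* caller then calls mem_cgroup_update_tree() */
		}
		return false;
	}

	int main(void)
	{
		int updates = 0;

		for (int i = 0; i < 10000; i++)
			if (soft_limit_check())
				updates++;
		printf("%d tree updates for 10000 charge/uncharge events\n", updates); /* 9 */
		return 0;
	}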
@@ -240,28 +509,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
240 | else | 509 | else |
241 | __mem_cgroup_stat_add_safe(cpustat, | 510 | __mem_cgroup_stat_add_safe(cpustat, |
242 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 511 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
512 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | ||
243 | put_cpu(); | 513 | put_cpu(); |
244 | } | 514 | } |
245 | 515 | ||
246 | static struct mem_cgroup_per_zone * | ||
247 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
248 | { | ||
249 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
250 | } | ||
251 | |||
252 | static struct mem_cgroup_per_zone * | ||
253 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
254 | { | ||
255 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
256 | int nid = page_cgroup_nid(pc); | ||
257 | int zid = page_cgroup_zid(pc); | ||
258 | |||
259 | if (!mem) | ||
260 | return NULL; | ||
261 | |||
262 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
263 | } | ||
264 | |||
265 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 516 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
266 | enum lru_list idx) | 517 | enum lru_list idx) |
267 | { | 518 | { |
@@ -354,6 +605,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | |||
354 | return ret; | 605 | return ret; |
355 | } | 606 | } |
356 | 607 | ||
608 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | ||
609 | { | ||
610 | return (mem == root_mem_cgroup); | ||
611 | } | ||
612 | |||
357 | /* | 613 | /* |
358 | * Following LRU functions are allowed to be used without PCG_LOCK. | 614 | * Following LRU functions are allowed to be used without PCG_LOCK. |
359 | * Operations are called by routine of global LRU independently from memcg. | 615 | * Operations are called by routine of global LRU independently from memcg. |
@@ -371,22 +627,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | |||
371 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | 627 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) |
372 | { | 628 | { |
373 | struct page_cgroup *pc; | 629 | struct page_cgroup *pc; |
374 | struct mem_cgroup *mem; | ||
375 | struct mem_cgroup_per_zone *mz; | 630 | struct mem_cgroup_per_zone *mz; |
376 | 631 | ||
377 | if (mem_cgroup_disabled()) | 632 | if (mem_cgroup_disabled()) |
378 | return; | 633 | return; |
379 | pc = lookup_page_cgroup(page); | 634 | pc = lookup_page_cgroup(page); |
380 | /* can happen while we handle swapcache. */ | 635 | /* can happen while we handle swapcache. */ |
381 | if (list_empty(&pc->lru) || !pc->mem_cgroup) | 636 | if (!TestClearPageCgroupAcctLRU(pc)) |
382 | return; | 637 | return; |
638 | VM_BUG_ON(!pc->mem_cgroup); | ||
383 | /* | 639 | /* |
384 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | 640 | * We don't check PCG_USED bit. It's cleared when the "page" is finally |
385 | * removed from global LRU. | 641 | * removed from global LRU. |
386 | */ | 642 | */ |
387 | mz = page_cgroup_zoneinfo(pc); | 643 | mz = page_cgroup_zoneinfo(pc); |
388 | mem = pc->mem_cgroup; | ||
389 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 644 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
645 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
646 | return; | ||
647 | VM_BUG_ON(list_empty(&pc->lru)); | ||
390 | list_del_init(&pc->lru); | 648 | list_del_init(&pc->lru); |
391 | return; | 649 | return; |
392 | } | 650 | } |
@@ -410,8 +668,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | |||
410 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 668 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
411 | */ | 669 | */ |
412 | smp_rmb(); | 670 | smp_rmb(); |
413 | /* unused page is not rotated. */ | 671 | /* unused or root page is not rotated. */ |
414 | if (!PageCgroupUsed(pc)) | 672 | if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) |
415 | return; | 673 | return; |
416 | mz = page_cgroup_zoneinfo(pc); | 674 | mz = page_cgroup_zoneinfo(pc); |
417 | list_move(&pc->lru, &mz->lists[lru]); | 675 | list_move(&pc->lru, &mz->lists[lru]); |
@@ -425,6 +683,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
425 | if (mem_cgroup_disabled()) | 683 | if (mem_cgroup_disabled()) |
426 | return; | 684 | return; |
427 | pc = lookup_page_cgroup(page); | 685 | pc = lookup_page_cgroup(page); |
686 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | ||
428 | /* | 687 | /* |
429 | * Used bit is set without atomic ops but after smp_wmb(). | 688 | * Used bit is set without atomic ops but after smp_wmb(). |
430 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 689 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
@@ -435,6 +694,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
435 | 694 | ||
436 | mz = page_cgroup_zoneinfo(pc); | 695 | mz = page_cgroup_zoneinfo(pc); |
437 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 696 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
697 | SetPageCgroupAcctLRU(pc); | ||
698 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
699 | return; | ||
438 | list_add(&pc->lru, &mz->lists[lru]); | 700 | list_add(&pc->lru, &mz->lists[lru]); |
439 | } | 701 | } |
440 | 702 | ||
@@ -469,7 +731,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) | |||
469 | 731 | ||
470 | spin_lock_irqsave(&zone->lru_lock, flags); | 732 | spin_lock_irqsave(&zone->lru_lock, flags); |
471 | /* link when the page is linked to LRU but page_cgroup isn't */ | 733 | /* link when the page is linked to LRU but page_cgroup isn't */ |
472 | if (PageLRU(page) && list_empty(&pc->lru)) | 734 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) |
473 | mem_cgroup_add_lru_list(page, page_lru(page)); | 735 | mem_cgroup_add_lru_list(page, page_lru(page)); |
474 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 736 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
475 | } | 737 | } |
@@ -648,7 +910,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
648 | int nid = z->zone_pgdat->node_id; | 910 | int nid = z->zone_pgdat->node_id; |
649 | int zid = zone_idx(z); | 911 | int zid = zone_idx(z); |
650 | struct mem_cgroup_per_zone *mz; | 912 | struct mem_cgroup_per_zone *mz; |
651 | int lru = LRU_FILE * !!file + !!active; | 913 | int lru = LRU_FILE * file + active; |
652 | int ret; | 914 | int ret; |
653 | 915 | ||
654 | BUG_ON(!mem_cont); | 916 | BUG_ON(!mem_cont); |
@@ -855,28 +1117,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
855 | * If shrink==true, to avoid freeing too much, this returns immediately. | 1117 | * If shrink==true, to avoid freeing too much, this returns immediately. |
856 | */ | 1118 | */ |
857 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | 1119 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, |
858 | gfp_t gfp_mask, bool noswap, bool shrink) | 1120 | struct zone *zone, |
1121 | gfp_t gfp_mask, | ||
1122 | unsigned long reclaim_options) | ||
859 | { | 1123 | { |
860 | struct mem_cgroup *victim; | 1124 | struct mem_cgroup *victim; |
861 | int ret, total = 0; | 1125 | int ret, total = 0; |
862 | int loop = 0; | 1126 | int loop = 0; |
1127 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | ||
1128 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | ||
1129 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | ||
1130 | unsigned long excess = mem_cgroup_get_excess(root_mem); | ||
863 | 1131 | ||
864 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1132 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
865 | if (root_mem->memsw_is_minimum) | 1133 | if (root_mem->memsw_is_minimum) |
866 | noswap = true; | 1134 | noswap = true; |
867 | 1135 | ||
868 | while (loop < 2) { | 1136 | while (1) { |
869 | victim = mem_cgroup_select_victim(root_mem); | 1137 | victim = mem_cgroup_select_victim(root_mem); |
870 | if (victim == root_mem) | 1138 | if (victim == root_mem) { |
871 | loop++; | 1139 | loop++; |
1140 | if (loop >= 2) { | ||
1141 | /* | ||
1142 | * If we have not been able to reclaim | ||
1143 | * anything, it might be because there are | ||
1144 | * no reclaimable pages under this hierarchy | ||
1145 | */ | ||
1146 | if (!check_soft || !total) { | ||
1147 | css_put(&victim->css); | ||
1148 | break; | ||
1149 | } | ||
1150 | /* | ||
1151 | * We want to do more targeted reclaim. | ||
1152 | * excess >> 2 is not too excessive, so as not to | ||
1153 | * reclaim too much, nor too little, which would keep | ||
1154 | * us coming back to reclaim from this cgroup | ||
1155 | */ | ||
1156 | if (total >= (excess >> 2) || | ||
1157 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { | ||
1158 | css_put(&victim->css); | ||
1159 | break; | ||
1160 | } | ||
1161 | } | ||
1162 | } | ||
872 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1163 | if (!mem_cgroup_local_usage(&victim->stat)) { |
873 | /* this cgroup's local usage == 0 */ | 1164 | /* this cgroup's local usage == 0 */ |
874 | css_put(&victim->css); | 1165 | css_put(&victim->css); |
875 | continue; | 1166 | continue; |
876 | } | 1167 | } |
877 | /* we use swappiness of local cgroup */ | 1168 | /* we use swappiness of local cgroup */ |
878 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, | 1169 | if (check_soft) |
879 | get_swappiness(victim)); | 1170 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1171 | noswap, get_swappiness(victim), zone, | ||
1172 | zone->zone_pgdat->node_id); | ||
1173 | else | ||
1174 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | ||
1175 | noswap, get_swappiness(victim)); | ||
880 | css_put(&victim->css); | 1176 | css_put(&victim->css); |
881 | /* | 1177 | /* |
882 | * At shrinking usage, we can't check we should stop here or | 1178 | * At shrinking usage, we can't check we should stop here or |
@@ -886,7 +1182,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
886 | if (shrink) | 1182 | if (shrink) |
887 | return ret; | 1183 | return ret; |
888 | total += ret; | 1184 | total += ret; |
889 | if (mem_cgroup_check_under_limit(root_mem)) | 1185 | if (check_soft) { |
1186 | if (res_counter_check_under_soft_limit(&root_mem->res)) | ||
1187 | return total; | ||
1188 | } else if (mem_cgroup_check_under_limit(root_mem)) | ||
890 | return 1 + total; | 1189 | return 1 + total; |
891 | } | 1190 | } |
892 | return total; | 1191 | return total; |
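With the reclaim_options bitmask replacing the old noswap/shrink booleans, the soft limit path (MEM_CGROUP_RECLAIM_SOFT) gets its own termination rule: once every cgroup in the hierarchy has been visited at least twice, reclaim stops when nothing was freed, when roughly a quarter of the excess has been freed, or after MEM_CGROUP_MAX_RECLAIM_LOOPS passes. A standalone restatement of that stop condition (hypothetical helper, illustration only):

	#include <stdbool.h>
	#include <stdio.h>

	#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)

	/* Stop rule for the soft-limit (check_soft) path, once the whole
	 * hierarchy has been visited at least twice (loop >= 2). */
	static bool soft_reclaim_done(unsigned long total, unsigned long excess, int loop)
	{
		if (loop < 2)
			return false;
		if (!total)					/* nothing reclaimed at all */
			return true;
		return total >= (excess >> 2) ||		/* freed ~1/4 of the excess */
		       loop > MEM_CGROUP_MAX_RECLAIM_LOOPS;	/* safety valve */
	}

	int main(void)
	{
		/* A group 400 pages over its soft limit stops after ~100 pages. */
		printf("%d\n", soft_reclaim_done(100, 400, 2));	/* 1 */
		printf("%d\n", soft_reclaim_done(40, 400, 2));	/* 0 */
		return 0;
	}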
@@ -965,7 +1264,7 @@ done: | |||
965 | */ | 1264 | */ |
966 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1265 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
967 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1266 | gfp_t gfp_mask, struct mem_cgroup **memcg, |
968 | bool oom) | 1267 | bool oom, struct page *page) |
969 | { | 1268 | { |
970 | struct mem_cgroup *mem, *mem_over_limit; | 1269 | struct mem_cgroup *mem, *mem_over_limit; |
971 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1270 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
@@ -996,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
996 | VM_BUG_ON(css_is_removed(&mem->css)); | 1295 | VM_BUG_ON(css_is_removed(&mem->css)); |
997 | 1296 | ||
998 | while (1) { | 1297 | while (1) { |
999 | int ret; | 1298 | int ret = 0; |
1000 | bool noswap = false; | 1299 | unsigned long flags = 0; |
1001 | 1300 | ||
1301 | if (mem_cgroup_is_root(mem)) | ||
1302 | goto done; | ||
1002 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1303 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); |
1003 | if (likely(!ret)) { | 1304 | if (likely(!ret)) { |
1004 | if (!do_swap_account) | 1305 | if (!do_swap_account) |
@@ -1009,7 +1310,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1009 | break; | 1310 | break; |
1010 | /* mem+swap counter fails */ | 1311 | /* mem+swap counter fails */ |
1011 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1312 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
1012 | noswap = true; | 1313 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1013 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1314 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1014 | memsw); | 1315 | memsw); |
1015 | } else | 1316 | } else |
@@ -1020,8 +1321,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1020 | if (!(gfp_mask & __GFP_WAIT)) | 1321 | if (!(gfp_mask & __GFP_WAIT)) |
1021 | goto nomem; | 1322 | goto nomem; |
1022 | 1323 | ||
1023 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, | 1324 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, |
1024 | noswap, false); | 1325 | gfp_mask, flags); |
1025 | if (ret) | 1326 | if (ret) |
1026 | continue; | 1327 | continue; |
1027 | 1328 | ||
@@ -1046,13 +1347,19 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1046 | goto nomem; | 1347 | goto nomem; |
1047 | } | 1348 | } |
1048 | } | 1349 | } |
1350 | /* | ||
1351 | * Insert this memcg and its ancestors into the soft limit RB-tree | ||
1352 | * if they exceed their soft limit. | ||
1353 | */ | ||
1354 | if (mem_cgroup_soft_limit_check(mem)) | ||
1355 | mem_cgroup_update_tree(mem, page); | ||
1356 | done: | ||
1049 | return 0; | 1357 | return 0; |
1050 | nomem: | 1358 | nomem: |
1051 | css_put(&mem->css); | 1359 | css_put(&mem->css); |
1052 | return -ENOMEM; | 1360 | return -ENOMEM; |
1053 | } | 1361 | } |
1054 | 1362 | ||
1055 | |||
1056 | /* | 1363 | /* |
1057 | * A helper function to get mem_cgroup from ID. must be called under | 1364 | * A helper function to get mem_cgroup from ID. must be called under |
1058 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 1365 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
@@ -1119,15 +1426,37 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1119 | lock_page_cgroup(pc); | 1426 | lock_page_cgroup(pc); |
1120 | if (unlikely(PageCgroupUsed(pc))) { | 1427 | if (unlikely(PageCgroupUsed(pc))) { |
1121 | unlock_page_cgroup(pc); | 1428 | unlock_page_cgroup(pc); |
1122 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1429 | if (!mem_cgroup_is_root(mem)) { |
1123 | if (do_swap_account) | 1430 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
1124 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1431 | if (do_swap_account) |
1432 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1433 | } | ||
1125 | css_put(&mem->css); | 1434 | css_put(&mem->css); |
1126 | return; | 1435 | return; |
1127 | } | 1436 | } |
1437 | |||
1128 | pc->mem_cgroup = mem; | 1438 | pc->mem_cgroup = mem; |
1439 | /* | ||
1440 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | ||
1441 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup | ||
1442 | * is accessed after testing the USED bit. To make pc->mem_cgroup visible | ||
1443 | * before the USED bit, we need a memory barrier here. | ||
1444 | * See mem_cgroup_add_lru_list(), etc. | ||
1445 | */ | ||
1129 | smp_wmb(); | 1446 | smp_wmb(); |
1130 | pc->flags = pcg_default_flags[ctype]; | 1447 | switch (ctype) { |
1448 | case MEM_CGROUP_CHARGE_TYPE_CACHE: | ||
1449 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: | ||
1450 | SetPageCgroupCache(pc); | ||
1451 | SetPageCgroupUsed(pc); | ||
1452 | break; | ||
1453 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | ||
1454 | ClearPageCgroupCache(pc); | ||
1455 | SetPageCgroupUsed(pc); | ||
1456 | break; | ||
1457 | default: | ||
1458 | break; | ||
1459 | } | ||
1131 | 1460 | ||
1132 | mem_cgroup_charge_statistics(mem, pc, true); | 1461 | mem_cgroup_charge_statistics(mem, pc, true); |
1133 | 1462 | ||
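The per-charge-type switch above replaces the pcg_default_flags[] table removed earlier, and the new comment spells out the ordering requirement: pc->mem_cgroup must be published before the USED bit becomes visible, hence the smp_wmb() paired with the smp_rmb() in the LRU helpers. A userspace analogue of that publish/observe ordering using C11 release/acquire semantics (hypothetical names, illustration only; the kernel code uses smp_wmb()/smp_rmb()):

	#include <stdatomic.h>
	#include <stdio.h>

	struct pc_like {
		void *mem_cgroup;
		atomic_bool used;			/* stands in for PCG_USED */
	};

	static void commit(struct pc_like *pc, void *memcg)
	{
		pc->mem_cgroup = memcg;			/* plain store */
		atomic_store_explicit(&pc->used, 1,
				      memory_order_release);	/* like smp_wmb() + SetPageCgroupUsed() */
	}

	static void *observe(struct pc_like *pc)
	{
		if (!atomic_load_explicit(&pc->used,
					  memory_order_acquire))	/* like PageCgroupUsed() + smp_rmb() */
			return NULL;
		return pc->mem_cgroup;			/* safe: pointer was published first */
	}

	int main(void)
	{
		static struct pc_like pc;
		int memcg_stub;

		commit(&pc, &memcg_stub);
		printf("%s\n", observe(&pc) ? "visible" : "not yet");
		return 0;
	}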
@@ -1178,7 +1507,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1178 | if (pc->mem_cgroup != from) | 1507 | if (pc->mem_cgroup != from) |
1179 | goto out; | 1508 | goto out; |
1180 | 1509 | ||
1181 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1510 | if (!mem_cgroup_is_root(from)) |
1511 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
1182 | mem_cgroup_charge_statistics(from, pc, false); | 1512 | mem_cgroup_charge_statistics(from, pc, false); |
1183 | 1513 | ||
1184 | page = pc->page; | 1514 | page = pc->page; |
@@ -1197,7 +1527,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1197 | 1); | 1527 | 1); |
1198 | } | 1528 | } |
1199 | 1529 | ||
1200 | if (do_swap_account) | 1530 | if (do_swap_account && !mem_cgroup_is_root(from)) |
1201 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | 1531 | res_counter_uncharge(&from->memsw, PAGE_SIZE); |
1202 | css_put(&from->css); | 1532 | css_put(&from->css); |
1203 | 1533 | ||
@@ -1238,7 +1568,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1238 | parent = mem_cgroup_from_cont(pcg); | 1568 | parent = mem_cgroup_from_cont(pcg); |
1239 | 1569 | ||
1240 | 1570 | ||
1241 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 1571 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); |
1242 | if (ret || !parent) | 1572 | if (ret || !parent) |
1243 | return ret; | 1573 | return ret; |
1244 | 1574 | ||
@@ -1268,9 +1598,11 @@ uncharge: | |||
1268 | /* drop extra refcnt by try_charge() */ | 1598 | /* drop extra refcnt by try_charge() */ |
1269 | css_put(&parent->css); | 1599 | css_put(&parent->css); |
1270 | /* uncharge if move fails */ | 1600 | /* uncharge if move fails */ |
1271 | res_counter_uncharge(&parent->res, PAGE_SIZE); | 1601 | if (!mem_cgroup_is_root(parent)) { |
1272 | if (do_swap_account) | 1602 | res_counter_uncharge(&parent->res, PAGE_SIZE); |
1273 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | 1603 | if (do_swap_account) |
1604 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
1605 | } | ||
1274 | return ret; | 1606 | return ret; |
1275 | } | 1607 | } |
1276 | 1608 | ||
@@ -1295,7 +1627,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
1295 | prefetchw(pc); | 1627 | prefetchw(pc); |
1296 | 1628 | ||
1297 | mem = memcg; | 1629 | mem = memcg; |
1298 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 1630 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); |
1299 | if (ret || !mem) | 1631 | if (ret || !mem) |
1300 | return ret; | 1632 | return ret; |
1301 | 1633 | ||
@@ -1414,14 +1746,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1414 | if (!mem) | 1746 | if (!mem) |
1415 | goto charge_cur_mm; | 1747 | goto charge_cur_mm; |
1416 | *ptr = mem; | 1748 | *ptr = mem; |
1417 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | 1749 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); |
1418 | /* drop extra refcnt from tryget */ | 1750 | /* drop extra refcnt from tryget */ |
1419 | css_put(&mem->css); | 1751 | css_put(&mem->css); |
1420 | return ret; | 1752 | return ret; |
1421 | charge_cur_mm: | 1753 | charge_cur_mm: |
1422 | if (unlikely(!mm)) | 1754 | if (unlikely(!mm)) |
1423 | mm = &init_mm; | 1755 | mm = &init_mm; |
1424 | return __mem_cgroup_try_charge(mm, mask, ptr, true); | 1756 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); |
1425 | } | 1757 | } |
1426 | 1758 | ||
1427 | static void | 1759 | static void |
@@ -1459,7 +1791,9 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
1459 | * This recorded memcg can be obsolete one. So, avoid | 1791 | * This recorded memcg can be obsolete one. So, avoid |
1460 | * calling css_tryget | 1792 | * calling css_tryget |
1461 | */ | 1793 | */ |
1462 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1794 | if (!mem_cgroup_is_root(memcg)) |
1795 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
1796 | mem_cgroup_swap_statistics(memcg, false); | ||
1463 | mem_cgroup_put(memcg); | 1797 | mem_cgroup_put(memcg); |
1464 | } | 1798 | } |
1465 | rcu_read_unlock(); | 1799 | rcu_read_unlock(); |
@@ -1484,9 +1818,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1484 | return; | 1818 | return; |
1485 | if (!mem) | 1819 | if (!mem) |
1486 | return; | 1820 | return; |
1487 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1821 | if (!mem_cgroup_is_root(mem)) { |
1488 | if (do_swap_account) | 1822 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
1489 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1823 | if (do_swap_account) |
1824 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1825 | } | ||
1490 | css_put(&mem->css); | 1826 | css_put(&mem->css); |
1491 | } | 1827 | } |
1492 | 1828 | ||
@@ -1538,9 +1874,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1538 | break; | 1874 | break; |
1539 | } | 1875 | } |
1540 | 1876 | ||
1541 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1877 | if (!mem_cgroup_is_root(mem)) { |
1542 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | 1878 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
1543 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1879 | if (do_swap_account && |
1880 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1881 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1882 | } | ||
1883 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
1884 | mem_cgroup_swap_statistics(mem, true); | ||
1544 | mem_cgroup_charge_statistics(mem, pc, false); | 1885 | mem_cgroup_charge_statistics(mem, pc, false); |
1545 | 1886 | ||
1546 | ClearPageCgroupUsed(pc); | 1887 | ClearPageCgroupUsed(pc); |
@@ -1554,6 +1895,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1554 | mz = page_cgroup_zoneinfo(pc); | 1895 | mz = page_cgroup_zoneinfo(pc); |
1555 | unlock_page_cgroup(pc); | 1896 | unlock_page_cgroup(pc); |
1556 | 1897 | ||
1898 | if (mem_cgroup_soft_limit_check(mem)) | ||
1899 | mem_cgroup_update_tree(mem, page); | ||
1557 | /* at swapout, this memcg will be accessed to record to swap */ | 1900 | /* at swapout, this memcg will be accessed to record to swap */ |
1558 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 1901 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1559 | css_put(&mem->css); | 1902 | css_put(&mem->css); |
@@ -1629,7 +1972,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
1629 | * We uncharge this because swap is freed. | 1972 | * We uncharge this because swap is freed. |
1630 | * This memcg can be obsolete one. We avoid calling css_tryget | 1973 | * This memcg can be obsolete one. We avoid calling css_tryget |
1631 | */ | 1974 | */ |
1632 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1975 | if (!mem_cgroup_is_root(memcg)) |
1976 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | ||
1977 | mem_cgroup_swap_statistics(memcg, false); | ||
1633 | mem_cgroup_put(memcg); | 1978 | mem_cgroup_put(memcg); |
1634 | } | 1979 | } |
1635 | rcu_read_unlock(); | 1980 | rcu_read_unlock(); |
@@ -1658,7 +2003,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
1658 | unlock_page_cgroup(pc); | 2003 | unlock_page_cgroup(pc); |
1659 | 2004 | ||
1660 | if (mem) { | 2005 | if (mem) { |
1661 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | 2006 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, |
2007 | page); | ||
1662 | css_put(&mem->css); | 2008 | css_put(&mem->css); |
1663 | } | 2009 | } |
1664 | *ptr = mem; | 2010 | *ptr = mem; |
@@ -1798,8 +2144,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
1798 | if (!ret) | 2144 | if (!ret) |
1799 | break; | 2145 | break; |
1800 | 2146 | ||
1801 | progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, | 2147 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, |
1802 | false, true); | 2148 | GFP_KERNEL, |
2149 | MEM_CGROUP_RECLAIM_SHRINK); | ||
1803 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2150 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
1804 | /* Usage is reduced ? */ | 2151 | /* Usage is reduced ? */ |
1805 | if (curusage >= oldusage) | 2152 | if (curusage >= oldusage) |
@@ -1851,7 +2198,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
1851 | if (!ret) | 2198 | if (!ret) |
1852 | break; | 2199 | break; |
1853 | 2200 | ||
1854 | mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); | 2201 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
2202 | MEM_CGROUP_RECLAIM_NOSWAP | | ||
2203 | MEM_CGROUP_RECLAIM_SHRINK); | ||
1855 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 2204 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
1856 | /* Usage is reduced ? */ | 2205 | /* Usage is reduced ? */ |
1857 | if (curusage >= oldusage) | 2206 | if (curusage >= oldusage) |
@@ -1862,6 +2211,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
1862 | return ret; | 2211 | return ret; |
1863 | } | 2212 | } |
1864 | 2213 | ||
2214 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
2215 | gfp_t gfp_mask, int nid, | ||
2216 | int zid) | ||
2217 | { | ||
2218 | unsigned long nr_reclaimed = 0; | ||
2219 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
2220 | unsigned long reclaimed; | ||
2221 | int loop = 0; | ||
2222 | struct mem_cgroup_tree_per_zone *mctz; | ||
2223 | unsigned long long excess; | ||
2224 | |||
2225 | if (order > 0) | ||
2226 | return 0; | ||
2227 | |||
2228 | mctz = soft_limit_tree_node_zone(nid, zid); | ||
2229 | /* | ||
2230 | * This loop can run for a while, especially if mem_cgroups continuously | ||
2231 | * keep exceeding their soft limit and putting the system under | ||
2232 | * pressure | ||
2233 | */ | ||
2234 | do { | ||
2235 | if (next_mz) | ||
2236 | mz = next_mz; | ||
2237 | else | ||
2238 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
2239 | if (!mz) | ||
2240 | break; | ||
2241 | |||
2242 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, | ||
2243 | gfp_mask, | ||
2244 | MEM_CGROUP_RECLAIM_SOFT); | ||
2245 | nr_reclaimed += reclaimed; | ||
2246 | spin_lock(&mctz->lock); | ||
2247 | |||
2248 | /* | ||
2249 | * If we failed to reclaim anything from this memory cgroup | ||
2250 | * it is time to move on to the next cgroup | ||
2251 | */ | ||
2252 | next_mz = NULL; | ||
2253 | if (!reclaimed) { | ||
2254 | do { | ||
2255 | /* | ||
2256 | * Loop until we find yet another one. | ||
2257 | * | ||
2258 | * By the time we get the soft_limit lock | ||
2259 | * again, someone might have added the | ||
2260 | * group back on the RB tree. Iterate to | ||
2261 | * make sure we get a different mem. | ||
2262 | * mem_cgroup_largest_soft_limit_node returns | ||
2263 | * NULL if no other cgroup is present on | ||
2264 | * the tree | ||
2265 | */ | ||
2266 | next_mz = | ||
2267 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
2268 | if (next_mz == mz) { | ||
2269 | css_put(&next_mz->mem->css); | ||
2270 | next_mz = NULL; | ||
2271 | } else /* next_mz == NULL or other memcg */ | ||
2272 | break; | ||
2273 | } while (1); | ||
2274 | } | ||
2275 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | ||
2276 | excess = res_counter_soft_limit_excess(&mz->mem->res); | ||
2277 | /* | ||
2278 | * One school of thought says that we should not add | ||
2279 | * back the node to the tree if reclaim returns 0. | ||
2280 | * But our reclaim could return 0 simply because, due | ||
2281 | * to priority, we are exposing a smaller subset of | ||
2282 | * memory to reclaim from. Consider this as a longer | ||
2283 | * term TODO. | ||
2284 | */ | ||
2285 | /* If excess == 0, no tree ops */ | ||
2286 | __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); | ||
2287 | spin_unlock(&mctz->lock); | ||
2288 | css_put(&mz->mem->css); | ||
2289 | loop++; | ||
2290 | /* | ||
2291 | * Could not reclaim anything and there are no more | ||
2292 | * mem cgroups to try or we seem to be looping without | ||
2293 | * reclaiming anything. | ||
2294 | */ | ||
2295 | if (!nr_reclaimed && | ||
2296 | (next_mz == NULL || | ||
2297 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
2298 | break; | ||
2299 | } while (!nr_reclaimed); | ||
2300 | if (next_mz) | ||
2301 | css_put(&next_mz->mem->css); | ||
2302 | return nr_reclaimed; | ||
2303 | } | ||
2304 | |||
1865 | /* | 2305 | /* |
1866 | * This routine traverse page_cgroup in given list and drop them all. | 2306 | * This routine traverse page_cgroup in given list and drop them all. |
1867 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 2307 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
@@ -2046,20 +2486,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
2046 | return retval; | 2486 | return retval; |
2047 | } | 2487 | } |
2048 | 2488 | ||
2489 | struct mem_cgroup_idx_data { | ||
2490 | s64 val; | ||
2491 | enum mem_cgroup_stat_index idx; | ||
2492 | }; | ||
2493 | |||
2494 | static int | ||
2495 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | ||
2496 | { | ||
2497 | struct mem_cgroup_idx_data *d = data; | ||
2498 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | ||
2499 | return 0; | ||
2500 | } | ||
2501 | |||
2502 | static void | ||
2503 | mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | ||
2504 | enum mem_cgroup_stat_index idx, s64 *val) | ||
2505 | { | ||
2506 | struct mem_cgroup_idx_data d; | ||
2507 | d.idx = idx; | ||
2508 | d.val = 0; | ||
2509 | mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); | ||
2510 | *val = d.val; | ||
2511 | } | ||
2512 | |||
2049 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2513 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
2050 | { | 2514 | { |
2051 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2515 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2052 | u64 val = 0; | 2516 | u64 idx_val, val; |
2053 | int type, name; | 2517 | int type, name; |
2054 | 2518 | ||
2055 | type = MEMFILE_TYPE(cft->private); | 2519 | type = MEMFILE_TYPE(cft->private); |
2056 | name = MEMFILE_ATTR(cft->private); | 2520 | name = MEMFILE_ATTR(cft->private); |
2057 | switch (type) { | 2521 | switch (type) { |
2058 | case _MEM: | 2522 | case _MEM: |
2059 | val = res_counter_read_u64(&mem->res, name); | 2523 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
2524 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2525 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2526 | val = idx_val; | ||
2527 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2528 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2529 | val += idx_val; | ||
2530 | val <<= PAGE_SHIFT; | ||
2531 | } else | ||
2532 | val = res_counter_read_u64(&mem->res, name); | ||
2060 | break; | 2533 | break; |
2061 | case _MEMSWAP: | 2534 | case _MEMSWAP: |
2062 | val = res_counter_read_u64(&mem->memsw, name); | 2535 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
2536 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2537 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2538 | val = idx_val; | ||
2539 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2540 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2541 | val += idx_val; | ||
2542 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2543 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2544 | val <<= PAGE_SHIFT; | ||
2545 | } else | ||
2546 | val = res_counter_read_u64(&mem->memsw, name); | ||
2063 | break; | 2547 | break; |
2064 | default: | 2548 | default: |
2065 | BUG(); | 2549 | BUG(); |
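For the root cgroup the usage files are now synthesized from the hierarchical stat counters (in pages) instead of the res_counter, since root no longer charges the res_counter in the charge/uncharge paths above. A worked example of the memory.usage_in_bytes conversion done here, with made-up numbers (illustration only):

	#include <stdio.h>

	#define PAGE_SHIFT 12				/* assume 4 KiB pages */

	int main(void)
	{
		long long cache = 300, rss = 200;	/* hierarchical page counts */
		long long usage = (cache + rss) << PAGE_SHIFT;

		printf("memory.usage_in_bytes = %lld\n", usage);	/* 2048000 */
		return 0;
	}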
@@ -2083,6 +2567,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
2083 | name = MEMFILE_ATTR(cft->private); | 2567 | name = MEMFILE_ATTR(cft->private); |
2084 | switch (name) { | 2568 | switch (name) { |
2085 | case RES_LIMIT: | 2569 | case RES_LIMIT: |
2570 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | ||
2571 | ret = -EINVAL; | ||
2572 | break; | ||
2573 | } | ||
2086 | /* This function does all necessary parse...reuse it */ | 2574 | /* This function does all necessary parse...reuse it */ |
2087 | ret = res_counter_memparse_write_strategy(buffer, &val); | 2575 | ret = res_counter_memparse_write_strategy(buffer, &val); |
2088 | if (ret) | 2576 | if (ret) |
@@ -2092,6 +2580,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
2092 | else | 2580 | else |
2093 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 2581 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
2094 | break; | 2582 | break; |
2583 | case RES_SOFT_LIMIT: | ||
2584 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
2585 | if (ret) | ||
2586 | break; | ||
2587 | /* | ||
2588 | * For memsw, soft limits are hard to implement in terms | ||
2589 | * of semantics; for now, we support soft limits only for | ||
2590 | * memory control without swap | ||
2591 | */ | ||
2592 | if (type == _MEM) | ||
2593 | ret = res_counter_set_soft_limit(&memcg->res, val); | ||
2594 | else | ||
2595 | ret = -EINVAL; | ||
2596 | break; | ||
2095 | default: | 2597 | default: |
2096 | ret = -EINVAL; /* should be BUG() ? */ | 2598 | ret = -EINVAL; /* should be BUG() ? */ |
2097 | break; | 2599 | break; |
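The RES_SOFT_LIMIT case above backs the new soft_limit_in_bytes file registered later in mem_cgroup_files[]; it reuses the hard limit's parser and is refused for the memsw counter. A hypothetical userspace snippet setting it; the mount point and group name are assumptions, not part of this patch:

	#include <stdio.h>

	int main(void)
	{
		/* Path assumes the memory controller is mounted at /cgroup/memory. */
		const char *knob = "/cgroup/memory/mygroup/memory.soft_limit_in_bytes";
		FILE *f = fopen(knob, "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fprintf(f, "%llu\n", 256ULL << 20);	/* keep this group near 256 MiB under pressure */
		fclose(f);
		return 0;
	}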
@@ -2149,6 +2651,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2149 | res_counter_reset_failcnt(&mem->memsw); | 2651 | res_counter_reset_failcnt(&mem->memsw); |
2150 | break; | 2652 | break; |
2151 | } | 2653 | } |
2654 | |||
2152 | return 0; | 2655 | return 0; |
2153 | } | 2656 | } |
2154 | 2657 | ||
@@ -2160,6 +2663,7 @@ enum { | |||
2160 | MCS_MAPPED_FILE, | 2663 | MCS_MAPPED_FILE, |
2161 | MCS_PGPGIN, | 2664 | MCS_PGPGIN, |
2162 | MCS_PGPGOUT, | 2665 | MCS_PGPGOUT, |
2666 | MCS_SWAP, | ||
2163 | MCS_INACTIVE_ANON, | 2667 | MCS_INACTIVE_ANON, |
2164 | MCS_ACTIVE_ANON, | 2668 | MCS_ACTIVE_ANON, |
2165 | MCS_INACTIVE_FILE, | 2669 | MCS_INACTIVE_FILE, |
@@ -2181,6 +2685,7 @@ struct { | |||
2181 | {"mapped_file", "total_mapped_file"}, | 2685 | {"mapped_file", "total_mapped_file"}, |
2182 | {"pgpgin", "total_pgpgin"}, | 2686 | {"pgpgin", "total_pgpgin"}, |
2183 | {"pgpgout", "total_pgpgout"}, | 2687 | {"pgpgout", "total_pgpgout"}, |
2688 | {"swap", "total_swap"}, | ||
2184 | {"inactive_anon", "total_inactive_anon"}, | 2689 | {"inactive_anon", "total_inactive_anon"}, |
2185 | {"active_anon", "total_active_anon"}, | 2690 | {"active_anon", "total_active_anon"}, |
2186 | {"inactive_file", "total_inactive_file"}, | 2691 | {"inactive_file", "total_inactive_file"}, |
@@ -2205,6 +2710,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2205 | s->stat[MCS_PGPGIN] += val; | 2710 | s->stat[MCS_PGPGIN] += val; |
2206 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 2711 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
2207 | s->stat[MCS_PGPGOUT] += val; | 2712 | s->stat[MCS_PGPGOUT] += val; |
2713 | if (do_swap_account) { | ||
2714 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | ||
2715 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | ||
2716 | } | ||
2208 | 2717 | ||
2209 | /* per zone stat */ | 2718 | /* per zone stat */ |
2210 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); | 2719 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); |
@@ -2236,8 +2745,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
2236 | memset(&mystat, 0, sizeof(mystat)); | 2745 | memset(&mystat, 0, sizeof(mystat)); |
2237 | mem_cgroup_get_local_stat(mem_cont, &mystat); | 2746 | mem_cgroup_get_local_stat(mem_cont, &mystat); |
2238 | 2747 | ||
2239 | for (i = 0; i < NR_MCS_STAT; i++) | 2748 | for (i = 0; i < NR_MCS_STAT; i++) { |
2749 | if (i == MCS_SWAP && !do_swap_account) | ||
2750 | continue; | ||
2240 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); | 2751 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); |
2752 | } | ||
2241 | 2753 | ||
2242 | /* Hierarchical information */ | 2754 | /* Hierarchical information */ |
2243 | { | 2755 | { |
@@ -2250,9 +2762,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
2250 | 2762 | ||
2251 | memset(&mystat, 0, sizeof(mystat)); | 2763 | memset(&mystat, 0, sizeof(mystat)); |
2252 | mem_cgroup_get_total_stat(mem_cont, &mystat); | 2764 | mem_cgroup_get_total_stat(mem_cont, &mystat); |
2253 | for (i = 0; i < NR_MCS_STAT; i++) | 2765 | for (i = 0; i < NR_MCS_STAT; i++) { |
2766 | if (i == MCS_SWAP && !do_swap_account) | ||
2767 | continue; | ||
2254 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); | 2768 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); |
2255 | 2769 | } | |
2256 | 2770 | ||
2257 | #ifdef CONFIG_DEBUG_VM | 2771 | #ifdef CONFIG_DEBUG_VM |
2258 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); | 2772 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); |
@@ -2345,6 +2859,12 @@ static struct cftype mem_cgroup_files[] = { | |||
2345 | .read_u64 = mem_cgroup_read, | 2859 | .read_u64 = mem_cgroup_read, |
2346 | }, | 2860 | }, |
2347 | { | 2861 | { |
2862 | .name = "soft_limit_in_bytes", | ||
2863 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | ||
2864 | .write_string = mem_cgroup_write, | ||
2865 | .read_u64 = mem_cgroup_read, | ||
2866 | }, | ||
2867 | { | ||
2348 | .name = "failcnt", | 2868 | .name = "failcnt", |
2349 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 2869 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
2350 | .trigger = mem_cgroup_reset, | 2870 | .trigger = mem_cgroup_reset, |
@@ -2438,6 +2958,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
2438 | mz = &pn->zoneinfo[zone]; | 2958 | mz = &pn->zoneinfo[zone]; |
2439 | for_each_lru(l) | 2959 | for_each_lru(l) |
2440 | INIT_LIST_HEAD(&mz->lists[l]); | 2960 | INIT_LIST_HEAD(&mz->lists[l]); |
2961 | mz->usage_in_excess = 0; | ||
2962 | mz->on_tree = false; | ||
2963 | mz->mem = mem; | ||
2441 | } | 2964 | } |
2442 | return 0; | 2965 | return 0; |
2443 | } | 2966 | } |
@@ -2483,6 +3006,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
2483 | { | 3006 | { |
2484 | int node; | 3007 | int node; |
2485 | 3008 | ||
3009 | mem_cgroup_remove_from_trees(mem); | ||
2486 | free_css_id(&mem_cgroup_subsys, &mem->css); | 3010 | free_css_id(&mem_cgroup_subsys, &mem->css); |
2487 | 3011 | ||
2488 | for_each_node_state(node, N_POSSIBLE) | 3012 | for_each_node_state(node, N_POSSIBLE) |
@@ -2531,6 +3055,31 @@ static void __init enable_swap_cgroup(void) | |||
2531 | } | 3055 | } |
2532 | #endif | 3056 | #endif |
2533 | 3057 | ||
3058 | static int mem_cgroup_soft_limit_tree_init(void) | ||
3059 | { | ||
3060 | struct mem_cgroup_tree_per_node *rtpn; | ||
3061 | struct mem_cgroup_tree_per_zone *rtpz; | ||
3062 | int tmp, node, zone; | ||
3063 | |||
3064 | for_each_node_state(node, N_POSSIBLE) { | ||
3065 | tmp = node; | ||
3066 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
3067 | tmp = -1; | ||
3068 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
3069 | if (!rtpn) | ||
3070 | return 1; | ||
3071 | |||
3072 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
3073 | |||
3074 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
3075 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
3076 | rtpz->rb_root = RB_ROOT; | ||
3077 | spin_lock_init(&rtpz->lock); | ||
3078 | } | ||
3079 | } | ||
3080 | return 0; | ||
3081 | } | ||
3082 | |||
2534 | static struct cgroup_subsys_state * __ref | 3083 | static struct cgroup_subsys_state * __ref |
2535 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 3084 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
2536 | { | 3085 | { |
@@ -2545,10 +3094,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
2545 | for_each_node_state(node, N_POSSIBLE) | 3094 | for_each_node_state(node, N_POSSIBLE) |
2546 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 3095 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
2547 | goto free_out; | 3096 | goto free_out; |
3097 | |||
2548 | /* root ? */ | 3098 | /* root ? */ |
2549 | if (cont->parent == NULL) { | 3099 | if (cont->parent == NULL) { |
2550 | enable_swap_cgroup(); | 3100 | enable_swap_cgroup(); |
2551 | parent = NULL; | 3101 | parent = NULL; |
3102 | root_mem_cgroup = mem; | ||
3103 | if (mem_cgroup_soft_limit_tree_init()) | ||
3104 | goto free_out; | ||
3105 | |||
2552 | } else { | 3106 | } else { |
2553 | parent = mem_cgroup_from_cont(cont->parent); | 3107 | parent = mem_cgroup_from_cont(cont->parent); |
2554 | mem->use_hierarchy = parent->use_hierarchy; | 3108 | mem->use_hierarchy = parent->use_hierarchy; |
@@ -2577,6 +3131,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
2577 | return &mem->css; | 3131 | return &mem->css; |
2578 | free_out: | 3132 | free_out: |
2579 | __mem_cgroup_free(mem); | 3133 | __mem_cgroup_free(mem); |
3134 | root_mem_cgroup = NULL; | ||
2580 | return ERR_PTR(error); | 3135 | return ERR_PTR(error); |
2581 | } | 3136 | } |
2582 | 3137 | ||
@@ -2612,7 +3167,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
2612 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 3167 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
2613 | struct cgroup *cont, | 3168 | struct cgroup *cont, |
2614 | struct cgroup *old_cont, | 3169 | struct cgroup *old_cont, |
2615 | struct task_struct *p) | 3170 | struct task_struct *p, |
3171 | bool threadgroup) | ||
2616 | { | 3172 | { |
2617 | mutex_lock(&memcg_tasklist); | 3173 | mutex_lock(&memcg_tasklist); |
2618 | /* | 3174 | /* |