Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 737 |
1 files changed, 653 insertions, 84 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9b10d8753784..e2b98a6875c0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@ | |||
29 | #include <linux/rcupdate.h> | 29 | #include <linux/rcupdate.h> |
30 | #include <linux/limits.h> | 30 | #include <linux/limits.h> |
31 | #include <linux/mutex.h> | 31 | #include <linux/mutex.h> |
32 | #include <linux/rbtree.h> | ||
32 | #include <linux/slab.h> | 33 | #include <linux/slab.h> |
33 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
34 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
@@ -43,6 +44,7 @@ | |||
43 | 44 | ||
44 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 45 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
45 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 46 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
47 | struct mem_cgroup *root_mem_cgroup __read_mostly; | ||
46 | 48 | ||
47 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 49 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
48 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 50 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
@@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
53 | #endif | 55 | #endif |
54 | 56 | ||
55 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ | 57 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ |
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | ||
56 | 59 | ||
57 | /* | 60 | /* |
58 | * Statistics for memory cgroup. | 61 | * Statistics for memory cgroup. |
@@ -66,6 +69,8 @@ enum mem_cgroup_stat_index { | |||
66 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ |
67 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
68 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | ||
69 | 74 | ||
70 | MEM_CGROUP_STAT_NSTATS, | 75 | MEM_CGROUP_STAT_NSTATS, |
71 | }; | 76 | }; |
@@ -78,6 +83,20 @@ struct mem_cgroup_stat { | |||
78 | struct mem_cgroup_stat_cpu cpustat[0]; | 83 | struct mem_cgroup_stat_cpu cpustat[0]; |
79 | }; | 84 | }; |
80 | 85 | ||
86 | static inline void | ||
87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
88 | enum mem_cgroup_stat_index idx) | ||
89 | { | ||
90 | stat->count[idx] = 0; | ||
91 | } | ||
92 | |||
93 | static inline s64 | ||
94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
95 | enum mem_cgroup_stat_index idx) | ||
96 | { | ||
97 | return stat->count[idx]; | ||
98 | } | ||
99 | |||
81 | /* | 100 | /* |
82 | * For accounting under irq disable, no need for increment preempt count. | 101 | * For accounting under irq disable, no need for increment preempt count. |
83 | */ | 102 | */ |
@@ -117,6 +136,12 @@ struct mem_cgroup_per_zone { | |||
117 | unsigned long count[NR_LRU_LISTS]; | 136 | unsigned long count[NR_LRU_LISTS]; |
118 | 137 | ||
119 | struct zone_reclaim_stat reclaim_stat; | 138 | struct zone_reclaim_stat reclaim_stat; |
139 | struct rb_node tree_node; /* RB tree node */ | ||
140 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
141 | /* the soft limit is exceeded*/ | ||
142 | bool on_tree; | ||
143 | struct mem_cgroup *mem; /* Back pointer, we cannot */ | ||
144 | /* use container_of */ | ||
120 | }; | 145 | }; |
121 | /* Macro for accessing counter */ | 146 | /* Macro for accessing counter */ |
122 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 147 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
@@ -130,6 +155,26 @@ struct mem_cgroup_lru_info { | |||
130 | }; | 155 | }; |
131 | 156 | ||
132 | /* | 157 | /* |
158 | * Cgroups above their limits are maintained in an RB-Tree, independent of | ||
159 | * their hierarchy representation | ||
160 | */ | ||
161 | |||
162 | struct mem_cgroup_tree_per_zone { | ||
163 | struct rb_root rb_root; | ||
164 | spinlock_t lock; | ||
165 | }; | ||
166 | |||
167 | struct mem_cgroup_tree_per_node { | ||
168 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
169 | }; | ||
170 | |||
171 | struct mem_cgroup_tree { | ||
172 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
173 | }; | ||
174 | |||
175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
176 | |||
177 | /* | ||
133 | * The memory controller data structure. The memory controller controls both | 178 | * The memory controller data structure. The memory controller controls both |
134 | * page cache and RSS per cgroup. We would eventually like to provide | 179 | * page cache and RSS per cgroup. We would eventually like to provide |
135 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 180 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
@@ -186,6 +231,13 @@ struct mem_cgroup { | |||
186 | struct mem_cgroup_stat stat; | 231 | struct mem_cgroup_stat stat; |
187 | }; | 232 | }; |
188 | 233 | ||
234 | /* | ||
235 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | ||
236 | * limit reclaim to prevent infinite loops, if they ever occur. | ||
237 | */ | ||
238 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) | ||
239 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) | ||
240 | |||
189 | enum charge_type { | 241 | enum charge_type { |
190 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 242 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
191 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 243 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
@@ -200,13 +252,8 @@ enum charge_type { | |||
200 | #define PCGF_CACHE (1UL << PCG_CACHE) | 252 | #define PCGF_CACHE (1UL << PCG_CACHE) |
201 | #define PCGF_USED (1UL << PCG_USED) | 253 | #define PCGF_USED (1UL << PCG_USED) |
202 | #define PCGF_LOCK (1UL << PCG_LOCK) | 254 | #define PCGF_LOCK (1UL << PCG_LOCK) |
203 | static const unsigned long | 255 | /* Not used, but added here for completeness */ |
204 | pcg_default_flags[NR_CHARGE_TYPE] = { | 256 | #define PCGF_ACCT (1UL << PCG_ACCT) |
205 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ | ||
206 | PCGF_USED | PCGF_LOCK, /* Anon */ | ||
207 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | ||
208 | 0, /* FORCE */ | ||
209 | }; | ||
210 | 257 | ||
211 | /* for encoding cft->private value on file */ | 258 | /* for encoding cft->private value on file */ |
212 | #define _MEM (0) | 259 | #define _MEM (0) |
@@ -215,15 +262,241 @@ pcg_default_flags[NR_CHARGE_TYPE] = { | |||
215 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 262 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
216 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 263 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
217 | 264 | ||
265 | /* | ||
266 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | ||
267 | */ | ||
268 | #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 | ||
269 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) | ||
270 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | ||
271 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | ||
272 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 | ||
273 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) | ||
274 | |||
218 | static void mem_cgroup_get(struct mem_cgroup *mem); | 275 | static void mem_cgroup_get(struct mem_cgroup *mem); |
219 | static void mem_cgroup_put(struct mem_cgroup *mem); | 276 | static void mem_cgroup_put(struct mem_cgroup *mem); |
220 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
221 | 278 | ||
279 | static struct mem_cgroup_per_zone * | ||
280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
281 | { | ||
282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
283 | } | ||
284 | |||
285 | static struct mem_cgroup_per_zone * | ||
286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
287 | { | ||
288 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
289 | int nid = page_cgroup_nid(pc); | ||
290 | int zid = page_cgroup_zid(pc); | ||
291 | |||
292 | if (!mem) | ||
293 | return NULL; | ||
294 | |||
295 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
296 | } | ||
297 | |||
298 | static struct mem_cgroup_tree_per_zone * | ||
299 | soft_limit_tree_node_zone(int nid, int zid) | ||
300 | { | ||
301 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
302 | } | ||
303 | |||
304 | static struct mem_cgroup_tree_per_zone * | ||
305 | soft_limit_tree_from_page(struct page *page) | ||
306 | { | ||
307 | int nid = page_to_nid(page); | ||
308 | int zid = page_zonenum(page); | ||
309 | |||
310 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
311 | } | ||
312 | |||
313 | static void | ||
314 | __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | ||
315 | struct mem_cgroup_per_zone *mz, | ||
316 | struct mem_cgroup_tree_per_zone *mctz) | ||
317 | { | ||
318 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
319 | struct rb_node *parent = NULL; | ||
320 | struct mem_cgroup_per_zone *mz_node; | ||
321 | |||
322 | if (mz->on_tree) | ||
323 | return; | ||
324 | |||
325 | mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res); | ||
326 | while (*p) { | ||
327 | parent = *p; | ||
328 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
329 | tree_node); | ||
330 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
331 | p = &(*p)->rb_left; | ||
332 | /* | ||
333 | * We can't avoid mem cgroups that are over their soft | ||
334 | * limit by the same amount | ||
335 | */ | ||
336 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
337 | p = &(*p)->rb_right; | ||
338 | } | ||
339 | rb_link_node(&mz->tree_node, parent, p); | ||
340 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
341 | mz->on_tree = true; | ||
342 | } | ||
343 | |||
344 | static void | ||
345 | __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | ||
346 | struct mem_cgroup_per_zone *mz, | ||
347 | struct mem_cgroup_tree_per_zone *mctz) | ||
348 | { | ||
349 | if (!mz->on_tree) | ||
350 | return; | ||
351 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
352 | mz->on_tree = false; | ||
353 | } | ||
354 | |||
355 | static void | ||
356 | mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | ||
357 | struct mem_cgroup_per_zone *mz, | ||
358 | struct mem_cgroup_tree_per_zone *mctz) | ||
359 | { | ||
360 | spin_lock(&mctz->lock); | ||
361 | __mem_cgroup_insert_exceeded(mem, mz, mctz); | ||
362 | spin_unlock(&mctz->lock); | ||
363 | } | ||
364 | |||
365 | static void | ||
366 | mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | ||
367 | struct mem_cgroup_per_zone *mz, | ||
368 | struct mem_cgroup_tree_per_zone *mctz) | ||
369 | { | ||
370 | spin_lock(&mctz->lock); | ||
371 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
372 | spin_unlock(&mctz->lock); | ||
373 | } | ||
374 | |||
375 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
376 | { | ||
377 | bool ret = false; | ||
378 | int cpu; | ||
379 | s64 val; | ||
380 | struct mem_cgroup_stat_cpu *cpustat; | ||
381 | |||
382 | cpu = get_cpu(); | ||
383 | cpustat = &mem->stat.cpustat[cpu]; | ||
384 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
385 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
386 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
387 | ret = true; | ||
388 | } | ||
389 | put_cpu(); | ||
390 | return ret; | ||
391 | } | ||
392 | |||
393 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | ||
394 | { | ||
395 | unsigned long long prev_usage_in_excess, new_usage_in_excess; | ||
396 | bool updated_tree = false; | ||
397 | struct mem_cgroup_per_zone *mz; | ||
398 | struct mem_cgroup_tree_per_zone *mctz; | ||
399 | |||
400 | mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page)); | ||
401 | mctz = soft_limit_tree_from_page(page); | ||
402 | |||
403 | /* | ||
404 | * We do updates in lazy mode; memcgs are removed | ||
405 | * lazily from the per-zone, per-node rb tree | ||
406 | */ | ||
407 | prev_usage_in_excess = mz->usage_in_excess; | ||
408 | |||
409 | new_usage_in_excess = res_counter_soft_limit_excess(&mem->res); | ||
410 | if (prev_usage_in_excess) { | ||
411 | mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
412 | updated_tree = true; | ||
413 | } | ||
414 | if (!new_usage_in_excess) | ||
415 | goto done; | ||
416 | mem_cgroup_insert_exceeded(mem, mz, mctz); | ||
417 | |||
418 | done: | ||
419 | if (updated_tree) { | ||
420 | spin_lock(&mctz->lock); | ||
421 | mz->usage_in_excess = new_usage_in_excess; | ||
422 | spin_unlock(&mctz->lock); | ||
423 | } | ||
424 | } | ||
425 | |||
426 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | ||
427 | { | ||
428 | int node, zone; | ||
429 | struct mem_cgroup_per_zone *mz; | ||
430 | struct mem_cgroup_tree_per_zone *mctz; | ||
431 | |||
432 | for_each_node_state(node, N_POSSIBLE) { | ||
433 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
434 | mz = mem_cgroup_zoneinfo(mem, node, zone); | ||
435 | mctz = soft_limit_tree_node_zone(node, zone); | ||
436 | mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
437 | } | ||
438 | } | ||
439 | } | ||
440 | |||
441 | static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) | ||
442 | { | ||
443 | return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; | ||
444 | } | ||
445 | |||
446 | static struct mem_cgroup_per_zone * | ||
447 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
448 | { | ||
449 | struct rb_node *rightmost = NULL; | ||
450 | struct mem_cgroup_per_zone *mz = NULL; | ||
451 | |||
452 | retry: | ||
453 | rightmost = rb_last(&mctz->rb_root); | ||
454 | if (!rightmost) | ||
455 | goto done; /* Nothing to reclaim from */ | ||
456 | |||
457 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
458 | /* | ||
459 | * Remove the node now but someone else can add it back, | ||
460 | * we will add it back at the end of reclaim to its | ||
461 | * position in the tree. | ||
462 | */ | ||
463 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | ||
464 | if (!res_counter_soft_limit_excess(&mz->mem->res) || | ||
465 | !css_tryget(&mz->mem->css)) | ||
466 | goto retry; | ||
467 | done: | ||
468 | return mz; | ||
469 | } | ||
470 | |||
471 | static struct mem_cgroup_per_zone * | ||
472 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
473 | { | ||
474 | struct mem_cgroup_per_zone *mz; | ||
475 | |||
476 | spin_lock(&mctz->lock); | ||
477 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
478 | spin_unlock(&mctz->lock); | ||
479 | return mz; | ||
480 | } | ||
481 | |||
482 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | ||
483 | bool charge) | ||
484 | { | ||
485 | int val = (charge) ? 1 : -1; | ||
486 | struct mem_cgroup_stat *stat = &mem->stat; | ||
487 | struct mem_cgroup_stat_cpu *cpustat; | ||
488 | int cpu = get_cpu(); | ||
489 | |||
490 | cpustat = &stat->cpustat[cpu]; | ||
491 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
492 | put_cpu(); | ||
493 | } | ||
494 | |||
222 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 495 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
223 | struct page_cgroup *pc, | 496 | struct page_cgroup *pc, |
224 | bool charge) | 497 | bool charge) |
225 | { | 498 | { |
226 | int val = (charge)? 1 : -1; | 499 | int val = (charge) ? 1 : -1; |
227 | struct mem_cgroup_stat *stat = &mem->stat; | 500 | struct mem_cgroup_stat *stat = &mem->stat; |
228 | struct mem_cgroup_stat_cpu *cpustat; | 501 | struct mem_cgroup_stat_cpu *cpustat; |
229 | int cpu = get_cpu(); | 502 | int cpu = get_cpu(); |
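For a rough sense of the quantities the helpers above trade in (the figures are illustrative, not taken from the patch): with 4KiB pages, a memcg using 120MiB against a 100MiB soft limit has

	res_counter_soft_limit_excess(&mem->res) == 20MiB == 20971520 bytes   (stored as usage_in_excess, the RB-tree key)
	mem_cgroup_get_excess(mem)               == 20971520 >> PAGE_SHIFT == 5120 pages

The byte value orders the group in the per-zone tree; the page value is what the reclaim loop later compares against.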
@@ -240,28 +513,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
240 | else | 513 | else |
241 | __mem_cgroup_stat_add_safe(cpustat, | 514 | __mem_cgroup_stat_add_safe(cpustat, |
242 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 515 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
516 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | ||
243 | put_cpu(); | 517 | put_cpu(); |
244 | } | 518 | } |
245 | 519 | ||
246 | static struct mem_cgroup_per_zone * | ||
247 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
248 | { | ||
249 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
250 | } | ||
251 | |||
252 | static struct mem_cgroup_per_zone * | ||
253 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
254 | { | ||
255 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
256 | int nid = page_cgroup_nid(pc); | ||
257 | int zid = page_cgroup_zid(pc); | ||
258 | |||
259 | if (!mem) | ||
260 | return NULL; | ||
261 | |||
262 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
263 | } | ||
264 | |||
265 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 520 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
266 | enum lru_list idx) | 521 | enum lru_list idx) |
267 | { | 522 | { |
@@ -354,6 +609,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | |||
354 | return ret; | 609 | return ret; |
355 | } | 610 | } |
356 | 611 | ||
612 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | ||
613 | { | ||
614 | return (mem == root_mem_cgroup); | ||
615 | } | ||
616 | |||
357 | /* | 617 | /* |
358 | * Following LRU functions are allowed to be used without PCG_LOCK. | 618 | * Following LRU functions are allowed to be used without PCG_LOCK. |
359 | * Operations are called by routine of global LRU independently from memcg. | 619 | * Operations are called by routine of global LRU independently from memcg. |
@@ -371,22 +631,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | |||
371 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | 631 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) |
372 | { | 632 | { |
373 | struct page_cgroup *pc; | 633 | struct page_cgroup *pc; |
374 | struct mem_cgroup *mem; | ||
375 | struct mem_cgroup_per_zone *mz; | 634 | struct mem_cgroup_per_zone *mz; |
376 | 635 | ||
377 | if (mem_cgroup_disabled()) | 636 | if (mem_cgroup_disabled()) |
378 | return; | 637 | return; |
379 | pc = lookup_page_cgroup(page); | 638 | pc = lookup_page_cgroup(page); |
380 | /* can happen while we handle swapcache. */ | 639 | /* can happen while we handle swapcache. */ |
381 | if (list_empty(&pc->lru) || !pc->mem_cgroup) | 640 | if (!TestClearPageCgroupAcctLRU(pc)) |
382 | return; | 641 | return; |
642 | VM_BUG_ON(!pc->mem_cgroup); | ||
383 | /* | 643 | /* |
384 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | 644 | * We don't check PCG_USED bit. It's cleared when the "page" is finally |
385 | * removed from global LRU. | 645 | * removed from global LRU. |
386 | */ | 646 | */ |
387 | mz = page_cgroup_zoneinfo(pc); | 647 | mz = page_cgroup_zoneinfo(pc); |
388 | mem = pc->mem_cgroup; | ||
389 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 648 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
649 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
650 | return; | ||
651 | VM_BUG_ON(list_empty(&pc->lru)); | ||
390 | list_del_init(&pc->lru); | 652 | list_del_init(&pc->lru); |
391 | return; | 653 | return; |
392 | } | 654 | } |
@@ -410,8 +672,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | |||
410 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 672 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
411 | */ | 673 | */ |
412 | smp_rmb(); | 674 | smp_rmb(); |
413 | /* unused page is not rotated. */ | 675 | /* unused or root page is not rotated. */ |
414 | if (!PageCgroupUsed(pc)) | 676 | if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) |
415 | return; | 677 | return; |
416 | mz = page_cgroup_zoneinfo(pc); | 678 | mz = page_cgroup_zoneinfo(pc); |
417 | list_move(&pc->lru, &mz->lists[lru]); | 679 | list_move(&pc->lru, &mz->lists[lru]); |
@@ -425,6 +687,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
425 | if (mem_cgroup_disabled()) | 687 | if (mem_cgroup_disabled()) |
426 | return; | 688 | return; |
427 | pc = lookup_page_cgroup(page); | 689 | pc = lookup_page_cgroup(page); |
690 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | ||
428 | /* | 691 | /* |
429 | * Used bit is set without atomic ops but after smp_wmb(). | 692 | * Used bit is set without atomic ops but after smp_wmb(). |
430 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 693 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
@@ -435,6 +698,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
435 | 698 | ||
436 | mz = page_cgroup_zoneinfo(pc); | 699 | mz = page_cgroup_zoneinfo(pc); |
437 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 700 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
701 | SetPageCgroupAcctLRU(pc); | ||
702 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
703 | return; | ||
438 | list_add(&pc->lru, &mz->lists[lru]); | 704 | list_add(&pc->lru, &mz->lists[lru]); |
439 | } | 705 | } |
440 | 706 | ||
@@ -469,7 +735,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) | |||
469 | 735 | ||
470 | spin_lock_irqsave(&zone->lru_lock, flags); | 736 | spin_lock_irqsave(&zone->lru_lock, flags); |
471 | /* link when the page is linked to LRU but page_cgroup isn't */ | 737 | /* link when the page is linked to LRU but page_cgroup isn't */ |
472 | if (PageLRU(page) && list_empty(&pc->lru)) | 738 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) |
473 | mem_cgroup_add_lru_list(page, page_lru(page)); | 739 | mem_cgroup_add_lru_list(page, page_lru(page)); |
474 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 740 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
475 | } | 741 | } |
@@ -855,28 +1121,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
855 | * If shrink==true, for avoiding to free too much, this returns immedieately. | 1121 | * If shrink==true, for avoiding to free too much, this returns immedieately. |
856 | */ | 1122 | */ |
857 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | 1123 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, |
858 | gfp_t gfp_mask, bool noswap, bool shrink) | 1124 | struct zone *zone, |
1125 | gfp_t gfp_mask, | ||
1126 | unsigned long reclaim_options) | ||
859 | { | 1127 | { |
860 | struct mem_cgroup *victim; | 1128 | struct mem_cgroup *victim; |
861 | int ret, total = 0; | 1129 | int ret, total = 0; |
862 | int loop = 0; | 1130 | int loop = 0; |
1131 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | ||
1132 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | ||
1133 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | ||
1134 | unsigned long excess = mem_cgroup_get_excess(root_mem); | ||
863 | 1135 | ||
864 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1136 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
865 | if (root_mem->memsw_is_minimum) | 1137 | if (root_mem->memsw_is_minimum) |
866 | noswap = true; | 1138 | noswap = true; |
867 | 1139 | ||
868 | while (loop < 2) { | 1140 | while (1) { |
869 | victim = mem_cgroup_select_victim(root_mem); | 1141 | victim = mem_cgroup_select_victim(root_mem); |
870 | if (victim == root_mem) | 1142 | if (victim == root_mem) { |
871 | loop++; | 1143 | loop++; |
1144 | if (loop >= 2) { | ||
1145 | /* | ||
1146 | * If we have not been able to reclaim | ||
1147 | * anything, it might because there are | ||
1148 | * no reclaimable pages under this hierarchy | ||
1149 | */ | ||
1150 | if (!check_soft || !total) { | ||
1151 | css_put(&victim->css); | ||
1152 | break; | ||
1153 | } | ||
1154 | /* | ||
1155 | * We want to do more targeted reclaim. | ||
1156 | * excess >> 2 is not too excessive so as to | ||
1157 | * reclaim too much, nor so little that we keep | ||
1158 | * coming back to reclaim from this cgroup | ||
1159 | */ | ||
1160 | if (total >= (excess >> 2) || | ||
1161 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { | ||
1162 | css_put(&victim->css); | ||
1163 | break; | ||
1164 | } | ||
1165 | } | ||
1166 | } | ||
872 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1167 | if (!mem_cgroup_local_usage(&victim->stat)) { |
873 | /* this cgroup's local usage == 0 */ | 1168 | /* this cgroup's local usage == 0 */ |
874 | css_put(&victim->css); | 1169 | css_put(&victim->css); |
875 | continue; | 1170 | continue; |
876 | } | 1171 | } |
877 | /* we use swappiness of local cgroup */ | 1172 | /* we use swappiness of local cgroup */ |
878 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, | 1173 | if (check_soft) |
879 | get_swappiness(victim)); | 1174 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
1175 | noswap, get_swappiness(victim), zone, | ||
1176 | zone->zone_pgdat->node_id); | ||
1177 | else | ||
1178 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | ||
1179 | noswap, get_swappiness(victim)); | ||
880 | css_put(&victim->css); | 1180 | css_put(&victim->css); |
881 | /* | 1181 | /* |
882 | * At shrinking usage, we can't check we should stop here or | 1182 | * At shrinking usage, we can't check we should stop here or |
@@ -886,7 +1186,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
886 | if (shrink) | 1186 | if (shrink) |
887 | return ret; | 1187 | return ret; |
888 | total += ret; | 1188 | total += ret; |
889 | if (mem_cgroup_check_under_limit(root_mem)) | 1189 | if (check_soft) { |
1190 | if (res_counter_check_under_soft_limit(&root_mem->res)) | ||
1191 | return total; | ||
1192 | } else if (mem_cgroup_check_under_limit(root_mem)) | ||
890 | return 1 + total; | 1193 | return 1 + total; |
891 | } | 1194 | } |
892 | return total; | 1195 | return total; |
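Tying the new exit conditions together with an illustrative figure: for soft limit reclaim (MEM_CGROUP_RECLAIM_SOFT) against a group whose excess is 5120 pages, the hierarchy walk keeps picking victims until at least excess >> 2 == 1280 pages have been reclaimed, the group drops back under its soft limit, nothing at all was reclaimed after two visits to the hierarchy root, or the root has been visited more than MEM_CGROUP_MAX_RECLAIM_LOOPS (100) times. Plain hard-limit reclaim keeps the old behaviour of giving up after two visits to the root.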
@@ -965,11 +1268,11 @@ done: | |||
965 | */ | 1268 | */ |
966 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1269 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
967 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1270 | gfp_t gfp_mask, struct mem_cgroup **memcg, |
968 | bool oom) | 1271 | bool oom, struct page *page) |
969 | { | 1272 | { |
970 | struct mem_cgroup *mem, *mem_over_limit; | 1273 | struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit; |
971 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1274 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
972 | struct res_counter *fail_res; | 1275 | struct res_counter *fail_res, *soft_fail_res = NULL; |
973 | 1276 | ||
974 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1277 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { |
975 | /* Don't account this! */ | 1278 | /* Don't account this! */ |
@@ -996,20 +1299,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
996 | VM_BUG_ON(css_is_removed(&mem->css)); | 1299 | VM_BUG_ON(css_is_removed(&mem->css)); |
997 | 1300 | ||
998 | while (1) { | 1301 | while (1) { |
999 | int ret; | 1302 | int ret = 0; |
1000 | bool noswap = false; | 1303 | unsigned long flags = 0; |
1001 | 1304 | ||
1002 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1305 | if (mem_cgroup_is_root(mem)) |
1306 | goto done; | ||
1307 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, | ||
1308 | &soft_fail_res); | ||
1003 | if (likely(!ret)) { | 1309 | if (likely(!ret)) { |
1004 | if (!do_swap_account) | 1310 | if (!do_swap_account) |
1005 | break; | 1311 | break; |
1006 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1312 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, |
1007 | &fail_res); | 1313 | &fail_res, NULL); |
1008 | if (likely(!ret)) | 1314 | if (likely(!ret)) |
1009 | break; | 1315 | break; |
1010 | /* mem+swap counter fails */ | 1316 | /* mem+swap counter fails */ |
1011 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1317 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
1012 | noswap = true; | 1318 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1013 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1319 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1014 | memsw); | 1320 | memsw); |
1015 | } else | 1321 | } else |
@@ -1020,8 +1326,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1020 | if (!(gfp_mask & __GFP_WAIT)) | 1326 | if (!(gfp_mask & __GFP_WAIT)) |
1021 | goto nomem; | 1327 | goto nomem; |
1022 | 1328 | ||
1023 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, | 1329 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, |
1024 | noswap, false); | 1330 | gfp_mask, flags); |
1025 | if (ret) | 1331 | if (ret) |
1026 | continue; | 1332 | continue; |
1027 | 1333 | ||
@@ -1046,13 +1352,24 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1046 | goto nomem; | 1352 | goto nomem; |
1047 | } | 1353 | } |
1048 | } | 1354 | } |
1355 | /* | ||
1356 | * Insert just the ancestor; we should trickle down to the correct | ||
1357 | * cgroup for reclaim, since the other nodes will be below their | ||
1358 | * soft limit | ||
1359 | */ | ||
1360 | if (soft_fail_res) { | ||
1361 | mem_over_soft_limit = | ||
1362 | mem_cgroup_from_res_counter(soft_fail_res, res); | ||
1363 | if (mem_cgroup_soft_limit_check(mem_over_soft_limit)) | ||
1364 | mem_cgroup_update_tree(mem_over_soft_limit, page); | ||
1365 | } | ||
1366 | done: | ||
1049 | return 0; | 1367 | return 0; |
1050 | nomem: | 1368 | nomem: |
1051 | css_put(&mem->css); | 1369 | css_put(&mem->css); |
1052 | return -ENOMEM; | 1370 | return -ENOMEM; |
1053 | } | 1371 | } |
1054 | 1372 | ||
1055 | |||
1056 | /* | 1373 | /* |
1057 | * A helper function to get mem_cgroup from ID. must be called under | 1374 | * A helper function to get mem_cgroup from ID. must be called under |
1058 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 1375 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
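On this charge path, res_counter_charge() now takes an extra soft_fail_res argument (the res_counter changes themselves are outside this file's diff) through which it reports a counter in the hierarchy that went over its soft limit; only that memcg is inserted into the tree, per the comment above, and only when mem_cgroup_soft_limit_check() sees the per-CPU event counter cross SOFTLIMIT_EVENTS_THRESH. With the threshold of 1000 events and 4KiB pages, that works out to roughly one RB-tree update attempt per ~4MiB of charge/uncharge activity on a CPU, keeping the hot path cheap.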
@@ -1119,15 +1436,38 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1119 | lock_page_cgroup(pc); | 1436 | lock_page_cgroup(pc); |
1120 | if (unlikely(PageCgroupUsed(pc))) { | 1437 | if (unlikely(PageCgroupUsed(pc))) { |
1121 | unlock_page_cgroup(pc); | 1438 | unlock_page_cgroup(pc); |
1122 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1439 | if (!mem_cgroup_is_root(mem)) { |
1123 | if (do_swap_account) | 1440 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
1124 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1441 | if (do_swap_account) |
1442 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, | ||
1443 | NULL); | ||
1444 | } | ||
1125 | css_put(&mem->css); | 1445 | css_put(&mem->css); |
1126 | return; | 1446 | return; |
1127 | } | 1447 | } |
1448 | |||
1128 | pc->mem_cgroup = mem; | 1449 | pc->mem_cgroup = mem; |
1450 | /* | ||
1451 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | ||
1452 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup | ||
1453 | * is accessed after testing USED bit. To make pc->mem_cgroup visible | ||
1454 | * before USED bit, we need memory barrier here. | ||
1455 | * See mem_cgroup_add_lru_list(), etc. | ||
1456 | */ | ||
1129 | smp_wmb(); | 1457 | smp_wmb(); |
1130 | pc->flags = pcg_default_flags[ctype]; | 1458 | switch (ctype) { |
1459 | case MEM_CGROUP_CHARGE_TYPE_CACHE: | ||
1460 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: | ||
1461 | SetPageCgroupCache(pc); | ||
1462 | SetPageCgroupUsed(pc); | ||
1463 | break; | ||
1464 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | ||
1465 | ClearPageCgroupCache(pc); | ||
1466 | SetPageCgroupUsed(pc); | ||
1467 | break; | ||
1468 | default: | ||
1469 | break; | ||
1470 | } | ||
1131 | 1471 | ||
1132 | mem_cgroup_charge_statistics(mem, pc, true); | 1472 | mem_cgroup_charge_statistics(mem, pc, true); |
1133 | 1473 | ||
@@ -1178,7 +1518,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1178 | if (pc->mem_cgroup != from) | 1518 | if (pc->mem_cgroup != from) |
1179 | goto out; | 1519 | goto out; |
1180 | 1520 | ||
1181 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1521 | if (!mem_cgroup_is_root(from)) |
1522 | res_counter_uncharge(&from->res, PAGE_SIZE, NULL); | ||
1182 | mem_cgroup_charge_statistics(from, pc, false); | 1523 | mem_cgroup_charge_statistics(from, pc, false); |
1183 | 1524 | ||
1184 | page = pc->page; | 1525 | page = pc->page; |
@@ -1197,8 +1538,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
1197 | 1); | 1538 | 1); |
1198 | } | 1539 | } |
1199 | 1540 | ||
1200 | if (do_swap_account) | 1541 | if (do_swap_account && !mem_cgroup_is_root(from)) |
1201 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | 1542 | res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); |
1202 | css_put(&from->css); | 1543 | css_put(&from->css); |
1203 | 1544 | ||
1204 | css_get(&to->css); | 1545 | css_get(&to->css); |
@@ -1238,7 +1579,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1238 | parent = mem_cgroup_from_cont(pcg); | 1579 | parent = mem_cgroup_from_cont(pcg); |
1239 | 1580 | ||
1240 | 1581 | ||
1241 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 1582 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); |
1242 | if (ret || !parent) | 1583 | if (ret || !parent) |
1243 | return ret; | 1584 | return ret; |
1244 | 1585 | ||
@@ -1268,9 +1609,11 @@ uncharge: | |||
1268 | /* drop extra refcnt by try_charge() */ | 1609 | /* drop extra refcnt by try_charge() */ |
1269 | css_put(&parent->css); | 1610 | css_put(&parent->css); |
1270 | /* uncharge if move fails */ | 1611 | /* uncharge if move fails */ |
1271 | res_counter_uncharge(&parent->res, PAGE_SIZE); | 1612 | if (!mem_cgroup_is_root(parent)) { |
1272 | if (do_swap_account) | 1613 | res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); |
1273 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | 1614 | if (do_swap_account) |
1615 | res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); | ||
1616 | } | ||
1274 | return ret; | 1617 | return ret; |
1275 | } | 1618 | } |
1276 | 1619 | ||
@@ -1295,7 +1638,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
1295 | prefetchw(pc); | 1638 | prefetchw(pc); |
1296 | 1639 | ||
1297 | mem = memcg; | 1640 | mem = memcg; |
1298 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 1641 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); |
1299 | if (ret || !mem) | 1642 | if (ret || !mem) |
1300 | return ret; | 1643 | return ret; |
1301 | 1644 | ||
@@ -1414,14 +1757,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1414 | if (!mem) | 1757 | if (!mem) |
1415 | goto charge_cur_mm; | 1758 | goto charge_cur_mm; |
1416 | *ptr = mem; | 1759 | *ptr = mem; |
1417 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | 1760 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); |
1418 | /* drop extra refcnt from tryget */ | 1761 | /* drop extra refcnt from tryget */ |
1419 | css_put(&mem->css); | 1762 | css_put(&mem->css); |
1420 | return ret; | 1763 | return ret; |
1421 | charge_cur_mm: | 1764 | charge_cur_mm: |
1422 | if (unlikely(!mm)) | 1765 | if (unlikely(!mm)) |
1423 | mm = &init_mm; | 1766 | mm = &init_mm; |
1424 | return __mem_cgroup_try_charge(mm, mask, ptr, true); | 1767 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); |
1425 | } | 1768 | } |
1426 | 1769 | ||
1427 | static void | 1770 | static void |
@@ -1459,7 +1802,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
1459 | * This recorded memcg can be obsolete one. So, avoid | 1802 | * This recorded memcg can be obsolete one. So, avoid |
1460 | * calling css_tryget | 1803 | * calling css_tryget |
1461 | */ | 1804 | */ |
1462 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1805 | if (!mem_cgroup_is_root(memcg)) |
1806 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE, | ||
1807 | NULL); | ||
1808 | mem_cgroup_swap_statistics(memcg, false); | ||
1463 | mem_cgroup_put(memcg); | 1809 | mem_cgroup_put(memcg); |
1464 | } | 1810 | } |
1465 | rcu_read_unlock(); | 1811 | rcu_read_unlock(); |
@@ -1484,9 +1830,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1484 | return; | 1830 | return; |
1485 | if (!mem) | 1831 | if (!mem) |
1486 | return; | 1832 | return; |
1487 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1833 | if (!mem_cgroup_is_root(mem)) { |
1488 | if (do_swap_account) | 1834 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
1489 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1835 | if (do_swap_account) |
1836 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | ||
1837 | } | ||
1490 | css_put(&mem->css); | 1838 | css_put(&mem->css); |
1491 | } | 1839 | } |
1492 | 1840 | ||
@@ -1500,6 +1848,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1500 | struct page_cgroup *pc; | 1848 | struct page_cgroup *pc; |
1501 | struct mem_cgroup *mem = NULL; | 1849 | struct mem_cgroup *mem = NULL; |
1502 | struct mem_cgroup_per_zone *mz; | 1850 | struct mem_cgroup_per_zone *mz; |
1851 | bool soft_limit_excess = false; | ||
1503 | 1852 | ||
1504 | if (mem_cgroup_disabled()) | 1853 | if (mem_cgroup_disabled()) |
1505 | return NULL; | 1854 | return NULL; |
@@ -1538,9 +1887,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1538 | break; | 1887 | break; |
1539 | } | 1888 | } |
1540 | 1889 | ||
1541 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1890 | if (!mem_cgroup_is_root(mem)) { |
1542 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | 1891 | res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); |
1543 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1892 | if (do_swap_account && |
1893 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1894 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | ||
1895 | } | ||
1896 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
1897 | mem_cgroup_swap_statistics(mem, true); | ||
1544 | mem_cgroup_charge_statistics(mem, pc, false); | 1898 | mem_cgroup_charge_statistics(mem, pc, false); |
1545 | 1899 | ||
1546 | ClearPageCgroupUsed(pc); | 1900 | ClearPageCgroupUsed(pc); |
@@ -1554,6 +1908,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1554 | mz = page_cgroup_zoneinfo(pc); | 1908 | mz = page_cgroup_zoneinfo(pc); |
1555 | unlock_page_cgroup(pc); | 1909 | unlock_page_cgroup(pc); |
1556 | 1910 | ||
1911 | if (soft_limit_excess && mem_cgroup_soft_limit_check(mem)) | ||
1912 | mem_cgroup_update_tree(mem, page); | ||
1557 | /* at swapout, this memcg will be accessed to record to swap */ | 1913 | /* at swapout, this memcg will be accessed to record to swap */ |
1558 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 1914 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1559 | css_put(&mem->css); | 1915 | css_put(&mem->css); |
@@ -1629,7 +1985,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
1629 | * We uncharge this because swap is freed. | 1985 | * We uncharge this because swap is freed. |
1630 | * This memcg can be obsolete one. We avoid calling css_tryget | 1986 | * This memcg can be obsolete one. We avoid calling css_tryget |
1631 | */ | 1987 | */ |
1632 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1988 | if (!mem_cgroup_is_root(memcg)) |
1989 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); | ||
1990 | mem_cgroup_swap_statistics(memcg, false); | ||
1633 | mem_cgroup_put(memcg); | 1991 | mem_cgroup_put(memcg); |
1634 | } | 1992 | } |
1635 | rcu_read_unlock(); | 1993 | rcu_read_unlock(); |
@@ -1658,7 +2016,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
1658 | unlock_page_cgroup(pc); | 2016 | unlock_page_cgroup(pc); |
1659 | 2017 | ||
1660 | if (mem) { | 2018 | if (mem) { |
1661 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | 2019 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, |
2020 | page); | ||
1662 | css_put(&mem->css); | 2021 | css_put(&mem->css); |
1663 | } | 2022 | } |
1664 | *ptr = mem; | 2023 | *ptr = mem; |
@@ -1798,8 +2157,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
1798 | if (!ret) | 2157 | if (!ret) |
1799 | break; | 2158 | break; |
1800 | 2159 | ||
1801 | progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, | 2160 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, |
1802 | false, true); | 2161 | GFP_KERNEL, |
2162 | MEM_CGROUP_RECLAIM_SHRINK); | ||
1803 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2163 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
1804 | /* Usage is reduced ? */ | 2164 | /* Usage is reduced ? */ |
1805 | if (curusage >= oldusage) | 2165 | if (curusage >= oldusage) |
@@ -1851,7 +2211,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
1851 | if (!ret) | 2211 | if (!ret) |
1852 | break; | 2212 | break; |
1853 | 2213 | ||
1854 | mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); | 2214 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
2215 | MEM_CGROUP_RECLAIM_NOSWAP | | ||
2216 | MEM_CGROUP_RECLAIM_SHRINK); | ||
1855 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 2217 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
1856 | /* Usage is reduced ? */ | 2218 | /* Usage is reduced ? */ |
1857 | if (curusage >= oldusage) | 2219 | if (curusage >= oldusage) |
@@ -1862,6 +2224,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
1862 | return ret; | 2224 | return ret; |
1863 | } | 2225 | } |
1864 | 2226 | ||
2227 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
2228 | gfp_t gfp_mask, int nid, | ||
2229 | int zid) | ||
2230 | { | ||
2231 | unsigned long nr_reclaimed = 0; | ||
2232 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
2233 | unsigned long reclaimed; | ||
2234 | int loop = 0; | ||
2235 | struct mem_cgroup_tree_per_zone *mctz; | ||
2236 | |||
2237 | if (order > 0) | ||
2238 | return 0; | ||
2239 | |||
2240 | mctz = soft_limit_tree_node_zone(nid, zid); | ||
2241 | /* | ||
2242 | * This loop can run a while, especially if mem_cgroups | ||
2243 | * keep exceeding their soft limit and putting the system under | ||
2244 | * pressure | ||
2245 | */ | ||
2246 | do { | ||
2247 | if (next_mz) | ||
2248 | mz = next_mz; | ||
2249 | else | ||
2250 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
2251 | if (!mz) | ||
2252 | break; | ||
2253 | |||
2254 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, | ||
2255 | gfp_mask, | ||
2256 | MEM_CGROUP_RECLAIM_SOFT); | ||
2257 | nr_reclaimed += reclaimed; | ||
2258 | spin_lock(&mctz->lock); | ||
2259 | |||
2260 | /* | ||
2261 | * If we failed to reclaim anything from this memory cgroup | ||
2262 | * it is time to move on to the next cgroup | ||
2263 | */ | ||
2264 | next_mz = NULL; | ||
2265 | if (!reclaimed) { | ||
2266 | do { | ||
2267 | /* | ||
2268 | * Loop until we find yet another one. | ||
2269 | * | ||
2270 | * By the time we get the soft_limit lock | ||
2271 | * again, someone might have added the | ||
2272 | * group back on the RB tree. Iterate to | ||
2273 | * make sure we get a different mem. | ||
2274 | * mem_cgroup_largest_soft_limit_node returns | ||
2275 | * NULL if no other cgroup is present on | ||
2276 | * the tree | ||
2277 | */ | ||
2278 | next_mz = | ||
2279 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
2280 | if (next_mz == mz) { | ||
2281 | css_put(&next_mz->mem->css); | ||
2282 | next_mz = NULL; | ||
2283 | } else /* next_mz == NULL or other memcg */ | ||
2284 | break; | ||
2285 | } while (1); | ||
2286 | } | ||
2287 | mz->usage_in_excess = | ||
2288 | res_counter_soft_limit_excess(&mz->mem->res); | ||
2289 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | ||
2290 | /* | ||
2291 | * One school of thought says that we should not add | ||
2292 | * back the node to the tree if reclaim returns 0. | ||
2293 | * But our reclaim could return 0 simply because, due | ||
2294 | * to priority, we are exposing a smaller subset of | ||
2295 | * memory to reclaim from. Consider this as a longer | ||
2296 | * term TODO. | ||
2297 | */ | ||
2298 | if (mz->usage_in_excess) | ||
2299 | __mem_cgroup_insert_exceeded(mz->mem, mz, mctz); | ||
2300 | spin_unlock(&mctz->lock); | ||
2301 | css_put(&mz->mem->css); | ||
2302 | loop++; | ||
2303 | /* | ||
2304 | * Could not reclaim anything and there are no more | ||
2305 | * mem cgroups to try or we seem to be looping without | ||
2306 | * reclaiming anything. | ||
2307 | */ | ||
2308 | if (!nr_reclaimed && | ||
2309 | (next_mz == NULL || | ||
2310 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
2311 | break; | ||
2312 | } while (!nr_reclaimed); | ||
2313 | if (next_mz) | ||
2314 | css_put(&next_mz->mem->css); | ||
2315 | return nr_reclaimed; | ||
2316 | } | ||
2317 | |||
1865 | /* | 2318 | /* |
1866 | * This routine traverse page_cgroup in given list and drop them all. | 2319 | * This routine traverse page_cgroup in given list and drop them all. |
1867 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 2320 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
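mem_cgroup_soft_limit_reclaim() above is meant to be driven from the global reclaim path; that caller lives in the vmscan.c part of this series and is not in this file's diff. A minimal sketch of how such a caller might invoke it, assuming it runs per zone during zone reclaim (the wrapper name below is made up for illustration; the signature matches the hunk above):

	/* Illustrative caller only -- not part of this patch's memcontrol.c changes. */
	static unsigned long soft_limit_reclaim_pass(struct zone *zone, int order,
						     gfp_t gfp_mask)
	{
		int nid = zone->zone_pgdat->node_id;
		int zid = zone_idx(zone);

		/* Reclaim from the memcg(s) most over their soft limit in this
		 * zone; the return value is the number of pages reclaimed. */
		return mem_cgroup_soft_limit_reclaim(zone, order, gfp_mask, nid, zid);
	}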
@@ -2046,20 +2499,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
2046 | return retval; | 2499 | return retval; |
2047 | } | 2500 | } |
2048 | 2501 | ||
2502 | struct mem_cgroup_idx_data { | ||
2503 | s64 val; | ||
2504 | enum mem_cgroup_stat_index idx; | ||
2505 | }; | ||
2506 | |||
2507 | static int | ||
2508 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | ||
2509 | { | ||
2510 | struct mem_cgroup_idx_data *d = data; | ||
2511 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | ||
2512 | return 0; | ||
2513 | } | ||
2514 | |||
2515 | static void | ||
2516 | mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | ||
2517 | enum mem_cgroup_stat_index idx, s64 *val) | ||
2518 | { | ||
2519 | struct mem_cgroup_idx_data d; | ||
2520 | d.idx = idx; | ||
2521 | d.val = 0; | ||
2522 | mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); | ||
2523 | *val = d.val; | ||
2524 | } | ||
2525 | |||
2049 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2526 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
2050 | { | 2527 | { |
2051 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2528 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2052 | u64 val = 0; | 2529 | u64 idx_val, val; |
2053 | int type, name; | 2530 | int type, name; |
2054 | 2531 | ||
2055 | type = MEMFILE_TYPE(cft->private); | 2532 | type = MEMFILE_TYPE(cft->private); |
2056 | name = MEMFILE_ATTR(cft->private); | 2533 | name = MEMFILE_ATTR(cft->private); |
2057 | switch (type) { | 2534 | switch (type) { |
2058 | case _MEM: | 2535 | case _MEM: |
2059 | val = res_counter_read_u64(&mem->res, name); | 2536 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
2537 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2538 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2539 | val = idx_val; | ||
2540 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2541 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2542 | val += idx_val; | ||
2543 | val <<= PAGE_SHIFT; | ||
2544 | } else | ||
2545 | val = res_counter_read_u64(&mem->res, name); | ||
2060 | break; | 2546 | break; |
2061 | case _MEMSWAP: | 2547 | case _MEMSWAP: |
2062 | val = res_counter_read_u64(&mem->memsw, name); | 2548 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
2549 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2550 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2551 | val = idx_val; | ||
2552 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2553 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2554 | val += idx_val; | ||
2555 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2556 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2557 | val <<= PAGE_SHIFT; | ||
2558 | } else | ||
2559 | val = res_counter_read_u64(&mem->memsw, name); | ||
2063 | break; | 2560 | break; |
2064 | default: | 2561 | default: |
2065 | BUG(); | 2562 | BUG(); |
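Since the root cgroup no longer charges its res_counters (see the mem_cgroup_is_root() short-cuts earlier in this diff), its usage files are synthesized from the hierarchical statistics instead of being read from the counter. As an illustrative calculation with 4KiB pages: if the whole hierarchy currently accounts 1000 pages of cache and 500 pages of RSS, reading memory.usage_in_bytes on the root returns (1000 + 500) << PAGE_SHIFT == 6144000 bytes.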
@@ -2083,6 +2580,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
2083 | name = MEMFILE_ATTR(cft->private); | 2580 | name = MEMFILE_ATTR(cft->private); |
2084 | switch (name) { | 2581 | switch (name) { |
2085 | case RES_LIMIT: | 2582 | case RES_LIMIT: |
2583 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | ||
2584 | ret = -EINVAL; | ||
2585 | break; | ||
2586 | } | ||
2086 | /* This function does all necessary parse...reuse it */ | 2587 | /* This function does all necessary parse...reuse it */ |
2087 | ret = res_counter_memparse_write_strategy(buffer, &val); | 2588 | ret = res_counter_memparse_write_strategy(buffer, &val); |
2088 | if (ret) | 2589 | if (ret) |
@@ -2092,6 +2593,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
2092 | else | 2593 | else |
2093 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 2594 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
2094 | break; | 2595 | break; |
2596 | case RES_SOFT_LIMIT: | ||
2597 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
2598 | if (ret) | ||
2599 | break; | ||
2600 | /* | ||
2601 | * For memsw, soft limits are hard to implement in terms | ||
2602 | * of semantics; for now, we support soft limits for | ||
2603 | * control without swap | ||
2604 | */ | ||
2605 | if (type == _MEM) | ||
2606 | ret = res_counter_set_soft_limit(&memcg->res, val); | ||
2607 | else | ||
2608 | ret = -EINVAL; | ||
2609 | break; | ||
2095 | default: | 2610 | default: |
2096 | ret = -EINVAL; /* should be BUG() ? */ | 2611 | ret = -EINVAL; /* should be BUG() ? */ |
2097 | break; | 2612 | break; |
@@ -2149,6 +2664,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2149 | res_counter_reset_failcnt(&mem->memsw); | 2664 | res_counter_reset_failcnt(&mem->memsw); |
2150 | break; | 2665 | break; |
2151 | } | 2666 | } |
2667 | |||
2152 | return 0; | 2668 | return 0; |
2153 | } | 2669 | } |
2154 | 2670 | ||
@@ -2160,6 +2676,7 @@ enum { | |||
2160 | MCS_MAPPED_FILE, | 2676 | MCS_MAPPED_FILE, |
2161 | MCS_PGPGIN, | 2677 | MCS_PGPGIN, |
2162 | MCS_PGPGOUT, | 2678 | MCS_PGPGOUT, |
2679 | MCS_SWAP, | ||
2163 | MCS_INACTIVE_ANON, | 2680 | MCS_INACTIVE_ANON, |
2164 | MCS_ACTIVE_ANON, | 2681 | MCS_ACTIVE_ANON, |
2165 | MCS_INACTIVE_FILE, | 2682 | MCS_INACTIVE_FILE, |
@@ -2181,6 +2698,7 @@ struct { | |||
2181 | {"mapped_file", "total_mapped_file"}, | 2698 | {"mapped_file", "total_mapped_file"}, |
2182 | {"pgpgin", "total_pgpgin"}, | 2699 | {"pgpgin", "total_pgpgin"}, |
2183 | {"pgpgout", "total_pgpgout"}, | 2700 | {"pgpgout", "total_pgpgout"}, |
2701 | {"swap", "total_swap"}, | ||
2184 | {"inactive_anon", "total_inactive_anon"}, | 2702 | {"inactive_anon", "total_inactive_anon"}, |
2185 | {"active_anon", "total_active_anon"}, | 2703 | {"active_anon", "total_active_anon"}, |
2186 | {"inactive_file", "total_inactive_file"}, | 2704 | {"inactive_file", "total_inactive_file"}, |
@@ -2205,6 +2723,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2205 | s->stat[MCS_PGPGIN] += val; | 2723 | s->stat[MCS_PGPGIN] += val; |
2206 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 2724 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
2207 | s->stat[MCS_PGPGOUT] += val; | 2725 | s->stat[MCS_PGPGOUT] += val; |
2726 | if (do_swap_account) { | ||
2727 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | ||
2728 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | ||
2729 | } | ||
2208 | 2730 | ||
2209 | /* per zone stat */ | 2731 | /* per zone stat */ |
2210 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); | 2732 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); |
@@ -2236,8 +2758,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
2236 | memset(&mystat, 0, sizeof(mystat)); | 2758 | memset(&mystat, 0, sizeof(mystat)); |
2237 | mem_cgroup_get_local_stat(mem_cont, &mystat); | 2759 | mem_cgroup_get_local_stat(mem_cont, &mystat); |
2238 | 2760 | ||
2239 | for (i = 0; i < NR_MCS_STAT; i++) | 2761 | for (i = 0; i < NR_MCS_STAT; i++) { |
2762 | if (i == MCS_SWAP && !do_swap_account) | ||
2763 | continue; | ||
2240 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); | 2764 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); |
2765 | } | ||
2241 | 2766 | ||
2242 | /* Hierarchical information */ | 2767 | /* Hierarchical information */ |
2243 | { | 2768 | { |
@@ -2250,9 +2775,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
2250 | 2775 | ||
2251 | memset(&mystat, 0, sizeof(mystat)); | 2776 | memset(&mystat, 0, sizeof(mystat)); |
2252 | mem_cgroup_get_total_stat(mem_cont, &mystat); | 2777 | mem_cgroup_get_total_stat(mem_cont, &mystat); |
2253 | for (i = 0; i < NR_MCS_STAT; i++) | 2778 | for (i = 0; i < NR_MCS_STAT; i++) { |
2779 | if (i == MCS_SWAP && !do_swap_account) | ||
2780 | continue; | ||
2254 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); | 2781 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); |
2255 | 2782 | } | |
2256 | 2783 | ||
2257 | #ifdef CONFIG_DEBUG_VM | 2784 | #ifdef CONFIG_DEBUG_VM |
2258 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); | 2785 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); |
@@ -2345,6 +2872,12 @@ static struct cftype mem_cgroup_files[] = { | |||
2345 | .read_u64 = mem_cgroup_read, | 2872 | .read_u64 = mem_cgroup_read, |
2346 | }, | 2873 | }, |
2347 | { | 2874 | { |
2875 | .name = "soft_limit_in_bytes", | ||
2876 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | ||
2877 | .write_string = mem_cgroup_write, | ||
2878 | .read_u64 = mem_cgroup_read, | ||
2879 | }, | ||
2880 | { | ||
2348 | .name = "failcnt", | 2881 | .name = "failcnt", |
2349 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 2882 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
2350 | .trigger = mem_cgroup_reset, | 2883 | .trigger = mem_cgroup_reset, |
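The new soft_limit_in_bytes file reuses the existing MEMFILE encoding for cft->private: the type sits in the upper 16 bits and the attribute in the lower 16, which is what the MEMFILE_TYPE()/MEMFILE_ATTR() macros near the top of this diff unpack in mem_cgroup_read()/mem_cgroup_write(). A small sketch of that round trip (the definition of MEMFILE_PRIVATE() is not shown in this diff, only its uses; the packing below is the assumed one):

	/* Assumed packing, matching MEMFILE_TYPE()/MEMFILE_ATTR() above. */
	#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))

	/*
	 * For soft_limit_in_bytes:
	 *   .private               == MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT)
	 *   MEMFILE_TYPE(.private) == _MEM
	 *   MEMFILE_ATTR(.private) == RES_SOFT_LIMIT
	 * so a write such as "100M" is parsed by
	 * res_counter_memparse_write_strategy() and lands in
	 * res_counter_set_soft_limit(), while memsw rejects soft limits
	 * with -EINVAL (see the RES_SOFT_LIMIT case earlier in this diff).
	 */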
@@ -2438,6 +2971,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
2438 | mz = &pn->zoneinfo[zone]; | 2971 | mz = &pn->zoneinfo[zone]; |
2439 | for_each_lru(l) | 2972 | for_each_lru(l) |
2440 | INIT_LIST_HEAD(&mz->lists[l]); | 2973 | INIT_LIST_HEAD(&mz->lists[l]); |
2974 | mz->usage_in_excess = 0; | ||
2975 | mz->on_tree = false; | ||
2976 | mz->mem = mem; | ||
2441 | } | 2977 | } |
2442 | return 0; | 2978 | return 0; |
2443 | } | 2979 | } |
@@ -2483,6 +3019,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
2483 | { | 3019 | { |
2484 | int node; | 3020 | int node; |
2485 | 3021 | ||
3022 | mem_cgroup_remove_from_trees(mem); | ||
2486 | free_css_id(&mem_cgroup_subsys, &mem->css); | 3023 | free_css_id(&mem_cgroup_subsys, &mem->css); |
2487 | 3024 | ||
2488 | for_each_node_state(node, N_POSSIBLE) | 3025 | for_each_node_state(node, N_POSSIBLE) |
@@ -2531,6 +3068,31 @@ static void __init enable_swap_cgroup(void) | |||
2531 | } | 3068 | } |
2532 | #endif | 3069 | #endif |
2533 | 3070 | ||
3071 | static int mem_cgroup_soft_limit_tree_init(void) | ||
3072 | { | ||
3073 | struct mem_cgroup_tree_per_node *rtpn; | ||
3074 | struct mem_cgroup_tree_per_zone *rtpz; | ||
3075 | int tmp, node, zone; | ||
3076 | |||
3077 | for_each_node_state(node, N_POSSIBLE) { | ||
3078 | tmp = node; | ||
3079 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
3080 | tmp = -1; | ||
3081 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
3082 | if (!rtpn) | ||
3083 | return 1; | ||
3084 | |||
3085 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
3086 | |||
3087 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
3088 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
3089 | rtpz->rb_root = RB_ROOT; | ||
3090 | spin_lock_init(&rtpz->lock); | ||
3091 | } | ||
3092 | } | ||
3093 | return 0; | ||
3094 | } | ||
3095 | |||
2534 | static struct cgroup_subsys_state * __ref | 3096 | static struct cgroup_subsys_state * __ref |
2535 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 3097 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
2536 | { | 3098 | { |
@@ -2545,10 +3107,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
2545 | for_each_node_state(node, N_POSSIBLE) | 3107 | for_each_node_state(node, N_POSSIBLE) |
2546 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 3108 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
2547 | goto free_out; | 3109 | goto free_out; |
3110 | |||
2548 | /* root ? */ | 3111 | /* root ? */ |
2549 | if (cont->parent == NULL) { | 3112 | if (cont->parent == NULL) { |
2550 | enable_swap_cgroup(); | 3113 | enable_swap_cgroup(); |
2551 | parent = NULL; | 3114 | parent = NULL; |
3115 | root_mem_cgroup = mem; | ||
3116 | if (mem_cgroup_soft_limit_tree_init()) | ||
3117 | goto free_out; | ||
3118 | |||
2552 | } else { | 3119 | } else { |
2553 | parent = mem_cgroup_from_cont(cont->parent); | 3120 | parent = mem_cgroup_from_cont(cont->parent); |
2554 | mem->use_hierarchy = parent->use_hierarchy; | 3121 | mem->use_hierarchy = parent->use_hierarchy; |
@@ -2577,6 +3144,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
2577 | return &mem->css; | 3144 | return &mem->css; |
2578 | free_out: | 3145 | free_out: |
2579 | __mem_cgroup_free(mem); | 3146 | __mem_cgroup_free(mem); |
3147 | root_mem_cgroup = NULL; | ||
2580 | return ERR_PTR(error); | 3148 | return ERR_PTR(error); |
2581 | } | 3149 | } |
2582 | 3150 | ||
@@ -2612,7 +3180,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
2612 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 3180 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
2613 | struct cgroup *cont, | 3181 | struct cgroup *cont, |
2614 | struct cgroup *old_cont, | 3182 | struct cgroup *old_cont, |
2615 | struct task_struct *p) | 3183 | struct task_struct *p, |
3184 | bool threadgroup) | ||
2616 | { | 3185 | { |
2617 | mutex_lock(&memcg_tasklist); | 3186 | mutex_lock(&memcg_tasklist); |
2618 | /* | 3187 | /* |