Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 1835 |
1 file changed, 1496 insertions, 339 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f99f5991d6bb..f4ede99c8b9b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@ | |||
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * Memory thresholds | ||
10 | * Copyright (C) 2009 Nokia Corporation | ||
11 | * Author: Kirill A. Shutemov | ||
12 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | 13 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 14 | * it under the terms of the GNU General Public License as published by |
11 | * the Free Software Foundation; either version 2 of the License, or | 15 | * the Free Software Foundation; either version 2 of the License, or |
@@ -21,6 +25,7 @@ | |||
21 | #include <linux/memcontrol.h> | 25 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 26 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/hugetlb.h> | ||
24 | #include <linux/pagemap.h> | 29 | #include <linux/pagemap.h> |
25 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
26 | #include <linux/page-flags.h> | 31 | #include <linux/page-flags.h> |
@@ -32,12 +37,16 @@ | |||
32 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
33 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
34 | #include <linux/swap.h> | 39 | #include <linux/swap.h> |
40 | #include <linux/swapops.h> | ||
35 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <linux/eventfd.h> | ||
43 | #include <linux/sort.h> | ||
36 | #include <linux/fs.h> | 44 | #include <linux/fs.h> |
37 | #include <linux/seq_file.h> | 45 | #include <linux/seq_file.h> |
38 | #include <linux/vmalloc.h> | 46 | #include <linux/vmalloc.h> |
39 | #include <linux/mm_inline.h> | 47 | #include <linux/mm_inline.h> |
40 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
49 | #include <linux/cpu.h> | ||
41 | #include "internal.h" | 50 | #include "internal.h" |
42 | 51 | ||
43 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> |
@@ -54,8 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
54 | #define do_swap_account (0) | 63 | #define do_swap_account (0) |
55 | #endif | 64 | #endif |
56 | 65 | ||
57 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ | 66 | /* |
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 67 | * Per memcg event counter is incremented at every pagein/pageout. This counter |
68 | * is used to trigger some periodic events. This is straightforward and better ||
69 | * than using jiffies etc. to handle periodic memcg events. ||
70 | * | ||
71 | * These values will be used as !((event) & ((1 <<(thresh)) - 1)) | ||
72 | */ | ||
73 | #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ | ||
74 | #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ | ||
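The mask test described above fires once every 2^thresh events: with THRESHOLDS_EVENTS_THRESH at 7 the check succeeds every 128 pagein/pageout events, and with SOFTLIMIT_EVENTS_THRESH at 10 every 1024. Since the softlimit shift is the larger of the two, every softlimit trigger is also a thresholds trigger, which is why memcg_check_events() later in this diff can nest the softlimit check inside the thresholds check. A minimal userspace model of the test (illustrative only, not kernel code):

```c
#include <stdio.h>
#include <stdbool.h>

/* Model of the !((event) & ((1 << (thresh)) - 1)) test used above. */
static bool event_check(unsigned long event, int thresh)
{
	return !(event & ((1UL << thresh) - 1));
}

int main(void)
{
	unsigned long event;
	int hits = 0;

	/* With thresh = 7, the check is true once every 128 increments. */
	for (event = 1; event <= 1024; event++)
		if (event_check(event, 7))
			hits++;

	printf("%d hits in 1024 events\n", hits);	/* prints 8 */
	return 0;
}
```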
59 | 75 | ||
60 | /* | 76 | /* |
61 | * Statistics for memory cgroup. | 77 | * Statistics for memory cgroup. |
@@ -66,65 +82,19 @@ enum mem_cgroup_stat_index { | |||
66 | */ | 82 | */ |
67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 83 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 84 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 85 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 86 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 87 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 88 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
89 | MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ | ||
74 | 90 | ||
75 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
76 | }; | 92 | }; |
77 | 93 | ||
78 | struct mem_cgroup_stat_cpu { | 94 | struct mem_cgroup_stat_cpu { |
79 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 95 | s64 count[MEM_CGROUP_STAT_NSTATS]; |
80 | } ____cacheline_aligned_in_smp; | ||
81 | |||
82 | struct mem_cgroup_stat { | ||
83 | struct mem_cgroup_stat_cpu cpustat[0]; | ||
84 | }; | 96 | }; |
85 | 97 | ||
86 | static inline void | ||
87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
88 | enum mem_cgroup_stat_index idx) | ||
89 | { | ||
90 | stat->count[idx] = 0; | ||
91 | } | ||
92 | |||
93 | static inline s64 | ||
94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
95 | enum mem_cgroup_stat_index idx) | ||
96 | { | ||
97 | return stat->count[idx]; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * For accounting under irq disable, no need for increment preempt count. | ||
102 | */ | ||
103 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, | ||
104 | enum mem_cgroup_stat_index idx, int val) | ||
105 | { | ||
106 | stat->count[idx] += val; | ||
107 | } | ||
108 | |||
109 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | ||
110 | enum mem_cgroup_stat_index idx) | ||
111 | { | ||
112 | int cpu; | ||
113 | s64 ret = 0; | ||
114 | for_each_possible_cpu(cpu) | ||
115 | ret += stat->cpustat[cpu].count[idx]; | ||
116 | return ret; | ||
117 | } | ||
118 | |||
119 | static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) | ||
120 | { | ||
121 | s64 ret; | ||
122 | |||
123 | ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); | ||
124 | ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); | ||
125 | return ret; | ||
126 | } | ||
127 | |||
128 | /* | 98 | /* |
129 | * per-zone information in memory controller. | 99 | * per-zone information in memory controller. |
130 | */ | 100 | */ |
@@ -174,6 +144,22 @@ struct mem_cgroup_tree { | |||
174 | 144 | ||
175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | 145 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
176 | 146 | ||
147 | struct mem_cgroup_threshold { | ||
148 | struct eventfd_ctx *eventfd; | ||
149 | u64 threshold; | ||
150 | }; | ||
151 | |||
152 | struct mem_cgroup_threshold_ary { | ||
153 | /* An array index points to threshold just below usage. */ | ||
154 | atomic_t current_threshold; | ||
155 | /* Size of entries[] */ | ||
156 | unsigned int size; | ||
157 | /* Array of thresholds */ | ||
158 | struct mem_cgroup_threshold entries[0]; | ||
159 | }; | ||
160 | |||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | ||
162 | |||
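Here current_threshold caches the index of the entry just below the current usage, so when usage changes only the entries between the old and the new position have to be visited rather than the whole sorted array. A rough, self-contained userspace model of such a scan (the real scan, which pairs every crossed entry with an eventfd_signal(), is added further down in this diff; all names below are illustrative):

```c
#include <stdio.h>

struct threshold {
	unsigned long long threshold;
};

struct threshold_ary {
	int current_threshold;		/* index of threshold just below usage */
	unsigned int size;
	struct threshold entries[8];	/* sorted ascending */
};

static void check_thresholds(struct threshold_ary *t, unsigned long long usage)
{
	int i = t->current_threshold;

	/* usage dropped: report thresholds we fell back under */
	for (; i >= 0 && t->entries[i].threshold > usage; i--)
		printf("fell below %llu\n", t->entries[i].threshold);
	i++;
	/* usage grew: report thresholds we crossed upward */
	for (; i < (int)t->size && t->entries[i].threshold <= usage; i++)
		printf("crossed %llu\n", t->entries[i].threshold);

	/* remember the index of the threshold just below the new usage */
	t->current_threshold = i - 1;
}

int main(void)
{
	struct threshold_ary t = {
		.current_threshold = -1,
		.size = 3,
		.entries = { { 100 }, { 200 }, { 300 } },
	};

	check_thresholds(&t, 250);	/* crossed 100, crossed 200 */
	check_thresholds(&t, 50);	/* fell below 200, fell below 100 */
	return 0;
}
```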
177 | /* | 163 | /* |
178 | * The memory controller data structure. The memory controller controls both | 164 | * The memory controller data structure. The memory controller controls both |
179 | * page cache and RSS per cgroup. We would eventually like to provide | 165 | * page cache and RSS per cgroup. We would eventually like to provide |
@@ -209,7 +195,7 @@ struct mem_cgroup { | |||
209 | int prev_priority; /* for recording reclaim priority */ | 195 | int prev_priority; /* for recording reclaim priority */ |
210 | 196 | ||
211 | /* | 197 | /* |
212 | * While reclaiming in a hiearchy, we cache the last child we | 198 | * While reclaiming in a hierarchy, we cache the last child we |
213 | * reclaimed from. | 199 | * reclaimed from. |
214 | */ | 200 | */ |
215 | int last_scanned_child; | 201 | int last_scanned_child; |
@@ -217,7 +203,7 @@ struct mem_cgroup { | |||
217 | * Should the accounting and control be hierarchical, per subtree? | 203 | * Should the accounting and control be hierarchical, per subtree? |
218 | */ | 204 | */ |
219 | bool use_hierarchy; | 205 | bool use_hierarchy; |
220 | unsigned long last_oom_jiffies; | 206 | atomic_t oom_lock; |
221 | atomic_t refcnt; | 207 | atomic_t refcnt; |
222 | 208 | ||
223 | unsigned int swappiness; | 209 | unsigned int swappiness; |
@@ -225,10 +211,48 @@ struct mem_cgroup { | |||
225 | /* set when res.limit == memsw.limit */ | 211 | /* set when res.limit == memsw.limit */ |
226 | bool memsw_is_minimum; | 212 | bool memsw_is_minimum; |
227 | 213 | ||
214 | /* protect arrays of thresholds */ | ||
215 | struct mutex thresholds_lock; | ||
216 | |||
217 | /* thresholds for memory usage. RCU-protected */ | ||
218 | struct mem_cgroup_threshold_ary *thresholds; | ||
219 | |||
220 | /* thresholds for mem+swap usage. RCU-protected */ | ||
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | ||
222 | |||
223 | /* | ||
224 | * Should we move charges of a task when a task is moved into this | ||
225 | * mem_cgroup ? And what type of charges should we move ? | ||
226 | */ | ||
227 | unsigned long move_charge_at_immigrate; | ||
228 | |||
228 | /* | 229 | /* |
229 | * statistics. This must be placed at the end of memcg. | 230 | * percpu counter. |
230 | */ | 231 | */ |
231 | struct mem_cgroup_stat stat; | 232 | struct mem_cgroup_stat_cpu *stat; |
233 | }; | ||
234 | |||
235 | /* Stuffs for move charges at task migration. */ | ||
236 | /* | ||
237 | * Types of charges to be moved. "move_charge_at_immigrate" is treated as a ||
238 | * left-shifted bitmap of these types. | ||
239 | */ | ||
240 | enum move_type { | ||
241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | ||
242 | NR_MOVE_TYPE, | ||
243 | }; | ||
244 | |||
245 | /* "mc" and its members are protected by cgroup_mutex */ | ||
246 | static struct move_charge_struct { | ||
247 | struct mem_cgroup *from; | ||
248 | struct mem_cgroup *to; | ||
249 | unsigned long precharge; | ||
250 | unsigned long moved_charge; | ||
251 | unsigned long moved_swap; | ||
252 | struct task_struct *moving_task; /* a task moving charges */ | ||
253 | wait_queue_head_t waitq; /* a waitq for other context */ | ||
254 | } mc = { | ||
255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | ||
232 | }; | 256 | }; |
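The "left-shifted bitmap" wording above means the per-cgroup move_charge_at_immigrate value is tested bit by bit against the enum move_type indices. A hedged sketch of such a test (the helper name is hypothetical, not part of this patch):

```c
#include <stdbool.h>

enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	NR_MOVE_TYPE,
};

/* Hypothetical helper: is moving of anon charges enabled for this memcg? */
static inline bool move_anon_enabled(unsigned long move_charge_at_immigrate)
{
	return move_charge_at_immigrate & (1UL << MOVE_CHARGE_TYPE_ANON);
}
```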
233 | 257 | ||
234 | /* | 258 | /* |
@@ -275,6 +299,7 @@ enum charge_type { | |||
275 | static void mem_cgroup_get(struct mem_cgroup *mem); | 299 | static void mem_cgroup_get(struct mem_cgroup *mem); |
276 | static void mem_cgroup_put(struct mem_cgroup *mem); | 300 | static void mem_cgroup_put(struct mem_cgroup *mem); |
277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 301 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
302 | static void drain_all_stock_async(void); | ||
278 | 303 | ||
279 | static struct mem_cgroup_per_zone * | 304 | static struct mem_cgroup_per_zone * |
280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 305 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
@@ -282,6 +307,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | |||
282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 307 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
283 | } | 308 | } |
284 | 309 | ||
310 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | ||
311 | { | ||
312 | return &mem->css; | ||
313 | } | ||
314 | |||
285 | static struct mem_cgroup_per_zone * | 315 | static struct mem_cgroup_per_zone * |
286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 316 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
287 | { | 317 | { |
@@ -365,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | |||
365 | spin_unlock(&mctz->lock); | 395 | spin_unlock(&mctz->lock); |
366 | } | 396 | } |
367 | 397 | ||
368 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
369 | { | ||
370 | bool ret = false; | ||
371 | int cpu; | ||
372 | s64 val; | ||
373 | struct mem_cgroup_stat_cpu *cpustat; | ||
374 | |||
375 | cpu = get_cpu(); | ||
376 | cpustat = &mem->stat.cpustat[cpu]; | ||
377 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
378 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
379 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
380 | ret = true; | ||
381 | } | ||
382 | put_cpu(); | ||
383 | return ret; | ||
384 | } | ||
385 | 398 | ||
386 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | 399 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) |
387 | { | 400 | { |
@@ -475,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
475 | return mz; | 488 | return mz; |
476 | } | 489 | } |
477 | 490 | ||
491 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | ||
492 | enum mem_cgroup_stat_index idx) | ||
493 | { | ||
494 | int cpu; | ||
495 | s64 val = 0; | ||
496 | |||
497 | for_each_possible_cpu(cpu) | ||
498 | val += per_cpu(mem->stat->count[idx], cpu); | ||
499 | return val; | ||
500 | } | ||
501 | |||
502 | static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) | ||
503 | { | ||
504 | s64 ret; | ||
505 | |||
506 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | ||
507 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | ||
508 | return ret; | ||
509 | } | ||
510 | |||
478 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 511 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, |
479 | bool charge) | 512 | bool charge) |
480 | { | 513 | { |
481 | int val = (charge) ? 1 : -1; | 514 | int val = (charge) ? 1 : -1; |
482 | struct mem_cgroup_stat *stat = &mem->stat; | 515 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
483 | struct mem_cgroup_stat_cpu *cpustat; | ||
484 | int cpu = get_cpu(); | ||
485 | |||
486 | cpustat = &stat->cpustat[cpu]; | ||
487 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
488 | put_cpu(); | ||
489 | } | 516 | } |
490 | 517 | ||
491 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 518 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
@@ -493,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
493 | bool charge) | 520 | bool charge) |
494 | { | 521 | { |
495 | int val = (charge) ? 1 : -1; | 522 | int val = (charge) ? 1 : -1; |
496 | struct mem_cgroup_stat *stat = &mem->stat; | ||
497 | struct mem_cgroup_stat_cpu *cpustat; | ||
498 | int cpu = get_cpu(); | ||
499 | 523 | ||
500 | cpustat = &stat->cpustat[cpu]; | 524 | preempt_disable(); |
525 | |||
501 | if (PageCgroupCache(pc)) | 526 | if (PageCgroupCache(pc)) |
502 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 527 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); |
503 | else | 528 | else |
504 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); | 529 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); |
505 | 530 | ||
506 | if (charge) | 531 | if (charge) |
507 | __mem_cgroup_stat_add_safe(cpustat, | 532 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); |
508 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | ||
509 | else | 533 | else |
510 | __mem_cgroup_stat_add_safe(cpustat, | 534 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); |
511 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 535 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); |
512 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | 536 | |
513 | put_cpu(); | 537 | preempt_enable(); |
514 | } | 538 | } |
515 | 539 | ||
516 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 540 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
@@ -528,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | |||
528 | return total; | 552 | return total; |
529 | } | 553 | } |
530 | 554 | ||
555 | static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | ||
556 | { | ||
557 | s64 val; | ||
558 | |||
559 | val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); | ||
560 | |||
561 | return !(val & ((1 << event_mask_shift) - 1)); | ||
562 | } | ||
563 | |||
564 | /* | ||
565 | * Check events in order. | ||
566 | * | ||
567 | */ | ||
568 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | ||
569 | { | ||
570 | /* threshold event is triggered in finer grain than soft limit */ | ||
571 | if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { | ||
572 | mem_cgroup_threshold(mem); | ||
573 | if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) | ||
574 | mem_cgroup_update_tree(mem, page); | ||
575 | } | ||
576 | } | ||
577 | |||
531 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 578 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
532 | { | 579 | { |
533 | return container_of(cgroup_subsys_state(cont, | 580 | return container_of(cgroup_subsys_state(cont, |
@@ -758,7 +805,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
758 | task_unlock(task); | 805 | task_unlock(task); |
759 | if (!curr) | 806 | if (!curr) |
760 | return 0; | 807 | return 0; |
761 | if (curr->use_hierarchy) | 808 | /* |
809 | * We should check use_hierarchy of "mem" not "curr". Because checking | ||
810 | * use_hierarchy of "curr" here make this function true if hierarchy is | ||
811 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* | ||
812 | * hierarchy(even if use_hierarchy is disabled in "mem"). | ||
813 | */ | ||
814 | if (mem->use_hierarchy) | ||
762 | ret = css_is_ancestor(&curr->css, &mem->css); | 815 | ret = css_is_ancestor(&curr->css, &mem->css); |
763 | else | 816 | else |
764 | ret = (curr == mem); | 817 | ret = (curr == mem); |
@@ -988,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) | |||
988 | } | 1041 | } |
989 | 1042 | ||
990 | /** | 1043 | /** |
991 | * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. | 1044 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
992 | * @memcg: The memory cgroup that went over limit | 1045 | * @memcg: The memory cgroup that went over limit |
993 | * @p: Task that is going to be killed | 1046 | * @p: Task that is going to be killed |
994 | * | 1047 | * |
@@ -1007,7 +1060,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1007 | static char memcg_name[PATH_MAX]; | 1060 | static char memcg_name[PATH_MAX]; |
1008 | int ret; | 1061 | int ret; |
1009 | 1062 | ||
1010 | if (!memcg) | 1063 | if (!memcg || !p) |
1011 | return; | 1064 | return; |
1012 | 1065 | ||
1013 | 1066 | ||
@@ -1137,6 +1190,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1137 | victim = mem_cgroup_select_victim(root_mem); | 1190 | victim = mem_cgroup_select_victim(root_mem); |
1138 | if (victim == root_mem) { | 1191 | if (victim == root_mem) { |
1139 | loop++; | 1192 | loop++; |
1193 | if (loop >= 1) | ||
1194 | drain_all_stock_async(); | ||
1140 | if (loop >= 2) { | 1195 | if (loop >= 2) { |
1141 | /* | 1196 | /* |
1142 | * If we have not been able to reclaim | 1197 | * If we have not been able to reclaim |
@@ -1160,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1160 | } | 1215 | } |
1161 | } | 1216 | } |
1162 | } | 1217 | } |
1163 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1218 | if (!mem_cgroup_local_usage(victim)) { |
1164 | /* this cgroup's local usage == 0 */ | 1219 | /* this cgroup's local usage == 0 */ |
1165 | css_put(&victim->css); | 1220 | css_put(&victim->css); |
1166 | continue; | 1221 | continue; |
@@ -1191,90 +1246,284 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1191 | return total; | 1246 | return total; |
1192 | } | 1247 | } |
1193 | 1248 | ||
1194 | bool mem_cgroup_oom_called(struct task_struct *task) | 1249 | static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) |
1195 | { | 1250 | { |
1196 | bool ret = false; | 1251 | int *val = (int *)data; |
1197 | struct mem_cgroup *mem; | 1252 | int x; |
1198 | struct mm_struct *mm; | 1253 | /* |
1254 | * Logically, we can stop scanning immediately when we find | ||
1255 | * a memcg is already locked. But considering unlock ops and ||
1256 | * creation/removal of memcg, scan-all is the simpler operation. ||
1257 | */ | ||
1258 | x = atomic_inc_return(&mem->oom_lock); | ||
1259 | *val = max(x, *val); | ||
1260 | return 0; | ||
1261 | } | ||
1262 | /* | ||
1263 | * Check OOM-Killer is already running under our hierarchy. | ||
1264 | * If someone is running, return false. | ||
1265 | */ | ||
1266 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | ||
1267 | { | ||
1268 | int lock_count = 0; | ||
1199 | 1269 | ||
1200 | rcu_read_lock(); | 1270 | mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); |
1201 | mm = task->mm; | 1271 | |
1202 | if (!mm) | 1272 | if (lock_count == 1) |
1203 | mm = &init_mm; | 1273 | return true; |
1204 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1274 | return false; |
1205 | if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | ||
1206 | ret = true; | ||
1207 | rcu_read_unlock(); | ||
1208 | return ret; | ||
1209 | } | 1275 | } |
1210 | 1276 | ||
1211 | static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | 1277 | static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) |
1212 | { | 1278 | { |
1213 | mem->last_oom_jiffies = jiffies; | 1279 | /* |
1280 | * When a new child is created while the hierarchy is under oom, | ||
1281 | * mem_cgroup_oom_lock() may not be called. We have to use | ||
1282 | * atomic_add_unless() here. | ||
1283 | */ | ||
1284 | atomic_add_unless(&mem->oom_lock, -1, 0); | ||
1214 | return 0; | 1285 | return 0; |
1215 | } | 1286 | } |
1216 | 1287 | ||
1217 | static void record_last_oom(struct mem_cgroup *mem) | 1288 | static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
1289 | { | ||
1290 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); | ||
1291 | } | ||
1292 | |||
1293 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
1294 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | ||
1295 | |||
1296 | /* | ||
1297 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | ||
1298 | */ | ||
1299 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | ||
1218 | { | 1300 | { |
1219 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | 1301 | DEFINE_WAIT(wait); |
1302 | bool locked; | ||
1303 | |||
1304 | /* At first, try to OOM lock hierarchy under mem.*/ | ||
1305 | mutex_lock(&memcg_oom_mutex); | ||
1306 | locked = mem_cgroup_oom_lock(mem); | ||
1307 | /* | ||
1308 | * Even if signal_pending(), we can't quit charge() loop without | ||
1309 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | ||
1310 | * under OOM is always welcomed, use TASK_KILLABLE here. | ||
1311 | */ | ||
1312 | if (!locked) | ||
1313 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | ||
1314 | mutex_unlock(&memcg_oom_mutex); | ||
1315 | |||
1316 | if (locked) | ||
1317 | mem_cgroup_out_of_memory(mem, mask); | ||
1318 | else { | ||
1319 | schedule(); | ||
1320 | finish_wait(&memcg_oom_waitq, &wait); | ||
1321 | } | ||
1322 | mutex_lock(&memcg_oom_mutex); | ||
1323 | mem_cgroup_oom_unlock(mem); | ||
1324 | /* | ||
1325 | * Here, we use a global waitq ..... would a more fine-grained waitq be better? ||
1326 | * Assume the following hierarchy. ||
1327 | * A/ ||
1328 | * 01 ||
1329 | * 02 ||
1330 | * assume OOM happens in both A and 01 at the same time. They are ||
1331 | * mutually exclusive by the lock. (a kill in 01 helps A.) ||
1332 | * When we use a per-memcg waitq, we have to wake up waiters on A and 02 ||
1333 | * in addition to waiters on 01. We use a global waitq to avoid the mess. ||
1334 | * It will not be a big problem. | ||
1335 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
1336 | */ | ||
1337 | wake_up_all(&memcg_oom_waitq); | ||
1338 | mutex_unlock(&memcg_oom_mutex); | ||
1339 | |||
1340 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
1341 | return false; | ||
1342 | /* Give chance to dying process */ | ||
1343 | schedule_timeout(1); | ||
1344 | return true; | ||
1220 | } | 1345 | } |
1221 | 1346 | ||
1222 | /* | 1347 | /* |
1223 | * Currently used to update mapped file statistics, but the routine can be | 1348 | * Currently used to update mapped file statistics, but the routine can be |
1224 | * generalized to update other statistics as well. | 1349 | * generalized to update other statistics as well. |
1225 | */ | 1350 | */ |
1226 | void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | 1351 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
1227 | { | 1352 | { |
1228 | struct mem_cgroup *mem; | 1353 | struct mem_cgroup *mem; |
1229 | struct mem_cgroup_stat *stat; | ||
1230 | struct mem_cgroup_stat_cpu *cpustat; | ||
1231 | int cpu; | ||
1232 | struct page_cgroup *pc; | 1354 | struct page_cgroup *pc; |
1233 | 1355 | ||
1234 | if (!page_is_file_cache(page)) | ||
1235 | return; | ||
1236 | |||
1237 | pc = lookup_page_cgroup(page); | 1356 | pc = lookup_page_cgroup(page); |
1238 | if (unlikely(!pc)) | 1357 | if (unlikely(!pc)) |
1239 | return; | 1358 | return; |
1240 | 1359 | ||
1241 | lock_page_cgroup(pc); | 1360 | lock_page_cgroup(pc); |
1242 | mem = pc->mem_cgroup; | 1361 | mem = pc->mem_cgroup; |
1243 | if (!mem) | 1362 | if (!mem || !PageCgroupUsed(pc)) |
1244 | goto done; | ||
1245 | |||
1246 | if (!PageCgroupUsed(pc)) | ||
1247 | goto done; | 1363 | goto done; |
1248 | 1364 | ||
1249 | /* | 1365 | /* |
1250 | * Preemption is already disabled, we don't need get_cpu() | 1366 | * Preemption is already disabled. We can use __this_cpu_xxx |
1251 | */ | 1367 | */ |
1252 | cpu = smp_processor_id(); | 1368 | if (val > 0) { |
1253 | stat = &mem->stat; | 1369 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1254 | cpustat = &stat->cpustat[cpu]; | 1370 | SetPageCgroupFileMapped(pc); |
1371 | } else { | ||
1372 | __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | ||
1373 | ClearPageCgroupFileMapped(pc); | ||
1374 | } | ||
1255 | 1375 | ||
1256 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); | ||
1257 | done: | 1376 | done: |
1258 | unlock_page_cgroup(pc); | 1377 | unlock_page_cgroup(pc); |
1259 | } | 1378 | } |
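The hook above expects a positive val when a file page gains a mapping and a negative one when it loses it; the new PageCgroupFileMapped flag remembers which state the page_cgroup is in so that a later move_account() can transfer the counter to the right group. A sketch of the expected calling convention (the wrapper names below are hypothetical, not the actual rmap call sites):

```c
/* Illustrative wrappers only -- the real callers live in the rmap code. */
static void note_file_page_mapped(struct page *page)
{
	mem_cgroup_update_file_mapped(page, 1);		/* page gained a mapping */
}

static void note_file_page_unmapped(struct page *page)
{
	mem_cgroup_update_file_mapped(page, -1);	/* mapping went away */
}
```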
1260 | 1379 | ||
1261 | /* | 1380 | /* |
1381 | * size of first charge trial. "32" comes from vmscan.c's magic value. | ||
1382 | * TODO: it may be necessary to use bigger numbers on big iron. ||
1383 | */ | ||
1384 | #define CHARGE_SIZE (32 * PAGE_SIZE) | ||
1385 | struct memcg_stock_pcp { | ||
1386 | struct mem_cgroup *cached; /* this is never the root cgroup */ ||
1387 | int charge; | ||
1388 | struct work_struct work; | ||
1389 | }; | ||
1390 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | ||
1391 | static atomic_t memcg_drain_count; | ||
1392 | |||
1393 | /* | ||
1394 | * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is consumed ||
1395 | * from the local stock and true is returned. If the stock is empty or holds ||
1396 | * charges from a cgroup other than the current target, false is returned and ||
1397 | * the stock will be refilled. ||
1398 | */ | ||
1399 | static bool consume_stock(struct mem_cgroup *mem) | ||
1400 | { | ||
1401 | struct memcg_stock_pcp *stock; | ||
1402 | bool ret = true; | ||
1403 | |||
1404 | stock = &get_cpu_var(memcg_stock); | ||
1405 | if (mem == stock->cached && stock->charge) | ||
1406 | stock->charge -= PAGE_SIZE; | ||
1407 | else /* need to call res_counter_charge */ | ||
1408 | ret = false; | ||
1409 | put_cpu_var(memcg_stock); | ||
1410 | return ret; | ||
1411 | } | ||
1412 | |||
1413 | /* | ||
1414 | * Returns stocks cached in percpu to the res_counter and resets cached information. ||
1415 | */ | ||
1416 | static void drain_stock(struct memcg_stock_pcp *stock) | ||
1417 | { | ||
1418 | struct mem_cgroup *old = stock->cached; | ||
1419 | |||
1420 | if (stock->charge) { | ||
1421 | res_counter_uncharge(&old->res, stock->charge); | ||
1422 | if (do_swap_account) | ||
1423 | res_counter_uncharge(&old->memsw, stock->charge); | ||
1424 | } | ||
1425 | stock->cached = NULL; | ||
1426 | stock->charge = 0; | ||
1427 | } | ||
1428 | |||
1429 | /* | ||
1430 | * This must be called with preemption disabled or by a thread which is ||
1431 | * pinned to the local cpu. ||
1432 | */ | ||
1433 | static void drain_local_stock(struct work_struct *dummy) | ||
1434 | { | ||
1435 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | ||
1436 | drain_stock(stock); | ||
1437 | } | ||
1438 | |||
1439 | /* | ||
1440 | * Cache charges(val), which come from the res_counter, in the local per_cpu area. ||
1441 | * They will be consumed by the consume_stock() function, later. ||
1442 | */ | ||
1443 | static void refill_stock(struct mem_cgroup *mem, int val) | ||
1444 | { | ||
1445 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | ||
1446 | |||
1447 | if (stock->cached != mem) { /* reset if necessary */ | ||
1448 | drain_stock(stock); | ||
1449 | stock->cached = mem; | ||
1450 | } | ||
1451 | stock->charge += val; | ||
1452 | put_cpu_var(memcg_stock); | ||
1453 | } | ||
1454 | |||
1455 | /* | ||
1456 | * Tries to drain stocked charges in other cpus. This function is asynchronous | ||
1457 | * and just puts a work item per cpu for draining locally on each cpu. The caller ||
1458 | * can expect some charges to come back to the res_counter later but cannot wait for ||
1459 | * it. | ||
1460 | */ | ||
1461 | static void drain_all_stock_async(void) | ||
1462 | { | ||
1463 | int cpu; | ||
1464 | /* This function is for scheduling "drain" in an asynchronous way. ||
1465 | * The result of "drain" is not directly handled by callers. Then, ||
1466 | * if someone is already calling drain, we don't have to call drain again. ||
1467 | * Anyway, the WORK_STRUCT_PENDING check in queue_work_on() will catch it if ||
1468 | * there is a race. We just do a loose check here. ||
1469 | */ | ||
1470 | if (atomic_read(&memcg_drain_count)) | ||
1471 | return; | ||
1472 | /* Notify other cpus that system-wide "drain" is running */ | ||
1473 | atomic_inc(&memcg_drain_count); | ||
1474 | get_online_cpus(); | ||
1475 | for_each_online_cpu(cpu) { | ||
1476 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
1477 | schedule_work_on(cpu, &stock->work); | ||
1478 | } | ||
1479 | put_online_cpus(); | ||
1480 | atomic_dec(&memcg_drain_count); | ||
1481 | /* We don't wait for flush_work */ | ||
1482 | } | ||
1483 | |||
1484 | /* This is a synchronous drain interface. */ | ||
1485 | static void drain_all_stock_sync(void) | ||
1486 | { | ||
1487 | /* called when force_empty is called */ | ||
1488 | atomic_inc(&memcg_drain_count); | ||
1489 | schedule_on_each_cpu(drain_local_stock); | ||
1490 | atomic_dec(&memcg_drain_count); | ||
1491 | } | ||
1492 | |||
1493 | static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | ||
1494 | unsigned long action, | ||
1495 | void *hcpu) | ||
1496 | { | ||
1497 | int cpu = (unsigned long)hcpu; | ||
1498 | struct memcg_stock_pcp *stock; | ||
1499 | |||
1500 | if (action != CPU_DEAD) | ||
1501 | return NOTIFY_OK; | ||
1502 | stock = &per_cpu(memcg_stock, cpu); | ||
1503 | drain_stock(stock); | ||
1504 | return NOTIFY_OK; | ||
1505 | } | ||
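The point of the percpu stock above is to amortize res_counter locking: one CHARGE_SIZE (32-page) charge covers up to 32 subsequent single-page charges for the same cgroup on the same CPU. A self-contained userspace model of that accounting (single "cpu", single cgroup; the names are assumptions, not kernel code):

```c
#include <stdio.h>

#define PAGE_SIZE	4096
#define CHARGE_SIZE	(32 * PAGE_SIZE)

static int stock_charge;	/* bytes cached on this "cpu" */
static int res_counter_calls;	/* how often the shared counter is hit */

static void charge_one_page(void)
{
	if (stock_charge) {			/* consume_stock() fast path */
		stock_charge -= PAGE_SIZE;
		return;
	}
	res_counter_calls++;			/* slow path: hit the res_counter */
	stock_charge = CHARGE_SIZE - PAGE_SIZE;	/* refill_stock() with the rest */
}

int main(void)
{
	int i;

	for (i = 0; i < 1024; i++)
		charge_one_page();

	/* 1024 page charges, but only 1024/32 = 32 res_counter round trips */
	printf("res_counter calls: %d\n", res_counter_calls);
	return 0;
}
```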
1506 | |||
1507 | /* | ||
1262 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1508 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
1263 | * oom-killer can be invoked. | 1509 | * oom-killer can be invoked. |
1264 | */ | 1510 | */ |
1265 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1511 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1266 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1512 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
1267 | bool oom, struct page *page) | ||
1268 | { | 1513 | { |
1269 | struct mem_cgroup *mem, *mem_over_limit; | 1514 | struct mem_cgroup *mem, *mem_over_limit; |
1270 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1515 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1271 | struct res_counter *fail_res; | 1516 | struct res_counter *fail_res; |
1517 | int csize = CHARGE_SIZE; | ||
1272 | 1518 | ||
1273 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1519 | /* |
1274 | /* Don't account this! */ | 1520 | * Unlike the global VM's OOM-kill, we're not under a system-level |
1275 | *memcg = NULL; | 1521 | * memory shortage. So, allow a dying process to go ahead, in addition |
1276 | return 0; | 1522 | * to MEMDIE processes. |
1277 | } | 1523 | */ |
1524 | if (unlikely(test_thread_flag(TIF_MEMDIE) | ||
1525 | || fatal_signal_pending(current))) | ||
1526 | goto bypass; | ||
1278 | 1527 | ||
1279 | /* | 1528 | /* |
1280 | * We always charge the cgroup the mm_struct belongs to. | 1529 | * We always charge the cgroup the mm_struct belongs to. |
@@ -1293,23 +1542,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1293 | return 0; | 1542 | return 0; |
1294 | 1543 | ||
1295 | VM_BUG_ON(css_is_removed(&mem->css)); | 1544 | VM_BUG_ON(css_is_removed(&mem->css)); |
1545 | if (mem_cgroup_is_root(mem)) | ||
1546 | goto done; | ||
1296 | 1547 | ||
1297 | while (1) { | 1548 | while (1) { |
1298 | int ret = 0; | 1549 | int ret = 0; |
1299 | unsigned long flags = 0; | 1550 | unsigned long flags = 0; |
1300 | 1551 | ||
1301 | if (mem_cgroup_is_root(mem)) | 1552 | if (consume_stock(mem)) |
1302 | goto done; | 1553 | goto done; |
1303 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1554 | |
1555 | ret = res_counter_charge(&mem->res, csize, &fail_res); | ||
1304 | if (likely(!ret)) { | 1556 | if (likely(!ret)) { |
1305 | if (!do_swap_account) | 1557 | if (!do_swap_account) |
1306 | break; | 1558 | break; |
1307 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1559 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); |
1308 | &fail_res); | ||
1309 | if (likely(!ret)) | 1560 | if (likely(!ret)) |
1310 | break; | 1561 | break; |
1311 | /* mem+swap counter fails */ | 1562 | /* mem+swap counter fails */ |
1312 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1563 | res_counter_uncharge(&mem->res, csize); |
1313 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1564 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1314 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1565 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1315 | memsw); | 1566 | memsw); |
@@ -1318,6 +1569,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1318 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1569 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1319 | res); | 1570 | res); |
1320 | 1571 | ||
1572 | /* reduce request size and retry */ | ||
1573 | if (csize > PAGE_SIZE) { | ||
1574 | csize = PAGE_SIZE; | ||
1575 | continue; | ||
1576 | } | ||
1321 | if (!(gfp_mask & __GFP_WAIT)) | 1577 | if (!(gfp_mask & __GFP_WAIT)) |
1322 | goto nomem; | 1578 | goto nomem; |
1323 | 1579 | ||
@@ -1337,27 +1593,94 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1337 | if (mem_cgroup_check_under_limit(mem_over_limit)) | 1593 | if (mem_cgroup_check_under_limit(mem_over_limit)) |
1338 | continue; | 1594 | continue; |
1339 | 1595 | ||
1596 | /* try to avoid oom while someone is moving charge */ | ||
1597 | if (mc.moving_task && current != mc.moving_task) { | ||
1598 | struct mem_cgroup *from, *to; | ||
1599 | bool do_continue = false; | ||
1600 | /* | ||
1601 | * There is a small race that "from" or "to" can be | ||
1602 | * freed by rmdir, so we use css_tryget(). | ||
1603 | */ | ||
1604 | rcu_read_lock(); | ||
1605 | from = mc.from; | ||
1606 | to = mc.to; | ||
1607 | if (from && css_tryget(&from->css)) { | ||
1608 | if (mem_over_limit->use_hierarchy) | ||
1609 | do_continue = css_is_ancestor( | ||
1610 | &from->css, | ||
1611 | &mem_over_limit->css); | ||
1612 | else | ||
1613 | do_continue = (from == mem_over_limit); | ||
1614 | css_put(&from->css); | ||
1615 | } | ||
1616 | if (!do_continue && to && css_tryget(&to->css)) { | ||
1617 | if (mem_over_limit->use_hierarchy) | ||
1618 | do_continue = css_is_ancestor( | ||
1619 | &to->css, | ||
1620 | &mem_over_limit->css); | ||
1621 | else | ||
1622 | do_continue = (to == mem_over_limit); | ||
1623 | css_put(&to->css); | ||
1624 | } | ||
1625 | rcu_read_unlock(); | ||
1626 | if (do_continue) { | ||
1627 | DEFINE_WAIT(wait); | ||
1628 | prepare_to_wait(&mc.waitq, &wait, | ||
1629 | TASK_INTERRUPTIBLE); | ||
1630 | /* moving charge context might have finished. */ | ||
1631 | if (mc.moving_task) | ||
1632 | schedule(); | ||
1633 | finish_wait(&mc.waitq, &wait); | ||
1634 | continue; | ||
1635 | } | ||
1636 | } | ||
1637 | |||
1340 | if (!nr_retries--) { | 1638 | if (!nr_retries--) { |
1341 | if (oom) { | 1639 | if (!oom) |
1342 | mutex_lock(&memcg_tasklist); | 1640 | goto nomem; |
1343 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1641 | if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { |
1344 | mutex_unlock(&memcg_tasklist); | 1642 | nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1345 | record_last_oom(mem_over_limit); | 1643 | continue; |
1346 | } | 1644 | } |
1347 | goto nomem; | 1645 | /* When we reach here, the current task is dying. */ |
1646 | css_put(&mem->css); | ||
1647 | goto bypass; | ||
1348 | } | 1648 | } |
1349 | } | 1649 | } |
1350 | /* | 1650 | if (csize > PAGE_SIZE) |
1351 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 1651 | refill_stock(mem, csize - PAGE_SIZE); |
1352 | * if they exceeds softlimit. | ||
1353 | */ | ||
1354 | if (mem_cgroup_soft_limit_check(mem)) | ||
1355 | mem_cgroup_update_tree(mem, page); | ||
1356 | done: | 1652 | done: |
1357 | return 0; | 1653 | return 0; |
1358 | nomem: | 1654 | nomem: |
1359 | css_put(&mem->css); | 1655 | css_put(&mem->css); |
1360 | return -ENOMEM; | 1656 | return -ENOMEM; |
1657 | bypass: | ||
1658 | *memcg = NULL; | ||
1659 | return 0; | ||
1660 | } | ||
1661 | |||
1662 | /* | ||
1663 | * Sometimes we have to undo a charge we got by try_charge(). ||
1664 | * This function is for that: it does the uncharge and puts the css refcnt ||
1665 | * gotten by try_charge(). ||
1666 | */ | ||
1667 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | ||
1668 | unsigned long count) | ||
1669 | { | ||
1670 | if (!mem_cgroup_is_root(mem)) { | ||
1671 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | ||
1672 | if (do_swap_account) | ||
1673 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); | ||
1674 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
1675 | WARN_ON_ONCE(count > INT_MAX); | ||
1676 | __css_put(&mem->css, (int)count); | ||
1677 | } | ||
1678 | /* we don't need css_put for root */ | ||
1679 | } | ||
1680 | |||
1681 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
1682 | { | ||
1683 | __mem_cgroup_cancel_charge(mem, 1); | ||
1361 | } | 1684 | } |
1362 | 1685 | ||
1363 | /* | 1686 | /* |
@@ -1379,25 +1702,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
1379 | return container_of(css, struct mem_cgroup, css); | 1702 | return container_of(css, struct mem_cgroup, css); |
1380 | } | 1703 | } |
1381 | 1704 | ||
1382 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | 1705 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
1383 | { | 1706 | { |
1384 | struct mem_cgroup *mem; | 1707 | struct mem_cgroup *mem = NULL; |
1385 | struct page_cgroup *pc; | 1708 | struct page_cgroup *pc; |
1386 | unsigned short id; | 1709 | unsigned short id; |
1387 | swp_entry_t ent; | 1710 | swp_entry_t ent; |
1388 | 1711 | ||
1389 | VM_BUG_ON(!PageLocked(page)); | 1712 | VM_BUG_ON(!PageLocked(page)); |
1390 | 1713 | ||
1391 | if (!PageSwapCache(page)) | ||
1392 | return NULL; | ||
1393 | |||
1394 | pc = lookup_page_cgroup(page); | 1714 | pc = lookup_page_cgroup(page); |
1395 | lock_page_cgroup(pc); | 1715 | lock_page_cgroup(pc); |
1396 | if (PageCgroupUsed(pc)) { | 1716 | if (PageCgroupUsed(pc)) { |
1397 | mem = pc->mem_cgroup; | 1717 | mem = pc->mem_cgroup; |
1398 | if (mem && !css_tryget(&mem->css)) | 1718 | if (mem && !css_tryget(&mem->css)) |
1399 | mem = NULL; | 1719 | mem = NULL; |
1400 | } else { | 1720 | } else if (PageSwapCache(page)) { |
1401 | ent.val = page_private(page); | 1721 | ent.val = page_private(page); |
1402 | id = lookup_swap_cgroup(ent); | 1722 | id = lookup_swap_cgroup(ent); |
1403 | rcu_read_lock(); | 1723 | rcu_read_lock(); |
@@ -1426,12 +1746,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1426 | lock_page_cgroup(pc); | 1746 | lock_page_cgroup(pc); |
1427 | if (unlikely(PageCgroupUsed(pc))) { | 1747 | if (unlikely(PageCgroupUsed(pc))) { |
1428 | unlock_page_cgroup(pc); | 1748 | unlock_page_cgroup(pc); |
1429 | if (!mem_cgroup_is_root(mem)) { | 1749 | mem_cgroup_cancel_charge(mem); |
1430 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1431 | if (do_swap_account) | ||
1432 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1433 | } | ||
1434 | css_put(&mem->css); | ||
1435 | return; | 1750 | return; |
1436 | } | 1751 | } |
1437 | 1752 | ||
@@ -1461,88 +1776,83 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1461 | mem_cgroup_charge_statistics(mem, pc, true); | 1776 | mem_cgroup_charge_statistics(mem, pc, true); |
1462 | 1777 | ||
1463 | unlock_page_cgroup(pc); | 1778 | unlock_page_cgroup(pc); |
1779 | /* | ||
1780 | * "charge_statistics" updated event counter. Then, check it. | ||
1781 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
1782 | * if they exceeds softlimit. | ||
1783 | */ | ||
1784 | memcg_check_events(mem, pc->page); | ||
1464 | } | 1785 | } |
1465 | 1786 | ||
1466 | /** | 1787 | /** |
1467 | * mem_cgroup_move_account - move account of the page | 1788 | * __mem_cgroup_move_account - move account of the page |
1468 | * @pc: page_cgroup of the page. | 1789 | * @pc: page_cgroup of the page. |
1469 | * @from: mem_cgroup which the page is moved from. | 1790 | * @from: mem_cgroup which the page is moved from. |
1470 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1791 | * @to: mem_cgroup which the page is moved to. @from != @to. |
1792 | * @uncharge: whether we should call uncharge and css_put against @from. | ||
1471 | * | 1793 | * |
1472 | * The caller must confirm following. | 1794 | * The caller must confirm following. |
1473 | * - page is not on LRU (isolate_page() is useful.) | 1795 | * - page is not on LRU (isolate_page() is useful.) |
1796 | * - the pc is locked, used, and ->mem_cgroup points to @from. | ||
1474 | * | 1797 | * |
1475 | * returns 0 at success, | 1798 | * This function doesn't do "charge" or css_get to the new cgroup. It should be |
1476 | * returns -EBUSY when lock is busy or "pc" is unstable. | 1799 | * done by the caller (__mem_cgroup_try_charge would be useful). If @uncharge is |
1477 | * | 1800 | * true, this function does "uncharge" from old cgroup, but it doesn't if |
1478 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1801 | * @uncharge is false, so a caller should do "uncharge". |
1479 | * new cgroup. It should be done by a caller. | ||
1480 | */ | 1802 | */ |
1481 | 1803 | ||
1482 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1804 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
1483 | struct mem_cgroup *from, struct mem_cgroup *to) | 1805 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
1484 | { | 1806 | { |
1485 | struct mem_cgroup_per_zone *from_mz, *to_mz; | ||
1486 | int nid, zid; | ||
1487 | int ret = -EBUSY; | ||
1488 | struct page *page; | ||
1489 | int cpu; | ||
1490 | struct mem_cgroup_stat *stat; | ||
1491 | struct mem_cgroup_stat_cpu *cpustat; | ||
1492 | |||
1493 | VM_BUG_ON(from == to); | 1807 | VM_BUG_ON(from == to); |
1494 | VM_BUG_ON(PageLRU(pc->page)); | 1808 | VM_BUG_ON(PageLRU(pc->page)); |
1495 | 1809 | VM_BUG_ON(!PageCgroupLocked(pc)); | |
1496 | nid = page_cgroup_nid(pc); | 1810 | VM_BUG_ON(!PageCgroupUsed(pc)); |
1497 | zid = page_cgroup_zid(pc); | 1811 | VM_BUG_ON(pc->mem_cgroup != from); |
1498 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | 1812 | |
1499 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | 1813 | if (PageCgroupFileMapped(pc)) { |
1500 | 1814 | /* Update mapped_file data for mem_cgroup */ | |
1501 | if (!trylock_page_cgroup(pc)) | 1815 | preempt_disable(); |
1502 | return ret; | 1816 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1503 | 1817 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | |
1504 | if (!PageCgroupUsed(pc)) | 1818 | preempt_enable(); |
1505 | goto out; | ||
1506 | |||
1507 | if (pc->mem_cgroup != from) | ||
1508 | goto out; | ||
1509 | |||
1510 | if (!mem_cgroup_is_root(from)) | ||
1511 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
1512 | mem_cgroup_charge_statistics(from, pc, false); | ||
1513 | |||
1514 | page = pc->page; | ||
1515 | if (page_is_file_cache(page) && page_mapped(page)) { | ||
1516 | cpu = smp_processor_id(); | ||
1517 | /* Update mapped_file data for mem_cgroup "from" */ | ||
1518 | stat = &from->stat; | ||
1519 | cpustat = &stat->cpustat[cpu]; | ||
1520 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | ||
1521 | -1); | ||
1522 | |||
1523 | /* Update mapped_file data for mem_cgroup "to" */ | ||
1524 | stat = &to->stat; | ||
1525 | cpustat = &stat->cpustat[cpu]; | ||
1526 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | ||
1527 | 1); | ||
1528 | } | 1819 | } |
1820 | mem_cgroup_charge_statistics(from, pc, false); | ||
1821 | if (uncharge) | ||
1822 | /* This is not "cancel", but cancel_charge does all we need. */ | ||
1823 | mem_cgroup_cancel_charge(from); | ||
1529 | 1824 | ||
1530 | if (do_swap_account && !mem_cgroup_is_root(from)) | 1825 | /* caller should have done css_get */ |
1531 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
1532 | css_put(&from->css); | ||
1533 | |||
1534 | css_get(&to->css); | ||
1535 | pc->mem_cgroup = to; | 1826 | pc->mem_cgroup = to; |
1536 | mem_cgroup_charge_statistics(to, pc, true); | 1827 | mem_cgroup_charge_statistics(to, pc, true); |
1537 | ret = 0; | ||
1538 | out: | ||
1539 | unlock_page_cgroup(pc); | ||
1540 | /* | 1828 | /* |
1541 | * We charge against "to" which may not have any tasks. Then, "to" | 1829 | * We charge against "to" which may not have any tasks. Then, "to" |
1542 | * can be under rmdir(). But in the current implementation, the caller of | 1830 | * can be under rmdir(). But in the current implementation, the caller of |
1543 | * this function is just force_empty() and it's guaranteed that | 1831 | * this function is just force_empty() and move charge, so it's |
1544 | * "to" is never removed. So, we don't check rmdir status here. | 1832 | * guaranteed that "to" is never removed. So, we don't check rmdir |
1833 | * status here. | ||
1834 | */ | ||
1835 | } | ||
1836 | |||
1837 | /* | ||
1838 | * check whether the @pc is valid for moving account and call | ||
1839 | * __mem_cgroup_move_account() | ||
1840 | */ | ||
1841 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
1842 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) | ||
1843 | { | ||
1844 | int ret = -EINVAL; | ||
1845 | lock_page_cgroup(pc); | ||
1846 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | ||
1847 | __mem_cgroup_move_account(pc, from, to, uncharge); | ||
1848 | ret = 0; | ||
1849 | } | ||
1850 | unlock_page_cgroup(pc); | ||
1851 | /* | ||
1852 | * check events | ||
1545 | */ | 1853 | */ |
1854 | memcg_check_events(to, pc->page); | ||
1855 | memcg_check_events(from, pc->page); | ||
1546 | return ret; | 1856 | return ret; |
1547 | } | 1857 | } |
1548 | 1858 | ||
@@ -1564,45 +1874,25 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1564 | if (!pcg) | 1874 | if (!pcg) |
1565 | return -EINVAL; | 1875 | return -EINVAL; |
1566 | 1876 | ||
1877 | ret = -EBUSY; | ||
1878 | if (!get_page_unless_zero(page)) | ||
1879 | goto out; | ||
1880 | if (isolate_lru_page(page)) | ||
1881 | goto put; | ||
1567 | 1882 | ||
1568 | parent = mem_cgroup_from_cont(pcg); | 1883 | parent = mem_cgroup_from_cont(pcg); |
1569 | 1884 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | |
1570 | |||
1571 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | ||
1572 | if (ret || !parent) | 1885 | if (ret || !parent) |
1573 | return ret; | 1886 | goto put_back; |
1574 | |||
1575 | if (!get_page_unless_zero(page)) { | ||
1576 | ret = -EBUSY; | ||
1577 | goto uncharge; | ||
1578 | } | ||
1579 | |||
1580 | ret = isolate_lru_page(page); | ||
1581 | 1887 | ||
1888 | ret = mem_cgroup_move_account(pc, child, parent, true); | ||
1582 | if (ret) | 1889 | if (ret) |
1583 | goto cancel; | 1890 | mem_cgroup_cancel_charge(parent); |
1584 | 1891 | put_back: | |
1585 | ret = mem_cgroup_move_account(pc, child, parent); | ||
1586 | |||
1587 | putback_lru_page(page); | 1892 | putback_lru_page(page); |
1588 | if (!ret) { | 1893 | put: |
1589 | put_page(page); | ||
1590 | /* drop extra refcnt by try_charge() */ | ||
1591 | css_put(&parent->css); | ||
1592 | return 0; | ||
1593 | } | ||
1594 | |||
1595 | cancel: | ||
1596 | put_page(page); | 1894 | put_page(page); |
1597 | uncharge: | 1895 | out: |
1598 | /* drop extra refcnt by try_charge() */ | ||
1599 | css_put(&parent->css); | ||
1600 | /* uncharge if move fails */ | ||
1601 | if (!mem_cgroup_is_root(parent)) { | ||
1602 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
1603 | if (do_swap_account) | ||
1604 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
1605 | } | ||
1606 | return ret; | 1896 | return ret; |
1607 | } | 1897 | } |
1608 | 1898 | ||
@@ -1627,7 +1917,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
1627 | prefetchw(pc); | 1917 | prefetchw(pc); |
1628 | 1918 | ||
1629 | mem = memcg; | 1919 | mem = memcg; |
1630 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); | 1920 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
1631 | if (ret || !mem) | 1921 | if (ret || !mem) |
1632 | return ret; | 1922 | return ret; |
1633 | 1923 | ||
@@ -1720,7 +2010,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
1720 | /* | 2010 | /* |
1721 | * While swap-in, try_charge -> commit or cancel, the page is locked. | 2011 | * While swap-in, try_charge -> commit or cancel, the page is locked. |
1722 | * And when try_charge() successfully returns, one refcnt to memcg without | 2012 | * And when try_charge() successfully returns, one refcnt to memcg without |
1723 | * struct page_cgroup is aquired. This refcnt will be cumsumed by | 2013 | * struct page_cgroup is acquired. This refcnt will be consumed by |
1724 | * "commit()" or removed by "cancel()" | 2014 | * "commit()" or removed by "cancel()" |
1725 | */ | 2015 | */ |
1726 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2016 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
@@ -1737,23 +2027,24 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1737 | goto charge_cur_mm; | 2027 | goto charge_cur_mm; |
1738 | /* | 2028 | /* |
1739 | * A racing thread's fault, or swapoff, may have already updated | 2029 | * A racing thread's fault, or swapoff, may have already updated |
1740 | * the pte, and even removed page from swap cache: return success | 2030 | * the pte, and even removed page from swap cache: in those cases |
1741 | * to go on to do_swap_page()'s pte_same() test, which should fail. | 2031 | * do_swap_page()'s pte_same() test will fail; but there's also a |
2032 | * KSM case which does need to charge the page. | ||
1742 | */ | 2033 | */ |
1743 | if (!PageSwapCache(page)) | 2034 | if (!PageSwapCache(page)) |
1744 | return 0; | 2035 | goto charge_cur_mm; |
1745 | mem = try_get_mem_cgroup_from_swapcache(page); | 2036 | mem = try_get_mem_cgroup_from_page(page); |
1746 | if (!mem) | 2037 | if (!mem) |
1747 | goto charge_cur_mm; | 2038 | goto charge_cur_mm; |
1748 | *ptr = mem; | 2039 | *ptr = mem; |
1749 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); | 2040 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); |
1750 | /* drop extra refcnt from tryget */ | 2041 | /* drop extra refcnt from tryget */ |
1751 | css_put(&mem->css); | 2042 | css_put(&mem->css); |
1752 | return ret; | 2043 | return ret; |
1753 | charge_cur_mm: | 2044 | charge_cur_mm: |
1754 | if (unlikely(!mm)) | 2045 | if (unlikely(!mm)) |
1755 | mm = &init_mm; | 2046 | mm = &init_mm; |
1756 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); | 2047 | return __mem_cgroup_try_charge(mm, mask, ptr, true); |
1757 | } | 2048 | } |
1758 | 2049 | ||
1759 | static void | 2050 | static void |
@@ -1818,14 +2109,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1818 | return; | 2109 | return; |
1819 | if (!mem) | 2110 | if (!mem) |
1820 | return; | 2111 | return; |
1821 | if (!mem_cgroup_is_root(mem)) { | 2112 | mem_cgroup_cancel_charge(mem); |
1822 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1823 | if (do_swap_account) | ||
1824 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1825 | } | ||
1826 | css_put(&mem->css); | ||
1827 | } | 2113 | } |
1828 | 2114 | ||
2115 | static void | ||
2116 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | ||
2117 | { | ||
2118 | struct memcg_batch_info *batch = NULL; | ||
2119 | bool uncharge_memsw = true; | ||
2120 | /* If swapout, usage of swap doesn't decrease */ | ||
2121 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
2122 | uncharge_memsw = false; | ||
2123 | /* | ||
2124 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2125 | * In those cases, all pages freed continuously can be expected to be in ||
2126 | * the same cgroup and we have chance to coalesce uncharges. | ||
2127 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
2128 | * because we want to do uncharge as soon as possible. | ||
2129 | */ | ||
2130 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2131 | goto direct_uncharge; | ||
2132 | |||
2133 | batch = ¤t->memcg_batch; | ||
2134 | /* | ||
2135 | * Usually, we do css_get() when we remember the memcg pointer. ||
2136 | * But in this case, we keep res->usage until end of a series of | ||
2137 | * uncharges. Then, it's ok to ignore memcg's refcnt. | ||
2138 | */ | ||
2139 | if (!batch->memcg) | ||
2140 | batch->memcg = mem; | ||
2141 | /* | ||
2142 | * In the typical case, batch->memcg == mem. This means we can ||
2143 | * merge a series of uncharges into one uncharge of the res_counter. ||
2144 | * If not, we uncharge the res_counter one by one. ||
2145 | */ | ||
2146 | if (batch->memcg != mem) | ||
2147 | goto direct_uncharge; | ||
2148 | /* remember freed charge and uncharge it later */ | ||
2149 | batch->bytes += PAGE_SIZE; | ||
2150 | if (uncharge_memsw) | ||
2151 | batch->memsw_bytes += PAGE_SIZE; | ||
2152 | return; | ||
2153 | direct_uncharge: | ||
2154 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
2155 | if (uncharge_memsw) | ||
2156 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
2157 | return; | ||
2158 | } | ||
1829 | 2159 | ||
1830 | /* | 2160 | /* |
1831 | * uncharge if !page_mapped(page) | 2161 | * uncharge if !page_mapped(page) |
@@ -1874,12 +2204,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1874 | break; | 2204 | break; |
1875 | } | 2205 | } |
1876 | 2206 | ||
1877 | if (!mem_cgroup_is_root(mem)) { | 2207 | if (!mem_cgroup_is_root(mem)) |
1878 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2208 | __do_uncharge(mem, ctype); |
1879 | if (do_swap_account && | ||
1880 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1881 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1882 | } | ||
1883 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2209 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1884 | mem_cgroup_swap_statistics(mem, true); | 2210 | mem_cgroup_swap_statistics(mem, true); |
1885 | mem_cgroup_charge_statistics(mem, pc, false); | 2211 | mem_cgroup_charge_statistics(mem, pc, false); |
@@ -1895,8 +2221,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1895 | mz = page_cgroup_zoneinfo(pc); | 2221 | mz = page_cgroup_zoneinfo(pc); |
1896 | unlock_page_cgroup(pc); | 2222 | unlock_page_cgroup(pc); |
1897 | 2223 | ||
1898 | if (mem_cgroup_soft_limit_check(mem)) | 2224 | memcg_check_events(mem, page); |
1899 | mem_cgroup_update_tree(mem, page); | ||
1900 | /* at swapout, this memcg will be accessed to record to swap */ | 2225 | /* at swapout, this memcg will be accessed to record to swap */ |
1901 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2226 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1902 | css_put(&mem->css); | 2227 | css_put(&mem->css); |
@@ -1925,6 +2250,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
1925 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 2250 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
1926 | } | 2251 | } |
1927 | 2252 | ||
2253 | /* | ||
2254 | * mem_cgroup_uncharge_start/end() are called from unmap_page_range, | ||
2255 | * invalidate and truncate. In those cases, pages are freed continuously | ||
2256 | * and can be expected to be in the same memcg. Each of these callers | ||
2257 | * itself limits the number of pages freed at once, so uncharge_start/end() | ||
2258 | * is called properly. The pair may be called more than once (nested) in one context. | ||
2259 | */ | ||
2260 | |||
2261 | void mem_cgroup_uncharge_start(void) | ||
2262 | { | ||
2263 | current->memcg_batch.do_batch++; | ||
2264 | /* We can do nest. */ | ||
2265 | if (current->memcg_batch.do_batch == 1) { | ||
2266 | current->memcg_batch.memcg = NULL; | ||
2267 | current->memcg_batch.bytes = 0; | ||
2268 | current->memcg_batch.memsw_bytes = 0; | ||
2269 | } | ||
2270 | } | ||
2271 | |||
2272 | void mem_cgroup_uncharge_end(void) | ||
2273 | { | ||
2274 | struct memcg_batch_info *batch = ¤t->memcg_batch; | ||
2275 | |||
2276 | if (!batch->do_batch) | ||
2277 | return; | ||
2278 | |||
2279 | batch->do_batch--; | ||
2280 | if (batch->do_batch) /* If stacked, do nothing. */ | ||
2281 | return; | ||
2282 | |||
2283 | if (!batch->memcg) | ||
2284 | return; | ||
2285 | /* | ||
2286 | * This "batch->memcg" is valid without any css_get/put etc... | ||
2287 | * because we hide charges behind us. | ||
2288 | */ | ||
2289 | if (batch->bytes) | ||
2290 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | ||
2291 | if (batch->memsw_bytes) | ||
2292 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | ||
2293 | /* forget this pointer (for sanity check) */ | ||
2294 | batch->memcg = NULL; | ||
2295 | } | ||
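
The start/end pair above only maintains a per-task nesting counter and per-batch bookkeeping; the coalescing itself happens in __do_uncharge(). As a rough illustration (not kernel code: a standalone userspace C model with a plain long standing in for the res_counter and printf() for verification), the following sketch mirrors the same idea, assuming all uncharges between start and end hit the same group:

#include <stdio.h>

struct group { const char *name; long usage; };

static struct {
	int do_batch;		/* nesting depth, like memcg_batch.do_batch */
	struct group *memcg;	/* group the open batch is tied to */
	long bytes;		/* coalesced bytes, flushed at _end() */
} batch;

static void uncharge_start(void)
{
	if (batch.do_batch++ == 0) {	/* outermost start resets the batch */
		batch.memcg = NULL;
		batch.bytes = 0;
	}
}

static void do_uncharge(struct group *g, long size)
{
	/* no open batch, or a different group: uncharge directly */
	if (!batch.do_batch || (batch.memcg && batch.memcg != g)) {
		g->usage -= size;
		return;
	}
	if (!batch.memcg)
		batch.memcg = g;	/* first uncharge picks the group */
	batch.bytes += size;		/* remember it, flush at _end() */
}

static void uncharge_end(void)
{
	if (--batch.do_batch)		/* nested: wait for the outermost end */
		return;
	if (batch.memcg)
		batch.memcg->usage -= batch.bytes;	/* one coalesced update */
	batch.memcg = NULL;
}

int main(void)
{
	struct group g = { "A", 4 * 4096 };
	int i;

	uncharge_start();
	for (i = 0; i < 3; i++)
		do_uncharge(&g, 4096);	/* three page uncharges, one flush */
	uncharge_end();
	printf("%s usage: %ld\n", g.name, g.usage);	/* prints "A usage: 4096" */
	return 0;
}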
2296 | |||
1928 | #ifdef CONFIG_SWAP | 2297 | #ifdef CONFIG_SWAP |
1929 | /* | 2298 | /* |
1930 | * called after __delete_from_swap_cache() and drop "page" account. | 2299 | * called after __delete_from_swap_cache() and drop "page" account. |
@@ -1979,6 +2348,64 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
1979 | } | 2348 | } |
1980 | rcu_read_unlock(); | 2349 | rcu_read_unlock(); |
1981 | } | 2350 | } |
2351 | |||
2352 | /** | ||
2353 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | ||
2354 | * @entry: swap entry to be moved | ||
2355 | * @from: mem_cgroup which the entry is moved from | ||
2356 | * @to: mem_cgroup which the entry is moved to | ||
2357 | * @need_fixup: whether we should fixup res_counters and refcounts. | ||
2358 | * | ||
2359 | * It succeeds only when the swap_cgroup's record for this entry is the same | ||
2360 | * as the mem_cgroup's id of @from. | ||
2361 | * | ||
2362 | * Returns 0 on success, -EINVAL on failure. | ||
2363 | * | ||
2364 | * The caller must have charged to @to, IOW, called res_counter_charge() on | ||
2365 | * both res and memsw, and called css_get(). | ||
2366 | */ | ||
2367 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2368 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2369 | { | ||
2370 | unsigned short old_id, new_id; | ||
2371 | |||
2372 | old_id = css_id(&from->css); | ||
2373 | new_id = css_id(&to->css); | ||
2374 | |||
2375 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | ||
2376 | mem_cgroup_swap_statistics(from, false); | ||
2377 | mem_cgroup_swap_statistics(to, true); | ||
2378 | /* | ||
2379 | * This function is only called from task migration context now. | ||
2380 | * It postpones res_counter and refcount handling till the end | ||
2381 | * of task migration (mem_cgroup_clear_mc()) for performance | ||
2382 | * improvement. But we cannot postpone mem_cgroup_get(to) | ||
2383 | * because if the process that has been moved to @to does | ||
2384 | * swap-in, the refcount of @to might be decreased to 0. | ||
2385 | */ | ||
2386 | mem_cgroup_get(to); | ||
2387 | if (need_fixup) { | ||
2388 | if (!mem_cgroup_is_root(from)) | ||
2389 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
2390 | mem_cgroup_put(from); | ||
2391 | /* | ||
2392 | * we charged both to->res and to->memsw, so we should | ||
2393 | * uncharge to->res. | ||
2394 | */ | ||
2395 | if (!mem_cgroup_is_root(to)) | ||
2396 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
2397 | css_put(&to->css); | ||
2398 | } | ||
2399 | return 0; | ||
2400 | } | ||
2401 | return -EINVAL; | ||
2402 | } | ||
2403 | #else | ||
2404 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2405 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2406 | { | ||
2407 | return -EINVAL; | ||
2408 | } | ||
1982 | #endif | 2409 | #endif |
1983 | 2410 | ||
1984 | /* | 2411 | /* |
@@ -2003,8 +2430,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
2003 | unlock_page_cgroup(pc); | 2430 | unlock_page_cgroup(pc); |
2004 | 2431 | ||
2005 | if (mem) { | 2432 | if (mem) { |
2006 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, | 2433 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); |
2007 | page); | ||
2008 | css_put(&mem->css); | 2434 | css_put(&mem->css); |
2009 | } | 2435 | } |
2010 | *ptr = mem; | 2436 | *ptr = mem; |
@@ -2100,7 +2526,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2100 | unsigned long long val) | 2526 | unsigned long long val) |
2101 | { | 2527 | { |
2102 | int retry_count; | 2528 | int retry_count; |
2103 | int progress; | ||
2104 | u64 memswlimit; | 2529 | u64 memswlimit; |
2105 | int ret = 0; | 2530 | int ret = 0; |
2106 | int children = mem_cgroup_count_children(memcg); | 2531 | int children = mem_cgroup_count_children(memcg); |
@@ -2144,8 +2569,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2144 | if (!ret) | 2569 | if (!ret) |
2145 | break; | 2570 | break; |
2146 | 2571 | ||
2147 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, | 2572 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
2148 | GFP_KERNEL, | ||
2149 | MEM_CGROUP_RECLAIM_SHRINK); | 2573 | MEM_CGROUP_RECLAIM_SHRINK); |
2150 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2574 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
2151 | /* Usage is reduced ? */ | 2575 | /* Usage is reduced ? */ |
@@ -2334,7 +2758,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
2334 | pc = list_entry(list->prev, struct page_cgroup, lru); | 2758 | pc = list_entry(list->prev, struct page_cgroup, lru); |
2335 | if (busy == pc) { | 2759 | if (busy == pc) { |
2336 | list_move(&pc->lru, list); | 2760 | list_move(&pc->lru, list); |
2337 | busy = 0; | 2761 | busy = NULL; |
2338 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 2762 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
2339 | continue; | 2763 | continue; |
2340 | } | 2764 | } |
@@ -2375,7 +2799,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) | |||
2375 | if (free_all) | 2799 | if (free_all) |
2376 | goto try_to_free; | 2800 | goto try_to_free; |
2377 | move_account: | 2801 | move_account: |
2378 | while (mem->res.usage > 0) { | 2802 | do { |
2379 | ret = -EBUSY; | 2803 | ret = -EBUSY; |
2380 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 2804 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
2381 | goto out; | 2805 | goto out; |
@@ -2384,6 +2808,7 @@ move_account: | |||
2384 | goto out; | 2808 | goto out; |
2385 | /* This is for making all *used* pages to be on LRU. */ | 2809 | /* This is for making all *used* pages to be on LRU. */ |
2386 | lru_add_drain_all(); | 2810 | lru_add_drain_all(); |
2811 | drain_all_stock_sync(); | ||
2387 | ret = 0; | 2812 | ret = 0; |
2388 | for_each_node_state(node, N_HIGH_MEMORY) { | 2813 | for_each_node_state(node, N_HIGH_MEMORY) { |
2389 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 2814 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
@@ -2402,8 +2827,8 @@ move_account: | |||
2402 | if (ret == -ENOMEM) | 2827 | if (ret == -ENOMEM) |
2403 | goto try_to_free; | 2828 | goto try_to_free; |
2404 | cond_resched(); | 2829 | cond_resched(); |
2405 | } | 2830 | /* "ret" should also be checked to ensure all lists are empty. */ |
2406 | ret = 0; | 2831 | } while (mem->res.usage > 0 || ret); |
2407 | out: | 2832 | out: |
2408 | css_put(&mem->css); | 2833 | css_put(&mem->css); |
2409 | return ret; | 2834 | return ret; |
@@ -2436,10 +2861,7 @@ try_to_free: | |||
2436 | } | 2861 | } |
2437 | lru_add_drain(); | 2862 | lru_add_drain(); |
2438 | /* try move_account...there may be some *locked* pages. */ | 2863 | /* try move_account...there may be some *locked* pages. */ |
2439 | if (mem->res.usage) | 2864 | goto move_account; |
2440 | goto move_account; | ||
2441 | ret = 0; | ||
2442 | goto out; | ||
2443 | } | 2865 | } |
2444 | 2866 | ||
2445 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 2867 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
@@ -2466,7 +2888,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
2466 | 2888 | ||
2467 | cgroup_lock(); | 2889 | cgroup_lock(); |
2468 | /* | 2890 | /* |
2469 | * If parent's use_hiearchy is set, we can't make any modifications | 2891 | * If parent's use_hierarchy is set, we can't make any modifications |
2470 | * in the child subtrees. If it is unset, then the change can | 2892 | * in the child subtrees. If it is unset, then the change can |
2471 | * occur, provided the current cgroup has no children. | 2893 | * occur, provided the current cgroup has no children. |
2472 | * | 2894 | * |
@@ -2495,7 +2917,7 @@ static int | |||
2495 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | 2917 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) |
2496 | { | 2918 | { |
2497 | struct mem_cgroup_idx_data *d = data; | 2919 | struct mem_cgroup_idx_data *d = data; |
2498 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | 2920 | d->val += mem_cgroup_read_stat(mem, d->idx); |
2499 | return 0; | 2921 | return 0; |
2500 | } | 2922 | } |
2501 | 2923 | ||
@@ -2510,39 +2932,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | |||
2510 | *val = d.val; | 2932 | *val = d.val; |
2511 | } | 2933 | } |
2512 | 2934 | ||
2935 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | ||
2936 | { | ||
2937 | u64 idx_val, val; | ||
2938 | |||
2939 | if (!mem_cgroup_is_root(mem)) { | ||
2940 | if (!swap) | ||
2941 | return res_counter_read_u64(&mem->res, RES_USAGE); | ||
2942 | else | ||
2943 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | ||
2944 | } | ||
2945 | |||
2946 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2947 | val = idx_val; | ||
2948 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); | ||
2949 | val += idx_val; | ||
2950 | |||
2951 | if (swap) { | ||
2952 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2953 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2954 | val += idx_val; | ||
2955 | } | ||
2956 | |||
2957 | return val << PAGE_SHIFT; | ||
2958 | } | ||
2959 | |||
2513 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2960 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
2514 | { | 2961 | { |
2515 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2962 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2516 | u64 idx_val, val; | 2963 | u64 val; |
2517 | int type, name; | 2964 | int type, name; |
2518 | 2965 | ||
2519 | type = MEMFILE_TYPE(cft->private); | 2966 | type = MEMFILE_TYPE(cft->private); |
2520 | name = MEMFILE_ATTR(cft->private); | 2967 | name = MEMFILE_ATTR(cft->private); |
2521 | switch (type) { | 2968 | switch (type) { |
2522 | case _MEM: | 2969 | case _MEM: |
2523 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2970 | if (name == RES_USAGE) |
2524 | mem_cgroup_get_recursive_idx_stat(mem, | 2971 | val = mem_cgroup_usage(mem, false); |
2525 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2972 | else |
2526 | val = idx_val; | ||
2527 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2528 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2529 | val += idx_val; | ||
2530 | val <<= PAGE_SHIFT; | ||
2531 | } else | ||
2532 | val = res_counter_read_u64(&mem->res, name); | 2973 | val = res_counter_read_u64(&mem->res, name); |
2533 | break; | 2974 | break; |
2534 | case _MEMSWAP: | 2975 | case _MEMSWAP: |
2535 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2976 | if (name == RES_USAGE) |
2536 | mem_cgroup_get_recursive_idx_stat(mem, | 2977 | val = mem_cgroup_usage(mem, true); |
2537 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2978 | else |
2538 | val = idx_val; | ||
2539 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2540 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2541 | val += idx_val; | ||
2542 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2543 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2544 | val <<= PAGE_SHIFT; | ||
2545 | } else | ||
2546 | val = res_counter_read_u64(&mem->memsw, name); | 2979 | val = res_counter_read_u64(&mem->memsw, name); |
2547 | break; | 2980 | break; |
2548 | default: | 2981 | default: |
@@ -2655,12 +3088,45 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2655 | return 0; | 3088 | return 0; |
2656 | } | 3089 | } |
2657 | 3090 | ||
3091 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | ||
3092 | struct cftype *cft) | ||
3093 | { | ||
3094 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | ||
3095 | } | ||
3096 | |||
3097 | #ifdef CONFIG_MMU | ||
3098 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3099 | struct cftype *cft, u64 val) | ||
3100 | { | ||
3101 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3102 | |||
3103 | if (val >= (1 << NR_MOVE_TYPE)) | ||
3104 | return -EINVAL; | ||
3105 | /* | ||
3106 | * We check this value several times in both can_attach() and | ||
3107 | * attach(), so we need the cgroup lock to prevent this value from being | ||
3108 | * inconsistent. | ||
3109 | */ | ||
3110 | cgroup_lock(); | ||
3111 | mem->move_charge_at_immigrate = val; | ||
3112 | cgroup_unlock(); | ||
3113 | |||
3114 | return 0; | ||
3115 | } | ||
3116 | #else | ||
3117 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3118 | struct cftype *cft, u64 val) | ||
3119 | { | ||
3120 | return -ENOSYS; | ||
3121 | } | ||
3122 | #endif | ||
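
For reference, a userspace sketch of how this knob is meant to be used: enable the anonymous-page bit in the destination group's memory.move_charge_at_immigrate and then write the task's pid to that group's tasks file, which is what triggers the can_attach()/attach() callbacks added later in this patch. The paths (/cgroup/memory/dst) and the cg_write() helper are illustrative assumptions, not part of the patch, and bit 0 is assumed to correspond to MOVE_CHARGE_TYPE_ANON:

#include <stdio.h>

/* Hypothetical helper: write one string to a cgroup control file. */
static int cg_write(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}
	/* bit 0 (MOVE_CHARGE_TYPE_ANON): move charges of mapped anon pages */
	if (cg_write("/cgroup/memory/dst/memory.move_charge_at_immigrate", "1"))
		return 1;
	/* attaching the task is what actually moves the charges */
	return cg_write("/cgroup/memory/dst/tasks", argv[1]) ? 1 : 0;
}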
3123 | |||
2658 | 3124 | ||
2659 | /* For read statistics */ | 3125 | /* For read statistics */ |
2660 | enum { | 3126 | enum { |
2661 | MCS_CACHE, | 3127 | MCS_CACHE, |
2662 | MCS_RSS, | 3128 | MCS_RSS, |
2663 | MCS_MAPPED_FILE, | 3129 | MCS_FILE_MAPPED, |
2664 | MCS_PGPGIN, | 3130 | MCS_PGPGIN, |
2665 | MCS_PGPGOUT, | 3131 | MCS_PGPGOUT, |
2666 | MCS_SWAP, | 3132 | MCS_SWAP, |
@@ -2700,18 +3166,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2700 | s64 val; | 3166 | s64 val; |
2701 | 3167 | ||
2702 | /* per cpu stat */ | 3168 | /* per cpu stat */ |
2703 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); | 3169 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
2704 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 3170 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
2705 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 3171 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
2706 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3172 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
2707 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); | 3173 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
2708 | s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; | 3174 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
2709 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 3175 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); |
2710 | s->stat[MCS_PGPGIN] += val; | 3176 | s->stat[MCS_PGPGIN] += val; |
2711 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 3177 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
2712 | s->stat[MCS_PGPGOUT] += val; | 3178 | s->stat[MCS_PGPGOUT] += val; |
2713 | if (do_swap_account) { | 3179 | if (do_swap_account) { |
2714 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | 3180 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
2715 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 3181 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
2716 | } | 3182 | } |
2717 | 3183 | ||
@@ -2839,12 +3305,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
2839 | return 0; | 3305 | return 0; |
2840 | } | 3306 | } |
2841 | 3307 | ||
3308 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | ||
3309 | { | ||
3310 | struct mem_cgroup_threshold_ary *t; | ||
3311 | u64 usage; | ||
3312 | int i; | ||
3313 | |||
3314 | rcu_read_lock(); | ||
3315 | if (!swap) | ||
3316 | t = rcu_dereference(memcg->thresholds); | ||
3317 | else | ||
3318 | t = rcu_dereference(memcg->memsw_thresholds); | ||
3319 | |||
3320 | if (!t) | ||
3321 | goto unlock; | ||
3322 | |||
3323 | usage = mem_cgroup_usage(memcg, swap); | ||
3324 | |||
3325 | /* | ||
3326 | * current_threshold points to the threshold just below usage. | ||
3327 | * If that is no longer true, a threshold was crossed after the last | ||
3328 | * call of __mem_cgroup_threshold(). | ||
3329 | */ | ||
3330 | i = atomic_read(&t->current_threshold); | ||
3331 | |||
3332 | /* | ||
3333 | * Iterate backward over array of thresholds starting from | ||
3334 | * current_threshold and check if a threshold is crossed. | ||
3335 | * If none of the thresholds below usage is crossed, we read | ||
3336 | * only one element of the array here. | ||
3337 | */ | ||
3338 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | ||
3339 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3340 | |||
3341 | /* i = current_threshold + 1 */ | ||
3342 | i++; | ||
3343 | |||
3344 | /* | ||
3345 | * Iterate forward over array of thresholds starting from | ||
3346 | * current_threshold+1 and check if a threshold is crossed. | ||
3347 | * If none of the thresholds above usage is crossed, we read | ||
3348 | * only one element of the array here. | ||
3349 | */ | ||
3350 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | ||
3351 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3352 | |||
3353 | /* Update current_threshold */ | ||
3354 | atomic_set(&t->current_threshold, i - 1); | ||
3355 | unlock: | ||
3356 | rcu_read_unlock(); | ||
3357 | } | ||
3358 | |||
3359 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | ||
3360 | { | ||
3361 | __mem_cgroup_threshold(memcg, false); | ||
3362 | if (do_swap_account) | ||
3363 | __mem_cgroup_threshold(memcg, true); | ||
3364 | } | ||
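
The two scans above rely on the thresholds array being sorted and on current_threshold indexing the last threshold below usage; every threshold between the old and the new position gets exactly one signal, in either direction. A standalone model of that walk (plain userspace C, printf() standing in for eventfd_signal(), made-up threshold values):

#include <stdio.h>

static unsigned long long thr[] = { 4, 8, 16, 32 };	/* sorted thresholds */
static int size = 4;
static int current_threshold = -1;	/* index of last threshold < usage */

static void check(unsigned long long usage)
{
	int i = current_threshold;

	/* walk down: thresholds we dropped back below since the last check */
	for (; i >= 0 && thr[i] > usage; i--)
		printf("signal (fell below %llu)\n", thr[i]);
	i++;
	/* walk up: thresholds we newly crossed upward */
	for (; i < size && thr[i] <= usage; i++)
		printf("signal (crossed %llu)\n", thr[i]);
	current_threshold = i - 1;
}

int main(void)
{
	check(10);	/* signals for crossing 4 and 8 */
	check(3);	/* signals for falling back below 8 and 4 */
	return 0;
}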
3365 | |||
3366 | static int compare_thresholds(const void *a, const void *b) | ||
3367 | { | ||
3368 | const struct mem_cgroup_threshold *_a = a; | ||
3369 | const struct mem_cgroup_threshold *_b = b; | ||
3370 | |||
3371 | return _a->threshold - _b->threshold; | ||
3372 | } | ||
3373 | |||
3374 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | ||
3375 | struct eventfd_ctx *eventfd, const char *args) | ||
3376 | { | ||
3377 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3378 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3379 | int type = MEMFILE_TYPE(cft->private); | ||
3380 | u64 threshold, usage; | ||
3381 | int size; | ||
3382 | int i, ret; | ||
3383 | |||
3384 | ret = res_counter_memparse_write_strategy(args, &threshold); | ||
3385 | if (ret) | ||
3386 | return ret; | ||
3387 | |||
3388 | mutex_lock(&memcg->thresholds_lock); | ||
3389 | if (type == _MEM) | ||
3390 | thresholds = memcg->thresholds; | ||
3391 | else if (type == _MEMSWAP) | ||
3392 | thresholds = memcg->memsw_thresholds; | ||
3393 | else | ||
3394 | BUG(); | ||
3395 | |||
3396 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3397 | |||
3398 | /* Check if a threshold was crossed before adding a new one */ | ||
3399 | if (thresholds) | ||
3400 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3401 | |||
3402 | if (thresholds) | ||
3403 | size = thresholds->size + 1; | ||
3404 | else | ||
3405 | size = 1; | ||
3406 | |||
3407 | /* Allocate memory for new array of thresholds */ | ||
3408 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3409 | size * sizeof(struct mem_cgroup_threshold), | ||
3410 | GFP_KERNEL); | ||
3411 | if (!thresholds_new) { | ||
3412 | ret = -ENOMEM; | ||
3413 | goto unlock; | ||
3414 | } | ||
3415 | thresholds_new->size = size; | ||
3416 | |||
3417 | /* Copy thresholds (if any) to new array */ | ||
3418 | if (thresholds) | ||
3419 | memcpy(thresholds_new->entries, thresholds->entries, | ||
3420 | thresholds->size * | ||
3421 | sizeof(struct mem_cgroup_threshold)); | ||
3422 | /* Add new threshold */ | ||
3423 | thresholds_new->entries[size - 1].eventfd = eventfd; | ||
3424 | thresholds_new->entries[size - 1].threshold = threshold; | ||
3425 | |||
3426 | /* Sort thresholds. Registering a new threshold isn't time-critical */ | ||
3427 | sort(thresholds_new->entries, size, | ||
3428 | sizeof(struct mem_cgroup_threshold), | ||
3429 | compare_thresholds, NULL); | ||
3430 | |||
3431 | /* Find current threshold */ | ||
3432 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3433 | for (i = 0; i < size; i++) { | ||
3434 | if (thresholds_new->entries[i].threshold < usage) { | ||
3435 | /* | ||
3436 | * thresholds_new->current_threshold will not be used | ||
3437 | * until rcu_assign_pointer(), so it's safe to increment | ||
3438 | * it here. | ||
3439 | */ | ||
3440 | atomic_inc(&thresholds_new->current_threshold); | ||
3441 | } | ||
3442 | } | ||
3443 | |||
3444 | if (type == _MEM) | ||
3445 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3446 | else | ||
3447 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3448 | |||
3449 | /* To be sure that nobody uses the old thresholds array before freeing it */ | ||
3450 | synchronize_rcu(); | ||
3451 | |||
3452 | kfree(thresholds); | ||
3453 | unlock: | ||
3454 | mutex_unlock(&memcg->thresholds_lock); | ||
3455 | |||
3456 | return ret; | ||
3457 | } | ||
3458 | |||
3459 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | ||
3460 | struct eventfd_ctx *eventfd) | ||
3461 | { | ||
3462 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3463 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3464 | int type = MEMFILE_TYPE(cft->private); | ||
3465 | u64 usage; | ||
3466 | int size = 0; | ||
3467 | int i, j, ret; | ||
3468 | |||
3469 | mutex_lock(&memcg->thresholds_lock); | ||
3470 | if (type == _MEM) | ||
3471 | thresholds = memcg->thresholds; | ||
3472 | else if (type == _MEMSWAP) | ||
3473 | thresholds = memcg->memsw_thresholds; | ||
3474 | else | ||
3475 | BUG(); | ||
3476 | |||
3477 | /* | ||
3478 | * Something went wrong if we are trying to unregister a threshold | ||
3479 | * when we don't have any thresholds | ||
3480 | */ | ||
3481 | BUG_ON(!thresholds); | ||
3482 | |||
3483 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3484 | |||
3485 | /* Check if a threshold was crossed before removing one */ | ||
3486 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3487 | |||
3488 | /* Calculate the new number of thresholds */ | ||
3489 | for (i = 0; i < thresholds->size; i++) { | ||
3490 | if (thresholds->entries[i].eventfd != eventfd) | ||
3491 | size++; | ||
3492 | } | ||
3493 | |||
3494 | /* Set thresholds array to NULL if we don't have thresholds */ | ||
3495 | if (!size) { | ||
3496 | thresholds_new = NULL; | ||
3497 | goto assign; | ||
3498 | } | ||
3499 | |||
3500 | /* Allocate memory for new array of thresholds */ | ||
3501 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3502 | size * sizeof(struct mem_cgroup_threshold), | ||
3503 | GFP_KERNEL); | ||
3504 | if (!thresholds_new) { | ||
3505 | ret = -ENOMEM; | ||
3506 | goto unlock; | ||
3507 | } | ||
3508 | thresholds_new->size = size; | ||
3509 | |||
3510 | /* Copy thresholds and find current threshold */ | ||
3511 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3512 | for (i = 0, j = 0; i < thresholds->size; i++) { | ||
3513 | if (thresholds->entries[i].eventfd == eventfd) | ||
3514 | continue; | ||
3515 | |||
3516 | thresholds_new->entries[j] = thresholds->entries[i]; | ||
3517 | if (thresholds_new->entries[j].threshold < usage) { | ||
3518 | /* | ||
3519 | * thresholds_new->current_threshold will not be used | ||
3520 | * until rcu_assign_pointer(), so it's safe to increment | ||
3521 | * it here. | ||
3522 | */ | ||
3523 | atomic_inc(&thresholds_new->current_threshold); | ||
3524 | } | ||
3525 | j++; | ||
3526 | } | ||
3527 | |||
3528 | assign: | ||
3529 | if (type == _MEM) | ||
3530 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3531 | else | ||
3532 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3533 | |||
3535 | /* To be sure that nobody uses the old thresholds array before freeing it */ | ||
3535 | synchronize_rcu(); | ||
3536 | |||
3537 | kfree(thresholds); | ||
3538 | unlock: | ||
3539 | mutex_unlock(&memcg->thresholds_lock); | ||
3540 | |||
3541 | return ret; | ||
3542 | } | ||
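
register_event/unregister_event are hooked up to usage_in_bytes and memsw.usage_in_bytes via the cftype tables below and are driven from the generic cgroup.event_control file. From userspace the expected sequence is: create an eventfd, open the usage file, write "<event_fd> <usage_fd> <threshold in bytes>" to cgroup.event_control, then block on the eventfd. A minimal sketch, assuming the controller is mounted under /cgroup/memory, a group grp exists, and a 64M threshold (all of these are assumptions for illustration):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
	const char *cg = "/cgroup/memory/grp";	/* assumed mount point/group */
	char path[256], cmd[128];
	uint64_t ticks;
	int efd, ufd, cfd;

	efd = eventfd(0, 0);
	snprintf(path, sizeof(path), "%s/memory.usage_in_bytes", cg);
	ufd = open(path, O_RDONLY);
	snprintf(path, sizeof(path), "%s/cgroup.event_control", cg);
	cfd = open(path, O_WRONLY);
	if (efd < 0 || ufd < 0 || cfd < 0)
		return 1;

	/* "<event_fd> <fd of memory.usage_in_bytes> <threshold in bytes>" */
	snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
	if (write(cfd, cmd, strlen(cmd)) < 0)
		return 1;

	/* wakes up each time the 64M threshold is crossed, in either direction */
	if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
		printf("threshold crossed %llu time(s)\n",
		       (unsigned long long)ticks);
	return 0;
}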
2842 | 3543 | ||
2843 | static struct cftype mem_cgroup_files[] = { | 3544 | static struct cftype mem_cgroup_files[] = { |
2844 | { | 3545 | { |
2845 | .name = "usage_in_bytes", | 3546 | .name = "usage_in_bytes", |
2846 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3547 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
2847 | .read_u64 = mem_cgroup_read, | 3548 | .read_u64 = mem_cgroup_read, |
3549 | .register_event = mem_cgroup_register_event, | ||
3550 | .unregister_event = mem_cgroup_unregister_event, | ||
2848 | }, | 3551 | }, |
2849 | { | 3552 | { |
2850 | .name = "max_usage_in_bytes", | 3553 | .name = "max_usage_in_bytes", |
@@ -2888,6 +3591,11 @@ static struct cftype mem_cgroup_files[] = { | |||
2888 | .read_u64 = mem_cgroup_swappiness_read, | 3591 | .read_u64 = mem_cgroup_swappiness_read, |
2889 | .write_u64 = mem_cgroup_swappiness_write, | 3592 | .write_u64 = mem_cgroup_swappiness_write, |
2890 | }, | 3593 | }, |
3594 | { | ||
3595 | .name = "move_charge_at_immigrate", | ||
3596 | .read_u64 = mem_cgroup_move_charge_read, | ||
3597 | .write_u64 = mem_cgroup_move_charge_write, | ||
3598 | }, | ||
2891 | }; | 3599 | }; |
2892 | 3600 | ||
2893 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3601 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -2896,6 +3604,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
2896 | .name = "memsw.usage_in_bytes", | 3604 | .name = "memsw.usage_in_bytes", |
2897 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3605 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
2898 | .read_u64 = mem_cgroup_read, | 3606 | .read_u64 = mem_cgroup_read, |
3607 | .register_event = mem_cgroup_register_event, | ||
3608 | .unregister_event = mem_cgroup_unregister_event, | ||
2899 | }, | 3609 | }, |
2900 | { | 3610 | { |
2901 | .name = "memsw.max_usage_in_bytes", | 3611 | .name = "memsw.max_usage_in_bytes", |
@@ -2970,24 +3680,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
2970 | kfree(mem->info.nodeinfo[node]); | 3680 | kfree(mem->info.nodeinfo[node]); |
2971 | } | 3681 | } |
2972 | 3682 | ||
2973 | static int mem_cgroup_size(void) | ||
2974 | { | ||
2975 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
2976 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
2977 | } | ||
2978 | |||
2979 | static struct mem_cgroup *mem_cgroup_alloc(void) | 3683 | static struct mem_cgroup *mem_cgroup_alloc(void) |
2980 | { | 3684 | { |
2981 | struct mem_cgroup *mem; | 3685 | struct mem_cgroup *mem; |
2982 | int size = mem_cgroup_size(); | 3686 | int size = sizeof(struct mem_cgroup); |
2983 | 3687 | ||
3688 | /* Can be very big if MAX_NUMNODES is very big */ | ||
2984 | if (size < PAGE_SIZE) | 3689 | if (size < PAGE_SIZE) |
2985 | mem = kmalloc(size, GFP_KERNEL); | 3690 | mem = kmalloc(size, GFP_KERNEL); |
2986 | else | 3691 | else |
2987 | mem = vmalloc(size); | 3692 | mem = vmalloc(size); |
2988 | 3693 | ||
2989 | if (mem) | 3694 | if (!mem) |
2990 | memset(mem, 0, size); | 3695 | return NULL; |
3696 | |||
3697 | memset(mem, 0, size); | ||
3698 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | ||
3699 | if (!mem->stat) { | ||
3700 | if (size < PAGE_SIZE) | ||
3701 | kfree(mem); | ||
3702 | else | ||
3703 | vfree(mem); | ||
3704 | mem = NULL; | ||
3705 | } | ||
2991 | return mem; | 3706 | return mem; |
2992 | } | 3707 | } |
2993 | 3708 | ||
@@ -3012,7 +3727,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
3012 | for_each_node_state(node, N_POSSIBLE) | 3727 | for_each_node_state(node, N_POSSIBLE) |
3013 | free_mem_cgroup_per_zone_info(mem, node); | 3728 | free_mem_cgroup_per_zone_info(mem, node); |
3014 | 3729 | ||
3015 | if (mem_cgroup_size() < PAGE_SIZE) | 3730 | free_percpu(mem->stat); |
3731 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | ||
3016 | kfree(mem); | 3732 | kfree(mem); |
3017 | else | 3733 | else |
3018 | vfree(mem); | 3734 | vfree(mem); |
@@ -3023,9 +3739,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem) | |||
3023 | atomic_inc(&mem->refcnt); | 3739 | atomic_inc(&mem->refcnt); |
3024 | } | 3740 | } |
3025 | 3741 | ||
3026 | static void mem_cgroup_put(struct mem_cgroup *mem) | 3742 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) |
3027 | { | 3743 | { |
3028 | if (atomic_dec_and_test(&mem->refcnt)) { | 3744 | if (atomic_sub_and_test(count, &mem->refcnt)) { |
3029 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | 3745 | struct mem_cgroup *parent = parent_mem_cgroup(mem); |
3030 | __mem_cgroup_free(mem); | 3746 | __mem_cgroup_free(mem); |
3031 | if (parent) | 3747 | if (parent) |
@@ -3033,6 +3749,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem) | |||
3033 | } | 3749 | } |
3034 | } | 3750 | } |
3035 | 3751 | ||
3752 | static void mem_cgroup_put(struct mem_cgroup *mem) | ||
3753 | { | ||
3754 | __mem_cgroup_put(mem, 1); | ||
3755 | } | ||
3756 | |||
3036 | /* | 3757 | /* |
3037 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 3758 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
3038 | */ | 3759 | */ |
@@ -3097,12 +3818,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3097 | 3818 | ||
3098 | /* root ? */ | 3819 | /* root ? */ |
3099 | if (cont->parent == NULL) { | 3820 | if (cont->parent == NULL) { |
3821 | int cpu; | ||
3100 | enable_swap_cgroup(); | 3822 | enable_swap_cgroup(); |
3101 | parent = NULL; | 3823 | parent = NULL; |
3102 | root_mem_cgroup = mem; | 3824 | root_mem_cgroup = mem; |
3103 | if (mem_cgroup_soft_limit_tree_init()) | 3825 | if (mem_cgroup_soft_limit_tree_init()) |
3104 | goto free_out; | 3826 | goto free_out; |
3105 | 3827 | for_each_possible_cpu(cpu) { | |
3828 | struct memcg_stock_pcp *stock = | ||
3829 | &per_cpu(memcg_stock, cpu); | ||
3830 | INIT_WORK(&stock->work, drain_local_stock); | ||
3831 | } | ||
3832 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | ||
3106 | } else { | 3833 | } else { |
3107 | parent = mem_cgroup_from_cont(cont->parent); | 3834 | parent = mem_cgroup_from_cont(cont->parent); |
3108 | mem->use_hierarchy = parent->use_hierarchy; | 3835 | mem->use_hierarchy = parent->use_hierarchy; |
@@ -3128,6 +3855,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3128 | if (parent) | 3855 | if (parent) |
3129 | mem->swappiness = get_swappiness(parent); | 3856 | mem->swappiness = get_swappiness(parent); |
3130 | atomic_set(&mem->refcnt, 1); | 3857 | atomic_set(&mem->refcnt, 1); |
3858 | mem->move_charge_at_immigrate = 0; | ||
3859 | mutex_init(&mem->thresholds_lock); | ||
3131 | return &mem->css; | 3860 | return &mem->css; |
3132 | free_out: | 3861 | free_out: |
3133 | __mem_cgroup_free(mem); | 3862 | __mem_cgroup_free(mem); |
@@ -3164,19 +3893,445 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
3164 | return ret; | 3893 | return ret; |
3165 | } | 3894 | } |
3166 | 3895 | ||
3896 | #ifdef CONFIG_MMU | ||
3897 | /* Handlers for move charge at task migration. */ | ||
3898 | #define PRECHARGE_COUNT_AT_ONCE 256 | ||
3899 | static int mem_cgroup_do_precharge(unsigned long count) | ||
3900 | { | ||
3901 | int ret = 0; | ||
3902 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3903 | struct mem_cgroup *mem = mc.to; | ||
3904 | |||
3905 | if (mem_cgroup_is_root(mem)) { | ||
3906 | mc.precharge += count; | ||
3907 | /* we don't need css_get for root */ | ||
3908 | return ret; | ||
3909 | } | ||
3910 | /* try to charge at once */ | ||
3911 | if (count > 1) { | ||
3912 | struct res_counter *dummy; | ||
3913 | /* | ||
3914 | * "mem" cannot be under rmdir() because we've already checked | ||
3915 | * by cgroup_lock_live_cgroup() that it is not removed and we | ||
3916 | * are still under the same cgroup_mutex. So we can postpone | ||
3917 | * css_get(). | ||
3918 | */ | ||
3919 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) | ||
3920 | goto one_by_one; | ||
3921 | if (do_swap_account && res_counter_charge(&mem->memsw, | ||
3922 | PAGE_SIZE * count, &dummy)) { | ||
3923 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | ||
3924 | goto one_by_one; | ||
3925 | } | ||
3926 | mc.precharge += count; | ||
3927 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
3928 | WARN_ON_ONCE(count > INT_MAX); | ||
3929 | __css_get(&mem->css, (int)count); | ||
3930 | return ret; | ||
3931 | } | ||
3932 | one_by_one: | ||
3933 | /* fall back to one by one charge */ | ||
3934 | while (count--) { | ||
3935 | if (signal_pending(current)) { | ||
3936 | ret = -EINTR; | ||
3937 | break; | ||
3938 | } | ||
3939 | if (!batch_count--) { | ||
3940 | batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3941 | cond_resched(); | ||
3942 | } | ||
3943 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | ||
3944 | if (ret || !mem) | ||
3945 | /* mem_cgroup_clear_mc() will do uncharge later */ | ||
3946 | return -ENOMEM; | ||
3947 | mc.precharge++; | ||
3948 | } | ||
3949 | return ret; | ||
3950 | } | ||
3951 | |||
3952 | /** | ||
3953 | * is_target_pte_for_mc - check a pte whether it is valid for move charge | ||
3954 | * @vma: the vma the pte to be checked belongs | ||
3955 | * @addr: the address corresponding to the pte to be checked | ||
3956 | * @ptent: the pte to be checked | ||
3957 | * @target: the pointer in which the target page or swap entry will be stored (can be NULL) | ||
3958 | * | ||
3959 | * Returns | ||
3960 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. | ||
3961 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | ||
3962 | * move charge. If @target is not NULL, the page is stored in target->page | ||
3963 | * with an extra refcount taken (callers should handle it). | ||
3964 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | ||
3965 | * target for charge migration. If @target is not NULL, the entry is stored | ||
3966 | * in target->ent. | ||
3967 | * | ||
3968 | * Called with pte lock held. | ||
3969 | */ | ||
3970 | union mc_target { | ||
3971 | struct page *page; | ||
3972 | swp_entry_t ent; | ||
3973 | }; | ||
3974 | |||
3975 | enum mc_target_type { | ||
3976 | MC_TARGET_NONE, /* not used */ | ||
3977 | MC_TARGET_PAGE, | ||
3978 | MC_TARGET_SWAP, | ||
3979 | }; | ||
3980 | |||
3981 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | ||
3982 | unsigned long addr, pte_t ptent, union mc_target *target) | ||
3983 | { | ||
3984 | struct page *page = NULL; | ||
3985 | struct page_cgroup *pc; | ||
3986 | int ret = 0; | ||
3987 | swp_entry_t ent = { .val = 0 }; | ||
3988 | int usage_count = 0; | ||
3989 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
3990 | &mc.to->move_charge_at_immigrate); | ||
3991 | |||
3992 | if (!pte_present(ptent)) { | ||
3993 | /* TODO: handle swap of shmem/tmpfs */ | ||
3994 | if (pte_none(ptent) || pte_file(ptent)) | ||
3995 | return 0; | ||
3996 | else if (is_swap_pte(ptent)) { | ||
3997 | ent = pte_to_swp_entry(ptent); | ||
3998 | if (!move_anon || non_swap_entry(ent)) | ||
3999 | return 0; | ||
4000 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
4001 | } | ||
4002 | } else { | ||
4003 | page = vm_normal_page(vma, addr, ptent); | ||
4004 | if (!page || !page_mapped(page)) | ||
4005 | return 0; | ||
4006 | /* | ||
4007 | * TODO: We don't move charges of file(including shmem/tmpfs) | ||
4008 | * pages for now. | ||
4009 | */ | ||
4010 | if (!move_anon || !PageAnon(page)) | ||
4011 | return 0; | ||
4012 | if (!get_page_unless_zero(page)) | ||
4013 | return 0; | ||
4014 | usage_count = page_mapcount(page); | ||
4015 | } | ||
4016 | if (usage_count > 1) { | ||
4017 | /* | ||
4018 | * TODO: We don't move charges of shared(used by multiple | ||
4019 | * processes) pages for now. | ||
4020 | */ | ||
4021 | if (page) | ||
4022 | put_page(page); | ||
4023 | return 0; | ||
4024 | } | ||
4025 | if (page) { | ||
4026 | pc = lookup_page_cgroup(page); | ||
4027 | /* | ||
4028 | * Do only a loose check w/o the page_cgroup lock. | ||
4029 | * mem_cgroup_move_account() checks whether the pc is valid under | ||
4030 | * the lock. | ||
4031 | */ | ||
4032 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
4033 | ret = MC_TARGET_PAGE; | ||
4034 | if (target) | ||
4035 | target->page = page; | ||
4036 | } | ||
4037 | if (!ret || !target) | ||
4038 | put_page(page); | ||
4039 | } | ||
4040 | /* fall through to the swap entry check */ | ||
4041 | if (ent.val && do_swap_account && !ret && | ||
4042 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | ||
4043 | ret = MC_TARGET_SWAP; | ||
4044 | if (target) | ||
4045 | target->ent = ent; | ||
4046 | } | ||
4047 | return ret; | ||
4048 | } | ||
4049 | |||
4050 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | ||
4051 | unsigned long addr, unsigned long end, | ||
4052 | struct mm_walk *walk) | ||
4053 | { | ||
4054 | struct vm_area_struct *vma = walk->private; | ||
4055 | pte_t *pte; | ||
4056 | spinlock_t *ptl; | ||
4057 | |||
4058 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4059 | for (; addr != end; pte++, addr += PAGE_SIZE) | ||
4060 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | ||
4061 | mc.precharge++; /* increment precharge temporarily */ | ||
4062 | pte_unmap_unlock(pte - 1, ptl); | ||
4063 | cond_resched(); | ||
4064 | |||
4065 | return 0; | ||
4066 | } | ||
4067 | |||
4068 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | ||
4069 | { | ||
4070 | unsigned long precharge; | ||
4071 | struct vm_area_struct *vma; | ||
4072 | |||
4073 | down_read(&mm->mmap_sem); | ||
4074 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4075 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
4076 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
4077 | .mm = mm, | ||
4078 | .private = vma, | ||
4079 | }; | ||
4080 | if (is_vm_hugetlb_page(vma)) | ||
4081 | continue; | ||
4082 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4083 | if (vma->vm_flags & VM_SHARED) | ||
4084 | continue; | ||
4085 | walk_page_range(vma->vm_start, vma->vm_end, | ||
4086 | &mem_cgroup_count_precharge_walk); | ||
4087 | } | ||
4088 | up_read(&mm->mmap_sem); | ||
4089 | |||
4090 | precharge = mc.precharge; | ||
4091 | mc.precharge = 0; | ||
4092 | |||
4093 | return precharge; | ||
4094 | } | ||
4095 | |||
4096 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | ||
4097 | { | ||
4098 | return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | ||
4099 | } | ||
4100 | |||
4101 | static void mem_cgroup_clear_mc(void) | ||
4102 | { | ||
4103 | /* we must uncharge all the leftover precharges from mc.to */ | ||
4104 | if (mc.precharge) { | ||
4105 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | ||
4106 | mc.precharge = 0; | ||
4107 | } | ||
4108 | /* | ||
4109 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | ||
4110 | * we must uncharge here. | ||
4111 | */ | ||
4112 | if (mc.moved_charge) { | ||
4113 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | ||
4114 | mc.moved_charge = 0; | ||
4115 | } | ||
4116 | /* we must fixup refcnts and charges */ | ||
4117 | if (mc.moved_swap) { | ||
4118 | WARN_ON_ONCE(mc.moved_swap > INT_MAX); | ||
4119 | /* uncharge swap account from the old cgroup */ | ||
4120 | if (!mem_cgroup_is_root(mc.from)) | ||
4121 | res_counter_uncharge(&mc.from->memsw, | ||
4122 | PAGE_SIZE * mc.moved_swap); | ||
4123 | __mem_cgroup_put(mc.from, mc.moved_swap); | ||
4124 | |||
4125 | if (!mem_cgroup_is_root(mc.to)) { | ||
4126 | /* | ||
4127 | * we charged both to->res and to->memsw, so we should | ||
4128 | * uncharge to->res. | ||
4129 | */ | ||
4130 | res_counter_uncharge(&mc.to->res, | ||
4131 | PAGE_SIZE * mc.moved_swap); | ||
4132 | VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); | ||
4133 | __css_put(&mc.to->css, mc.moved_swap); | ||
4134 | } | ||
4135 | /* we've already done mem_cgroup_get(mc.to) */ | ||
4136 | |||
4137 | mc.moved_swap = 0; | ||
4138 | } | ||
4139 | mc.from = NULL; | ||
4140 | mc.to = NULL; | ||
4141 | mc.moving_task = NULL; | ||
4142 | wake_up_all(&mc.waitq); | ||
4143 | } | ||
4144 | |||
4145 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
4146 | struct cgroup *cgroup, | ||
4147 | struct task_struct *p, | ||
4148 | bool threadgroup) | ||
4149 | { | ||
4150 | int ret = 0; | ||
4151 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | ||
4152 | |||
4153 | if (mem->move_charge_at_immigrate) { | ||
4154 | struct mm_struct *mm; | ||
4155 | struct mem_cgroup *from = mem_cgroup_from_task(p); | ||
4156 | |||
4157 | VM_BUG_ON(from == mem); | ||
4158 | |||
4159 | mm = get_task_mm(p); | ||
4160 | if (!mm) | ||
4161 | return 0; | ||
4162 | /* We move charges only when we move the owner of the mm */ | ||
4163 | if (mm->owner == p) { | ||
4164 | VM_BUG_ON(mc.from); | ||
4165 | VM_BUG_ON(mc.to); | ||
4166 | VM_BUG_ON(mc.precharge); | ||
4167 | VM_BUG_ON(mc.moved_charge); | ||
4168 | VM_BUG_ON(mc.moved_swap); | ||
4169 | VM_BUG_ON(mc.moving_task); | ||
4170 | mc.from = from; | ||
4171 | mc.to = mem; | ||
4172 | mc.precharge = 0; | ||
4173 | mc.moved_charge = 0; | ||
4174 | mc.moved_swap = 0; | ||
4175 | mc.moving_task = current; | ||
4176 | |||
4177 | ret = mem_cgroup_precharge_mc(mm); | ||
4178 | if (ret) | ||
4179 | mem_cgroup_clear_mc(); | ||
4180 | } | ||
4181 | mmput(mm); | ||
4182 | } | ||
4183 | return ret; | ||
4184 | } | ||
4185 | |||
4186 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
4187 | struct cgroup *cgroup, | ||
4188 | struct task_struct *p, | ||
4189 | bool threadgroup) | ||
4190 | { | ||
4191 | mem_cgroup_clear_mc(); | ||
4192 | } | ||
4193 | |||
4194 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | ||
4195 | unsigned long addr, unsigned long end, | ||
4196 | struct mm_walk *walk) | ||
4197 | { | ||
4198 | int ret = 0; | ||
4199 | struct vm_area_struct *vma = walk->private; | ||
4200 | pte_t *pte; | ||
4201 | spinlock_t *ptl; | ||
4202 | |||
4203 | retry: | ||
4204 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4205 | for (; addr != end; addr += PAGE_SIZE) { | ||
4206 | pte_t ptent = *(pte++); | ||
4207 | union mc_target target; | ||
4208 | int type; | ||
4209 | struct page *page; | ||
4210 | struct page_cgroup *pc; | ||
4211 | swp_entry_t ent; | ||
4212 | |||
4213 | if (!mc.precharge) | ||
4214 | break; | ||
4215 | |||
4216 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | ||
4217 | switch (type) { | ||
4218 | case MC_TARGET_PAGE: | ||
4219 | page = target.page; | ||
4220 | if (isolate_lru_page(page)) | ||
4221 | goto put; | ||
4222 | pc = lookup_page_cgroup(page); | ||
4223 | if (!mem_cgroup_move_account(pc, | ||
4224 | mc.from, mc.to, false)) { | ||
4225 | mc.precharge--; | ||
4226 | /* we uncharge from mc.from later. */ | ||
4227 | mc.moved_charge++; | ||
4228 | } | ||
4229 | putback_lru_page(page); | ||
4230 | put: /* is_target_pte_for_mc() gets the page */ | ||
4231 | put_page(page); | ||
4232 | break; | ||
4233 | case MC_TARGET_SWAP: | ||
4234 | ent = target.ent; | ||
4235 | if (!mem_cgroup_move_swap_account(ent, | ||
4236 | mc.from, mc.to, false)) { | ||
4237 | mc.precharge--; | ||
4238 | /* we fixup refcnts and charges later. */ | ||
4239 | mc.moved_swap++; | ||
4240 | } | ||
4241 | break; | ||
4242 | default: | ||
4243 | break; | ||
4244 | } | ||
4245 | } | ||
4246 | pte_unmap_unlock(pte - 1, ptl); | ||
4247 | cond_resched(); | ||
4248 | |||
4249 | if (addr != end) { | ||
4250 | /* | ||
4251 | * We have consumed all the precharges we got in can_attach(). | ||
4252 | * We try to charge one by one, but don't do any additional | ||
4253 | * charges to mc.to if we have failed to charge once in the attach() | ||
4254 | * phase. | ||
4255 | */ | ||
4256 | ret = mem_cgroup_do_precharge(1); | ||
4257 | if (!ret) | ||
4258 | goto retry; | ||
4259 | } | ||
4260 | |||
4261 | return ret; | ||
4262 | } | ||
4263 | |||
4264 | static void mem_cgroup_move_charge(struct mm_struct *mm) | ||
4265 | { | ||
4266 | struct vm_area_struct *vma; | ||
4267 | |||
4268 | lru_add_drain_all(); | ||
4269 | down_read(&mm->mmap_sem); | ||
4270 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4271 | int ret; | ||
4272 | struct mm_walk mem_cgroup_move_charge_walk = { | ||
4273 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
4274 | .mm = mm, | ||
4275 | .private = vma, | ||
4276 | }; | ||
4277 | if (is_vm_hugetlb_page(vma)) | ||
4278 | continue; | ||
4279 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4280 | if (vma->vm_flags & VM_SHARED) | ||
4281 | continue; | ||
4282 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
4283 | &mem_cgroup_move_charge_walk); | ||
4284 | if (ret) | ||
4285 | /* | ||
4286 | * A non-zero ret means we have consumed all precharges and failed | ||
4287 | * to do an additional charge. Just abandon here. | ||
4288 | */ | ||
4289 | break; | ||
4290 | } | ||
4291 | up_read(&mm->mmap_sem); | ||
4292 | } | ||
4293 | |||
3167 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 4294 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
3168 | struct cgroup *cont, | 4295 | struct cgroup *cont, |
3169 | struct cgroup *old_cont, | 4296 | struct cgroup *old_cont, |
3170 | struct task_struct *p, | 4297 | struct task_struct *p, |
3171 | bool threadgroup) | 4298 | bool threadgroup) |
3172 | { | 4299 | { |
3173 | mutex_lock(&memcg_tasklist); | 4300 | struct mm_struct *mm; |
3174 | /* | 4301 | |
3175 | * FIXME: It's better to move charges of this process from old | 4302 | if (!mc.to) |
3176 | * memcg to new memcg. But it's just on TODO-List now. | 4303 | /* no need to move charge */ |
3177 | */ | 4304 | return; |
3178 | mutex_unlock(&memcg_tasklist); | 4305 | |
4306 | mm = get_task_mm(p); | ||
4307 | if (mm) { | ||
4308 | mem_cgroup_move_charge(mm); | ||
4309 | mmput(mm); | ||
4310 | } | ||
4311 | mem_cgroup_clear_mc(); | ||
4312 | } | ||
4313 | #else /* !CONFIG_MMU */ | ||
4314 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
4315 | struct cgroup *cgroup, | ||
4316 | struct task_struct *p, | ||
4317 | bool threadgroup) | ||
4318 | { | ||
4319 | return 0; | ||
3179 | } | 4320 | } |
4321 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
4322 | struct cgroup *cgroup, | ||
4323 | struct task_struct *p, | ||
4324 | bool threadgroup) | ||
4325 | { | ||
4326 | } | ||
4327 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | ||
4328 | struct cgroup *cont, | ||
4329 | struct cgroup *old_cont, | ||
4330 | struct task_struct *p, | ||
4331 | bool threadgroup) | ||
4332 | { | ||
4333 | } | ||
4334 | #endif | ||
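
Taken together, can_attach()/cancel_attach()/attach() implement a two-phase protocol driven by the cgroup core: precharge everything that may be moved, then either walk the page tables and consume the precharges or roll everything back. The sketch below is a simplified model of that sequence, not the real cgroup_attach_task(); locking, threadgroup handling and the other subsystems are omitted:

/* Simplified model of how the cgroup core drives the callbacks above. */
static int attach_task_model(struct cgroup_subsys *ss, struct cgroup *dst,
			     struct cgroup *src, struct task_struct *p)
{
	int ret;

	/* phase 1: mem_cgroup_can_attach() precharges into mc.precharge */
	ret = ss->can_attach(ss, dst, p, false);
	if (ret)
		return ret;

	/*
	 * If re-linking the task to dst failed at this point, the core
	 * would call ss->cancel_attach(), i.e. mem_cgroup_clear_mc().
	 */

	/* phase 2: mem_cgroup_move_task() walks page tables, then clears mc */
	ss->attach(ss, dst, src, p, false);
	return 0;
}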
3180 | 4335 | ||
3181 | struct cgroup_subsys mem_cgroup_subsys = { | 4336 | struct cgroup_subsys mem_cgroup_subsys = { |
3182 | .name = "memory", | 4337 | .name = "memory", |
@@ -3185,6 +4340,8 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
3185 | .pre_destroy = mem_cgroup_pre_destroy, | 4340 | .pre_destroy = mem_cgroup_pre_destroy, |
3186 | .destroy = mem_cgroup_destroy, | 4341 | .destroy = mem_cgroup_destroy, |
3187 | .populate = mem_cgroup_populate, | 4342 | .populate = mem_cgroup_populate, |
4343 | .can_attach = mem_cgroup_can_attach, | ||
4344 | .cancel_attach = mem_cgroup_cancel_attach, | ||
3188 | .attach = mem_cgroup_move_task, | 4345 | .attach = mem_cgroup_move_task, |
3189 | .early_init = 0, | 4346 | .early_init = 0, |
3190 | .use_id = 1, | 4347 | .use_id = 1, |