Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c | 1386
1 file changed, 1163 insertions, 223 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d813823ab08f..7973b5221fb8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@ | |||
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * Memory thresholds | ||
10 | * Copyright (C) 2009 Nokia Corporation | ||
11 | * Author: Kirill A. Shutemov | ||
12 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | 13 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 14 | * it under the terms of the GNU General Public License as published by |
11 | * the Free Software Foundation; either version 2 of the License, or | 15 | * the Free Software Foundation; either version 2 of the License, or |
@@ -21,6 +25,7 @@ | |||
21 | #include <linux/memcontrol.h> | 25 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 26 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/hugetlb.h> | ||
24 | #include <linux/pagemap.h> | 29 | #include <linux/pagemap.h> |
25 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
26 | #include <linux/page-flags.h> | 31 | #include <linux/page-flags.h> |
@@ -32,7 +37,10 @@ | |||
32 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
33 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
34 | #include <linux/swap.h> | 39 | #include <linux/swap.h> |
40 | #include <linux/swapops.h> | ||
35 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <linux/eventfd.h> | ||
43 | #include <linux/sort.h> | ||
36 | #include <linux/fs.h> | 44 | #include <linux/fs.h> |
37 | #include <linux/seq_file.h> | 45 | #include <linux/seq_file.h> |
38 | #include <linux/vmalloc.h> | 46 | #include <linux/vmalloc.h> |
@@ -55,7 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
55 | #define do_swap_account (0) | 63 | #define do_swap_account (0) |
56 | #endif | 64 | #endif |
57 | 65 | ||
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 66 | /* |
67 | * Per memcg event counter is incremented at every pagein/pageout. This counter | ||
68 | * is used to trigger some periodic events. This is straightforward and better | ||
69 | * than using jiffies etc. to handle periodic memcg events. | ||
70 | * | ||
71 | * These values will be used as !((event) & ((1 << (thresh)) - 1)) | ||
72 | */ | ||
73 | #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ | ||
74 | #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ | ||
59 | 75 | ||
60 | /* | 76 | /* |
61 | * Statistics for memory cgroup. | 77 | * Statistics for memory cgroup. |
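The two event thresholds above drive all periodic work from a single per-memcg event counter. The check !((event) & ((1 << (thresh)) - 1)) is true exactly when the low thresh bits of the counter are zero, i.e. once every 2^thresh pagein/pageout events; because 1024 is a multiple of 128, every soft-limit check point is also a threshold check point, which is what lets memcg_check_events() nest the soft-limit test inside the threshold test. A minimal userspace sketch of the mask check (illustrative only, not kernel code):

#include <stdio.h>

#define THRESHOLDS_EVENTS_THRESH 7	/* once in 128 */
#define SOFTLIMIT_EVENTS_THRESH 10	/* once in 1024 */

/* true when the low 'thresh' bits of 'event' are zero */
static int event_check(unsigned long event, int thresh)
{
	return !(event & ((1UL << thresh) - 1));
}

int main(void)
{
	unsigned long event, thresholds = 0, softlimit = 0;

	for (event = 1; event <= 4096; event++) {
		if (event_check(event, THRESHOLDS_EVENTS_THRESH))
			thresholds++;
		if (event_check(event, SOFTLIMIT_EVENTS_THRESH))
			softlimit++;
	}
	/* prints 32 and 4: 4096/128 and 4096/1024 */
	printf("threshold checks: %lu, softlimit checks: %lu\n",
	       thresholds, softlimit);
	return 0;
}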
@@ -69,62 +85,16 @@ enum mem_cgroup_stat_index { | |||
69 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ | 85 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 86 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 87 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 88 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
89 | MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ | ||
74 | 90 | ||
75 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
76 | }; | 92 | }; |
77 | 93 | ||
78 | struct mem_cgroup_stat_cpu { | 94 | struct mem_cgroup_stat_cpu { |
79 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 95 | s64 count[MEM_CGROUP_STAT_NSTATS]; |
80 | } ____cacheline_aligned_in_smp; | ||
81 | |||
82 | struct mem_cgroup_stat { | ||
83 | struct mem_cgroup_stat_cpu cpustat[0]; | ||
84 | }; | 96 | }; |
85 | 97 | ||
86 | static inline void | ||
87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
88 | enum mem_cgroup_stat_index idx) | ||
89 | { | ||
90 | stat->count[idx] = 0; | ||
91 | } | ||
92 | |||
93 | static inline s64 | ||
94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
95 | enum mem_cgroup_stat_index idx) | ||
96 | { | ||
97 | return stat->count[idx]; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * For accounting under irq disable, no need for increment preempt count. | ||
102 | */ | ||
103 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, | ||
104 | enum mem_cgroup_stat_index idx, int val) | ||
105 | { | ||
106 | stat->count[idx] += val; | ||
107 | } | ||
108 | |||
109 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | ||
110 | enum mem_cgroup_stat_index idx) | ||
111 | { | ||
112 | int cpu; | ||
113 | s64 ret = 0; | ||
114 | for_each_possible_cpu(cpu) | ||
115 | ret += stat->cpustat[cpu].count[idx]; | ||
116 | return ret; | ||
117 | } | ||
118 | |||
119 | static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) | ||
120 | { | ||
121 | s64 ret; | ||
122 | |||
123 | ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); | ||
124 | ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); | ||
125 | return ret; | ||
126 | } | ||
127 | |||
128 | /* | 98 | /* |
129 | * per-zone information in memory controller. | 99 | * per-zone information in memory controller. |
130 | */ | 100 | */ |
@@ -174,6 +144,22 @@ struct mem_cgroup_tree { | |||
174 | 144 | ||
175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | 145 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
176 | 146 | ||
147 | struct mem_cgroup_threshold { | ||
148 | struct eventfd_ctx *eventfd; | ||
149 | u64 threshold; | ||
150 | }; | ||
151 | |||
152 | struct mem_cgroup_threshold_ary { | ||
153 | /* An array index points to threshold just below usage. */ | ||
154 | atomic_t current_threshold; | ||
155 | /* Size of entries[] */ | ||
156 | unsigned int size; | ||
157 | /* Array of thresholds */ | ||
158 | struct mem_cgroup_threshold entries[0]; | ||
159 | }; | ||
160 | |||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | ||
162 | |||
177 | /* | 163 | /* |
178 | * The memory controller data structure. The memory controller controls both | 164 | * The memory controller data structure. The memory controller controls both |
179 | * page cache and RSS per cgroup. We would eventually like to provide | 165 | * page cache and RSS per cgroup. We would eventually like to provide |
@@ -217,7 +203,7 @@ struct mem_cgroup { | |||
217 | * Should the accounting and control be hierarchical, per subtree? | 203 | * Should the accounting and control be hierarchical, per subtree? |
218 | */ | 204 | */ |
219 | bool use_hierarchy; | 205 | bool use_hierarchy; |
220 | unsigned long last_oom_jiffies; | 206 | atomic_t oom_lock; |
221 | atomic_t refcnt; | 207 | atomic_t refcnt; |
222 | 208 | ||
223 | unsigned int swappiness; | 209 | unsigned int swappiness; |
@@ -225,10 +211,48 @@ struct mem_cgroup { | |||
225 | /* set when res.limit == memsw.limit */ | 211 | /* set when res.limit == memsw.limit */ |
226 | bool memsw_is_minimum; | 212 | bool memsw_is_minimum; |
227 | 213 | ||
214 | /* protect arrays of thresholds */ | ||
215 | struct mutex thresholds_lock; | ||
216 | |||
217 | /* thresholds for memory usage. RCU-protected */ | ||
218 | struct mem_cgroup_threshold_ary *thresholds; | ||
219 | |||
220 | /* thresholds for mem+swap usage. RCU-protected */ | ||
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | ||
222 | |||
228 | /* | 223 | /* |
229 | * statistics. This must be placed at the end of memcg. | 224 | * Should we move charges of a task when a task is moved into this |
225 | * mem_cgroup? And what type of charges should we move? | ||
230 | */ | 226 | */ |
231 | struct mem_cgroup_stat stat; | 227 | unsigned long move_charge_at_immigrate; |
228 | |||
229 | /* | ||
230 | * percpu counter. | ||
231 | */ | ||
232 | struct mem_cgroup_stat_cpu *stat; | ||
233 | }; | ||
234 | |||
235 | /* Stuff for moving charges at task migration. */ | ||
236 | /* | ||
237 | * Types of charges to be moved. "move_charge_at_immigrate" is treated as a | ||
238 | * left-shifted bitmap of these types. | ||
239 | */ | ||
240 | enum move_type { | ||
241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | ||
242 | NR_MOVE_TYPE, | ||
243 | }; | ||
244 | |||
245 | /* "mc" and its members are protected by cgroup_mutex */ | ||
246 | static struct move_charge_struct { | ||
247 | struct mem_cgroup *from; | ||
248 | struct mem_cgroup *to; | ||
249 | unsigned long precharge; | ||
250 | unsigned long moved_charge; | ||
251 | unsigned long moved_swap; | ||
252 | struct task_struct *moving_task; /* a task moving charges */ | ||
253 | wait_queue_head_t waitq; /* a waitq for other context */ | ||
254 | } mc = { | ||
255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | ||
232 | }; | 256 | }; |
233 | 257 | ||
234 | /* | 258 | /* |
@@ -371,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | |||
371 | spin_unlock(&mctz->lock); | 395 | spin_unlock(&mctz->lock); |
372 | } | 396 | } |
373 | 397 | ||
374 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
375 | { | ||
376 | bool ret = false; | ||
377 | int cpu; | ||
378 | s64 val; | ||
379 | struct mem_cgroup_stat_cpu *cpustat; | ||
380 | |||
381 | cpu = get_cpu(); | ||
382 | cpustat = &mem->stat.cpustat[cpu]; | ||
383 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
384 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
385 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
386 | ret = true; | ||
387 | } | ||
388 | put_cpu(); | ||
389 | return ret; | ||
390 | } | ||
391 | 398 | ||
392 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | 399 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) |
393 | { | 400 | { |
@@ -481,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
481 | return mz; | 488 | return mz; |
482 | } | 489 | } |
483 | 490 | ||
491 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | ||
492 | enum mem_cgroup_stat_index idx) | ||
493 | { | ||
494 | int cpu; | ||
495 | s64 val = 0; | ||
496 | |||
497 | for_each_possible_cpu(cpu) | ||
498 | val += per_cpu(mem->stat->count[idx], cpu); | ||
499 | return val; | ||
500 | } | ||
501 | |||
502 | static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) | ||
503 | { | ||
504 | s64 ret; | ||
505 | |||
506 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | ||
507 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | ||
508 | return ret; | ||
509 | } | ||
510 | |||
484 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 511 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, |
485 | bool charge) | 512 | bool charge) |
486 | { | 513 | { |
487 | int val = (charge) ? 1 : -1; | 514 | int val = (charge) ? 1 : -1; |
488 | struct mem_cgroup_stat *stat = &mem->stat; | 515 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
489 | struct mem_cgroup_stat_cpu *cpustat; | ||
490 | int cpu = get_cpu(); | ||
491 | |||
492 | cpustat = &stat->cpustat[cpu]; | ||
493 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
494 | put_cpu(); | ||
495 | } | 516 | } |
496 | 517 | ||
497 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 518 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
@@ -499,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
499 | bool charge) | 520 | bool charge) |
500 | { | 521 | { |
501 | int val = (charge) ? 1 : -1; | 522 | int val = (charge) ? 1 : -1; |
502 | struct mem_cgroup_stat *stat = &mem->stat; | ||
503 | struct mem_cgroup_stat_cpu *cpustat; | ||
504 | int cpu = get_cpu(); | ||
505 | 523 | ||
506 | cpustat = &stat->cpustat[cpu]; | 524 | preempt_disable(); |
525 | |||
507 | if (PageCgroupCache(pc)) | 526 | if (PageCgroupCache(pc)) |
508 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 527 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); |
509 | else | 528 | else |
510 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); | 529 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); |
511 | 530 | ||
512 | if (charge) | 531 | if (charge) |
513 | __mem_cgroup_stat_add_safe(cpustat, | 532 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); |
514 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | ||
515 | else | 533 | else |
516 | __mem_cgroup_stat_add_safe(cpustat, | 534 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); |
517 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 535 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); |
518 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | 536 | |
519 | put_cpu(); | 537 | preempt_enable(); |
520 | } | 538 | } |
521 | 539 | ||
522 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 540 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
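The statistics rework above replaces the hand-rolled per-CPU array (struct mem_cgroup_stat with cpustat[0]) by alloc_percpu() plus this_cpu_add()/__this_cpu_add(): writers only touch their own CPU's slot with preemption disabled, and readers sum the slot of every possible CPU. A small userspace analogue of that split, assuming a simulated CPU id instead of the real per-cpu API:

#include <stdio.h>

#define NR_CPUS 4
enum { STAT_CACHE, STAT_RSS, NSTATS };

/* one counter block per simulated CPU, like alloc_percpu() */
static long percpu_count[NR_CPUS][NSTATS];

/* write side: update only the local CPU's slot (cf. __this_cpu_add) */
static void stat_add(int cpu, int idx, long val)
{
	percpu_count[cpu][idx] += val;
}

/* read side: sum over all possible CPUs (cf. mem_cgroup_read_stat) */
static long stat_read(int idx)
{
	long sum = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		sum += percpu_count[cpu][idx];
	return sum;
}

int main(void)
{
	stat_add(0, STAT_RSS, 3);
	stat_add(2, STAT_RSS, 2);
	stat_add(1, STAT_CACHE, 5);
	printf("rss=%ld cache=%ld\n", stat_read(STAT_RSS), stat_read(STAT_CACHE));
	return 0;
}

The sum can be momentarily stale while another CPU updates its slot, which is the trade-off the kernel code accepts in exchange for cheap updates.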
@@ -534,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | |||
534 | return total; | 552 | return total; |
535 | } | 553 | } |
536 | 554 | ||
555 | static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | ||
556 | { | ||
557 | s64 val; | ||
558 | |||
559 | val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); | ||
560 | |||
561 | return !(val & ((1 << event_mask_shift) - 1)); | ||
562 | } | ||
563 | |||
564 | /* | ||
565 | * Check events in order. | ||
566 | * | ||
567 | */ | ||
568 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | ||
569 | { | ||
570 | /* threshold event is triggered in finer grain than soft limit */ | ||
571 | if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { | ||
572 | mem_cgroup_threshold(mem); | ||
573 | if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) | ||
574 | mem_cgroup_update_tree(mem, page); | ||
575 | } | ||
576 | } | ||
577 | |||
537 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 578 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
538 | { | 579 | { |
539 | return container_of(cgroup_subsys_state(cont, | 580 | return container_of(cgroup_subsys_state(cont, |
@@ -1000,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) | |||
1000 | } | 1041 | } |
1001 | 1042 | ||
1002 | /** | 1043 | /** |
1003 | * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. | 1044 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
1004 | * @memcg: The memory cgroup that went over limit | 1045 | * @memcg: The memory cgroup that went over limit |
1005 | * @p: Task that is going to be killed | 1046 | * @p: Task that is going to be killed |
1006 | * | 1047 | * |
@@ -1174,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1174 | } | 1215 | } |
1175 | } | 1216 | } |
1176 | } | 1217 | } |
1177 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1218 | if (!mem_cgroup_local_usage(victim)) { |
1178 | /* this cgroup's local usage == 0 */ | 1219 | /* this cgroup's local usage == 0 */ |
1179 | css_put(&victim->css); | 1220 | css_put(&victim->css); |
1180 | continue; | 1221 | continue; |
@@ -1205,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1205 | return total; | 1246 | return total; |
1206 | } | 1247 | } |
1207 | 1248 | ||
1208 | bool mem_cgroup_oom_called(struct task_struct *task) | 1249 | static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) |
1209 | { | 1250 | { |
1210 | bool ret = false; | 1251 | int *val = (int *)data; |
1211 | struct mem_cgroup *mem; | 1252 | int x; |
1212 | struct mm_struct *mm; | 1253 | /* |
1254 | * Logically, we can stop scanning immediately when we find | ||
1255 | * a memcg is already locked. But considering unlock ops and | ||
1256 | * creation/removal of memcg, scan-all is a simpler operation. | ||
1257 | */ | ||
1258 | x = atomic_inc_return(&mem->oom_lock); | ||
1259 | *val = max(x, *val); | ||
1260 | return 0; | ||
1261 | } | ||
1262 | /* | ||
1263 | * Check whether the OOM-Killer is already running under our hierarchy. | ||
1264 | * If someone is running, return false. | ||
1265 | */ | ||
1266 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | ||
1267 | { | ||
1268 | int lock_count = 0; | ||
1213 | 1269 | ||
1214 | rcu_read_lock(); | 1270 | mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); |
1215 | mm = task->mm; | 1271 | |
1216 | if (!mm) | 1272 | if (lock_count == 1) |
1217 | mm = &init_mm; | 1273 | return true; |
1218 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1274 | return false; |
1219 | if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | ||
1220 | ret = true; | ||
1221 | rcu_read_unlock(); | ||
1222 | return ret; | ||
1223 | } | 1275 | } |
1224 | 1276 | ||
1225 | static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | 1277 | static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) |
1226 | { | 1278 | { |
1227 | mem->last_oom_jiffies = jiffies; | 1279 | /* |
1280 | * When a new child is created while the hierarchy is under oom, | ||
1281 | * mem_cgroup_oom_lock() may not be called. We have to use | ||
1282 | * atomic_add_unless() here. | ||
1283 | */ | ||
1284 | atomic_add_unless(&mem->oom_lock, -1, 0); | ||
1228 | return 0; | 1285 | return 0; |
1229 | } | 1286 | } |
1230 | 1287 | ||
1231 | static void record_last_oom(struct mem_cgroup *mem) | 1288 | static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
1232 | { | 1289 | { |
1233 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | 1290 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); |
1291 | } | ||
1292 | |||
1293 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
1294 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | ||
1295 | |||
1296 | /* | ||
1297 | * Try to call the OOM killer. Returns false if we should exit the memory-reclaim loop. | ||
1298 | */ | ||
1299 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | ||
1300 | { | ||
1301 | DEFINE_WAIT(wait); | ||
1302 | bool locked; | ||
1303 | |||
1304 | /* At first, try to OOM lock hierarchy under mem.*/ | ||
1305 | mutex_lock(&memcg_oom_mutex); | ||
1306 | locked = mem_cgroup_oom_lock(mem); | ||
1307 | /* | ||
1308 | * Even if signal_pending(), we can't quit charge() loop without | ||
1309 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | ||
1310 | * under OOM is always welcomed, use TASK_KILLABLE here. | ||
1311 | */ | ||
1312 | if (!locked) | ||
1313 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | ||
1314 | mutex_unlock(&memcg_oom_mutex); | ||
1315 | |||
1316 | if (locked) | ||
1317 | mem_cgroup_out_of_memory(mem, mask); | ||
1318 | else { | ||
1319 | schedule(); | ||
1320 | finish_wait(&memcg_oom_waitq, &wait); | ||
1321 | } | ||
1322 | mutex_lock(&memcg_oom_mutex); | ||
1323 | mem_cgroup_oom_unlock(mem); | ||
1324 | /* | ||
1325 | * Here, we use a global waitq. Should we use a more fine-grained waitq? | ||
1326 | * Assume following hierarchy. | ||
1327 | * A/ | ||
1328 | * 01 | ||
1329 | * 02 | ||
1330 | * assume OOM happens both in A and 01 at the same time. They are | ||
1331 | * mutually exclusive by lock. (kill in 01 helps A.) | ||
1332 | * When we use per memcg waitq, we have to wake up waiters on A and 02 | ||
1333 | * in addition to waiters on 01. We use the global waitq to avoid that mess. | ||
1334 | * It will not be a big problem. | ||
1335 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
1336 | */ | ||
1337 | wake_up_all(&memcg_oom_waitq); | ||
1338 | mutex_unlock(&memcg_oom_mutex); | ||
1339 | |||
1340 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
1341 | return false; | ||
1342 | /* Give chance to dying process */ | ||
1343 | schedule_timeout(1); | ||
1344 | return true; | ||
1234 | } | 1345 | } |
1235 | 1346 | ||
1236 | /* | 1347 | /* |
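The OOM rework above replaces the last_oom_jiffies heuristic with a per-memcg oom_lock counter: taking the hierarchical lock increments the counter of every memcg in the subtree and succeeds only if the largest value seen is 1, i.e. nobody in the subtree already held it; unlocking decrements but never below zero, to cope with children created while the hierarchy was under OOM. A compact userspace sketch of just that counting rule (a flat array stands in for the hierarchy walk; illustrative only):

#include <stdio.h>

#define NR_MEMCG 3	/* pretend subtree: parent plus two children */
static int oom_lock[NR_MEMCG];

/* like mem_cgroup_oom_lock(): increment everybody, succeed only if max == 1 */
static int hierarchy_oom_lock(void)
{
	int i, max = 0;

	for (i = 0; i < NR_MEMCG; i++) {
		oom_lock[i]++;
		if (oom_lock[i] > max)
			max = oom_lock[i];
	}
	return max == 1;
}

/* like mem_cgroup_oom_unlock(): decrement, but never below zero */
static void hierarchy_oom_unlock(void)
{
	int i;

	for (i = 0; i < NR_MEMCG; i++)
		if (oom_lock[i] > 0)
			oom_lock[i]--;
}

int main(void)
{
	printf("first locker gets it: %d\n", hierarchy_oom_lock());	/* 1 */
	printf("second locker must wait: %d\n", hierarchy_oom_lock());	/* 0 */
	hierarchy_oom_unlock();
	hierarchy_oom_unlock();
	return 0;
}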
@@ -1240,9 +1351,6 @@ static void record_last_oom(struct mem_cgroup *mem) | |||
1240 | void mem_cgroup_update_file_mapped(struct page *page, int val) | 1351 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
1241 | { | 1352 | { |
1242 | struct mem_cgroup *mem; | 1353 | struct mem_cgroup *mem; |
1243 | struct mem_cgroup_stat *stat; | ||
1244 | struct mem_cgroup_stat_cpu *cpustat; | ||
1245 | int cpu; | ||
1246 | struct page_cgroup *pc; | 1354 | struct page_cgroup *pc; |
1247 | 1355 | ||
1248 | pc = lookup_page_cgroup(page); | 1356 | pc = lookup_page_cgroup(page); |
@@ -1258,13 +1366,10 @@ void mem_cgroup_update_file_mapped(struct page *page, int val) | |||
1258 | goto done; | 1366 | goto done; |
1259 | 1367 | ||
1260 | /* | 1368 | /* |
1261 | * Preemption is already disabled, we don't need get_cpu() | 1369 | * Preemption is already disabled. We can use __this_cpu_xxx |
1262 | */ | 1370 | */ |
1263 | cpu = smp_processor_id(); | 1371 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED], val); |
1264 | stat = &mem->stat; | ||
1265 | cpustat = &stat->cpustat[cpu]; | ||
1266 | 1372 | ||
1267 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val); | ||
1268 | done: | 1373 | done: |
1269 | unlock_page_cgroup(pc); | 1374 | unlock_page_cgroup(pc); |
1270 | } | 1375 | } |
@@ -1401,19 +1506,21 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | |||
1401 | * oom-killer can be invoked. | 1506 | * oom-killer can be invoked. |
1402 | */ | 1507 | */ |
1403 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1508 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1404 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1509 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
1405 | bool oom, struct page *page) | ||
1406 | { | 1510 | { |
1407 | struct mem_cgroup *mem, *mem_over_limit; | 1511 | struct mem_cgroup *mem, *mem_over_limit; |
1408 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1512 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1409 | struct res_counter *fail_res; | 1513 | struct res_counter *fail_res; |
1410 | int csize = CHARGE_SIZE; | 1514 | int csize = CHARGE_SIZE; |
1411 | 1515 | ||
1412 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1516 | /* |
1413 | /* Don't account this! */ | 1517 | * Unlike global-vm's OOM-kill, we're not in a memory shortage at |
1414 | *memcg = NULL; | 1518 | * the system level. So, allow a dying process to go ahead in addition |
1415 | return 0; | 1519 | * to MEMDIE processes. |
1416 | } | 1520 | */ |
1521 | if (unlikely(test_thread_flag(TIF_MEMDIE) | ||
1522 | || fatal_signal_pending(current))) | ||
1523 | goto bypass; | ||
1417 | 1524 | ||
1418 | /* | 1525 | /* |
1419 | * We always charge the cgroup the mm_struct belongs to. | 1526 | * We always charge the cgroup the mm_struct belongs to. |
@@ -1440,7 +1547,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1440 | unsigned long flags = 0; | 1547 | unsigned long flags = 0; |
1441 | 1548 | ||
1442 | if (consume_stock(mem)) | 1549 | if (consume_stock(mem)) |
1443 | goto charged; | 1550 | goto done; |
1444 | 1551 | ||
1445 | ret = res_counter_charge(&mem->res, csize, &fail_res); | 1552 | ret = res_counter_charge(&mem->res, csize, &fail_res); |
1446 | if (likely(!ret)) { | 1553 | if (likely(!ret)) { |
@@ -1483,28 +1590,70 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1483 | if (mem_cgroup_check_under_limit(mem_over_limit)) | 1590 | if (mem_cgroup_check_under_limit(mem_over_limit)) |
1484 | continue; | 1591 | continue; |
1485 | 1592 | ||
1593 | /* try to avoid oom while someone is moving charge */ | ||
1594 | if (mc.moving_task && current != mc.moving_task) { | ||
1595 | struct mem_cgroup *from, *to; | ||
1596 | bool do_continue = false; | ||
1597 | /* | ||
1598 | * There is a small race that "from" or "to" can be | ||
1599 | * freed by rmdir, so we use css_tryget(). | ||
1600 | */ | ||
1601 | rcu_read_lock(); | ||
1602 | from = mc.from; | ||
1603 | to = mc.to; | ||
1604 | if (from && css_tryget(&from->css)) { | ||
1605 | if (mem_over_limit->use_hierarchy) | ||
1606 | do_continue = css_is_ancestor( | ||
1607 | &from->css, | ||
1608 | &mem_over_limit->css); | ||
1609 | else | ||
1610 | do_continue = (from == mem_over_limit); | ||
1611 | css_put(&from->css); | ||
1612 | } | ||
1613 | if (!do_continue && to && css_tryget(&to->css)) { | ||
1614 | if (mem_over_limit->use_hierarchy) | ||
1615 | do_continue = css_is_ancestor( | ||
1616 | &to->css, | ||
1617 | &mem_over_limit->css); | ||
1618 | else | ||
1619 | do_continue = (to == mem_over_limit); | ||
1620 | css_put(&to->css); | ||
1621 | } | ||
1622 | rcu_read_unlock(); | ||
1623 | if (do_continue) { | ||
1624 | DEFINE_WAIT(wait); | ||
1625 | prepare_to_wait(&mc.waitq, &wait, | ||
1626 | TASK_INTERRUPTIBLE); | ||
1627 | /* moving charge context might have finished. */ | ||
1628 | if (mc.moving_task) | ||
1629 | schedule(); | ||
1630 | finish_wait(&mc.waitq, &wait); | ||
1631 | continue; | ||
1632 | } | ||
1633 | } | ||
1634 | |||
1486 | if (!nr_retries--) { | 1635 | if (!nr_retries--) { |
1487 | if (oom) { | 1636 | if (!oom) |
1488 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1637 | goto nomem; |
1489 | record_last_oom(mem_over_limit); | 1638 | if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { |
1639 | nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
1640 | continue; | ||
1490 | } | 1641 | } |
1491 | goto nomem; | 1642 | /* When we reach here, the current task is dying. */ |
1643 | css_put(&mem->css); | ||
1644 | goto bypass; | ||
1492 | } | 1645 | } |
1493 | } | 1646 | } |
1494 | if (csize > PAGE_SIZE) | 1647 | if (csize > PAGE_SIZE) |
1495 | refill_stock(mem, csize - PAGE_SIZE); | 1648 | refill_stock(mem, csize - PAGE_SIZE); |
1496 | charged: | ||
1497 | /* | ||
1498 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
1499 | * if they exceeds softlimit. | ||
1500 | */ | ||
1501 | if (mem_cgroup_soft_limit_check(mem)) | ||
1502 | mem_cgroup_update_tree(mem, page); | ||
1503 | done: | 1649 | done: |
1504 | return 0; | 1650 | return 0; |
1505 | nomem: | 1651 | nomem: |
1506 | css_put(&mem->css); | 1652 | css_put(&mem->css); |
1507 | return -ENOMEM; | 1653 | return -ENOMEM; |
1654 | bypass: | ||
1655 | *memcg = NULL; | ||
1656 | return 0; | ||
1508 | } | 1657 | } |
1509 | 1658 | ||
1510 | /* | 1659 | /* |
@@ -1512,14 +1661,23 @@ nomem: | |||
1512 | * This function is for that and do uncharge, put css's refcnt. | 1661 | * This function is for that and do uncharge, put css's refcnt. |
1513 | * gotten by try_charge(). | 1662 | * gotten by try_charge(). |
1514 | */ | 1663 | */ |
1515 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | 1664 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, |
1665 | unsigned long count) | ||
1516 | { | 1666 | { |
1517 | if (!mem_cgroup_is_root(mem)) { | 1667 | if (!mem_cgroup_is_root(mem)) { |
1518 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1668 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); |
1519 | if (do_swap_account) | 1669 | if (do_swap_account) |
1520 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1670 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); |
1671 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
1672 | WARN_ON_ONCE(count > INT_MAX); | ||
1673 | __css_put(&mem->css, (int)count); | ||
1521 | } | 1674 | } |
1522 | css_put(&mem->css); | 1675 | /* we don't need css_put for root */ |
1676 | } | ||
1677 | |||
1678 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
1679 | { | ||
1680 | __mem_cgroup_cancel_charge(mem, 1); | ||
1523 | } | 1681 | } |
1524 | 1682 | ||
1525 | /* | 1683 | /* |
@@ -1615,6 +1773,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1615 | mem_cgroup_charge_statistics(mem, pc, true); | 1773 | mem_cgroup_charge_statistics(mem, pc, true); |
1616 | 1774 | ||
1617 | unlock_page_cgroup(pc); | 1775 | unlock_page_cgroup(pc); |
1776 | /* | ||
1777 | * "charge_statistics" updated event counter. Then, check it. | ||
1778 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
1779 | * if they exceeds softlimit. | ||
1780 | */ | ||
1781 | memcg_check_events(mem, pc->page); | ||
1618 | } | 1782 | } |
1619 | 1783 | ||
1620 | /** | 1784 | /** |
@@ -1622,22 +1786,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1622 | * @pc: page_cgroup of the page. | 1786 | * @pc: page_cgroup of the page. |
1623 | * @from: mem_cgroup which the page is moved from. | 1787 | * @from: mem_cgroup which the page is moved from. |
1624 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1788 | * @to: mem_cgroup which the page is moved to. @from != @to. |
1789 | * @uncharge: whether we should call uncharge and css_put against @from. | ||
1625 | * | 1790 | * |
1626 | * The caller must confirm following. | 1791 | * The caller must confirm following. |
1627 | * - page is not on LRU (isolate_page() is useful.) | 1792 | * - page is not on LRU (isolate_page() is useful.) |
1628 | * - the pc is locked, used, and ->mem_cgroup points to @from. | 1793 | * - the pc is locked, used, and ->mem_cgroup points to @from. |
1629 | * | 1794 | * |
1630 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1795 | * This function doesn't do "charge" nor css_get to new cgroup. It should be |
1631 | * new cgroup. It should be done by a caller. | 1796 | * done by a caller (__mem_cgroup_try_charge would be useful). If @uncharge is |
1797 | * true, this function does "uncharge" from old cgroup, but it doesn't if | ||
1798 | * @uncharge is false, so a caller should do "uncharge". | ||
1632 | */ | 1799 | */ |
1633 | 1800 | ||
1634 | static void __mem_cgroup_move_account(struct page_cgroup *pc, | 1801 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
1635 | struct mem_cgroup *from, struct mem_cgroup *to) | 1802 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
1636 | { | 1803 | { |
1637 | struct page *page; | 1804 | struct page *page; |
1638 | int cpu; | ||
1639 | struct mem_cgroup_stat *stat; | ||
1640 | struct mem_cgroup_stat_cpu *cpustat; | ||
1641 | 1805 | ||
1642 | VM_BUG_ON(from == to); | 1806 | VM_BUG_ON(from == to); |
1643 | VM_BUG_ON(PageLRU(pc->page)); | 1807 | VM_BUG_ON(PageLRU(pc->page)); |
@@ -1645,38 +1809,28 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
1645 | VM_BUG_ON(!PageCgroupUsed(pc)); | 1809 | VM_BUG_ON(!PageCgroupUsed(pc)); |
1646 | VM_BUG_ON(pc->mem_cgroup != from); | 1810 | VM_BUG_ON(pc->mem_cgroup != from); |
1647 | 1811 | ||
1648 | if (!mem_cgroup_is_root(from)) | ||
1649 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
1650 | mem_cgroup_charge_statistics(from, pc, false); | ||
1651 | |||
1652 | page = pc->page; | 1812 | page = pc->page; |
1653 | if (page_mapped(page) && !PageAnon(page)) { | 1813 | if (page_mapped(page) && !PageAnon(page)) { |
1654 | cpu = smp_processor_id(); | 1814 | /* Update mapped_file data for mem_cgroup */ |
1655 | /* Update mapped_file data for mem_cgroup "from" */ | 1815 | preempt_disable(); |
1656 | stat = &from->stat; | 1816 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1657 | cpustat = &stat->cpustat[cpu]; | 1817 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1658 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, | 1818 | preempt_enable(); |
1659 | -1); | ||
1660 | |||
1661 | /* Update mapped_file data for mem_cgroup "to" */ | ||
1662 | stat = &to->stat; | ||
1663 | cpustat = &stat->cpustat[cpu]; | ||
1664 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, | ||
1665 | 1); | ||
1666 | } | 1819 | } |
1820 | mem_cgroup_charge_statistics(from, pc, false); | ||
1821 | if (uncharge) | ||
1822 | /* This is not "cancel", but cancel_charge does all we need. */ | ||
1823 | mem_cgroup_cancel_charge(from); | ||
1667 | 1824 | ||
1668 | if (do_swap_account && !mem_cgroup_is_root(from)) | 1825 | /* caller should have done css_get */ |
1669 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
1670 | css_put(&from->css); | ||
1671 | |||
1672 | css_get(&to->css); | ||
1673 | pc->mem_cgroup = to; | 1826 | pc->mem_cgroup = to; |
1674 | mem_cgroup_charge_statistics(to, pc, true); | 1827 | mem_cgroup_charge_statistics(to, pc, true); |
1675 | /* | 1828 | /* |
1676 | * We charges against "to" which may not have any tasks. Then, "to" | 1829 | * We charges against "to" which may not have any tasks. Then, "to" |
1677 | * can be under rmdir(). But in current implementation, caller of | 1830 | * can be under rmdir(). But in current implementation, caller of |
1678 | * this function is just force_empty() and it's garanteed that | 1831 | * this function is just force_empty() and move charge, so it's |
1679 | * "to" is never removed. So, we don't check rmdir status here. | 1832 | * garanteed that "to" is never removed. So, we don't check rmdir |
1833 | * status here. | ||
1680 | */ | 1834 | */ |
1681 | } | 1835 | } |
1682 | 1836 | ||
@@ -1685,15 +1839,20 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, | |||
1685 | * __mem_cgroup_move_account() | 1839 | * __mem_cgroup_move_account() |
1686 | */ | 1840 | */ |
1687 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1841 | static int mem_cgroup_move_account(struct page_cgroup *pc, |
1688 | struct mem_cgroup *from, struct mem_cgroup *to) | 1842 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
1689 | { | 1843 | { |
1690 | int ret = -EINVAL; | 1844 | int ret = -EINVAL; |
1691 | lock_page_cgroup(pc); | 1845 | lock_page_cgroup(pc); |
1692 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | 1846 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { |
1693 | __mem_cgroup_move_account(pc, from, to); | 1847 | __mem_cgroup_move_account(pc, from, to, uncharge); |
1694 | ret = 0; | 1848 | ret = 0; |
1695 | } | 1849 | } |
1696 | unlock_page_cgroup(pc); | 1850 | unlock_page_cgroup(pc); |
1851 | /* | ||
1852 | * check events | ||
1853 | */ | ||
1854 | memcg_check_events(to, pc->page); | ||
1855 | memcg_check_events(from, pc->page); | ||
1697 | return ret; | 1856 | return ret; |
1698 | } | 1857 | } |
1699 | 1858 | ||
@@ -1722,15 +1881,13 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1722 | goto put; | 1881 | goto put; |
1723 | 1882 | ||
1724 | parent = mem_cgroup_from_cont(pcg); | 1883 | parent = mem_cgroup_from_cont(pcg); |
1725 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | 1884 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); |
1726 | if (ret || !parent) | 1885 | if (ret || !parent) |
1727 | goto put_back; | 1886 | goto put_back; |
1728 | 1887 | ||
1729 | ret = mem_cgroup_move_account(pc, child, parent); | 1888 | ret = mem_cgroup_move_account(pc, child, parent, true); |
1730 | if (!ret) | 1889 | if (ret) |
1731 | css_put(&parent->css); /* drop extra refcnt by try_charge() */ | 1890 | mem_cgroup_cancel_charge(parent); |
1732 | else | ||
1733 | mem_cgroup_cancel_charge(parent); /* does css_put */ | ||
1734 | put_back: | 1891 | put_back: |
1735 | putback_lru_page(page); | 1892 | putback_lru_page(page); |
1736 | put: | 1893 | put: |
@@ -1760,7 +1917,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
1760 | prefetchw(pc); | 1917 | prefetchw(pc); |
1761 | 1918 | ||
1762 | mem = memcg; | 1919 | mem = memcg; |
1763 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); | 1920 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
1764 | if (ret || !mem) | 1921 | if (ret || !mem) |
1765 | return ret; | 1922 | return ret; |
1766 | 1923 | ||
@@ -1880,14 +2037,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1880 | if (!mem) | 2037 | if (!mem) |
1881 | goto charge_cur_mm; | 2038 | goto charge_cur_mm; |
1882 | *ptr = mem; | 2039 | *ptr = mem; |
1883 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); | 2040 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); |
1884 | /* drop extra refcnt from tryget */ | 2041 | /* drop extra refcnt from tryget */ |
1885 | css_put(&mem->css); | 2042 | css_put(&mem->css); |
1886 | return ret; | 2043 | return ret; |
1887 | charge_cur_mm: | 2044 | charge_cur_mm: |
1888 | if (unlikely(!mm)) | 2045 | if (unlikely(!mm)) |
1889 | mm = &init_mm; | 2046 | mm = &init_mm; |
1890 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); | 2047 | return __mem_cgroup_try_charge(mm, mask, ptr, true); |
1891 | } | 2048 | } |
1892 | 2049 | ||
1893 | static void | 2050 | static void |
@@ -2064,8 +2221,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2064 | mz = page_cgroup_zoneinfo(pc); | 2221 | mz = page_cgroup_zoneinfo(pc); |
2065 | unlock_page_cgroup(pc); | 2222 | unlock_page_cgroup(pc); |
2066 | 2223 | ||
2067 | if (mem_cgroup_soft_limit_check(mem)) | 2224 | memcg_check_events(mem, page); |
2068 | mem_cgroup_update_tree(mem, page); | ||
2069 | /* at swapout, this memcg will be accessed to record to swap */ | 2225 | /* at swapout, this memcg will be accessed to record to swap */ |
2070 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2226 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2071 | css_put(&mem->css); | 2227 | css_put(&mem->css); |
@@ -2192,6 +2348,64 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
2192 | } | 2348 | } |
2193 | rcu_read_unlock(); | 2349 | rcu_read_unlock(); |
2194 | } | 2350 | } |
2351 | |||
2352 | /** | ||
2353 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | ||
2354 | * @entry: swap entry to be moved | ||
2355 | * @from: mem_cgroup which the entry is moved from | ||
2356 | * @to: mem_cgroup which the entry is moved to | ||
2357 | * @need_fixup: whether we should fixup res_counters and refcounts. | ||
2358 | * | ||
2359 | * It succeeds only when the swap_cgroup's record for this entry is the same | ||
2360 | * as the mem_cgroup's id of @from. | ||
2361 | * | ||
2362 | * Returns 0 on success, -EINVAL on failure. | ||
2363 | * | ||
2364 | * The caller must have charged to @to, IOW, called res_counter_charge() about | ||
2365 | * both res and memsw, and called css_get(). | ||
2366 | */ | ||
2367 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2368 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2369 | { | ||
2370 | unsigned short old_id, new_id; | ||
2371 | |||
2372 | old_id = css_id(&from->css); | ||
2373 | new_id = css_id(&to->css); | ||
2374 | |||
2375 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | ||
2376 | mem_cgroup_swap_statistics(from, false); | ||
2377 | mem_cgroup_swap_statistics(to, true); | ||
2378 | /* | ||
2379 | * This function is only called from task migration context now. | ||
2380 | * It postpones res_counter and refcount handling till the end | ||
2381 | * of task migration(mem_cgroup_clear_mc()) for performance | ||
2382 | * improvement. But we cannot postpone mem_cgroup_get(to) | ||
2383 | * because if the process that has been moved to @to does | ||
2384 | * swap-in, the refcount of @to might be decreased to 0. | ||
2385 | */ | ||
2386 | mem_cgroup_get(to); | ||
2387 | if (need_fixup) { | ||
2388 | if (!mem_cgroup_is_root(from)) | ||
2389 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
2390 | mem_cgroup_put(from); | ||
2391 | /* | ||
2392 | * we charged both to->res and to->memsw, so we should | ||
2393 | * uncharge to->res. | ||
2394 | */ | ||
2395 | if (!mem_cgroup_is_root(to)) | ||
2396 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
2397 | css_put(&to->css); | ||
2398 | } | ||
2399 | return 0; | ||
2400 | } | ||
2401 | return -EINVAL; | ||
2402 | } | ||
2403 | #else | ||
2404 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2405 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2406 | { | ||
2407 | return -EINVAL; | ||
2408 | } | ||
2195 | #endif | 2409 | #endif |
2196 | 2410 | ||
2197 | /* | 2411 | /* |
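mem_cgroup_move_swap_account() hinges on swap_cgroup_cmpxchg(): the swap entry's owner id is rewritten from "from" to "to" only if it still reads as "from", so a concurrent swap-in or uncharge that already changed the record makes the move fail cleanly with -EINVAL and no fixups are applied. A userspace sketch of that compare-and-exchange rule using C11 atomics (the record layout and ids are invented for illustration):

#include <stdatomic.h>
#include <stdio.h>

/* stand-in for one swap_cgroup record: the css id of the owning memcg */
static _Atomic unsigned short swap_record = 5;	/* currently owned by id 5 */

/* move ownership only if the record still names old_id; 0 on success */
static int move_swap_account(unsigned short old_id, unsigned short new_id)
{
	unsigned short expected = old_id;

	if (atomic_compare_exchange_strong(&swap_record, &expected, new_id))
		return 0;	/* record updated; charge/refcount fixups may follow */
	return -1;		/* somebody changed the record first, like -EINVAL */
}

int main(void)
{
	printf("move 5 -> 7: %d\n", move_swap_account(5, 7));	/* succeeds */
	printf("move 5 -> 9: %d\n", move_swap_account(5, 9));	/* fails: owner is 7 */
	return 0;
}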
@@ -2216,8 +2430,7 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
2216 | unlock_page_cgroup(pc); | 2430 | unlock_page_cgroup(pc); |
2217 | 2431 | ||
2218 | if (mem) { | 2432 | if (mem) { |
2219 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, | 2433 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); |
2220 | page); | ||
2221 | css_put(&mem->css); | 2434 | css_put(&mem->css); |
2222 | } | 2435 | } |
2223 | *ptr = mem; | 2436 | *ptr = mem; |
@@ -2704,7 +2917,7 @@ static int | |||
2704 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | 2917 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) |
2705 | { | 2918 | { |
2706 | struct mem_cgroup_idx_data *d = data; | 2919 | struct mem_cgroup_idx_data *d = data; |
2707 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | 2920 | d->val += mem_cgroup_read_stat(mem, d->idx); |
2708 | return 0; | 2921 | return 0; |
2709 | } | 2922 | } |
2710 | 2923 | ||
@@ -2719,40 +2932,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | |||
2719 | *val = d.val; | 2932 | *val = d.val; |
2720 | } | 2933 | } |
2721 | 2934 | ||
2935 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | ||
2936 | { | ||
2937 | u64 idx_val, val; | ||
2938 | |||
2939 | if (!mem_cgroup_is_root(mem)) { | ||
2940 | if (!swap) | ||
2941 | return res_counter_read_u64(&mem->res, RES_USAGE); | ||
2942 | else | ||
2943 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | ||
2944 | } | ||
2945 | |||
2946 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2947 | val = idx_val; | ||
2948 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); | ||
2949 | val += idx_val; | ||
2950 | |||
2951 | if (swap) { | ||
2952 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2953 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2954 | val += idx_val; | ||
2955 | } | ||
2956 | |||
2957 | return val << PAGE_SHIFT; | ||
2958 | } | ||
2959 | |||
2722 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2960 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
2723 | { | 2961 | { |
2724 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2962 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2725 | u64 idx_val, val; | 2963 | u64 val; |
2726 | int type, name; | 2964 | int type, name; |
2727 | 2965 | ||
2728 | type = MEMFILE_TYPE(cft->private); | 2966 | type = MEMFILE_TYPE(cft->private); |
2729 | name = MEMFILE_ATTR(cft->private); | 2967 | name = MEMFILE_ATTR(cft->private); |
2730 | switch (type) { | 2968 | switch (type) { |
2731 | case _MEM: | 2969 | case _MEM: |
2732 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2970 | if (name == RES_USAGE) |
2733 | mem_cgroup_get_recursive_idx_stat(mem, | 2971 | val = mem_cgroup_usage(mem, false); |
2734 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2972 | else |
2735 | val = idx_val; | ||
2736 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2737 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2738 | val += idx_val; | ||
2739 | val <<= PAGE_SHIFT; | ||
2740 | } else | ||
2741 | val = res_counter_read_u64(&mem->res, name); | 2973 | val = res_counter_read_u64(&mem->res, name); |
2742 | break; | 2974 | break; |
2743 | case _MEMSWAP: | 2975 | case _MEMSWAP: |
2744 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2976 | if (name == RES_USAGE) |
2745 | mem_cgroup_get_recursive_idx_stat(mem, | 2977 | val = mem_cgroup_usage(mem, true); |
2746 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2978 | else |
2747 | val = idx_val; | ||
2748 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2749 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2750 | val += idx_val; | ||
2751 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2752 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2753 | val += idx_val; | ||
2754 | val <<= PAGE_SHIFT; | ||
2755 | } else | ||
2756 | val = res_counter_read_u64(&mem->memsw, name); | 2979 | val = res_counter_read_u64(&mem->memsw, name); |
2757 | break; | 2980 | break; |
2758 | default: | 2981 | default: |
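mem_cgroup_usage() folds the two formerly duplicated RES_USAGE branches into one helper: a non-root group simply reads its res_counter, while the root group (which no longer charges the res_counter) derives usage from the recursive CACHE + RSS page counts, adding SWAPOUT for mem+swap, shifted left by PAGE_SHIFT. A tiny sketch of that arithmetic with made-up page counts:

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB pages assumed for the example */

static unsigned long long root_usage_bytes(long long cache, long long rss,
					   long long swapout, int swap)
{
	long long pages = cache + rss;

	if (swap)
		pages += swapout;
	return (unsigned long long)pages << PAGE_SHIFT;
}

int main(void)
{
	/* 300 cache + 200 rss pages -> 500 * 4096 = 2048000 bytes */
	printf("usage: %llu\n", root_usage_bytes(300, 200, 50, 0));
	/* plus 50 swapped-out pages -> 550 * 4096 = 2252800 bytes */
	printf("memsw usage: %llu\n", root_usage_bytes(300, 200, 50, 1));
	return 0;
}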
@@ -2865,6 +3088,39 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2865 | return 0; | 3088 | return 0; |
2866 | } | 3089 | } |
2867 | 3090 | ||
3091 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | ||
3092 | struct cftype *cft) | ||
3093 | { | ||
3094 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | ||
3095 | } | ||
3096 | |||
3097 | #ifdef CONFIG_MMU | ||
3098 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3099 | struct cftype *cft, u64 val) | ||
3100 | { | ||
3101 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3102 | |||
3103 | if (val >= (1 << NR_MOVE_TYPE)) | ||
3104 | return -EINVAL; | ||
3105 | /* | ||
3106 | * We check this value several times, both in can_attach() and | ||
3107 | * attach(), so we need cgroup lock to prevent this value from being | ||
3108 | * inconsistent. | ||
3109 | */ | ||
3110 | cgroup_lock(); | ||
3111 | mem->move_charge_at_immigrate = val; | ||
3112 | cgroup_unlock(); | ||
3113 | |||
3114 | return 0; | ||
3115 | } | ||
3116 | #else | ||
3117 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3118 | struct cftype *cft, u64 val) | ||
3119 | { | ||
3120 | return -ENOSYS; | ||
3121 | } | ||
3122 | #endif | ||
3123 | |||
2868 | 3124 | ||
2869 | /* For read statistics */ | 3125 | /* For read statistics */ |
2870 | enum { | 3126 | enum { |
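move_charge_at_immigrate is a left-shifted bitmap of enum move_type bits, so the write handler rejects any value with bits at or above NR_MOVE_TYPE, and later code only has to test individual bits. A small sketch of that validation (the helper names are made up; only the bit layout mirrors the code above):

#include <stdio.h>

enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* bit 0: private anonymous pages and their swap */
	NR_MOVE_TYPE,
};

static unsigned long move_charge_at_immigrate;

static int move_charge_write(unsigned long long val)
{
	if (val >= (1 << NR_MOVE_TYPE))
		return -1;	/* like -EINVAL: unknown bits set */
	move_charge_at_immigrate = val;
	return 0;
}

int main(void)
{
	printf("write 1: %d\n", move_charge_write(1));	/* ok, ANON bit only */
	printf("write 4: %d\n", move_charge_write(4));	/* rejected */
	printf("anon move enabled: %d\n",
	       !!(move_charge_at_immigrate & (1 << MOVE_CHARGE_TYPE_ANON)));
	return 0;
}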
@@ -2910,18 +3166,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2910 | s64 val; | 3166 | s64 val; |
2911 | 3167 | ||
2912 | /* per cpu stat */ | 3168 | /* per cpu stat */ |
2913 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); | 3169 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
2914 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 3170 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
2915 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 3171 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
2916 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3172 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
2917 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED); | 3173 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
2918 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; | 3174 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
2919 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 3175 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); |
2920 | s->stat[MCS_PGPGIN] += val; | 3176 | s->stat[MCS_PGPGIN] += val; |
2921 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 3177 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
2922 | s->stat[MCS_PGPGOUT] += val; | 3178 | s->stat[MCS_PGPGOUT] += val; |
2923 | if (do_swap_account) { | 3179 | if (do_swap_account) { |
2924 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | 3180 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
2925 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 3181 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
2926 | } | 3182 | } |
2927 | 3183 | ||
@@ -3049,12 +3305,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
3049 | return 0; | 3305 | return 0; |
3050 | } | 3306 | } |
3051 | 3307 | ||
3308 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | ||
3309 | { | ||
3310 | struct mem_cgroup_threshold_ary *t; | ||
3311 | u64 usage; | ||
3312 | int i; | ||
3313 | |||
3314 | rcu_read_lock(); | ||
3315 | if (!swap) | ||
3316 | t = rcu_dereference(memcg->thresholds); | ||
3317 | else | ||
3318 | t = rcu_dereference(memcg->memsw_thresholds); | ||
3319 | |||
3320 | if (!t) | ||
3321 | goto unlock; | ||
3322 | |||
3323 | usage = mem_cgroup_usage(memcg, swap); | ||
3324 | |||
3325 | /* | ||
3326 | * current_threshold points to threshold just below usage. | ||
3327 | * If it's not true, a threshold was crossed after last | ||
3328 | * call of __mem_cgroup_threshold(). | ||
3329 | */ | ||
3330 | i = atomic_read(&t->current_threshold); | ||
3331 | |||
3332 | /* | ||
3333 | * Iterate backward over array of thresholds starting from | ||
3334 | * current_threshold and check if a threshold is crossed. | ||
3335 | * If none of thresholds below usage is crossed, we read | ||
3336 | * only one element of the array here. | ||
3337 | */ | ||
3338 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | ||
3339 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3340 | |||
3341 | /* i = current_threshold + 1 */ | ||
3342 | i++; | ||
3343 | |||
3344 | /* | ||
3345 | * Iterate forward over array of thresholds starting from | ||
3346 | * current_threshold+1 and check if a threshold is crossed. | ||
3347 | * If none of thresholds above usage is crossed, we read | ||
3348 | * only one element of the array here. | ||
3349 | */ | ||
3350 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | ||
3351 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3352 | |||
3353 | /* Update current_threshold */ | ||
3354 | atomic_set(&t->current_threshold, i - 1); | ||
3355 | unlock: | ||
3356 | rcu_read_unlock(); | ||
3357 | } | ||
3358 | |||
3359 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | ||
3360 | { | ||
3361 | __mem_cgroup_threshold(memcg, false); | ||
3362 | if (do_swap_account) | ||
3363 | __mem_cgroup_threshold(memcg, true); | ||
3364 | } | ||
3365 | |||
3366 | static int compare_thresholds(const void *a, const void *b) | ||
3367 | { | ||
3368 | const struct mem_cgroup_threshold *_a = a; | ||
3369 | const struct mem_cgroup_threshold *_b = b; | ||
3370 | |||
3371 | return _a->threshold - _b->threshold; | ||
3372 | } | ||
3373 | |||
3374 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | ||
3375 | struct eventfd_ctx *eventfd, const char *args) | ||
3376 | { | ||
3377 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3378 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3379 | int type = MEMFILE_TYPE(cft->private); | ||
3380 | u64 threshold, usage; | ||
3381 | int size; | ||
3382 | int i, ret; | ||
3383 | |||
3384 | ret = res_counter_memparse_write_strategy(args, &threshold); | ||
3385 | if (ret) | ||
3386 | return ret; | ||
3387 | |||
3388 | mutex_lock(&memcg->thresholds_lock); | ||
3389 | if (type == _MEM) | ||
3390 | thresholds = memcg->thresholds; | ||
3391 | else if (type == _MEMSWAP) | ||
3392 | thresholds = memcg->memsw_thresholds; | ||
3393 | else | ||
3394 | BUG(); | ||
3395 | |||
3396 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3397 | |||
3398 | /* Check if a threshold crossed before adding a new one */ | ||
3399 | if (thresholds) | ||
3400 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3401 | |||
3402 | if (thresholds) | ||
3403 | size = thresholds->size + 1; | ||
3404 | else | ||
3405 | size = 1; | ||
3406 | |||
3407 | /* Allocate memory for new array of thresholds */ | ||
3408 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3409 | size * sizeof(struct mem_cgroup_threshold), | ||
3410 | GFP_KERNEL); | ||
3411 | if (!thresholds_new) { | ||
3412 | ret = -ENOMEM; | ||
3413 | goto unlock; | ||
3414 | } | ||
3415 | thresholds_new->size = size; | ||
3416 | |||
3417 | /* Copy thresholds (if any) to new array */ | ||
3418 | if (thresholds) | ||
3419 | memcpy(thresholds_new->entries, thresholds->entries, | ||
3420 | thresholds->size * | ||
3421 | sizeof(struct mem_cgroup_threshold)); | ||
3422 | /* Add new threshold */ | ||
3423 | thresholds_new->entries[size - 1].eventfd = eventfd; | ||
3424 | thresholds_new->entries[size - 1].threshold = threshold; | ||
3425 | |||
3426 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | ||
3427 | sort(thresholds_new->entries, size, | ||
3428 | sizeof(struct mem_cgroup_threshold), | ||
3429 | compare_thresholds, NULL); | ||
3430 | |||
3431 | /* Find current threshold */ | ||
3432 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3433 | for (i = 0; i < size; i++) { | ||
3434 | if (thresholds_new->entries[i].threshold < usage) { | ||
3435 | /* | ||
3436 | * thresholds_new->current_threshold will not be used | ||
3437 | * until rcu_assign_pointer(), so it's safe to increment | ||
3438 | * it here. | ||
3439 | */ | ||
3440 | atomic_inc(&thresholds_new->current_threshold); | ||
3441 | } | ||
3442 | } | ||
3443 | |||
3444 | if (type == _MEM) | ||
3445 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3446 | else | ||
3447 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3448 | |||
3449 | /* To be sure that nobody uses thresholds before freeing it */ | ||
3450 | synchronize_rcu(); | ||
3451 | |||
3452 | kfree(thresholds); | ||
3453 | unlock: | ||
3454 | mutex_unlock(&memcg->thresholds_lock); | ||
3455 | |||
3456 | return ret; | ||
3457 | } | ||
3458 | |||
3459 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | ||
3460 | struct eventfd_ctx *eventfd) | ||
3461 | { | ||
3462 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3463 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3464 | int type = MEMFILE_TYPE(cft->private); | ||
3465 | u64 usage; | ||
3466 | int size = 0; | ||
3467 | int i, j, ret; | ||
3468 | |||
3469 | mutex_lock(&memcg->thresholds_lock); | ||
3470 | if (type == _MEM) | ||
3471 | thresholds = memcg->thresholds; | ||
3472 | else if (type == _MEMSWAP) | ||
3473 | thresholds = memcg->memsw_thresholds; | ||
3474 | else | ||
3475 | BUG(); | ||
3476 | |||
3477 | /* | ||
3478 | * Something went wrong if we are trying to unregister a threshold | ||
3479 | * when we don't have any thresholds. | ||
3480 | */ | ||
3481 | BUG_ON(!thresholds); | ||
3482 | |||
3483 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3484 | |||
3485 | /* Check if a threshold crossed before removing */ | ||
3486 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3487 | |||
3489 | /* Calculate the new number of thresholds */ | ||
3489 | for (i = 0; i < thresholds->size; i++) { | ||
3490 | if (thresholds->entries[i].eventfd != eventfd) | ||
3491 | size++; | ||
3492 | } | ||
3493 | |||
3494 | /* Set thresholds array to NULL if we don't have thresholds */ | ||
3495 | if (!size) { | ||
3496 | thresholds_new = NULL; | ||
3497 | goto assign; | ||
3498 | } | ||
3499 | |||
3500 | /* Allocate memory for new array of thresholds */ | ||
3501 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3502 | size * sizeof(struct mem_cgroup_threshold), | ||
3503 | GFP_KERNEL); | ||
3504 | if (!thresholds_new) { | ||
3505 | ret = -ENOMEM; | ||
3506 | goto unlock; | ||
3507 | } | ||
3508 | thresholds_new->size = size; | ||
3509 | |||
3510 | /* Copy thresholds and find current threshold */ | ||
3511 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3512 | for (i = 0, j = 0; i < thresholds->size; i++) { | ||
3513 | if (thresholds->entries[i].eventfd == eventfd) | ||
3514 | continue; | ||
3515 | |||
3516 | thresholds_new->entries[j] = thresholds->entries[i]; | ||
3517 | if (thresholds_new->entries[j].threshold < usage) { | ||
3518 | /* | ||
3519 | * thresholds_new->current_threshold will not be used | ||
3520 | * until rcu_assign_pointer(), so it's safe to increment | ||
3521 | * it here. | ||
3522 | */ | ||
3523 | atomic_inc(&thresholds_new->current_threshold); | ||
3524 | } | ||
3525 | j++; | ||
3526 | } | ||
3527 | |||
3528 | assign: | ||
3529 | if (type == _MEM) | ||
3530 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3531 | else | ||
3532 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3533 | |||
3534 | /* To be sure that nobody uses thresholds before freeing it */ | ||
3535 | synchronize_rcu(); | ||
3536 | |||
3537 | kfree(thresholds); | ||
3538 | unlock: | ||
3539 | mutex_unlock(&memcg->thresholds_lock); | ||
3540 | |||
3541 | return ret; | ||
3542 | } | ||
3052 | 3543 | ||
3053 | static struct cftype mem_cgroup_files[] = { | 3544 | static struct cftype mem_cgroup_files[] = { |
3054 | { | 3545 | { |
3055 | .name = "usage_in_bytes", | 3546 | .name = "usage_in_bytes", |
3056 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3547 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
3057 | .read_u64 = mem_cgroup_read, | 3548 | .read_u64 = mem_cgroup_read, |
3549 | .register_event = mem_cgroup_register_event, | ||
3550 | .unregister_event = mem_cgroup_unregister_event, | ||
3058 | }, | 3551 | }, |
3059 | { | 3552 | { |
3060 | .name = "max_usage_in_bytes", | 3553 | .name = "max_usage_in_bytes", |
@@ -3098,6 +3591,11 @@ static struct cftype mem_cgroup_files[] = { | |||
3098 | .read_u64 = mem_cgroup_swappiness_read, | 3591 | .read_u64 = mem_cgroup_swappiness_read, |
3099 | .write_u64 = mem_cgroup_swappiness_write, | 3592 | .write_u64 = mem_cgroup_swappiness_write, |
3100 | }, | 3593 | }, |
3594 | { | ||
3595 | .name = "move_charge_at_immigrate", | ||
3596 | .read_u64 = mem_cgroup_move_charge_read, | ||
3597 | .write_u64 = mem_cgroup_move_charge_write, | ||
3598 | }, | ||
3101 | }; | 3599 | }; |
3102 | 3600 | ||
3103 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3601 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -3106,6 +3604,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
3106 | .name = "memsw.usage_in_bytes", | 3604 | .name = "memsw.usage_in_bytes", |
3107 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3605 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
3108 | .read_u64 = mem_cgroup_read, | 3606 | .read_u64 = mem_cgroup_read, |
3607 | .register_event = mem_cgroup_register_event, | ||
3608 | .unregister_event = mem_cgroup_unregister_event, | ||
3109 | }, | 3609 | }, |
3110 | { | 3610 | { |
3111 | .name = "memsw.max_usage_in_bytes", | 3611 | .name = "memsw.max_usage_in_bytes", |
@@ -3180,17 +3680,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
3180 | kfree(mem->info.nodeinfo[node]); | 3680 | kfree(mem->info.nodeinfo[node]); |
3181 | } | 3681 | } |
3182 | 3682 | ||
3183 | static int mem_cgroup_size(void) | ||
3184 | { | ||
3185 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
3186 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
3187 | } | ||
3188 | |||
3189 | static struct mem_cgroup *mem_cgroup_alloc(void) | 3683 | static struct mem_cgroup *mem_cgroup_alloc(void) |
3190 | { | 3684 | { |
3191 | struct mem_cgroup *mem; | 3685 | struct mem_cgroup *mem; |
3192 | int size = mem_cgroup_size(); | 3686 | int size = sizeof(struct mem_cgroup); |
3193 | 3687 | ||
3688 | /* Can be very big if MAX_NUMNODES is very big */ | ||
3194 | if (size < PAGE_SIZE) | 3689 | if (size < PAGE_SIZE) |
3195 | mem = kmalloc(size, GFP_KERNEL); | 3690 | mem = kmalloc(size, GFP_KERNEL); |
3196 | else | 3691 | else |
@@ -3198,6 +3693,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
3198 | 3693 | ||
3199 | if (mem) | 3694 | if (mem) |
3200 | memset(mem, 0, size); | 3695 | memset(mem, 0, size); |
3696 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | ||
3697 | if (!mem->stat) { | ||
3698 | if (size < PAGE_SIZE) | ||
3699 | kfree(mem); | ||
3700 | else | ||
3701 | vfree(mem); | ||
3702 | mem = NULL; | ||
3703 | } | ||
3201 | return mem; | 3704 | return mem; |
3202 | } | 3705 | } |
3203 | 3706 | ||
@@ -3222,7 +3725,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
3222 | for_each_node_state(node, N_POSSIBLE) | 3725 | for_each_node_state(node, N_POSSIBLE) |
3223 | free_mem_cgroup_per_zone_info(mem, node); | 3726 | free_mem_cgroup_per_zone_info(mem, node); |
3224 | 3727 | ||
3225 | if (mem_cgroup_size() < PAGE_SIZE) | 3728 | free_percpu(mem->stat); |
3729 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | ||
3226 | kfree(mem); | 3730 | kfree(mem); |
3227 | else | 3731 | else |
3228 | vfree(mem); | 3732 | vfree(mem); |
@@ -3233,9 +3737,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem) | |||
3233 | atomic_inc(&mem->refcnt); | 3737 | atomic_inc(&mem->refcnt); |
3234 | } | 3738 | } |
3235 | 3739 | ||
3236 | static void mem_cgroup_put(struct mem_cgroup *mem) | 3740 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) |
3237 | { | 3741 | { |
3238 | if (atomic_dec_and_test(&mem->refcnt)) { | 3742 | if (atomic_sub_and_test(count, &mem->refcnt)) { |
3239 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | 3743 | struct mem_cgroup *parent = parent_mem_cgroup(mem); |
3240 | __mem_cgroup_free(mem); | 3744 | __mem_cgroup_free(mem); |
3241 | if (parent) | 3745 | if (parent) |
@@ -3243,6 +3747,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem) | |||
3243 | } | 3747 | } |
3244 | } | 3748 | } |
3245 | 3749 | ||
3750 | static void mem_cgroup_put(struct mem_cgroup *mem) | ||
3751 | { | ||
3752 | __mem_cgroup_put(mem, 1); | ||
3753 | } | ||
3754 | |||
3246 | /* | 3755 | /* |
3247 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 3756 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
3248 | */ | 3757 | */ |
@@ -3319,7 +3828,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3319 | INIT_WORK(&stock->work, drain_local_stock); | 3828 | INIT_WORK(&stock->work, drain_local_stock); |
3320 | } | 3829 | } |
3321 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | 3830 | hotcpu_notifier(memcg_stock_cpu_callback, 0); |
3322 | |||
3323 | } else { | 3831 | } else { |
3324 | parent = mem_cgroup_from_cont(cont->parent); | 3832 | parent = mem_cgroup_from_cont(cont->parent); |
3325 | mem->use_hierarchy = parent->use_hierarchy; | 3833 | mem->use_hierarchy = parent->use_hierarchy; |
@@ -3345,6 +3853,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3345 | if (parent) | 3853 | if (parent) |
3346 | mem->swappiness = get_swappiness(parent); | 3854 | mem->swappiness = get_swappiness(parent); |
3347 | atomic_set(&mem->refcnt, 1); | 3855 | atomic_set(&mem->refcnt, 1); |
3856 | mem->move_charge_at_immigrate = 0; | ||
3857 | mutex_init(&mem->thresholds_lock); | ||
3348 | return &mem->css; | 3858 | return &mem->css; |
3349 | free_out: | 3859 | free_out: |
3350 | __mem_cgroup_free(mem); | 3860 | __mem_cgroup_free(mem); |
@@ -3381,16 +3891,444 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
3381 | return ret; | 3891 | return ret; |
3382 | } | 3892 | } |
3383 | 3893 | ||
3894 | #ifdef CONFIG_MMU | ||
3895 | /* Handlers for move charge at task migration. */ | ||
3896 | #define PRECHARGE_COUNT_AT_ONCE 256 | ||
3897 | static int mem_cgroup_do_precharge(unsigned long count) | ||
3898 | { | ||
3899 | int ret = 0; | ||
3900 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3901 | struct mem_cgroup *mem = mc.to; | ||
3902 | |||
3903 | if (mem_cgroup_is_root(mem)) { | ||
3904 | mc.precharge += count; | ||
3905 | /* we don't need css_get for root */ | ||
3906 | return ret; | ||
3907 | } | ||
3908 | /* try to charge at once */ | ||
3909 | if (count > 1) { | ||
3910 | struct res_counter *dummy; | ||
3911 | /* | ||
3912 | * "mem" cannot be under rmdir() because we've already checked | ||
3913 | * by cgroup_lock_live_cgroup() that it is not removed and we | ||
3914 | * are still under the same cgroup_mutex. So we can postpone | ||
3915 | * css_get(). | ||
3916 | */ | ||
3917 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) | ||
3918 | goto one_by_one; | ||
3919 | if (do_swap_account && res_counter_charge(&mem->memsw, | ||
3920 | PAGE_SIZE * count, &dummy)) { | ||
3921 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | ||
3922 | goto one_by_one; | ||
3923 | } | ||
3924 | mc.precharge += count; | ||
3925 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
3926 | WARN_ON_ONCE(count > INT_MAX); | ||
3927 | __css_get(&mem->css, (int)count); | ||
3928 | return ret; | ||
3929 | } | ||
3930 | one_by_one: | ||
3931 | /* fall back to one by one charge */ | ||
3932 | while (count--) { | ||
3933 | if (signal_pending(current)) { | ||
3934 | ret = -EINTR; | ||
3935 | break; | ||
3936 | } | ||
3937 | if (!batch_count--) { | ||
3938 | batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3939 | cond_resched(); | ||
3940 | } | ||
3941 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | ||
3942 | if (ret || !mem) | ||
3943 | /* mem_cgroup_clear_mc() will do uncharge later */ | ||
3944 | return -ENOMEM; | ||
3945 | mc.precharge++; | ||
3946 | } | ||
3947 | return ret; | ||
3948 | } | ||
3949 | #else /* !CONFIG_MMU */ | ||
3950 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
3951 | struct cgroup *cgroup, | ||
3952 | struct task_struct *p, | ||
3953 | bool threadgroup) | ||
3954 | { | ||
3955 | return 0; | ||
3956 | } | ||
3957 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
3958 | struct cgroup *cgroup, | ||
3959 | struct task_struct *p, | ||
3960 | bool threadgroup) | ||
3961 | { | ||
3962 | } | ||
3384 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 3963 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
3385 | struct cgroup *cont, | 3964 | struct cgroup *cont, |
3386 | struct cgroup *old_cont, | 3965 | struct cgroup *old_cont, |
3387 | struct task_struct *p, | 3966 | struct task_struct *p, |
3388 | bool threadgroup) | 3967 | bool threadgroup) |
3389 | { | 3968 | { |
3969 | } | ||
3970 | #endif | ||
3971 | |||
3972 | /** | ||
3973 | * is_target_pte_for_mc - check whether a pte is a valid target for move charge | ||
3974 | * @vma: the vma the pte to be checked belongs to | ||
3975 | * @addr: the address corresponding to the pte to be checked | ||
3976 | * @ptent: the pte to be checked | ||
3977 | * @target: pointer where the target page or swap entry is stored (can be NULL) | ||
3978 | * | ||
3979 | * Returns | ||
3980 | * 0 (MC_TARGET_NONE): if the pte is not a target for move charge. | ||
3981 | * 1 (MC_TARGET_PAGE): if the page corresponding to this pte is a target for | ||
3982 | * move charge. If @target is not NULL, the page is stored in target->page | ||
3983 | * with an extra refcount taken (callers must handle it). | ||
3984 | * 2 (MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | ||
3985 | * target for charge migration. If @target is not NULL, the entry is stored | ||
3986 | * in target->ent. | ||
3987 | * | ||
3988 | * Called with pte lock held. | ||
3989 | */ | ||
3990 | union mc_target { | ||
3991 | struct page *page; | ||
3992 | swp_entry_t ent; | ||
3993 | }; | ||
3994 | |||
3995 | enum mc_target_type { | ||
3996 | MC_TARGET_NONE, /* not used */ | ||
3997 | MC_TARGET_PAGE, | ||
3998 | MC_TARGET_SWAP, | ||
3999 | }; | ||
4000 | |||
4001 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | ||
4002 | unsigned long addr, pte_t ptent, union mc_target *target) | ||
4003 | { | ||
4004 | struct page *page = NULL; | ||
4005 | struct page_cgroup *pc; | ||
4006 | int ret = 0; | ||
4007 | swp_entry_t ent = { .val = 0 }; | ||
4008 | int usage_count = 0; | ||
4009 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
4010 | &mc.to->move_charge_at_immigrate); | ||
4011 | |||
4012 | if (!pte_present(ptent)) { | ||
4013 | /* TODO: handle swap of shmem/tmpfs */ | ||
4014 | if (pte_none(ptent) || pte_file(ptent)) | ||
4015 | return 0; | ||
4016 | else if (is_swap_pte(ptent)) { | ||
4017 | ent = pte_to_swp_entry(ptent); | ||
4018 | if (!move_anon || non_swap_entry(ent)) | ||
4019 | return 0; | ||
4020 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
4021 | } | ||
4022 | } else { | ||
4023 | page = vm_normal_page(vma, addr, ptent); | ||
4024 | if (!page || !page_mapped(page)) | ||
4025 | return 0; | ||
4026 | /* | ||
4027 | * TODO: We don't move charges of file (including shmem/tmpfs) | ||
4028 | * pages for now. | ||
4029 | */ | ||
4030 | if (!move_anon || !PageAnon(page)) | ||
4031 | return 0; | ||
4032 | if (!get_page_unless_zero(page)) | ||
4033 | return 0; | ||
4034 | usage_count = page_mapcount(page); | ||
4035 | } | ||
4036 | if (usage_count > 1) { | ||
4037 | /* | ||
4038 | * TODO: We don't move charges of shared (used by multiple | ||
4039 | * processes) pages for now. | ||
4040 | */ | ||
4041 | if (page) | ||
4042 | put_page(page); | ||
4043 | return 0; | ||
4044 | } | ||
4045 | if (page) { | ||
4046 | pc = lookup_page_cgroup(page); | ||
4047 | /* | ||
4048 | * Do only a loose check without taking the page_cgroup lock; | ||
4049 | * mem_cgroup_move_account() re-checks whether the pc is valid | ||
4050 | * under the lock. | ||
4051 | */ | ||
4052 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
4053 | ret = MC_TARGET_PAGE; | ||
4054 | if (target) | ||
4055 | target->page = page; | ||
4056 | } | ||
4057 | if (!ret || !target) | ||
4058 | put_page(page); | ||
4059 | } | ||
4060 | /* fall through to the swap-entry check */ | ||
4061 | if (ent.val && do_swap_account && !ret && | ||
4062 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | ||
4063 | ret = MC_TARGET_SWAP; | ||
4064 | if (target) | ||
4065 | target->ent = ent; | ||
4066 | } | ||
4067 | return ret; | ||
4068 | } | ||
4069 | |||
4070 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | ||
4071 | unsigned long addr, unsigned long end, | ||
4072 | struct mm_walk *walk) | ||
4073 | { | ||
4074 | struct vm_area_struct *vma = walk->private; | ||
4075 | pte_t *pte; | ||
4076 | spinlock_t *ptl; | ||
4077 | |||
4078 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4079 | for (; addr != end; pte++, addr += PAGE_SIZE) | ||
4080 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | ||
4081 | mc.precharge++; /* increment precharge temporarily */ | ||
4082 | pte_unmap_unlock(pte - 1, ptl); | ||
4083 | cond_resched(); | ||
4084 | |||
4085 | return 0; | ||
4086 | } | ||
4087 | |||
4088 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | ||
4089 | { | ||
4090 | unsigned long precharge; | ||
4091 | struct vm_area_struct *vma; | ||
4092 | |||
4093 | down_read(&mm->mmap_sem); | ||
4094 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4095 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
4096 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
4097 | .mm = mm, | ||
4098 | .private = vma, | ||
4099 | }; | ||
4100 | if (is_vm_hugetlb_page(vma)) | ||
4101 | continue; | ||
4102 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4103 | if (vma->vm_flags & VM_SHARED) | ||
4104 | continue; | ||
4105 | walk_page_range(vma->vm_start, vma->vm_end, | ||
4106 | &mem_cgroup_count_precharge_walk); | ||
4107 | } | ||
4108 | up_read(&mm->mmap_sem); | ||
4109 | |||
4110 | precharge = mc.precharge; | ||
4111 | mc.precharge = 0; | ||
4112 | |||
4113 | return precharge; | ||
4114 | } | ||
4115 | |||
4116 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | ||
4117 | { | ||
4118 | return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | ||
4119 | } | ||
4120 | |||
4121 | static void mem_cgroup_clear_mc(void) | ||
4122 | { | ||
4123 | /* we must uncharge all the leftover precharges from mc.to */ | ||
4124 | if (mc.precharge) { | ||
4125 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | ||
4126 | mc.precharge = 0; | ||
4127 | } | ||
3390 | /* | 4128 | /* |
3391 | * FIXME: It's better to move charges of this process from old | 4129 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
3392 | * memcg to new memcg. But it's just on TODO-List now. | 4130 | * we must uncharge here. |
3393 | */ | 4131 | */ |
4132 | if (mc.moved_charge) { | ||
4133 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | ||
4134 | mc.moved_charge = 0; | ||
4135 | } | ||
4136 | /* we must fixup refcnts and charges */ | ||
4137 | if (mc.moved_swap) { | ||
4138 | WARN_ON_ONCE(mc.moved_swap > INT_MAX); | ||
4139 | /* uncharge swap account from the old cgroup */ | ||
4140 | if (!mem_cgroup_is_root(mc.from)) | ||
4141 | res_counter_uncharge(&mc.from->memsw, | ||
4142 | PAGE_SIZE * mc.moved_swap); | ||
4143 | __mem_cgroup_put(mc.from, mc.moved_swap); | ||
4144 | |||
4145 | if (!mem_cgroup_is_root(mc.to)) { | ||
4146 | /* | ||
4147 | * we charged both to->res and to->memsw, so we should | ||
4148 | * uncharge to->res. | ||
4149 | */ | ||
4150 | res_counter_uncharge(&mc.to->res, | ||
4151 | PAGE_SIZE * mc.moved_swap); | ||
4152 | VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); | ||
4153 | __css_put(&mc.to->css, mc.moved_swap); | ||
4154 | } | ||
4155 | /* we've already done mem_cgroup_get(mc.to) */ | ||
4156 | |||
4157 | mc.moved_swap = 0; | ||
4158 | } | ||
4159 | mc.from = NULL; | ||
4160 | mc.to = NULL; | ||
4161 | mc.moving_task = NULL; | ||
4162 | wake_up_all(&mc.waitq); | ||
4163 | } | ||
4164 | |||
4165 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
4166 | struct cgroup *cgroup, | ||
4167 | struct task_struct *p, | ||
4168 | bool threadgroup) | ||
4169 | { | ||
4170 | int ret = 0; | ||
4171 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | ||
4172 | |||
4173 | if (mem->move_charge_at_immigrate) { | ||
4174 | struct mm_struct *mm; | ||
4175 | struct mem_cgroup *from = mem_cgroup_from_task(p); | ||
4176 | |||
4177 | VM_BUG_ON(from == mem); | ||
4178 | |||
4179 | mm = get_task_mm(p); | ||
4180 | if (!mm) | ||
4181 | return 0; | ||
4182 | /* We move charges only when we move the owner of the mm */ | ||
4183 | if (mm->owner == p) { | ||
4184 | VM_BUG_ON(mc.from); | ||
4185 | VM_BUG_ON(mc.to); | ||
4186 | VM_BUG_ON(mc.precharge); | ||
4187 | VM_BUG_ON(mc.moved_charge); | ||
4188 | VM_BUG_ON(mc.moved_swap); | ||
4189 | VM_BUG_ON(mc.moving_task); | ||
4190 | mc.from = from; | ||
4191 | mc.to = mem; | ||
4192 | mc.precharge = 0; | ||
4193 | mc.moved_charge = 0; | ||
4194 | mc.moved_swap = 0; | ||
4195 | mc.moving_task = current; | ||
4196 | |||
4197 | ret = mem_cgroup_precharge_mc(mm); | ||
4198 | if (ret) | ||
4199 | mem_cgroup_clear_mc(); | ||
4200 | } | ||
4201 | mmput(mm); | ||
4202 | } | ||
4203 | return ret; | ||
4204 | } | ||
4205 | |||
4206 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
4207 | struct cgroup *cgroup, | ||
4208 | struct task_struct *p, | ||
4209 | bool threadgroup) | ||
4210 | { | ||
4211 | mem_cgroup_clear_mc(); | ||
4212 | } | ||
4213 | |||
4214 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | ||
4215 | unsigned long addr, unsigned long end, | ||
4216 | struct mm_walk *walk) | ||
4217 | { | ||
4218 | int ret = 0; | ||
4219 | struct vm_area_struct *vma = walk->private; | ||
4220 | pte_t *pte; | ||
4221 | spinlock_t *ptl; | ||
4222 | |||
4223 | retry: | ||
4224 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4225 | for (; addr != end; addr += PAGE_SIZE) { | ||
4226 | pte_t ptent = *(pte++); | ||
4227 | union mc_target target; | ||
4228 | int type; | ||
4229 | struct page *page; | ||
4230 | struct page_cgroup *pc; | ||
4231 | swp_entry_t ent; | ||
4232 | |||
4233 | if (!mc.precharge) | ||
4234 | break; | ||
4235 | |||
4236 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | ||
4237 | switch (type) { | ||
4238 | case MC_TARGET_PAGE: | ||
4239 | page = target.page; | ||
4240 | if (isolate_lru_page(page)) | ||
4241 | goto put; | ||
4242 | pc = lookup_page_cgroup(page); | ||
4243 | if (!mem_cgroup_move_account(pc, | ||
4244 | mc.from, mc.to, false)) { | ||
4245 | mc.precharge--; | ||
4246 | /* we uncharge from mc.from later. */ | ||
4247 | mc.moved_charge++; | ||
4248 | } | ||
4249 | putback_lru_page(page); | ||
4250 | put: /* is_target_pte_for_mc() gets the page */ | ||
4251 | put_page(page); | ||
4252 | break; | ||
4253 | case MC_TARGET_SWAP: | ||
4254 | ent = target.ent; | ||
4255 | if (!mem_cgroup_move_swap_account(ent, | ||
4256 | mc.from, mc.to, false)) { | ||
4257 | mc.precharge--; | ||
4258 | /* we fixup refcnts and charges later. */ | ||
4259 | mc.moved_swap++; | ||
4260 | } | ||
4261 | break; | ||
4262 | default: | ||
4263 | break; | ||
4264 | } | ||
4265 | } | ||
4266 | pte_unmap_unlock(pte - 1, ptl); | ||
4267 | cond_resched(); | ||
4268 | |||
4269 | if (addr != end) { | ||
4270 | /* | ||
4271 | * We have consumed all precharges we got in can_attach(). | ||
4272 | * We try to charge one page at a time, but we stop making | ||
4273 | * additional charges to mc.to once a charge has failed in the | ||
4274 | * attach() phase. | ||
4275 | */ | ||
4276 | ret = mem_cgroup_do_precharge(1); | ||
4277 | if (!ret) | ||
4278 | goto retry; | ||
4279 | } | ||
4280 | |||
4281 | return ret; | ||
4282 | } | ||
4283 | |||
4284 | static void mem_cgroup_move_charge(struct mm_struct *mm) | ||
4285 | { | ||
4286 | struct vm_area_struct *vma; | ||
4287 | |||
4288 | lru_add_drain_all(); | ||
4289 | down_read(&mm->mmap_sem); | ||
4290 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4291 | int ret; | ||
4292 | struct mm_walk mem_cgroup_move_charge_walk = { | ||
4293 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
4294 | .mm = mm, | ||
4295 | .private = vma, | ||
4296 | }; | ||
4297 | if (is_vm_hugetlb_page(vma)) | ||
4298 | continue; | ||
4299 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4300 | if (vma->vm_flags & VM_SHARED) | ||
4301 | continue; | ||
4302 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
4303 | &mem_cgroup_move_charge_walk); | ||
4304 | if (ret) | ||
4305 | /* | ||
4306 | * This means we have consumed all precharges and failed to | ||
4307 | * do an additional charge. Just abandon here. | ||
4308 | */ | ||
4309 | break; | ||
4310 | } | ||
4311 | up_read(&mm->mmap_sem); | ||
4312 | } | ||
4313 | |||
4314 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | ||
4315 | struct cgroup *cont, | ||
4316 | struct cgroup *old_cont, | ||
4317 | struct task_struct *p, | ||
4318 | bool threadgroup) | ||
4319 | { | ||
4320 | struct mm_struct *mm; | ||
4321 | |||
4322 | if (!mc.to) | ||
4323 | /* no need to move charge */ | ||
4324 | return; | ||
4325 | |||
4326 | mm = get_task_mm(p); | ||
4327 | if (mm) { | ||
4328 | mem_cgroup_move_charge(mm); | ||
4329 | mmput(mm); | ||
4330 | } | ||
4331 | mem_cgroup_clear_mc(); | ||
3394 | } | 4332 | } |
3395 | 4333 | ||
3396 | struct cgroup_subsys mem_cgroup_subsys = { | 4334 | struct cgroup_subsys mem_cgroup_subsys = { |
@@ -3400,6 +4338,8 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
3400 | .pre_destroy = mem_cgroup_pre_destroy, | 4338 | .pre_destroy = mem_cgroup_pre_destroy, |
3401 | .destroy = mem_cgroup_destroy, | 4339 | .destroy = mem_cgroup_destroy, |
3402 | .populate = mem_cgroup_populate, | 4340 | .populate = mem_cgroup_populate, |
4341 | .can_attach = mem_cgroup_can_attach, | ||
4342 | .cancel_attach = mem_cgroup_cancel_attach, | ||
3403 | .attach = mem_cgroup_move_task, | 4343 | .attach = mem_cgroup_move_task, |
3404 | .early_init = 0, | 4344 | .early_init = 0, |
3405 | .use_id = 1, | 4345 | .use_id = 1, |