Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 1835 |
1 files changed, 1495 insertions, 340 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f99f5991d6bb..8a79a6f0f029 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6,6 +6,10 @@ | |||
6 | * Copyright 2007 OpenVZ SWsoft Inc | 6 | * Copyright 2007 OpenVZ SWsoft Inc |
7 | * Author: Pavel Emelianov <xemul@openvz.org> | 7 | * Author: Pavel Emelianov <xemul@openvz.org> |
8 | * | 8 | * |
9 | * Memory thresholds | ||
10 | * Copyright (C) 2009 Nokia Corporation | ||
11 | * Author: Kirill A. Shutemov | ||
12 | * | ||
9 | * This program is free software; you can redistribute it and/or modify | 13 | * This program is free software; you can redistribute it and/or modify |
10 | * it under the terms of the GNU General Public License as published by | 14 | * it under the terms of the GNU General Public License as published by |
11 | * the Free Software Foundation; either version 2 of the License, or | 15 | * the Free Software Foundation; either version 2 of the License, or |
@@ -21,6 +25,7 @@ | |||
21 | #include <linux/memcontrol.h> | 25 | #include <linux/memcontrol.h> |
22 | #include <linux/cgroup.h> | 26 | #include <linux/cgroup.h> |
23 | #include <linux/mm.h> | 27 | #include <linux/mm.h> |
28 | #include <linux/hugetlb.h> | ||
24 | #include <linux/pagemap.h> | 29 | #include <linux/pagemap.h> |
25 | #include <linux/smp.h> | 30 | #include <linux/smp.h> |
26 | #include <linux/page-flags.h> | 31 | #include <linux/page-flags.h> |
@@ -32,12 +37,16 @@ | |||
32 | #include <linux/rbtree.h> | 37 | #include <linux/rbtree.h> |
33 | #include <linux/slab.h> | 38 | #include <linux/slab.h> |
34 | #include <linux/swap.h> | 39 | #include <linux/swap.h> |
40 | #include <linux/swapops.h> | ||
35 | #include <linux/spinlock.h> | 41 | #include <linux/spinlock.h> |
42 | #include <linux/eventfd.h> | ||
43 | #include <linux/sort.h> | ||
36 | #include <linux/fs.h> | 44 | #include <linux/fs.h> |
37 | #include <linux/seq_file.h> | 45 | #include <linux/seq_file.h> |
38 | #include <linux/vmalloc.h> | 46 | #include <linux/vmalloc.h> |
39 | #include <linux/mm_inline.h> | 47 | #include <linux/mm_inline.h> |
40 | #include <linux/page_cgroup.h> | 48 | #include <linux/page_cgroup.h> |
49 | #include <linux/cpu.h> | ||
41 | #include "internal.h" | 50 | #include "internal.h" |
42 | 51 | ||
43 | #include <asm/uaccess.h> | 52 | #include <asm/uaccess.h> |
@@ -54,8 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
54 | #define do_swap_account (0) | 63 | #define do_swap_account (0) |
55 | #endif | 64 | #endif |
56 | 65 | ||
57 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ | 66 | /* |
58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | 67 | * Per memcg event counter is incremented at every pagein/pageout. This counter |
68 | * is used to trigger some periodic events. This is straightforward and better | ||
69 | * than using jiffies etc. to handle periodic memcg events. | ||
70 | * | ||
71 | * These values will be used as !((event) & ((1 <<(thresh)) - 1)) | ||
72 | */ | ||
73 | #define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ | ||
74 | #define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ | ||
59 | 75 | ||
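The mask trick documented above is worth unpacking: a counter value passes !((event) & ((1 << thresh) - 1)) only when its low thresh bits are all zero, i.e. once every 2^thresh increments, and because 1024 is a multiple of 128 the coarser soft-limit test only needs to run when the finer threshold test has already fired, which is exactly how memcg_check_events() nests the two checks later in this patch. A minimal stand-alone sketch of the same arithmetic, using a plain integer where the kernel uses the per-cpu event counter:

#include <stdio.h>

#define THRESHOLDS_EVENTS_THRESH (7)	/* once in 128 */
#define SOFTLIMIT_EVENTS_THRESH (10)	/* once in 1024 */

/* Same test as !((event) & ((1 << (thresh)) - 1)) above. */
static int event_check(unsigned long event, int thresh)
{
	return !(event & ((1UL << thresh) - 1));
}

int main(void)
{
	unsigned long event;
	int thresh_hits = 0, soft_hits = 0;

	/* Simulate 1024 pagein/pageout events on one cpu. */
	for (event = 1; event <= 1024; event++) {
		if (event_check(event, THRESHOLDS_EVENTS_THRESH)) {
			thresh_hits++;	/* fires on 128, 256, ..., 1024 */
			if (event_check(event, SOFTLIMIT_EVENTS_THRESH))
				soft_hits++;	/* fires on 1024 only */
		}
	}
	printf("threshold checks: %d, soft-limit checks: %d\n",
	       thresh_hits, soft_hits);	/* prints 8 and 1 */
	return 0;
}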
60 | /* | 76 | /* |
61 | * Statistics for memory cgroup. | 77 | * Statistics for memory cgroup. |
@@ -66,65 +82,19 @@ enum mem_cgroup_stat_index { | |||
66 | */ | 82 | */ |
67 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ | 83 | MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ |
68 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ | 84 | MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ |
69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 85 | MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ |
70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 86 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 87 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 88 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
89 | MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ | ||
74 | 90 | ||
75 | MEM_CGROUP_STAT_NSTATS, | 91 | MEM_CGROUP_STAT_NSTATS, |
76 | }; | 92 | }; |
77 | 93 | ||
78 | struct mem_cgroup_stat_cpu { | 94 | struct mem_cgroup_stat_cpu { |
79 | s64 count[MEM_CGROUP_STAT_NSTATS]; | 95 | s64 count[MEM_CGROUP_STAT_NSTATS]; |
80 | } ____cacheline_aligned_in_smp; | ||
81 | |||
82 | struct mem_cgroup_stat { | ||
83 | struct mem_cgroup_stat_cpu cpustat[0]; | ||
84 | }; | 96 | }; |
85 | 97 | ||
86 | static inline void | ||
87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
88 | enum mem_cgroup_stat_index idx) | ||
89 | { | ||
90 | stat->count[idx] = 0; | ||
91 | } | ||
92 | |||
93 | static inline s64 | ||
94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
95 | enum mem_cgroup_stat_index idx) | ||
96 | { | ||
97 | return stat->count[idx]; | ||
98 | } | ||
99 | |||
100 | /* | ||
101 | * For accounting under irq disable, no need for increment preempt count. | ||
102 | */ | ||
103 | static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, | ||
104 | enum mem_cgroup_stat_index idx, int val) | ||
105 | { | ||
106 | stat->count[idx] += val; | ||
107 | } | ||
108 | |||
109 | static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, | ||
110 | enum mem_cgroup_stat_index idx) | ||
111 | { | ||
112 | int cpu; | ||
113 | s64 ret = 0; | ||
114 | for_each_possible_cpu(cpu) | ||
115 | ret += stat->cpustat[cpu].count[idx]; | ||
116 | return ret; | ||
117 | } | ||
118 | |||
119 | static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) | ||
120 | { | ||
121 | s64 ret; | ||
122 | |||
123 | ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); | ||
124 | ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); | ||
125 | return ret; | ||
126 | } | ||
127 | |||
128 | /* | 98 | /* |
129 | * per-zone information in memory controller. | 99 | * per-zone information in memory controller. |
130 | */ | 100 | */ |
@@ -174,6 +144,22 @@ struct mem_cgroup_tree { | |||
174 | 144 | ||
175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | 145 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; |
176 | 146 | ||
147 | struct mem_cgroup_threshold { | ||
148 | struct eventfd_ctx *eventfd; | ||
149 | u64 threshold; | ||
150 | }; | ||
151 | |||
152 | struct mem_cgroup_threshold_ary { | ||
153 | /* An array index points to threshold just below usage. */ | ||
154 | atomic_t current_threshold; | ||
155 | /* Size of entries[] */ | ||
156 | unsigned int size; | ||
157 | /* Array of thresholds */ | ||
158 | struct mem_cgroup_threshold entries[0]; | ||
159 | }; | ||
160 | |||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | ||
162 | |||
177 | /* | 163 | /* |
178 | * The memory controller data structure. The memory controller controls both | 164 | * The memory controller data structure. The memory controller controls both |
179 | * page cache and RSS per cgroup. We would eventually like to provide | 165 | * page cache and RSS per cgroup. We would eventually like to provide |
@@ -209,7 +195,7 @@ struct mem_cgroup { | |||
209 | int prev_priority; /* for recording reclaim priority */ | 195 | int prev_priority; /* for recording reclaim priority */ |
210 | 196 | ||
211 | /* | 197 | /* |
212 | * While reclaiming in a hiearchy, we cache the last child we | 198 | * While reclaiming in a hierarchy, we cache the last child we |
213 | * reclaimed from. | 199 | * reclaimed from. |
214 | */ | 200 | */ |
215 | int last_scanned_child; | 201 | int last_scanned_child; |
@@ -217,7 +203,7 @@ struct mem_cgroup { | |||
217 | * Should the accounting and control be hierarchical, per subtree? | 203 | * Should the accounting and control be hierarchical, per subtree? |
218 | */ | 204 | */ |
219 | bool use_hierarchy; | 205 | bool use_hierarchy; |
220 | unsigned long last_oom_jiffies; | 206 | atomic_t oom_lock; |
221 | atomic_t refcnt; | 207 | atomic_t refcnt; |
222 | 208 | ||
223 | unsigned int swappiness; | 209 | unsigned int swappiness; |
@@ -225,10 +211,48 @@ struct mem_cgroup { | |||
225 | /* set when res.limit == memsw.limit */ | 211 | /* set when res.limit == memsw.limit */ |
226 | bool memsw_is_minimum; | 212 | bool memsw_is_minimum; |
227 | 213 | ||
214 | /* protect arrays of thresholds */ | ||
215 | struct mutex thresholds_lock; | ||
216 | |||
217 | /* thresholds for memory usage. RCU-protected */ | ||
218 | struct mem_cgroup_threshold_ary *thresholds; | ||
219 | |||
220 | /* thresholds for mem+swap usage. RCU-protected */ | ||
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | ||
222 | |||
223 | /* | ||
224 | * Should we move charges of a task when a task is moved into this | ||
225 | * mem_cgroup ? And what type of charges should we move ? | ||
226 | */ | ||
227 | unsigned long move_charge_at_immigrate; | ||
228 | |||
228 | /* | 229 | /* |
229 | * statistics. This must be placed at the end of memcg. | 230 | * percpu counter. |
230 | */ | 231 | */ |
231 | struct mem_cgroup_stat stat; | 232 | struct mem_cgroup_stat_cpu *stat; |
233 | }; | ||
234 | |||
235 | /* Stuffs for move charges at task migration. */ | ||
236 | /* | ||
237 | * Types of charges to be moved. "move_charge_at_immigrate" is treated as a | ||
238 | * left-shifted bitmap of these types. | ||
239 | */ | ||
240 | enum move_type { | ||
241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | ||
242 | NR_MOVE_TYPE, | ||
243 | }; | ||
244 | |||
245 | /* "mc" and its members are protected by cgroup_mutex */ | ||
246 | static struct move_charge_struct { | ||
247 | struct mem_cgroup *from; | ||
248 | struct mem_cgroup *to; | ||
249 | unsigned long precharge; | ||
250 | unsigned long moved_charge; | ||
251 | unsigned long moved_swap; | ||
252 | struct task_struct *moving_task; /* a task moving charges */ | ||
253 | wait_queue_head_t waitq; /* a waitq for other context */ | ||
254 | } mc = { | ||
255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | ||
232 | }; | 256 | }; |
233 | 257 | ||
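Because "move_charge_at_immigrate" is defined as a left-shifted bitmap of the move_type values, enabling the only type listed so far means setting bit MOVE_CHARGE_TYPE_ANON. A hedged sketch of how such a bit would be tested; the helper name is made up for illustration and is not the accessor this patch series eventually adds:

/* Illustrative only: MOVE_CHARGE_TYPE_ANON is the enum value added above. */
static int move_anon_enabled(unsigned long move_charge_at_immigrate)
{
	/* bit 0 set means "move private anon pages and their swap" */
	return !!(move_charge_at_immigrate & (1UL << MOVE_CHARGE_TYPE_ANON));
}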
234 | /* | 258 | /* |
@@ -275,6 +299,7 @@ enum charge_type { | |||
275 | static void mem_cgroup_get(struct mem_cgroup *mem); | 299 | static void mem_cgroup_get(struct mem_cgroup *mem); |
276 | static void mem_cgroup_put(struct mem_cgroup *mem); | 300 | static void mem_cgroup_put(struct mem_cgroup *mem); |
277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 301 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
302 | static void drain_all_stock_async(void); | ||
278 | 303 | ||
279 | static struct mem_cgroup_per_zone * | 304 | static struct mem_cgroup_per_zone * |
280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | 305 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) |
@@ -282,6 +307,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | |||
282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | 307 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; |
283 | } | 308 | } |
284 | 309 | ||
310 | struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) | ||
311 | { | ||
312 | return &mem->css; | ||
313 | } | ||
314 | |||
285 | static struct mem_cgroup_per_zone * | 315 | static struct mem_cgroup_per_zone * |
286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | 316 | page_cgroup_zoneinfo(struct page_cgroup *pc) |
287 | { | 317 | { |
@@ -365,23 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | |||
365 | spin_unlock(&mctz->lock); | 395 | spin_unlock(&mctz->lock); |
366 | } | 396 | } |
367 | 397 | ||
368 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
369 | { | ||
370 | bool ret = false; | ||
371 | int cpu; | ||
372 | s64 val; | ||
373 | struct mem_cgroup_stat_cpu *cpustat; | ||
374 | |||
375 | cpu = get_cpu(); | ||
376 | cpustat = &mem->stat.cpustat[cpu]; | ||
377 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
378 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
379 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
380 | ret = true; | ||
381 | } | ||
382 | put_cpu(); | ||
383 | return ret; | ||
384 | } | ||
385 | 398 | ||
386 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | 399 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) |
387 | { | 400 | { |
@@ -475,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
475 | return mz; | 488 | return mz; |
476 | } | 489 | } |
477 | 490 | ||
491 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | ||
492 | enum mem_cgroup_stat_index idx) | ||
493 | { | ||
494 | int cpu; | ||
495 | s64 val = 0; | ||
496 | |||
497 | for_each_possible_cpu(cpu) | ||
498 | val += per_cpu(mem->stat->count[idx], cpu); | ||
499 | return val; | ||
500 | } | ||
501 | |||
502 | static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) | ||
503 | { | ||
504 | s64 ret; | ||
505 | |||
506 | ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); | ||
507 | ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); | ||
508 | return ret; | ||
509 | } | ||
510 | |||
478 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | 511 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, |
479 | bool charge) | 512 | bool charge) |
480 | { | 513 | { |
481 | int val = (charge) ? 1 : -1; | 514 | int val = (charge) ? 1 : -1; |
482 | struct mem_cgroup_stat *stat = &mem->stat; | 515 | this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); |
483 | struct mem_cgroup_stat_cpu *cpustat; | ||
484 | int cpu = get_cpu(); | ||
485 | |||
486 | cpustat = &stat->cpustat[cpu]; | ||
487 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
488 | put_cpu(); | ||
489 | } | 516 | } |
490 | 517 | ||
491 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 518 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
@@ -493,24 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
493 | bool charge) | 520 | bool charge) |
494 | { | 521 | { |
495 | int val = (charge) ? 1 : -1; | 522 | int val = (charge) ? 1 : -1; |
496 | struct mem_cgroup_stat *stat = &mem->stat; | ||
497 | struct mem_cgroup_stat_cpu *cpustat; | ||
498 | int cpu = get_cpu(); | ||
499 | 523 | ||
500 | cpustat = &stat->cpustat[cpu]; | 524 | preempt_disable(); |
525 | |||
501 | if (PageCgroupCache(pc)) | 526 | if (PageCgroupCache(pc)) |
502 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); | 527 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); |
503 | else | 528 | else |
504 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); | 529 | __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); |
505 | 530 | ||
506 | if (charge) | 531 | if (charge) |
507 | __mem_cgroup_stat_add_safe(cpustat, | 532 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); |
508 | MEM_CGROUP_STAT_PGPGIN_COUNT, 1); | ||
509 | else | 533 | else |
510 | __mem_cgroup_stat_add_safe(cpustat, | 534 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); |
511 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 535 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); |
512 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | 536 | |
513 | put_cpu(); | 537 | preempt_enable(); |
514 | } | 538 | } |
515 | 539 | ||
516 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 540 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
@@ -528,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | |||
528 | return total; | 552 | return total; |
529 | } | 553 | } |
530 | 554 | ||
555 | static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) | ||
556 | { | ||
557 | s64 val; | ||
558 | |||
559 | val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); | ||
560 | |||
561 | return !(val & ((1 << event_mask_shift) - 1)); | ||
562 | } | ||
563 | |||
564 | /* | ||
565 | * Check events in order. | ||
566 | * | ||
567 | */ | ||
568 | static void memcg_check_events(struct mem_cgroup *mem, struct page *page) | ||
569 | { | ||
570 | /* threshold event is triggered in finer grain than soft limit */ | ||
571 | if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { | ||
572 | mem_cgroup_threshold(mem); | ||
573 | if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) | ||
574 | mem_cgroup_update_tree(mem, page); | ||
575 | } | ||
576 | } | ||
577 | |||
531 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) | 578 | static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) |
532 | { | 579 | { |
533 | return container_of(cgroup_subsys_state(cont, | 580 | return container_of(cgroup_subsys_state(cont, |
@@ -758,7 +805,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
758 | task_unlock(task); | 805 | task_unlock(task); |
759 | if (!curr) | 806 | if (!curr) |
760 | return 0; | 807 | return 0; |
761 | if (curr->use_hierarchy) | 808 | /* |
809 | * We should check use_hierarchy of "mem" not "curr". Because checking | ||
810 | * use_hierarchy of "curr" here makes this function true if hierarchy is | ||
811 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* | ||
812 | * hierarchy (even if use_hierarchy is disabled in "mem"). | ||
813 | */ | ||
814 | if (mem->use_hierarchy) | ||
762 | ret = css_is_ancestor(&curr->css, &mem->css); | 815 | ret = css_is_ancestor(&curr->css, &mem->css); |
763 | else | 816 | else |
764 | ret = (curr == mem); | 817 | ret = (curr == mem); |
@@ -988,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) | |||
988 | } | 1041 | } |
989 | 1042 | ||
990 | /** | 1043 | /** |
991 | * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. | 1044 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
992 | * @memcg: The memory cgroup that went over limit | 1045 | * @memcg: The memory cgroup that went over limit |
993 | * @p: Task that is going to be killed | 1046 | * @p: Task that is going to be killed |
994 | * | 1047 | * |
@@ -1007,7 +1060,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) | |||
1007 | static char memcg_name[PATH_MAX]; | 1060 | static char memcg_name[PATH_MAX]; |
1008 | int ret; | 1061 | int ret; |
1009 | 1062 | ||
1010 | if (!memcg) | 1063 | if (!memcg || !p) |
1011 | return; | 1064 | return; |
1012 | 1065 | ||
1013 | 1066 | ||
@@ -1137,6 +1190,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1137 | victim = mem_cgroup_select_victim(root_mem); | 1190 | victim = mem_cgroup_select_victim(root_mem); |
1138 | if (victim == root_mem) { | 1191 | if (victim == root_mem) { |
1139 | loop++; | 1192 | loop++; |
1193 | if (loop >= 1) | ||
1194 | drain_all_stock_async(); | ||
1140 | if (loop >= 2) { | 1195 | if (loop >= 2) { |
1141 | /* | 1196 | /* |
1142 | * If we have not been able to reclaim | 1197 | * If we have not been able to reclaim |
@@ -1160,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1160 | } | 1215 | } |
1161 | } | 1216 | } |
1162 | } | 1217 | } |
1163 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1218 | if (!mem_cgroup_local_usage(victim)) { |
1164 | /* this cgroup's local usage == 0 */ | 1219 | /* this cgroup's local usage == 0 */ |
1165 | css_put(&victim->css); | 1220 | css_put(&victim->css); |
1166 | continue; | 1221 | continue; |
@@ -1191,90 +1246,284 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1191 | return total; | 1246 | return total; |
1192 | } | 1247 | } |
1193 | 1248 | ||
1194 | bool mem_cgroup_oom_called(struct task_struct *task) | 1249 | static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) |
1195 | { | 1250 | { |
1196 | bool ret = false; | 1251 | int *val = (int *)data; |
1197 | struct mem_cgroup *mem; | 1252 | int x; |
1198 | struct mm_struct *mm; | 1253 | /* |
1254 | * Logically, we can stop scanning immediately when we find | ||
1255 | * a memcg is already locked. But considering unlock ops and | ||
1256 | * creation/removal of memcg, scan-all is the simpler operation. | ||
1257 | */ | ||
1258 | x = atomic_inc_return(&mem->oom_lock); | ||
1259 | *val = max(x, *val); | ||
1260 | return 0; | ||
1261 | } | ||
1262 | /* | ||
1263 | * Check whether the OOM killer is already running under our hierarchy. | ||
1264 | * If someone is running, return false. | ||
1265 | */ | ||
1266 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | ||
1267 | { | ||
1268 | int lock_count = 0; | ||
1199 | 1269 | ||
1200 | rcu_read_lock(); | 1270 | mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); |
1201 | mm = task->mm; | 1271 | |
1202 | if (!mm) | 1272 | if (lock_count == 1) |
1203 | mm = &init_mm; | 1273 | return true; |
1204 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1274 | return false; |
1205 | if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | ||
1206 | ret = true; | ||
1207 | rcu_read_unlock(); | ||
1208 | return ret; | ||
1209 | } | 1275 | } |
1210 | 1276 | ||
1211 | static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | 1277 | static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) |
1212 | { | 1278 | { |
1213 | mem->last_oom_jiffies = jiffies; | 1279 | /* |
1280 | * When a new child is created while the hierarchy is under oom, | ||
1281 | * mem_cgroup_oom_lock() may not be called. We have to use | ||
1282 | * atomic_add_unless() here. | ||
1283 | */ | ||
1284 | atomic_add_unless(&mem->oom_lock, -1, 0); | ||
1214 | return 0; | 1285 | return 0; |
1215 | } | 1286 | } |
1216 | 1287 | ||
1217 | static void record_last_oom(struct mem_cgroup *mem) | 1288 | static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
1289 | { | ||
1290 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); | ||
1291 | } | ||
1292 | |||
1293 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
1294 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | ||
1295 | |||
1296 | /* | ||
1297 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | ||
1298 | */ | ||
1299 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | ||
1218 | { | 1300 | { |
1219 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | 1301 | DEFINE_WAIT(wait); |
1302 | bool locked; | ||
1303 | |||
1304 | /* At first, try to OOM lock hierarchy under mem.*/ | ||
1305 | mutex_lock(&memcg_oom_mutex); | ||
1306 | locked = mem_cgroup_oom_lock(mem); | ||
1307 | /* | ||
1308 | * Even if signal_pending(), we can't quit charge() loop without | ||
1309 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | ||
1310 | * under OOM is always welcomed, use TASK_KILLABLE here. | ||
1311 | */ | ||
1312 | if (!locked) | ||
1313 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | ||
1314 | mutex_unlock(&memcg_oom_mutex); | ||
1315 | |||
1316 | if (locked) | ||
1317 | mem_cgroup_out_of_memory(mem, mask); | ||
1318 | else { | ||
1319 | schedule(); | ||
1320 | finish_wait(&memcg_oom_waitq, &wait); | ||
1321 | } | ||
1322 | mutex_lock(&memcg_oom_mutex); | ||
1323 | mem_cgroup_oom_unlock(mem); | ||
1324 | /* | ||
1325 | * Here, we use global waitq .....more fine grained waitq ? | ||
1326 | * Assume following hierarchy. | ||
1327 | * A/ | ||
1328 | * 01 | ||
1329 | * 02 | ||
1330 | * assume OOM happens both in A and 01 at the same time. They are | ||
1331 | * mutually exclusive by lock. (kill in 01 helps A.) | ||
1332 | * When we use a per-memcg waitq, we have to wake up waiters on A and 02 | ||
1333 | * in addition to waiters on 01. We use the global waitq to avoid a mess. | ||
1334 | * It will not be a big problem. | ||
1335 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
1336 | */ | ||
1337 | wake_up_all(&memcg_oom_waitq); | ||
1338 | mutex_unlock(&memcg_oom_mutex); | ||
1339 | |||
1340 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
1341 | return false; | ||
1342 | /* Give chance to dying process */ | ||
1343 | schedule_timeout(1); | ||
1344 | return true; | ||
1220 | } | 1345 | } |
1221 | 1346 | ||
1222 | /* | 1347 | /* |
1223 | * Currently used to update mapped file statistics, but the routine can be | 1348 | * Currently used to update mapped file statistics, but the routine can be |
1224 | * generalized to update other statistics as well. | 1349 | * generalized to update other statistics as well. |
1225 | */ | 1350 | */ |
1226 | void mem_cgroup_update_mapped_file_stat(struct page *page, int val) | 1351 | void mem_cgroup_update_file_mapped(struct page *page, int val) |
1227 | { | 1352 | { |
1228 | struct mem_cgroup *mem; | 1353 | struct mem_cgroup *mem; |
1229 | struct mem_cgroup_stat *stat; | ||
1230 | struct mem_cgroup_stat_cpu *cpustat; | ||
1231 | int cpu; | ||
1232 | struct page_cgroup *pc; | 1354 | struct page_cgroup *pc; |
1233 | 1355 | ||
1234 | if (!page_is_file_cache(page)) | ||
1235 | return; | ||
1236 | |||
1237 | pc = lookup_page_cgroup(page); | 1356 | pc = lookup_page_cgroup(page); |
1238 | if (unlikely(!pc)) | 1357 | if (unlikely(!pc)) |
1239 | return; | 1358 | return; |
1240 | 1359 | ||
1241 | lock_page_cgroup(pc); | 1360 | lock_page_cgroup(pc); |
1242 | mem = pc->mem_cgroup; | 1361 | mem = pc->mem_cgroup; |
1243 | if (!mem) | 1362 | if (!mem || !PageCgroupUsed(pc)) |
1244 | goto done; | ||
1245 | |||
1246 | if (!PageCgroupUsed(pc)) | ||
1247 | goto done; | 1363 | goto done; |
1248 | 1364 | ||
1249 | /* | 1365 | /* |
1250 | * Preemption is already disabled, we don't need get_cpu() | 1366 | * Preemption is already disabled. We can use __this_cpu_xxx |
1251 | */ | 1367 | */ |
1252 | cpu = smp_processor_id(); | 1368 | if (val > 0) { |
1253 | stat = &mem->stat; | 1369 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1254 | cpustat = &stat->cpustat[cpu]; | 1370 | SetPageCgroupFileMapped(pc); |
1371 | } else { | ||
1372 | __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | ||
1373 | ClearPageCgroupFileMapped(pc); | ||
1374 | } | ||
1255 | 1375 | ||
1256 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); | ||
1257 | done: | 1376 | done: |
1258 | unlock_page_cgroup(pc); | 1377 | unlock_page_cgroup(pc); |
1259 | } | 1378 | } |
1260 | 1379 | ||
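The sign of val is the whole contract here: a positive value sets the new FileMapped flag on the page_cgroup and bumps the per-cpu counter, a negative value clears the flag and decrements it. A hedged sketch of how a mapping-side caller would honour that contract on mapcount transitions; the surrounding function bodies are simplified assumptions and are not part of this diff:

/* Illustrative kernel-side sketch, not taken from this patch. */
static void sketch_page_add_file_rmap(struct page *page)
{
	if (atomic_inc_and_test(&page->_mapcount))	/* first mapping: -1 -> 0 */
		mem_cgroup_update_file_mapped(page, 1);	/* set flag, count++ */
}

static void sketch_page_remove_file_rmap(struct page *page)
{
	if (atomic_add_negative(-1, &page->_mapcount))	/* last unmap: 0 -> -1 */
		mem_cgroup_update_file_mapped(page, -1);	/* clear flag, count-- */
}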
1261 | /* | 1380 | /* |
1381 | * Size of the first charge trial. "32" comes from vmscan.c's magic value. | ||
1382 | * TODO: it may be necessary to use bigger numbers on big iron. | ||
1383 | */ | ||
1384 | #define CHARGE_SIZE (32 * PAGE_SIZE) | ||
1385 | struct memcg_stock_pcp { | ||
1386 | struct mem_cgroup *cached; /* this never be root cgroup */ | ||
1387 | int charge; | ||
1388 | struct work_struct work; | ||
1389 | }; | ||
1390 | static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); | ||
1391 | static atomic_t memcg_drain_count; | ||
1392 | |||
1393 | /* | ||
1394 | * Try to consume stocked charge on this cpu. On success, PAGE_SIZE is consumed | ||
1395 | * from the local stock and true is returned. If the stock is 0 or holds charges | ||
1396 | * from a cgroup which is not the current target, returns false. This stock will be | ||
1397 | * refilled. | ||
1398 | */ | ||
1399 | static bool consume_stock(struct mem_cgroup *mem) | ||
1400 | { | ||
1401 | struct memcg_stock_pcp *stock; | ||
1402 | bool ret = true; | ||
1403 | |||
1404 | stock = &get_cpu_var(memcg_stock); | ||
1405 | if (mem == stock->cached && stock->charge) | ||
1406 | stock->charge -= PAGE_SIZE; | ||
1407 | else /* need to call res_counter_charge */ | ||
1408 | ret = false; | ||
1409 | put_cpu_var(memcg_stock); | ||
1410 | return ret; | ||
1411 | } | ||
1412 | |||
1413 | /* | ||
1414 | * Returns stocks cached in percpu to res_counter and reset cached information. | ||
1415 | */ | ||
1416 | static void drain_stock(struct memcg_stock_pcp *stock) | ||
1417 | { | ||
1418 | struct mem_cgroup *old = stock->cached; | ||
1419 | |||
1420 | if (stock->charge) { | ||
1421 | res_counter_uncharge(&old->res, stock->charge); | ||
1422 | if (do_swap_account) | ||
1423 | res_counter_uncharge(&old->memsw, stock->charge); | ||
1424 | } | ||
1425 | stock->cached = NULL; | ||
1426 | stock->charge = 0; | ||
1427 | } | ||
1428 | |||
1429 | /* | ||
1430 | * This must be called under preempt disabled or must be called by | ||
1431 | * a thread which is pinned to local cpu. | ||
1432 | */ | ||
1433 | static void drain_local_stock(struct work_struct *dummy) | ||
1434 | { | ||
1435 | struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); | ||
1436 | drain_stock(stock); | ||
1437 | } | ||
1438 | |||
1439 | /* | ||
1440 | * Cache charges (val), which come from the res_counter, in the local per-cpu area. | ||
1441 | * This will be consumed by the consume_stock() function, later. | ||
1442 | */ | ||
1443 | static void refill_stock(struct mem_cgroup *mem, int val) | ||
1444 | { | ||
1445 | struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); | ||
1446 | |||
1447 | if (stock->cached != mem) { /* reset if necessary */ | ||
1448 | drain_stock(stock); | ||
1449 | stock->cached = mem; | ||
1450 | } | ||
1451 | stock->charge += val; | ||
1452 | put_cpu_var(memcg_stock); | ||
1453 | } | ||
1454 | |||
1455 | /* | ||
1456 | * Tries to drain stocked charges on other cpus. This function is asynchronous | ||
1457 | * and just queues a work item per cpu for draining locally on each cpu. Caller can | ||
1458 | * expect some charges will be back to the res_counter later but cannot wait for | ||
1459 | * it. | ||
1460 | */ | ||
1461 | static void drain_all_stock_async(void) | ||
1462 | { | ||
1463 | int cpu; | ||
1464 | /* This function is for scheduling "drain" in asynchronous way. | ||
1465 | * The result of "drain" is not directly handled by callers. Then, | ||
1466 | * if someone is calling drain, we don't have to call drain more. | ||
1467 | * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if | ||
1468 | * there is a race. We just do loose check here. | ||
1469 | */ | ||
1470 | if (atomic_read(&memcg_drain_count)) | ||
1471 | return; | ||
1472 | /* Notify other cpus that system-wide "drain" is running */ | ||
1473 | atomic_inc(&memcg_drain_count); | ||
1474 | get_online_cpus(); | ||
1475 | for_each_online_cpu(cpu) { | ||
1476 | struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); | ||
1477 | schedule_work_on(cpu, &stock->work); | ||
1478 | } | ||
1479 | put_online_cpus(); | ||
1480 | atomic_dec(&memcg_drain_count); | ||
1481 | /* We don't wait for flush_work */ | ||
1482 | } | ||
1483 | |||
1484 | /* This is a synchronous drain interface. */ | ||
1485 | static void drain_all_stock_sync(void) | ||
1486 | { | ||
1487 | /* called when force_empty is called */ | ||
1488 | atomic_inc(&memcg_drain_count); | ||
1489 | schedule_on_each_cpu(drain_local_stock); | ||
1490 | atomic_dec(&memcg_drain_count); | ||
1491 | } | ||
1492 | |||
1493 | static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | ||
1494 | unsigned long action, | ||
1495 | void *hcpu) | ||
1496 | { | ||
1497 | int cpu = (unsigned long)hcpu; | ||
1498 | struct memcg_stock_pcp *stock; | ||
1499 | |||
1500 | if (action != CPU_DEAD) | ||
1501 | return NOTIFY_OK; | ||
1502 | stock = &per_cpu(memcg_stock, cpu); | ||
1503 | drain_stock(stock); | ||
1504 | return NOTIFY_OK; | ||
1505 | } | ||
1506 | |||
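The notifier above only matters once something registers it; in this patch series that registration is expected to happen once during memcg initialisation, roughly as below. The placement is an assumption, since the registering call is not part of the hunks shown here:

	/* assumed placement: one-time init of the root memory cgroup */
	hotcpu_notifier(memcg_stock_cpu_callback, 0);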
1507 | /* | ||
1262 | * Unlike exported interface, "oom" parameter is added. if oom==true, | 1508 | * Unlike exported interface, "oom" parameter is added. if oom==true, |
1263 | * oom-killer can be invoked. | 1509 | * oom-killer can be invoked. |
1264 | */ | 1510 | */ |
1265 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1511 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
1266 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1512 | gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) |
1267 | bool oom, struct page *page) | ||
1268 | { | 1513 | { |
1269 | struct mem_cgroup *mem, *mem_over_limit; | 1514 | struct mem_cgroup *mem, *mem_over_limit; |
1270 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1515 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1271 | struct res_counter *fail_res; | 1516 | struct res_counter *fail_res; |
1517 | int csize = CHARGE_SIZE; | ||
1272 | 1518 | ||
1273 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1519 | /* |
1274 | /* Don't account this! */ | 1520 | * Unlike the global VM's OOM-kill, we're not in a memory shortage |
1275 | *memcg = NULL; | 1521 | * at the system level. So, allow a dying process to go ahead, in addition to |
1276 | return 0; | 1522 | * the MEMDIE process. |
1277 | } | 1523 | */ |
1524 | if (unlikely(test_thread_flag(TIF_MEMDIE) | ||
1525 | || fatal_signal_pending(current))) | ||
1526 | goto bypass; | ||
1278 | 1527 | ||
1279 | /* | 1528 | /* |
1280 | * We always charge the cgroup the mm_struct belongs to. | 1529 | * We always charge the cgroup the mm_struct belongs to. |
@@ -1293,23 +1542,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1293 | return 0; | 1542 | return 0; |
1294 | 1543 | ||
1295 | VM_BUG_ON(css_is_removed(&mem->css)); | 1544 | VM_BUG_ON(css_is_removed(&mem->css)); |
1545 | if (mem_cgroup_is_root(mem)) | ||
1546 | goto done; | ||
1296 | 1547 | ||
1297 | while (1) { | 1548 | while (1) { |
1298 | int ret = 0; | 1549 | int ret = 0; |
1299 | unsigned long flags = 0; | 1550 | unsigned long flags = 0; |
1300 | 1551 | ||
1301 | if (mem_cgroup_is_root(mem)) | 1552 | if (consume_stock(mem)) |
1302 | goto done; | 1553 | goto done; |
1303 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1554 | |
1555 | ret = res_counter_charge(&mem->res, csize, &fail_res); | ||
1304 | if (likely(!ret)) { | 1556 | if (likely(!ret)) { |
1305 | if (!do_swap_account) | 1557 | if (!do_swap_account) |
1306 | break; | 1558 | break; |
1307 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1559 | ret = res_counter_charge(&mem->memsw, csize, &fail_res); |
1308 | &fail_res); | ||
1309 | if (likely(!ret)) | 1560 | if (likely(!ret)) |
1310 | break; | 1561 | break; |
1311 | /* mem+swap counter fails */ | 1562 | /* mem+swap counter fails */ |
1312 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1563 | res_counter_uncharge(&mem->res, csize); |
1313 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; | 1564 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
1314 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1565 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1315 | memsw); | 1566 | memsw); |
@@ -1318,6 +1569,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1318 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1569 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
1319 | res); | 1570 | res); |
1320 | 1571 | ||
1572 | /* reduce request size and retry */ | ||
1573 | if (csize > PAGE_SIZE) { | ||
1574 | csize = PAGE_SIZE; | ||
1575 | continue; | ||
1576 | } | ||
1321 | if (!(gfp_mask & __GFP_WAIT)) | 1577 | if (!(gfp_mask & __GFP_WAIT)) |
1322 | goto nomem; | 1578 | goto nomem; |
1323 | 1579 | ||
@@ -1337,27 +1593,92 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1337 | if (mem_cgroup_check_under_limit(mem_over_limit)) | 1593 | if (mem_cgroup_check_under_limit(mem_over_limit)) |
1338 | continue; | 1594 | continue; |
1339 | 1595 | ||
1596 | /* try to avoid oom while someone is moving charge */ | ||
1597 | if (mc.moving_task && current != mc.moving_task) { | ||
1598 | struct mem_cgroup *from, *to; | ||
1599 | bool do_continue = false; | ||
1600 | /* | ||
1601 | * There is a small race that "from" or "to" can be | ||
1602 | * freed by rmdir, so we use css_tryget(). | ||
1603 | */ | ||
1604 | from = mc.from; | ||
1605 | to = mc.to; | ||
1606 | if (from && css_tryget(&from->css)) { | ||
1607 | if (mem_over_limit->use_hierarchy) | ||
1608 | do_continue = css_is_ancestor( | ||
1609 | &from->css, | ||
1610 | &mem_over_limit->css); | ||
1611 | else | ||
1612 | do_continue = (from == mem_over_limit); | ||
1613 | css_put(&from->css); | ||
1614 | } | ||
1615 | if (!do_continue && to && css_tryget(&to->css)) { | ||
1616 | if (mem_over_limit->use_hierarchy) | ||
1617 | do_continue = css_is_ancestor( | ||
1618 | &to->css, | ||
1619 | &mem_over_limit->css); | ||
1620 | else | ||
1621 | do_continue = (to == mem_over_limit); | ||
1622 | css_put(&to->css); | ||
1623 | } | ||
1624 | if (do_continue) { | ||
1625 | DEFINE_WAIT(wait); | ||
1626 | prepare_to_wait(&mc.waitq, &wait, | ||
1627 | TASK_INTERRUPTIBLE); | ||
1628 | /* moving charge context might have finished. */ | ||
1629 | if (mc.moving_task) | ||
1630 | schedule(); | ||
1631 | finish_wait(&mc.waitq, &wait); | ||
1632 | continue; | ||
1633 | } | ||
1634 | } | ||
1635 | |||
1340 | if (!nr_retries--) { | 1636 | if (!nr_retries--) { |
1341 | if (oom) { | 1637 | if (!oom) |
1342 | mutex_lock(&memcg_tasklist); | 1638 | goto nomem; |
1343 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1639 | if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { |
1344 | mutex_unlock(&memcg_tasklist); | 1640 | nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
1345 | record_last_oom(mem_over_limit); | 1641 | continue; |
1346 | } | 1642 | } |
1347 | goto nomem; | 1643 | /* When we reach here, current task is dying .*/ |
1644 | css_put(&mem->css); | ||
1645 | goto bypass; | ||
1348 | } | 1646 | } |
1349 | } | 1647 | } |
1350 | /* | 1648 | if (csize > PAGE_SIZE) |
1351 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | 1649 | refill_stock(mem, csize - PAGE_SIZE); |
1352 | * if they exceeds softlimit. | ||
1353 | */ | ||
1354 | if (mem_cgroup_soft_limit_check(mem)) | ||
1355 | mem_cgroup_update_tree(mem, page); | ||
1356 | done: | 1650 | done: |
1357 | return 0; | 1651 | return 0; |
1358 | nomem: | 1652 | nomem: |
1359 | css_put(&mem->css); | 1653 | css_put(&mem->css); |
1360 | return -ENOMEM; | 1654 | return -ENOMEM; |
1655 | bypass: | ||
1656 | *memcg = NULL; | ||
1657 | return 0; | ||
1658 | } | ||
1659 | |||
1660 | /* | ||
1661 | * Sometimes we have to undo a charge we got by try_charge(). | ||
1662 | * This function is for that: it does the uncharge and puts the css refcnt | ||
1663 | * gotten by try_charge(). | ||
1664 | */ | ||
1665 | static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, | ||
1666 | unsigned long count) | ||
1667 | { | ||
1668 | if (!mem_cgroup_is_root(mem)) { | ||
1669 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | ||
1670 | if (do_swap_account) | ||
1671 | res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); | ||
1672 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
1673 | WARN_ON_ONCE(count > INT_MAX); | ||
1674 | __css_put(&mem->css, (int)count); | ||
1675 | } | ||
1676 | /* we don't need css_put for root */ | ||
1677 | } | ||
1678 | |||
1679 | static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) | ||
1680 | { | ||
1681 | __mem_cgroup_cancel_charge(mem, 1); | ||
1361 | } | 1682 | } |
1362 | 1683 | ||
1363 | /* | 1684 | /* |
@@ -1379,25 +1700,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) | |||
1379 | return container_of(css, struct mem_cgroup, css); | 1700 | return container_of(css, struct mem_cgroup, css); |
1380 | } | 1701 | } |
1381 | 1702 | ||
1382 | static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) | 1703 | struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) |
1383 | { | 1704 | { |
1384 | struct mem_cgroup *mem; | 1705 | struct mem_cgroup *mem = NULL; |
1385 | struct page_cgroup *pc; | 1706 | struct page_cgroup *pc; |
1386 | unsigned short id; | 1707 | unsigned short id; |
1387 | swp_entry_t ent; | 1708 | swp_entry_t ent; |
1388 | 1709 | ||
1389 | VM_BUG_ON(!PageLocked(page)); | 1710 | VM_BUG_ON(!PageLocked(page)); |
1390 | 1711 | ||
1391 | if (!PageSwapCache(page)) | ||
1392 | return NULL; | ||
1393 | |||
1394 | pc = lookup_page_cgroup(page); | 1712 | pc = lookup_page_cgroup(page); |
1395 | lock_page_cgroup(pc); | 1713 | lock_page_cgroup(pc); |
1396 | if (PageCgroupUsed(pc)) { | 1714 | if (PageCgroupUsed(pc)) { |
1397 | mem = pc->mem_cgroup; | 1715 | mem = pc->mem_cgroup; |
1398 | if (mem && !css_tryget(&mem->css)) | 1716 | if (mem && !css_tryget(&mem->css)) |
1399 | mem = NULL; | 1717 | mem = NULL; |
1400 | } else { | 1718 | } else if (PageSwapCache(page)) { |
1401 | ent.val = page_private(page); | 1719 | ent.val = page_private(page); |
1402 | id = lookup_swap_cgroup(ent); | 1720 | id = lookup_swap_cgroup(ent); |
1403 | rcu_read_lock(); | 1721 | rcu_read_lock(); |
@@ -1426,12 +1744,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1426 | lock_page_cgroup(pc); | 1744 | lock_page_cgroup(pc); |
1427 | if (unlikely(PageCgroupUsed(pc))) { | 1745 | if (unlikely(PageCgroupUsed(pc))) { |
1428 | unlock_page_cgroup(pc); | 1746 | unlock_page_cgroup(pc); |
1429 | if (!mem_cgroup_is_root(mem)) { | 1747 | mem_cgroup_cancel_charge(mem); |
1430 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1431 | if (do_swap_account) | ||
1432 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1433 | } | ||
1434 | css_put(&mem->css); | ||
1435 | return; | 1748 | return; |
1436 | } | 1749 | } |
1437 | 1750 | ||
@@ -1461,88 +1774,83 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
1461 | mem_cgroup_charge_statistics(mem, pc, true); | 1774 | mem_cgroup_charge_statistics(mem, pc, true); |
1462 | 1775 | ||
1463 | unlock_page_cgroup(pc); | 1776 | unlock_page_cgroup(pc); |
1777 | /* | ||
1778 | * "charge_statistics" updated event counter. Then, check it. | ||
1779 | * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. | ||
1780 | * if they exceeds softlimit. | ||
1781 | */ | ||
1782 | memcg_check_events(mem, pc->page); | ||
1464 | } | 1783 | } |
1465 | 1784 | ||
1466 | /** | 1785 | /** |
1467 | * mem_cgroup_move_account - move account of the page | 1786 | * __mem_cgroup_move_account - move account of the page |
1468 | * @pc: page_cgroup of the page. | 1787 | * @pc: page_cgroup of the page. |
1469 | * @from: mem_cgroup which the page is moved from. | 1788 | * @from: mem_cgroup which the page is moved from. |
1470 | * @to: mem_cgroup which the page is moved to. @from != @to. | 1789 | * @to: mem_cgroup which the page is moved to. @from != @to. |
1790 | * @uncharge: whether we should call uncharge and css_put against @from. | ||
1471 | * | 1791 | * |
1472 | * The caller must confirm following. | 1792 | * The caller must confirm following. |
1473 | * - page is not on LRU (isolate_page() is useful.) | 1793 | * - page is not on LRU (isolate_page() is useful.) |
1794 | * - the pc is locked, used, and ->mem_cgroup points to @from. | ||
1474 | * | 1795 | * |
1475 | * returns 0 at success, | 1796 | * This function doesn't do "charge" nor css_get to new cgroup. It should be |
1476 | * returns -EBUSY when lock is busy or "pc" is unstable. | 1797 | * done by a caller (__mem_cgroup_try_charge would be useful). If @uncharge is |
1477 | * | 1798 | * true, this function does "uncharge" from old cgroup, but it doesn't if |
1478 | * This function does "uncharge" from old cgroup but doesn't do "charge" to | 1799 | * @uncharge is false, so a caller should do "uncharge". |
1479 | * new cgroup. It should be done by a caller. | ||
1480 | */ | 1800 | */ |
1481 | 1801 | ||
1482 | static int mem_cgroup_move_account(struct page_cgroup *pc, | 1802 | static void __mem_cgroup_move_account(struct page_cgroup *pc, |
1483 | struct mem_cgroup *from, struct mem_cgroup *to) | 1803 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) |
1484 | { | 1804 | { |
1485 | struct mem_cgroup_per_zone *from_mz, *to_mz; | ||
1486 | int nid, zid; | ||
1487 | int ret = -EBUSY; | ||
1488 | struct page *page; | ||
1489 | int cpu; | ||
1490 | struct mem_cgroup_stat *stat; | ||
1491 | struct mem_cgroup_stat_cpu *cpustat; | ||
1492 | |||
1493 | VM_BUG_ON(from == to); | 1805 | VM_BUG_ON(from == to); |
1494 | VM_BUG_ON(PageLRU(pc->page)); | 1806 | VM_BUG_ON(PageLRU(pc->page)); |
1495 | 1807 | VM_BUG_ON(!PageCgroupLocked(pc)); | |
1496 | nid = page_cgroup_nid(pc); | 1808 | VM_BUG_ON(!PageCgroupUsed(pc)); |
1497 | zid = page_cgroup_zid(pc); | 1809 | VM_BUG_ON(pc->mem_cgroup != from); |
1498 | from_mz = mem_cgroup_zoneinfo(from, nid, zid); | 1810 | |
1499 | to_mz = mem_cgroup_zoneinfo(to, nid, zid); | 1811 | if (PageCgroupFileMapped(pc)) { |
1500 | 1812 | /* Update mapped_file data for mem_cgroup */ | |
1501 | if (!trylock_page_cgroup(pc)) | 1813 | preempt_disable(); |
1502 | return ret; | 1814 | __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); |
1503 | 1815 | __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | |
1504 | if (!PageCgroupUsed(pc)) | 1816 | preempt_enable(); |
1505 | goto out; | ||
1506 | |||
1507 | if (pc->mem_cgroup != from) | ||
1508 | goto out; | ||
1509 | |||
1510 | if (!mem_cgroup_is_root(from)) | ||
1511 | res_counter_uncharge(&from->res, PAGE_SIZE); | ||
1512 | mem_cgroup_charge_statistics(from, pc, false); | ||
1513 | |||
1514 | page = pc->page; | ||
1515 | if (page_is_file_cache(page) && page_mapped(page)) { | ||
1516 | cpu = smp_processor_id(); | ||
1517 | /* Update mapped_file data for mem_cgroup "from" */ | ||
1518 | stat = &from->stat; | ||
1519 | cpustat = &stat->cpustat[cpu]; | ||
1520 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | ||
1521 | -1); | ||
1522 | |||
1523 | /* Update mapped_file data for mem_cgroup "to" */ | ||
1524 | stat = &to->stat; | ||
1525 | cpustat = &stat->cpustat[cpu]; | ||
1526 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, | ||
1527 | 1); | ||
1528 | } | 1817 | } |
1818 | mem_cgroup_charge_statistics(from, pc, false); | ||
1819 | if (uncharge) | ||
1820 | /* This is not "cancel", but cancel_charge does all we need. */ | ||
1821 | mem_cgroup_cancel_charge(from); | ||
1529 | 1822 | ||
1530 | if (do_swap_account && !mem_cgroup_is_root(from)) | 1823 | /* caller should have done css_get */ |
1531 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
1532 | css_put(&from->css); | ||
1533 | |||
1534 | css_get(&to->css); | ||
1535 | pc->mem_cgroup = to; | 1824 | pc->mem_cgroup = to; |
1536 | mem_cgroup_charge_statistics(to, pc, true); | 1825 | mem_cgroup_charge_statistics(to, pc, true); |
1537 | ret = 0; | ||
1538 | out: | ||
1539 | unlock_page_cgroup(pc); | ||
1540 | /* | 1826 | /* |
1541 | * We charges against "to" which may not have any tasks. Then, "to" | 1827 | * We charges against "to" which may not have any tasks. Then, "to" |
1542 | * can be under rmdir(). But in current implementation, caller of | 1828 | * can be under rmdir(). But in current implementation, caller of |
1543 | * this function is just force_empty() and it's garanteed that | 1829 | * this function is just force_empty() and move charge, so it's |
1544 | * "to" is never removed. So, we don't check rmdir status here. | 1830 | * garanteed that "to" is never removed. So, we don't check rmdir |
1831 | * status here. | ||
1832 | */ | ||
1833 | } | ||
1834 | |||
1835 | /* | ||
1836 | * check whether the @pc is valid for moving account and call | ||
1837 | * __mem_cgroup_move_account() | ||
1838 | */ | ||
1839 | static int mem_cgroup_move_account(struct page_cgroup *pc, | ||
1840 | struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) | ||
1841 | { | ||
1842 | int ret = -EINVAL; | ||
1843 | lock_page_cgroup(pc); | ||
1844 | if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { | ||
1845 | __mem_cgroup_move_account(pc, from, to, uncharge); | ||
1846 | ret = 0; | ||
1847 | } | ||
1848 | unlock_page_cgroup(pc); | ||
1849 | /* | ||
1850 | * check events | ||
1545 | */ | 1851 | */ |
1852 | memcg_check_events(to, pc->page); | ||
1853 | memcg_check_events(from, pc->page); | ||
1546 | return ret; | 1854 | return ret; |
1547 | } | 1855 | } |
1548 | 1856 | ||
@@ -1564,45 +1872,25 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
1564 | if (!pcg) | 1872 | if (!pcg) |
1565 | return -EINVAL; | 1873 | return -EINVAL; |
1566 | 1874 | ||
1875 | ret = -EBUSY; | ||
1876 | if (!get_page_unless_zero(page)) | ||
1877 | goto out; | ||
1878 | if (isolate_lru_page(page)) | ||
1879 | goto put; | ||
1567 | 1880 | ||
1568 | parent = mem_cgroup_from_cont(pcg); | 1881 | parent = mem_cgroup_from_cont(pcg); |
1569 | 1882 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | |
1570 | |||
1571 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); | ||
1572 | if (ret || !parent) | 1883 | if (ret || !parent) |
1573 | return ret; | 1884 | goto put_back; |
1574 | |||
1575 | if (!get_page_unless_zero(page)) { | ||
1576 | ret = -EBUSY; | ||
1577 | goto uncharge; | ||
1578 | } | ||
1579 | |||
1580 | ret = isolate_lru_page(page); | ||
1581 | 1885 | ||
1886 | ret = mem_cgroup_move_account(pc, child, parent, true); | ||
1582 | if (ret) | 1887 | if (ret) |
1583 | goto cancel; | 1888 | mem_cgroup_cancel_charge(parent); |
1584 | 1889 | put_back: | |
1585 | ret = mem_cgroup_move_account(pc, child, parent); | ||
1586 | |||
1587 | putback_lru_page(page); | 1890 | putback_lru_page(page); |
1588 | if (!ret) { | 1891 | put: |
1589 | put_page(page); | ||
1590 | /* drop extra refcnt by try_charge() */ | ||
1591 | css_put(&parent->css); | ||
1592 | return 0; | ||
1593 | } | ||
1594 | |||
1595 | cancel: | ||
1596 | put_page(page); | 1892 | put_page(page); |
1597 | uncharge: | 1893 | out: |
1598 | /* drop extra refcnt by try_charge() */ | ||
1599 | css_put(&parent->css); | ||
1600 | /* uncharge if move fails */ | ||
1601 | if (!mem_cgroup_is_root(parent)) { | ||
1602 | res_counter_uncharge(&parent->res, PAGE_SIZE); | ||
1603 | if (do_swap_account) | ||
1604 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | ||
1605 | } | ||
1606 | return ret; | 1894 | return ret; |
1607 | } | 1895 | } |
1608 | 1896 | ||
@@ -1627,7 +1915,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
1627 | prefetchw(pc); | 1915 | prefetchw(pc); |
1628 | 1916 | ||
1629 | mem = memcg; | 1917 | mem = memcg; |
1630 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); | 1918 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); |
1631 | if (ret || !mem) | 1919 | if (ret || !mem) |
1632 | return ret; | 1920 | return ret; |
1633 | 1921 | ||
@@ -1720,7 +2008,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, | |||
1720 | /* | 2008 | /* |
1721 | * While swap-in, try_charge -> commit or cancel, the page is locked. | 2009 | * While swap-in, try_charge -> commit or cancel, the page is locked. |
1722 | * And when try_charge() successfully returns, one refcnt to memcg without | 2010 | * And when try_charge() successfully returns, one refcnt to memcg without |
1723 | * struct page_cgroup is aquired. This refcnt will be cumsumed by | 2011 | * struct page_cgroup is acquired. This refcnt will be consumed by |
1724 | * "commit()" or removed by "cancel()" | 2012 | * "commit()" or removed by "cancel()" |
1725 | */ | 2013 | */ |
1726 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | 2014 | int mem_cgroup_try_charge_swapin(struct mm_struct *mm, |
@@ -1737,23 +2025,24 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
1737 | goto charge_cur_mm; | 2025 | goto charge_cur_mm; |
1738 | /* | 2026 | /* |
1739 | * A racing thread's fault, or swapoff, may have already updated | 2027 | * A racing thread's fault, or swapoff, may have already updated |
1740 | * the pte, and even removed page from swap cache: return success | 2028 | * the pte, and even removed page from swap cache: in those cases |
1741 | * to go on to do_swap_page()'s pte_same() test, which should fail. | 2029 | * do_swap_page()'s pte_same() test will fail; but there's also a |
2030 | * KSM case which does need to charge the page. | ||
1742 | */ | 2031 | */ |
1743 | if (!PageSwapCache(page)) | 2032 | if (!PageSwapCache(page)) |
1744 | return 0; | 2033 | goto charge_cur_mm; |
1745 | mem = try_get_mem_cgroup_from_swapcache(page); | 2034 | mem = try_get_mem_cgroup_from_page(page); |
1746 | if (!mem) | 2035 | if (!mem) |
1747 | goto charge_cur_mm; | 2036 | goto charge_cur_mm; |
1748 | *ptr = mem; | 2037 | *ptr = mem; |
1749 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); | 2038 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); |
1750 | /* drop extra refcnt from tryget */ | 2039 | /* drop extra refcnt from tryget */ |
1751 | css_put(&mem->css); | 2040 | css_put(&mem->css); |
1752 | return ret; | 2041 | return ret; |
1753 | charge_cur_mm: | 2042 | charge_cur_mm: |
1754 | if (unlikely(!mm)) | 2043 | if (unlikely(!mm)) |
1755 | mm = &init_mm; | 2044 | mm = &init_mm; |
1756 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); | 2045 | return __mem_cgroup_try_charge(mm, mask, ptr, true); |
1757 | } | 2046 | } |
1758 | 2047 | ||
1759 | static void | 2048 | static void |
@@ -1818,14 +2107,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
1818 | return; | 2107 | return; |
1819 | if (!mem) | 2108 | if (!mem) |
1820 | return; | 2109 | return; |
1821 | if (!mem_cgroup_is_root(mem)) { | 2110 | mem_cgroup_cancel_charge(mem); |
1822 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
1823 | if (do_swap_account) | ||
1824 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1825 | } | ||
1826 | css_put(&mem->css); | ||
1827 | } | 2111 | } |
1828 | 2112 | ||
2113 | static void | ||
2114 | __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | ||
2115 | { | ||
2116 | struct memcg_batch_info *batch = NULL; | ||
2117 | bool uncharge_memsw = true; | ||
2118 | /* If swapout, usage of swap doesn't decrease */ | ||
2119 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
2120 | uncharge_memsw = false; | ||
2121 | /* | ||
2122 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2123 | * In those cases, all pages freed continuously can be expected to be in | ||
2124 | * the same cgroup and we have chance to coalesce uncharges. | ||
2125 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
2126 | * because we want to do uncharge as soon as possible. | ||
2127 | */ | ||
2128 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2129 | goto direct_uncharge; | ||
2130 | |||
2131 | batch = ¤t->memcg_batch; | ||
2132 | /* | ||
2133 | * Usually, we do css_get() when we remember a memcg pointer. | ||
2134 | * But in this case, we keep res->usage until end of a series of | ||
2135 | * uncharges. Then, it's ok to ignore memcg's refcnt. | ||
2136 | */ | ||
2137 | if (!batch->memcg) | ||
2138 | batch->memcg = mem; | ||
2139 | /* | ||
2140 | * In the typical case, batch->memcg == mem. This means we can | ||
2141 | * merge a series of uncharges into one uncharge of the res_counter. | ||
2142 | * If not, we uncharge the res_counter one by one. | ||
2143 | */ | ||
2144 | if (batch->memcg != mem) | ||
2145 | goto direct_uncharge; | ||
2146 | /* remember freed charge and uncharge it later */ | ||
2147 | batch->bytes += PAGE_SIZE; | ||
2148 | if (uncharge_memsw) | ||
2149 | batch->memsw_bytes += PAGE_SIZE; | ||
2150 | return; | ||
2151 | direct_uncharge: | ||
2152 | res_counter_uncharge(&mem->res, PAGE_SIZE); | ||
2153 | if (uncharge_memsw) | ||
2154 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
2155 | return; | ||
2156 | } | ||
1829 | 2157 | ||
1830 | /* | 2158 | /* |
1831 | * uncharge if !page_mapped(page) | 2159 | * uncharge if !page_mapped(page) |
@@ -1874,12 +2202,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1874 | break; | 2202 | break; |
1875 | } | 2203 | } |
1876 | 2204 | ||
1877 | if (!mem_cgroup_is_root(mem)) { | 2205 | if (!mem_cgroup_is_root(mem)) |
1878 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2206 | __do_uncharge(mem, ctype); |
1879 | if (do_swap_account && | ||
1880 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
1881 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | ||
1882 | } | ||
1883 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2207 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1884 | mem_cgroup_swap_statistics(mem, true); | 2208 | mem_cgroup_swap_statistics(mem, true); |
1885 | mem_cgroup_charge_statistics(mem, pc, false); | 2209 | mem_cgroup_charge_statistics(mem, pc, false); |
@@ -1895,8 +2219,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
1895 | mz = page_cgroup_zoneinfo(pc); | 2219 | mz = page_cgroup_zoneinfo(pc); |
1896 | unlock_page_cgroup(pc); | 2220 | unlock_page_cgroup(pc); |
1897 | 2221 | ||
1898 | if (mem_cgroup_soft_limit_check(mem)) | 2222 | memcg_check_events(mem, page); |
1899 | mem_cgroup_update_tree(mem, page); | ||
1900 | /* at swapout, this memcg will be accessed to record to swap */ | 2223 | /* at swapout, this memcg will be accessed to record to swap */ |
1901 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2224 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
1902 | css_put(&mem->css); | 2225 | css_put(&mem->css); |
@@ -1925,6 +2248,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page) | |||
1925 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); | 2248 | __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); |
1926 | } | 2249 | } |
1927 | 2250 | ||
2251 | /* | ||
2252 | * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate. | ||
2253 | * In those cases, pages are freed continuously and we can expect them | ||
2254 | * to be in the same memcg. Each of these callers itself limits the number | ||
2255 | * of pages freed at once, so uncharge_start/end() is called properly. | ||
2256 | * This may be called multiple (e.g. 2) times in one context. | ||
2257 | */ | ||
2258 | |||
2259 | void mem_cgroup_uncharge_start(void) | ||
2260 | { | ||
2261 | current->memcg_batch.do_batch++; | ||
2262 | /* Nesting is allowed. */ | ||
2263 | if (current->memcg_batch.do_batch == 1) { | ||
2264 | current->memcg_batch.memcg = NULL; | ||
2265 | current->memcg_batch.bytes = 0; | ||
2266 | current->memcg_batch.memsw_bytes = 0; | ||
2267 | } | ||
2268 | } | ||
2269 | |||
2270 | void mem_cgroup_uncharge_end(void) | ||
2271 | { | ||
2272 | struct memcg_batch_info *batch = ¤t->memcg_batch; | ||
2273 | |||
2274 | if (!batch->do_batch) | ||
2275 | return; | ||
2276 | |||
2277 | batch->do_batch--; | ||
2278 | if (batch->do_batch) /* If stacked, do nothing. */ | ||
2279 | return; | ||
2280 | |||
2281 | if (!batch->memcg) | ||
2282 | return; | ||
2283 | /* | ||
2284 | * This "batch->memcg" is valid without any css_get/put etc., | ||
2285 | * because we hide the charges behind us. | ||
2286 | */ | ||
2287 | if (batch->bytes) | ||
2288 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | ||
2289 | if (batch->memsw_bytes) | ||
2290 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | ||
2291 | /* forget this pointer (for sanity check) */ | ||
2292 | batch->memcg = NULL; | ||
2293 | } | ||
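
The intended call pattern for the two helpers above, as a hedged sketch: example_release_run() is a hypothetical caller invented for illustration, while the three mem_cgroup_* calls are the ones introduced by this patch. A truncate-like path brackets a run of page releases so that all uncharges against the same memcg collapse into a single res_counter operation at the end.

	/* hypothetical caller, for illustration only */
	static void example_release_run(struct page **pages, int nr)
	{
		int i;

		mem_cgroup_uncharge_start();
		for (i = 0; i < nr; i++)
			mem_cgroup_uncharge_cache_page(pages[i]);	/* accumulates in current->memcg_batch */
		mem_cgroup_uncharge_end();				/* settles the batched bytes once */
	}
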
2294 | |||
1928 | #ifdef CONFIG_SWAP | 2295 | #ifdef CONFIG_SWAP |
1929 | /* | 2296 | /* |
1930 | * called after __delete_from_swap_cache() and drop "page" account. | 2297 | * called after __delete_from_swap_cache() and drop "page" account. |
@@ -1979,6 +2346,64 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
1979 | } | 2346 | } |
1980 | rcu_read_unlock(); | 2347 | rcu_read_unlock(); |
1981 | } | 2348 | } |
2349 | |||
2350 | /** | ||
2351 | * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. | ||
2352 | * @entry: swap entry to be moved | ||
2353 | * @from: mem_cgroup which the entry is moved from | ||
2354 | * @to: mem_cgroup which the entry is moved to | ||
2355 | * @need_fixup: whether we should fixup res_counters and refcounts. | ||
2356 | * | ||
2357 | * It succeeds only when the swap_cgroup's record for this entry is the same | ||
2358 | * as the mem_cgroup's id of @from. | ||
2359 | * | ||
2360 | * Returns 0 on success, -EINVAL on failure. | ||
2361 | * | ||
2362 | * The caller must have charged to @to, IOW, called res_counter_charge() on | ||
2363 | * both res and memsw, and called css_get(). | ||
2364 | */ | ||
2365 | static int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2366 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2367 | { | ||
2368 | unsigned short old_id, new_id; | ||
2369 | |||
2370 | old_id = css_id(&from->css); | ||
2371 | new_id = css_id(&to->css); | ||
2372 | |||
2373 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | ||
2374 | mem_cgroup_swap_statistics(from, false); | ||
2375 | mem_cgroup_swap_statistics(to, true); | ||
2376 | /* | ||
2377 | * This function is only called from task migration context now. | ||
2378 | * It postpones res_counter and refcount handling till the end | ||
2379 | * of task migration (mem_cgroup_clear_mc()) for performance | ||
2380 | * improvement. But we cannot postpone mem_cgroup_get(to) | ||
2381 | * because if the process that has been moved to @to does | ||
2382 | * swap-in, the refcount of @to might be decreased to 0. | ||
2383 | */ | ||
2384 | mem_cgroup_get(to); | ||
2385 | if (need_fixup) { | ||
2386 | if (!mem_cgroup_is_root(from)) | ||
2387 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | ||
2388 | mem_cgroup_put(from); | ||
2389 | /* | ||
2390 | * we charged both to->res and to->memsw, so we should | ||
2391 | * uncharge to->res. | ||
2392 | */ | ||
2393 | if (!mem_cgroup_is_root(to)) | ||
2394 | res_counter_uncharge(&to->res, PAGE_SIZE); | ||
2395 | css_put(&to->css); | ||
2396 | } | ||
2397 | return 0; | ||
2398 | } | ||
2399 | return -EINVAL; | ||
2400 | } | ||
2401 | #else | ||
2402 | static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | ||
2403 | struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) | ||
2404 | { | ||
2405 | return -EINVAL; | ||
2406 | } | ||
1982 | #endif | 2407 | #endif |
1983 | 2408 | ||
1984 | /* | 2409 | /* |
@@ -2002,12 +2427,11 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
2002 | } | 2427 | } |
2003 | unlock_page_cgroup(pc); | 2428 | unlock_page_cgroup(pc); |
2004 | 2429 | ||
2430 | *ptr = mem; | ||
2005 | if (mem) { | 2431 | if (mem) { |
2006 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, | 2432 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); |
2007 | page); | ||
2008 | css_put(&mem->css); | 2433 | css_put(&mem->css); |
2009 | } | 2434 | } |
2010 | *ptr = mem; | ||
2011 | return ret; | 2435 | return ret; |
2012 | } | 2436 | } |
2013 | 2437 | ||
@@ -2100,7 +2524,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2100 | unsigned long long val) | 2524 | unsigned long long val) |
2101 | { | 2525 | { |
2102 | int retry_count; | 2526 | int retry_count; |
2103 | int progress; | ||
2104 | u64 memswlimit; | 2527 | u64 memswlimit; |
2105 | int ret = 0; | 2528 | int ret = 0; |
2106 | int children = mem_cgroup_count_children(memcg); | 2529 | int children = mem_cgroup_count_children(memcg); |
@@ -2144,8 +2567,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2144 | if (!ret) | 2567 | if (!ret) |
2145 | break; | 2568 | break; |
2146 | 2569 | ||
2147 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, | 2570 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
2148 | GFP_KERNEL, | ||
2149 | MEM_CGROUP_RECLAIM_SHRINK); | 2571 | MEM_CGROUP_RECLAIM_SHRINK); |
2150 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2572 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
2151 | /* Usage is reduced ? */ | 2573 | /* Usage is reduced ? */ |
@@ -2334,7 +2756,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, | |||
2334 | pc = list_entry(list->prev, struct page_cgroup, lru); | 2756 | pc = list_entry(list->prev, struct page_cgroup, lru); |
2335 | if (busy == pc) { | 2757 | if (busy == pc) { |
2336 | list_move(&pc->lru, list); | 2758 | list_move(&pc->lru, list); |
2337 | busy = 0; | 2759 | busy = NULL; |
2338 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 2760 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
2339 | continue; | 2761 | continue; |
2340 | } | 2762 | } |
@@ -2375,7 +2797,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) | |||
2375 | if (free_all) | 2797 | if (free_all) |
2376 | goto try_to_free; | 2798 | goto try_to_free; |
2377 | move_account: | 2799 | move_account: |
2378 | while (mem->res.usage > 0) { | 2800 | do { |
2379 | ret = -EBUSY; | 2801 | ret = -EBUSY; |
2380 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) | 2802 | if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) |
2381 | goto out; | 2803 | goto out; |
@@ -2384,6 +2806,7 @@ move_account: | |||
2384 | goto out; | 2806 | goto out; |
2385 | /* This is for making all *used* pages to be on LRU. */ | 2807 | /* This is for making all *used* pages to be on LRU. */ |
2386 | lru_add_drain_all(); | 2808 | lru_add_drain_all(); |
2809 | drain_all_stock_sync(); | ||
2387 | ret = 0; | 2810 | ret = 0; |
2388 | for_each_node_state(node, N_HIGH_MEMORY) { | 2811 | for_each_node_state(node, N_HIGH_MEMORY) { |
2389 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 2812 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
@@ -2402,8 +2825,8 @@ move_account: | |||
2402 | if (ret == -ENOMEM) | 2825 | if (ret == -ENOMEM) |
2403 | goto try_to_free; | 2826 | goto try_to_free; |
2404 | cond_resched(); | 2827 | cond_resched(); |
2405 | } | 2828 | /* "ret" should also be checked to ensure all lists are empty. */ |
2406 | ret = 0; | 2829 | } while (mem->res.usage > 0 || ret); |
2407 | out: | 2830 | out: |
2408 | css_put(&mem->css); | 2831 | css_put(&mem->css); |
2409 | return ret; | 2832 | return ret; |
@@ -2436,10 +2859,7 @@ try_to_free: | |||
2436 | } | 2859 | } |
2437 | lru_add_drain(); | 2860 | lru_add_drain(); |
2438 | /* try move_account...there may be some *locked* pages. */ | 2861 | /* try move_account...there may be some *locked* pages. */ |
2439 | if (mem->res.usage) | 2862 | goto move_account; |
2440 | goto move_account; | ||
2441 | ret = 0; | ||
2442 | goto out; | ||
2443 | } | 2863 | } |
2444 | 2864 | ||
2445 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) | 2865 | int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) |
@@ -2466,7 +2886,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
2466 | 2886 | ||
2467 | cgroup_lock(); | 2887 | cgroup_lock(); |
2468 | /* | 2888 | /* |
2469 | * If parent's use_hiearchy is set, we can't make any modifications | 2889 | * If parent's use_hierarchy is set, we can't make any modifications |
2470 | * in the child subtrees. If it is unset, then the change can | 2890 | * in the child subtrees. If it is unset, then the change can |
2471 | * occur, provided the current cgroup has no children. | 2891 | * occur, provided the current cgroup has no children. |
2472 | * | 2892 | * |
@@ -2495,7 +2915,7 @@ static int | |||
2495 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | 2915 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) |
2496 | { | 2916 | { |
2497 | struct mem_cgroup_idx_data *d = data; | 2917 | struct mem_cgroup_idx_data *d = data; |
2498 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | 2918 | d->val += mem_cgroup_read_stat(mem, d->idx); |
2499 | return 0; | 2919 | return 0; |
2500 | } | 2920 | } |
2501 | 2921 | ||
@@ -2510,39 +2930,50 @@ mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | |||
2510 | *val = d.val; | 2930 | *val = d.val; |
2511 | } | 2931 | } |
2512 | 2932 | ||
2933 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | ||
2934 | { | ||
2935 | u64 idx_val, val; | ||
2936 | |||
2937 | if (!mem_cgroup_is_root(mem)) { | ||
2938 | if (!swap) | ||
2939 | return res_counter_read_u64(&mem->res, RES_USAGE); | ||
2940 | else | ||
2941 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | ||
2942 | } | ||
2943 | |||
2944 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); | ||
2945 | val = idx_val; | ||
2946 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); | ||
2947 | val += idx_val; | ||
2948 | |||
2949 | if (swap) { | ||
2950 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2951 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2952 | val += idx_val; | ||
2953 | } | ||
2954 | |||
2955 | return val << PAGE_SHIFT; | ||
2956 | } | ||
2957 | |||
2513 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2958 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
2514 | { | 2959 | { |
2515 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2960 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
2516 | u64 idx_val, val; | 2961 | u64 val; |
2517 | int type, name; | 2962 | int type, name; |
2518 | 2963 | ||
2519 | type = MEMFILE_TYPE(cft->private); | 2964 | type = MEMFILE_TYPE(cft->private); |
2520 | name = MEMFILE_ATTR(cft->private); | 2965 | name = MEMFILE_ATTR(cft->private); |
2521 | switch (type) { | 2966 | switch (type) { |
2522 | case _MEM: | 2967 | case _MEM: |
2523 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2968 | if (name == RES_USAGE) |
2524 | mem_cgroup_get_recursive_idx_stat(mem, | 2969 | val = mem_cgroup_usage(mem, false); |
2525 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2970 | else |
2526 | val = idx_val; | ||
2527 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2528 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2529 | val += idx_val; | ||
2530 | val <<= PAGE_SHIFT; | ||
2531 | } else | ||
2532 | val = res_counter_read_u64(&mem->res, name); | 2971 | val = res_counter_read_u64(&mem->res, name); |
2533 | break; | 2972 | break; |
2534 | case _MEMSWAP: | 2973 | case _MEMSWAP: |
2535 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { | 2974 | if (name == RES_USAGE) |
2536 | mem_cgroup_get_recursive_idx_stat(mem, | 2975 | val = mem_cgroup_usage(mem, true); |
2537 | MEM_CGROUP_STAT_CACHE, &idx_val); | 2976 | else |
2538 | val = idx_val; | ||
2539 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2540 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
2541 | val += idx_val; | ||
2542 | mem_cgroup_get_recursive_idx_stat(mem, | ||
2543 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
2544 | val <<= PAGE_SHIFT; | ||
2545 | } else | ||
2546 | val = res_counter_read_u64(&mem->memsw, name); | 2977 | val = res_counter_read_u64(&mem->memsw, name); |
2547 | break; | 2978 | break; |
2548 | default: | 2979 | default: |
@@ -2655,12 +3086,45 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
2655 | return 0; | 3086 | return 0; |
2656 | } | 3087 | } |
2657 | 3088 | ||
3089 | static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, | ||
3090 | struct cftype *cft) | ||
3091 | { | ||
3092 | return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; | ||
3093 | } | ||
3094 | |||
3095 | #ifdef CONFIG_MMU | ||
3096 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3097 | struct cftype *cft, u64 val) | ||
3098 | { | ||
3099 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3100 | |||
3101 | if (val >= (1 << NR_MOVE_TYPE)) | ||
3102 | return -EINVAL; | ||
3103 | /* | ||
3104 | * We check this value several times in both can_attach() and | ||
3105 | * attach(), so we need cgroup lock to prevent this value from being | ||
3106 | * inconsistent. | ||
3107 | */ | ||
3108 | cgroup_lock(); | ||
3109 | mem->move_charge_at_immigrate = val; | ||
3110 | cgroup_unlock(); | ||
3111 | |||
3112 | return 0; | ||
3113 | } | ||
3114 | #else | ||
3115 | static int mem_cgroup_move_charge_write(struct cgroup *cgrp, | ||
3116 | struct cftype *cft, u64 val) | ||
3117 | { | ||
3118 | return -ENOSYS; | ||
3119 | } | ||
3120 | #endif | ||
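
For illustration, a hedged user-space sketch of how the knob added above would typically be used; the /cgroup/memory/dst paths are hypothetical, and the tasks file is assumed to be the usual cgroup v1 interface. Enabling bit 0 (anonymous pages) in the destination group and then moving a task into it is what triggers the can_attach()/attach() path added later in this patch.

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <fcntl.h>

	static int write_str(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, val, strlen(val)) < 0) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		char pid[32];

		/* bit 0: move charges of anonymous pages along with the task */
		if (write_str("/cgroup/memory/dst/memory.move_charge_at_immigrate", "1"))
			return 1;

		snprintf(pid, sizeof(pid), "%d", getpid());
		/* moving the task triggers can_attach() -> precharge, attach() -> move */
		return write_str("/cgroup/memory/dst/tasks", pid) ? 1 : 0;
	}
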
3121 | |||
2658 | 3122 | ||
2659 | /* For read statistics */ | 3123 | /* For read statistics */ |
2660 | enum { | 3124 | enum { |
2661 | MCS_CACHE, | 3125 | MCS_CACHE, |
2662 | MCS_RSS, | 3126 | MCS_RSS, |
2663 | MCS_MAPPED_FILE, | 3127 | MCS_FILE_MAPPED, |
2664 | MCS_PGPGIN, | 3128 | MCS_PGPGIN, |
2665 | MCS_PGPGOUT, | 3129 | MCS_PGPGOUT, |
2666 | MCS_SWAP, | 3130 | MCS_SWAP, |
@@ -2700,18 +3164,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
2700 | s64 val; | 3164 | s64 val; |
2701 | 3165 | ||
2702 | /* per cpu stat */ | 3166 | /* per cpu stat */ |
2703 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); | 3167 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); |
2704 | s->stat[MCS_CACHE] += val * PAGE_SIZE; | 3168 | s->stat[MCS_CACHE] += val * PAGE_SIZE; |
2705 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); | 3169 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); |
2706 | s->stat[MCS_RSS] += val * PAGE_SIZE; | 3170 | s->stat[MCS_RSS] += val * PAGE_SIZE; |
2707 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); | 3171 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); |
2708 | s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; | 3172 | s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; |
2709 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); | 3173 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); |
2710 | s->stat[MCS_PGPGIN] += val; | 3174 | s->stat[MCS_PGPGIN] += val; |
2711 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 3175 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
2712 | s->stat[MCS_PGPGOUT] += val; | 3176 | s->stat[MCS_PGPGOUT] += val; |
2713 | if (do_swap_account) { | 3177 | if (do_swap_account) { |
2714 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | 3178 | val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); |
2715 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | 3179 | s->stat[MCS_SWAP] += val * PAGE_SIZE; |
2716 | } | 3180 | } |
2717 | 3181 | ||
@@ -2839,12 +3303,249 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, | |||
2839 | return 0; | 3303 | return 0; |
2840 | } | 3304 | } |
2841 | 3305 | ||
3306 | static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | ||
3307 | { | ||
3308 | struct mem_cgroup_threshold_ary *t; | ||
3309 | u64 usage; | ||
3310 | int i; | ||
3311 | |||
3312 | rcu_read_lock(); | ||
3313 | if (!swap) | ||
3314 | t = rcu_dereference(memcg->thresholds); | ||
3315 | else | ||
3316 | t = rcu_dereference(memcg->memsw_thresholds); | ||
3317 | |||
3318 | if (!t) | ||
3319 | goto unlock; | ||
3320 | |||
3321 | usage = mem_cgroup_usage(memcg, swap); | ||
3322 | |||
3323 | /* | ||
3324 | * current_threshold points to the threshold just below the current usage. | ||
3325 | * If that is no longer true, a threshold was crossed after the last | ||
3326 | * call of __mem_cgroup_threshold(). | ||
3327 | */ | ||
3328 | i = atomic_read(&t->current_threshold); | ||
3329 | |||
3330 | /* | ||
3331 | * Iterate backward over the array of thresholds starting from | ||
3332 | * current_threshold and check if a threshold is crossed. | ||
3333 | * If none of the thresholds below usage is crossed, we read | ||
3334 | * only one element of the array here. | ||
3335 | */ | ||
3336 | for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) | ||
3337 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3338 | |||
3339 | /* i = current_threshold + 1 */ | ||
3340 | i++; | ||
3341 | |||
3342 | /* | ||
3343 | * Iterate forward over the array of thresholds starting from | ||
3344 | * current_threshold+1 and check if a threshold is crossed. | ||
3345 | * If none of the thresholds above usage is crossed, we read | ||
3346 | * only one element of the array here. | ||
3347 | */ | ||
3348 | for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) | ||
3349 | eventfd_signal(t->entries[i].eventfd, 1); | ||
3350 | |||
3351 | /* Update current_threshold */ | ||
3352 | atomic_set(&t->current_threshold, i - 1); | ||
3353 | unlock: | ||
3354 | rcu_read_unlock(); | ||
3355 | } | ||
3356 | |||
3357 | static void mem_cgroup_threshold(struct mem_cgroup *memcg) | ||
3358 | { | ||
3359 | __mem_cgroup_threshold(memcg, false); | ||
3360 | if (do_swap_account) | ||
3361 | __mem_cgroup_threshold(memcg, true); | ||
3362 | } | ||
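
The scan above is easier to follow as a stand-alone model. The sketch below (user-space, illustrative only; all names are invented for the example) keeps the thresholds sorted ascending and tracks current_threshold as the index of the highest threshold not above the last-seen usage, signalling every threshold crossed in either direction.

	#include <stdio.h>

	struct threshold { unsigned long long value; };

	static void check_thresholds(struct threshold *t, int size,
				     int *current_threshold, unsigned long long usage)
	{
		int i = *current_threshold;

		/* usage dropped: signal every threshold we fell back below */
		for (; i >= 0 && t[i].value > usage; i--)
			printf("crossed down through %llu\n", t[i].value);
		i++;
		/* usage grew: signal every threshold we climbed above */
		for (; i < size && t[i].value <= usage; i++)
			printf("crossed up through %llu\n", t[i].value);

		*current_threshold = i - 1;
	}

	int main(void)
	{
		struct threshold t[] = { {100}, {200}, {300} };
		int cur = -1;

		check_thresholds(t, 3, &cur, 250);	/* crosses 100 and 200 */
		check_thresholds(t, 3, &cur, 50);	/* crosses back below 200 and 100 */
		return 0;
	}
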
3363 | |||
3364 | static int compare_thresholds(const void *a, const void *b) | ||
3365 | { | ||
3366 | const struct mem_cgroup_threshold *_a = a; | ||
3367 | const struct mem_cgroup_threshold *_b = b; | ||
3368 | |||
3369 | return _a->threshold - _b->threshold; | ||
3370 | } | ||
3371 | |||
3372 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | ||
3373 | struct eventfd_ctx *eventfd, const char *args) | ||
3374 | { | ||
3375 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3376 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3377 | int type = MEMFILE_TYPE(cft->private); | ||
3378 | u64 threshold, usage; | ||
3379 | int size; | ||
3380 | int i, ret; | ||
3381 | |||
3382 | ret = res_counter_memparse_write_strategy(args, &threshold); | ||
3383 | if (ret) | ||
3384 | return ret; | ||
3385 | |||
3386 | mutex_lock(&memcg->thresholds_lock); | ||
3387 | if (type == _MEM) | ||
3388 | thresholds = memcg->thresholds; | ||
3389 | else if (type == _MEMSWAP) | ||
3390 | thresholds = memcg->memsw_thresholds; | ||
3391 | else | ||
3392 | BUG(); | ||
3393 | |||
3394 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3395 | |||
3396 | /* Check if a threshold was crossed before adding a new one */ | ||
3397 | if (thresholds) | ||
3398 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3399 | |||
3400 | if (thresholds) | ||
3401 | size = thresholds->size + 1; | ||
3402 | else | ||
3403 | size = 1; | ||
3404 | |||
3405 | /* Allocate memory for new array of thresholds */ | ||
3406 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3407 | size * sizeof(struct mem_cgroup_threshold), | ||
3408 | GFP_KERNEL); | ||
3409 | if (!thresholds_new) { | ||
3410 | ret = -ENOMEM; | ||
3411 | goto unlock; | ||
3412 | } | ||
3413 | thresholds_new->size = size; | ||
3414 | |||
3415 | /* Copy thresholds (if any) to new array */ | ||
3416 | if (thresholds) | ||
3417 | memcpy(thresholds_new->entries, thresholds->entries, | ||
3418 | thresholds->size * | ||
3419 | sizeof(struct mem_cgroup_threshold)); | ||
3420 | /* Add new threshold */ | ||
3421 | thresholds_new->entries[size - 1].eventfd = eventfd; | ||
3422 | thresholds_new->entries[size - 1].threshold = threshold; | ||
3423 | |||
3424 | /* Sort thresholds. Registering a new threshold isn't time-critical */ | ||
3425 | sort(thresholds_new->entries, size, | ||
3426 | sizeof(struct mem_cgroup_threshold), | ||
3427 | compare_thresholds, NULL); | ||
3428 | |||
3429 | /* Find current threshold */ | ||
3430 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3431 | for (i = 0; i < size; i++) { | ||
3432 | if (thresholds_new->entries[i].threshold < usage) { | ||
3433 | /* | ||
3434 | * thresholds_new->current_threshold will not be used | ||
3435 | * until rcu_assign_pointer(), so it's safe to increment | ||
3436 | * it here. | ||
3437 | */ | ||
3438 | atomic_inc(&thresholds_new->current_threshold); | ||
3439 | } | ||
3440 | } | ||
3441 | |||
3442 | if (type == _MEM) | ||
3443 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3444 | else | ||
3445 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3446 | |||
3447 | /* Make sure that nobody uses the old thresholds array before freeing it */ | ||
3448 | synchronize_rcu(); | ||
3449 | |||
3450 | kfree(thresholds); | ||
3451 | unlock: | ||
3452 | mutex_unlock(&memcg->thresholds_lock); | ||
3453 | |||
3454 | return ret; | ||
3455 | } | ||
3456 | |||
3457 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | ||
3458 | struct eventfd_ctx *eventfd) | ||
3459 | { | ||
3460 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3461 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | ||
3462 | int type = MEMFILE_TYPE(cft->private); | ||
3463 | u64 usage; | ||
3464 | int size = 0; | ||
3465 | int i, j, ret; | ||
3466 | |||
3467 | mutex_lock(&memcg->thresholds_lock); | ||
3468 | if (type == _MEM) | ||
3469 | thresholds = memcg->thresholds; | ||
3470 | else if (type == _MEMSWAP) | ||
3471 | thresholds = memcg->memsw_thresholds; | ||
3472 | else | ||
3473 | BUG(); | ||
3474 | |||
3475 | /* | ||
3476 | * Something went wrong if we are trying to unregister a threshold | ||
3477 | * when we don't have any thresholds. | ||
3478 | */ | ||
3479 | BUG_ON(!thresholds); | ||
3480 | |||
3481 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | ||
3482 | |||
3483 | /* Check if a threshold was crossed before removing */ | ||
3484 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | ||
3485 | |||
3486 | /* Calculate the new number of thresholds */ | ||
3487 | for (i = 0; i < thresholds->size; i++) { | ||
3488 | if (thresholds->entries[i].eventfd != eventfd) | ||
3489 | size++; | ||
3490 | } | ||
3491 | |||
3492 | /* Set thresholds array to NULL if we don't have thresholds */ | ||
3493 | if (!size) { | ||
3494 | thresholds_new = NULL; | ||
3495 | goto assign; | ||
3496 | } | ||
3497 | |||
3498 | /* Allocate memory for new array of thresholds */ | ||
3499 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3500 | size * sizeof(struct mem_cgroup_threshold), | ||
3501 | GFP_KERNEL); | ||
3502 | if (!thresholds_new) { | ||
3503 | ret = -ENOMEM; | ||
3504 | goto unlock; | ||
3505 | } | ||
3506 | thresholds_new->size = size; | ||
3507 | |||
3508 | /* Copy thresholds and find current threshold */ | ||
3509 | atomic_set(&thresholds_new->current_threshold, -1); | ||
3510 | for (i = 0, j = 0; i < thresholds->size; i++) { | ||
3511 | if (thresholds->entries[i].eventfd == eventfd) | ||
3512 | continue; | ||
3513 | |||
3514 | thresholds_new->entries[j] = thresholds->entries[i]; | ||
3515 | if (thresholds_new->entries[j].threshold < usage) { | ||
3516 | /* | ||
3517 | * thresholds_new->current_threshold will not be used | ||
3518 | * until rcu_assign_pointer(), so it's safe to increment | ||
3519 | * it here. | ||
3520 | */ | ||
3521 | atomic_inc(&thresholds_new->current_threshold); | ||
3522 | } | ||
3523 | j++; | ||
3524 | } | ||
3525 | |||
3526 | assign: | ||
3527 | if (type == _MEM) | ||
3528 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | ||
3529 | else | ||
3530 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3531 | |||
3532 | /* Make sure that nobody uses the old thresholds array before freeing it */ | ||
3533 | synchronize_rcu(); | ||
3534 | |||
3535 | kfree(thresholds); | ||
3536 | unlock: | ||
3537 | mutex_unlock(&memcg->thresholds_lock); | ||
3538 | |||
3539 | return ret; | ||
3540 | } | ||
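
From user space, these register/unregister handlers are reached through the cgroup eventfd interface. The sketch below is a hedged example assuming the usual cgroup v1 cgroup.event_control format of "<event_fd> <fd of the control file> <args>"; the mount point and group name are hypothetical. It blocks until memory.usage_in_bytes crosses 50M.

	#include <sys/eventfd.h>
	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int efd = eventfd(0, 0);
		int ufd = open("/cgroup/memory/mygroup/memory.usage_in_bytes", O_RDONLY);
		int cfd = open("/cgroup/memory/mygroup/cgroup.event_control", O_WRONLY);
		char buf[64];
		uint64_t ticks;

		if (efd < 0 || ufd < 0 || cfd < 0)
			return 1;

		/* "<eventfd> <fd of the control file> <threshold in bytes>" */
		snprintf(buf, sizeof(buf), "%d %d %llu", efd, ufd, 50ULL << 20);
		if (write(cfd, buf, strlen(buf)) < 0)
			return 1;

		/* blocks until mem_cgroup_threshold() signals the eventfd */
		if (read(efd, &ticks, sizeof(ticks)) != sizeof(ticks))
			return 1;
		printf("memory threshold crossed %llu time(s)\n",
		       (unsigned long long)ticks);
		return 0;
	}
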
2842 | 3541 | ||
2843 | static struct cftype mem_cgroup_files[] = { | 3542 | static struct cftype mem_cgroup_files[] = { |
2844 | { | 3543 | { |
2845 | .name = "usage_in_bytes", | 3544 | .name = "usage_in_bytes", |
2846 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3545 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
2847 | .read_u64 = mem_cgroup_read, | 3546 | .read_u64 = mem_cgroup_read, |
3547 | .register_event = mem_cgroup_register_event, | ||
3548 | .unregister_event = mem_cgroup_unregister_event, | ||
2848 | }, | 3549 | }, |
2849 | { | 3550 | { |
2850 | .name = "max_usage_in_bytes", | 3551 | .name = "max_usage_in_bytes", |
@@ -2888,6 +3589,11 @@ static struct cftype mem_cgroup_files[] = { | |||
2888 | .read_u64 = mem_cgroup_swappiness_read, | 3589 | .read_u64 = mem_cgroup_swappiness_read, |
2889 | .write_u64 = mem_cgroup_swappiness_write, | 3590 | .write_u64 = mem_cgroup_swappiness_write, |
2890 | }, | 3591 | }, |
3592 | { | ||
3593 | .name = "move_charge_at_immigrate", | ||
3594 | .read_u64 = mem_cgroup_move_charge_read, | ||
3595 | .write_u64 = mem_cgroup_move_charge_write, | ||
3596 | }, | ||
2891 | }; | 3597 | }; |
2892 | 3598 | ||
2893 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3599 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -2896,6 +3602,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
2896 | .name = "memsw.usage_in_bytes", | 3602 | .name = "memsw.usage_in_bytes", |
2897 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3603 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
2898 | .read_u64 = mem_cgroup_read, | 3604 | .read_u64 = mem_cgroup_read, |
3605 | .register_event = mem_cgroup_register_event, | ||
3606 | .unregister_event = mem_cgroup_unregister_event, | ||
2899 | }, | 3607 | }, |
2900 | { | 3608 | { |
2901 | .name = "memsw.max_usage_in_bytes", | 3609 | .name = "memsw.max_usage_in_bytes", |
@@ -2970,24 +3678,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
2970 | kfree(mem->info.nodeinfo[node]); | 3678 | kfree(mem->info.nodeinfo[node]); |
2971 | } | 3679 | } |
2972 | 3680 | ||
2973 | static int mem_cgroup_size(void) | ||
2974 | { | ||
2975 | int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); | ||
2976 | return sizeof(struct mem_cgroup) + cpustat_size; | ||
2977 | } | ||
2978 | |||
2979 | static struct mem_cgroup *mem_cgroup_alloc(void) | 3681 | static struct mem_cgroup *mem_cgroup_alloc(void) |
2980 | { | 3682 | { |
2981 | struct mem_cgroup *mem; | 3683 | struct mem_cgroup *mem; |
2982 | int size = mem_cgroup_size(); | 3684 | int size = sizeof(struct mem_cgroup); |
2983 | 3685 | ||
3686 | /* Can be very big if MAX_NUMNODES is very big */ | ||
2984 | if (size < PAGE_SIZE) | 3687 | if (size < PAGE_SIZE) |
2985 | mem = kmalloc(size, GFP_KERNEL); | 3688 | mem = kmalloc(size, GFP_KERNEL); |
2986 | else | 3689 | else |
2987 | mem = vmalloc(size); | 3690 | mem = vmalloc(size); |
2988 | 3691 | ||
2989 | if (mem) | 3692 | if (!mem) |
2990 | memset(mem, 0, size); | 3693 | return NULL; |
3694 | |||
3695 | memset(mem, 0, size); | ||
3696 | mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); | ||
3697 | if (!mem->stat) { | ||
3698 | if (size < PAGE_SIZE) | ||
3699 | kfree(mem); | ||
3700 | else | ||
3701 | vfree(mem); | ||
3702 | mem = NULL; | ||
3703 | } | ||
2991 | return mem; | 3704 | return mem; |
2992 | } | 3705 | } |
2993 | 3706 | ||
@@ -3012,7 +3725,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
3012 | for_each_node_state(node, N_POSSIBLE) | 3725 | for_each_node_state(node, N_POSSIBLE) |
3013 | free_mem_cgroup_per_zone_info(mem, node); | 3726 | free_mem_cgroup_per_zone_info(mem, node); |
3014 | 3727 | ||
3015 | if (mem_cgroup_size() < PAGE_SIZE) | 3728 | free_percpu(mem->stat); |
3729 | if (sizeof(struct mem_cgroup) < PAGE_SIZE) | ||
3016 | kfree(mem); | 3730 | kfree(mem); |
3017 | else | 3731 | else |
3018 | vfree(mem); | 3732 | vfree(mem); |
@@ -3023,9 +3737,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem) | |||
3023 | atomic_inc(&mem->refcnt); | 3737 | atomic_inc(&mem->refcnt); |
3024 | } | 3738 | } |
3025 | 3739 | ||
3026 | static void mem_cgroup_put(struct mem_cgroup *mem) | 3740 | static void __mem_cgroup_put(struct mem_cgroup *mem, int count) |
3027 | { | 3741 | { |
3028 | if (atomic_dec_and_test(&mem->refcnt)) { | 3742 | if (atomic_sub_and_test(count, &mem->refcnt)) { |
3029 | struct mem_cgroup *parent = parent_mem_cgroup(mem); | 3743 | struct mem_cgroup *parent = parent_mem_cgroup(mem); |
3030 | __mem_cgroup_free(mem); | 3744 | __mem_cgroup_free(mem); |
3031 | if (parent) | 3745 | if (parent) |
@@ -3033,6 +3747,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem) | |||
3033 | } | 3747 | } |
3034 | } | 3748 | } |
3035 | 3749 | ||
3750 | static void mem_cgroup_put(struct mem_cgroup *mem) | ||
3751 | { | ||
3752 | __mem_cgroup_put(mem, 1); | ||
3753 | } | ||
3754 | |||
3036 | /* | 3755 | /* |
3037 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. | 3756 | * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. |
3038 | */ | 3757 | */ |
@@ -3097,12 +3816,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3097 | 3816 | ||
3098 | /* root ? */ | 3817 | /* root ? */ |
3099 | if (cont->parent == NULL) { | 3818 | if (cont->parent == NULL) { |
3819 | int cpu; | ||
3100 | enable_swap_cgroup(); | 3820 | enable_swap_cgroup(); |
3101 | parent = NULL; | 3821 | parent = NULL; |
3102 | root_mem_cgroup = mem; | 3822 | root_mem_cgroup = mem; |
3103 | if (mem_cgroup_soft_limit_tree_init()) | 3823 | if (mem_cgroup_soft_limit_tree_init()) |
3104 | goto free_out; | 3824 | goto free_out; |
3105 | 3825 | for_each_possible_cpu(cpu) { | |
3826 | struct memcg_stock_pcp *stock = | ||
3827 | &per_cpu(memcg_stock, cpu); | ||
3828 | INIT_WORK(&stock->work, drain_local_stock); | ||
3829 | } | ||
3830 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | ||
3106 | } else { | 3831 | } else { |
3107 | parent = mem_cgroup_from_cont(cont->parent); | 3832 | parent = mem_cgroup_from_cont(cont->parent); |
3108 | mem->use_hierarchy = parent->use_hierarchy; | 3833 | mem->use_hierarchy = parent->use_hierarchy; |
@@ -3128,6 +3853,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3128 | if (parent) | 3853 | if (parent) |
3129 | mem->swappiness = get_swappiness(parent); | 3854 | mem->swappiness = get_swappiness(parent); |
3130 | atomic_set(&mem->refcnt, 1); | 3855 | atomic_set(&mem->refcnt, 1); |
3856 | mem->move_charge_at_immigrate = 0; | ||
3857 | mutex_init(&mem->thresholds_lock); | ||
3131 | return &mem->css; | 3858 | return &mem->css; |
3132 | free_out: | 3859 | free_out: |
3133 | __mem_cgroup_free(mem); | 3860 | __mem_cgroup_free(mem); |
@@ -3164,19 +3891,445 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
3164 | return ret; | 3891 | return ret; |
3165 | } | 3892 | } |
3166 | 3893 | ||
3894 | #ifdef CONFIG_MMU | ||
3895 | /* Handlers for move charge at task migration. */ | ||
3896 | #define PRECHARGE_COUNT_AT_ONCE 256 | ||
3897 | static int mem_cgroup_do_precharge(unsigned long count) | ||
3898 | { | ||
3899 | int ret = 0; | ||
3900 | int batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3901 | struct mem_cgroup *mem = mc.to; | ||
3902 | |||
3903 | if (mem_cgroup_is_root(mem)) { | ||
3904 | mc.precharge += count; | ||
3905 | /* we don't need css_get for root */ | ||
3906 | return ret; | ||
3907 | } | ||
3908 | /* try to charge at once */ | ||
3909 | if (count > 1) { | ||
3910 | struct res_counter *dummy; | ||
3911 | /* | ||
3912 | * "mem" cannot be under rmdir() because we've already checked | ||
3913 | * by cgroup_lock_live_cgroup() that it is not removed and we | ||
3914 | * are still under the same cgroup_mutex. So we can postpone | ||
3915 | * css_get(). | ||
3916 | */ | ||
3917 | if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) | ||
3918 | goto one_by_one; | ||
3919 | if (do_swap_account && res_counter_charge(&mem->memsw, | ||
3920 | PAGE_SIZE * count, &dummy)) { | ||
3921 | res_counter_uncharge(&mem->res, PAGE_SIZE * count); | ||
3922 | goto one_by_one; | ||
3923 | } | ||
3924 | mc.precharge += count; | ||
3925 | VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags)); | ||
3926 | WARN_ON_ONCE(count > INT_MAX); | ||
3927 | __css_get(&mem->css, (int)count); | ||
3928 | return ret; | ||
3929 | } | ||
3930 | one_by_one: | ||
3931 | /* fall back to one by one charge */ | ||
3932 | while (count--) { | ||
3933 | if (signal_pending(current)) { | ||
3934 | ret = -EINTR; | ||
3935 | break; | ||
3936 | } | ||
3937 | if (!batch_count--) { | ||
3938 | batch_count = PRECHARGE_COUNT_AT_ONCE; | ||
3939 | cond_resched(); | ||
3940 | } | ||
3941 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | ||
3942 | if (ret || !mem) | ||
3943 | /* mem_cgroup_clear_mc() will do uncharge later */ | ||
3944 | return -ENOMEM; | ||
3945 | mc.precharge++; | ||
3946 | } | ||
3947 | return ret; | ||
3948 | } | ||
3949 | |||
3950 | /** | ||
3951 | * is_target_pte_for_mc - check whether a pte is a valid target for move charge | ||
3952 | * @vma: the vma the pte to be checked belongs to | ||
3953 | * @addr: the address corresponding to the pte to be checked | ||
3954 | * @ptent: the pte to be checked | ||
3955 | * @target: pointer where the target page or swap entry will be stored (can be NULL) | ||
3956 | * | ||
3957 | * Returns | ||
3958 | * 0(MC_TARGET_NONE): if the pte is not a target for move charge. | ||
3959 | * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for | ||
3960 | * move charge. If @target is not NULL, the page is stored in target->page | ||
3961 | * with an extra refcount taken (callers should handle it). | ||
3962 | * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a | ||
3963 | * target for charge migration. If @target is not NULL, the entry is stored | ||
3964 | * in target->ent. | ||
3965 | * | ||
3966 | * Called with pte lock held. | ||
3967 | */ | ||
3968 | union mc_target { | ||
3969 | struct page *page; | ||
3970 | swp_entry_t ent; | ||
3971 | }; | ||
3972 | |||
3973 | enum mc_target_type { | ||
3974 | MC_TARGET_NONE, /* not used */ | ||
3975 | MC_TARGET_PAGE, | ||
3976 | MC_TARGET_SWAP, | ||
3977 | }; | ||
3978 | |||
3979 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | ||
3980 | unsigned long addr, pte_t ptent, union mc_target *target) | ||
3981 | { | ||
3982 | struct page *page = NULL; | ||
3983 | struct page_cgroup *pc; | ||
3984 | int ret = 0; | ||
3985 | swp_entry_t ent = { .val = 0 }; | ||
3986 | int usage_count = 0; | ||
3987 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
3988 | &mc.to->move_charge_at_immigrate); | ||
3989 | |||
3990 | if (!pte_present(ptent)) { | ||
3991 | /* TODO: handle swap of shmem/tmpfs */ | ||
3992 | if (pte_none(ptent) || pte_file(ptent)) | ||
3993 | return 0; | ||
3994 | else if (is_swap_pte(ptent)) { | ||
3995 | ent = pte_to_swp_entry(ptent); | ||
3996 | if (!move_anon || non_swap_entry(ent)) | ||
3997 | return 0; | ||
3998 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
3999 | } | ||
4000 | } else { | ||
4001 | page = vm_normal_page(vma, addr, ptent); | ||
4002 | if (!page || !page_mapped(page)) | ||
4003 | return 0; | ||
4004 | /* | ||
4005 | * TODO: We don't move charges of file (including shmem/tmpfs) | ||
4006 | * pages for now. | ||
4007 | */ | ||
4008 | if (!move_anon || !PageAnon(page)) | ||
4009 | return 0; | ||
4010 | if (!get_page_unless_zero(page)) | ||
4011 | return 0; | ||
4012 | usage_count = page_mapcount(page); | ||
4013 | } | ||
4014 | if (usage_count > 1) { | ||
4015 | /* | ||
4016 | * TODO: We don't move charges of shared (used by multiple | ||
4017 | * processes) pages for now. | ||
4018 | */ | ||
4019 | if (page) | ||
4020 | put_page(page); | ||
4021 | return 0; | ||
4022 | } | ||
4023 | if (page) { | ||
4024 | pc = lookup_page_cgroup(page); | ||
4025 | /* | ||
4026 | * Do only a loose check without the page_cgroup lock. | ||
4027 | * mem_cgroup_move_account() checks whether the pc is valid under | ||
4028 | * the lock. | ||
4029 | */ | ||
4030 | if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { | ||
4031 | ret = MC_TARGET_PAGE; | ||
4032 | if (target) | ||
4033 | target->page = page; | ||
4034 | } | ||
4035 | if (!ret || !target) | ||
4036 | put_page(page); | ||
4037 | } | ||
4038 | /* fall through to the swap entry check */ | ||
4039 | if (ent.val && do_swap_account && !ret && | ||
4040 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | ||
4041 | ret = MC_TARGET_SWAP; | ||
4042 | if (target) | ||
4043 | target->ent = ent; | ||
4044 | } | ||
4045 | return ret; | ||
4046 | } | ||
4047 | |||
4048 | static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, | ||
4049 | unsigned long addr, unsigned long end, | ||
4050 | struct mm_walk *walk) | ||
4051 | { | ||
4052 | struct vm_area_struct *vma = walk->private; | ||
4053 | pte_t *pte; | ||
4054 | spinlock_t *ptl; | ||
4055 | |||
4056 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4057 | for (; addr != end; pte++, addr += PAGE_SIZE) | ||
4058 | if (is_target_pte_for_mc(vma, addr, *pte, NULL)) | ||
4059 | mc.precharge++; /* increment precharge temporarily */ | ||
4060 | pte_unmap_unlock(pte - 1, ptl); | ||
4061 | cond_resched(); | ||
4062 | |||
4063 | return 0; | ||
4064 | } | ||
4065 | |||
4066 | static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | ||
4067 | { | ||
4068 | unsigned long precharge; | ||
4069 | struct vm_area_struct *vma; | ||
4070 | |||
4071 | down_read(&mm->mmap_sem); | ||
4072 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4073 | struct mm_walk mem_cgroup_count_precharge_walk = { | ||
4074 | .pmd_entry = mem_cgroup_count_precharge_pte_range, | ||
4075 | .mm = mm, | ||
4076 | .private = vma, | ||
4077 | }; | ||
4078 | if (is_vm_hugetlb_page(vma)) | ||
4079 | continue; | ||
4080 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4081 | if (vma->vm_flags & VM_SHARED) | ||
4082 | continue; | ||
4083 | walk_page_range(vma->vm_start, vma->vm_end, | ||
4084 | &mem_cgroup_count_precharge_walk); | ||
4085 | } | ||
4086 | up_read(&mm->mmap_sem); | ||
4087 | |||
4088 | precharge = mc.precharge; | ||
4089 | mc.precharge = 0; | ||
4090 | |||
4091 | return precharge; | ||
4092 | } | ||
4093 | |||
4094 | static int mem_cgroup_precharge_mc(struct mm_struct *mm) | ||
4095 | { | ||
4096 | return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); | ||
4097 | } | ||
4098 | |||
4099 | static void mem_cgroup_clear_mc(void) | ||
4100 | { | ||
4101 | /* we must uncharge all the leftover precharges from mc.to */ | ||
4102 | if (mc.precharge) { | ||
4103 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | ||
4104 | mc.precharge = 0; | ||
4105 | } | ||
4106 | /* | ||
4107 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | ||
4108 | * we must uncharge here. | ||
4109 | */ | ||
4110 | if (mc.moved_charge) { | ||
4111 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | ||
4112 | mc.moved_charge = 0; | ||
4113 | } | ||
4114 | /* we must fixup refcnts and charges */ | ||
4115 | if (mc.moved_swap) { | ||
4116 | WARN_ON_ONCE(mc.moved_swap > INT_MAX); | ||
4117 | /* uncharge swap account from the old cgroup */ | ||
4118 | if (!mem_cgroup_is_root(mc.from)) | ||
4119 | res_counter_uncharge(&mc.from->memsw, | ||
4120 | PAGE_SIZE * mc.moved_swap); | ||
4121 | __mem_cgroup_put(mc.from, mc.moved_swap); | ||
4122 | |||
4123 | if (!mem_cgroup_is_root(mc.to)) { | ||
4124 | /* | ||
4125 | * we charged both to->res and to->memsw, so we should | ||
4126 | * uncharge to->res. | ||
4127 | */ | ||
4128 | res_counter_uncharge(&mc.to->res, | ||
4129 | PAGE_SIZE * mc.moved_swap); | ||
4130 | VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags)); | ||
4131 | __css_put(&mc.to->css, mc.moved_swap); | ||
4132 | } | ||
4133 | /* we've already done mem_cgroup_get(mc.to) */ | ||
4134 | |||
4135 | mc.moved_swap = 0; | ||
4136 | } | ||
4137 | mc.from = NULL; | ||
4138 | mc.to = NULL; | ||
4139 | mc.moving_task = NULL; | ||
4140 | wake_up_all(&mc.waitq); | ||
4141 | } | ||
4142 | |||
4143 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
4144 | struct cgroup *cgroup, | ||
4145 | struct task_struct *p, | ||
4146 | bool threadgroup) | ||
4147 | { | ||
4148 | int ret = 0; | ||
4149 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); | ||
4150 | |||
4151 | if (mem->move_charge_at_immigrate) { | ||
4152 | struct mm_struct *mm; | ||
4153 | struct mem_cgroup *from = mem_cgroup_from_task(p); | ||
4154 | |||
4155 | VM_BUG_ON(from == mem); | ||
4156 | |||
4157 | mm = get_task_mm(p); | ||
4158 | if (!mm) | ||
4159 | return 0; | ||
4160 | /* We move charges only when we move an owner of the mm */ | ||
4161 | if (mm->owner == p) { | ||
4162 | VM_BUG_ON(mc.from); | ||
4163 | VM_BUG_ON(mc.to); | ||
4164 | VM_BUG_ON(mc.precharge); | ||
4165 | VM_BUG_ON(mc.moved_charge); | ||
4166 | VM_BUG_ON(mc.moved_swap); | ||
4167 | VM_BUG_ON(mc.moving_task); | ||
4168 | mc.from = from; | ||
4169 | mc.to = mem; | ||
4170 | mc.precharge = 0; | ||
4171 | mc.moved_charge = 0; | ||
4172 | mc.moved_swap = 0; | ||
4173 | mc.moving_task = current; | ||
4174 | |||
4175 | ret = mem_cgroup_precharge_mc(mm); | ||
4176 | if (ret) | ||
4177 | mem_cgroup_clear_mc(); | ||
4178 | } | ||
4179 | mmput(mm); | ||
4180 | } | ||
4181 | return ret; | ||
4182 | } | ||
4183 | |||
4184 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
4185 | struct cgroup *cgroup, | ||
4186 | struct task_struct *p, | ||
4187 | bool threadgroup) | ||
4188 | { | ||
4189 | mem_cgroup_clear_mc(); | ||
4190 | } | ||
4191 | |||
4192 | static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, | ||
4193 | unsigned long addr, unsigned long end, | ||
4194 | struct mm_walk *walk) | ||
4195 | { | ||
4196 | int ret = 0; | ||
4197 | struct vm_area_struct *vma = walk->private; | ||
4198 | pte_t *pte; | ||
4199 | spinlock_t *ptl; | ||
4200 | |||
4201 | retry: | ||
4202 | pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); | ||
4203 | for (; addr != end; addr += PAGE_SIZE) { | ||
4204 | pte_t ptent = *(pte++); | ||
4205 | union mc_target target; | ||
4206 | int type; | ||
4207 | struct page *page; | ||
4208 | struct page_cgroup *pc; | ||
4209 | swp_entry_t ent; | ||
4210 | |||
4211 | if (!mc.precharge) | ||
4212 | break; | ||
4213 | |||
4214 | type = is_target_pte_for_mc(vma, addr, ptent, &target); | ||
4215 | switch (type) { | ||
4216 | case MC_TARGET_PAGE: | ||
4217 | page = target.page; | ||
4218 | if (isolate_lru_page(page)) | ||
4219 | goto put; | ||
4220 | pc = lookup_page_cgroup(page); | ||
4221 | if (!mem_cgroup_move_account(pc, | ||
4222 | mc.from, mc.to, false)) { | ||
4223 | mc.precharge--; | ||
4224 | /* we uncharge from mc.from later. */ | ||
4225 | mc.moved_charge++; | ||
4226 | } | ||
4227 | putback_lru_page(page); | ||
4228 | put: /* is_target_pte_for_mc() gets the page */ | ||
4229 | put_page(page); | ||
4230 | break; | ||
4231 | case MC_TARGET_SWAP: | ||
4232 | ent = target.ent; | ||
4233 | if (!mem_cgroup_move_swap_account(ent, | ||
4234 | mc.from, mc.to, false)) { | ||
4235 | mc.precharge--; | ||
4236 | /* we fixup refcnts and charges later. */ | ||
4237 | mc.moved_swap++; | ||
4238 | } | ||
4239 | break; | ||
4240 | default: | ||
4241 | break; | ||
4242 | } | ||
4243 | } | ||
4244 | pte_unmap_unlock(pte - 1, ptl); | ||
4245 | cond_resched(); | ||
4246 | |||
4247 | if (addr != end) { | ||
4248 | /* | ||
4249 | * We have consumed all precharges we got in can_attach(). | ||
4250 | * We try to charge one by one, but don't do any additional | ||
4251 | * charges to mc.to if we have already failed to charge once in | ||
4252 | * the attach() phase. | ||
4253 | */ | ||
4254 | ret = mem_cgroup_do_precharge(1); | ||
4255 | if (!ret) | ||
4256 | goto retry; | ||
4257 | } | ||
4258 | |||
4259 | return ret; | ||
4260 | } | ||
4261 | |||
4262 | static void mem_cgroup_move_charge(struct mm_struct *mm) | ||
4263 | { | ||
4264 | struct vm_area_struct *vma; | ||
4265 | |||
4266 | lru_add_drain_all(); | ||
4267 | down_read(&mm->mmap_sem); | ||
4268 | for (vma = mm->mmap; vma; vma = vma->vm_next) { | ||
4269 | int ret; | ||
4270 | struct mm_walk mem_cgroup_move_charge_walk = { | ||
4271 | .pmd_entry = mem_cgroup_move_charge_pte_range, | ||
4272 | .mm = mm, | ||
4273 | .private = vma, | ||
4274 | }; | ||
4275 | if (is_vm_hugetlb_page(vma)) | ||
4276 | continue; | ||
4277 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4278 | if (vma->vm_flags & VM_SHARED) | ||
4279 | continue; | ||
4280 | ret = walk_page_range(vma->vm_start, vma->vm_end, | ||
4281 | &mem_cgroup_move_charge_walk); | ||
4282 | if (ret) | ||
4283 | /* | ||
4284 | * This means we have consumed all precharges and failed to | ||
4285 | * do an additional charge. Just abandon here. | ||
4286 | */ | ||
4287 | break; | ||
4288 | } | ||
4289 | up_read(&mm->mmap_sem); | ||
4290 | } | ||
4291 | |||
3167 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 4292 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
3168 | struct cgroup *cont, | 4293 | struct cgroup *cont, |
3169 | struct cgroup *old_cont, | 4294 | struct cgroup *old_cont, |
3170 | struct task_struct *p, | 4295 | struct task_struct *p, |
3171 | bool threadgroup) | 4296 | bool threadgroup) |
3172 | { | 4297 | { |
3173 | mutex_lock(&memcg_tasklist); | 4298 | struct mm_struct *mm; |
3174 | /* | 4299 | |
3175 | * FIXME: It's better to move charges of this process from old | 4300 | if (!mc.to) |
3176 | * memcg to new memcg. But it's just on TODO-List now. | 4301 | /* no need to move charge */ |
3177 | */ | 4302 | return; |
3178 | mutex_unlock(&memcg_tasklist); | 4303 | |
4304 | mm = get_task_mm(p); | ||
4305 | if (mm) { | ||
4306 | mem_cgroup_move_charge(mm); | ||
4307 | mmput(mm); | ||
4308 | } | ||
4309 | mem_cgroup_clear_mc(); | ||
4310 | } | ||
4311 | #else /* !CONFIG_MMU */ | ||
4312 | static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | ||
4313 | struct cgroup *cgroup, | ||
4314 | struct task_struct *p, | ||
4315 | bool threadgroup) | ||
4316 | { | ||
4317 | return 0; | ||
3179 | } | 4318 | } |
4319 | static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, | ||
4320 | struct cgroup *cgroup, | ||
4321 | struct task_struct *p, | ||
4322 | bool threadgroup) | ||
4323 | { | ||
4324 | } | ||
4325 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | ||
4326 | struct cgroup *cont, | ||
4327 | struct cgroup *old_cont, | ||
4328 | struct task_struct *p, | ||
4329 | bool threadgroup) | ||
4330 | { | ||
4331 | } | ||
4332 | #endif | ||
3180 | 4333 | ||
3181 | struct cgroup_subsys mem_cgroup_subsys = { | 4334 | struct cgroup_subsys mem_cgroup_subsys = { |
3182 | .name = "memory", | 4335 | .name = "memory", |
@@ -3185,6 +4338,8 @@ struct cgroup_subsys mem_cgroup_subsys = { | |||
3185 | .pre_destroy = mem_cgroup_pre_destroy, | 4338 | .pre_destroy = mem_cgroup_pre_destroy, |
3186 | .destroy = mem_cgroup_destroy, | 4339 | .destroy = mem_cgroup_destroy, |
3187 | .populate = mem_cgroup_populate, | 4340 | .populate = mem_cgroup_populate, |
4341 | .can_attach = mem_cgroup_can_attach, | ||
4342 | .cancel_attach = mem_cgroup_cancel_attach, | ||
3188 | .attach = mem_cgroup_move_task, | 4343 | .attach = mem_cgroup_move_task, |
3189 | .early_init = 0, | 4344 | .early_init = 0, |
3190 | .use_id = 1, | 4345 | .use_id = 1, |