author     Balbir Singh <balbir@linux.vnet.ibm.com>          2009-09-23 18:56:37 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2009-09-24 10:20:59 -0400
commit     f64c3f54940d6929a2b6dcffaab942bd62be2e66 (patch)
tree       7b3587700b08639970580be6c87f36df80ca8c74
parent     296c81d89f4f14269f7346f81442910158c0a83a (diff)
memory controller: soft limit organize cgroups
Organize cgroups over their soft limit in an RB-Tree
Introduce an RB-Tree for storing memory cgroups that are over their soft
limit. The overall goal is to
1. Add a memory cgroup to the RB-Tree when its soft limit is exceeded.
We are careful about updates: they take place only after a particular
time interval has passed.
2. Remove the node from the RB-Tree when the usage goes back below the
soft limit.
The next set of patches will exploit the RB-Tree to get the group that is
over its soft limit by the largest amount and reclaim from it, when we
face memory contention.
[hugh.dickins@tiscali.co.uk: CONFIG_CGROUP_MEM_RES_CTLR=y CONFIG_PREEMPT=y fails to boot]
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--   include/linux/res_counter.h |   6
-rw-r--r--   kernel/res_counter.c        |  18
-rw-r--r--   mm/memcontrol.c             | 300
3 files changed, 277 insertions, 47 deletions
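
To make the ordering concrete before reading the patch, here is a small userspace sketch of the idea from the description above: groups are kept sorted by how far they exceed their soft limit, so the worst offender can be picked first at reclaim time. This is only an illustration, not the patch's code; it uses a plain unbalanced binary search tree and a made-up "struct memcg" in place of the kernel's rb-tree and struct mem_cgroup_per_zone.

        /*
         * Illustrative userspace sketch of soft-limit ordering.  A plain
         * (unbalanced) BST stands in for the kernel rb-tree; "struct memcg"
         * is a hypothetical stand-in for struct mem_cgroup_per_zone.
         */
        #include <stdio.h>

        struct memcg {
                const char *name;
                unsigned long long usage;
                unsigned long long soft_limit;
                struct memcg *left, *right;     /* tree links, keyed by excess */
        };

        static unsigned long long excess(const struct memcg *m)
        {
                return m->usage > m->soft_limit ? m->usage - m->soft_limit : 0;
        }

        /* Insert @m into the tree at *root; ties go to the right, as in the patch. */
        static void tree_insert(struct memcg **root, struct memcg *m)
        {
                while (*root) {
                        if (excess(m) < excess(*root))
                                root = &(*root)->left;
                        else
                                root = &(*root)->right;
                }
                m->left = m->right = NULL;
                *root = m;
        }

        /* The reclaim candidate is the group with the largest excess: the rightmost node. */
        static struct memcg *tree_largest(struct memcg *root)
        {
                if (!root)
                        return NULL;
                while (root->right)
                        root = root->right;
                return root;
        }

        int main(void)
        {
                struct memcg a = { "a", 300, 100 }, b = { "b", 150, 100 }, c = { "c", 90, 100 };
                struct memcg *root = NULL;

                /* Only groups over their soft limit are put on the tree. */
                if (excess(&a)) tree_insert(&root, &a);
                if (excess(&b)) tree_insert(&root, &b);
                if (excess(&c)) tree_insert(&root, &c);

                printf("reclaim first from: %s (excess %llu)\n",
                       tree_largest(root)->name, excess(tree_largest(root)));
                return 0;
        }

The patch itself keeps one such tree per node and per zone (see mem_cgroup_insert_exceeded() in the mm/memcontrol.c hunks below) and rate-limits tree updates with the MEM_CGROUP_STAT_EVENTS counter instead of updating on every charge.
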
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index fcb9884df618..731af71cddc9 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -114,7 +114,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
 int __must_check res_counter_charge_locked(struct res_counter *counter,
                unsigned long val);
 int __must_check res_counter_charge(struct res_counter *counter,
-               unsigned long val, struct res_counter **limit_fail_at);
+               unsigned long val, struct res_counter **limit_fail_at,
+               struct res_counter **soft_limit_at);
 
 /*
  * uncharge - tell that some portion of the resource is released
@@ -127,7 +128,8 @@ int __must_check res_counter_charge(struct res_counter *counter,
  */
 
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
-void res_counter_uncharge(struct res_counter *counter, unsigned long val);
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+               bool *was_soft_limit_excess);
 
 static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
 {
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..88faec23e833 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -37,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 }
 
 int res_counter_charge(struct res_counter *counter, unsigned long val,
-                       struct res_counter **limit_fail_at)
+                       struct res_counter **limit_fail_at,
+                       struct res_counter **soft_limit_fail_at)
 {
        int ret;
        unsigned long flags;
        struct res_counter *c, *u;
 
        *limit_fail_at = NULL;
+       if (soft_limit_fail_at)
+               *soft_limit_fail_at = NULL;
        local_irq_save(flags);
        for (c = counter; c != NULL; c = c->parent) {
                spin_lock(&c->lock);
                ret = res_counter_charge_locked(c, val);
+               /*
+                * With soft limits, we return the highest ancestor
+                * that exceeds its soft limit
+                */
+               if (soft_limit_fail_at &&
+                       !res_counter_soft_limit_check_locked(c))
+                       *soft_limit_fail_at = c;
                spin_unlock(&c->lock);
                if (ret < 0) {
                        *limit_fail_at = c;
@@ -75,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
        counter->usage -= val;
 }
 
-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+                               bool *was_soft_limit_excess)
 {
        unsigned long flags;
        struct res_counter *c;
@@ -83,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
        local_irq_save(flags);
        for (c = counter; c != NULL; c = c->parent) {
                spin_lock(&c->lock);
+               if (was_soft_limit_excess)
+                       *was_soft_limit_excess =
+                               !res_counter_soft_limit_check_locked(c);
                res_counter_uncharge_locked(c, val);
                spin_unlock(&c->lock);
        }
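
The comment in the charge loop above says the walk reports the highest ancestor that exceeds its soft limit. A tiny userspace model (hypothetical struct counter, no locking or hard-limit handling, not the kernel code) shows why: every over-limit counter on the way up overwrites the out-parameter, so the last write wins, and that is the ancestor closest to the root.

        /*
         * Simplified model of the hierarchical charge walk above.  Each
         * over-soft-limit counter on the way to the root overwrites the
         * out-parameter, so the caller ends up with the topmost offender.
         */
        #include <stdio.h>

        struct counter {
                const char *name;
                unsigned long usage, soft_limit;
                struct counter *parent;
        };

        static void charge(struct counter *c, unsigned long val,
                           struct counter **soft_fail)
        {
                *soft_fail = NULL;
                for (; c; c = c->parent) {
                        c->usage += val;
                        if (c->usage > c->soft_limit)
                                *soft_fail = c; /* overwritten on the way up */
                }
        }

        int main(void)
        {
                struct counter root = { "root", 900, 800, NULL  };
                struct counter mid  = { "mid",  450, 400, &root };
                struct counter leaf = { "leaf",  10, 100, &mid  };
                struct counter *soft_fail;

                charge(&leaf, 5, &soft_fail);
                /* Both mid and root are over; the walk reports root. */
                printf("over soft limit: %s\n", soft_fail ? soft_fail->name : "none");
                return 0;
        }
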
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4ad3e6be045d..0ed325943cd1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/limits.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -54,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #endif
 
 static DEFINE_MUTEX(memcg_tasklist);   /* can be hold under cgroup_mutex */
+#define SOFTLIMIT_EVENTS_THRESH (1000)
 
 /*
  * Statistics for memory cgroup.
@@ -67,6 +69,7 @@ enum mem_cgroup_stat_index {
        MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
        MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
        MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
+       MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */
 
        MEM_CGROUP_STAT_NSTATS,
 };
@@ -79,6 +82,20 @@ struct mem_cgroup_stat {
        struct mem_cgroup_stat_cpu cpustat[0];
 };
 
+static inline void
+__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
+                               enum mem_cgroup_stat_index idx)
+{
+       stat->count[idx] = 0;
+}
+
+static inline s64
+__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
+                               enum mem_cgroup_stat_index idx)
+{
+       return stat->count[idx];
+}
+
 /*
  * For accounting under irq disable, no need for increment preempt count.
  */
@@ -118,6 +135,10 @@ struct mem_cgroup_per_zone {
        unsigned long           count[NR_LRU_LISTS];
 
        struct zone_reclaim_stat reclaim_stat;
+       struct rb_node          tree_node;      /* RB tree node */
+       unsigned long long      usage_in_excess;/* Set to the value by which */
+                                               /* the soft limit is exceeded*/
+       bool                    on_tree;
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)      ((mz)->count[(idx)])
@@ -131,6 +152,26 @@ struct mem_cgroup_lru_info {
 };
 
 /*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+       struct rb_root rb_root;
+       spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+       struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+       struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
+/*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -215,6 +256,150 @@ static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
 
+static struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+       return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+static struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+       struct mem_cgroup *mem = pc->mem_cgroup;
+       int nid = page_cgroup_nid(pc);
+       int zid = page_cgroup_zid(pc);
+
+       if (!mem)
+               return NULL;
+
+       return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+       int nid = page_to_nid(page);
+       int zid = page_zonenum(page);
+
+       return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       struct rb_node **p = &mctz->rb_root.rb_node;
+       struct rb_node *parent = NULL;
+       struct mem_cgroup_per_zone *mz_node;
+
+       if (mz->on_tree)
+               return;
+
+       mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+       spin_lock(&mctz->lock);
+       while (*p) {
+               parent = *p;
+               mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+                                       tree_node);
+               if (mz->usage_in_excess < mz_node->usage_in_excess)
+                       p = &(*p)->rb_left;
+               /*
+                * We can't avoid mem cgroups that are over their soft
+                * limit by the same amount
+                */
+               else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+                       p = &(*p)->rb_right;
+       }
+       rb_link_node(&mz->tree_node, parent, p);
+       rb_insert_color(&mz->tree_node, &mctz->rb_root);
+       mz->on_tree = true;
+       spin_unlock(&mctz->lock);
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+                               struct mem_cgroup_per_zone *mz,
+                               struct mem_cgroup_tree_per_zone *mctz)
+{
+       spin_lock(&mctz->lock);
+       rb_erase(&mz->tree_node, &mctz->rb_root);
+       mz->on_tree = false;
+       spin_unlock(&mctz->lock);
+}
+
+static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
+{
+       bool ret = false;
+       int cpu;
+       s64 val;
+       struct mem_cgroup_stat_cpu *cpustat;
+
+       cpu = get_cpu();
+       cpustat = &mem->stat.cpustat[cpu];
+       val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
+       if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
+               __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
+               ret = true;
+       }
+       put_cpu();
+       return ret;
+}
+
+static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+{
+       unsigned long long prev_usage_in_excess, new_usage_in_excess;
+       bool updated_tree = false;
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_tree_per_zone *mctz;
+
+       mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+       mctz = soft_limit_tree_from_page(page);
+
+       /*
+        * We do updates in lazy mode, mem's are removed
+        * lazily from the per-zone, per-node rb tree
+        */
+       prev_usage_in_excess = mz->usage_in_excess;
+
+       new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+       if (prev_usage_in_excess) {
+               mem_cgroup_remove_exceeded(mem, mz, mctz);
+               updated_tree = true;
+       }
+       if (!new_usage_in_excess)
+               goto done;
+       mem_cgroup_insert_exceeded(mem, mz, mctz);
+
+done:
+       if (updated_tree) {
+               spin_lock(&mctz->lock);
+               mz->usage_in_excess = new_usage_in_excess;
+               spin_unlock(&mctz->lock);
+       }
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+{
+       int node, zone;
+       struct mem_cgroup_per_zone *mz;
+       struct mem_cgroup_tree_per_zone *mctz;
+
+       for_each_node_state(node, N_POSSIBLE) {
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       mz = mem_cgroup_zoneinfo(mem, node, zone);
+                       mctz = soft_limit_tree_node_zone(node, zone);
+                       mem_cgroup_remove_exceeded(mem, mz, mctz);
+               }
+       }
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
                                         struct page_cgroup *pc,
                                         bool charge)
@@ -236,28 +421,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
        else
                __mem_cgroup_stat_add_safe(cpustat,
                                MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
        put_cpu();
 }
 
-static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
-{
-       return &mem->info.nodeinfo[nid]->zoneinfo[zid];
-}
-
-static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
-{
-       struct mem_cgroup *mem = pc->mem_cgroup;
-       int nid = page_cgroup_nid(pc);
-       int zid = page_cgroup_zid(pc);
-
-       if (!mem)
-               return NULL;
-
-       return mem_cgroup_zoneinfo(mem, nid, zid);
-}
-
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
                                        enum lru_list idx)
 {
@@ -972,11 +1139,11 @@ done:
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
                        gfp_t gfp_mask, struct mem_cgroup **memcg,
-                       bool oom)
+                       bool oom, struct page *page)
 {
-       struct mem_cgroup *mem, *mem_over_limit;
+       struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-       struct res_counter *fail_res;
+       struct res_counter *fail_res, *soft_fail_res = NULL;
 
        if (unlikely(test_thread_flag(TIF_MEMDIE))) {
                /* Don't account this! */
@@ -1006,16 +1173,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                int ret;
                bool noswap = false;
 
-               ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+               ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
+                                               &soft_fail_res);
                if (likely(!ret)) {
                        if (!do_swap_account)
                                break;
                        ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-                                                       &fail_res);
+                                                       &fail_res, NULL);
                        if (likely(!ret))
                                break;
                        /* mem+swap counter fails */
-                       res_counter_uncharge(&mem->res, PAGE_SIZE);
+                       res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
                        noswap = true;
                        mem_over_limit = mem_cgroup_from_res_counter(fail_res,
                                                                        memsw);
@@ -1053,13 +1221,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                        goto nomem;
                }
        }
+       /*
+        * Insert just the ancestor, we should trickle down to the correct
+        * cgroup for reclaim, since the other nodes will be below their
+        * soft limit
+        */
+       if (soft_fail_res) {
+               mem_over_soft_limit =
+                       mem_cgroup_from_res_counter(soft_fail_res, res);
+               if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
+                       mem_cgroup_update_tree(mem_over_soft_limit, page);
+       }
        return 0;
 nomem:
        css_put(&mem->css);
        return -ENOMEM;
 }
 
-
 /*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1126,9 +1304,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
        lock_page_cgroup(pc);
        if (unlikely(PageCgroupUsed(pc))) {
                unlock_page_cgroup(pc);
-               res_counter_uncharge(&mem->res, PAGE_SIZE);
+               res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
                if (do_swap_account)
-                       res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+                       res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
                css_put(&mem->css);
                return;
        }
@@ -1205,7 +1383,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
        if (pc->mem_cgroup != from)
                goto out;
 
-       res_counter_uncharge(&from->res, PAGE_SIZE);
+       res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
        mem_cgroup_charge_statistics(from, pc, false);
 
        page = pc->page;
@@ -1225,7 +1403,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
        }
 
        if (do_swap_account)
-               res_counter_uncharge(&from->memsw, PAGE_SIZE);
+               res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
        css_put(&from->css);
 
        css_get(&to->css);
@@ -1265,7 +1443,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
        parent = mem_cgroup_from_cont(pcg);
 
 
-       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
        if (ret || !parent)
                return ret;
 
@@ -1295,9 +1473,9 @@ uncharge:
        /* drop extra refcnt by try_charge() */
        css_put(&parent->css);
        /* uncharge if move fails */
-       res_counter_uncharge(&parent->res, PAGE_SIZE);
+       res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
        if (do_swap_account)
-               res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+               res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
        return ret;
 }
 
@@ -1322,7 +1500,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
        prefetchw(pc);
 
        mem = memcg;
-       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
        if (ret || !mem)
                return ret;
 
@@ -1441,14 +1619,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
        if (!mem)
                goto charge_cur_mm;
        *ptr = mem;
-       ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+       ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
        /* drop extra refcnt from tryget */
        css_put(&mem->css);
        return ret;
 charge_cur_mm:
        if (unlikely(!mm))
                mm = &init_mm;
-       return __mem_cgroup_try_charge(mm, mask, ptr, true);
+       return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
 }
 
 static void
@@ -1486,7 +1664,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
                 * This recorded memcg can be obsolete one. So, avoid
                 * calling css_tryget
                 */
-               res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+               res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
                mem_cgroup_put(memcg);
        }
        rcu_read_unlock();
@@ -1511,9 +1689,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
                return;
        if (!mem)
                return;
-       res_counter_uncharge(&mem->res, PAGE_SIZE);
+       res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
        if (do_swap_account)
-               res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+               res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
        css_put(&mem->css);
 }
 
@@ -1527,6 +1705,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        struct page_cgroup *pc;
        struct mem_cgroup *mem = NULL;
        struct mem_cgroup_per_zone *mz;
+       bool soft_limit_excess = false;
 
        if (mem_cgroup_disabled())
                return NULL;
@@ -1565,9 +1744,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                break;
        }
 
-       res_counter_uncharge(&mem->res, PAGE_SIZE);
+       res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
        if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-               res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+               res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
        mem_cgroup_charge_statistics(mem, pc, false);
 
        ClearPageCgroupUsed(pc);
@@ -1581,6 +1760,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        mz = page_cgroup_zoneinfo(pc);
        unlock_page_cgroup(pc);
 
+       if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+               mem_cgroup_update_tree(mem, page);
        /* at swapout, this memcg will be accessed to record to swap */
        if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
                css_put(&mem->css);
@@ -1656,7 +1837,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
                 * We uncharge this because swap is freed.
                 * This memcg can be obsolete one. We avoid calling css_tryget
                 */
-               res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+               res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
                mem_cgroup_put(memcg);
        }
        rcu_read_unlock();
@@ -1685,7 +1866,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
        unlock_page_cgroup(pc);
 
        if (mem) {
-               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+                                               page);
                css_put(&mem->css);
        }
        *ptr = mem;
@@ -2194,6 +2376,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
                res_counter_reset_failcnt(&mem->memsw);
                break;
        }
+
        return 0;
 }
 
@@ -2489,6 +2672,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
                mz = &pn->zoneinfo[zone];
                for_each_lru(l)
                        INIT_LIST_HEAD(&mz->lists[l]);
+               mz->usage_in_excess = 0;
        }
        return 0;
 }
@@ -2534,6 +2718,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
 {
        int node;
 
+       mem_cgroup_remove_from_trees(mem);
        free_css_id(&mem_cgroup_subsys, &mem->css);
 
        for_each_node_state(node, N_POSSIBLE)
@@ -2582,6 +2767,31 @@ static void __init enable_swap_cgroup(void)
 }
 #endif
 
+static int mem_cgroup_soft_limit_tree_init(void)
+{
+       struct mem_cgroup_tree_per_node *rtpn;
+       struct mem_cgroup_tree_per_zone *rtpz;
+       int tmp, node, zone;
+
+       for_each_node_state(node, N_POSSIBLE) {
+               tmp = node;
+               if (!node_state(node, N_NORMAL_MEMORY))
+                       tmp = -1;
+               rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+               if (!rtpn)
+                       return 1;
+
+               soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       rtpz = &rtpn->rb_tree_per_zone[zone];
+                       rtpz->rb_root = RB_ROOT;
+                       spin_lock_init(&rtpz->lock);
+               }
+       }
+       return 0;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
@@ -2596,11 +2806,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
        for_each_node_state(node, N_POSSIBLE)
                if (alloc_mem_cgroup_per_zone_info(mem, node))
                        goto free_out;
+
        /* root ? */
        if (cont->parent == NULL) {
                enable_swap_cgroup();
                parent = NULL;
                root_mem_cgroup = mem;
+               if (mem_cgroup_soft_limit_tree_init())
+                       goto free_out;
+
        } else {
                parent = mem_cgroup_from_cont(cont->parent);
                mem->use_hierarchy = parent->use_hierarchy;