Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 689 |
1 files changed, 494 insertions, 195 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c8569bc298ff..c6ece0a57595 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -149,16 +149,35 @@ struct mem_cgroup_threshold { | |||
149 | u64 threshold; | 149 | u64 threshold; |
150 | }; | 150 | }; |
151 | 151 | ||
152 | /* For threshold */ | ||
152 | struct mem_cgroup_threshold_ary { | 153 | struct mem_cgroup_threshold_ary { |
153 | /* An array index points to threshold just below usage. */ | 154 | /* An array index points to threshold just below usage. */ |
154 | atomic_t current_threshold; | 155 | int current_threshold; |
155 | /* Size of entries[] */ | 156 | /* Size of entries[] */ |
156 | unsigned int size; | 157 | unsigned int size; |
157 | /* Array of thresholds */ | 158 | /* Array of thresholds */ |
158 | struct mem_cgroup_threshold entries[0]; | 159 | struct mem_cgroup_threshold entries[0]; |
159 | }; | 160 | }; |
160 | 161 | ||
162 | struct mem_cgroup_thresholds { | ||
163 | /* Primary thresholds array */ | ||
164 | struct mem_cgroup_threshold_ary *primary; | ||
165 | /* | ||
166 | * Spare threshold array. | ||
167 | * This is needed to make mem_cgroup_unregister_event() "never fail". | ||
168 | * It must be able to store at least primary->size - 1 entries. | ||
169 | */ | ||
170 | struct mem_cgroup_threshold_ary *spare; | ||
171 | }; | ||
172 | |||
173 | /* for OOM */ | ||
174 | struct mem_cgroup_eventfd_list { | ||
175 | struct list_head list; | ||
176 | struct eventfd_ctx *eventfd; | ||
177 | }; | ||
178 | |||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | 179 | static void mem_cgroup_threshold(struct mem_cgroup *mem); |
180 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); | ||
162 | 181 | ||
163 | /* | 182 | /* |
164 | * The memory controller data structure. The memory controller controls both | 183 | * The memory controller data structure. The memory controller controls both |
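The primary/spare pair introduced above is a double-buffering trick: the register path allocates a new array (and may fail), while the unregister path only copies the surviving entries into the already-allocated spare, which is why mem_cgroup_unregister_event() can "never fail". A minimal user-space sketch of the same idea, with hypothetical thr_array/thr_pair stand-ins and plain pointer assignment where the kernel uses rcu_assign_pointer()/synchronize_rcu():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical stand-ins for mem_cgroup_threshold_ary / mem_cgroup_thresholds. */
struct thr_array {
        unsigned int size;
        int entries[];                  /* flexible array, like entries[0] above */
};

struct thr_pair {
        struct thr_array *primary;
        struct thr_array *spare;        /* holds >= primary->size - 1 entries */
};

/* Register path: allocates a bigger array, so it is allowed to fail. */
static int add_entry(struct thr_pair *p, int val)
{
        unsigned int n = p->primary ? p->primary->size + 1 : 1;
        struct thr_array *new = malloc(sizeof(*new) + n * sizeof(int));

        if (!new)
                return -1;
        new->size = n;
        if (p->primary)
                memcpy(new->entries, p->primary->entries, (n - 1) * sizeof(int));
        new->entries[n - 1] = val;

        free(p->spare);                 /* old spare is no longer needed */
        p->spare = p->primary;          /* old primary becomes the new spare */
        p->primary = new;               /* kernel: rcu_assign_pointer + synchronize_rcu */
        return 0;
}

/* Unregister path: copies into the preallocated spare and never allocates. */
static void del_entry(struct thr_pair *p, int val)
{
        struct thr_array *new = p->spare;
        unsigned int i, j = 0;

        /* assumes val really is registered, as the kernel path does */
        for (i = 0; i < p->primary->size; i++)
                if (p->primary->entries[i] != val)
                        new->entries[j++] = p->primary->entries[i];

        if (j) {
                new->size = j;
        } else {
                free(new);              /* nothing left to publish */
                new = NULL;
        }
        p->spare = p->primary;
        p->primary = new;
}

int main(void)
{
        struct thr_pair p = { NULL, NULL };

        add_entry(&p, 10);
        add_entry(&p, 20);
        del_entry(&p, 10);              /* cannot fail: the spare already exists */
        printf("%u entries left\n", p.primary ? p.primary->size : 0);
        return 0;
}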
@@ -207,6 +226,8 @@ struct mem_cgroup { | |||
207 | atomic_t refcnt; | 226 | atomic_t refcnt; |
208 | 227 | ||
209 | unsigned int swappiness; | 228 | unsigned int swappiness; |
229 | /* OOM-Killer disable */ | ||
230 | int oom_kill_disable; | ||
210 | 231 | ||
211 | /* set when res.limit == memsw.limit */ | 232 | /* set when res.limit == memsw.limit */ |
212 | bool memsw_is_minimum; | 233 | bool memsw_is_minimum; |
@@ -215,17 +236,19 @@ struct mem_cgroup { | |||
215 | struct mutex thresholds_lock; | 236 | struct mutex thresholds_lock; |
216 | 237 | ||
217 | /* thresholds for memory usage. RCU-protected */ | 238 | /* thresholds for memory usage. RCU-protected */ |
218 | struct mem_cgroup_threshold_ary *thresholds; | 239 | struct mem_cgroup_thresholds thresholds; |
219 | 240 | ||
220 | /* thresholds for mem+swap usage. RCU-protected */ | 241 | /* thresholds for mem+swap usage. RCU-protected */ |
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | 242 | struct mem_cgroup_thresholds memsw_thresholds; |
243 | |||
244 | /* For oom notifier event fd */ | ||
245 | struct list_head oom_notify; | ||
222 | 246 | ||
223 | /* | 247 | /* |
224 | * Should we move charges of a task when a task is moved into this | 248 | * Should we move charges of a task when a task is moved into this |
225 | * mem_cgroup ? And what type of charges should we move ? | 249 | * mem_cgroup ? And what type of charges should we move ? |
226 | */ | 250 | */ |
227 | unsigned long move_charge_at_immigrate; | 251 | unsigned long move_charge_at_immigrate; |
228 | |||
229 | /* | 252 | /* |
230 | * percpu counter. | 253 | * percpu counter. |
231 | */ | 254 | */ |
@@ -239,6 +262,7 @@ struct mem_cgroup { | |||
239 | */ | 262 | */ |
240 | enum move_type { | 263 | enum move_type { |
241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 264 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ |
265 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ | ||
242 | NR_MOVE_TYPE, | 266 | NR_MOVE_TYPE, |
243 | }; | 267 | }; |
244 | 268 | ||
@@ -255,6 +279,18 @@ static struct move_charge_struct { | |||
255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 279 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
256 | }; | 280 | }; |
257 | 281 | ||
282 | static bool move_anon(void) | ||
283 | { | ||
284 | return test_bit(MOVE_CHARGE_TYPE_ANON, | ||
285 | &mc.to->move_charge_at_immigrate); | ||
286 | } | ||
287 | |||
288 | static bool move_file(void) | ||
289 | { | ||
290 | return test_bit(MOVE_CHARGE_TYPE_FILE, | ||
291 | &mc.to->move_charge_at_immigrate); | ||
292 | } | ||
293 | |||
258 | /* | 294 | /* |
259 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 295 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
260 | * limit reclaim to prevent infinite loops, if they ever occur. | 296 | * limit reclaim to prevent infinite loops, if they ever occur. |
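move_anon() and move_file() test bits 0 and 1 of move_charge_at_immigrate, so the value written to the cgroup file is a bitmask: 1 moves anonymous pages (and their swap), 2 moves file pages (including tmpfs), 3 moves both. A small illustrative writer, assuming a cgroup-v1 style mount point and group name:

#include <stdio.h>

int main(void)
{
        /* bit 0 = anon (+ swap), bit 1 = file (incl. tmpfs); 3 enables both */
        const char *path =
                "/sys/fs/cgroup/memory/mygroup/memory.move_charge_at_immigrate";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        fprintf(f, "3\n");
        fclose(f);
        return 0;
}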
@@ -282,9 +318,12 @@ enum charge_type { | |||
282 | /* for encoding cft->private value on file */ | 318 | /* for encoding cft->private value on file */ |
283 | #define _MEM (0) | 319 | #define _MEM (0) |
284 | #define _MEMSWAP (1) | 320 | #define _MEMSWAP (1) |
321 | #define _OOM_TYPE (2) | ||
285 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 322 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
286 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 323 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
287 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 324 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
325 | /* Used for OOM notifier */ | ||
326 | #define OOM_CONTROL (0) | ||
288 | 327 | ||
289 | /* | 328 | /* |
290 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | 329 | * Reclaim flags for mem_cgroup_hierarchical_reclaim |
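For reference, cft->private packs the counter type into the upper 16 bits and the attribute (here OOM_CONTROL) into the lower 16 bits. A tiny standalone check of the macros above:

#include <assert.h>
#include <stdio.h>

#define _MEM                    (0)
#define _MEMSWAP                (1)
#define _OOM_TYPE               (2)
#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
#define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)       ((val) & 0xffff)
#define OOM_CONTROL             (0)

int main(void)
{
        int priv = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL);

        assert(MEMFILE_TYPE(priv) == _OOM_TYPE);        /* 2 */
        assert(MEMFILE_ATTR(priv) == OOM_CONTROL);      /* 0 */
        printf("private=0x%x type=%d attr=%d\n",
               priv, MEMFILE_TYPE(priv), MEMFILE_ATTR(priv));
        return 0;
}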
@@ -1293,14 +1332,62 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) | |||
1293 | static DEFINE_MUTEX(memcg_oom_mutex); | 1332 | static DEFINE_MUTEX(memcg_oom_mutex); |
1294 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1333 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
1295 | 1334 | ||
1335 | struct oom_wait_info { | ||
1336 | struct mem_cgroup *mem; | ||
1337 | wait_queue_t wait; | ||
1338 | }; | ||
1339 | |||
1340 | static int memcg_oom_wake_function(wait_queue_t *wait, | ||
1341 | unsigned mode, int sync, void *arg) | ||
1342 | { | ||
1343 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; | ||
1344 | struct oom_wait_info *oom_wait_info; | ||
1345 | |||
1346 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | ||
1347 | |||
1348 | if (oom_wait_info->mem == wake_mem) | ||
1349 | goto wakeup; | ||
1350 | /* if no hierarchy, no match */ | ||
1351 | if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) | ||
1352 | return 0; | ||
1353 | /* | ||
1354 | * Both of oom_wait_info->mem and wake_mem are stable under us. | ||
1355 | * Then we can use css_is_ancestor without taking care of RCU. | ||
1356 | */ | ||
1357 | if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && | ||
1358 | !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) | ||
1359 | return 0; | ||
1360 | |||
1361 | wakeup: | ||
1362 | return autoremove_wake_function(wait, mode, sync, arg); | ||
1363 | } | ||
1364 | |||
1365 | static void memcg_wakeup_oom(struct mem_cgroup *mem) | ||
1366 | { | ||
1367 | /* for filtering, pass "mem" as argument. */ | ||
1368 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); | ||
1369 | } | ||
1370 | |||
1371 | static void memcg_oom_recover(struct mem_cgroup *mem) | ||
1372 | { | ||
1373 | if (mem->oom_kill_disable && atomic_read(&mem->oom_lock)) | ||
1374 | memcg_wakeup_oom(mem); | ||
1375 | } | ||
1376 | |||
1296 | /* | 1377 | /* |
1297 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1378 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1298 | */ | 1379 | */ |
1299 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | 1380 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) |
1300 | { | 1381 | { |
1301 | DEFINE_WAIT(wait); | 1382 | struct oom_wait_info owait; |
1302 | bool locked; | 1383 | bool locked, need_to_kill; |
1303 | 1384 | ||
1385 | owait.mem = mem; | ||
1386 | owait.wait.flags = 0; | ||
1387 | owait.wait.func = memcg_oom_wake_function; | ||
1388 | owait.wait.private = current; | ||
1389 | INIT_LIST_HEAD(&owait.wait.task_list); | ||
1390 | need_to_kill = true; | ||
1304 | /* At first, try to OOM lock hierarchy under mem.*/ | 1391 | /* At first, try to OOM lock hierarchy under mem.*/ |
1305 | mutex_lock(&memcg_oom_mutex); | 1392 | mutex_lock(&memcg_oom_mutex); |
1306 | locked = mem_cgroup_oom_lock(mem); | 1393 | locked = mem_cgroup_oom_lock(mem); |
@@ -1309,32 +1396,23 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1309 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 1396 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL |
1310 | * under OOM is always welcomed, use TASK_KILLABLE here. | 1397 | * under OOM is always welcomed, use TASK_KILLABLE here. |
1311 | */ | 1398 | */ |
1312 | if (!locked) | 1399 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
1313 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | 1400 | if (!locked || mem->oom_kill_disable) |
1401 | need_to_kill = false; | ||
1402 | if (locked) | ||
1403 | mem_cgroup_oom_notify(mem); | ||
1314 | mutex_unlock(&memcg_oom_mutex); | 1404 | mutex_unlock(&memcg_oom_mutex); |
1315 | 1405 | ||
1316 | if (locked) | 1406 | if (need_to_kill) { |
1407 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
1317 | mem_cgroup_out_of_memory(mem, mask); | 1408 | mem_cgroup_out_of_memory(mem, mask); |
1318 | else { | 1409 | } else { |
1319 | schedule(); | 1410 | schedule(); |
1320 | finish_wait(&memcg_oom_waitq, &wait); | 1411 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1321 | } | 1412 | } |
1322 | mutex_lock(&memcg_oom_mutex); | 1413 | mutex_lock(&memcg_oom_mutex); |
1323 | mem_cgroup_oom_unlock(mem); | 1414 | mem_cgroup_oom_unlock(mem); |
1324 | /* | 1415 | memcg_wakeup_oom(mem); |
1325 | * Here, we use global waitq .....more fine grained waitq ? | ||
1326 | * Assume following hierarchy. | ||
1327 | * A/ | ||
1328 | * 01 | ||
1329 | * 02 | ||
1330 | * assume OOM happens both in A and 01 at the same time. Tthey are | ||
1331 | * mutually exclusive by lock. (kill in 01 helps A.) | ||
1332 | * When we use per memcg waitq, we have to wake up waiters on A and 02 | ||
1333 | * in addtion to waiters on 01. We use global waitq for avoiding mess. | ||
1334 | * It will not be a big problem. | ||
1335 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
1336 | */ | ||
1337 | wake_up_all(&memcg_oom_waitq); | ||
1338 | mutex_unlock(&memcg_oom_mutex); | 1416 | mutex_unlock(&memcg_oom_mutex); |
1339 | 1417 | ||
1340 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 1418 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) |
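The reworked mem_cgroup_handle_oom() now queues itself with prepare_to_wait() while still holding memcg_oom_mutex, and only then decides whether to kill or to schedule(); a wakeup from memcg_wakeup_oom()/memcg_oom_recover() that arrives after the mutex is dropped therefore cannot be lost. A rough user-space analogue of that ordering, using pthread condition variables instead of the kernel waitqueue API (purely illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool recovered;

/* Stand-in for a task sleeping in mem_cgroup_handle_oom() */
static void *oom_waiter(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        /*
         * The predicate is checked under the lock and pthread_cond_wait()
         * releases the lock atomically, so a wakeup between "check" and
         * "sleep" cannot be lost -- the property the kernel code gets by
         * calling prepare_to_wait() before dropping memcg_oom_mutex.
         */
        while (!recovered)
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        puts("woken after recovery");
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, oom_waiter, NULL);
        sleep(1);                       /* let the waiter block */

        pthread_mutex_lock(&lock);      /* stand-in for memcg_oom_recover() */
        recovered = true;
        pthread_cond_broadcast(&cond);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
}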
@@ -2118,15 +2196,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2118 | /* If swapout, usage of swap doesn't decrease */ | 2196 | /* If swapout, usage of swap doesn't decrease */ |
2119 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2197 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2120 | uncharge_memsw = false; | 2198 | uncharge_memsw = false; |
2121 | /* | ||
2122 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2123 | * In those cases, all pages freed continously can be expected to be in | ||
2124 | * the same cgroup and we have chance to coalesce uncharges. | ||
2125 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
2126 | * because we want to do uncharge as soon as possible. | ||
2127 | */ | ||
2128 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2129 | goto direct_uncharge; | ||
2130 | 2199 | ||
2131 | batch = ¤t->memcg_batch; | 2200 | batch = ¤t->memcg_batch; |
2132 | /* | 2201 | /* |
@@ -2137,6 +2206,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2137 | if (!batch->memcg) | 2206 | if (!batch->memcg) |
2138 | batch->memcg = mem; | 2207 | batch->memcg = mem; |
2139 | /* | 2208 | /* |
2209 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2210 | * In those cases, all pages freed continuously can be expected to be in | ||
2211 | * the same cgroup and we have chance to coalesce uncharges. | ||
2212 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
2213 | * because we want to do uncharge as soon as possible. | ||
2214 | */ | ||
2215 | |||
2216 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2217 | goto direct_uncharge; | ||
2218 | |||
2219 | /* | ||
2140 | * In typical case, batch->memcg == mem. This means we can | 2220 | * In typical case, batch->memcg == mem. This means we can |
2141 | * merge a series of uncharges to an uncharge of res_counter. | 2221 | * merge a series of uncharges to an uncharge of res_counter. |
2142 | * If not, we uncharge res_counter ony by one. | 2222 | * If not, we uncharge res_counter ony by one. |
@@ -2152,6 +2232,8 @@ direct_uncharge: | |||
2152 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2232 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
2153 | if (uncharge_memsw) | 2233 | if (uncharge_memsw) |
2154 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 2234 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); |
2235 | if (unlikely(batch->memcg != mem)) | ||
2236 | memcg_oom_recover(mem); | ||
2155 | return; | 2237 | return; |
2156 | } | 2238 | } |
2157 | 2239 | ||
@@ -2188,7 +2270,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2188 | switch (ctype) { | 2270 | switch (ctype) { |
2189 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 2271 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
2190 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 2272 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2191 | if (page_mapped(page)) | 2273 | /* See mem_cgroup_prepare_migration() */ |
2274 | if (page_mapped(page) || PageCgroupMigration(pc)) | ||
2192 | goto unlock_out; | 2275 | goto unlock_out; |
2193 | break; | 2276 | break; |
2194 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | 2277 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: |
@@ -2288,6 +2371,7 @@ void mem_cgroup_uncharge_end(void) | |||
2288 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | 2371 | res_counter_uncharge(&batch->memcg->res, batch->bytes); |
2289 | if (batch->memsw_bytes) | 2372 | if (batch->memsw_bytes) |
2290 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | 2373 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); |
2374 | memcg_oom_recover(batch->memcg); | ||
2291 | /* forget this pointer (for sanity check) */ | 2375 | /* forget this pointer (for sanity check) */ |
2292 | batch->memcg = NULL; | 2376 | batch->memcg = NULL; |
2293 | } | 2377 | } |
@@ -2410,10 +2494,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
2410 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 2494 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
2411 | * page belongs to. | 2495 | * page belongs to. |
2412 | */ | 2496 | */ |
2413 | int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | 2497 | int mem_cgroup_prepare_migration(struct page *page, |
2498 | struct page *newpage, struct mem_cgroup **ptr) | ||
2414 | { | 2499 | { |
2415 | struct page_cgroup *pc; | 2500 | struct page_cgroup *pc; |
2416 | struct mem_cgroup *mem = NULL; | 2501 | struct mem_cgroup *mem = NULL; |
2502 | enum charge_type ctype; | ||
2417 | int ret = 0; | 2503 | int ret = 0; |
2418 | 2504 | ||
2419 | if (mem_cgroup_disabled()) | 2505 | if (mem_cgroup_disabled()) |
@@ -2424,69 +2510,125 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
2424 | if (PageCgroupUsed(pc)) { | 2510 | if (PageCgroupUsed(pc)) { |
2425 | mem = pc->mem_cgroup; | 2511 | mem = pc->mem_cgroup; |
2426 | css_get(&mem->css); | 2512 | css_get(&mem->css); |
2513 | /* | ||
2514 | * At migrating an anonymous page, its mapcount goes down | ||
2515 | * to 0 and uncharge() will be called. But, even if it's fully | ||
2516 | * unmapped, migration may fail and this page has to be | ||
2517 | * charged again. We set MIGRATION flag here and delay uncharge | ||
2518 | * until end_migration() is called | ||
2519 | * | ||
2520 | * Corner Case Thinking | ||
2521 | * A) | ||
2522 | * When the old page was mapped as Anon and it's unmap-and-freed | ||
2523 | * while migration was ongoing. | ||
2524 | * If unmap finds the old page, uncharge() of it will be delayed | ||
2525 | * until end_migration(). If unmap finds a new page, it's | ||
2526 | * uncharged when it make mapcount to be 1->0. If unmap code | ||
2527 | * finds swap_migration_entry, the new page will not be mapped | ||
2528 | * and end_migration() will find it(mapcount==0). | ||
2529 | * | ||
2530 | * B) | ||
2531 | * When the old page was mapped but migration fails, the kernel | ||
2532 | * remaps it. A charge for it is kept by MIGRATION flag even | ||
2533 | * if mapcount goes down to 0. We can do remap successfully | ||
2534 | * without charging it again. | ||
2535 | * | ||
2536 | * C) | ||
2537 | * The "old" page is under lock_page() until the end of | ||
2538 | * migration, so, the old page itself will not be swapped-out. | ||
2539 | * If the new page is swapped out before end_migration, our | ||
2540 | * hook to usual swap-out path will catch the event. | ||
2541 | */ | ||
2542 | if (PageAnon(page)) | ||
2543 | SetPageCgroupMigration(pc); | ||
2427 | } | 2544 | } |
2428 | unlock_page_cgroup(pc); | 2545 | unlock_page_cgroup(pc); |
2546 | /* | ||
2547 | * If the page is not charged at this point, | ||
2548 | * we return here. | ||
2549 | */ | ||
2550 | if (!mem) | ||
2551 | return 0; | ||
2429 | 2552 | ||
2430 | *ptr = mem; | 2553 | *ptr = mem; |
2431 | if (mem) { | 2554 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); |
2432 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); | 2555 | css_put(&mem->css);/* drop extra refcnt */ |
2433 | css_put(&mem->css); | 2556 | if (ret || *ptr == NULL) { |
2557 | if (PageAnon(page)) { | ||
2558 | lock_page_cgroup(pc); | ||
2559 | ClearPageCgroupMigration(pc); | ||
2560 | unlock_page_cgroup(pc); | ||
2561 | /* | ||
2562 | * The old page may be fully unmapped while we kept it. | ||
2563 | */ | ||
2564 | mem_cgroup_uncharge_page(page); | ||
2565 | } | ||
2566 | return -ENOMEM; | ||
2434 | } | 2567 | } |
2568 | /* | ||
2569 | * We charge new page before it's used/mapped. So, even if unlock_page() | ||
2570 | * is called before end_migration, we can catch all events on this new | ||
2571 | * page. In the case new page is migrated but not remapped, new page's | ||
2572 | * mapcount will be finally 0 and we call uncharge in end_migration(). | ||
2573 | */ | ||
2574 | pc = lookup_page_cgroup(newpage); | ||
2575 | if (PageAnon(page)) | ||
2576 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
2577 | else if (page_is_file_cache(page)) | ||
2578 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2579 | else | ||
2580 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2581 | __mem_cgroup_commit_charge(mem, pc, ctype); | ||
2435 | return ret; | 2582 | return ret; |
2436 | } | 2583 | } |
2437 | 2584 | ||
2438 | /* remove redundant charge if migration failed*/ | 2585 | /* remove redundant charge if migration failed*/ |
2439 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 2586 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
2440 | struct page *oldpage, struct page *newpage) | 2587 | struct page *oldpage, struct page *newpage) |
2441 | { | 2588 | { |
2442 | struct page *target, *unused; | 2589 | struct page *used, *unused; |
2443 | struct page_cgroup *pc; | 2590 | struct page_cgroup *pc; |
2444 | enum charge_type ctype; | ||
2445 | 2591 | ||
2446 | if (!mem) | 2592 | if (!mem) |
2447 | return; | 2593 | return; |
2594 | /* blocks rmdir() */ | ||
2448 | cgroup_exclude_rmdir(&mem->css); | 2595 | cgroup_exclude_rmdir(&mem->css); |
2449 | /* at migration success, oldpage->mapping is NULL. */ | 2596 | /* at migration success, oldpage->mapping is NULL. */ |
2450 | if (oldpage->mapping) { | 2597 | if (oldpage->mapping) { |
2451 | target = oldpage; | 2598 | used = oldpage; |
2452 | unused = NULL; | 2599 | unused = newpage; |
2453 | } else { | 2600 | } else { |
2454 | target = newpage; | 2601 | used = newpage; |
2455 | unused = oldpage; | 2602 | unused = oldpage; |
2456 | } | 2603 | } |
2457 | |||
2458 | if (PageAnon(target)) | ||
2459 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
2460 | else if (page_is_file_cache(target)) | ||
2461 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2462 | else | ||
2463 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2464 | |||
2465 | /* unused page is not on radix-tree now. */ | ||
2466 | if (unused) | ||
2467 | __mem_cgroup_uncharge_common(unused, ctype); | ||
2468 | |||
2469 | pc = lookup_page_cgroup(target); | ||
2470 | /* | 2604 | /* |
2471 | * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. | 2605 | * We disallowed uncharge of pages under migration because mapcount |
2472 | * So, double-counting is effectively avoided. | 2606 | * of the page goes down to zero, temporarly. |
2607 | * Clear the flag and check the page should be charged. | ||
2473 | */ | 2608 | */ |
2474 | __mem_cgroup_commit_charge(mem, pc, ctype); | 2609 | pc = lookup_page_cgroup(oldpage); |
2610 | lock_page_cgroup(pc); | ||
2611 | ClearPageCgroupMigration(pc); | ||
2612 | unlock_page_cgroup(pc); | ||
2613 | |||
2614 | if (unused != oldpage) | ||
2615 | pc = lookup_page_cgroup(unused); | ||
2616 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
2475 | 2617 | ||
2618 | pc = lookup_page_cgroup(used); | ||
2476 | /* | 2619 | /* |
2477 | * Both of oldpage and newpage are still under lock_page(). | 2620 | * If a page is a file cache, radix-tree replacement is very atomic |
2478 | * Then, we don't have to care about race in radix-tree. | 2621 | * and we can skip this check. When it was an Anon page, its mapcount |
2479 | * But we have to be careful that this page is unmapped or not. | 2622 | * goes down to 0. But because we added MIGRATION flag, it's not |
2480 | * | 2623 | * uncharged yet. There are several case but page->mapcount check |
2481 | * There is a case for !page_mapped(). At the start of | 2624 | * and USED bit check in mem_cgroup_uncharge_page() will do enough |
2482 | * migration, oldpage was mapped. But now, it's zapped. | 2625 | * check. (see prepare_charge() also) |
2483 | * But we know *target* page is not freed/reused under us. | ||
2484 | * mem_cgroup_uncharge_page() does all necessary checks. | ||
2485 | */ | 2626 | */ |
2486 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 2627 | if (PageAnon(used)) |
2487 | mem_cgroup_uncharge_page(target); | 2628 | mem_cgroup_uncharge_page(used); |
2488 | /* | 2629 | /* |
2489 | * At migration, we may charge account against cgroup which has no tasks | 2630 | * At migration, we may charge account against cgroup which has no |
2631 | * tasks. | ||
2490 | * So, rmdir()->pre_destroy() can be called while we do this charge. | 2632 | * So, rmdir()->pre_destroy() can be called while we do this charge. |
2491 | * In that case, we need to call pre_destroy() again. check it here. | 2633 | * In that case, we need to call pre_destroy() again. check it here. |
2492 | */ | 2634 | */ |
@@ -2524,10 +2666,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2524 | unsigned long long val) | 2666 | unsigned long long val) |
2525 | { | 2667 | { |
2526 | int retry_count; | 2668 | int retry_count; |
2527 | u64 memswlimit; | 2669 | u64 memswlimit, memlimit; |
2528 | int ret = 0; | 2670 | int ret = 0; |
2529 | int children = mem_cgroup_count_children(memcg); | 2671 | int children = mem_cgroup_count_children(memcg); |
2530 | u64 curusage, oldusage; | 2672 | u64 curusage, oldusage; |
2673 | int enlarge; | ||
2531 | 2674 | ||
2532 | /* | 2675 | /* |
2533 | * For keeping hierarchical_reclaim simple, how long we should retry | 2676 | * For keeping hierarchical_reclaim simple, how long we should retry |
@@ -2538,6 +2681,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2538 | 2681 | ||
2539 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2682 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
2540 | 2683 | ||
2684 | enlarge = 0; | ||
2541 | while (retry_count) { | 2685 | while (retry_count) { |
2542 | if (signal_pending(current)) { | 2686 | if (signal_pending(current)) { |
2543 | ret = -EINTR; | 2687 | ret = -EINTR; |
@@ -2555,6 +2699,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2555 | mutex_unlock(&set_limit_mutex); | 2699 | mutex_unlock(&set_limit_mutex); |
2556 | break; | 2700 | break; |
2557 | } | 2701 | } |
2702 | |||
2703 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
2704 | if (memlimit < val) | ||
2705 | enlarge = 1; | ||
2706 | |||
2558 | ret = res_counter_set_limit(&memcg->res, val); | 2707 | ret = res_counter_set_limit(&memcg->res, val); |
2559 | if (!ret) { | 2708 | if (!ret) { |
2560 | if (memswlimit == val) | 2709 | if (memswlimit == val) |
@@ -2576,6 +2725,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2576 | else | 2725 | else |
2577 | oldusage = curusage; | 2726 | oldusage = curusage; |
2578 | } | 2727 | } |
2728 | if (!ret && enlarge) | ||
2729 | memcg_oom_recover(memcg); | ||
2579 | 2730 | ||
2580 | return ret; | 2731 | return ret; |
2581 | } | 2732 | } |
@@ -2584,9 +2735,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2584 | unsigned long long val) | 2735 | unsigned long long val) |
2585 | { | 2736 | { |
2586 | int retry_count; | 2737 | int retry_count; |
2587 | u64 memlimit, oldusage, curusage; | 2738 | u64 memlimit, memswlimit, oldusage, curusage; |
2588 | int children = mem_cgroup_count_children(memcg); | 2739 | int children = mem_cgroup_count_children(memcg); |
2589 | int ret = -EBUSY; | 2740 | int ret = -EBUSY; |
2741 | int enlarge = 0; | ||
2590 | 2742 | ||
2591 | /* see mem_cgroup_resize_res_limit */ | 2743 | /* see mem_cgroup_resize_res_limit */ |
2592 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 2744 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; |
@@ -2608,6 +2760,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2608 | mutex_unlock(&set_limit_mutex); | 2760 | mutex_unlock(&set_limit_mutex); |
2609 | break; | 2761 | break; |
2610 | } | 2762 | } |
2763 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
2764 | if (memswlimit < val) | ||
2765 | enlarge = 1; | ||
2611 | ret = res_counter_set_limit(&memcg->memsw, val); | 2766 | ret = res_counter_set_limit(&memcg->memsw, val); |
2612 | if (!ret) { | 2767 | if (!ret) { |
2613 | if (memlimit == val) | 2768 | if (memlimit == val) |
@@ -2630,6 +2785,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2630 | else | 2785 | else |
2631 | oldusage = curusage; | 2786 | oldusage = curusage; |
2632 | } | 2787 | } |
2788 | if (!ret && enlarge) | ||
2789 | memcg_oom_recover(memcg); | ||
2633 | return ret; | 2790 | return ret; |
2634 | } | 2791 | } |
2635 | 2792 | ||
@@ -2821,6 +2978,7 @@ move_account: | |||
2821 | if (ret) | 2978 | if (ret) |
2822 | break; | 2979 | break; |
2823 | } | 2980 | } |
2981 | memcg_oom_recover(mem); | ||
2824 | /* it seems parent cgroup doesn't have enough mem */ | 2982 | /* it seems parent cgroup doesn't have enough mem */ |
2825 | if (ret == -ENOMEM) | 2983 | if (ret == -ENOMEM) |
2826 | goto try_to_free; | 2984 | goto try_to_free; |
@@ -3311,9 +3469,9 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
3311 | 3469 | ||
3312 | rcu_read_lock(); | 3470 | rcu_read_lock(); |
3313 | if (!swap) | 3471 | if (!swap) |
3314 | t = rcu_dereference(memcg->thresholds); | 3472 | t = rcu_dereference(memcg->thresholds.primary); |
3315 | else | 3473 | else |
3316 | t = rcu_dereference(memcg->memsw_thresholds); | 3474 | t = rcu_dereference(memcg->memsw_thresholds.primary); |
3317 | 3475 | ||
3318 | if (!t) | 3476 | if (!t) |
3319 | goto unlock; | 3477 | goto unlock; |
@@ -3325,7 +3483,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
3325 | * If it's not true, a threshold was crossed after last | 3483 | * If it's not true, a threshold was crossed after last |
3326 | * call of __mem_cgroup_threshold(). | 3484 | * call of __mem_cgroup_threshold(). |
3327 | */ | 3485 | */ |
3328 | i = atomic_read(&t->current_threshold); | 3486 | i = t->current_threshold; |
3329 | 3487 | ||
3330 | /* | 3488 | /* |
3331 | * Iterate backward over array of thresholds starting from | 3489 | * Iterate backward over array of thresholds starting from |
@@ -3349,7 +3507,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
3349 | eventfd_signal(t->entries[i].eventfd, 1); | 3507 | eventfd_signal(t->entries[i].eventfd, 1); |
3350 | 3508 | ||
3351 | /* Update current_threshold */ | 3509 | /* Update current_threshold */ |
3352 | atomic_set(&t->current_threshold, i - 1); | 3510 | t->current_threshold = i - 1; |
3353 | unlock: | 3511 | unlock: |
3354 | rcu_read_unlock(); | 3512 | rcu_read_unlock(); |
3355 | } | 3513 | } |
@@ -3369,106 +3527,117 @@ static int compare_thresholds(const void *a, const void *b) | |||
3369 | return _a->threshold - _b->threshold; | 3527 | return _a->threshold - _b->threshold; |
3370 | } | 3528 | } |
3371 | 3529 | ||
3372 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | 3530 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) |
3373 | struct eventfd_ctx *eventfd, const char *args) | 3531 | { |
3532 | struct mem_cgroup_eventfd_list *ev; | ||
3533 | |||
3534 | list_for_each_entry(ev, &mem->oom_notify, list) | ||
3535 | eventfd_signal(ev->eventfd, 1); | ||
3536 | return 0; | ||
3537 | } | ||
3538 | |||
3539 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) | ||
3540 | { | ||
3541 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); | ||
3542 | } | ||
3543 | |||
3544 | static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | ||
3545 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
3374 | { | 3546 | { |
3375 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3547 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3376 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3548 | struct mem_cgroup_thresholds *thresholds; |
3549 | struct mem_cgroup_threshold_ary *new; | ||
3377 | int type = MEMFILE_TYPE(cft->private); | 3550 | int type = MEMFILE_TYPE(cft->private); |
3378 | u64 threshold, usage; | 3551 | u64 threshold, usage; |
3379 | int size; | 3552 | int i, size, ret; |
3380 | int i, ret; | ||
3381 | 3553 | ||
3382 | ret = res_counter_memparse_write_strategy(args, &threshold); | 3554 | ret = res_counter_memparse_write_strategy(args, &threshold); |
3383 | if (ret) | 3555 | if (ret) |
3384 | return ret; | 3556 | return ret; |
3385 | 3557 | ||
3386 | mutex_lock(&memcg->thresholds_lock); | 3558 | mutex_lock(&memcg->thresholds_lock); |
3559 | |||
3387 | if (type == _MEM) | 3560 | if (type == _MEM) |
3388 | thresholds = memcg->thresholds; | 3561 | thresholds = &memcg->thresholds; |
3389 | else if (type == _MEMSWAP) | 3562 | else if (type == _MEMSWAP) |
3390 | thresholds = memcg->memsw_thresholds; | 3563 | thresholds = &memcg->memsw_thresholds; |
3391 | else | 3564 | else |
3392 | BUG(); | 3565 | BUG(); |
3393 | 3566 | ||
3394 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | 3567 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); |
3395 | 3568 | ||
3396 | /* Check if a threshold crossed before adding a new one */ | 3569 | /* Check if a threshold crossed before adding a new one */ |
3397 | if (thresholds) | 3570 | if (thresholds->primary) |
3398 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 3571 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
3399 | 3572 | ||
3400 | if (thresholds) | 3573 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; |
3401 | size = thresholds->size + 1; | ||
3402 | else | ||
3403 | size = 1; | ||
3404 | 3574 | ||
3405 | /* Allocate memory for new array of thresholds */ | 3575 | /* Allocate memory for new array of thresholds */ |
3406 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | 3576 | new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), |
3407 | size * sizeof(struct mem_cgroup_threshold), | ||
3408 | GFP_KERNEL); | 3577 | GFP_KERNEL); |
3409 | if (!thresholds_new) { | 3578 | if (!new) { |
3410 | ret = -ENOMEM; | 3579 | ret = -ENOMEM; |
3411 | goto unlock; | 3580 | goto unlock; |
3412 | } | 3581 | } |
3413 | thresholds_new->size = size; | 3582 | new->size = size; |
3414 | 3583 | ||
3415 | /* Copy thresholds (if any) to new array */ | 3584 | /* Copy thresholds (if any) to new array */ |
3416 | if (thresholds) | 3585 | if (thresholds->primary) { |
3417 | memcpy(thresholds_new->entries, thresholds->entries, | 3586 | memcpy(new->entries, thresholds->primary->entries, (size - 1) * |
3418 | thresholds->size * | ||
3419 | sizeof(struct mem_cgroup_threshold)); | 3587 | sizeof(struct mem_cgroup_threshold)); |
3588 | } | ||
3589 | |||
3420 | /* Add new threshold */ | 3590 | /* Add new threshold */ |
3421 | thresholds_new->entries[size - 1].eventfd = eventfd; | 3591 | new->entries[size - 1].eventfd = eventfd; |
3422 | thresholds_new->entries[size - 1].threshold = threshold; | 3592 | new->entries[size - 1].threshold = threshold; |
3423 | 3593 | ||
3424 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | 3594 | /* Sort thresholds. Registering of new threshold isn't time-critical */ |
3425 | sort(thresholds_new->entries, size, | 3595 | sort(new->entries, size, sizeof(struct mem_cgroup_threshold), |
3426 | sizeof(struct mem_cgroup_threshold), | ||
3427 | compare_thresholds, NULL); | 3596 | compare_thresholds, NULL); |
3428 | 3597 | ||
3429 | /* Find current threshold */ | 3598 | /* Find current threshold */ |
3430 | atomic_set(&thresholds_new->current_threshold, -1); | 3599 | new->current_threshold = -1; |
3431 | for (i = 0; i < size; i++) { | 3600 | for (i = 0; i < size; i++) { |
3432 | if (thresholds_new->entries[i].threshold < usage) { | 3601 | if (new->entries[i].threshold < usage) { |
3433 | /* | 3602 | /* |
3434 | * thresholds_new->current_threshold will not be used | 3603 | * new->current_threshold will not be used until |
3435 | * until rcu_assign_pointer(), so it's safe to increment | 3604 | * rcu_assign_pointer(), so it's safe to increment |
3436 | * it here. | 3605 | * it here. |
3437 | */ | 3606 | */ |
3438 | atomic_inc(&thresholds_new->current_threshold); | 3607 | ++new->current_threshold; |
3439 | } | 3608 | } |
3440 | } | 3609 | } |
3441 | 3610 | ||
3442 | if (type == _MEM) | 3611 | /* Free old spare buffer and save old primary buffer as spare */ |
3443 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | 3612 | kfree(thresholds->spare); |
3444 | else | 3613 | thresholds->spare = thresholds->primary; |
3445 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | 3614 | |
3615 | rcu_assign_pointer(thresholds->primary, new); | ||
3446 | 3616 | ||
3447 | /* To be sure that nobody uses thresholds before freeing it */ | 3617 | /* To be sure that nobody uses thresholds */ |
3448 | synchronize_rcu(); | 3618 | synchronize_rcu(); |
3449 | 3619 | ||
3450 | kfree(thresholds); | ||
3451 | unlock: | 3620 | unlock: |
3452 | mutex_unlock(&memcg->thresholds_lock); | 3621 | mutex_unlock(&memcg->thresholds_lock); |
3453 | 3622 | ||
3454 | return ret; | 3623 | return ret; |
3455 | } | 3624 | } |
3456 | 3625 | ||
3457 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | 3626 | static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, |
3458 | struct eventfd_ctx *eventfd) | 3627 | struct cftype *cft, struct eventfd_ctx *eventfd) |
3459 | { | 3628 | { |
3460 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3629 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3461 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3630 | struct mem_cgroup_thresholds *thresholds; |
3631 | struct mem_cgroup_threshold_ary *new; | ||
3462 | int type = MEMFILE_TYPE(cft->private); | 3632 | int type = MEMFILE_TYPE(cft->private); |
3463 | u64 usage; | 3633 | u64 usage; |
3464 | int size = 0; | 3634 | int i, j, size; |
3465 | int i, j, ret; | ||
3466 | 3635 | ||
3467 | mutex_lock(&memcg->thresholds_lock); | 3636 | mutex_lock(&memcg->thresholds_lock); |
3468 | if (type == _MEM) | 3637 | if (type == _MEM) |
3469 | thresholds = memcg->thresholds; | 3638 | thresholds = &memcg->thresholds; |
3470 | else if (type == _MEMSWAP) | 3639 | else if (type == _MEMSWAP) |
3471 | thresholds = memcg->memsw_thresholds; | 3640 | thresholds = &memcg->memsw_thresholds; |
3472 | else | 3641 | else |
3473 | BUG(); | 3642 | BUG(); |
3474 | 3643 | ||
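mem_cgroup_usage_register_event() backs the eventfd-based threshold notifications: user space arms one by writing "<event_fd> <fd of memory.usage_in_bytes> <threshold>" to cgroup.event_control. A sketch of a listener, with the cgroup-v1 mount point and group name as assumptions and error handling trimmed:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

#define CG "/sys/fs/cgroup/memory/mygroup/"

int main(void)
{
        int efd = eventfd(0, 0);
        int ufd = open(CG "memory.usage_in_bytes", O_RDONLY);
        int cfd = open(CG "cgroup.event_control", O_WRONLY);
        char cmd[64];
        uint64_t ticks;

        if (efd < 0 || ufd < 0 || cfd < 0) {
                perror("open");
                return 1;
        }
        /* notify when usage crosses 64M in either direction */
        snprintf(cmd, sizeof(cmd), "%d %d %llu", efd, ufd, 64ULL << 20);
        if (write(cfd, cmd, strlen(cmd)) < 0) {
                perror("register threshold");
                return 1;
        }
        read(efd, &ticks, sizeof(ticks));       /* blocks until a crossing */
        printf("threshold crossed %llu time(s)\n", (unsigned long long)ticks);
        return 0;
}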
@@ -3484,59 +3653,136 @@ static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | |||
3484 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 3653 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
3485 | 3654 | ||
3486 | /* Calculate new number of threshold */ | 3655 | /* Calculate new number of threshold */ |
3487 | for (i = 0; i < thresholds->size; i++) { | 3656 | size = 0; |
3488 | if (thresholds->entries[i].eventfd != eventfd) | 3657 | for (i = 0; i < thresholds->primary->size; i++) { |
3658 | if (thresholds->primary->entries[i].eventfd != eventfd) | ||
3489 | size++; | 3659 | size++; |
3490 | } | 3660 | } |
3491 | 3661 | ||
3662 | new = thresholds->spare; | ||
3663 | |||
3492 | /* Set thresholds array to NULL if we don't have thresholds */ | 3664 | /* Set thresholds array to NULL if we don't have thresholds */ |
3493 | if (!size) { | 3665 | if (!size) { |
3494 | thresholds_new = NULL; | 3666 | kfree(new); |
3495 | goto assign; | 3667 | new = NULL; |
3668 | goto swap_buffers; | ||
3496 | } | 3669 | } |
3497 | 3670 | ||
3498 | /* Allocate memory for new array of thresholds */ | 3671 | new->size = size; |
3499 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3500 | size * sizeof(struct mem_cgroup_threshold), | ||
3501 | GFP_KERNEL); | ||
3502 | if (!thresholds_new) { | ||
3503 | ret = -ENOMEM; | ||
3504 | goto unlock; | ||
3505 | } | ||
3506 | thresholds_new->size = size; | ||
3507 | 3672 | ||
3508 | /* Copy thresholds and find current threshold */ | 3673 | /* Copy thresholds and find current threshold */ |
3509 | atomic_set(&thresholds_new->current_threshold, -1); | 3674 | new->current_threshold = -1; |
3510 | for (i = 0, j = 0; i < thresholds->size; i++) { | 3675 | for (i = 0, j = 0; i < thresholds->primary->size; i++) { |
3511 | if (thresholds->entries[i].eventfd == eventfd) | 3676 | if (thresholds->primary->entries[i].eventfd == eventfd) |
3512 | continue; | 3677 | continue; |
3513 | 3678 | ||
3514 | thresholds_new->entries[j] = thresholds->entries[i]; | 3679 | new->entries[j] = thresholds->primary->entries[i]; |
3515 | if (thresholds_new->entries[j].threshold < usage) { | 3680 | if (new->entries[j].threshold < usage) { |
3516 | /* | 3681 | /* |
3517 | * thresholds_new->current_threshold will not be used | 3682 | * new->current_threshold will not be used |
3518 | * until rcu_assign_pointer(), so it's safe to increment | 3683 | * until rcu_assign_pointer(), so it's safe to increment |
3519 | * it here. | 3684 | * it here. |
3520 | */ | 3685 | */ |
3521 | atomic_inc(&thresholds_new->current_threshold); | 3686 | ++new->current_threshold; |
3522 | } | 3687 | } |
3523 | j++; | 3688 | j++; |
3524 | } | 3689 | } |
3525 | 3690 | ||
3526 | assign: | 3691 | swap_buffers: |
3527 | if (type == _MEM) | 3692 | /* Swap primary and spare array */ |
3528 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | 3693 | thresholds->spare = thresholds->primary; |
3529 | else | 3694 | rcu_assign_pointer(thresholds->primary, new); |
3530 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3531 | 3695 | ||
3532 | /* To be sure that nobody uses thresholds before freeing it */ | 3696 | /* To be sure that nobody uses thresholds */ |
3533 | synchronize_rcu(); | 3697 | synchronize_rcu(); |
3534 | 3698 | ||
3535 | kfree(thresholds); | ||
3536 | unlock: | ||
3537 | mutex_unlock(&memcg->thresholds_lock); | 3699 | mutex_unlock(&memcg->thresholds_lock); |
3700 | } | ||
3538 | 3701 | ||
3539 | return ret; | 3702 | static int mem_cgroup_oom_register_event(struct cgroup *cgrp, |
3703 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
3704 | { | ||
3705 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3706 | struct mem_cgroup_eventfd_list *event; | ||
3707 | int type = MEMFILE_TYPE(cft->private); | ||
3708 | |||
3709 | BUG_ON(type != _OOM_TYPE); | ||
3710 | event = kmalloc(sizeof(*event), GFP_KERNEL); | ||
3711 | if (!event) | ||
3712 | return -ENOMEM; | ||
3713 | |||
3714 | mutex_lock(&memcg_oom_mutex); | ||
3715 | |||
3716 | event->eventfd = eventfd; | ||
3717 | list_add(&event->list, &memcg->oom_notify); | ||
3718 | |||
3719 | /* already in OOM ? */ | ||
3720 | if (atomic_read(&memcg->oom_lock)) | ||
3721 | eventfd_signal(eventfd, 1); | ||
3722 | mutex_unlock(&memcg_oom_mutex); | ||
3723 | |||
3724 | return 0; | ||
3725 | } | ||
3726 | |||
3727 | static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | ||
3728 | struct cftype *cft, struct eventfd_ctx *eventfd) | ||
3729 | { | ||
3730 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3731 | struct mem_cgroup_eventfd_list *ev, *tmp; | ||
3732 | int type = MEMFILE_TYPE(cft->private); | ||
3733 | |||
3734 | BUG_ON(type != _OOM_TYPE); | ||
3735 | |||
3736 | mutex_lock(&memcg_oom_mutex); | ||
3737 | |||
3738 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | ||
3739 | if (ev->eventfd == eventfd) { | ||
3740 | list_del(&ev->list); | ||
3741 | kfree(ev); | ||
3742 | } | ||
3743 | } | ||
3744 | |||
3745 | mutex_unlock(&memcg_oom_mutex); | ||
3746 | } | ||
3747 | |||
3748 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | ||
3749 | struct cftype *cft, struct cgroup_map_cb *cb) | ||
3750 | { | ||
3751 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3752 | |||
3753 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); | ||
3754 | |||
3755 | if (atomic_read(&mem->oom_lock)) | ||
3756 | cb->fill(cb, "under_oom", 1); | ||
3757 | else | ||
3758 | cb->fill(cb, "under_oom", 0); | ||
3759 | return 0; | ||
3760 | } | ||
3761 | |||
3762 | /* | ||
3763 | */ | ||
3764 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | ||
3765 | struct cftype *cft, u64 val) | ||
3766 | { | ||
3767 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3768 | struct mem_cgroup *parent; | ||
3769 | |||
3770 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | ||
3771 | if (!cgrp->parent || !((val == 0) || (val == 1))) | ||
3772 | return -EINVAL; | ||
3773 | |||
3774 | parent = mem_cgroup_from_cont(cgrp->parent); | ||
3775 | |||
3776 | cgroup_lock(); | ||
3777 | /* oom-kill-disable is a flag for subhierarchy. */ | ||
3778 | if ((parent->use_hierarchy) || | ||
3779 | (mem->use_hierarchy && !list_empty(&cgrp->children))) { | ||
3780 | cgroup_unlock(); | ||
3781 | return -EINVAL; | ||
3782 | } | ||
3783 | mem->oom_kill_disable = val; | ||
3784 | cgroup_unlock(); | ||
3785 | return 0; | ||
3540 | } | 3786 | } |
3541 | 3787 | ||
3542 | static struct cftype mem_cgroup_files[] = { | 3788 | static struct cftype mem_cgroup_files[] = { |
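Taken together, the oom_control additions give user space two knobs: writing 1 to memory.oom_control sets oom_kill_disable, and registering an eventfd against memory.oom_control delivers a notification whenever the group enters OOM (tasks then wait in mem_cgroup_handle_oom() until charges are freed or the limit is enlarged, which memcg_oom_recover() turns into a wakeup). A hedged sketch, again assuming a cgroup-v1 style mount and group name:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/eventfd.h>
#include <unistd.h>

#define CG "/sys/fs/cgroup/memory/mygroup/"

int main(void)
{
        int efd = eventfd(0, 0);
        int ofd = open(CG "memory.oom_control", O_RDWR);
        int cfd = open(CG "cgroup.event_control", O_WRONLY);
        char cmd[32];
        uint64_t cnt;

        if (efd < 0 || ofd < 0 || cfd < 0) {
                perror("open");
                return 1;
        }
        write(ofd, "1\n", 2);                   /* oom_kill_disable = 1 */

        snprintf(cmd, sizeof(cmd), "%d %d", efd, ofd);
        write(cfd, cmd, strlen(cmd));           /* register the OOM notifier */

        read(efd, &cnt, sizeof(cnt));           /* blocks until the group is under OOM */
        printf("group hit its limit; enlarge it or free some memory\n");
        return 0;
}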
@@ -3544,8 +3790,8 @@ static struct cftype mem_cgroup_files[] = { | |||
3544 | .name = "usage_in_bytes", | 3790 | .name = "usage_in_bytes", |
3545 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3791 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
3546 | .read_u64 = mem_cgroup_read, | 3792 | .read_u64 = mem_cgroup_read, |
3547 | .register_event = mem_cgroup_register_event, | 3793 | .register_event = mem_cgroup_usage_register_event, |
3548 | .unregister_event = mem_cgroup_unregister_event, | 3794 | .unregister_event = mem_cgroup_usage_unregister_event, |
3549 | }, | 3795 | }, |
3550 | { | 3796 | { |
3551 | .name = "max_usage_in_bytes", | 3797 | .name = "max_usage_in_bytes", |
@@ -3594,6 +3840,14 @@ static struct cftype mem_cgroup_files[] = { | |||
3594 | .read_u64 = mem_cgroup_move_charge_read, | 3840 | .read_u64 = mem_cgroup_move_charge_read, |
3595 | .write_u64 = mem_cgroup_move_charge_write, | 3841 | .write_u64 = mem_cgroup_move_charge_write, |
3596 | }, | 3842 | }, |
3843 | { | ||
3844 | .name = "oom_control", | ||
3845 | .read_map = mem_cgroup_oom_control_read, | ||
3846 | .write_u64 = mem_cgroup_oom_control_write, | ||
3847 | .register_event = mem_cgroup_oom_register_event, | ||
3848 | .unregister_event = mem_cgroup_oom_unregister_event, | ||
3849 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | ||
3850 | }, | ||
3597 | }; | 3851 | }; |
3598 | 3852 | ||
3599 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3853 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
@@ -3602,8 +3856,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
3602 | .name = "memsw.usage_in_bytes", | 3856 | .name = "memsw.usage_in_bytes", |
3603 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3857 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
3604 | .read_u64 = mem_cgroup_read, | 3858 | .read_u64 = mem_cgroup_read, |
3605 | .register_event = mem_cgroup_register_event, | 3859 | .register_event = mem_cgroup_usage_register_event, |
3606 | .unregister_event = mem_cgroup_unregister_event, | 3860 | .unregister_event = mem_cgroup_usage_unregister_event, |
3607 | }, | 3861 | }, |
3608 | { | 3862 | { |
3609 | .name = "memsw.max_usage_in_bytes", | 3863 | .name = "memsw.max_usage_in_bytes", |
@@ -3831,6 +4085,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3831 | } else { | 4085 | } else { |
3832 | parent = mem_cgroup_from_cont(cont->parent); | 4086 | parent = mem_cgroup_from_cont(cont->parent); |
3833 | mem->use_hierarchy = parent->use_hierarchy; | 4087 | mem->use_hierarchy = parent->use_hierarchy; |
4088 | mem->oom_kill_disable = parent->oom_kill_disable; | ||
3834 | } | 4089 | } |
3835 | 4090 | ||
3836 | if (parent && parent->use_hierarchy) { | 4091 | if (parent && parent->use_hierarchy) { |
@@ -3849,6 +4104,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3849 | } | 4104 | } |
3850 | mem->last_scanned_child = 0; | 4105 | mem->last_scanned_child = 0; |
3851 | spin_lock_init(&mem->reclaim_param_lock); | 4106 | spin_lock_init(&mem->reclaim_param_lock); |
4107 | INIT_LIST_HEAD(&mem->oom_notify); | ||
3852 | 4108 | ||
3853 | if (parent) | 4109 | if (parent) |
3854 | mem->swappiness = get_swappiness(parent); | 4110 | mem->swappiness = get_swappiness(parent); |
@@ -3976,6 +4232,80 @@ enum mc_target_type { | |||
3976 | MC_TARGET_SWAP, | 4232 | MC_TARGET_SWAP, |
3977 | }; | 4233 | }; |
3978 | 4234 | ||
4235 | static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | ||
4236 | unsigned long addr, pte_t ptent) | ||
4237 | { | ||
4238 | struct page *page = vm_normal_page(vma, addr, ptent); | ||
4239 | |||
4240 | if (!page || !page_mapped(page)) | ||
4241 | return NULL; | ||
4242 | if (PageAnon(page)) { | ||
4243 | /* we don't move shared anon */ | ||
4244 | if (!move_anon() || page_mapcount(page) > 2) | ||
4245 | return NULL; | ||
4246 | } else if (!move_file()) | ||
4247 | /* we ignore mapcount for file pages */ | ||
4248 | return NULL; | ||
4249 | if (!get_page_unless_zero(page)) | ||
4250 | return NULL; | ||
4251 | |||
4252 | return page; | ||
4253 | } | ||
4254 | |||
4255 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | ||
4256 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
4257 | { | ||
4258 | int usage_count; | ||
4259 | struct page *page = NULL; | ||
4260 | swp_entry_t ent = pte_to_swp_entry(ptent); | ||
4261 | |||
4262 | if (!move_anon() || non_swap_entry(ent)) | ||
4263 | return NULL; | ||
4264 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
4265 | if (usage_count > 1) { /* we don't move shared anon */ | ||
4266 | if (page) | ||
4267 | put_page(page); | ||
4268 | return NULL; | ||
4269 | } | ||
4270 | if (do_swap_account) | ||
4271 | entry->val = ent.val; | ||
4272 | |||
4273 | return page; | ||
4274 | } | ||
4275 | |||
4276 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | ||
4277 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
4278 | { | ||
4279 | struct page *page = NULL; | ||
4280 | struct inode *inode; | ||
4281 | struct address_space *mapping; | ||
4282 | pgoff_t pgoff; | ||
4283 | |||
4284 | if (!vma->vm_file) /* anonymous vma */ | ||
4285 | return NULL; | ||
4286 | if (!move_file()) | ||
4287 | return NULL; | ||
4288 | |||
4289 | inode = vma->vm_file->f_path.dentry->d_inode; | ||
4290 | mapping = vma->vm_file->f_mapping; | ||
4291 | if (pte_none(ptent)) | ||
4292 | pgoff = linear_page_index(vma, addr); | ||
4293 | else /* pte_file(ptent) is true */ | ||
4294 | pgoff = pte_to_pgoff(ptent); | ||
4295 | |||
4296 | /* page is moved even if it's not RSS of this task(page-faulted). */ | ||
4297 | if (!mapping_cap_swap_backed(mapping)) { /* normal file */ | ||
4298 | page = find_get_page(mapping, pgoff); | ||
4299 | } else { /* shmem/tmpfs file. we should take account of swap too. */ | ||
4300 | swp_entry_t ent; | ||
4301 | mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); | ||
4302 | if (do_swap_account) | ||
4303 | entry->val = ent.val; | ||
4304 | } | ||
4305 | |||
4306 | return page; | ||
4307 | } | ||
4308 | |||
3979 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | 4309 | static int is_target_pte_for_mc(struct vm_area_struct *vma, |
3980 | unsigned long addr, pte_t ptent, union mc_target *target) | 4310 | unsigned long addr, pte_t ptent, union mc_target *target) |
3981 | { | 4311 | { |
@@ -3983,43 +4313,16 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
3983 | struct page_cgroup *pc; | 4313 | struct page_cgroup *pc; |
3984 | int ret = 0; | 4314 | int ret = 0; |
3985 | swp_entry_t ent = { .val = 0 }; | 4315 | swp_entry_t ent = { .val = 0 }; |
3986 | int usage_count = 0; | ||
3987 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
3988 | &mc.to->move_charge_at_immigrate); | ||
3989 | 4316 | ||
3990 | if (!pte_present(ptent)) { | 4317 | if (pte_present(ptent)) |
3991 | /* TODO: handle swap of shmes/tmpfs */ | 4318 | page = mc_handle_present_pte(vma, addr, ptent); |
3992 | if (pte_none(ptent) || pte_file(ptent)) | 4319 | else if (is_swap_pte(ptent)) |
3993 | return 0; | 4320 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); |
3994 | else if (is_swap_pte(ptent)) { | 4321 | else if (pte_none(ptent) || pte_file(ptent)) |
3995 | ent = pte_to_swp_entry(ptent); | 4322 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
3996 | if (!move_anon || non_swap_entry(ent)) | 4323 | |
3997 | return 0; | 4324 | if (!page && !ent.val) |
3998 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
3999 | } | ||
4000 | } else { | ||
4001 | page = vm_normal_page(vma, addr, ptent); | ||
4002 | if (!page || !page_mapped(page)) | ||
4003 | return 0; | ||
4004 | /* | ||
4005 | * TODO: We don't move charges of file(including shmem/tmpfs) | ||
4006 | * pages for now. | ||
4007 | */ | ||
4008 | if (!move_anon || !PageAnon(page)) | ||
4009 | return 0; | ||
4010 | if (!get_page_unless_zero(page)) | ||
4011 | return 0; | ||
4012 | usage_count = page_mapcount(page); | ||
4013 | } | ||
4014 | if (usage_count > 1) { | ||
4015 | /* | ||
4016 | * TODO: We don't move charges of shared(used by multiple | ||
4017 | * processes) pages for now. | ||
4018 | */ | ||
4019 | if (page) | ||
4020 | put_page(page); | ||
4021 | return 0; | 4325 | return 0; |
4022 | } | ||
4023 | if (page) { | 4326 | if (page) { |
4024 | pc = lookup_page_cgroup(page); | 4327 | pc = lookup_page_cgroup(page); |
4025 | /* | 4328 | /* |
@@ -4035,8 +4338,8 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
4035 | if (!ret || !target) | 4338 | if (!ret || !target) |
4036 | put_page(page); | 4339 | put_page(page); |
4037 | } | 4340 | } |
4038 | /* throught */ | 4341 | /* There is a swap entry and a page doesn't exist or isn't charged */ |
4039 | if (ent.val && do_swap_account && !ret && | 4342 | if (ent.val && !ret && |
4040 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | 4343 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { |
4041 | ret = MC_TARGET_SWAP; | 4344 | ret = MC_TARGET_SWAP; |
4042 | if (target) | 4345 | if (target) |
@@ -4077,9 +4380,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4077 | }; | 4380 | }; |
4078 | if (is_vm_hugetlb_page(vma)) | 4381 | if (is_vm_hugetlb_page(vma)) |
4079 | continue; | 4382 | continue; |
4080 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4081 | if (vma->vm_flags & VM_SHARED) | ||
4082 | continue; | ||
4083 | walk_page_range(vma->vm_start, vma->vm_end, | 4383 | walk_page_range(vma->vm_start, vma->vm_end, |
4084 | &mem_cgroup_count_precharge_walk); | 4384 | &mem_cgroup_count_precharge_walk); |
4085 | } | 4385 | } |
@@ -4102,6 +4402,7 @@ static void mem_cgroup_clear_mc(void) | |||
4102 | if (mc.precharge) { | 4402 | if (mc.precharge) { |
4103 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | 4403 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); |
4104 | mc.precharge = 0; | 4404 | mc.precharge = 0; |
4405 | memcg_oom_recover(mc.to); | ||
4105 | } | 4406 | } |
4106 | /* | 4407 | /* |
4107 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | 4408 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
@@ -4110,6 +4411,7 @@ static void mem_cgroup_clear_mc(void) | |||
4110 | if (mc.moved_charge) { | 4411 | if (mc.moved_charge) { |
4111 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | 4412 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); |
4112 | mc.moved_charge = 0; | 4413 | mc.moved_charge = 0; |
4414 | memcg_oom_recover(mc.from); | ||
4113 | } | 4415 | } |
4114 | /* we must fixup refcnts and charges */ | 4416 | /* we must fixup refcnts and charges */ |
4115 | if (mc.moved_swap) { | 4417 | if (mc.moved_swap) { |
@@ -4274,9 +4576,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4274 | }; | 4576 | }; |
4275 | if (is_vm_hugetlb_page(vma)) | 4577 | if (is_vm_hugetlb_page(vma)) |
4276 | continue; | 4578 | continue; |
4277 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4278 | if (vma->vm_flags & VM_SHARED) | ||
4279 | continue; | ||
4280 | ret = walk_page_range(vma->vm_start, vma->vm_end, | 4579 | ret = walk_page_range(vma->vm_start, vma->vm_end, |
4281 | &mem_cgroup_move_charge_walk); | 4580 | &mem_cgroup_move_charge_walk); |
4282 | if (ret) | 4581 | if (ret) |