Diffstat (limited to 'mm/memcontrol.c')
| -rw-r--r-- | mm/memcontrol.c | 689 |
1 file changed, 494 insertions(+), 195 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c8569bc298ff..c6ece0a57595 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
| @@ -149,16 +149,35 @@ struct mem_cgroup_threshold { | |||
| 149 | u64 threshold; | 149 | u64 threshold; |
| 150 | }; | 150 | }; |
| 151 | 151 | ||
| 152 | /* For threshold */ | ||
| 152 | struct mem_cgroup_threshold_ary { | 153 | struct mem_cgroup_threshold_ary { |
| 153 | /* An array index points to threshold just below usage. */ | 154 | /* An array index points to threshold just below usage. */ |
| 154 | atomic_t current_threshold; | 155 | int current_threshold; |
| 155 | /* Size of entries[] */ | 156 | /* Size of entries[] */ |
| 156 | unsigned int size; | 157 | unsigned int size; |
| 157 | /* Array of thresholds */ | 158 | /* Array of thresholds */ |
| 158 | struct mem_cgroup_threshold entries[0]; | 159 | struct mem_cgroup_threshold entries[0]; |
| 159 | }; | 160 | }; |
| 160 | 161 | ||
| 162 | struct mem_cgroup_thresholds { | ||
| 163 | /* Primary thresholds array */ | ||
| 164 | struct mem_cgroup_threshold_ary *primary; | ||
| 165 | /* | ||
| 166 | * Spare threshold array. | ||
| 167 | * This is needed to make mem_cgroup_unregister_event() "never fail". | ||
| 168 | * It must be able to store at least primary->size - 1 entries. | ||
| 169 | */ | ||
| 170 | struct mem_cgroup_threshold_ary *spare; | ||
| 171 | }; | ||
| 172 | |||
| 173 | /* for OOM */ | ||
| 174 | struct mem_cgroup_eventfd_list { | ||
| 175 | struct list_head list; | ||
| 176 | struct eventfd_ctx *eventfd; | ||
| 177 | }; | ||
| 178 | |||
| 161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | 179 | static void mem_cgroup_threshold(struct mem_cgroup *mem); |
| 180 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); | ||
| 162 | 181 | ||
| 163 | /* | 182 | /* |
| 164 | * The memory controller data structure. The memory controller controls both | 183 | * The memory controller data structure. The memory controller controls both |
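Note on the hunk above: the new mem_cgroup_thresholds pair (a primary array plus a preallocated spare that must hold at least primary->size - 1 entries) is what lets mem_cgroup_unregister_event() "never fail" — removal reuses the spare instead of allocating. A rough, userspace-flavoured illustration of that double-buffer idea (names here are invented; in the kernel the new array is published with rcu_assign_pointer() and readers are flushed with synchronize_rcu(), as later hunks in this diff show):

    struct thr_array { int size; int entries[]; };

    struct thresholds {
        struct thr_array *primary;  /* what readers currently see */
        struct thr_array *spare;    /* preallocated; >= primary->size - 1 entries */
    };

    /* Removal copies into the spare and swaps pointers: no allocation,
     * so it cannot fail with -ENOMEM. */
    static void remove_entry(struct thresholds *t, int victim)
    {
        struct thr_array *new = t->spare;
        int i, j = 0;

        for (i = 0; i < t->primary->size; i++)
            if (i != victim)
                new->entries[j++] = t->primary->entries[i];
        new->size = j;

        t->spare = t->primary;      /* old primary becomes the next spare */
        t->primary = new;           /* kernel: rcu_assign_pointer() + synchronize_rcu() */
    }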
| @@ -207,6 +226,8 @@ struct mem_cgroup { | |||
| 207 | atomic_t refcnt; | 226 | atomic_t refcnt; |
| 208 | 227 | ||
| 209 | unsigned int swappiness; | 228 | unsigned int swappiness; |
| 229 | /* OOM-Killer disable */ | ||
| 230 | int oom_kill_disable; | ||
| 210 | 231 | ||
| 211 | /* set when res.limit == memsw.limit */ | 232 | /* set when res.limit == memsw.limit */ |
| 212 | bool memsw_is_minimum; | 233 | bool memsw_is_minimum; |
| @@ -215,17 +236,19 @@ struct mem_cgroup { | |||
| 215 | struct mutex thresholds_lock; | 236 | struct mutex thresholds_lock; |
| 216 | 237 | ||
| 217 | /* thresholds for memory usage. RCU-protected */ | 238 | /* thresholds for memory usage. RCU-protected */ |
| 218 | struct mem_cgroup_threshold_ary *thresholds; | 239 | struct mem_cgroup_thresholds thresholds; |
| 219 | 240 | ||
| 220 | /* thresholds for mem+swap usage. RCU-protected */ | 241 | /* thresholds for mem+swap usage. RCU-protected */ |
| 221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | 242 | struct mem_cgroup_thresholds memsw_thresholds; |
| 243 | |||
| 244 | /* For oom notifier event fd */ | ||
| 245 | struct list_head oom_notify; | ||
| 222 | 246 | ||
| 223 | /* | 247 | /* |
| 224 | * Should we move charges of a task when a task is moved into this | 248 | * Should we move charges of a task when a task is moved into this |
| 225 | * mem_cgroup ? And what type of charges should we move ? | 249 | * mem_cgroup ? And what type of charges should we move ? |
| 226 | */ | 250 | */ |
| 227 | unsigned long move_charge_at_immigrate; | 251 | unsigned long move_charge_at_immigrate; |
| 228 | |||
| 229 | /* | 252 | /* |
| 230 | * percpu counter. | 253 | * percpu counter. |
| 231 | */ | 254 | */ |
| @@ -239,6 +262,7 @@ struct mem_cgroup { | |||
| 239 | */ | 262 | */ |
| 240 | enum move_type { | 263 | enum move_type { |
| 241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 264 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ |
| 265 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ | ||
| 242 | NR_MOVE_TYPE, | 266 | NR_MOVE_TYPE, |
| 243 | }; | 267 | }; |
| 244 | 268 | ||
| @@ -255,6 +279,18 @@ static struct move_charge_struct { | |||
| 255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 279 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
| 256 | }; | 280 | }; |
| 257 | 281 | ||
| 282 | static bool move_anon(void) | ||
| 283 | { | ||
| 284 | return test_bit(MOVE_CHARGE_TYPE_ANON, | ||
| 285 | &mc.to->move_charge_at_immigrate); | ||
| 286 | } | ||
| 287 | |||
| 288 | static bool move_file(void) | ||
| 289 | { | ||
| 290 | return test_bit(MOVE_CHARGE_TYPE_FILE, | ||
| 291 | &mc.to->move_charge_at_immigrate); | ||
| 292 | } | ||
| 293 | |||
| 258 | /* | 294 | /* |
| 259 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 295 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
| 260 | * limit reclaim to prevent infinite loops, if they ever occur. | 296 | * limit reclaim to prevent infinite loops, if they ever occur. |
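The move_anon()/move_file() helpers added above test individual bits of move_charge_at_immigrate, so the value userspace writes is a bitmask: bit MOVE_CHARGE_TYPE_ANON (value 1) selects anonymous pages, bit MOVE_CHARGE_TYPE_FILE (value 2) selects file pages (including tmpfs), and 3 selects both. A minimal sketch of enabling both from userspace (the cgroupfs mount point and group name are assumptions):

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        /* 1 = anon, 2 = file, 3 = both, matching the MOVE_CHARGE_TYPE_* bits */
        int fd = open("/cgroup/memory/group0/memory.move_charge_at_immigrate",
                      O_WRONLY);
        if (fd < 0)
            return 1;
        write(fd, "3", 1);
        close(fd);
        return 0;
    }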
| @@ -282,9 +318,12 @@ enum charge_type { | |||
| 282 | /* for encoding cft->private value on file */ | 318 | /* for encoding cft->private value on file */ |
| 283 | #define _MEM (0) | 319 | #define _MEM (0) |
| 284 | #define _MEMSWAP (1) | 320 | #define _MEMSWAP (1) |
| 321 | #define _OOM_TYPE (2) | ||
| 285 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 322 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
| 286 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 323 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
| 287 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 324 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
| 325 | /* Used for OOM nofiier */ | ||
| 326 | #define OOM_CONTROL (0) | ||
| 288 | 327 | ||
| 289 | /* | 328 | /* |
| 290 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | 329 | * Reclaim flags for mem_cgroup_hierarchical_reclaim |
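A quick worked example of the cft->private encoding above, using the macros exactly as defined in this hunk: MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL) packs type 2 into the high 16 bits and attribute 0 into the low 16 bits, and MEMFILE_TYPE()/MEMFILE_ATTR() unpack them again.

    #include <stdio.h>

    #define _MEM       (0)
    #define _MEMSWAP   (1)
    #define _OOM_TYPE  (2)
    #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
    #define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
    #define MEMFILE_ATTR(val)       ((val) & 0xffff)
    #define OOM_CONTROL (0)

    int main(void)
    {
        int priv = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL);

        /* prints: private=0x20000 type=2 attr=0 */
        printf("private=%#x type=%d attr=%d\n",
               priv, MEMFILE_TYPE(priv), MEMFILE_ATTR(priv));
        return 0;
    }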
| @@ -1293,14 +1332,62 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) | |||
| 1293 | static DEFINE_MUTEX(memcg_oom_mutex); | 1332 | static DEFINE_MUTEX(memcg_oom_mutex); |
| 1294 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1333 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
| 1295 | 1334 | ||
| 1335 | struct oom_wait_info { | ||
| 1336 | struct mem_cgroup *mem; | ||
| 1337 | wait_queue_t wait; | ||
| 1338 | }; | ||
| 1339 | |||
| 1340 | static int memcg_oom_wake_function(wait_queue_t *wait, | ||
| 1341 | unsigned mode, int sync, void *arg) | ||
| 1342 | { | ||
| 1343 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; | ||
| 1344 | struct oom_wait_info *oom_wait_info; | ||
| 1345 | |||
| 1346 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | ||
| 1347 | |||
| 1348 | if (oom_wait_info->mem == wake_mem) | ||
| 1349 | goto wakeup; | ||
| 1350 | /* if no hierarchy, no match */ | ||
| 1351 | if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) | ||
| 1352 | return 0; | ||
| 1353 | /* | ||
| 1354 | * Both of oom_wait_info->mem and wake_mem are stable under us. | ||
| 1355 | * Then we can use css_is_ancestor without taking care of RCU. | ||
| 1356 | */ | ||
| 1357 | if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && | ||
| 1358 | !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) | ||
| 1359 | return 0; | ||
| 1360 | |||
| 1361 | wakeup: | ||
| 1362 | return autoremove_wake_function(wait, mode, sync, arg); | ||
| 1363 | } | ||
| 1364 | |||
| 1365 | static void memcg_wakeup_oom(struct mem_cgroup *mem) | ||
| 1366 | { | ||
| 1367 | /* for filtering, pass "mem" as argument. */ | ||
| 1368 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); | ||
| 1369 | } | ||
| 1370 | |||
| 1371 | static void memcg_oom_recover(struct mem_cgroup *mem) | ||
| 1372 | { | ||
| 1373 | if (mem->oom_kill_disable && atomic_read(&mem->oom_lock)) | ||
| 1374 | memcg_wakeup_oom(mem); | ||
| 1375 | } | ||
| 1376 | |||
| 1296 | /* | 1377 | /* |
| 1297 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1378 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
| 1298 | */ | 1379 | */ |
| 1299 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | 1380 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) |
| 1300 | { | 1381 | { |
| 1301 | DEFINE_WAIT(wait); | 1382 | struct oom_wait_info owait; |
| 1302 | bool locked; | 1383 | bool locked, need_to_kill; |
| 1303 | 1384 | ||
| 1385 | owait.mem = mem; | ||
| 1386 | owait.wait.flags = 0; | ||
| 1387 | owait.wait.func = memcg_oom_wake_function; | ||
| 1388 | owait.wait.private = current; | ||
| 1389 | INIT_LIST_HEAD(&owait.wait.task_list); | ||
| 1390 | need_to_kill = true; | ||
| 1304 | /* At first, try to OOM lock hierarchy under mem.*/ | 1391 | /* At first, try to OOM lock hierarchy under mem.*/ |
| 1305 | mutex_lock(&memcg_oom_mutex); | 1392 | mutex_lock(&memcg_oom_mutex); |
| 1306 | locked = mem_cgroup_oom_lock(mem); | 1393 | locked = mem_cgroup_oom_lock(mem); |
| @@ -1309,32 +1396,23 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
| 1309 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 1396 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL |
| 1310 | * under OOM is always welcomed, use TASK_KILLABLE here. | 1397 | * under OOM is always welcomed, use TASK_KILLABLE here. |
| 1311 | */ | 1398 | */ |
| 1312 | if (!locked) | 1399 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
| 1313 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | 1400 | if (!locked || mem->oom_kill_disable) |
| 1401 | need_to_kill = false; | ||
| 1402 | if (locked) | ||
| 1403 | mem_cgroup_oom_notify(mem); | ||
| 1314 | mutex_unlock(&memcg_oom_mutex); | 1404 | mutex_unlock(&memcg_oom_mutex); |
| 1315 | 1405 | ||
| 1316 | if (locked) | 1406 | if (need_to_kill) { |
| 1407 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
| 1317 | mem_cgroup_out_of_memory(mem, mask); | 1408 | mem_cgroup_out_of_memory(mem, mask); |
| 1318 | else { | 1409 | } else { |
| 1319 | schedule(); | 1410 | schedule(); |
| 1320 | finish_wait(&memcg_oom_waitq, &wait); | 1411 | finish_wait(&memcg_oom_waitq, &owait.wait); |
| 1321 | } | 1412 | } |
| 1322 | mutex_lock(&memcg_oom_mutex); | 1413 | mutex_lock(&memcg_oom_mutex); |
| 1323 | mem_cgroup_oom_unlock(mem); | 1414 | mem_cgroup_oom_unlock(mem); |
| 1324 | /* | 1415 | memcg_wakeup_oom(mem); |
| 1325 | * Here, we use global waitq .....more fine grained waitq ? | ||
| 1326 | * Assume following hierarchy. | ||
| 1327 | * A/ | ||
| 1328 | * 01 | ||
| 1329 | * 02 | ||
| 1330 | * assume OOM happens both in A and 01 at the same time. Tthey are | ||
| 1331 | * mutually exclusive by lock. (kill in 01 helps A.) | ||
| 1332 | * When we use per memcg waitq, we have to wake up waiters on A and 02 | ||
| 1333 | * in addtion to waiters on 01. We use global waitq for avoiding mess. | ||
| 1334 | * It will not be a big problem. | ||
| 1335 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
| 1336 | */ | ||
| 1337 | wake_up_all(&memcg_oom_waitq); | ||
| 1338 | mutex_unlock(&memcg_oom_mutex); | 1416 | mutex_unlock(&memcg_oom_mutex); |
| 1339 | 1417 | ||
| 1340 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 1418 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) |
| @@ -2118,15 +2196,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
| 2118 | /* If swapout, usage of swap doesn't decrease */ | 2196 | /* If swapout, usage of swap doesn't decrease */ |
| 2119 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2197 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
| 2120 | uncharge_memsw = false; | 2198 | uncharge_memsw = false; |
| 2121 | /* | ||
| 2122 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
| 2123 | * In those cases, all pages freed continously can be expected to be in | ||
| 2124 | * the same cgroup and we have chance to coalesce uncharges. | ||
| 2125 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
| 2126 | * because we want to do uncharge as soon as possible. | ||
| 2127 | */ | ||
| 2128 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
| 2129 | goto direct_uncharge; | ||
| 2130 | 2199 | ||
| 2131 | batch = &current->memcg_batch; | 2200 | batch = &current->memcg_batch; |
| 2132 | /* | 2201 | /* |
| @@ -2137,6 +2206,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
| 2137 | if (!batch->memcg) | 2206 | if (!batch->memcg) |
| 2138 | batch->memcg = mem; | 2207 | batch->memcg = mem; |
| 2139 | /* | 2208 | /* |
| 2209 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
| 2210 | * In those cases, all pages freed continously can be expected to be in | ||
| 2211 | * the same cgroup and we have chance to coalesce uncharges. | ||
| 2212 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
| 2213 | * because we want to do uncharge as soon as possible. | ||
| 2214 | */ | ||
| 2215 | |||
| 2216 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | ||
| 2217 | goto direct_uncharge; | ||
| 2218 | |||
| 2219 | /* | ||
| 2140 | * In typical case, batch->memcg == mem. This means we can | 2220 | * In typical case, batch->memcg == mem. This means we can |
| 2141 | * merge a series of uncharges to an uncharge of res_counter. | 2221 | * merge a series of uncharges to an uncharge of res_counter. |
| 2142 | * If not, we uncharge res_counter ony by one. | 2222 | * If not, we uncharge res_counter ony by one. |
| @@ -2152,6 +2232,8 @@ direct_uncharge: | |||
| 2152 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2232 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
| 2153 | if (uncharge_memsw) | 2233 | if (uncharge_memsw) |
| 2154 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 2234 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); |
| 2235 | if (unlikely(batch->memcg != mem)) | ||
| 2236 | memcg_oom_recover(mem); | ||
| 2155 | return; | 2237 | return; |
| 2156 | } | 2238 | } |
| 2157 | 2239 | ||
| @@ -2188,7 +2270,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 2188 | switch (ctype) { | 2270 | switch (ctype) { |
| 2189 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 2271 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
| 2190 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 2272 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
| 2191 | if (page_mapped(page)) | 2273 | /* See mem_cgroup_prepare_migration() */ |
| 2274 | if (page_mapped(page) || PageCgroupMigration(pc)) | ||
| 2192 | goto unlock_out; | 2275 | goto unlock_out; |
| 2193 | break; | 2276 | break; |
| 2194 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | 2277 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: |
| @@ -2288,6 +2371,7 @@ void mem_cgroup_uncharge_end(void) | |||
| 2288 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | 2371 | res_counter_uncharge(&batch->memcg->res, batch->bytes); |
| 2289 | if (batch->memsw_bytes) | 2372 | if (batch->memsw_bytes) |
| 2290 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | 2373 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); |
| 2374 | memcg_oom_recover(batch->memcg); | ||
| 2291 | /* forget this pointer (for sanity check) */ | 2375 | /* forget this pointer (for sanity check) */ |
| 2292 | batch->memcg = NULL; | 2376 | batch->memcg = NULL; |
| 2293 | } | 2377 | } |
| @@ -2410,10 +2494,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
| 2410 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 2494 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
| 2411 | * page belongs to. | 2495 | * page belongs to. |
| 2412 | */ | 2496 | */ |
| 2413 | int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | 2497 | int mem_cgroup_prepare_migration(struct page *page, |
| 2498 | struct page *newpage, struct mem_cgroup **ptr) | ||
| 2414 | { | 2499 | { |
| 2415 | struct page_cgroup *pc; | 2500 | struct page_cgroup *pc; |
| 2416 | struct mem_cgroup *mem = NULL; | 2501 | struct mem_cgroup *mem = NULL; |
| 2502 | enum charge_type ctype; | ||
| 2417 | int ret = 0; | 2503 | int ret = 0; |
| 2418 | 2504 | ||
| 2419 | if (mem_cgroup_disabled()) | 2505 | if (mem_cgroup_disabled()) |
| @@ -2424,69 +2510,125 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
| 2424 | if (PageCgroupUsed(pc)) { | 2510 | if (PageCgroupUsed(pc)) { |
| 2425 | mem = pc->mem_cgroup; | 2511 | mem = pc->mem_cgroup; |
| 2426 | css_get(&mem->css); | 2512 | css_get(&mem->css); |
| 2513 | /* | ||
| 2514 | * At migrating an anonymous page, its mapcount goes down | ||
| 2515 | * to 0 and uncharge() will be called. But, even if it's fully | ||
| 2516 | * unmapped, migration may fail and this page has to be | ||
| 2517 | * charged again. We set MIGRATION flag here and delay uncharge | ||
| 2518 | * until end_migration() is called | ||
| 2519 | * | ||
| 2520 | * Corner Case Thinking | ||
| 2521 | * A) | ||
| 2522 | * When the old page was mapped as Anon and it's unmap-and-freed | ||
| 2523 | * while migration was ongoing. | ||
| 2524 | * If unmap finds the old page, uncharge() of it will be delayed | ||
| 2525 | * until end_migration(). If unmap finds a new page, it's | ||
| 2526 | * uncharged when it make mapcount to be 1->0. If unmap code | ||
| 2527 | * finds swap_migration_entry, the new page will not be mapped | ||
| 2528 | * and end_migration() will find it(mapcount==0). | ||
| 2529 | * | ||
| 2530 | * B) | ||
| 2531 | * When the old page was mapped but migraion fails, the kernel | ||
| 2532 | * remaps it. A charge for it is kept by MIGRATION flag even | ||
| 2533 | * if mapcount goes down to 0. We can do remap successfully | ||
| 2534 | * without charging it again. | ||
| 2535 | * | ||
| 2536 | * C) | ||
| 2537 | * The "old" page is under lock_page() until the end of | ||
| 2538 | * migration, so, the old page itself will not be swapped-out. | ||
| 2539 | * If the new page is swapped out before end_migraton, our | ||
| 2540 | * hook to usual swap-out path will catch the event. | ||
| 2541 | */ | ||
| 2542 | if (PageAnon(page)) | ||
| 2543 | SetPageCgroupMigration(pc); | ||
| 2427 | } | 2544 | } |
| 2428 | unlock_page_cgroup(pc); | 2545 | unlock_page_cgroup(pc); |
| 2546 | /* | ||
| 2547 | * If the page is not charged at this point, | ||
| 2548 | * we return here. | ||
| 2549 | */ | ||
| 2550 | if (!mem) | ||
| 2551 | return 0; | ||
| 2429 | 2552 | ||
| 2430 | *ptr = mem; | 2553 | *ptr = mem; |
| 2431 | if (mem) { | 2554 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); |
| 2432 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); | 2555 | css_put(&mem->css);/* drop extra refcnt */ |
| 2433 | css_put(&mem->css); | 2556 | if (ret || *ptr == NULL) { |
| 2557 | if (PageAnon(page)) { | ||
| 2558 | lock_page_cgroup(pc); | ||
| 2559 | ClearPageCgroupMigration(pc); | ||
| 2560 | unlock_page_cgroup(pc); | ||
| 2561 | /* | ||
| 2562 | * The old page may be fully unmapped while we kept it. | ||
| 2563 | */ | ||
| 2564 | mem_cgroup_uncharge_page(page); | ||
| 2565 | } | ||
| 2566 | return -ENOMEM; | ||
| 2434 | } | 2567 | } |
| 2568 | /* | ||
| 2569 | * We charge new page before it's used/mapped. So, even if unlock_page() | ||
| 2570 | * is called before end_migration, we can catch all events on this new | ||
| 2571 | * page. In the case new page is migrated but not remapped, new page's | ||
| 2572 | * mapcount will be finally 0 and we call uncharge in end_migration(). | ||
| 2573 | */ | ||
| 2574 | pc = lookup_page_cgroup(newpage); | ||
| 2575 | if (PageAnon(page)) | ||
| 2576 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
| 2577 | else if (page_is_file_cache(page)) | ||
| 2578 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
| 2579 | else | ||
| 2580 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
| 2581 | __mem_cgroup_commit_charge(mem, pc, ctype); | ||
| 2435 | return ret; | 2582 | return ret; |
| 2436 | } | 2583 | } |
| 2437 | 2584 | ||
| 2438 | /* remove redundant charge if migration failed*/ | 2585 | /* remove redundant charge if migration failed*/ |
| 2439 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 2586 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
| 2440 | struct page *oldpage, struct page *newpage) | 2587 | struct page *oldpage, struct page *newpage) |
| 2441 | { | 2588 | { |
| 2442 | struct page *target, *unused; | 2589 | struct page *used, *unused; |
| 2443 | struct page_cgroup *pc; | 2590 | struct page_cgroup *pc; |
| 2444 | enum charge_type ctype; | ||
| 2445 | 2591 | ||
| 2446 | if (!mem) | 2592 | if (!mem) |
| 2447 | return; | 2593 | return; |
| 2594 | /* blocks rmdir() */ | ||
| 2448 | cgroup_exclude_rmdir(&mem->css); | 2595 | cgroup_exclude_rmdir(&mem->css); |
| 2449 | /* at migration success, oldpage->mapping is NULL. */ | 2596 | /* at migration success, oldpage->mapping is NULL. */ |
| 2450 | if (oldpage->mapping) { | 2597 | if (oldpage->mapping) { |
| 2451 | target = oldpage; | 2598 | used = oldpage; |
| 2452 | unused = NULL; | 2599 | unused = newpage; |
| 2453 | } else { | 2600 | } else { |
| 2454 | target = newpage; | 2601 | used = newpage; |
| 2455 | unused = oldpage; | 2602 | unused = oldpage; |
| 2456 | } | 2603 | } |
| 2457 | |||
| 2458 | if (PageAnon(target)) | ||
| 2459 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
| 2460 | else if (page_is_file_cache(target)) | ||
| 2461 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
| 2462 | else | ||
| 2463 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
| 2464 | |||
| 2465 | /* unused page is not on radix-tree now. */ | ||
| 2466 | if (unused) | ||
| 2467 | __mem_cgroup_uncharge_common(unused, ctype); | ||
| 2468 | |||
| 2469 | pc = lookup_page_cgroup(target); | ||
| 2470 | /* | 2604 | /* |
| 2471 | * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. | 2605 | * We disallowed uncharge of pages under migration because mapcount |
| 2472 | * So, double-counting is effectively avoided. | 2606 | * of the page goes down to zero, temporarly. |
| 2607 | * Clear the flag and check the page should be charged. | ||
| 2473 | */ | 2608 | */ |
| 2474 | __mem_cgroup_commit_charge(mem, pc, ctype); | 2609 | pc = lookup_page_cgroup(oldpage); |
| 2610 | lock_page_cgroup(pc); | ||
| 2611 | ClearPageCgroupMigration(pc); | ||
| 2612 | unlock_page_cgroup(pc); | ||
| 2613 | |||
| 2614 | if (unused != oldpage) | ||
| 2615 | pc = lookup_page_cgroup(unused); | ||
| 2616 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
| 2475 | 2617 | ||
| 2618 | pc = lookup_page_cgroup(used); | ||
| 2476 | /* | 2619 | /* |
| 2477 | * Both of oldpage and newpage are still under lock_page(). | 2620 | * If a page is a file cache, radix-tree replacement is very atomic |
| 2478 | * Then, we don't have to care about race in radix-tree. | 2621 | * and we can skip this check. When it was an Anon page, its mapcount |
| 2479 | * But we have to be careful that this page is unmapped or not. | 2622 | * goes down to 0. But because we added MIGRATION flage, it's not |
| 2480 | * | 2623 | * uncharged yet. There are several case but page->mapcount check |
| 2481 | * There is a case for !page_mapped(). At the start of | 2624 | * and USED bit check in mem_cgroup_uncharge_page() will do enough |
| 2482 | * migration, oldpage was mapped. But now, it's zapped. | 2625 | * check. (see prepare_charge() also) |
| 2483 | * But we know *target* page is not freed/reused under us. | ||
| 2484 | * mem_cgroup_uncharge_page() does all necessary checks. | ||
| 2485 | */ | 2626 | */ |
| 2486 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 2627 | if (PageAnon(used)) |
| 2487 | mem_cgroup_uncharge_page(target); | 2628 | mem_cgroup_uncharge_page(used); |
| 2488 | /* | 2629 | /* |
| 2489 | * At migration, we may charge account against cgroup which has no tasks | 2630 | * At migration, we may charge account against cgroup which has no |
| 2631 | * tasks. | ||
| 2490 | * So, rmdir()->pre_destroy() can be called while we do this charge. | 2632 | * So, rmdir()->pre_destroy() can be called while we do this charge. |
| 2491 | * In that case, we need to call pre_destroy() again. check it here. | 2633 | * In that case, we need to call pre_destroy() again. check it here. |
| 2492 | */ | 2634 | */ |
| @@ -2524,10 +2666,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
| 2524 | unsigned long long val) | 2666 | unsigned long long val) |
| 2525 | { | 2667 | { |
| 2526 | int retry_count; | 2668 | int retry_count; |
| 2527 | u64 memswlimit; | 2669 | u64 memswlimit, memlimit; |
| 2528 | int ret = 0; | 2670 | int ret = 0; |
| 2529 | int children = mem_cgroup_count_children(memcg); | 2671 | int children = mem_cgroup_count_children(memcg); |
| 2530 | u64 curusage, oldusage; | 2672 | u64 curusage, oldusage; |
| 2673 | int enlarge; | ||
| 2531 | 2674 | ||
| 2532 | /* | 2675 | /* |
| 2533 | * For keeping hierarchical_reclaim simple, how long we should retry | 2676 | * For keeping hierarchical_reclaim simple, how long we should retry |
| @@ -2538,6 +2681,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
| 2538 | 2681 | ||
| 2539 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2682 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
| 2540 | 2683 | ||
| 2684 | enlarge = 0; | ||
| 2541 | while (retry_count) { | 2685 | while (retry_count) { |
| 2542 | if (signal_pending(current)) { | 2686 | if (signal_pending(current)) { |
| 2543 | ret = -EINTR; | 2687 | ret = -EINTR; |
| @@ -2555,6 +2699,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
| 2555 | mutex_unlock(&set_limit_mutex); | 2699 | mutex_unlock(&set_limit_mutex); |
| 2556 | break; | 2700 | break; |
| 2557 | } | 2701 | } |
| 2702 | |||
| 2703 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
| 2704 | if (memlimit < val) | ||
| 2705 | enlarge = 1; | ||
| 2706 | |||
| 2558 | ret = res_counter_set_limit(&memcg->res, val); | 2707 | ret = res_counter_set_limit(&memcg->res, val); |
| 2559 | if (!ret) { | 2708 | if (!ret) { |
| 2560 | if (memswlimit == val) | 2709 | if (memswlimit == val) |
| @@ -2576,6 +2725,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
| 2576 | else | 2725 | else |
| 2577 | oldusage = curusage; | 2726 | oldusage = curusage; |
| 2578 | } | 2727 | } |
| 2728 | if (!ret && enlarge) | ||
| 2729 | memcg_oom_recover(memcg); | ||
| 2579 | 2730 | ||
| 2580 | return ret; | 2731 | return ret; |
| 2581 | } | 2732 | } |
| @@ -2584,9 +2735,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
| 2584 | unsigned long long val) | 2735 | unsigned long long val) |
| 2585 | { | 2736 | { |
| 2586 | int retry_count; | 2737 | int retry_count; |
| 2587 | u64 memlimit, oldusage, curusage; | 2738 | u64 memlimit, memswlimit, oldusage, curusage; |
| 2588 | int children = mem_cgroup_count_children(memcg); | 2739 | int children = mem_cgroup_count_children(memcg); |
| 2589 | int ret = -EBUSY; | 2740 | int ret = -EBUSY; |
| 2741 | int enlarge = 0; | ||
| 2590 | 2742 | ||
| 2591 | /* see mem_cgroup_resize_res_limit */ | 2743 | /* see mem_cgroup_resize_res_limit */ |
| 2592 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 2744 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; |
| @@ -2608,6 +2760,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
| 2608 | mutex_unlock(&set_limit_mutex); | 2760 | mutex_unlock(&set_limit_mutex); |
| 2609 | break; | 2761 | break; |
| 2610 | } | 2762 | } |
| 2763 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
| 2764 | if (memswlimit < val) | ||
| 2765 | enlarge = 1; | ||
| 2611 | ret = res_counter_set_limit(&memcg->memsw, val); | 2766 | ret = res_counter_set_limit(&memcg->memsw, val); |
| 2612 | if (!ret) { | 2767 | if (!ret) { |
| 2613 | if (memlimit == val) | 2768 | if (memlimit == val) |
| @@ -2630,6 +2785,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
| 2630 | else | 2785 | else |
| 2631 | oldusage = curusage; | 2786 | oldusage = curusage; |
| 2632 | } | 2787 | } |
| 2788 | if (!ret && enlarge) | ||
| 2789 | memcg_oom_recover(memcg); | ||
| 2633 | return ret; | 2790 | return ret; |
| 2634 | } | 2791 | } |
| 2635 | 2792 | ||
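Both resize paths above record `enlarge` when the new limit is higher than the current one and, on success, call memcg_oom_recover(), so tasks sleeping in mem_cgroup_handle_oom() with the OOM killer disabled are woken once more room becomes available. In practice an administrator can unstick such a group just by raising its limit; a minimal sketch, with the cgroupfs path as an assumption:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "/cgroup/memory/group0/memory.limit_in_bytes";
        const char *val  = "512M";   /* anything above the current limit */
        int fd = open(path, O_WRONLY);

        if (fd < 0)
            return 1;
        /* an enlarged limit triggers memcg_oom_recover() and wakes sleepers */
        write(fd, val, strlen(val));
        close(fd);
        return 0;
    }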
| @@ -2821,6 +2978,7 @@ move_account: | |||
| 2821 | if (ret) | 2978 | if (ret) |
| 2822 | break; | 2979 | break; |
| 2823 | } | 2980 | } |
| 2981 | memcg_oom_recover(mem); | ||
| 2824 | /* it seems parent cgroup doesn't have enough mem */ | 2982 | /* it seems parent cgroup doesn't have enough mem */ |
| 2825 | if (ret == -ENOMEM) | 2983 | if (ret == -ENOMEM) |
| 2826 | goto try_to_free; | 2984 | goto try_to_free; |
| @@ -3311,9 +3469,9 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
| 3311 | 3469 | ||
| 3312 | rcu_read_lock(); | 3470 | rcu_read_lock(); |
| 3313 | if (!swap) | 3471 | if (!swap) |
| 3314 | t = rcu_dereference(memcg->thresholds); | 3472 | t = rcu_dereference(memcg->thresholds.primary); |
| 3315 | else | 3473 | else |
| 3316 | t = rcu_dereference(memcg->memsw_thresholds); | 3474 | t = rcu_dereference(memcg->memsw_thresholds.primary); |
| 3317 | 3475 | ||
| 3318 | if (!t) | 3476 | if (!t) |
| 3319 | goto unlock; | 3477 | goto unlock; |
| @@ -3325,7 +3483,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
| 3325 | * If it's not true, a threshold was crossed after last | 3483 | * If it's not true, a threshold was crossed after last |
| 3326 | * call of __mem_cgroup_threshold(). | 3484 | * call of __mem_cgroup_threshold(). |
| 3327 | */ | 3485 | */ |
| 3328 | i = atomic_read(&t->current_threshold); | 3486 | i = t->current_threshold; |
| 3329 | 3487 | ||
| 3330 | /* | 3488 | /* |
| 3331 | * Iterate backward over array of thresholds starting from | 3489 | * Iterate backward over array of thresholds starting from |
| @@ -3349,7 +3507,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
| 3349 | eventfd_signal(t->entries[i].eventfd, 1); | 3507 | eventfd_signal(t->entries[i].eventfd, 1); |
| 3350 | 3508 | ||
| 3351 | /* Update current_threshold */ | 3509 | /* Update current_threshold */ |
| 3352 | atomic_set(&t->current_threshold, i - 1); | 3510 | t->current_threshold = i - 1; |
| 3353 | unlock: | 3511 | unlock: |
| 3354 | rcu_read_unlock(); | 3512 | rcu_read_unlock(); |
| 3355 | } | 3513 | } |
| @@ -3369,106 +3527,117 @@ static int compare_thresholds(const void *a, const void *b) | |||
| 3369 | return _a->threshold - _b->threshold; | 3527 | return _a->threshold - _b->threshold; |
| 3370 | } | 3528 | } |
| 3371 | 3529 | ||
| 3372 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | 3530 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) |
| 3373 | struct eventfd_ctx *eventfd, const char *args) | 3531 | { |
| 3532 | struct mem_cgroup_eventfd_list *ev; | ||
| 3533 | |||
| 3534 | list_for_each_entry(ev, &mem->oom_notify, list) | ||
| 3535 | eventfd_signal(ev->eventfd, 1); | ||
| 3536 | return 0; | ||
| 3537 | } | ||
| 3538 | |||
| 3539 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) | ||
| 3540 | { | ||
| 3541 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); | ||
| 3542 | } | ||
| 3543 | |||
| 3544 | static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | ||
| 3545 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
| 3374 | { | 3546 | { |
| 3375 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3547 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
| 3376 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3548 | struct mem_cgroup_thresholds *thresholds; |
| 3549 | struct mem_cgroup_threshold_ary *new; | ||
| 3377 | int type = MEMFILE_TYPE(cft->private); | 3550 | int type = MEMFILE_TYPE(cft->private); |
| 3378 | u64 threshold, usage; | 3551 | u64 threshold, usage; |
| 3379 | int size; | 3552 | int i, size, ret; |
| 3380 | int i, ret; | ||
| 3381 | 3553 | ||
| 3382 | ret = res_counter_memparse_write_strategy(args, &threshold); | 3554 | ret = res_counter_memparse_write_strategy(args, &threshold); |
| 3383 | if (ret) | 3555 | if (ret) |
| 3384 | return ret; | 3556 | return ret; |
| 3385 | 3557 | ||
| 3386 | mutex_lock(&memcg->thresholds_lock); | 3558 | mutex_lock(&memcg->thresholds_lock); |
| 3559 | |||
| 3387 | if (type == _MEM) | 3560 | if (type == _MEM) |
| 3388 | thresholds = memcg->thresholds; | 3561 | thresholds = &memcg->thresholds; |
| 3389 | else if (type == _MEMSWAP) | 3562 | else if (type == _MEMSWAP) |
| 3390 | thresholds = memcg->memsw_thresholds; | 3563 | thresholds = &memcg->memsw_thresholds; |
| 3391 | else | 3564 | else |
| 3392 | BUG(); | 3565 | BUG(); |
| 3393 | 3566 | ||
| 3394 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | 3567 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); |
| 3395 | 3568 | ||
| 3396 | /* Check if a threshold crossed before adding a new one */ | 3569 | /* Check if a threshold crossed before adding a new one */ |
| 3397 | if (thresholds) | 3570 | if (thresholds->primary) |
| 3398 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 3571 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
| 3399 | 3572 | ||
| 3400 | if (thresholds) | 3573 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; |
| 3401 | size = thresholds->size + 1; | ||
| 3402 | else | ||
| 3403 | size = 1; | ||
| 3404 | 3574 | ||
| 3405 | /* Allocate memory for new array of thresholds */ | 3575 | /* Allocate memory for new array of thresholds */ |
| 3406 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | 3576 | new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), |
| 3407 | size * sizeof(struct mem_cgroup_threshold), | ||
| 3408 | GFP_KERNEL); | 3577 | GFP_KERNEL); |
| 3409 | if (!thresholds_new) { | 3578 | if (!new) { |
| 3410 | ret = -ENOMEM; | 3579 | ret = -ENOMEM; |
| 3411 | goto unlock; | 3580 | goto unlock; |
| 3412 | } | 3581 | } |
| 3413 | thresholds_new->size = size; | 3582 | new->size = size; |
| 3414 | 3583 | ||
| 3415 | /* Copy thresholds (if any) to new array */ | 3584 | /* Copy thresholds (if any) to new array */ |
| 3416 | if (thresholds) | 3585 | if (thresholds->primary) { |
| 3417 | memcpy(thresholds_new->entries, thresholds->entries, | 3586 | memcpy(new->entries, thresholds->primary->entries, (size - 1) * |
| 3418 | thresholds->size * | ||
| 3419 | sizeof(struct mem_cgroup_threshold)); | 3587 | sizeof(struct mem_cgroup_threshold)); |
| 3588 | } | ||
| 3589 | |||
| 3420 | /* Add new threshold */ | 3590 | /* Add new threshold */ |
| 3421 | thresholds_new->entries[size - 1].eventfd = eventfd; | 3591 | new->entries[size - 1].eventfd = eventfd; |
| 3422 | thresholds_new->entries[size - 1].threshold = threshold; | 3592 | new->entries[size - 1].threshold = threshold; |
| 3423 | 3593 | ||
| 3424 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | 3594 | /* Sort thresholds. Registering of new threshold isn't time-critical */ |
| 3425 | sort(thresholds_new->entries, size, | 3595 | sort(new->entries, size, sizeof(struct mem_cgroup_threshold), |
| 3426 | sizeof(struct mem_cgroup_threshold), | ||
| 3427 | compare_thresholds, NULL); | 3596 | compare_thresholds, NULL); |
| 3428 | 3597 | ||
| 3429 | /* Find current threshold */ | 3598 | /* Find current threshold */ |
| 3430 | atomic_set(&thresholds_new->current_threshold, -1); | 3599 | new->current_threshold = -1; |
| 3431 | for (i = 0; i < size; i++) { | 3600 | for (i = 0; i < size; i++) { |
| 3432 | if (thresholds_new->entries[i].threshold < usage) { | 3601 | if (new->entries[i].threshold < usage) { |
| 3433 | /* | 3602 | /* |
| 3434 | * thresholds_new->current_threshold will not be used | 3603 | * new->current_threshold will not be used until |
| 3435 | * until rcu_assign_pointer(), so it's safe to increment | 3604 | * rcu_assign_pointer(), so it's safe to increment |
| 3436 | * it here. | 3605 | * it here. |
| 3437 | */ | 3606 | */ |
| 3438 | atomic_inc(&thresholds_new->current_threshold); | 3607 | ++new->current_threshold; |
| 3439 | } | 3608 | } |
| 3440 | } | 3609 | } |
| 3441 | 3610 | ||
| 3442 | if (type == _MEM) | 3611 | /* Free old spare buffer and save old primary buffer as spare */ |
| 3443 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | 3612 | kfree(thresholds->spare); |
| 3444 | else | 3613 | thresholds->spare = thresholds->primary; |
| 3445 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | 3614 | |
| 3615 | rcu_assign_pointer(thresholds->primary, new); | ||
| 3446 | 3616 | ||
| 3447 | /* To be sure that nobody uses thresholds before freeing it */ | 3617 | /* To be sure that nobody uses thresholds */ |
| 3448 | synchronize_rcu(); | 3618 | synchronize_rcu(); |
| 3449 | 3619 | ||
| 3450 | kfree(thresholds); | ||
| 3451 | unlock: | 3620 | unlock: |
| 3452 | mutex_unlock(&memcg->thresholds_lock); | 3621 | mutex_unlock(&memcg->thresholds_lock); |
| 3453 | 3622 | ||
| 3454 | return ret; | 3623 | return ret; |
| 3455 | } | 3624 | } |
| 3456 | 3625 | ||
| 3457 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | 3626 | static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, |
| 3458 | struct eventfd_ctx *eventfd) | 3627 | struct cftype *cft, struct eventfd_ctx *eventfd) |
| 3459 | { | 3628 | { |
| 3460 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3629 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
| 3461 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3630 | struct mem_cgroup_thresholds *thresholds; |
| 3631 | struct mem_cgroup_threshold_ary *new; | ||
| 3462 | int type = MEMFILE_TYPE(cft->private); | 3632 | int type = MEMFILE_TYPE(cft->private); |
| 3463 | u64 usage; | 3633 | u64 usage; |
| 3464 | int size = 0; | 3634 | int i, j, size; |
| 3465 | int i, j, ret; | ||
| 3466 | 3635 | ||
| 3467 | mutex_lock(&memcg->thresholds_lock); | 3636 | mutex_lock(&memcg->thresholds_lock); |
| 3468 | if (type == _MEM) | 3637 | if (type == _MEM) |
| 3469 | thresholds = memcg->thresholds; | 3638 | thresholds = &memcg->thresholds; |
| 3470 | else if (type == _MEMSWAP) | 3639 | else if (type == _MEMSWAP) |
| 3471 | thresholds = memcg->memsw_thresholds; | 3640 | thresholds = &memcg->memsw_thresholds; |
| 3472 | else | 3641 | else |
| 3473 | BUG(); | 3642 | BUG(); |
| 3474 | 3643 | ||
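For context, mem_cgroup_usage_register_event() above is what runs when userspace attaches an eventfd to memory.usage_in_bytes (or memory.memsw.usage_in_bytes) through cgroup.event_control, with the threshold passed as the args string and parsed by res_counter_memparse_write_strategy(). A minimal registration sketch, assuming a memory cgroup mounted at /cgroup/memory/group0:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
        int efd = eventfd(0, 0);
        int ufd = open("/cgroup/memory/group0/memory.usage_in_bytes", O_RDONLY);
        int cfd = open("/cgroup/memory/group0/cgroup.event_control", O_WRONLY);
        char buf[64];
        uint64_t cnt;

        if (efd < 0 || ufd < 0 || cfd < 0)
            return 1;
        /* "<event_fd> <fd of memory.usage_in_bytes> <threshold>" */
        snprintf(buf, sizeof(buf), "%d %d 64M", efd, ufd);
        write(cfd, buf, strlen(buf));

        read(efd, &cnt, sizeof(cnt));   /* blocks until usage crosses 64M */
        printf("usage threshold crossed (count=%llu)\n",
               (unsigned long long)cnt);
        return 0;
    }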
| @@ -3484,59 +3653,136 @@ static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | |||
| 3484 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 3653 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
| 3485 | 3654 | ||
| 3486 | /* Calculate new number of threshold */ | 3655 | /* Calculate new number of threshold */ |
| 3487 | for (i = 0; i < thresholds->size; i++) { | 3656 | size = 0; |
| 3488 | if (thresholds->entries[i].eventfd != eventfd) | 3657 | for (i = 0; i < thresholds->primary->size; i++) { |
| 3658 | if (thresholds->primary->entries[i].eventfd != eventfd) | ||
| 3489 | size++; | 3659 | size++; |
| 3490 | } | 3660 | } |
| 3491 | 3661 | ||
| 3662 | new = thresholds->spare; | ||
| 3663 | |||
| 3492 | /* Set thresholds array to NULL if we don't have thresholds */ | 3664 | /* Set thresholds array to NULL if we don't have thresholds */ |
| 3493 | if (!size) { | 3665 | if (!size) { |
| 3494 | thresholds_new = NULL; | 3666 | kfree(new); |
| 3495 | goto assign; | 3667 | new = NULL; |
| 3668 | goto swap_buffers; | ||
| 3496 | } | 3669 | } |
| 3497 | 3670 | ||
| 3498 | /* Allocate memory for new array of thresholds */ | 3671 | new->size = size; |
| 3499 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
| 3500 | size * sizeof(struct mem_cgroup_threshold), | ||
| 3501 | GFP_KERNEL); | ||
| 3502 | if (!thresholds_new) { | ||
| 3503 | ret = -ENOMEM; | ||
| 3504 | goto unlock; | ||
| 3505 | } | ||
| 3506 | thresholds_new->size = size; | ||
| 3507 | 3672 | ||
| 3508 | /* Copy thresholds and find current threshold */ | 3673 | /* Copy thresholds and find current threshold */ |
| 3509 | atomic_set(&thresholds_new->current_threshold, -1); | 3674 | new->current_threshold = -1; |
| 3510 | for (i = 0, j = 0; i < thresholds->size; i++) { | 3675 | for (i = 0, j = 0; i < thresholds->primary->size; i++) { |
| 3511 | if (thresholds->entries[i].eventfd == eventfd) | 3676 | if (thresholds->primary->entries[i].eventfd == eventfd) |
| 3512 | continue; | 3677 | continue; |
| 3513 | 3678 | ||
| 3514 | thresholds_new->entries[j] = thresholds->entries[i]; | 3679 | new->entries[j] = thresholds->primary->entries[i]; |
| 3515 | if (thresholds_new->entries[j].threshold < usage) { | 3680 | if (new->entries[j].threshold < usage) { |
| 3516 | /* | 3681 | /* |
| 3517 | * thresholds_new->current_threshold will not be used | 3682 | * new->current_threshold will not be used |
| 3518 | * until rcu_assign_pointer(), so it's safe to increment | 3683 | * until rcu_assign_pointer(), so it's safe to increment |
| 3519 | * it here. | 3684 | * it here. |
| 3520 | */ | 3685 | */ |
| 3521 | atomic_inc(&thresholds_new->current_threshold); | 3686 | ++new->current_threshold; |
| 3522 | } | 3687 | } |
| 3523 | j++; | 3688 | j++; |
| 3524 | } | 3689 | } |
| 3525 | 3690 | ||
| 3526 | assign: | 3691 | swap_buffers: |
| 3527 | if (type == _MEM) | 3692 | /* Swap primary and spare array */ |
| 3528 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | 3693 | thresholds->spare = thresholds->primary; |
| 3529 | else | 3694 | rcu_assign_pointer(thresholds->primary, new); |
| 3530 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
| 3531 | 3695 | ||
| 3532 | /* To be sure that nobody uses thresholds before freeing it */ | 3696 | /* To be sure that nobody uses thresholds */ |
| 3533 | synchronize_rcu(); | 3697 | synchronize_rcu(); |
| 3534 | 3698 | ||
| 3535 | kfree(thresholds); | ||
| 3536 | unlock: | ||
| 3537 | mutex_unlock(&memcg->thresholds_lock); | 3699 | mutex_unlock(&memcg->thresholds_lock); |
| 3700 | } | ||
| 3538 | 3701 | ||
| 3539 | return ret; | 3702 | static int mem_cgroup_oom_register_event(struct cgroup *cgrp, |
| 3703 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
| 3704 | { | ||
| 3705 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
| 3706 | struct mem_cgroup_eventfd_list *event; | ||
| 3707 | int type = MEMFILE_TYPE(cft->private); | ||
| 3708 | |||
| 3709 | BUG_ON(type != _OOM_TYPE); | ||
| 3710 | event = kmalloc(sizeof(*event), GFP_KERNEL); | ||
| 3711 | if (!event) | ||
| 3712 | return -ENOMEM; | ||
| 3713 | |||
| 3714 | mutex_lock(&memcg_oom_mutex); | ||
| 3715 | |||
| 3716 | event->eventfd = eventfd; | ||
| 3717 | list_add(&event->list, &memcg->oom_notify); | ||
| 3718 | |||
| 3719 | /* already in OOM ? */ | ||
| 3720 | if (atomic_read(&memcg->oom_lock)) | ||
| 3721 | eventfd_signal(eventfd, 1); | ||
| 3722 | mutex_unlock(&memcg_oom_mutex); | ||
| 3723 | |||
| 3724 | return 0; | ||
| 3725 | } | ||
| 3726 | |||
| 3727 | static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | ||
| 3728 | struct cftype *cft, struct eventfd_ctx *eventfd) | ||
| 3729 | { | ||
| 3730 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
| 3731 | struct mem_cgroup_eventfd_list *ev, *tmp; | ||
| 3732 | int type = MEMFILE_TYPE(cft->private); | ||
| 3733 | |||
| 3734 | BUG_ON(type != _OOM_TYPE); | ||
| 3735 | |||
| 3736 | mutex_lock(&memcg_oom_mutex); | ||
| 3737 | |||
| 3738 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | ||
| 3739 | if (ev->eventfd == eventfd) { | ||
| 3740 | list_del(&ev->list); | ||
| 3741 | kfree(ev); | ||
| 3742 | } | ||
| 3743 | } | ||
| 3744 | |||
| 3745 | mutex_unlock(&memcg_oom_mutex); | ||
| 3746 | } | ||
| 3747 | |||
| 3748 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | ||
| 3749 | struct cftype *cft, struct cgroup_map_cb *cb) | ||
| 3750 | { | ||
| 3751 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
| 3752 | |||
| 3753 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); | ||
| 3754 | |||
| 3755 | if (atomic_read(&mem->oom_lock)) | ||
| 3756 | cb->fill(cb, "under_oom", 1); | ||
| 3757 | else | ||
| 3758 | cb->fill(cb, "under_oom", 0); | ||
| 3759 | return 0; | ||
| 3760 | } | ||
| 3761 | |||
| 3762 | /* | ||
| 3763 | */ | ||
| 3764 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | ||
| 3765 | struct cftype *cft, u64 val) | ||
| 3766 | { | ||
| 3767 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
| 3768 | struct mem_cgroup *parent; | ||
| 3769 | |||
| 3770 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | ||
| 3771 | if (!cgrp->parent || !((val == 0) || (val == 1))) | ||
| 3772 | return -EINVAL; | ||
| 3773 | |||
| 3774 | parent = mem_cgroup_from_cont(cgrp->parent); | ||
| 3775 | |||
| 3776 | cgroup_lock(); | ||
| 3777 | /* oom-kill-disable is a flag for subhierarchy. */ | ||
| 3778 | if ((parent->use_hierarchy) || | ||
| 3779 | (mem->use_hierarchy && !list_empty(&cgrp->children))) { | ||
| 3780 | cgroup_unlock(); | ||
| 3781 | return -EINVAL; | ||
| 3782 | } | ||
| 3783 | mem->oom_kill_disable = val; | ||
| 3784 | cgroup_unlock(); | ||
| 3785 | return 0; | ||
| 3540 | } | 3786 | } |
| 3541 | 3787 | ||
| 3542 | static struct cftype mem_cgroup_files[] = { | 3788 | static struct cftype mem_cgroup_files[] = { |
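mem_cgroup_oom_register_event() and mem_cgroup_oom_control_write() above back the new memory.oom_control file: writing 1 sets oom_kill_disable, so tasks sleep in mem_cgroup_handle_oom() instead of being killed, and an eventfd registered through cgroup.event_control fires when the group enters OOM. A minimal userspace sketch, with the cgroupfs paths as assumptions:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
        int efd = eventfd(0, 0);
        int ofd = open("/cgroup/memory/group0/memory.oom_control", O_RDWR);
        int cfd = open("/cgroup/memory/group0/cgroup.event_control", O_WRONLY);
        char buf[32];
        uint64_t cnt;

        if (efd < 0 || ofd < 0 || cfd < 0)
            return 1;

        write(ofd, "1", 1);              /* oom_kill_disable = 1 */

        /* "<event_fd> <fd of memory.oom_control>" registers the notifier */
        snprintf(buf, sizeof(buf), "%d %d", efd, ofd);
        write(cfd, buf, strlen(buf));

        read(efd, &cnt, sizeof(cnt));    /* returns when the group hits OOM */
        printf("group0 is under OOM\n");
        return 0;
    }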
| @@ -3544,8 +3790,8 @@ static struct cftype mem_cgroup_files[] = { | |||
| 3544 | .name = "usage_in_bytes", | 3790 | .name = "usage_in_bytes", |
| 3545 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3791 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
| 3546 | .read_u64 = mem_cgroup_read, | 3792 | .read_u64 = mem_cgroup_read, |
| 3547 | .register_event = mem_cgroup_register_event, | 3793 | .register_event = mem_cgroup_usage_register_event, |
| 3548 | .unregister_event = mem_cgroup_unregister_event, | 3794 | .unregister_event = mem_cgroup_usage_unregister_event, |
| 3549 | }, | 3795 | }, |
| 3550 | { | 3796 | { |
| 3551 | .name = "max_usage_in_bytes", | 3797 | .name = "max_usage_in_bytes", |
| @@ -3594,6 +3840,14 @@ static struct cftype mem_cgroup_files[] = { | |||
| 3594 | .read_u64 = mem_cgroup_move_charge_read, | 3840 | .read_u64 = mem_cgroup_move_charge_read, |
| 3595 | .write_u64 = mem_cgroup_move_charge_write, | 3841 | .write_u64 = mem_cgroup_move_charge_write, |
| 3596 | }, | 3842 | }, |
| 3843 | { | ||
| 3844 | .name = "oom_control", | ||
| 3845 | .read_map = mem_cgroup_oom_control_read, | ||
| 3846 | .write_u64 = mem_cgroup_oom_control_write, | ||
| 3847 | .register_event = mem_cgroup_oom_register_event, | ||
| 3848 | .unregister_event = mem_cgroup_oom_unregister_event, | ||
| 3849 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | ||
| 3850 | }, | ||
| 3597 | }; | 3851 | }; |
| 3598 | 3852 | ||
| 3599 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3853 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
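Given the cb->fill() calls in mem_cgroup_oom_control_read(), reading the memory.oom_control file wired up in the cftype entry above should yield two key/value lines, for example:

    oom_kill_disable 0
    under_oom 0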
| @@ -3602,8 +3856,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
| 3602 | .name = "memsw.usage_in_bytes", | 3856 | .name = "memsw.usage_in_bytes", |
| 3603 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3857 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
| 3604 | .read_u64 = mem_cgroup_read, | 3858 | .read_u64 = mem_cgroup_read, |
| 3605 | .register_event = mem_cgroup_register_event, | 3859 | .register_event = mem_cgroup_usage_register_event, |
| 3606 | .unregister_event = mem_cgroup_unregister_event, | 3860 | .unregister_event = mem_cgroup_usage_unregister_event, |
| 3607 | }, | 3861 | }, |
| 3608 | { | 3862 | { |
| 3609 | .name = "memsw.max_usage_in_bytes", | 3863 | .name = "memsw.max_usage_in_bytes", |
| @@ -3831,6 +4085,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 3831 | } else { | 4085 | } else { |
| 3832 | parent = mem_cgroup_from_cont(cont->parent); | 4086 | parent = mem_cgroup_from_cont(cont->parent); |
| 3833 | mem->use_hierarchy = parent->use_hierarchy; | 4087 | mem->use_hierarchy = parent->use_hierarchy; |
| 4088 | mem->oom_kill_disable = parent->oom_kill_disable; | ||
| 3834 | } | 4089 | } |
| 3835 | 4090 | ||
| 3836 | if (parent && parent->use_hierarchy) { | 4091 | if (parent && parent->use_hierarchy) { |
| @@ -3849,6 +4104,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 3849 | } | 4104 | } |
| 3850 | mem->last_scanned_child = 0; | 4105 | mem->last_scanned_child = 0; |
| 3851 | spin_lock_init(&mem->reclaim_param_lock); | 4106 | spin_lock_init(&mem->reclaim_param_lock); |
| 4107 | INIT_LIST_HEAD(&mem->oom_notify); | ||
| 3852 | 4108 | ||
| 3853 | if (parent) | 4109 | if (parent) |
| 3854 | mem->swappiness = get_swappiness(parent); | 4110 | mem->swappiness = get_swappiness(parent); |
| @@ -3976,6 +4232,80 @@ enum mc_target_type { | |||
| 3976 | MC_TARGET_SWAP, | 4232 | MC_TARGET_SWAP, |
| 3977 | }; | 4233 | }; |
| 3978 | 4234 | ||
| 4235 | static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | ||
| 4236 | unsigned long addr, pte_t ptent) | ||
| 4237 | { | ||
| 4238 | struct page *page = vm_normal_page(vma, addr, ptent); | ||
| 4239 | |||
| 4240 | if (!page || !page_mapped(page)) | ||
| 4241 | return NULL; | ||
| 4242 | if (PageAnon(page)) { | ||
| 4243 | /* we don't move shared anon */ | ||
| 4244 | if (!move_anon() || page_mapcount(page) > 2) | ||
| 4245 | return NULL; | ||
| 4246 | } else if (!move_file()) | ||
| 4247 | /* we ignore mapcount for file pages */ | ||
| 4248 | return NULL; | ||
| 4249 | if (!get_page_unless_zero(page)) | ||
| 4250 | return NULL; | ||
| 4251 | |||
| 4252 | return page; | ||
| 4253 | } | ||
| 4254 | |||
| 4255 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | ||
| 4256 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
| 4257 | { | ||
| 4258 | int usage_count; | ||
| 4259 | struct page *page = NULL; | ||
| 4260 | swp_entry_t ent = pte_to_swp_entry(ptent); | ||
| 4261 | |||
| 4262 | if (!move_anon() || non_swap_entry(ent)) | ||
| 4263 | return NULL; | ||
| 4264 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
| 4265 | if (usage_count > 1) { /* we don't move shared anon */ | ||
| 4266 | if (page) | ||
| 4267 | put_page(page); | ||
| 4268 | return NULL; | ||
| 4269 | } | ||
| 4270 | if (do_swap_account) | ||
| 4271 | entry->val = ent.val; | ||
| 4272 | |||
| 4273 | return page; | ||
| 4274 | } | ||
| 4275 | |||
| 4276 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | ||
| 4277 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
| 4278 | { | ||
| 4279 | struct page *page = NULL; | ||
| 4280 | struct inode *inode; | ||
| 4281 | struct address_space *mapping; | ||
| 4282 | pgoff_t pgoff; | ||
| 4283 | |||
| 4284 | if (!vma->vm_file) /* anonymous vma */ | ||
| 4285 | return NULL; | ||
| 4286 | if (!move_file()) | ||
| 4287 | return NULL; | ||
| 4288 | |||
| 4289 | inode = vma->vm_file->f_path.dentry->d_inode; | ||
| 4290 | mapping = vma->vm_file->f_mapping; | ||
| 4291 | if (pte_none(ptent)) | ||
| 4292 | pgoff = linear_page_index(vma, addr); | ||
| 4293 | else /* pte_file(ptent) is true */ | ||
| 4294 | pgoff = pte_to_pgoff(ptent); | ||
| 4295 | |||
| 4296 | /* page is moved even if it's not RSS of this task(page-faulted). */ | ||
| 4297 | if (!mapping_cap_swap_backed(mapping)) { /* normal file */ | ||
| 4298 | page = find_get_page(mapping, pgoff); | ||
| 4299 | } else { /* shmem/tmpfs file. we should take account of swap too. */ | ||
| 4300 | swp_entry_t ent; | ||
| 4301 | mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); | ||
| 4302 | if (do_swap_account) | ||
| 4303 | entry->val = ent.val; | ||
| 4304 | } | ||
| 4305 | |||
| 4306 | return page; | ||
| 4307 | } | ||
| 4308 | |||
| 3979 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | 4309 | static int is_target_pte_for_mc(struct vm_area_struct *vma, |
| 3980 | unsigned long addr, pte_t ptent, union mc_target *target) | 4310 | unsigned long addr, pte_t ptent, union mc_target *target) |
| 3981 | { | 4311 | { |
| @@ -3983,43 +4313,16 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
| 3983 | struct page_cgroup *pc; | 4313 | struct page_cgroup *pc; |
| 3984 | int ret = 0; | 4314 | int ret = 0; |
| 3985 | swp_entry_t ent = { .val = 0 }; | 4315 | swp_entry_t ent = { .val = 0 }; |
| 3986 | int usage_count = 0; | ||
| 3987 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
| 3988 | &mc.to->move_charge_at_immigrate); | ||
| 3989 | 4316 | ||
| 3990 | if (!pte_present(ptent)) { | 4317 | if (pte_present(ptent)) |
| 3991 | /* TODO: handle swap of shmes/tmpfs */ | 4318 | page = mc_handle_present_pte(vma, addr, ptent); |
| 3992 | if (pte_none(ptent) || pte_file(ptent)) | 4319 | else if (is_swap_pte(ptent)) |
| 3993 | return 0; | 4320 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); |
| 3994 | else if (is_swap_pte(ptent)) { | 4321 | else if (pte_none(ptent) || pte_file(ptent)) |
| 3995 | ent = pte_to_swp_entry(ptent); | 4322 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
| 3996 | if (!move_anon || non_swap_entry(ent)) | 4323 | |
| 3997 | return 0; | 4324 | if (!page && !ent.val) |
| 3998 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
| 3999 | } | ||
| 4000 | } else { | ||
| 4001 | page = vm_normal_page(vma, addr, ptent); | ||
| 4002 | if (!page || !page_mapped(page)) | ||
| 4003 | return 0; | ||
| 4004 | /* | ||
| 4005 | * TODO: We don't move charges of file(including shmem/tmpfs) | ||
| 4006 | * pages for now. | ||
| 4007 | */ | ||
| 4008 | if (!move_anon || !PageAnon(page)) | ||
| 4009 | return 0; | ||
| 4010 | if (!get_page_unless_zero(page)) | ||
| 4011 | return 0; | ||
| 4012 | usage_count = page_mapcount(page); | ||
| 4013 | } | ||
| 4014 | if (usage_count > 1) { | ||
| 4015 | /* | ||
| 4016 | * TODO: We don't move charges of shared(used by multiple | ||
| 4017 | * processes) pages for now. | ||
| 4018 | */ | ||
| 4019 | if (page) | ||
| 4020 | put_page(page); | ||
| 4021 | return 0; | 4325 | return 0; |
| 4022 | } | ||
| 4023 | if (page) { | 4326 | if (page) { |
| 4024 | pc = lookup_page_cgroup(page); | 4327 | pc = lookup_page_cgroup(page); |
| 4025 | /* | 4328 | /* |
| @@ -4035,8 +4338,8 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
| 4035 | if (!ret || !target) | 4338 | if (!ret || !target) |
| 4036 | put_page(page); | 4339 | put_page(page); |
| 4037 | } | 4340 | } |
| 4038 | /* throught */ | 4341 | /* There is a swap entry and a page doesn't exist or isn't charged */ |
| 4039 | if (ent.val && do_swap_account && !ret && | 4342 | if (ent.val && !ret && |
| 4040 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | 4343 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { |
| 4041 | ret = MC_TARGET_SWAP; | 4344 | ret = MC_TARGET_SWAP; |
| 4042 | if (target) | 4345 | if (target) |
| @@ -4077,9 +4380,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
| 4077 | }; | 4380 | }; |
| 4078 | if (is_vm_hugetlb_page(vma)) | 4381 | if (is_vm_hugetlb_page(vma)) |
| 4079 | continue; | 4382 | continue; |
| 4080 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
| 4081 | if (vma->vm_flags & VM_SHARED) | ||
| 4082 | continue; | ||
| 4083 | walk_page_range(vma->vm_start, vma->vm_end, | 4383 | walk_page_range(vma->vm_start, vma->vm_end, |
| 4084 | &mem_cgroup_count_precharge_walk); | 4384 | &mem_cgroup_count_precharge_walk); |
| 4085 | } | 4385 | } |
| @@ -4102,6 +4402,7 @@ static void mem_cgroup_clear_mc(void) | |||
| 4102 | if (mc.precharge) { | 4402 | if (mc.precharge) { |
| 4103 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | 4403 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); |
| 4104 | mc.precharge = 0; | 4404 | mc.precharge = 0; |
| 4405 | memcg_oom_recover(mc.to); | ||
| 4105 | } | 4406 | } |
| 4106 | /* | 4407 | /* |
| 4107 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | 4408 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
| @@ -4110,6 +4411,7 @@ static void mem_cgroup_clear_mc(void) | |||
| 4110 | if (mc.moved_charge) { | 4411 | if (mc.moved_charge) { |
| 4111 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | 4412 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); |
| 4112 | mc.moved_charge = 0; | 4413 | mc.moved_charge = 0; |
| 4414 | memcg_oom_recover(mc.from); | ||
| 4113 | } | 4415 | } |
| 4114 | /* we must fixup refcnts and charges */ | 4416 | /* we must fixup refcnts and charges */ |
| 4115 | if (mc.moved_swap) { | 4417 | if (mc.moved_swap) { |
| @@ -4274,9 +4576,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
| 4274 | }; | 4576 | }; |
| 4275 | if (is_vm_hugetlb_page(vma)) | 4577 | if (is_vm_hugetlb_page(vma)) |
| 4276 | continue; | 4578 | continue; |
| 4277 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
| 4278 | if (vma->vm_flags & VM_SHARED) | ||
| 4279 | continue; | ||
| 4280 | ret = walk_page_range(vma->vm_start, vma->vm_end, | 4579 | ret = walk_page_range(vma->vm_start, vma->vm_end, |
| 4281 | &mem_cgroup_move_charge_walk); | 4580 | &mem_cgroup_move_charge_walk); |
| 4282 | if (ret) | 4581 | if (ret) |
