Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 712 |
1 file changed, 499 insertions, 213 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f711c213d2e..c6ece0a57595 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -149,16 +149,35 @@ struct mem_cgroup_threshold { | |||
149 | u64 threshold; | 149 | u64 threshold; |
150 | }; | 150 | }; |
151 | 151 | ||
152 | /* For threshold */ | ||
152 | struct mem_cgroup_threshold_ary { | 153 | struct mem_cgroup_threshold_ary { |
153 | /* An array index points to threshold just below usage. */ | 154 | /* An array index points to threshold just below usage. */ |
154 | atomic_t current_threshold; | 155 | int current_threshold; |
155 | /* Size of entries[] */ | 156 | /* Size of entries[] */ |
156 | unsigned int size; | 157 | unsigned int size; |
157 | /* Array of thresholds */ | 158 | /* Array of thresholds */ |
158 | struct mem_cgroup_threshold entries[0]; | 159 | struct mem_cgroup_threshold entries[0]; |
159 | }; | 160 | }; |
160 | 161 | ||
162 | struct mem_cgroup_thresholds { | ||
163 | /* Primary thresholds array */ | ||
164 | struct mem_cgroup_threshold_ary *primary; | ||
165 | /* | ||
166 | * Spare threshold array. | ||
167 | * This is needed to make mem_cgroup_unregister_event() "never fail". | ||
168 | * It must be able to store at least primary->size - 1 entries. | ||
169 | */ | ||
170 | struct mem_cgroup_threshold_ary *spare; | ||
171 | }; | ||
172 | |||
173 | /* for OOM */ | ||
174 | struct mem_cgroup_eventfd_list { | ||
175 | struct list_head list; | ||
176 | struct eventfd_ctx *eventfd; | ||
177 | }; | ||
178 | |||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | 179 | static void mem_cgroup_threshold(struct mem_cgroup *mem); |
180 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); | ||
162 | 181 | ||
163 | /* | 182 | /* |
164 | * The memory controller data structure. The memory controller controls both | 183 | * The memory controller data structure. The memory controller controls both |
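
The mem_cgroup_thresholds pair introduced above keeps a pre-allocated "spare" array next to the RCU-published "primary" one, so that mem_cgroup_unregister_event() never has to allocate memory and therefore can never fail: removing an entry always yields at most primary->size - 1 entries, which the spare can hold by construction. A minimal stand-alone sketch of that double-buffer idea, with hypothetical names and plain pointer assignment in place of rcu_assign_pointer()/synchronize_rcu():

/* Double-buffer sketch with hypothetical names; the kernel publishes the
 * new array with rcu_assign_pointer() and waits with synchronize_rcu(),
 * this sketch only swaps plain pointers.  Because removing an entry can
 * only shrink the array, a spare sized for primary->size - 1 entries is
 * always big enough, so the unregister path never allocates and never
 * fails.
 */
struct ary {
	int size;
	int entries[];		/* stands in for mem_cgroup_threshold */
};

struct thresholds {
	struct ary *primary;	/* readers dereference this (under RCU) */
	struct ary *spare;	/* pre-allocated scratch buffer */
};

static void unregister_entry(struct thresholds *t, int victim)
{
	struct ary *new = t->spare;
	int i, j = 0;

	for (i = 0; i < t->primary->size; i++)
		if (t->primary->entries[i] != victim)
			new->entries[j++] = t->primary->entries[i];
	new->size = j;

	t->spare = t->primary;	/* old primary becomes the next spare */
	t->primary = new;	/* kernel: rcu_assign_pointer() + synchronize_rcu() */
}

The register path later in this patch does the inverse: it allocates a larger primary, keeps the old primary as the new spare and kfree()s the previous spare.
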
@@ -207,6 +226,8 @@ struct mem_cgroup { | |||
207 | atomic_t refcnt; | 226 | atomic_t refcnt; |
208 | 227 | ||
209 | unsigned int swappiness; | 228 | unsigned int swappiness; |
229 | /* OOM-Killer disable */ | ||
230 | int oom_kill_disable; | ||
210 | 231 | ||
211 | /* set when res.limit == memsw.limit */ | 232 | /* set when res.limit == memsw.limit */ |
212 | bool memsw_is_minimum; | 233 | bool memsw_is_minimum; |
@@ -215,17 +236,19 @@ struct mem_cgroup { | |||
215 | struct mutex thresholds_lock; | 236 | struct mutex thresholds_lock; |
216 | 237 | ||
217 | /* thresholds for memory usage. RCU-protected */ | 238 | /* thresholds for memory usage. RCU-protected */ |
218 | struct mem_cgroup_threshold_ary *thresholds; | 239 | struct mem_cgroup_thresholds thresholds; |
219 | 240 | ||
220 | /* thresholds for mem+swap usage. RCU-protected */ | 241 | /* thresholds for mem+swap usage. RCU-protected */ |
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | 242 | struct mem_cgroup_thresholds memsw_thresholds; |
243 | |||
244 | /* For oom notifier event fd */ | ||
245 | struct list_head oom_notify; | ||
222 | 246 | ||
223 | /* | 247 | /* |
224 | * Should we move charges of a task when a task is moved into this | 248 | * Should we move charges of a task when a task is moved into this |
225 | * mem_cgroup ? And what type of charges should we move ? | 249 | * mem_cgroup ? And what type of charges should we move ? |
226 | */ | 250 | */ |
227 | unsigned long move_charge_at_immigrate; | 251 | unsigned long move_charge_at_immigrate; |
228 | |||
229 | /* | 252 | /* |
230 | * percpu counter. | 253 | * percpu counter. |
231 | */ | 254 | */ |
@@ -239,6 +262,7 @@ struct mem_cgroup { | |||
239 | */ | 262 | */ |
240 | enum move_type { | 263 | enum move_type { |
241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 264 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ |
265 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ | ||
242 | NR_MOVE_TYPE, | 266 | NR_MOVE_TYPE, |
243 | }; | 267 | }; |
244 | 268 | ||
@@ -255,6 +279,18 @@ static struct move_charge_struct { | |||
255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 279 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
256 | }; | 280 | }; |
257 | 281 | ||
282 | static bool move_anon(void) | ||
283 | { | ||
284 | return test_bit(MOVE_CHARGE_TYPE_ANON, | ||
285 | &mc.to->move_charge_at_immigrate); | ||
286 | } | ||
287 | |||
288 | static bool move_file(void) | ||
289 | { | ||
290 | return test_bit(MOVE_CHARGE_TYPE_FILE, | ||
291 | &mc.to->move_charge_at_immigrate); | ||
292 | } | ||
293 | |||
258 | /* | 294 | /* |
259 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 295 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
260 | * limit reclaim to prevent infinite loops, if they ever occur. | 296 | * limit reclaim to prevent infinite loops, if they ever occur. |
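
move_anon() and move_file() test bits of mc.to->move_charge_at_immigrate that correspond to enum move_type: bit 0 (MOVE_CHARGE_TYPE_ANON) selects anonymous pages, and the newly added bit 1 (MOVE_CHARGE_TYPE_FILE) selects file pages including tmpfs. A small illustration of the bit mask user space would write to the memory.move_charge_at_immigrate control file (the file name is the usual cgroup v1 one, not shown in this hunk):

/* The value written to memory.move_charge_at_immigrate is a bit mask
 * built from enum move_type: bit 0 moves anonymous pages, the new bit 1
 * moves file (including tmpfs) pages.
 */
#include <stdio.h>

enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page (including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

int main(void)
{
	unsigned long mask = (1UL << MOVE_CHARGE_TYPE_ANON) |
			     (1UL << MOVE_CHARGE_TYPE_FILE);

	printf("%lu\n", mask);	/* prints 3: move both anon and file charges */
	return 0;
}
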
@@ -282,9 +318,12 @@ enum charge_type { | |||
282 | /* for encoding cft->private value on file */ | 318 | /* for encoding cft->private value on file */ |
283 | #define _MEM (0) | 319 | #define _MEM (0) |
284 | #define _MEMSWAP (1) | 320 | #define _MEMSWAP (1) |
321 | #define _OOM_TYPE (2) | ||
285 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 322 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
286 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 323 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
287 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 324 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
325 | /* Used for OOM notifier */ | ||
326 | #define OOM_CONTROL (0) | ||
288 | 327 | ||
289 | /* | 328 | /* |
290 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | 329 | * Reclaim flags for mem_cgroup_hierarchical_reclaim |
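
The cft->private encoding packs a resource type into the upper 16 bits and an attribute index into the lower 16; _OOM_TYPE with OOM_CONTROL is the new combination used by the oom_control file added later in this patch. A quick round-trip check of the macros, copied verbatim from the hunk above:

/* Round-trip check of the cft->private encoding, macros copied from the
 * hunk above. */
#include <assert.h>

#define _MEM			(0)
#define _MEMSWAP		(1)
#define _OOM_TYPE		(2)
#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)	((val) & 0xffff)
#define OOM_CONTROL		(0)

int main(void)
{
	int priv = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL);	/* 0x20000 */

	assert(MEMFILE_TYPE(priv) == _OOM_TYPE);
	assert(MEMFILE_ATTR(priv) == OOM_CONTROL);
	return 0;
}
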
@@ -811,12 +850,10 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) | |||
811 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* | 850 | * enabled in "curr" and "curr" is a child of "mem" in *cgroup* |
812 | * hierarchy(even if use_hierarchy is disabled in "mem"). | 851 | * hierarchy(even if use_hierarchy is disabled in "mem"). |
813 | */ | 852 | */ |
814 | rcu_read_lock(); | ||
815 | if (mem->use_hierarchy) | 853 | if (mem->use_hierarchy) |
816 | ret = css_is_ancestor(&curr->css, &mem->css); | 854 | ret = css_is_ancestor(&curr->css, &mem->css); |
817 | else | 855 | else |
818 | ret = (curr == mem); | 856 | ret = (curr == mem); |
819 | rcu_read_unlock(); | ||
820 | css_put(&curr->css); | 857 | css_put(&curr->css); |
821 | return ret; | 858 | return ret; |
822 | } | 859 | } |
@@ -1295,14 +1332,62 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) | |||
1295 | static DEFINE_MUTEX(memcg_oom_mutex); | 1332 | static DEFINE_MUTEX(memcg_oom_mutex); |
1296 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1333 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
1297 | 1334 | ||
1335 | struct oom_wait_info { | ||
1336 | struct mem_cgroup *mem; | ||
1337 | wait_queue_t wait; | ||
1338 | }; | ||
1339 | |||
1340 | static int memcg_oom_wake_function(wait_queue_t *wait, | ||
1341 | unsigned mode, int sync, void *arg) | ||
1342 | { | ||
1343 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; | ||
1344 | struct oom_wait_info *oom_wait_info; | ||
1345 | |||
1346 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | ||
1347 | |||
1348 | if (oom_wait_info->mem == wake_mem) | ||
1349 | goto wakeup; | ||
1350 | /* if no hierarchy, no match */ | ||
1351 | if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) | ||
1352 | return 0; | ||
1353 | /* | ||
1354 | * Both of oom_wait_info->mem and wake_mem are stable under us. | ||
1355 | * Then we can use css_is_ancestor without taking care of RCU. | ||
1356 | */ | ||
1357 | if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && | ||
1358 | !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) | ||
1359 | return 0; | ||
1360 | |||
1361 | wakeup: | ||
1362 | return autoremove_wake_function(wait, mode, sync, arg); | ||
1363 | } | ||
1364 | |||
1365 | static void memcg_wakeup_oom(struct mem_cgroup *mem) | ||
1366 | { | ||
1367 | /* for filtering, pass "mem" as argument. */ | ||
1368 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); | ||
1369 | } | ||
1370 | |||
1371 | static void memcg_oom_recover(struct mem_cgroup *mem) | ||
1372 | { | ||
1373 | if (mem->oom_kill_disable && atomic_read(&mem->oom_lock)) | ||
1374 | memcg_wakeup_oom(mem); | ||
1375 | } | ||
1376 | |||
1298 | /* | 1377 | /* |
1299 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1378 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1300 | */ | 1379 | */ |
1301 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | 1380 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) |
1302 | { | 1381 | { |
1303 | DEFINE_WAIT(wait); | 1382 | struct oom_wait_info owait; |
1304 | bool locked; | 1383 | bool locked, need_to_kill; |
1305 | 1384 | ||
1385 | owait.mem = mem; | ||
1386 | owait.wait.flags = 0; | ||
1387 | owait.wait.func = memcg_oom_wake_function; | ||
1388 | owait.wait.private = current; | ||
1389 | INIT_LIST_HEAD(&owait.wait.task_list); | ||
1390 | need_to_kill = true; | ||
1306 | /* At first, try to OOM lock hierarchy under mem.*/ | 1391 | /* At first, try to OOM lock hierarchy under mem.*/ |
1307 | mutex_lock(&memcg_oom_mutex); | 1392 | mutex_lock(&memcg_oom_mutex); |
1308 | locked = mem_cgroup_oom_lock(mem); | 1393 | locked = mem_cgroup_oom_lock(mem); |
@@ -1311,32 +1396,23 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1311 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 1396 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL |
1312 | * under OOM is always welcomed, use TASK_KILLABLE here. | 1397 | * under OOM is always welcomed, use TASK_KILLABLE here. |
1313 | */ | 1398 | */ |
1314 | if (!locked) | 1399 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
1315 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | 1400 | if (!locked || mem->oom_kill_disable) |
1401 | need_to_kill = false; | ||
1402 | if (locked) | ||
1403 | mem_cgroup_oom_notify(mem); | ||
1316 | mutex_unlock(&memcg_oom_mutex); | 1404 | mutex_unlock(&memcg_oom_mutex); |
1317 | 1405 | ||
1318 | if (locked) | 1406 | if (need_to_kill) { |
1407 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
1319 | mem_cgroup_out_of_memory(mem, mask); | 1408 | mem_cgroup_out_of_memory(mem, mask); |
1320 | else { | 1409 | } else { |
1321 | schedule(); | 1410 | schedule(); |
1322 | finish_wait(&memcg_oom_waitq, &wait); | 1411 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1323 | } | 1412 | } |
1324 | mutex_lock(&memcg_oom_mutex); | 1413 | mutex_lock(&memcg_oom_mutex); |
1325 | mem_cgroup_oom_unlock(mem); | 1414 | mem_cgroup_oom_unlock(mem); |
1326 | /* | 1415 | memcg_wakeup_oom(mem); |
1327 | * Here, we use global waitq .....more fine grained waitq ? | ||
1328 | * Assume following hierarchy. | ||
1329 | * A/ | ||
1330 | * 01 | ||
1331 | * 02 | ||
1332 | * assume OOM happens both in A and 01 at the same time. Tthey are | ||
1333 | * mutually exclusive by lock. (kill in 01 helps A.) | ||
1334 | * When we use per memcg waitq, we have to wake up waiters on A and 02 | ||
1335 | * in addtion to waiters on 01. We use global waitq for avoiding mess. | ||
1336 | * It will not be a big problem. | ||
1337 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
1338 | */ | ||
1339 | wake_up_all(&memcg_oom_waitq); | ||
1340 | mutex_unlock(&memcg_oom_mutex); | 1416 | mutex_unlock(&memcg_oom_mutex); |
1341 | 1417 | ||
1342 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 1418 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) |
@@ -1440,7 +1516,7 @@ static void drain_local_stock(struct work_struct *dummy) | |||
1440 | 1516 | ||
1441 | /* | 1517 | /* |
1442 | * Cache charges(val) which is from res_counter, to local per_cpu area. | 1518 | * Cache charges(val) which is from res_counter, to local per_cpu area. |
1443 | * This will be consumed by consumt_stock() function, later. | 1519 | * This will be consumed by consume_stock() function, later. |
1444 | */ | 1520 | */ |
1445 | static void refill_stock(struct mem_cgroup *mem, int val) | 1521 | static void refill_stock(struct mem_cgroup *mem, int val) |
1446 | { | 1522 | { |
@@ -1603,7 +1679,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1603 | * There is a small race that "from" or "to" can be | 1679 | * There is a small race that "from" or "to" can be |
1604 | * freed by rmdir, so we use css_tryget(). | 1680 | * freed by rmdir, so we use css_tryget(). |
1605 | */ | 1681 | */ |
1606 | rcu_read_lock(); | ||
1607 | from = mc.from; | 1682 | from = mc.from; |
1608 | to = mc.to; | 1683 | to = mc.to; |
1609 | if (from && css_tryget(&from->css)) { | 1684 | if (from && css_tryget(&from->css)) { |
@@ -1624,7 +1699,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1624 | do_continue = (to == mem_over_limit); | 1699 | do_continue = (to == mem_over_limit); |
1625 | css_put(&to->css); | 1700 | css_put(&to->css); |
1626 | } | 1701 | } |
1627 | rcu_read_unlock(); | ||
1628 | if (do_continue) { | 1702 | if (do_continue) { |
1629 | DEFINE_WAIT(wait); | 1703 | DEFINE_WAIT(wait); |
1630 | prepare_to_wait(&mc.waitq, &wait, | 1704 | prepare_to_wait(&mc.waitq, &wait, |
@@ -2122,15 +2196,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2122 | /* If swapout, usage of swap doesn't decrease */ | 2196 | /* If swapout, usage of swap doesn't decrease */ |
2123 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2197 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2124 | uncharge_memsw = false; | 2198 | uncharge_memsw = false; |
2125 | /* | ||
2126 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2127 | * In those cases, all pages freed continously can be expected to be in | ||
2128 | * the same cgroup and we have chance to coalesce uncharges. | ||
2129 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
2130 | * because we want to do uncharge as soon as possible. | ||
2131 | */ | ||
2132 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2133 | goto direct_uncharge; | ||
2134 | 2199 | ||
2135 | batch = ¤t->memcg_batch; | 2200 | batch = ¤t->memcg_batch; |
2136 | /* | 2201 | /* |
@@ -2141,6 +2206,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2141 | if (!batch->memcg) | 2206 | if (!batch->memcg) |
2142 | batch->memcg = mem; | 2207 | batch->memcg = mem; |
2143 | /* | 2208 | /* |
2209 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2210 | * In those cases, all pages freed continuously can be expected to be in | ||
2211 | * the same cgroup and we have a chance to coalesce uncharges. | ||
2212 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
2213 | * because we want to do uncharge as soon as possible. | ||
2214 | */ | ||
2215 | |||
2216 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2217 | goto direct_uncharge; | ||
2218 | |||
2219 | /* | ||
2144 | * In typical case, batch->memcg == mem. This means we can | 2220 | * In typical case, batch->memcg == mem. This means we can |
2145 | * merge a series of uncharges to an uncharge of res_counter. | 2221 | * merge a series of uncharges to an uncharge of res_counter. |
2146 | * If not, we uncharge res_counter one by one. | 2222 | * If not, we uncharge res_counter one by one. |
@@ -2156,6 +2232,8 @@ direct_uncharge: | |||
2156 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2232 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
2157 | if (uncharge_memsw) | 2233 | if (uncharge_memsw) |
2158 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 2234 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); |
2235 | if (unlikely(batch->memcg != mem)) | ||
2236 | memcg_oom_recover(mem); | ||
2159 | return; | 2237 | return; |
2160 | } | 2238 | } |
2161 | 2239 | ||
@@ -2192,7 +2270,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2192 | switch (ctype) { | 2270 | switch (ctype) { |
2193 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 2271 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
2194 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 2272 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2195 | if (page_mapped(page)) | 2273 | /* See mem_cgroup_prepare_migration() */ |
2274 | if (page_mapped(page) || PageCgroupMigration(pc)) | ||
2196 | goto unlock_out; | 2275 | goto unlock_out; |
2197 | break; | 2276 | break; |
2198 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | 2277 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: |
@@ -2292,6 +2371,7 @@ void mem_cgroup_uncharge_end(void) | |||
2292 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | 2371 | res_counter_uncharge(&batch->memcg->res, batch->bytes); |
2293 | if (batch->memsw_bytes) | 2372 | if (batch->memsw_bytes) |
2294 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | 2373 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); |
2374 | memcg_oom_recover(batch->memcg); | ||
2295 | /* forget this pointer (for sanity check) */ | 2375 | /* forget this pointer (for sanity check) */ |
2296 | batch->memcg = NULL; | 2376 | batch->memcg = NULL; |
2297 | } | 2377 | } |
@@ -2314,9 +2394,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) | |||
2314 | 2394 | ||
2315 | /* record memcg information */ | 2395 | /* record memcg information */ |
2316 | if (do_swap_account && swapout && memcg) { | 2396 | if (do_swap_account && swapout && memcg) { |
2317 | rcu_read_lock(); | ||
2318 | swap_cgroup_record(ent, css_id(&memcg->css)); | 2397 | swap_cgroup_record(ent, css_id(&memcg->css)); |
2319 | rcu_read_unlock(); | ||
2320 | mem_cgroup_get(memcg); | 2398 | mem_cgroup_get(memcg); |
2321 | } | 2399 | } |
2322 | if (swapout && memcg) | 2400 | if (swapout && memcg) |
@@ -2373,10 +2451,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
2373 | { | 2451 | { |
2374 | unsigned short old_id, new_id; | 2452 | unsigned short old_id, new_id; |
2375 | 2453 | ||
2376 | rcu_read_lock(); | ||
2377 | old_id = css_id(&from->css); | 2454 | old_id = css_id(&from->css); |
2378 | new_id = css_id(&to->css); | 2455 | new_id = css_id(&to->css); |
2379 | rcu_read_unlock(); | ||
2380 | 2456 | ||
2381 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { | 2457 | if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { |
2382 | mem_cgroup_swap_statistics(from, false); | 2458 | mem_cgroup_swap_statistics(from, false); |
@@ -2418,10 +2494,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
2418 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 2494 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
2419 | * page belongs to. | 2495 | * page belongs to. |
2420 | */ | 2496 | */ |
2421 | int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | 2497 | int mem_cgroup_prepare_migration(struct page *page, |
2498 | struct page *newpage, struct mem_cgroup **ptr) | ||
2422 | { | 2499 | { |
2423 | struct page_cgroup *pc; | 2500 | struct page_cgroup *pc; |
2424 | struct mem_cgroup *mem = NULL; | 2501 | struct mem_cgroup *mem = NULL; |
2502 | enum charge_type ctype; | ||
2425 | int ret = 0; | 2503 | int ret = 0; |
2426 | 2504 | ||
2427 | if (mem_cgroup_disabled()) | 2505 | if (mem_cgroup_disabled()) |
@@ -2432,69 +2510,125 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
2432 | if (PageCgroupUsed(pc)) { | 2510 | if (PageCgroupUsed(pc)) { |
2433 | mem = pc->mem_cgroup; | 2511 | mem = pc->mem_cgroup; |
2434 | css_get(&mem->css); | 2512 | css_get(&mem->css); |
2513 | /* | ||
2514 | * At migrating an anonymous page, its mapcount goes down | ||
2515 | * to 0 and uncharge() will be called. But, even if it's fully | ||
2516 | * unmapped, migration may fail and this page has to be | ||
2517 | * charged again. We set MIGRATION flag here and delay uncharge | ||
2518 | * until end_migration() is called | ||
2519 | * | ||
2520 | * Corner Case Thinking | ||
2521 | * A) | ||
2522 | * When the old page was mapped as Anon and it's unmap-and-freed | ||
2523 | * while migration was ongoing. | ||
2524 | * If unmap finds the old page, uncharge() of it will be delayed | ||
2525 | * until end_migration(). If unmap finds a new page, it's | ||
2526 | * uncharged when it makes the mapcount go 1->0. If unmap code | ||
2527 | * finds swap_migration_entry, the new page will not be mapped | ||
2528 | * and end_migration() will find it(mapcount==0). | ||
2529 | * | ||
2530 | * B) | ||
2531 | * When the old page was mapped but migration fails, the kernel | ||
2532 | * remaps it. A charge for it is kept by MIGRATION flag even | ||
2533 | * if mapcount goes down to 0. We can do remap successfully | ||
2534 | * without charging it again. | ||
2535 | * | ||
2536 | * C) | ||
2537 | * The "old" page is under lock_page() until the end of | ||
2538 | * migration, so, the old page itself will not be swapped-out. | ||
2539 | * If the new page is swapped out before end_migration, our | ||
2540 | * hook to usual swap-out path will catch the event. | ||
2541 | */ | ||
2542 | if (PageAnon(page)) | ||
2543 | SetPageCgroupMigration(pc); | ||
2435 | } | 2544 | } |
2436 | unlock_page_cgroup(pc); | 2545 | unlock_page_cgroup(pc); |
2546 | /* | ||
2547 | * If the page is not charged at this point, | ||
2548 | * we return here. | ||
2549 | */ | ||
2550 | if (!mem) | ||
2551 | return 0; | ||
2437 | 2552 | ||
2438 | *ptr = mem; | 2553 | *ptr = mem; |
2439 | if (mem) { | 2554 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); |
2440 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); | 2555 | css_put(&mem->css);/* drop extra refcnt */ |
2441 | css_put(&mem->css); | 2556 | if (ret || *ptr == NULL) { |
2557 | if (PageAnon(page)) { | ||
2558 | lock_page_cgroup(pc); | ||
2559 | ClearPageCgroupMigration(pc); | ||
2560 | unlock_page_cgroup(pc); | ||
2561 | /* | ||
2562 | * The old page may be fully unmapped while we kept it. | ||
2563 | */ | ||
2564 | mem_cgroup_uncharge_page(page); | ||
2565 | } | ||
2566 | return -ENOMEM; | ||
2442 | } | 2567 | } |
2568 | /* | ||
2569 | * We charge new page before it's used/mapped. So, even if unlock_page() | ||
2570 | * is called before end_migration, we can catch all events on this new | ||
2571 | * page. In the case new page is migrated but not remapped, new page's | ||
2572 | * mapcount will be finally 0 and we call uncharge in end_migration(). | ||
2573 | */ | ||
2574 | pc = lookup_page_cgroup(newpage); | ||
2575 | if (PageAnon(page)) | ||
2576 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
2577 | else if (page_is_file_cache(page)) | ||
2578 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2579 | else | ||
2580 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2581 | __mem_cgroup_commit_charge(mem, pc, ctype); | ||
2443 | return ret; | 2582 | return ret; |
2444 | } | 2583 | } |
2445 | 2584 | ||
2446 | /* remove redundant charge if migration failed*/ | 2585 | /* remove redundant charge if migration failed*/ |
2447 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 2586 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
2448 | struct page *oldpage, struct page *newpage) | 2587 | struct page *oldpage, struct page *newpage) |
2449 | { | 2588 | { |
2450 | struct page *target, *unused; | 2589 | struct page *used, *unused; |
2451 | struct page_cgroup *pc; | 2590 | struct page_cgroup *pc; |
2452 | enum charge_type ctype; | ||
2453 | 2591 | ||
2454 | if (!mem) | 2592 | if (!mem) |
2455 | return; | 2593 | return; |
2594 | /* blocks rmdir() */ | ||
2456 | cgroup_exclude_rmdir(&mem->css); | 2595 | cgroup_exclude_rmdir(&mem->css); |
2457 | /* at migration success, oldpage->mapping is NULL. */ | 2596 | /* at migration success, oldpage->mapping is NULL. */ |
2458 | if (oldpage->mapping) { | 2597 | if (oldpage->mapping) { |
2459 | target = oldpage; | 2598 | used = oldpage; |
2460 | unused = NULL; | 2599 | unused = newpage; |
2461 | } else { | 2600 | } else { |
2462 | target = newpage; | 2601 | used = newpage; |
2463 | unused = oldpage; | 2602 | unused = oldpage; |
2464 | } | 2603 | } |
2465 | |||
2466 | if (PageAnon(target)) | ||
2467 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
2468 | else if (page_is_file_cache(target)) | ||
2469 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2470 | else | ||
2471 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2472 | |||
2473 | /* unused page is not on radix-tree now. */ | ||
2474 | if (unused) | ||
2475 | __mem_cgroup_uncharge_common(unused, ctype); | ||
2476 | |||
2477 | pc = lookup_page_cgroup(target); | ||
2478 | /* | 2604 | /* |
2479 | * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. | 2605 | * We disallowed uncharge of pages under migration because mapcount |
2480 | * So, double-counting is effectively avoided. | 2606 | * of the page goes down to zero, temporarily. |
2607 | * Clear the flag and check whether the page should be charged. | ||
2481 | */ | 2608 | */ |
2482 | __mem_cgroup_commit_charge(mem, pc, ctype); | 2609 | pc = lookup_page_cgroup(oldpage); |
2610 | lock_page_cgroup(pc); | ||
2611 | ClearPageCgroupMigration(pc); | ||
2612 | unlock_page_cgroup(pc); | ||
2483 | 2613 | ||
2614 | if (unused != oldpage) | ||
2615 | pc = lookup_page_cgroup(unused); | ||
2616 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
2617 | |||
2618 | pc = lookup_page_cgroup(used); | ||
2484 | /* | 2619 | /* |
2485 | * Both of oldpage and newpage are still under lock_page(). | 2620 | * If a page is a file cache, radix-tree replacement is very atomic |
2486 | * Then, we don't have to care about race in radix-tree. | 2621 | * and we can skip this check. When it was an Anon page, its mapcount |
2487 | * But we have to be careful that this page is unmapped or not. | 2622 | * goes down to 0. But because we added the MIGRATION flag, it's not |
2488 | * | 2623 | * uncharged yet. There are several cases but the page->mapcount check |
2489 | * There is a case for !page_mapped(). At the start of | 2624 | * and USED bit check in mem_cgroup_uncharge_page() will do enough |
2490 | * migration, oldpage was mapped. But now, it's zapped. | 2625 | * check. (see prepare_charge() also) |
2491 | * But we know *target* page is not freed/reused under us. | ||
2492 | * mem_cgroup_uncharge_page() does all necessary checks. | ||
2493 | */ | 2626 | */ |
2494 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 2627 | if (PageAnon(used)) |
2495 | mem_cgroup_uncharge_page(target); | 2628 | mem_cgroup_uncharge_page(used); |
2496 | /* | 2629 | /* |
2497 | * At migration, we may charge account against cgroup which has no tasks | 2630 | * At migration, we may charge account against cgroup which has no |
2631 | * tasks. | ||
2498 | * So, rmdir()->pre_destroy() can be called while we do this charge. | 2632 | * So, rmdir()->pre_destroy() can be called while we do this charge. |
2499 | * In that case, we need to call pre_destroy() again. check it here. | 2633 | * In that case, we need to call pre_destroy() again. check it here. |
2500 | */ | 2634 | */ |
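
The reworked migration path charges the new page up front in mem_cgroup_prepare_migration() and marks the old page's page_cgroup with the MIGRATION flag so the charge survives the temporary mapcount drop; mem_cgroup_end_migration() clears the flag, uncharges whichever page ended up unused, and rechecks the used anon page. A simplified sketch of the call ordering as a hypothetical caller would see it (stub declarations; the caller and move_page_contents() stand in for mm/migrate.c):

/* Simplified ordering of the migration hooks after this patch; stub
 * declarations keep the sketch self-contained, the caller and
 * move_page_contents() are hypothetical stand-ins for mm/migrate.c.
 */
#include <stddef.h>

struct page;
struct mem_cgroup;

int mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
				 struct mem_cgroup **ptr);
void mem_cgroup_end_migration(struct mem_cgroup *mem,
			      struct page *oldpage, struct page *newpage);
int move_page_contents(struct page *oldpage, struct page *newpage);

static int migrate_one_page(struct page *oldpage, struct page *newpage)
{
	struct mem_cgroup *mem = NULL;
	int rc;

	/* Charges newpage and, for anon pages, sets the MIGRATION flag on
	 * the old page_cgroup so a transient mapcount of 0 does not
	 * uncharge it. */
	rc = mem_cgroup_prepare_migration(oldpage, newpage, &mem);
	if (rc)
		return rc;

	rc = move_page_contents(oldpage, newpage);

	/* Clears the MIGRATION flag, uncharges whichever page is unused
	 * (newpage on failure, oldpage on success) and rechecks the used
	 * anon page. */
	mem_cgroup_end_migration(mem, oldpage, newpage);
	return rc;
}
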
@@ -2532,10 +2666,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2532 | unsigned long long val) | 2666 | unsigned long long val) |
2533 | { | 2667 | { |
2534 | int retry_count; | 2668 | int retry_count; |
2535 | u64 memswlimit; | 2669 | u64 memswlimit, memlimit; |
2536 | int ret = 0; | 2670 | int ret = 0; |
2537 | int children = mem_cgroup_count_children(memcg); | 2671 | int children = mem_cgroup_count_children(memcg); |
2538 | u64 curusage, oldusage; | 2672 | u64 curusage, oldusage; |
2673 | int enlarge; | ||
2539 | 2674 | ||
2540 | /* | 2675 | /* |
2541 | * For keeping hierarchical_reclaim simple, how long we should retry | 2676 | * For keeping hierarchical_reclaim simple, how long we should retry |
@@ -2546,6 +2681,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2546 | 2681 | ||
2547 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2682 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
2548 | 2683 | ||
2684 | enlarge = 0; | ||
2549 | while (retry_count) { | 2685 | while (retry_count) { |
2550 | if (signal_pending(current)) { | 2686 | if (signal_pending(current)) { |
2551 | ret = -EINTR; | 2687 | ret = -EINTR; |
@@ -2563,6 +2699,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2563 | mutex_unlock(&set_limit_mutex); | 2699 | mutex_unlock(&set_limit_mutex); |
2564 | break; | 2700 | break; |
2565 | } | 2701 | } |
2702 | |||
2703 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
2704 | if (memlimit < val) | ||
2705 | enlarge = 1; | ||
2706 | |||
2566 | ret = res_counter_set_limit(&memcg->res, val); | 2707 | ret = res_counter_set_limit(&memcg->res, val); |
2567 | if (!ret) { | 2708 | if (!ret) { |
2568 | if (memswlimit == val) | 2709 | if (memswlimit == val) |
@@ -2584,6 +2725,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2584 | else | 2725 | else |
2585 | oldusage = curusage; | 2726 | oldusage = curusage; |
2586 | } | 2727 | } |
2728 | if (!ret && enlarge) | ||
2729 | memcg_oom_recover(memcg); | ||
2587 | 2730 | ||
2588 | return ret; | 2731 | return ret; |
2589 | } | 2732 | } |
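
Because mem_cgroup_resize_limit() now calls memcg_oom_recover() whenever the limit was enlarged, raising memory.limit_in_bytes from user space is enough to un-freeze tasks sleeping in mem_cgroup_handle_oom() with the OOM killer disabled. A minimal example, assuming a cgroup v1 memory hierarchy mounted at /cgroup/memory with a group named A:

/* Un-freeze a memcg whose tasks sleep under OOM with the killer disabled:
 * enlarging the limit makes mem_cgroup_resize_limit() call
 * memcg_oom_recover(), which wakes the waiters.  Paths are assumptions.
 */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/cgroup/memory/A/memory.limit_in_bytes", "w");

	if (!f)
		return 1;
	fprintf(f, "%llu\n", 512ULL << 20);	/* raise the limit to 512M */
	return fclose(f) ? 1 : 0;
}
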
@@ -2592,9 +2735,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2592 | unsigned long long val) | 2735 | unsigned long long val) |
2593 | { | 2736 | { |
2594 | int retry_count; | 2737 | int retry_count; |
2595 | u64 memlimit, oldusage, curusage; | 2738 | u64 memlimit, memswlimit, oldusage, curusage; |
2596 | int children = mem_cgroup_count_children(memcg); | 2739 | int children = mem_cgroup_count_children(memcg); |
2597 | int ret = -EBUSY; | 2740 | int ret = -EBUSY; |
2741 | int enlarge = 0; | ||
2598 | 2742 | ||
2599 | /* see mem_cgroup_resize_res_limit */ | 2743 | /* see mem_cgroup_resize_res_limit */ |
2600 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 2744 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; |
@@ -2616,6 +2760,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2616 | mutex_unlock(&set_limit_mutex); | 2760 | mutex_unlock(&set_limit_mutex); |
2617 | break; | 2761 | break; |
2618 | } | 2762 | } |
2763 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
2764 | if (memswlimit < val) | ||
2765 | enlarge = 1; | ||
2619 | ret = res_counter_set_limit(&memcg->memsw, val); | 2766 | ret = res_counter_set_limit(&memcg->memsw, val); |
2620 | if (!ret) { | 2767 | if (!ret) { |
2621 | if (memlimit == val) | 2768 | if (memlimit == val) |
@@ -2638,6 +2785,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2638 | else | 2785 | else |
2639 | oldusage = curusage; | 2786 | oldusage = curusage; |
2640 | } | 2787 | } |
2788 | if (!ret && enlarge) | ||
2789 | memcg_oom_recover(memcg); | ||
2641 | return ret; | 2790 | return ret; |
2642 | } | 2791 | } |
2643 | 2792 | ||
@@ -2829,6 +2978,7 @@ move_account: | |||
2829 | if (ret) | 2978 | if (ret) |
2830 | break; | 2979 | break; |
2831 | } | 2980 | } |
2981 | memcg_oom_recover(mem); | ||
2832 | /* it seems parent cgroup doesn't have enough mem */ | 2982 | /* it seems parent cgroup doesn't have enough mem */ |
2833 | if (ret == -ENOMEM) | 2983 | if (ret == -ENOMEM) |
2834 | goto try_to_free; | 2984 | goto try_to_free; |
@@ -3319,9 +3469,9 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
3319 | 3469 | ||
3320 | rcu_read_lock(); | 3470 | rcu_read_lock(); |
3321 | if (!swap) | 3471 | if (!swap) |
3322 | t = rcu_dereference(memcg->thresholds); | 3472 | t = rcu_dereference(memcg->thresholds.primary); |
3323 | else | 3473 | else |
3324 | t = rcu_dereference(memcg->memsw_thresholds); | 3474 | t = rcu_dereference(memcg->memsw_thresholds.primary); |
3325 | 3475 | ||
3326 | if (!t) | 3476 | if (!t) |
3327 | goto unlock; | 3477 | goto unlock; |
@@ -3333,7 +3483,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
3333 | * If it's not true, a threshold was crossed after last | 3483 | * If it's not true, a threshold was crossed after last |
3334 | * call of __mem_cgroup_threshold(). | 3484 | * call of __mem_cgroup_threshold(). |
3335 | */ | 3485 | */ |
3336 | i = atomic_read(&t->current_threshold); | 3486 | i = t->current_threshold; |
3337 | 3487 | ||
3338 | /* | 3488 | /* |
3339 | * Iterate backward over array of thresholds starting from | 3489 | * Iterate backward over array of thresholds starting from |
@@ -3357,7 +3507,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
3357 | eventfd_signal(t->entries[i].eventfd, 1); | 3507 | eventfd_signal(t->entries[i].eventfd, 1); |
3358 | 3508 | ||
3359 | /* Update current_threshold */ | 3509 | /* Update current_threshold */ |
3360 | atomic_set(&t->current_threshold, i - 1); | 3510 | t->current_threshold = i - 1; |
3361 | unlock: | 3511 | unlock: |
3362 | rcu_read_unlock(); | 3512 | rcu_read_unlock(); |
3363 | } | 3513 | } |
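
current_threshold becomes a plain int here because it is only read and updated under the existing RCU/thresholds_lock scheme, so the atomic was unnecessary. The walk itself is unchanged: scan downward to signal thresholds that usage has fallen below, scan upward to signal thresholds that usage has reached, and leave current_threshold pointing at the entry just below the current usage. A stand-alone re-implementation of that walk, useful for checking the index arithmetic (the 4M/8M/16M thresholds are made-up values):

/* Stand-alone simulation of the threshold-crossing walk in
 * __mem_cgroup_threshold(); entries must be sorted ascending. */
#include <stdio.h>

struct thr { unsigned long long threshold; const char *name; };

static struct thr entries[] = {
	{ 4ULL << 20, "4M" }, { 8ULL << 20, "8M" }, { 16ULL << 20, "16M" },
};
static int size = 3;
static int current_threshold = -1;	/* nothing below usage yet */

static void check(unsigned long long usage)
{
	int i = current_threshold;

	/* Crossed downward: signal every threshold now above usage. */
	for (; i >= 0 && entries[i].threshold > usage; i--)
		printf("signal %s\n", entries[i].name);
	i++;
	/* Crossed upward: signal every threshold now at or below usage. */
	for (; i < size && entries[i].threshold <= usage; i++)
		printf("signal %s\n", entries[i].name);
	current_threshold = i - 1;	/* entry just below usage */
}

int main(void)
{
	check(5ULL << 20);	/* signals 4M */
	check(10ULL << 20);	/* signals 8M */
	check(3ULL << 20);	/* signals 8M, then 4M */
	return 0;
}
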
@@ -3377,106 +3527,117 @@ static int compare_thresholds(const void *a, const void *b) | |||
3377 | return _a->threshold - _b->threshold; | 3527 | return _a->threshold - _b->threshold; |
3378 | } | 3528 | } |
3379 | 3529 | ||
3380 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | 3530 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) |
3381 | struct eventfd_ctx *eventfd, const char *args) | 3531 | { |
3532 | struct mem_cgroup_eventfd_list *ev; | ||
3533 | |||
3534 | list_for_each_entry(ev, &mem->oom_notify, list) | ||
3535 | eventfd_signal(ev->eventfd, 1); | ||
3536 | return 0; | ||
3537 | } | ||
3538 | |||
3539 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) | ||
3540 | { | ||
3541 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); | ||
3542 | } | ||
3543 | |||
3544 | static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | ||
3545 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
3382 | { | 3546 | { |
3383 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3547 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3384 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3548 | struct mem_cgroup_thresholds *thresholds; |
3549 | struct mem_cgroup_threshold_ary *new; | ||
3385 | int type = MEMFILE_TYPE(cft->private); | 3550 | int type = MEMFILE_TYPE(cft->private); |
3386 | u64 threshold, usage; | 3551 | u64 threshold, usage; |
3387 | int size; | 3552 | int i, size, ret; |
3388 | int i, ret; | ||
3389 | 3553 | ||
3390 | ret = res_counter_memparse_write_strategy(args, &threshold); | 3554 | ret = res_counter_memparse_write_strategy(args, &threshold); |
3391 | if (ret) | 3555 | if (ret) |
3392 | return ret; | 3556 | return ret; |
3393 | 3557 | ||
3394 | mutex_lock(&memcg->thresholds_lock); | 3558 | mutex_lock(&memcg->thresholds_lock); |
3559 | |||
3395 | if (type == _MEM) | 3560 | if (type == _MEM) |
3396 | thresholds = memcg->thresholds; | 3561 | thresholds = &memcg->thresholds; |
3397 | else if (type == _MEMSWAP) | 3562 | else if (type == _MEMSWAP) |
3398 | thresholds = memcg->memsw_thresholds; | 3563 | thresholds = &memcg->memsw_thresholds; |
3399 | else | 3564 | else |
3400 | BUG(); | 3565 | BUG(); |
3401 | 3566 | ||
3402 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | 3567 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); |
3403 | 3568 | ||
3404 | /* Check if a threshold crossed before adding a new one */ | 3569 | /* Check if a threshold crossed before adding a new one */ |
3405 | if (thresholds) | 3570 | if (thresholds->primary) |
3406 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 3571 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
3407 | 3572 | ||
3408 | if (thresholds) | 3573 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; |
3409 | size = thresholds->size + 1; | ||
3410 | else | ||
3411 | size = 1; | ||
3412 | 3574 | ||
3413 | /* Allocate memory for new array of thresholds */ | 3575 | /* Allocate memory for new array of thresholds */ |
3414 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | 3576 | new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), |
3415 | size * sizeof(struct mem_cgroup_threshold), | ||
3416 | GFP_KERNEL); | 3577 | GFP_KERNEL); |
3417 | if (!thresholds_new) { | 3578 | if (!new) { |
3418 | ret = -ENOMEM; | 3579 | ret = -ENOMEM; |
3419 | goto unlock; | 3580 | goto unlock; |
3420 | } | 3581 | } |
3421 | thresholds_new->size = size; | 3582 | new->size = size; |
3422 | 3583 | ||
3423 | /* Copy thresholds (if any) to new array */ | 3584 | /* Copy thresholds (if any) to new array */ |
3424 | if (thresholds) | 3585 | if (thresholds->primary) { |
3425 | memcpy(thresholds_new->entries, thresholds->entries, | 3586 | memcpy(new->entries, thresholds->primary->entries, (size - 1) * |
3426 | thresholds->size * | ||
3427 | sizeof(struct mem_cgroup_threshold)); | 3587 | sizeof(struct mem_cgroup_threshold)); |
3588 | } | ||
3589 | |||
3428 | /* Add new threshold */ | 3590 | /* Add new threshold */ |
3429 | thresholds_new->entries[size - 1].eventfd = eventfd; | 3591 | new->entries[size - 1].eventfd = eventfd; |
3430 | thresholds_new->entries[size - 1].threshold = threshold; | 3592 | new->entries[size - 1].threshold = threshold; |
3431 | 3593 | ||
3432 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | 3594 | /* Sort thresholds. Registering of new threshold isn't time-critical */ |
3433 | sort(thresholds_new->entries, size, | 3595 | sort(new->entries, size, sizeof(struct mem_cgroup_threshold), |
3434 | sizeof(struct mem_cgroup_threshold), | ||
3435 | compare_thresholds, NULL); | 3596 | compare_thresholds, NULL); |
3436 | 3597 | ||
3437 | /* Find current threshold */ | 3598 | /* Find current threshold */ |
3438 | atomic_set(&thresholds_new->current_threshold, -1); | 3599 | new->current_threshold = -1; |
3439 | for (i = 0; i < size; i++) { | 3600 | for (i = 0; i < size; i++) { |
3440 | if (thresholds_new->entries[i].threshold < usage) { | 3601 | if (new->entries[i].threshold < usage) { |
3441 | /* | 3602 | /* |
3442 | * thresholds_new->current_threshold will not be used | 3603 | * new->current_threshold will not be used until |
3443 | * until rcu_assign_pointer(), so it's safe to increment | 3604 | * rcu_assign_pointer(), so it's safe to increment |
3444 | * it here. | 3605 | * it here. |
3445 | */ | 3606 | */ |
3446 | atomic_inc(&thresholds_new->current_threshold); | 3607 | ++new->current_threshold; |
3447 | } | 3608 | } |
3448 | } | 3609 | } |
3449 | 3610 | ||
3450 | if (type == _MEM) | 3611 | /* Free old spare buffer and save old primary buffer as spare */ |
3451 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | 3612 | kfree(thresholds->spare); |
3452 | else | 3613 | thresholds->spare = thresholds->primary; |
3453 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | 3614 | |
3615 | rcu_assign_pointer(thresholds->primary, new); | ||
3454 | 3616 | ||
3455 | /* To be sure that nobody uses thresholds before freeing it */ | 3617 | /* To be sure that nobody uses thresholds */ |
3456 | synchronize_rcu(); | 3618 | synchronize_rcu(); |
3457 | 3619 | ||
3458 | kfree(thresholds); | ||
3459 | unlock: | 3620 | unlock: |
3460 | mutex_unlock(&memcg->thresholds_lock); | 3621 | mutex_unlock(&memcg->thresholds_lock); |
3461 | 3622 | ||
3462 | return ret; | 3623 | return ret; |
3463 | } | 3624 | } |
3464 | 3625 | ||
3465 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | 3626 | static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, |
3466 | struct eventfd_ctx *eventfd) | 3627 | struct cftype *cft, struct eventfd_ctx *eventfd) |
3467 | { | 3628 | { |
3468 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3629 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3469 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3630 | struct mem_cgroup_thresholds *thresholds; |
3631 | struct mem_cgroup_threshold_ary *new; | ||
3470 | int type = MEMFILE_TYPE(cft->private); | 3632 | int type = MEMFILE_TYPE(cft->private); |
3471 | u64 usage; | 3633 | u64 usage; |
3472 | int size = 0; | 3634 | int i, j, size; |
3473 | int i, j, ret; | ||
3474 | 3635 | ||
3475 | mutex_lock(&memcg->thresholds_lock); | 3636 | mutex_lock(&memcg->thresholds_lock); |
3476 | if (type == _MEM) | 3637 | if (type == _MEM) |
3477 | thresholds = memcg->thresholds; | 3638 | thresholds = &memcg->thresholds; |
3478 | else if (type == _MEMSWAP) | 3639 | else if (type == _MEMSWAP) |
3479 | thresholds = memcg->memsw_thresholds; | 3640 | thresholds = &memcg->memsw_thresholds; |
3480 | else | 3641 | else |
3481 | BUG(); | 3642 | BUG(); |
3482 | 3643 | ||
@@ -3492,59 +3653,136 @@ static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | |||
3492 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 3653 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
3493 | 3654 | ||
3494 | /* Calculate new number of threshold */ | 3655 | /* Calculate new number of threshold */ |
3495 | for (i = 0; i < thresholds->size; i++) { | 3656 | size = 0; |
3496 | if (thresholds->entries[i].eventfd != eventfd) | 3657 | for (i = 0; i < thresholds->primary->size; i++) { |
3658 | if (thresholds->primary->entries[i].eventfd != eventfd) | ||
3497 | size++; | 3659 | size++; |
3498 | } | 3660 | } |
3499 | 3661 | ||
3662 | new = thresholds->spare; | ||
3663 | |||
3500 | /* Set thresholds array to NULL if we don't have thresholds */ | 3664 | /* Set thresholds array to NULL if we don't have thresholds */ |
3501 | if (!size) { | 3665 | if (!size) { |
3502 | thresholds_new = NULL; | 3666 | kfree(new); |
3503 | goto assign; | 3667 | new = NULL; |
3668 | goto swap_buffers; | ||
3504 | } | 3669 | } |
3505 | 3670 | ||
3506 | /* Allocate memory for new array of thresholds */ | 3671 | new->size = size; |
3507 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3508 | size * sizeof(struct mem_cgroup_threshold), | ||
3509 | GFP_KERNEL); | ||
3510 | if (!thresholds_new) { | ||
3511 | ret = -ENOMEM; | ||
3512 | goto unlock; | ||
3513 | } | ||
3514 | thresholds_new->size = size; | ||
3515 | 3672 | ||
3516 | /* Copy thresholds and find current threshold */ | 3673 | /* Copy thresholds and find current threshold */ |
3517 | atomic_set(&thresholds_new->current_threshold, -1); | 3674 | new->current_threshold = -1; |
3518 | for (i = 0, j = 0; i < thresholds->size; i++) { | 3675 | for (i = 0, j = 0; i < thresholds->primary->size; i++) { |
3519 | if (thresholds->entries[i].eventfd == eventfd) | 3676 | if (thresholds->primary->entries[i].eventfd == eventfd) |
3520 | continue; | 3677 | continue; |
3521 | 3678 | ||
3522 | thresholds_new->entries[j] = thresholds->entries[i]; | 3679 | new->entries[j] = thresholds->primary->entries[i]; |
3523 | if (thresholds_new->entries[j].threshold < usage) { | 3680 | if (new->entries[j].threshold < usage) { |
3524 | /* | 3681 | /* |
3525 | * thresholds_new->current_threshold will not be used | 3682 | * new->current_threshold will not be used |
3526 | * until rcu_assign_pointer(), so it's safe to increment | 3683 | * until rcu_assign_pointer(), so it's safe to increment |
3527 | * it here. | 3684 | * it here. |
3528 | */ | 3685 | */ |
3529 | atomic_inc(&thresholds_new->current_threshold); | 3686 | ++new->current_threshold; |
3530 | } | 3687 | } |
3531 | j++; | 3688 | j++; |
3532 | } | 3689 | } |
3533 | 3690 | ||
3534 | assign: | 3691 | swap_buffers: |
3535 | if (type == _MEM) | 3692 | /* Swap primary and spare array */ |
3536 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | 3693 | thresholds->spare = thresholds->primary; |
3537 | else | 3694 | rcu_assign_pointer(thresholds->primary, new); |
3538 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3539 | 3695 | ||
3540 | /* To be sure that nobody uses thresholds before freeing it */ | 3696 | /* To be sure that nobody uses thresholds */ |
3541 | synchronize_rcu(); | 3697 | synchronize_rcu(); |
3542 | 3698 | ||
3543 | kfree(thresholds); | ||
3544 | unlock: | ||
3545 | mutex_unlock(&memcg->thresholds_lock); | 3699 | mutex_unlock(&memcg->thresholds_lock); |
3700 | } | ||
3546 | 3701 | ||
3547 | return ret; | 3702 | static int mem_cgroup_oom_register_event(struct cgroup *cgrp, |
3703 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
3704 | { | ||
3705 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3706 | struct mem_cgroup_eventfd_list *event; | ||
3707 | int type = MEMFILE_TYPE(cft->private); | ||
3708 | |||
3709 | BUG_ON(type != _OOM_TYPE); | ||
3710 | event = kmalloc(sizeof(*event), GFP_KERNEL); | ||
3711 | if (!event) | ||
3712 | return -ENOMEM; | ||
3713 | |||
3714 | mutex_lock(&memcg_oom_mutex); | ||
3715 | |||
3716 | event->eventfd = eventfd; | ||
3717 | list_add(&event->list, &memcg->oom_notify); | ||
3718 | |||
3719 | /* already in OOM ? */ | ||
3720 | if (atomic_read(&memcg->oom_lock)) | ||
3721 | eventfd_signal(eventfd, 1); | ||
3722 | mutex_unlock(&memcg_oom_mutex); | ||
3723 | |||
3724 | return 0; | ||
3725 | } | ||
3726 | |||
3727 | static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | ||
3728 | struct cftype *cft, struct eventfd_ctx *eventfd) | ||
3729 | { | ||
3730 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3731 | struct mem_cgroup_eventfd_list *ev, *tmp; | ||
3732 | int type = MEMFILE_TYPE(cft->private); | ||
3733 | |||
3734 | BUG_ON(type != _OOM_TYPE); | ||
3735 | |||
3736 | mutex_lock(&memcg_oom_mutex); | ||
3737 | |||
3738 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | ||
3739 | if (ev->eventfd == eventfd) { | ||
3740 | list_del(&ev->list); | ||
3741 | kfree(ev); | ||
3742 | } | ||
3743 | } | ||
3744 | |||
3745 | mutex_unlock(&memcg_oom_mutex); | ||
3746 | } | ||
3747 | |||
3748 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | ||
3749 | struct cftype *cft, struct cgroup_map_cb *cb) | ||
3750 | { | ||
3751 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3752 | |||
3753 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); | ||
3754 | |||
3755 | if (atomic_read(&mem->oom_lock)) | ||
3756 | cb->fill(cb, "under_oom", 1); | ||
3757 | else | ||
3758 | cb->fill(cb, "under_oom", 0); | ||
3759 | return 0; | ||
3760 | } | ||
3761 | |||
3762 | /* | ||
3763 | */ | ||
3764 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | ||
3765 | struct cftype *cft, u64 val) | ||
3766 | { | ||
3767 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3768 | struct mem_cgroup *parent; | ||
3769 | |||
3770 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | ||
3771 | if (!cgrp->parent || !((val == 0) || (val == 1))) | ||
3772 | return -EINVAL; | ||
3773 | |||
3774 | parent = mem_cgroup_from_cont(cgrp->parent); | ||
3775 | |||
3776 | cgroup_lock(); | ||
3777 | /* oom-kill-disable is a flag for subhierarchy. */ | ||
3778 | if ((parent->use_hierarchy) || | ||
3779 | (mem->use_hierarchy && !list_empty(&cgrp->children))) { | ||
3780 | cgroup_unlock(); | ||
3781 | return -EINVAL; | ||
3782 | } | ||
3783 | mem->oom_kill_disable = val; | ||
3784 | cgroup_unlock(); | ||
3785 | return 0; | ||
3548 | } | 3786 | } |
3549 | 3787 | ||
3550 | static struct cftype mem_cgroup_files[] = { | 3788 | static struct cftype mem_cgroup_files[] = { |
@@ -3552,8 +3790,8 @@ static struct cftype mem_cgroup_files[] = { | |||
3552 | .name = "usage_in_bytes", | 3790 | .name = "usage_in_bytes", |
3553 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3791 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
3554 | .read_u64 = mem_cgroup_read, | 3792 | .read_u64 = mem_cgroup_read, |
3555 | .register_event = mem_cgroup_register_event, | 3793 | .register_event = mem_cgroup_usage_register_event, |
3556 | .unregister_event = mem_cgroup_unregister_event, | 3794 | .unregister_event = mem_cgroup_usage_unregister_event, |
3557 | }, | 3795 | }, |
3558 | { | 3796 | { |
3559 | .name = "max_usage_in_bytes", | 3797 | .name = "max_usage_in_bytes", |
@@ -3602,6 +3840,14 @@ static struct cftype mem_cgroup_files[] = { | |||
3602 | .read_u64 = mem_cgroup_move_charge_read, | 3840 | .read_u64 = mem_cgroup_move_charge_read, |
3603 | .write_u64 = mem_cgroup_move_charge_write, | 3841 | .write_u64 = mem_cgroup_move_charge_write, |
3604 | }, | 3842 | }, |
3843 | { | ||
3844 | .name = "oom_control", | ||
3845 | .read_map = mem_cgroup_oom_control_read, | ||
3846 | .write_u64 = mem_cgroup_oom_control_write, | ||
3847 | .register_event = mem_cgroup_oom_register_event, | ||
3848 | .unregister_event = mem_cgroup_oom_unregister_event, | ||
3849 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | ||
3850 | }, | ||
3605 | }; | 3851 | }; |
3606 | 3852 | ||
3607 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3853 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
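
The new oom_control file ties the pieces above together: reads report oom_kill_disable and under_oom, a write of 0 or 1 toggles the killer for the sub-hierarchy, and register_event/unregister_event let user space get an eventfd notification when the group enters OOM. A hedged user-space example using the standard cgroup v1 event interface (the mount point /cgroup/memory and group A are assumptions):

/* Wait for an OOM notification from /cgroup/memory/A (paths assumed).
 * Protocol: write "<eventfd> <fd of memory.oom_control>" to
 * cgroup.event_control, then block on the eventfd; the kernel side is
 * mem_cgroup_oom_register_event() plus mem_cgroup_oom_notify() above.
 */
#include <sys/eventfd.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int efd = eventfd(0, 0);
	int oom = open("/cgroup/memory/A/memory.oom_control", O_RDONLY);
	int ctl = open("/cgroup/memory/A/cgroup.event_control", O_WRONLY);
	char line[32];
	uint64_t cnt;

	if (efd < 0 || oom < 0 || ctl < 0)
		return 1;

	snprintf(line, sizeof(line), "%d %d", efd, oom);
	if (write(ctl, line, strlen(line)) < 0)
		return 1;

	/* Blocks until the group hits OOM and mem_cgroup_oom_notify()
	 * signals every eventfd on its oom_notify list. */
	if (read(efd, &cnt, sizeof(cnt)) != sizeof(cnt))
		return 1;
	printf("memory cgroup A is under OOM (count %llu)\n",
	       (unsigned long long)cnt);
	return 0;
}

Writing "1" to memory.oom_control first would also set oom_kill_disable, so tasks in the group sleep under OOM instead of being killed and under_oom reads as 1 until the group is recovered.
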
@@ -3610,8 +3856,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
3610 | .name = "memsw.usage_in_bytes", | 3856 | .name = "memsw.usage_in_bytes", |
3611 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3857 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
3612 | .read_u64 = mem_cgroup_read, | 3858 | .read_u64 = mem_cgroup_read, |
3613 | .register_event = mem_cgroup_register_event, | 3859 | .register_event = mem_cgroup_usage_register_event, |
3614 | .unregister_event = mem_cgroup_unregister_event, | 3860 | .unregister_event = mem_cgroup_usage_unregister_event, |
3615 | }, | 3861 | }, |
3616 | { | 3862 | { |
3617 | .name = "memsw.max_usage_in_bytes", | 3863 | .name = "memsw.max_usage_in_bytes", |
@@ -3839,6 +4085,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3839 | } else { | 4085 | } else { |
3840 | parent = mem_cgroup_from_cont(cont->parent); | 4086 | parent = mem_cgroup_from_cont(cont->parent); |
3841 | mem->use_hierarchy = parent->use_hierarchy; | 4087 | mem->use_hierarchy = parent->use_hierarchy; |
4088 | mem->oom_kill_disable = parent->oom_kill_disable; | ||
3842 | } | 4089 | } |
3843 | 4090 | ||
3844 | if (parent && parent->use_hierarchy) { | 4091 | if (parent && parent->use_hierarchy) { |
@@ -3857,6 +4104,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3857 | } | 4104 | } |
3858 | mem->last_scanned_child = 0; | 4105 | mem->last_scanned_child = 0; |
3859 | spin_lock_init(&mem->reclaim_param_lock); | 4106 | spin_lock_init(&mem->reclaim_param_lock); |
4107 | INIT_LIST_HEAD(&mem->oom_notify); | ||
3860 | 4108 | ||
3861 | if (parent) | 4109 | if (parent) |
3862 | mem->swappiness = get_swappiness(parent); | 4110 | mem->swappiness = get_swappiness(parent); |
@@ -3984,6 +4232,80 @@ enum mc_target_type { | |||
3984 | MC_TARGET_SWAP, | 4232 | MC_TARGET_SWAP, |
3985 | }; | 4233 | }; |
3986 | 4234 | ||
4235 | static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | ||
4236 | unsigned long addr, pte_t ptent) | ||
4237 | { | ||
4238 | struct page *page = vm_normal_page(vma, addr, ptent); | ||
4239 | |||
4240 | if (!page || !page_mapped(page)) | ||
4241 | return NULL; | ||
4242 | if (PageAnon(page)) { | ||
4243 | /* we don't move shared anon */ | ||
4244 | if (!move_anon() || page_mapcount(page) > 2) | ||
4245 | return NULL; | ||
4246 | } else if (!move_file()) | ||
4247 | /* we ignore mapcount for file pages */ | ||
4248 | return NULL; | ||
4249 | if (!get_page_unless_zero(page)) | ||
4250 | return NULL; | ||
4251 | |||
4252 | return page; | ||
4253 | } | ||
4254 | |||
4255 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | ||
4256 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
4257 | { | ||
4258 | int usage_count; | ||
4259 | struct page *page = NULL; | ||
4260 | swp_entry_t ent = pte_to_swp_entry(ptent); | ||
4261 | |||
4262 | if (!move_anon() || non_swap_entry(ent)) | ||
4263 | return NULL; | ||
4264 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
4265 | if (usage_count > 1) { /* we don't move shared anon */ | ||
4266 | if (page) | ||
4267 | put_page(page); | ||
4268 | return NULL; | ||
4269 | } | ||
4270 | if (do_swap_account) | ||
4271 | entry->val = ent.val; | ||
4272 | |||
4273 | return page; | ||
4274 | } | ||
4275 | |||
4276 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | ||
4277 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
4278 | { | ||
4279 | struct page *page = NULL; | ||
4280 | struct inode *inode; | ||
4281 | struct address_space *mapping; | ||
4282 | pgoff_t pgoff; | ||
4283 | |||
4284 | if (!vma->vm_file) /* anonymous vma */ | ||
4285 | return NULL; | ||
4286 | if (!move_file()) | ||
4287 | return NULL; | ||
4288 | |||
4289 | inode = vma->vm_file->f_path.dentry->d_inode; | ||
4290 | mapping = vma->vm_file->f_mapping; | ||
4291 | if (pte_none(ptent)) | ||
4292 | pgoff = linear_page_index(vma, addr); | ||
4293 | else /* pte_file(ptent) is true */ | ||
4294 | pgoff = pte_to_pgoff(ptent); | ||
4295 | |||
4296 | /* page is moved even if it's not RSS of this task(page-faulted). */ | ||
4297 | if (!mapping_cap_swap_backed(mapping)) { /* normal file */ | ||
4298 | page = find_get_page(mapping, pgoff); | ||
4299 | } else { /* shmem/tmpfs file. we should take account of swap too. */ | ||
4300 | swp_entry_t ent; | ||
4301 | mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); | ||
4302 | if (do_swap_account) | ||
4303 | entry->val = ent.val; | ||
4304 | } | ||
4305 | |||
4306 | return page; | ||
4307 | } | ||
4308 | |||
3987 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | 4309 | static int is_target_pte_for_mc(struct vm_area_struct *vma, |
3988 | unsigned long addr, pte_t ptent, union mc_target *target) | 4310 | unsigned long addr, pte_t ptent, union mc_target *target) |
3989 | { | 4311 | { |
@@ -3991,43 +4313,16 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
3991 | struct page_cgroup *pc; | 4313 | struct page_cgroup *pc; |
3992 | int ret = 0; | 4314 | int ret = 0; |
3993 | swp_entry_t ent = { .val = 0 }; | 4315 | swp_entry_t ent = { .val = 0 }; |
3994 | int usage_count = 0; | ||
3995 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
3996 | &mc.to->move_charge_at_immigrate); | ||
3997 | 4316 | ||
3998 | if (!pte_present(ptent)) { | 4317 | if (pte_present(ptent)) |
3999 | /* TODO: handle swap of shmes/tmpfs */ | 4318 | page = mc_handle_present_pte(vma, addr, ptent); |
4000 | if (pte_none(ptent) || pte_file(ptent)) | 4319 | else if (is_swap_pte(ptent)) |
4001 | return 0; | 4320 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); |
4002 | else if (is_swap_pte(ptent)) { | 4321 | else if (pte_none(ptent) || pte_file(ptent)) |
4003 | ent = pte_to_swp_entry(ptent); | 4322 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
4004 | if (!move_anon || non_swap_entry(ent)) | 4323 | |
4005 | return 0; | 4324 | if (!page && !ent.val) |
4006 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
4007 | } | ||
4008 | } else { | ||
4009 | page = vm_normal_page(vma, addr, ptent); | ||
4010 | if (!page || !page_mapped(page)) | ||
4011 | return 0; | ||
4012 | /* | ||
4013 | * TODO: We don't move charges of file(including shmem/tmpfs) | ||
4014 | * pages for now. | ||
4015 | */ | ||
4016 | if (!move_anon || !PageAnon(page)) | ||
4017 | return 0; | ||
4018 | if (!get_page_unless_zero(page)) | ||
4019 | return 0; | ||
4020 | usage_count = page_mapcount(page); | ||
4021 | } | ||
4022 | if (usage_count > 1) { | ||
4023 | /* | ||
4024 | * TODO: We don't move charges of shared(used by multiple | ||
4025 | * processes) pages for now. | ||
4026 | */ | ||
4027 | if (page) | ||
4028 | put_page(page); | ||
4029 | return 0; | 4325 | return 0; |
4030 | } | ||
4031 | if (page) { | 4326 | if (page) { |
4032 | pc = lookup_page_cgroup(page); | 4327 | pc = lookup_page_cgroup(page); |
4033 | /* | 4328 | /* |
@@ -4043,17 +4338,12 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
4043 | if (!ret || !target) | 4338 | if (!ret || !target) |
4044 | put_page(page); | 4339 | put_page(page); |
4045 | } | 4340 | } |
4046 | /* throught */ | 4341 | /* There is a swap entry and a page doesn't exist or isn't charged */ |
4047 | if (ent.val && do_swap_account && !ret) { | 4342 | if (ent.val && !ret && |
4048 | unsigned short id; | 4343 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { |
4049 | rcu_read_lock(); | 4344 | ret = MC_TARGET_SWAP; |
4050 | id = css_id(&mc.from->css); | 4345 | if (target) |
4051 | rcu_read_unlock(); | 4346 | target->ent = ent; |
4052 | if (id == lookup_swap_cgroup(ent)) { | ||
4053 | ret = MC_TARGET_SWAP; | ||
4054 | if (target) | ||
4055 | target->ent = ent; | ||
4056 | } | ||
4057 | } | 4347 | } |
4058 | return ret; | 4348 | return ret; |
4059 | } | 4349 | } |
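
is_target_pte_for_mc() now dispatches on the pte: present ptes go through mc_handle_present_pte(), swap ptes through mc_handle_swap_pte(), and none/file ptes through mc_handle_file_pte(), which recomputes the page cache index with linear_page_index() or pte_to_pgoff() so that even not-yet-faulted file pages can have their charge moved. A small worked example of the pte_none index math (the struct and the 4K page size are assumptions standing in for vm_area_struct and PAGE_SHIFT):

/* Index math used for the pte_none case: the page cache offset of an
 * address in a file mapping, as linear_page_index() computes it.
 * PAGE_SHIFT = 12 (4K pages) and the struct below are assumptions.
 */
#include <stdio.h>

#define PAGE_SHIFT 12

struct vma_example {
	unsigned long vm_start;
	unsigned long vm_pgoff;		/* file offset of vm_start, in pages */
};

static unsigned long linear_page_index(struct vma_example *vma,
				       unsigned long addr)
{
	return ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
}

int main(void)
{
	/* mapping starts 1M into the file, address is 3 pages into the vma */
	struct vma_example vma = { .vm_start = 0x700000000000UL,
				   .vm_pgoff = 256 };
	unsigned long addr = vma.vm_start + 3 * 4096;

	printf("pgoff = %lu\n", linear_page_index(&vma, addr));	/* 259 */
	return 0;
}
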
@@ -4090,9 +4380,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4090 | }; | 4380 | }; |
4091 | if (is_vm_hugetlb_page(vma)) | 4381 | if (is_vm_hugetlb_page(vma)) |
4092 | continue; | 4382 | continue; |
4093 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4094 | if (vma->vm_flags & VM_SHARED) | ||
4095 | continue; | ||
4096 | walk_page_range(vma->vm_start, vma->vm_end, | 4383 | walk_page_range(vma->vm_start, vma->vm_end, |
4097 | &mem_cgroup_count_precharge_walk); | 4384 | &mem_cgroup_count_precharge_walk); |
4098 | } | 4385 | } |
@@ -4115,6 +4402,7 @@ static void mem_cgroup_clear_mc(void) | |||
4115 | if (mc.precharge) { | 4402 | if (mc.precharge) { |
4116 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | 4403 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); |
4117 | mc.precharge = 0; | 4404 | mc.precharge = 0; |
4405 | memcg_oom_recover(mc.to); | ||
4118 | } | 4406 | } |
4119 | /* | 4407 | /* |
4120 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | 4408 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
@@ -4123,6 +4411,7 @@ static void mem_cgroup_clear_mc(void) | |||
4123 | if (mc.moved_charge) { | 4411 | if (mc.moved_charge) { |
4124 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | 4412 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); |
4125 | mc.moved_charge = 0; | 4413 | mc.moved_charge = 0; |
4414 | memcg_oom_recover(mc.from); | ||
4126 | } | 4415 | } |
4127 | /* we must fixup refcnts and charges */ | 4416 | /* we must fixup refcnts and charges */ |
4128 | if (mc.moved_swap) { | 4417 | if (mc.moved_swap) { |
@@ -4287,9 +4576,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4287 | }; | 4576 | }; |
4288 | if (is_vm_hugetlb_page(vma)) | 4577 | if (is_vm_hugetlb_page(vma)) |
4289 | continue; | 4578 | continue; |
4290 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4291 | if (vma->vm_flags & VM_SHARED) | ||
4292 | continue; | ||
4293 | ret = walk_page_range(vma->vm_start, vma->vm_end, | 4579 | ret = walk_page_range(vma->vm_start, vma->vm_end, |
4294 | &mem_cgroup_move_charge_walk); | 4580 | &mem_cgroup_move_charge_walk); |
4295 | if (ret) | 4581 | if (ret) |