author		KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2010-03-10 18:22:39 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-03-12 18:52:38 -0500
commit		867578cbccb0893cc14fc29c670f7185809c90d6 (patch)
tree		e4d0cefac265fc64399223bc82ed714a88ebe20c
parent		0263c12c12ccc90edc9d856fa839f8936183e6d1 (diff)
memcg: fix oom kill behavior
In the current page-fault code:

  handle_mm_fault()
     -> ...
     -> mem_cgroup_charge()
     -> map page or handle error
     -> check return code

If the page fault's return code is VM_FAULT_OOM, pagefault_out_of_memory()
is called.  But if the fault was caused by memcg hitting its limit, the OOM
killer has already been invoked for that memcg.
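The arch fault handlers only look at the VM_FAULT_* bits, so a memcg-triggered
OOM still falls into the global path.  As a toy user-space model of that
dispatch (nothing below is the real kernel fault handler; only the VM_FAULT_*
values and the pagefault_out_of_memory() name are taken from the kernel, the
rest is made up for illustration):

#include <stdio.h>

#define VM_FAULT_OOM	0x0001	/* mirrors include/linux/mm.h */
#define VM_FAULT_SIGBUS	0x0002

/* Stand-in for mm/oom_kill.c:pagefault_out_of_memory(). */
static void pagefault_out_of_memory(void)
{
	puts("entering the global OOM path");
}

/* Toy stand-in for an arch fault handler's error path. */
static void fault_error(unsigned int fault)
{
	if (fault & VM_FAULT_OOM)
		pagefault_out_of_memory();	/* taken even when memcg already OOM-killed */
	else if (fault & VM_FAULT_SIGBUS)
		puts("delivering SIGBUS");
}

int main(void)
{
	/* Pretend mem_cgroup_charge() failed and the fault returned VM_FAULT_OOM. */
	fault_error(VM_FAULT_OOM);
	return 0;
}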
Then, I added commit a636b327f731143ccc544b966cfd8de6cb6d72c6.  That patch
records last_oom_jiffies for memcg's sub-hierarchy and prevents
pagefault_out_of_memory() from being invoked again in the near future.
But Nishimura-san reported that the jiffies-based check is not enough when
the system is under terribly heavy load.
This patch changes memcg's OOM logic as follows:
 * If memcg causes an OOM kill, continue to retry the charge.
 * Remove the jiffies-based check used until now.
 * Add a memcg-oom-lock which works like the per-zone OOM lock
   (a stand-alone sketch of the idea follows this description).
 * If current has already been killed (as a process), bypass the charge.
Something more sophisticated can be added later, but this patch does the
fundamental things.
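To show the memcg-oom-lock counting idea in isolation, here is a stand-alone
user-space sketch.  struct node, hierarchy_oom_lock() and
hierarchy_oom_unlock() are hypothetical stand-ins for a memcg sub-hierarchy
walk; only the counter logic mirrors the patch: every cgroup in the subtree
has its counter incremented, and the caller owns the hierarchy-wide lock only
if the largest resulting value is 1.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical stand-in for one memcg in a sub-hierarchy. */
struct node {
	atomic_int oom_lock;	/* counterpart of mem_cgroup.oom_lock */
	struct node *child;	/* single-child chain keeps the demo short */
};

/* Increment oom_lock on every node; we own the lock iff no counter exceeds 1. */
static bool hierarchy_oom_lock(struct node *n)
{
	int max = 0;

	for (; n; n = n->child) {
		int x = atomic_fetch_add(&n->oom_lock, 1) + 1;
		if (x > max)
			max = x;
	}
	return max == 1;
}

/* Decrement every counter, never below zero (like atomic_add_unless(.., -1, 0)). */
static void hierarchy_oom_unlock(struct node *n)
{
	for (; n; n = n->child) {
		int old = atomic_load(&n->oom_lock);
		while (old > 0 &&
		       !atomic_compare_exchange_weak(&n->oom_lock, &old, old - 1))
			;	/* lost a race: 'old' was reloaded, retry */
	}
}

int main(void)
{
	struct node leaf = { .oom_lock = 0, .child = NULL };
	struct node root = { .oom_lock = 0, .child = &leaf };

	printf("first locker owns the lock:  %d\n", hierarchy_oom_lock(&root));	/* 1 */
	printf("second locker owns the lock: %d\n", hierarchy_oom_lock(&root));	/* 0 */
	hierarchy_oom_unlock(&root);
	hierarchy_oom_unlock(&root);
	return 0;
}

In the kernel, the walk is done by mem_cgroup_walk_tree() under
memcg_oom_mutex, so concurrent lockers are serialized and losers sleep on
memcg_oom_waitq instead of spinning.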
TODO:
 - add an OOM notifier
 - add a per-memcg disable-oom-kill flag and a freezer at OOM
 - more chances to wake up OOM waiters (when changing the memory limit, etc.)
Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Tested-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	include/linux/memcontrol.h	  6
-rw-r--r--	mm/memcontrol.c			134
-rw-r--r--	mm/oom_kill.c			  8
3 files changed, 107 insertions, 41 deletions
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1f9b119f4ace..44301c6affa8 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(void)
 	return false;
 }
 
-extern bool mem_cgroup_oom_called(struct task_struct *task);
 void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
@@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(void)
 	return true;
 }
 
-static inline bool mem_cgroup_oom_called(struct task_struct *task)
-{
-	return false;
-}
-
 static inline int
 mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f7b910fc14fb..7973b5221fb8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -203,7 +203,7 @@ struct mem_cgroup {
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
-	unsigned long	last_oom_jiffies;
+	atomic_t	oom_lock;
 	atomic_t	refcnt;
 
 	unsigned int	swappiness;
@@ -1246,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	return total;
 }
 
-bool mem_cgroup_oom_called(struct task_struct *task)
-{
-	bool ret = false;
-	struct mem_cgroup *mem;
-	struct mm_struct *mm;
-
-	rcu_read_lock();
-	mm = task->mm;
-	if (!mm)
-		mm = &init_mm;
-	mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-	if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
-		ret = true;
-	rcu_read_unlock();
-	return ret;
-}
-
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
-{
-	mem->last_oom_jiffies = jiffies;
-	return 0;
-}
-
-static void record_last_oom(struct mem_cgroup *mem)
-{
-	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
+{
+	int *val = (int *)data;
+	int x;
+	/*
+	 * Logically, we could stop scanning as soon as we find a memcg
+	 * that is already locked.  But considering unlock operations and
+	 * creation/removal of memcgs, scanning all of them is simpler.
+	 */
+	x = atomic_inc_return(&mem->oom_lock);
+	*val = max(x, *val);
+	return 0;
+}
+/*
+ * Check whether the OOM killer is already running under our hierarchy.
+ * If someone else is running it, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+	int lock_count = 0;
+
+	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+
+	if (lock_count == 1)
+		return true;
+	return false;
+}
+
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
+{
+	/*
+	 * When a new child is created while the hierarchy is under OOM,
+	 * mem_cgroup_oom_lock() may not have been called for it, so we
+	 * have to use atomic_add_unless() here.
+	 */
+	atomic_add_unless(&mem->oom_lock, -1, 0);
+	return 0;
+}
+
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
+{
+	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * Try to call the OOM killer; returns false if the reclaim loop should exit.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
+{
+	DEFINE_WAIT(wait);
+	bool locked;
+
+	/* First, try to take the OOM lock on the hierarchy under mem. */
+	mutex_lock(&memcg_oom_mutex);
+	locked = mem_cgroup_oom_lock(mem);
+	/*
+	 * Even if signal_pending(), we cannot quit the charge() loop without
+	 * accounting, so UNINTERRUPTIBLE would be appropriate.  But SIGKILL
+	 * under OOM is always welcome, so use TASK_KILLABLE here.
+	 */
+	if (!locked)
+		prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
+	mutex_unlock(&memcg_oom_mutex);
+
+	if (locked)
+		mem_cgroup_out_of_memory(mem, mask);
+	else {
+		schedule();
+		finish_wait(&memcg_oom_waitq, &wait);
+	}
+	mutex_lock(&memcg_oom_mutex);
+	mem_cgroup_oom_unlock(mem);
+	/*
+	 * Here we use a global waitq; should it be more fine grained?
+	 * Assume the following hierarchy:
+	 * A/
+	 *   01
+	 *   02
+	 * and assume OOM happens in both A and 01 at the same time.  They are
+	 * mutually exclusive through the lock (a kill in 01 helps A).
+	 * If we used a per-memcg waitq, we would have to wake up waiters on A
+	 * and 02 in addition to waiters on 01.  We use a global waitq to avoid
+	 * that mess; it should not be a big problem.
+	 * (Also, a task may be moved to another group while it waits for OOM.)
+	 */
+	wake_up_all(&memcg_oom_waitq);
+	mutex_unlock(&memcg_oom_mutex);
+
+	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+		return false;
+	/* Give the dying process a chance to run. */
+	schedule_timeout(1);
+	return true;
 }
 
 /*
@@ -1443,11 +1513,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	struct res_counter *fail_res;
 	int csize = CHARGE_SIZE;
 
-	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
-		/* Don't account this! */
-		*memcg = NULL;
-		return 0;
-	}
+	/*
+	 * Unlike the global VM's OOM kill, we are not in a system-wide
+	 * memory shortage here.  So, allow a dying process to go ahead
+	 * in addition to a MEMDIE process.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE)
+		     || fatal_signal_pending(current)))
+		goto bypass;
 
 	/*
 	 * We always charge the cgroup the mm_struct belongs to.
@@ -1560,11 +1633,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		}
 
 		if (!nr_retries--) {
-			if (oom) {
-				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-				record_last_oom(mem_over_limit);
+			if (!oom)
+				goto nomem;
+			if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
+				nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+				continue;
 			}
-			goto nomem;
+			/* When we reach here, the current task is dying. */
+			css_put(&mem->css);
+			goto bypass;
 		}
 	}
 	if (csize > PAGE_SIZE)
@@ -1574,6 +1651,9 @@ done:
 nomem:
 	css_put(&mem->css);
 	return -ENOMEM;
+bypass:
+	*memcg = NULL;
+	return 0;
 }
 
 /*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 71d10bf52dc8..9b223af6a147 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -603,13 +603,6 @@ void pagefault_out_of_memory(void)
 		/* Got some memory back in the last second. */
 		return;
 
-	/*
-	 * If this is from memcg, oom-killer is already invoked.
-	 * and not worth to go system-wide-oom.
-	 */
-	if (mem_cgroup_oom_called(current))
-		goto rest_and_return;
-
 	if (sysctl_panic_on_oom)
 		panic("out of memory from page fault. panic_on_oom is selected.\n");
 
@@ -621,7 +614,6 @@ void pagefault_out_of_memory(void)
 	 * Give "p" a good chance of killing itself before we
 	 * retry to allocate memory.
 	 */
-rest_and_return:
 	if (!test_thread_flag(TIF_MEMDIE))
 		schedule_timeout_uninterruptible(1);
 }