diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 134 |
1 files changed, 107 insertions, 27 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f7b910fc14fb..7973b5221fb8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -203,7 +203,7 @@ struct mem_cgroup { | |||
203 | * Should the accounting and control be hierarchical, per subtree? | 203 | * Should the accounting and control be hierarchical, per subtree? |
204 | */ | 204 | */ |
205 | bool use_hierarchy; | 205 | bool use_hierarchy; |
206 | unsigned long last_oom_jiffies; | 206 | atomic_t oom_lock; |
207 | atomic_t refcnt; | 207 | atomic_t refcnt; |
208 | 208 | ||
209 | unsigned int swappiness; | 209 | unsigned int swappiness; |
@@ -1246,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
1246 | return total; | 1246 | return total; |
1247 | } | 1247 | } |
1248 | 1248 | ||
1249 | bool mem_cgroup_oom_called(struct task_struct *task) | 1249 | static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) |
1250 | { | 1250 | { |
1251 | bool ret = false; | 1251 | int *val = (int *)data; |
1252 | struct mem_cgroup *mem; | 1252 | int x; |
1253 | struct mm_struct *mm; | 1253 | /* |
1254 | * Logically, we can stop scanning immediately when we find | ||
1255 | * a memcg is already locked. But considering unlock ops and | ||
1256 | * creation/removal of memcg, scan-all is simple operation. | ||
1257 | */ | ||
1258 | x = atomic_inc_return(&mem->oom_lock); | ||
1259 | *val = max(x, *val); | ||
1260 | return 0; | ||
1261 | } | ||
1262 | /* | ||
1263 | * Check OOM-Killer is already running under our hierarchy. | ||
1264 | * If someone is running, return false. | ||
1265 | */ | ||
1266 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | ||
1267 | { | ||
1268 | int lock_count = 0; | ||
1254 | 1269 | ||
1255 | rcu_read_lock(); | 1270 | mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); |
1256 | mm = task->mm; | 1271 | |
1257 | if (!mm) | 1272 | if (lock_count == 1) |
1258 | mm = &init_mm; | 1273 | return true; |
1259 | mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); | 1274 | return false; |
1260 | if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) | ||
1261 | ret = true; | ||
1262 | rcu_read_unlock(); | ||
1263 | return ret; | ||
1264 | } | 1275 | } |
1265 | 1276 | ||
1266 | static int record_last_oom_cb(struct mem_cgroup *mem, void *data) | 1277 | static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) |
1267 | { | 1278 | { |
1268 | mem->last_oom_jiffies = jiffies; | 1279 | /* |
1280 | * When a new child is created while the hierarchy is under oom, | ||
1281 | * mem_cgroup_oom_lock() may not be called. We have to use | ||
1282 | * atomic_add_unless() here. | ||
1283 | */ | ||
1284 | atomic_add_unless(&mem->oom_lock, -1, 0); | ||
1269 | return 0; | 1285 | return 0; |
1270 | } | 1286 | } |
1271 | 1287 | ||
1272 | static void record_last_oom(struct mem_cgroup *mem) | 1288 | static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
1273 | { | 1289 | { |
1274 | mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); | 1290 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); |
1291 | } | ||
1292 | |||
1293 | static DEFINE_MUTEX(memcg_oom_mutex); | ||
1294 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | ||
1295 | |||
1296 | /* | ||
1297 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | ||
1298 | */ | ||
1299 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | ||
1300 | { | ||
1301 | DEFINE_WAIT(wait); | ||
1302 | bool locked; | ||
1303 | |||
1304 | /* At first, try to OOM lock hierarchy under mem.*/ | ||
1305 | mutex_lock(&memcg_oom_mutex); | ||
1306 | locked = mem_cgroup_oom_lock(mem); | ||
1307 | /* | ||
1308 | * Even if signal_pending(), we can't quit charge() loop without | ||
1309 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | ||
1310 | * under OOM is always welcomed, use TASK_KILLABLE here. | ||
1311 | */ | ||
1312 | if (!locked) | ||
1313 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | ||
1314 | mutex_unlock(&memcg_oom_mutex); | ||
1315 | |||
1316 | if (locked) | ||
1317 | mem_cgroup_out_of_memory(mem, mask); | ||
1318 | else { | ||
1319 | schedule(); | ||
1320 | finish_wait(&memcg_oom_waitq, &wait); | ||
1321 | } | ||
1322 | mutex_lock(&memcg_oom_mutex); | ||
1323 | mem_cgroup_oom_unlock(mem); | ||
1324 | /* | ||
1325 | * Here, we use global waitq .....more fine grained waitq ? | ||
1326 | * Assume following hierarchy. | ||
1327 | * A/ | ||
1328 | * 01 | ||
1329 | * 02 | ||
1330 | * assume OOM happens both in A and 01 at the same time. They are | ||
1331 | * mutually exclusive by lock. (kill in 01 helps A.) | ||
1332 | * When we use per memcg waitq, we have to wake up waiters on A and 02 | ||
1333 | * in addition to waiters on 01. We use global waitq for avoiding mess. | ||
1334 | * It will not be a big problem. | ||
1335 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
1336 | */ | ||
1337 | wake_up_all(&memcg_oom_waitq); | ||
1338 | mutex_unlock(&memcg_oom_mutex); | ||
1339 | |||
1340 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | ||
1341 | return false; | ||
1342 | /* Give chance to dying process */ | ||
1343 | schedule_timeout(1); | ||
1344 | return true; | ||
1275 | } | 1345 | } |
1276 | 1346 | ||
1277 | /* | 1347 | /* |
@@ -1443,11 +1513,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1443 | struct res_counter *fail_res; | 1513 | struct res_counter *fail_res; |
1444 | int csize = CHARGE_SIZE; | 1514 | int csize = CHARGE_SIZE; |
1445 | 1515 | ||
1446 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1516 | /* |
1447 | /* Don't account this! */ | 1517 | * Unlike gloval-vm's OOM-kill, we're not in memory shortage |
1448 | *memcg = NULL; | 1518 | * in system level. So, allow to go ahead dying process in addition to |
1449 | return 0; | 1519 | * MEMDIE process. |
1450 | } | 1520 | */ |
1521 | if (unlikely(test_thread_flag(TIF_MEMDIE) | ||
1522 | || fatal_signal_pending(current))) | ||
1523 | goto bypass; | ||
1451 | 1524 | ||
1452 | /* | 1525 | /* |
1453 | * We always charge the cgroup the mm_struct belongs to. | 1526 | * We always charge the cgroup the mm_struct belongs to. |
@@ -1560,11 +1633,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
1560 | } | 1633 | } |
1561 | 1634 | ||
1562 | if (!nr_retries--) { | 1635 | if (!nr_retries--) { |
1563 | if (oom) { | 1636 | if (!oom) |
1564 | mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); | 1637 | goto nomem; |
1565 | record_last_oom(mem_over_limit); | 1638 | if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) { |
1639 | nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | ||
1640 | continue; | ||
1566 | } | 1641 | } |
1567 | goto nomem; | 1642 | /* When we reach here, current task is dying .*/ |
1643 | css_put(&mem->css); | ||
1644 | goto bypass; | ||
1568 | } | 1645 | } |
1569 | } | 1646 | } |
1570 | if (csize > PAGE_SIZE) | 1647 | if (csize > PAGE_SIZE) |
@@ -1574,6 +1651,9 @@ done: | |||
1574 | nomem: | 1651 | nomem: |
1575 | css_put(&mem->css); | 1652 | css_put(&mem->css); |
1576 | return -ENOMEM; | 1653 | return -ENOMEM; |
1654 | bypass: | ||
1655 | *memcg = NULL; | ||
1656 | return 0; | ||
1577 | } | 1657 | } |
1578 | 1658 | ||
1579 | /* | 1659 | /* |