 include/linux/memcontrol.h |  21 ++++++
 include/linux/sched.h      |   4 +
 mm/memcontrol.c            | 154 +++++++++++++++++++++++++++------------
 mm/memory.c                |   3 +
 mm/oom_kill.c              |   7 ++-
 5 files changed, 140 insertions(+), 49 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 34ac6497d01a..89d576cfcc4c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -157,6 +157,10 @@ extern void mem_cgroup_replace_page_cache(struct page *oldpage,
  *
  * Toggle whether a failed memcg charge should invoke the OOM killer
  * or just return -ENOMEM. Returns the previous toggle state.
+ *
+ * NOTE: Any path that enables the OOM killer before charging must
+ *       call mem_cgroup_oom_synchronize() afterward to finalize the
+ *       OOM handling and clean up.
  */
 static inline bool mem_cgroup_toggle_oom(bool new)
 {
@@ -182,6 +186,13 @@ static inline void mem_cgroup_disable_oom(void)
 	WARN_ON(old == false);
 }
 
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+	return p->memcg_oom.in_memcg_oom;
+}
+
+bool mem_cgroup_oom_synchronize(void);
+
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
 #endif
@@ -427,6 +438,16 @@ static inline void mem_cgroup_disable_oom(void)
 {
 }
 
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+	return false;
+}
+
+static inline bool mem_cgroup_oom_synchronize(void)
+{
+	return false;
+}
+
 static inline void mem_cgroup_inc_page_stat(struct page *page,
 					    enum mem_cgroup_page_stat_item idx)
 {
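
The NOTE above implies a calling convention for charge paths. As a
minimal sketch of a conforming caller (editor's illustration, not part
of the patch; example_fault_path() and do_charge_something() are
hypothetical placeholders, the mem_cgroup_* calls are the ones from
this series):

	static int example_fault_path(void)
	{
		int ret;

		mem_cgroup_enable_oom();	/* sets current->memcg_oom.may_oom */
		ret = do_charge_something();	/* may record OOM state, return -ENOMEM */
		mem_cgroup_disable_oom();

		/* stack unwound, locks dropped: finalize any recorded OOM state */
		if (task_in_memcg_oom(current))
			mem_cgroup_oom_synchronize();
		return ret;
	}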
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9ce1fa53031f..6682da36b293 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1395,6 +1395,10 @@ struct task_struct {
 	unsigned int memcg_kmem_skip_account;
 	struct memcg_oom_info {
 		unsigned int may_oom:1;
+		unsigned int in_memcg_oom:1;
+		unsigned int oom_locked:1;
+		int wakeups;
+		struct mem_cgroup *wait_on_memcg;
 	} memcg_oom;
 #endif
 #ifdef CONFIG_UPROBES
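
For reference, the same struct with the editor's reading of each new
field (the annotations are interpretive, not from the source):

	struct memcg_oom_info {
		unsigned int may_oom:1;           /* user fault enabled memcg OOM handling */
		unsigned int in_memcg_oom:1;      /* a charge failed and ran mem_cgroup_oom() */
		unsigned int oom_locked:1;        /* this task holds the hierarchy OOM lock */
		int wakeups;                      /* memcg->oom_wakeups snapshot at charge time */
		struct mem_cgroup *wait_on_memcg; /* memcg to sleep on; pins a css reference */
	} memcg_oom;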
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 04250cbf46c6..4b5cfb509270 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -255,6 +255,7 @@ struct mem_cgroup {
 
 	bool oom_lock;
 	atomic_t under_oom;
+	atomic_t oom_wakeups;
 
 	int swappiness;
 	/* OOM-Killer disable */
@@ -2020,6 +2021,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
 
 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
+	atomic_inc(&memcg->oom_wakeups);
 	/* for filtering, pass "memcg" as argument. */
 	__wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
@@ -2031,19 +2033,17 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 }
 
 /*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
  */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-				  int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-	struct oom_wait_info owait;
 	bool locked;
+	int wakeups;
 
-	owait.memcg = memcg;
-	owait.wait.flags = 0;
-	owait.wait.func = memcg_oom_wake_function;
-	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.task_list);
+	if (!current->memcg_oom.may_oom)
+		return;
+
+	current->memcg_oom.in_memcg_oom = 1;
 
 	/*
 	 * As with any blocking lock, a contender needs to start
@@ -2051,12 +2051,8 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
 	 * otherwise it can miss the wakeup from the unlock and sleep
 	 * indefinitely. This is just open-coded because our locking
 	 * is so particular to memcg hierarchies.
-	 *
-	 * Even if signal_pending(), we can't quit charge() loop without
-	 * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-	 * under OOM is always welcomed, use TASK_KILLABLE here.
 	 */
-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	wakeups = atomic_read(&memcg->oom_wakeups);
 	mem_cgroup_mark_under_oom(memcg);
 
 	locked = mem_cgroup_oom_trylock(memcg);
@@ -2066,15 +2062,95 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
 
 	if (locked && !memcg->oom_kill_disable) {
 		mem_cgroup_unmark_under_oom(memcg);
-		finish_wait(&memcg_oom_waitq, &owait.wait);
 		mem_cgroup_out_of_memory(memcg, mask, order);
+		mem_cgroup_oom_unlock(memcg);
+		/*
+		 * There is no guarantee that an OOM-lock contender
+		 * sees the wakeups triggered by the OOM kill
+		 * uncharges. Wake any sleepers explicitly.
+		 */
+		memcg_oom_recover(memcg);
 	} else {
-		schedule();
-		mem_cgroup_unmark_under_oom(memcg);
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+		/*
+		 * A system call can just return -ENOMEM, but if this
+		 * is a page fault and somebody else is handling the
+		 * OOM already, we need to sleep on the OOM waitqueue
+		 * for this memcg until the situation is resolved.
+		 * Which can take some time because it might be
+		 * handled by a userspace task.
+		 *
+		 * However, this is the charge context, which means
+		 * that we may sit on a large call stack and hold
+		 * various filesystem locks, the mmap_sem etc. and we
+		 * don't want the OOM handler to deadlock on them
+		 * while we sit here and wait. Store the current OOM
+		 * context in the task_struct, then return -ENOMEM.
+		 * At the end of the page fault handler, with the
+		 * stack unwound, pagefault_out_of_memory() will check
+		 * back with us by calling
+		 * mem_cgroup_oom_synchronize(), possibly putting the
+		 * task to sleep.
+		 */
+		current->memcg_oom.oom_locked = locked;
+		current->memcg_oom.wakeups = wakeups;
+		css_get(&memcg->css);
+		current->memcg_oom.wait_on_memcg = memcg;
 	}
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the memcg OOM
+ * handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+	struct oom_wait_info owait;
+	struct mem_cgroup *memcg;
+
+	/* OOM is global, do not handle */
+	if (!current->memcg_oom.in_memcg_oom)
+		return false;
+
+	/*
+	 * We invoked the OOM killer but there is a chance that a kill
+	 * did not free up any charges. Everybody else might already
+	 * be sleeping, so restart the fault and keep the rampage
+	 * going until some charges are released.
+	 */
+	memcg = current->memcg_oom.wait_on_memcg;
+	if (!memcg)
+		goto out;
+
+	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+		goto out_memcg;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.task_list);
 
-	if (locked) {
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	/* Only sleep if we didn't miss any wakeups since OOM */
+	if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+		schedule();
+	finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+	mem_cgroup_unmark_under_oom(memcg);
+	if (current->memcg_oom.oom_locked) {
 		mem_cgroup_oom_unlock(memcg);
 		/*
 		 * There is no guarantee that an OOM-lock contender
@@ -2083,11 +2159,10 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
 		 */
 		memcg_oom_recover(memcg);
 	}
-
-	if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-		return false;
-	/* Give chance to dying process */
-	schedule_timeout_uninterruptible(1);
+	css_put(&memcg->css);
+	current->memcg_oom.wait_on_memcg = NULL;
+out:
+	current->memcg_oom.in_memcg_oom = 0;
 	return true;
 }
 
@@ -2400,12 +2475,11 @@ enum {
 	CHARGE_RETRY,		/* need to retry but retry is not bad */
 	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
 	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-	CHARGE_OOM_DIE,		/* the current is killed because of OOM */
 };
 
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				unsigned int nr_pages, unsigned int min_pages,
-				bool oom_check)
+				bool invoke_oom)
 {
 	unsigned long csize = nr_pages * PAGE_SIZE;
 	struct mem_cgroup *mem_over_limit;
@@ -2462,14 +2536,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
 		return CHARGE_RETRY;
 
-	/* If we don't need to call oom-killer at el, return immediately */
-	if (!oom_check || !current->memcg_oom.may_oom)
-		return CHARGE_NOMEM;
-	/* check OOM */
-	if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-		return CHARGE_OOM_DIE;
+	if (invoke_oom)
+		mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
 
-	return CHARGE_RETRY;
+	return CHARGE_NOMEM;
 }
 
 /*
@@ -2572,7 +2642,7 @@ again:
 	}
 
 	do {
-		bool oom_check;
+		bool invoke_oom = oom && !nr_oom_retries;
 
 		/* If killed, bypass charge */
 		if (fatal_signal_pending(current)) {
@@ -2580,14 +2650,8 @@ again:
 			goto bypass;
 		}
 
-		oom_check = false;
-		if (oom && !nr_oom_retries) {
-			oom_check = true;
-			nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-		}
-
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-					   oom_check);
+		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+					   nr_pages, invoke_oom);
 		switch (ret) {
 		case CHARGE_OK:
 			break;
@@ -2600,16 +2664,12 @@ again:
 			css_put(&memcg->css);
 			goto nomem;
 		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom) {
+			if (!oom || invoke_oom) {
 				css_put(&memcg->css);
 				goto nomem;
 			}
-			/* If oom, we never return -ENOMEM */
 			nr_oom_retries--;
 			break;
-		case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-			css_put(&memcg->css);
-			goto bypass;
 		}
 	} while (ret != CHARGE_OK);
 
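
The oom_wakeups counter closes the race between failing the charge and
sleeping much later in mem_cgroup_oom_synchronize(): mem_cgroup_oom()
snapshots the counter before the task is marked under OOM, and the
sleeper only schedules if no wakeup happened since. A simplified sketch
of that guard in isolation (illustrative names, not the patch itself):

	static DECLARE_WAIT_QUEUE_HEAD(example_waitq);
	static atomic_t example_wakeups = ATOMIC_INIT(0);

	static void example_waker(void)
	{
		atomic_inc(&example_wakeups);	/* as in memcg_wakeup_oom() */
		wake_up(&example_waitq);
	}

	static void example_sleeper(void)
	{
		DEFINE_WAIT(wait);
		int snapshot = atomic_read(&example_wakeups);	/* charge time */

		/* ... stack unwinds, locks are dropped ... */

		prepare_to_wait(&example_waitq, &wait, TASK_KILLABLE);
		if (atomic_read(&example_wakeups) == snapshot)
			schedule();	/* no wakeup was missed in between */
		finish_wait(&example_waitq, &wait);
	}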
diff --git a/mm/memory.c b/mm/memory.c
index a8f9deab8719..5ec6f199e685 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3867,6 +3867,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (flags & FAULT_FLAG_USER)
 		mem_cgroup_disable_oom();
 
+	if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+		mem_cgroup_oom_synchronize();
+
 	return ret;
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 98e75f2ac7bc..314e9d274381 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -678,9 +678,12 @@ out:
  */
 void pagefault_out_of_memory(void)
 {
-	struct zonelist *zonelist = node_zonelist(first_online_node,
-						  GFP_KERNEL);
+	struct zonelist *zonelist;
 
+	if (mem_cgroup_oom_synchronize())
+		return;
+
+	zonelist = node_zonelist(first_online_node, GFP_KERNEL);
 	if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
 		out_of_memory(NULL, 0, 0, NULL, false);
 		clear_zonelist_oom(zonelist, GFP_KERNEL);
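
Taken together, a memcg charge failure in a user fault now unwinds all
the way to the arch fault handler before anyone sleeps. Roughly how the
pieces connect, as a paraphrased arch-handler tail (illustrative, not
taken from this patch):

	fault = handle_mm_fault(mm, vma, address, flags);
	up_read(&mm->mmap_sem);

	if (fault & VM_FAULT_OOM) {
		/*
		 * Call stack unwound, mmap_sem released: it is now safe
		 * for pagefault_out_of_memory() to finalize a recorded
		 * memcg OOM, falling back to the global OOM killer only
		 * when no memcg OOM context exists.
		 */
		pagefault_out_of_memory();
		return;
	}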