-rw-r--r--  include/linux/memcontrol.h |  21
-rw-r--r--  include/linux/sched.h      |   4
-rw-r--r--  mm/memcontrol.c            | 154
-rw-r--r--  mm/memory.c                |   3
-rw-r--r--  mm/oom_kill.c              |   7
5 files changed, 140 insertions(+), 49 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 34ac6497d01a..89d576cfcc4c 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -157,6 +157,10 @@ extern void mem_cgroup_replace_page_cache(struct page *oldpage,
  *
  * Toggle whether a failed memcg charge should invoke the OOM killer
  * or just return -ENOMEM. Returns the previous toggle state.
+ *
+ * NOTE: Any path that enables the OOM killer before charging must
+ * call mem_cgroup_oom_synchronize() afterward to finalize the
+ * OOM handling and clean up.
  */
 static inline bool mem_cgroup_toggle_oom(bool new)
 {
@@ -182,6 +186,13 @@ static inline void mem_cgroup_disable_oom(void)
         WARN_ON(old == false);
 }
 
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+        return p->memcg_oom.in_memcg_oom;
+}
+
+bool mem_cgroup_oom_synchronize(void);
+
 #ifdef CONFIG_MEMCG_SWAP
 extern int do_swap_account;
 #endif
@@ -427,6 +438,16 @@ static inline void mem_cgroup_disable_oom(void)
 {
 }
 
+static inline bool task_in_memcg_oom(struct task_struct *p)
+{
+        return false;
+}
+
+static inline bool mem_cgroup_oom_synchronize(void)
+{
+        return false;
+}
+
 static inline void mem_cgroup_inc_page_stat(struct page *page,
                                             enum mem_cgroup_page_stat_item idx)
 {
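The NOTE added to memcontrol.h above describes a calling contract: whoever enables memcg OOM handling before a charge must call mem_cgroup_oom_synchronize() once the fault has unwound. As an illustration only (not part of the patch, using made-up user-space stand-ins rather than kernel APIs), a minimal sketch of that pairing:

#include <stdbool.h>
#include <stdio.h>

/* Minimal stand-ins for the kernel state used by the declarations above. */
struct task {
        bool may_oom;        /* memcg OOM handling enabled for this task */
        bool in_memcg_oom;   /* a memcg OOM was hit and recorded */
};

static struct task current_task;

/* Model of a failing charge: with OOM handling enabled, the failure is
 * only recorded in the task; nothing sleeps in the charge context. */
static bool charge(bool fails)
{
        if (fails && current_task.may_oom)
                current_task.in_memcg_oom = true;
        return !fails;
}

/* Model of mem_cgroup_oom_synchronize(): consume the recorded state. */
static bool memcg_oom_synchronize(void)
{
        if (!current_task.in_memcg_oom)
                return false;
        /* ...this is where the real code may put the task to sleep... */
        current_task.in_memcg_oom = false;
        return true;
}

int main(void)
{
        bool ok;

        current_task.may_oom = true;    /* "enable the OOM killer"    */
        ok = charge(true);              /* the charge fails under OOM */
        current_task.may_oom = false;   /* leaving the charge section */
        if (!ok)                        /* fault exit: synchronize    */
                printf("synchronized: %d\n", memcg_oom_synchronize());
        return 0;
}

The point of the model is only the ordering: the charge itself never sleeps, it records state that the very end of the fault is obliged to consume.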
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9ce1fa53031f..6682da36b293 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1395,6 +1395,10 @@ struct task_struct {
         unsigned int memcg_kmem_skip_account;
         struct memcg_oom_info {
                 unsigned int may_oom:1;
+                unsigned int in_memcg_oom:1;
+                unsigned int oom_locked:1;
+                int wakeups;
+                struct mem_cgroup *wait_on_memcg;
         } memcg_oom;
 #endif
 #ifdef CONFIG_UPROBES
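For orientation, the new fields divide into state written by the charge path and state consumed at fault exit. The annotated copy below is editorial commentary, not additional patch content:

struct mem_cgroup;      /* opaque here; defined in mm/memcontrol.c */

struct memcg_oom_info {
        unsigned int may_oom:1;      /* fault path opted in to memcg OOM handling    */
        unsigned int in_memcg_oom:1; /* a charge hit memcg OOM; synchronize later    */
        unsigned int oom_locked:1;   /* this task took the per-hierarchy OOM lock    */
        int wakeups;                 /* memcg->oom_wakeups snapshot from charge time */
        struct mem_cgroup *wait_on_memcg; /* memcg to wait on; holds a css reference */
};

wait_on_memcg in particular carries a css reference (taken with css_get() in mm/memcontrol.c below), which is why the state must always be torn down through mem_cgroup_oom_synchronize().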
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 04250cbf46c6..4b5cfb509270 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -255,6 +255,7 @@ struct mem_cgroup {
 
         bool oom_lock;
         atomic_t under_oom;
+        atomic_t oom_wakeups;
 
         int swappiness;
         /* OOM-Killer disable */
@@ -2020,6 +2021,7 @@ static int memcg_oom_wake_function(wait_queue_t *wait,
 
 static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 {
+        atomic_inc(&memcg->oom_wakeups);
         /* for filtering, pass "memcg" as argument. */
         __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 }
@@ -2031,19 +2033,17 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 }
 
 /*
- * try to call OOM killer. returns false if we should exit memory-reclaim loop.
+ * try to call OOM killer
  */
-static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
-                                  int order)
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 {
-        struct oom_wait_info owait;
         bool locked;
+        int wakeups;
 
-        owait.memcg = memcg;
-        owait.wait.flags = 0;
-        owait.wait.func = memcg_oom_wake_function;
-        owait.wait.private = current;
-        INIT_LIST_HEAD(&owait.wait.task_list);
+        if (!current->memcg_oom.may_oom)
+                return;
+
+        current->memcg_oom.in_memcg_oom = 1;
 
         /*
          * As with any blocking lock, a contender needs to start
@@ -2051,12 +2051,8 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
          * otherwise it can miss the wakeup from the unlock and sleep
          * indefinitely. This is just open-coded because our locking
          * is so particular to memcg hierarchies.
-         *
-         * Even if signal_pending(), we can't quit charge() loop without
-         * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
-         * under OOM is always welcomed, use TASK_KILLABLE here.
          */
-        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+        wakeups = atomic_read(&memcg->oom_wakeups);
         mem_cgroup_mark_under_oom(memcg);
 
         locked = mem_cgroup_oom_trylock(memcg);
@@ -2066,15 +2062,95 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
 
         if (locked && !memcg->oom_kill_disable) {
                 mem_cgroup_unmark_under_oom(memcg);
-                finish_wait(&memcg_oom_waitq, &owait.wait);
                 mem_cgroup_out_of_memory(memcg, mask, order);
+                mem_cgroup_oom_unlock(memcg);
+                /*
+                 * There is no guarantee that an OOM-lock contender
+                 * sees the wakeups triggered by the OOM kill
+                 * uncharges. Wake any sleepers explicitly.
+                 */
+                memcg_oom_recover(memcg);
         } else {
-                schedule();
-                mem_cgroup_unmark_under_oom(memcg);
-                finish_wait(&memcg_oom_waitq, &owait.wait);
+                /*
+                 * A system call can just return -ENOMEM, but if this
+                 * is a page fault and somebody else is handling the
+                 * OOM already, we need to sleep on the OOM waitqueue
+                 * for this memcg until the situation is resolved.
+                 * Which can take some time because it might be
+                 * handled by a userspace task.
+                 *
+                 * However, this is the charge context, which means
+                 * that we may sit on a large call stack and hold
+                 * various filesystem locks, the mmap_sem etc. and we
+                 * don't want the OOM handler to deadlock on them
+                 * while we sit here and wait. Store the current OOM
+                 * context in the task_struct, then return -ENOMEM.
+                 * At the end of the page fault handler, with the
+                 * stack unwound, pagefault_out_of_memory() will check
+                 * back with us by calling
+                 * mem_cgroup_oom_synchronize(), possibly putting the
+                 * task to sleep.
+                 */
+                current->memcg_oom.oom_locked = locked;
+                current->memcg_oom.wakeups = wakeups;
+                css_get(&memcg->css);
+                current->memcg_oom.wait_on_memcg = memcg;
         }
+}
+
+/**
+ * mem_cgroup_oom_synchronize - complete memcg OOM handling
+ *
+ * This has to be called at the end of a page fault if the memcg
+ * OOM handler was enabled and the fault is returning %VM_FAULT_OOM.
+ *
+ * Memcg supports userspace OOM handling, so failed allocations must
+ * sleep on a waitqueue until the userspace task resolves the
+ * situation. Sleeping directly in the charge context with all kinds
+ * of locks held is not a good idea, instead we remember an OOM state
+ * in the task and mem_cgroup_oom_synchronize() has to be called at
+ * the end of the page fault to put the task to sleep and clean up the
+ * OOM state.
+ *
+ * Returns %true if an ongoing memcg OOM situation was detected and
+ * finalized, %false otherwise.
+ */
+bool mem_cgroup_oom_synchronize(void)
+{
+        struct oom_wait_info owait;
+        struct mem_cgroup *memcg;
+
+        /* OOM is global, do not handle */
+        if (!current->memcg_oom.in_memcg_oom)
+                return false;
+
+        /*
+         * We invoked the OOM killer but there is a chance that a kill
+         * did not free up any charges. Everybody else might already
+         * be sleeping, so restart the fault and keep the rampage
+         * going until some charges are released.
+         */
+        memcg = current->memcg_oom.wait_on_memcg;
+        if (!memcg)
+                goto out;
+
+        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+                goto out_memcg;
+
+        owait.memcg = memcg;
+        owait.wait.flags = 0;
+        owait.wait.func = memcg_oom_wake_function;
+        owait.wait.private = current;
+        INIT_LIST_HEAD(&owait.wait.task_list);
 
-        if (locked) {
+        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+        /* Only sleep if we didn't miss any wakeups since OOM */
+        if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups)
+                schedule();
+        finish_wait(&memcg_oom_waitq, &owait.wait);
+out_memcg:
+        mem_cgroup_unmark_under_oom(memcg);
+        if (current->memcg_oom.oom_locked) {
                 mem_cgroup_oom_unlock(memcg);
                 /*
                  * There is no guarantee that an OOM-lock contender
@@ -2083,11 +2159,10 @@ static bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask,
                  */
                 memcg_oom_recover(memcg);
         }
-
-        if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
-                return false;
-        /* Give chance to dying process */
-        schedule_timeout_uninterruptible(1);
+        css_put(&memcg->css);
+        current->memcg_oom.wait_on_memcg = NULL;
+out:
+        current->memcg_oom.in_memcg_oom = 0;
         return true;
 }
 
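The wakeups snapshot taken in mem_cgroup_oom() and re-checked in mem_cgroup_oom_synchronize() is a lost-wakeup guard: between recording the OOM in the charge context and actually sleeping at the end of the fault, an uncharge may already have woken the waitqueue, and sleeping then would wait for an event that will not repeat. A hedged user-space model of just that counter check (stand-in names, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for memcg->oom_wakeups: bumped by every memcg_wakeup_oom(). */
static atomic_int oom_wakeups;

/* Charge path (mem_cgroup_oom() above): record the counter before
 * unwinding out of the charge context. */
static int snapshot_wakeups(void)
{
        return atomic_load(&oom_wakeups);
}

/* Fault exit (mem_cgroup_oom_synchronize() above): only sleep if no
 * wakeup happened in the meantime, otherwise the event we would wait
 * for has already come and gone. */
static bool should_sleep(int recorded)
{
        return atomic_load(&oom_wakeups) == recorded;
}

int main(void)
{
        int recorded = snapshot_wakeups();  /* taken at charge time       */

        atomic_fetch_add(&oom_wakeups, 1);  /* an uncharge woke the queue */
        printf("sleep? %s\n", should_sleep(recorded) ? "yes" : "no");
        return 0;
}

In the kernel the sleep itself happens on memcg_oom_waitq in TASK_KILLABLE, as in the hunk above; the model only shows the comparison that decides whether to schedule() at all.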
@@ -2400,12 +2475,11 @@ enum {
         CHARGE_RETRY,           /* need to retry but retry is not bad */
         CHARGE_NOMEM,           /* we can't do more. return -ENOMEM */
         CHARGE_WOULDBLOCK,      /* GFP_WAIT wasn't set and no enough res. */
-        CHARGE_OOM_DIE,         /* the current is killed because of OOM */
 };
 
 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                                 unsigned int nr_pages, unsigned int min_pages,
-                                bool oom_check)
+                                bool invoke_oom)
 {
         unsigned long csize = nr_pages * PAGE_SIZE;
         struct mem_cgroup *mem_over_limit;
@@ -2462,14 +2536,10 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
         if (mem_cgroup_wait_acct_move(mem_over_limit))
                 return CHARGE_RETRY;
 
-        /* If we don't need to call oom-killer at el, return immediately */
-        if (!oom_check || !current->memcg_oom.may_oom)
-                return CHARGE_NOMEM;
-        /* check OOM */
-        if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask, get_order(csize)))
-                return CHARGE_OOM_DIE;
+        if (invoke_oom)
+                mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
 
-        return CHARGE_RETRY;
+        return CHARGE_NOMEM;
 }
 
 /*
@@ -2572,7 +2642,7 @@ again:
         }
 
         do {
-                bool oom_check;
+                bool invoke_oom = oom && !nr_oom_retries;
 
                 /* If killed, bypass charge */
                 if (fatal_signal_pending(current)) {
@@ -2580,14 +2650,8 @@ again:
                         goto bypass;
                 }
 
-                oom_check = false;
-                if (oom && !nr_oom_retries) {
-                        oom_check = true;
-                        nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-                }
-
-                ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, nr_pages,
-                                           oom_check);
+                ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
+                                           nr_pages, invoke_oom);
                 switch (ret) {
                 case CHARGE_OK:
                         break;
@@ -2600,16 +2664,12 @@ again:
                         css_put(&memcg->css);
                         goto nomem;
                 case CHARGE_NOMEM: /* OOM routine works */
-                        if (!oom) {
+                        if (!oom || invoke_oom) {
                                 css_put(&memcg->css);
                                 goto nomem;
                         }
-                        /* If oom, we never return -ENOMEM */
                         nr_oom_retries--;
                         break;
-                case CHARGE_OOM_DIE: /* Killed by OOM Killer */
-                        css_put(&memcg->css);
-                        goto bypass;
                 }
         } while (ret != CHARGE_OK);
 
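With CHARGE_OOM_DIE gone, the retry loop has a simpler shape: only the last attempt (invoke_oom = oom && !nr_oom_retries) is allowed to invoke the OOM killer, and once it has, CHARGE_NOMEM ends the loop and the recorded per-task OOM state is dealt with after the fault. The following is an editorial user-space sketch of that loop, with do_charge() as a placeholder rather than the kernel function:

#include <stdbool.h>
#include <stdio.h>

enum { CHARGE_OK, CHARGE_RETRY, CHARGE_NOMEM };

/* Placeholder charge: a couple of harmless retries, then hard failure. */
static int do_charge(bool invoke_oom, int attempt)
{
        if (invoke_oom)
                puts("last attempt: OOM killer invoked, task state recorded");
        return attempt < 2 ? CHARGE_RETRY : CHARGE_NOMEM;
}

int main(void)
{
        bool oom = true;            /* the caller allows OOM handling          */
        int nr_oom_retries = 2;     /* stand-in for MEM_CGROUP_RECLAIM_RETRIES */
        int attempt = 0;
        int ret;

        do {
                /* Only the final attempt may invoke the OOM killer. */
                bool invoke_oom = oom && !nr_oom_retries;

                ret = do_charge(invoke_oom, attempt++);
                if (ret == CHARGE_NOMEM) {
                        if (!oom || invoke_oom) {
                                /* OOM disabled, or already invoked: -ENOMEM */
                                printf("charge failed after %d attempts\n", attempt);
                                return 1;
                        }
                        nr_oom_retries--;   /* burn one retry and loop again */
                }
        } while (ret != CHARGE_OK);

        puts("charged");
        return 0;
}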
diff --git a/mm/memory.c b/mm/memory.c
index a8f9deab8719..5ec6f199e685 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3867,6 +3867,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         if (flags & FAULT_FLAG_USER)
                 mem_cgroup_disable_oom();
 
+        if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)))
+                mem_cgroup_oom_synchronize();
+
         return ret;
 }
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 98e75f2ac7bc..314e9d274381 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -678,9 +678,12 @@ out:
  */
 void pagefault_out_of_memory(void)
 {
-        struct zonelist *zonelist = node_zonelist(first_online_node,
-                                                  GFP_KERNEL);
+        struct zonelist *zonelist;
 
+        if (mem_cgroup_oom_synchronize())
+                return;
+
+        zonelist = node_zonelist(first_online_node, GFP_KERNEL);
         if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
                 out_of_memory(NULL, 0, 0, NULL, false);
                 clear_zonelist_oom(zonelist, GFP_KERNEL);
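pagefault_out_of_memory() thus becomes a two-step dispatcher: give a pending memcg OOM a chance to be synchronized first (possibly putting the task to sleep), and only fall back to the global OOM killer when no memcg OOM was recorded for the faulting task. A small user-space sketch of that ordering (global_oom() and the boolean argument are illustrative placeholders, not kernel interfaces):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for mem_cgroup_oom_synchronize(): true when a memcg OOM was
 * pending for the faulting task and has now been dealt with. */
static bool memcg_oom_synchronize(bool memcg_oom_pending)
{
        return memcg_oom_pending;
}

/* Stand-in for the zonelist-locked global OOM path. */
static void global_oom(void)
{
        puts("global OOM killer invoked");
}

static void pagefault_oom(bool memcg_oom_pending)
{
        /* Memcg OOM is per-hierarchy; when it caused the fault, don't
         * punish the rest of the system with a global kill. */
        if (memcg_oom_synchronize(memcg_oom_pending))
                return;
        global_oom();
}

int main(void)
{
        pagefault_oom(true);    /* handled by memcg, no global kill     */
        pagefault_oom(false);   /* falls through to the global OOM path */
        return 0;
}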