author     Michal Hocko <mhocko@suse.com>	2017-09-06 19:24:50 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>	2017-09-06 20:27:30 -0400
commit     cd04ae1e2dc8e3651b8c427ec1b9500c6eed7b90
tree       3285aa754d29c10b6ea6062eed62c65f091b0ff5 /mm
parent     d30561c56f4114f7d6595a40498ba364ffa6e28e
mm, oom: do not rely on TIF_MEMDIE for memory reserves access
For ages we have been relying on the TIF_MEMDIE thread flag to mark OOM
victims and then, among other things, to give these threads full access to
memory reserves. There are a few shortcomings of this implementation, though.

First of all, and the most serious one, is that full access to memory
reserves is quite dangerous because we leave no safety room for the system to
operate and potentially do last emergency steps to move on.

Secondly, this flag is per task_struct while the OOM killer operates on
mm_struct granularity, so all processes sharing the given mm are killed.
Giving full access to all of these task_structs could lead to a quick
depletion of the memory reserves. We have tried to reduce this risk by giving
TIF_MEMDIE only to the main thread and the currently allocating task, but
that doesn't really solve the problem while it surely opens up room for
corner cases - e.g. GFP_NO{FS,IO} requests might loop inside the allocator
without access to memory reserves because a particular thread was not the
group leader.

Now that we have the oom reaper and all oom victims are reapable after
1b51e65eab64 ("oom, oom_reaper: allow to reap mm shared by the kthreads"), we
can be more conservative and grant only partial access to memory reserves
because there is a reasonable chance that memory is being freed in parallel.
We still want some access to the reserves because we do not want other
consumers to eat up the victim's freed memory. OOM victims will still contend
with __GFP_HIGH users, but those shouldn't be so aggressive as to starve oom
victims completely.

Introduce the ALLOC_OOM flag and give all tsk_is_oom_victim tasks access to
half of the reserves. This makes the access to reserves independent of which
task has passed through mark_oom_victim. Also drop any usage of TIF_MEMDIE
from the page allocator proper and replace it by tsk_is_oom_victim as well,
which finally makes page_alloc.c completely TIF_MEMDIE free.

CONFIG_MMU=n doesn't have the oom reaper, so let's stick to the original
ALLOC_NO_WATERMARKS approach there.

There is a demand to make the oom killer memcg aware, which will imply many
tasks killed at once. This change will allow such a usecase without worrying
about complete depletion of the memory reserves.

Link: http://lkml.kernel.org/r/20170810075019.28998-2-mhocko@kernel.org
Signed-off-by: Michal Hocko <mhocko@suse.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: David Rientjes <rientjes@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Roman Gushchin <guro@fb.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
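[Editor's illustration] To make the size of the concession concrete, the
following is a minimal, self-contained userspace sketch (not kernel code) of
the arithmetic the patch applies to the min watermark in
__zone_watermark_ok(): an ALLOC_OOM victim may dip half-way into the reserve
below the watermark, an ordinary ALLOC_HARDER user only a quarter, and an
ALLOC_NO_WATERMARKS request ignores the watermark entirely. The
effective_min() helper is hypothetical and exists only for illustration; the
flag values mirror mm/internal.h.

/* Illustrative sketch only - not kernel code. */
#include <stdio.h>

#define ALLOC_NO_WATERMARKS	0x04	/* values mirror mm/internal.h */
#define ALLOC_OOM		0x08
#define ALLOC_HARDER		0x10

/* Hypothetical helper: how far below the min watermark may this request go? */
static long effective_min(long min, unsigned int alloc_flags)
{
	if (alloc_flags & ALLOC_NO_WATERMARKS)
		return 0;		/* full access to the reserves */
	if (alloc_flags & ALLOC_OOM)
		return min - min / 2;	/* oom victim: half of the reserves */
	if (alloc_flags & ALLOC_HARDER)
		return min - min / 4;	/* __GFP_HIGH/atomic: a quarter */
	return min;			/* normal request: no concession */
}

int main(void)
{
	long min = 1024;		/* made-up watermark, in pages */

	printf("normal       : %ld\n", effective_min(min, 0));
	printf("ALLOC_HARDER : %ld\n", effective_min(min, ALLOC_HARDER));
	printf("ALLOC_OOM    : %ld\n", effective_min(min, ALLOC_OOM));
	printf("no watermarks: %ld\n", effective_min(min, ALLOC_NO_WATERMARKS));
	return 0;
}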
Diffstat (limited to 'mm')
-rw-r--r--	mm/internal.h	 11
-rw-r--r--	mm/oom_kill.c	  9
-rw-r--r--	mm/page_alloc.c	 76
3 files changed, 73 insertions(+), 23 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index 781c0d54d75a..1df011f62480 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -480,6 +480,17 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
 /* Mask to get the watermark bits */
 #define ALLOC_WMARK_MASK	(ALLOC_NO_WATERMARKS-1)
 
+/*
+ * Only MMU archs have async oom victim reclaim - aka oom_reaper so we
+ * cannot assume a reduced access to memory reserves is sufficient for
+ * !MMU
+ */
+#ifdef CONFIG_MMU
+#define ALLOC_OOM		0x08
+#else
+#define ALLOC_OOM		ALLOC_NO_WATERMARKS
+#endif
+
 #define ALLOC_HARDER		0x10 /* try to alloc harder */
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9e8b4f030c1c..c9f3569a76c7 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -824,7 +824,8 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
-	 * its children or threads, just set TIF_MEMDIE so it can die quickly
+	 * its children or threads, just give it access to memory reserves
+	 * so it can die quickly
 	 */
 	task_lock(p);
 	if (task_will_free_mem(p)) {
@@ -889,9 +890,9 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
 	count_memcg_event_mm(mm, OOM_KILL);
 
 	/*
-	 * We should send SIGKILL before setting TIF_MEMDIE in order to prevent
-	 * the OOM victim from depleting the memory reserves from the user
-	 * space under its control.
+	 * We should send SIGKILL before granting access to memory reserves
+	 * in order to prevent the OOM victim from depleting the memory
+	 * reserves from the user space under its control.
 	 */
 	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
 	mark_oom_victim(victim);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a4562c058ec4..a9add06fe768 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2951,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 {
 	long min = mark;
 	int o;
-	const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
+	const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
 
 	/* free_pages may go negative - that's OK */
 	free_pages -= (1 << order) - 1;
@@ -2964,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
 	 * the high-atomic reserves. This will over-estimate the size of the
 	 * atomic reserve but it avoids a search.
 	 */
-	if (likely(!alloc_harder))
+	if (likely(!alloc_harder)) {
 		free_pages -= z->nr_reserved_highatomic;
-	else
-		min -= min / 4;
+	} else {
+		/*
+		 * OOM victims can try even harder than normal ALLOC_HARDER
+		 * users on the grounds that it's definitely going to be in
+		 * the exit path shortly and free memory. Any allocation it
+		 * makes during the free path will be small and short-lived.
+		 */
+		if (alloc_flags & ALLOC_OOM)
+			min -= min / 2;
+		else
+			min -= min / 4;
+	}
+
 
 #ifdef CONFIG_CMA
 	/* If allocation can't use CMA areas don't use free CMA pages */
@@ -3205,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
 	 * of allowed nodes.
 	 */
 	if (!(gfp_mask & __GFP_NOMEMALLOC))
-		if (test_thread_flag(TIF_MEMDIE) ||
+		if (tsk_is_oom_victim(current) ||
 		    (current->flags & (PF_MEMALLOC | PF_EXITING)))
 			filter &= ~SHOW_MEM_FILTER_NODES;
 	if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3668,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 	return alloc_flags;
 }
 
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+static bool oom_reserves_allowed(struct task_struct *tsk)
 {
-	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+	if (!tsk_is_oom_victim(tsk))
+		return false;
+
+	/*
+	 * !MMU doesn't have oom reaper so give access to memory reserves
+	 * only to the thread with TIF_MEMDIE set
+	 */
+	if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
 		return false;
 
+	return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims which can live with a portion of it
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+	if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+		return 0;
 	if (gfp_mask & __GFP_MEMALLOC)
-		return true;
+		return ALLOC_NO_WATERMARKS;
 	if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
-		return true;
-	if (!in_interrupt() &&
-			((current->flags & PF_MEMALLOC) ||
-			 unlikely(test_thread_flag(TIF_MEMDIE))))
-		return true;
+		return ALLOC_NO_WATERMARKS;
+	if (!in_interrupt()) {
+		if (current->flags & PF_MEMALLOC)
+			return ALLOC_NO_WATERMARKS;
+		else if (oom_reserves_allowed(current))
+			return ALLOC_OOM;
+	}
 
-	return false;
+	return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+	return !!__gfp_pfmemalloc_flags(gfp_mask);
 }
 
 /*
@@ -3835,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	unsigned long alloc_start = jiffies;
 	unsigned int stall_timeout = 10 * HZ;
 	unsigned int cpuset_mems_cookie;
+	int reserve_flags;
 
 	/*
 	 * In the slowpath, we sanity check order to avoid ever trying to
@@ -3940,15 +3977,16 @@ retry:
 	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
 		wake_all_kswapds(order, ac);
 
-	if (gfp_pfmemalloc_allowed(gfp_mask))
-		alloc_flags = ALLOC_NO_WATERMARKS;
+	reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+	if (reserve_flags)
+		alloc_flags = reserve_flags;
 
 	/*
 	 * Reset the zonelist iterators if memory policies can be ignored.
 	 * These allocations are high priority and system rather than user
 	 * orientated.
 	 */
-	if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
+	if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
 		ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
 		ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
 					ac->high_zoneidx, ac->nodemask);
@@ -4025,8 +4063,8 @@ retry:
 		goto got_pg;
 
 	/* Avoid allocations with no watermarks from looping endlessly */
-	if (test_thread_flag(TIF_MEMDIE) &&
-	    (alloc_flags == ALLOC_NO_WATERMARKS ||
+	if (tsk_is_oom_victim(current) &&
+	    (alloc_flags == ALLOC_OOM ||
 	     (gfp_mask & __GFP_NOMEMALLOC)))
 		goto nopage;
 