From 309ed882508cc471320ff79265e7340774d6746c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:18:54 -0700 Subject: oom: extract panic helper function There are various points in the oom killer where the kernel must determine whether to panic or not. It's better to extract this to a helper function to remove all the confusion as to its semantics. Also fix a call to dump_header() where tasklist_lock is not read- locked, as required. There's no functional change with this patch. Acked-by: KOSAKI Motohiro Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux/oom.h') diff --git a/include/linux/oom.h b/include/linux/oom.h index 537662315627..3ae6d94d0540 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -22,6 +22,7 @@ enum oom_constraint { CONSTRAINT_NONE, CONSTRAINT_CPUSET, CONSTRAINT_MEMORY_POLICY, + CONSTRAINT_MEMCG, }; extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags); -- cgit v1.2.2 From 8e4228e1edb922afa83366803a1e5b3fa8e987c2 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:18:56 -0700 Subject: oom: move sysctl declarations to oom.h The three oom killer sysctl variables (sysctl_oom_dump_tasks, sysctl_oom_kill_allocating_task, and sysctl_panic_on_oom) are better declared in include/linux/oom.h rather than kernel/sysctl.c. Signed-off-by: David Rientjes Acked-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux/oom.h') diff --git a/include/linux/oom.h b/include/linux/oom.h index 3ae6d94d0540..1a8e407a1a53 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -44,5 +44,10 @@ static inline void oom_killer_enable(void) { oom_killer_disabled = false; } + +/* sysctls */ +extern int sysctl_oom_dump_tasks; +extern int sysctl_oom_kill_allocating_task; +extern int sysctl_panic_on_oom; #endif /* __KERNEL__*/ #endif /* _INCLUDE_LINUX_OOM_H */ -- cgit v1.2.2 From ff321feac22313cf53ffceb69224b09ac19ff22b Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 9 Aug 2010 17:18:57 -0700 Subject: mm: rename try_set_zone_oom() to try_set_zonelist_oom() We have been used naming try_set_zone_oom and clear_zonelist_oom. The role of functions is to lock of zonelist for preventing parallel OOM. So clear_zonelist_oom makes sense but try_set_zone_oome is rather awkward and unmatched with clear_zonelist_oom. Let's change it with try_set_zonelist_oom. Signed-off-by: Minchan Kim Acked-by: David Rientjes Reviewed-by: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux/oom.h') diff --git a/include/linux/oom.h b/include/linux/oom.h index 1a8e407a1a53..9d7c34a741e7 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -25,7 +25,7 @@ enum oom_constraint { CONSTRAINT_MEMCG, }; -extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, -- cgit v1.2.2 From 74bcbf40546bb7500f2a7ba4ff3cc056a6bd004a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 9 Aug 2010 17:19:43 -0700 Subject: oom: move badness() declaration into oom.h Cc: Minchan Kim Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux/oom.h') diff --git a/include/linux/oom.h b/include/linux/oom.h index 9d7c34a741e7..40e5e3a6bc20 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -14,6 +14,8 @@ struct zonelist; struct notifier_block; +struct mem_cgroup; +struct task_struct; /* * Types of limitations to the nodes from which allocations may occur @@ -45,6 +47,10 @@ static inline void oom_killer_enable(void) oom_killer_disabled = false; } +/* The badness from the OOM killer */ +extern unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, + const nodemask_t *nodemask, unsigned long uptime); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; -- cgit v1.2.2 From a63d83f427fbce97a6cea0db2e64b0eb8435cd10 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:19:46 -0700 Subject: oom: badness heuristic rewrite This a complete rewrite of the oom killer's badness() heuristic which is used to determine which task to kill in oom conditions. The goal is to make it as simple and predictable as possible so the results are better understood and we end up killing the task which will lead to the most memory freeing while still respecting the fine-tuning from userspace. Instead of basing the heuristic on mm->total_vm for each task, the task's rss and swap space is used instead. This is a better indication of the amount of memory that will be freeable if the oom killed task is chosen and subsequently exits. This helps specifically in cases where KDE or GNOME is chosen for oom kill on desktop systems instead of a memory hogging task. The baseline for the heuristic is a proportion of memory that each task is currently using in memory plus swap compared to the amount of "allowable" memory. "Allowable," in this sense, means the system-wide resources for unconstrained oom conditions, the set of mempolicy nodes, the mems attached to current's cpuset, or a memory controller's limit. The proportion is given on a scale of 0 (never kill) to 1000 (always kill), roughly meaning that if a task has a badness() score of 500 that the task consumes approximately 50% of allowable memory resident in RAM or in swap space. The proportion is always relative to the amount of "allowable" memory and not the total amount of RAM systemwide so that mempolicies and cpusets may operate in isolation; they shall not need to know the true size of the machine on which they are running if they are bound to a specific set of nodes or mems, respectively. Root tasks are given 3% extra memory just like __vm_enough_memory() provides in LSMs. In the event of two tasks consuming similar amounts of memory, it is generally better to save root's task. Because of the change in the badness() heuristic's baseline, it is also necessary to introduce a new user interface to tune it. It's not possible to redefine the meaning of /proc/pid/oom_adj with a new scale since the ABI cannot be changed for backward compatability. Instead, a new tunable, /proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may be used to polarize the heuristic such that certain tasks are never considered for oom kill while others may always be considered. The value is added directly into the badness() score so a value of -500, for example, means to discount 50% of its memory consumption in comparison to other tasks either on the system, bound to the mempolicy, in the cpuset, or sharing the same memory controller. /proc/pid/oom_adj is changed so that its meaning is rescaled into the units used by /proc/pid/oom_score_adj, and vice versa. Changing one of these per-task tunables will rescale the value of the other to an equivalent meaning. Although /proc/pid/oom_adj was originally defined as a bitshift on the badness score, it now shares the same linear growth as /proc/pid/oom_score_adj but with different granularity. This is required so the ABI is not broken with userspace applications and allows oom_adj to be deprecated for future removal. Signed-off-by: David Rientjes Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux/oom.h') diff --git a/include/linux/oom.h b/include/linux/oom.h index 40e5e3a6bc20..73b8d7b6dd19 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -1,14 +1,24 @@ #ifndef __INCLUDE_LINUX_OOM_H #define __INCLUDE_LINUX_OOM_H -/* /proc//oom_adj set to -17 protects from the oom-killer */ +/* + * /proc//oom_adj set to -17 protects from the oom-killer + */ #define OOM_DISABLE (-17) /* inclusive */ #define OOM_ADJUST_MIN (-16) #define OOM_ADJUST_MAX 15 +/* + * /proc//oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for + * pid. + */ +#define OOM_SCORE_ADJ_MIN (-1000) +#define OOM_SCORE_ADJ_MAX 1000 + #ifdef __KERNEL__ +#include #include #include @@ -27,6 +37,8 @@ enum oom_constraint { CONSTRAINT_MEMCG, }; +extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, + const nodemask_t *nodemask, unsigned long totalpages); extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); -- cgit v1.2.2 From 51b1bd2ace1595b72956224deda349efa880b693 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:19:47 -0700 Subject: oom: deprecate oom_adj tunable /proc/pid/oom_adj is now deprecated so that that it may eventually be removed. The target date for removal is August 2012. A warning will be printed to the kernel log if a task attempts to use this interface. Future warning will be suppressed until the kernel is rebooted to prevent spamming the kernel log. Signed-off-by: David Rientjes Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux/oom.h') diff --git a/include/linux/oom.h b/include/linux/oom.h index 73b8d7b6dd19..f209b683e118 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -2,6 +2,9 @@ #define __INCLUDE_LINUX_OOM_H /* + * /proc//oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. + * * /proc//oom_adj set to -17 protects from the oom-killer */ #define OOM_DISABLE (-17) -- cgit v1.2.2 From 158e0a2d1b3cffed8b46cbc56393a1394672ef79 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 10 Aug 2010 18:03:00 -0700 Subject: memcg: use find_lock_task_mm() in memory cgroups oom When the OOM killer scans task, it check a task is under memcg or not when it's called via memcg's context. But, as Oleg pointed out, a thread group leader may have NULL ->mm and task_in_mem_cgroup() may do wrong decision. We have to use find_lock_task_mm() in memcg as generic OOM-Killer does. Signed-off-by: KAMEZAWA Hiroyuki Cc: Oleg Nesterov Cc: Daisuke Nishimura Cc: Balbir Singh Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux/oom.h') diff --git a/include/linux/oom.h b/include/linux/oom.h index f209b683e118..5e3aa8311c5e 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -66,6 +66,8 @@ static inline void oom_killer_enable(void) extern unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, const nodemask_t *nodemask, unsigned long uptime); +extern struct task_struct *find_lock_task_mm(struct task_struct *p); + /* sysctls */ extern int sysctl_oom_dump_tasks; extern int sysctl_oom_kill_allocating_task; -- cgit v1.2.2