author	David Rientjes <rientjes@google.com>	2010-08-09 20:19:46 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-09 23:45:02 -0400
commit	a63d83f427fbce97a6cea0db2e64b0eb8435cd10 (patch)
tree	8ac229cdf6e2289d97e82e35774057106fe7f4a2
parent	74bcbf40546bb7500f2a7ba4ff3cc056a6bd004a (diff)
oom: badness heuristic rewrite
This is a complete rewrite of the oom killer's badness() heuristic, which is used to determine which task to kill in oom conditions.  The goal is to make it as simple and predictable as possible so the results are better understood and we end up killing the task which will lead to the most memory freeing while still respecting the fine-tuning from userspace.

Instead of basing the heuristic on mm->total_vm for each task, the task's rss and swap space are used.  This is a better indication of the amount of memory that will be freeable if the oom killed task is chosen and subsequently exits.  This helps specifically in cases where KDE or GNOME is chosen for oom kill on desktop systems instead of a memory hogging task.

The baseline for the heuristic is a proportion of memory that each task is currently using in memory plus swap compared to the amount of "allowable" memory.  "Allowable," in this sense, means the system-wide resources for unconstrained oom conditions, the set of mempolicy nodes, the mems attached to current's cpuset, or a memory controller's limit.  The proportion is given on a scale of 0 (never kill) to 1000 (always kill), roughly meaning that if a task has a badness() score of 500, the task consumes approximately 50% of allowable memory resident in RAM or in swap space.

The proportion is always relative to the amount of "allowable" memory and not the total amount of RAM systemwide so that mempolicies and cpusets may operate in isolation; they shall not need to know the true size of the machine on which they are running if they are bound to a specific set of nodes or mems, respectively.

Root tasks are given 3% extra memory just like __vm_enough_memory() provides in LSMs.  In the event of two tasks consuming similar amounts of memory, it is generally better to save root's task.

Because of the change in the badness() heuristic's baseline, it is also necessary to introduce a new user interface to tune it.  It's not possible to redefine the meaning of /proc/pid/oom_adj with a new scale since the ABI cannot be changed for backward compatibility.  Instead, a new tunable, /proc/pid/oom_score_adj, is added that ranges from -1000 to +1000.  It may be used to polarize the heuristic such that certain tasks are never considered for oom kill while others may always be considered.  The value is added directly into the badness() score so a value of -500, for example, means to discount 50% of its memory consumption in comparison to other tasks either on the system, bound to the mempolicy, in the cpuset, or sharing the same memory controller.

/proc/pid/oom_adj is changed so that its meaning is rescaled into the units used by /proc/pid/oom_score_adj, and vice versa.  Changing one of these per-task tunables will rescale the value of the other to an equivalent meaning.  Although /proc/pid/oom_adj was originally defined as a bitshift on the badness score, it now shares the same linear growth as /proc/pid/oom_score_adj but with different granularity.  This is required so the ABI is not broken with userspace applications and allows oom_adj to be deprecated for future removal.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
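The scoring described above reduces to a few lines of integer arithmetic.  As an illustration only, here is a minimal userspace sketch of that math; it is not the kernel's oom_badness() (which additionally handles task locking, unkillable tasks, and PF_OOM_ORIGIN), and the sample figures are invented:

/*
 * Illustrative sketch of the rewritten badness heuristic; constants mirror
 * include/linux/oom.h from this patch.  Build: cc -o badness_demo badness_demo.c
 */
#include <stdio.h>

#define OOM_SCORE_ADJ_MIN	(-1000)
#define OOM_SCORE_ADJ_MAX	1000

/*
 * rss and swapents are the task's resident and swapped pages; totalpages is
 * the "allowable" memory for the oom context (system, cpuset, mempolicy
 * nodes, or memcg limit), also in pages.
 */
static unsigned int badness_sketch(unsigned long rss, unsigned long swapents,
				   unsigned long totalpages,
				   int oom_score_adj, int is_root)
{
	long points;

	if (oom_score_adj == OOM_SCORE_ADJ_MIN)
		return 0;			/* oom killing disabled */
	if (!totalpages)
		totalpages = 1;			/* avoid divide-by-zero */

	/* baseline: proportion of allowable memory used, scaled 0..1000 */
	points = (long)(rss + swapents) * 1000 / totalpages;

	/* root tasks get a 3% memory bonus, i.e. a 30-point discount */
	if (is_root)
		points -= 30;

	/* userspace polarization is added directly to the score */
	points += oom_score_adj;

	if (points < 0)
		return 0;
	return points < 1000 ? points : 1000;
}

int main(void)
{
	/* a task using half of a 2097152-page (8 GB) system, no adjustment */
	printf("%u\n", badness_sketch(1048576, 0, 2097152, 0, 0));    /* 500 */
	/* the same task discounted by oom_score_adj = -250 */
	printf("%u\n", badness_sketch(1048576, 0, 2097152, -250, 0)); /* 250 */
	return 0;
}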
-rw-r--r--	Documentation/filesystems/proc.txt	94
-rw-r--r--	fs/proc/base.c	94
-rw-r--r--	include/linux/memcontrol.h	8
-rw-r--r--	include/linux/oom.h	14
-rw-r--r--	include/linux/sched.h	3
-rw-r--r--	kernel/fork.c	1
-rw-r--r--	mm/memcontrol.c	18
-rw-r--r--	mm/oom_kill.c	259
8 files changed, 300 insertions, 191 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 8fe8895894d8..cf1295c2bb66 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -33,7 +33,8 @@ Table of Contents
  2	Modifying System Parameters
 
  3	Per-Process Parameters
-  3.1	/proc/<pid>/oom_adj - Adjust the oom-killer score
+  3.1	/proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer
+	score
   3.2	/proc/<pid>/oom_score - Display current oom-killer score
   3.3	/proc/<pid>/io - Display the IO accounting fields
   3.4	/proc/<pid>/coredump_filter - Core dump filtering settings
@@ -1234,42 +1235,61 @@ of the kernel.
 CHAPTER 3: PER-PROCESS PARAMETERS
 ------------------------------------------------------------------------------
 
-3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
-------------------------------------------------------
+3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer score
+--------------------------------------------------------------------------------
 
-This file can be used to adjust the score used to select which processes
-should be killed in an out-of-memory situation.  Giving it a high score will
-increase the likelihood of this process being killed by the oom-killer.  Valid
-values are in the range -16 to +15, plus the special value -17, which disables
-oom-killing altogether for this process.
-
-The process to be killed in an out-of-memory situation is selected among all others
-based on its badness score.  This value equals the original memory size of the process
-and is then updated according to its CPU time (utime + stime) and the
-run time (uptime - start time).  The longer it runs the smaller is the score.
-Badness score is divided by the square root of the CPU time and then by
-the double square root of the run time.
-
-Swapped out tasks are killed first.  Half of each child's memory size is added to
-the parent's score if they do not share the same memory.  Thus forking servers
-are the prime candidates to be killed.  Having only one 'hungry' child will make
-parent less preferable than the child.
-
-/proc/<pid>/oom_score shows process' current badness score.
-
-The following heuristics are then applied:
- * if the task was reniced, its score doubles
- * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE
-   or CAP_SYS_RAWIO) have their score divided by 4
- * if oom condition happened in one cpuset and checked process does not belong
-   to it, its score is divided by 8
- * the resulting score is multiplied by two to the power of oom_adj, i.e.
-   points <<= oom_adj when it is positive and
-   points >>= -(oom_adj) otherwise
-
-The task with the highest badness score is then selected and its children
-are killed, process itself will be killed in an OOM situation when it does
-not have children or some of them disabled oom like described above.
+These files can be used to adjust the badness heuristic used to select which
+process gets killed in out of memory conditions.
+
+The badness heuristic assigns a value to each candidate task ranging from 0
+(never kill) to 1000 (always kill) to determine which process is targeted.  The
+units are roughly a proportion along that range of allowed memory the process
+may allocate from based on an estimation of its current memory and swap use.
+For example, if a task is using all allowed memory, its badness score will be
+1000.  If it is using half of its allowed memory, its score will be 500.
+
+There is an additional factor included in the badness score: root
+processes are given 3% extra memory over other tasks.
+
+The amount of "allowed" memory depends on the context in which the oom killer
+was called.  If it is due to the memory assigned to the allocating task's cpuset
+being exhausted, the allowed memory represents the set of mems assigned to that
+cpuset.  If it is due to a mempolicy's node(s) being exhausted, the allowed
+memory represents the set of mempolicy nodes.  If it is due to a memory
+limit (or swap limit) being reached, the allowed memory is that configured
+limit.  Finally, if it is due to the entire system being out of memory, the
+allowed memory represents all allocatable resources.
+
+The value of /proc/<pid>/oom_score_adj is added to the badness score before it
+is used to determine which task to kill.  Acceptable values range from -1000
+(OOM_SCORE_ADJ_MIN) to +1000 (OOM_SCORE_ADJ_MAX).  This allows userspace to
+polarize the preference for oom killing either by always preferring a certain
+task or completely disabling it.  The lowest possible value, -1000, is
+equivalent to disabling oom killing entirely for that task since it will always
+report a badness score of 0.
+
+Consequently, it is very simple for userspace to define the amount of memory to
+consider for each task.  Setting a /proc/<pid>/oom_score_adj value of +500, for
+example, is roughly equivalent to allowing the remainder of tasks sharing the
+same system, cpuset, mempolicy, or memory controller resources to use at least
+50% more memory.  A value of -500, on the other hand, would be roughly
+equivalent to discounting 50% of the task's allowed memory from being considered
+as scoring against the task.
+
+For backwards compatibility with previous kernels, /proc/<pid>/oom_adj may also
+be used to tune the badness score.  Its acceptable values range from -16
+(OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17
+(OOM_DISABLE) to disable oom killing entirely for that task.  Its value is
+scaled linearly with /proc/<pid>/oom_score_adj.
+
+Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the
+other with its scaled value.
+
+Caveat: when a parent task is selected, the oom killer will sacrifice any first
+generation children with separate address spaces instead, if possible.  This
+prevents servers and important system daemons from being killed and loses the
+minimal amount of work.
+
 
 3.2 /proc/<pid>/oom_score - Display current oom-killer score
 -------------------------------------------------------------
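The rescaling documented above ("Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the other with its scaled value") can be observed directly.  A small illustrative check, not part of this patch; note that lowering oom_score_adj below its current value requires CAP_SYS_RESOURCE, so run it with privilege:

#include <stdio.h>

int main(void)
{
	FILE *f;
	int oom_adj = 0;

	f = fopen("/proc/self/oom_score_adj", "w");
	if (!f) {			/* kernel predates oom_score_adj */
		perror("oom_score_adj");
		return 1;
	}
	fprintf(f, "-500\n");		/* discount half of our memory use */
	fclose(f);

	f = fopen("/proc/self/oom_adj", "r");
	if (f) {
		fscanf(f, "%d", &oom_adj);
		fclose(f);
	}
	/* expect -500 * OOM_ADJUST_MAX / OOM_SCORE_ADJ_MAX = -7 */
	printf("oom_adj rescaled to %d\n", oom_adj);
	return 0;
}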
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5949d0ac30f2..f923b728388a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -63,6 +63,7 @@
 #include <linux/namei.h>
 #include <linux/mnt_namespace.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
 #include <linux/rcupdate.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
@@ -430,12 +431,11 @@ static const struct file_operations proc_lstats_operations = {
 static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long points = 0;
-	struct timespec uptime;
 
-	do_posix_clock_monotonic_gettime(&uptime);
 	read_lock(&tasklist_lock);
 	if (pid_alive(task))
-		points = badness(task, NULL, NULL, uptime.tv_sec);
+		points = oom_badness(task, NULL, NULL,
+					totalram_pages + total_swap_pages);
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
 }
@@ -1038,7 +1038,15 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
 	}
 
 	task->signal->oom_adj = oom_adjust;
-
+	/*
+	 * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+	 * value is always attainable.
+	 */
+	if (task->signal->oom_adj == OOM_ADJUST_MAX)
+		task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX;
+	else
+		task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) /
+							-OOM_DISABLE;
 	unlock_task_sighand(task, &flags);
 	put_task_struct(task);
 
@@ -1051,6 +1059,82 @@ static const struct file_operations proc_oom_adjust_operations = {
 	.llseek		= generic_file_llseek,
 };
 
+static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
+	char buffer[PROC_NUMBUF];
+	int oom_score_adj = OOM_SCORE_ADJ_MIN;
+	unsigned long flags;
+	size_t len;
+
+	if (!task)
+		return -ESRCH;
+	if (lock_task_sighand(task, &flags)) {
+		oom_score_adj = task->signal->oom_score_adj;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
+					size_t count, loff_t *ppos)
+{
+	struct task_struct *task;
+	char buffer[PROC_NUMBUF];
+	unsigned long flags;
+	long oom_score_adj;
+	int err;
+
+	memset(buffer, 0, sizeof(buffer));
+	if (count > sizeof(buffer) - 1)
+		count = sizeof(buffer) - 1;
+	if (copy_from_user(buffer, buf, count))
+		return -EFAULT;
+
+	err = strict_strtol(strstrip(buffer), 0, &oom_score_adj);
+	if (err)
+		return -EINVAL;
+	if (oom_score_adj < OOM_SCORE_ADJ_MIN ||
+			oom_score_adj > OOM_SCORE_ADJ_MAX)
+		return -EINVAL;
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+	if (!lock_task_sighand(task, &flags)) {
+		put_task_struct(task);
+		return -ESRCH;
+	}
+	if (oom_score_adj < task->signal->oom_score_adj &&
+			!capable(CAP_SYS_RESOURCE)) {
+		unlock_task_sighand(task, &flags);
+		put_task_struct(task);
+		return -EACCES;
+	}
+
+	task->signal->oom_score_adj = oom_score_adj;
+	/*
+	 * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is
+	 * always attainable.
+	 */
+	if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		task->signal->oom_adj = OOM_DISABLE;
+	else
+		task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) /
+							OOM_SCORE_ADJ_MAX;
+	unlock_task_sighand(task, &flags);
+	put_task_struct(task);
+	return count;
+}
+
+static const struct file_operations proc_oom_score_adj_operations = {
+	.read		= oom_score_adj_read,
+	.write		= oom_score_adj_write,
+};
+
 #ifdef CONFIG_AUDITSYSCALL
 #define TMPBUFLEN 21
 static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
@@ -2623,6 +2707,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 	INF("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid", S_IRUGO, proc_sessionid_operations),
@@ -2957,6 +3042,7 @@ static const struct pid_entry tid_base_stuff[] = {
 #endif
 	INF("oom_score", S_IRUGO, proc_oom_score),
 	REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations),
+	REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
 #ifdef CONFIG_AUDITSYSCALL
 	REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
 	REG("sessionid", S_IRUSR, proc_sessionid_operations),
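The two write handlers above keep the tunables coherent with simple linear scaling, special-casing each extreme so that OOM_SCORE_ADJ_MAX and OOM_DISABLE remain attainable despite integer truncation.  The same math outside the kernel, as a sketch:

#include <stdio.h>

#define OOM_DISABLE		(-17)
#define OOM_ADJUST_MAX		15
#define OOM_SCORE_ADJ_MIN	(-1000)
#define OOM_SCORE_ADJ_MAX	1000

static int adj_to_score_adj(int oom_adj)	/* as in oom_adjust_write() */
{
	if (oom_adj == OOM_ADJUST_MAX)
		return OOM_SCORE_ADJ_MAX;
	return oom_adj * OOM_SCORE_ADJ_MAX / -OOM_DISABLE;
}

static int score_adj_to_adj(int oom_score_adj)	/* as in oom_score_adj_write() */
{
	if (oom_score_adj == OOM_SCORE_ADJ_MIN)
		return OOM_DISABLE;
	return oom_score_adj * OOM_ADJUST_MAX / OOM_SCORE_ADJ_MAX;
}

int main(void)
{
	printf("%d\n", adj_to_score_adj(8));		/* 8 * 1000 / 17 = 470 */
	printf("%d\n", adj_to_score_adj(OOM_DISABLE));	/* -17 * 1000 / 17 = -1000 */
	printf("%d\n", score_adj_to_adj(-500));		/* -500 * 15 / 1000 = -7 */
	printf("%d\n", score_adj_to_adj(470));		/* 470 * 15 / 1000 = 7 */
	return 0;
}

Note that the round trip is lossy (8 -> 470 -> 7) because the two scales have different granularity.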
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9f1afd361583..73564cac38c7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -125,6 +125,8 @@ void mem_cgroup_update_file_mapped(struct page *page, int val);
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						gfp_t gfp_mask, int nid,
 						int zid);
+u64 mem_cgroup_get_limit(struct mem_cgroup *mem);
+
 #else /* CONFIG_CGROUP_MEM_RES_CTLR */
 struct mem_cgroup;
 
@@ -304,6 +306,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 	return 0;
 }
 
+static inline
+u64 mem_cgroup_get_limit(struct mem_cgroup *mem)
+{
+	return 0;
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 40e5e3a6bc20..73b8d7b6dd19 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -1,14 +1,24 @@
 #ifndef __INCLUDE_LINUX_OOM_H
 #define __INCLUDE_LINUX_OOM_H
 
-/* /proc/<pid>/oom_adj set to -17 protects from the oom-killer */
+/*
+ * /proc/<pid>/oom_adj set to -17 protects from the oom-killer
+ */
 #define OOM_DISABLE (-17)
 /* inclusive */
 #define OOM_ADJUST_MIN (-16)
 #define OOM_ADJUST_MAX 15
 
+/*
+ * /proc/<pid>/oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for
+ * pid.
+ */
+#define OOM_SCORE_ADJ_MIN (-1000)
+#define OOM_SCORE_ADJ_MAX 1000
+
 #ifdef __KERNEL__
 
+#include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/nodemask.h>
 
@@ -27,6 +37,8 @@ enum oom_constraint {
 	CONSTRAINT_MEMCG,
 };
 
+extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+			const nodemask_t *nodemask, unsigned long totalpages);
 extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9591907c4f79..ce160d68f5e7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -621,7 +621,8 @@ struct signal_struct {
 	struct tty_audit_buf *tty_audit_buf;
 #endif
 
 	int oom_adj;		/* OOM kill score adjustment (bit shift) */
+	int oom_score_adj;	/* OOM kill score adjustment */
 };
 
 /* Context switch must be unlocked if interrupts are to be enabled */
diff --git a/kernel/fork.c b/kernel/fork.c
index a82a65cef741..98b450876f93 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -899,6 +899,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 
 	sig->oom_adj = current->signal->oom_adj;
+	sig->oom_score_adj = current->signal->oom_score_adj;
 
 	return 0;
 }
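Since copy_signal() copies the value, a child inherits its parent's oom_score_adj across fork(), and the setting survives exec() because it lives in signal_struct.  An illustrative check, not part of this patch (raising the value requires no privilege):

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	FILE *f = fopen("/proc/self/oom_score_adj", "w");

	if (f) {
		fprintf(f, "500\n");	/* make ourselves a preferred victim */
		fclose(f);
	}
	if (fork() == 0) {
		/* child: prints 500, inherited via copy_signal() */
		execlp("cat", "cat", "/proc/self/oom_score_adj", (char *)NULL);
		_exit(1);
	}
	wait(NULL);
	return 0;
}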
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 31abd1c2c0c5..de54ea0094a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1127,6 +1127,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
 }
 
 /*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+	u64 limit;
+	u64 memsw;
+
+	limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
+			total_swap_pages;
+	memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+	/*
+	 * If memsw is finite and limits the amount of swap space available
+	 * to this memcg, return that limit.
+	 */
+	return min(limit, memsw);
+}
+
+/*
  * Visit the first child (need not be the first child as per the ordering
  * of the cgroup list, since we track last_scanned_child) of @mem and use
  * that to reclaim free pages from.
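Restating the intent of mem_cgroup_get_limit() as a sketch with explicit units and invented numbers: the allowable memory for a memcg oom is the memory limit plus available swap, clamped by the memory+swap (memsw) limit when one is configured:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t memcg_limit_bytes(uint64_t mem_limit, uint64_t swap_bytes,
				  uint64_t memsw_limit)
{
	uint64_t limit = mem_limit + swap_bytes;

	return limit < memsw_limit ? limit : memsw_limit;	/* min() */
}

int main(void)
{
	/* 512 MB memory limit, 1 GB of swap, memory+swap capped at 768 MB */
	uint64_t limit = memcg_limit_bytes(512ULL << 20, 1024ULL << 20,
					   768ULL << 20);

	printf("%" PRIu64 " MB allowable\n", limit >> 20);	/* 768 */
	return 0;
}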
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0a4ca8a0234b..d3def05a33d9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
  * Copyright (C) 1998,2000 Rik van Riel
  * Thanks go out to Claus Fischer for some serious inspiration and
  * for goading me into coding this file...
+ * Copyright (C) 2010 Google, Inc.
+ *	Rewritten by David Rientjes
  *
  * The routines in this file are used to kill a process when
  * we're seriously out of memory. This gets called from __alloc_pages()
@@ -34,7 +36,6 @@ int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
 static DEFINE_SPINLOCK(zone_scan_lock);
-/* #define DEBUG */
 
 #ifdef CONFIG_NUMA
 /**
@@ -140,137 +141,76 @@ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
 }
 
 /**
- * badness - calculate a numeric value for how bad this task has been
+ * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
- * @uptime: current uptime in seconds
+ * @totalpages: total present RAM allowed for page allocation
  *
- * The formula used is relatively simple and documented inline in the
- * function. The main rationale is that we want to select a good task
- * to kill when we run out of memory.
- *
- * Good in this context means that:
- * 1) we lose the minimum amount of work done
- * 2) we recover a large amount of memory
- * 3) we don't kill anything innocent of eating tons of memory
- * 4) we want to kill the minimum amount of processes (one)
- * 5) we try to kill the process the user expects us to kill, this
- *    algorithm has been meticulously tuned to meet the principle
- *    of least surprise ... (be careful when you change it)
+ * The heuristic for determining which task to kill is made to be as simple and
+ * predictable as possible.  The goal is to return the highest value for the
+ * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned long badness(struct task_struct *p, struct mem_cgroup *mem,
-		      const nodemask_t *nodemask, unsigned long uptime)
+unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+		      const nodemask_t *nodemask, unsigned long totalpages)
 {
-	unsigned long points, cpu_time, run_time;
-	struct task_struct *child;
-	struct task_struct *c, *t;
-	int oom_adj = p->signal->oom_adj;
-	struct task_cputime task_time;
-	unsigned long utime;
-	unsigned long stime;
+	int points;
 
 	if (oom_unkillable_task(p, mem, nodemask))
 		return 0;
-	if (oom_adj == OOM_DISABLE)
-		return 0;
 
 	p = find_lock_task_mm(p);
 	if (!p)
 		return 0;
 
 	/*
-	 * The memory size of the process is the basis for the badness.
-	 */
-	points = p->mm->total_vm;
-	task_unlock(p);
-
-	/*
-	 * swapoff can easily use up all memory, so kill those first.
-	 */
-	if (p->flags & PF_OOM_ORIGIN)
-		return ULONG_MAX;
-
-	/*
-	 * Processes which fork a lot of child processes are likely
-	 * a good choice. We add half the vmsize of the children if they
-	 * have an own mm. This prevents forking servers to flood the
-	 * machine with an endless amount of children. In case a single
-	 * child is eating the vast majority of memory, adding only half
-	 * to the parents will make the child our kill candidate of choice.
+	 * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
+	 * need to be executed for something that cannot be killed.
 	 */
-	t = p;
-	do {
-		list_for_each_entry(c, &t->children, sibling) {
-			child = find_lock_task_mm(c);
-			if (child) {
-				if (child->mm != p->mm)
-					points += child->mm->total_vm/2 + 1;
-				task_unlock(child);
-			}
-		}
-	} while_each_thread(p, t);
+	if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+		task_unlock(p);
+		return 0;
+	}
 
 	/*
-	 * CPU time is in tens of seconds and run time is in thousands
-	 * of seconds. There is no particular reason for this other than
-	 * that it turned out to work very well in practice.
+	 * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
+	 * priority for oom killing.
 	 */
-	thread_group_cputime(p, &task_time);
-	utime = cputime_to_jiffies(task_time.utime);
-	stime = cputime_to_jiffies(task_time.stime);
-	cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
-
-
-	if (uptime >= p->start_time.tv_sec)
-		run_time = (uptime - p->start_time.tv_sec) >> 10;
-	else
-		run_time = 0;
-
-	if (cpu_time)
-		points /= int_sqrt(cpu_time);
-	if (run_time)
-		points /= int_sqrt(int_sqrt(run_time));
+	if (p->flags & PF_OOM_ORIGIN) {
+		task_unlock(p);
+		return 1000;
+	}
 
 	/*
-	 * Niced processes are most likely less important, so double
-	 * their badness points.
+	 * The memory controller may have a limit of 0 bytes, so avoid a divide
+	 * by zero, if necessary.
 	 */
-	if (task_nice(p) > 0)
-		points *= 2;
+	if (!totalpages)
+		totalpages = 1;
 
 	/*
-	 * Superuser processes are usually more important, so we make it
-	 * less likely that we kill those.
+	 * The baseline for the badness score is the proportion of RAM that each
+	 * task's rss and swap space use.
 	 */
-	if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
-	    has_capability_noaudit(p, CAP_SYS_RESOURCE))
-		points /= 4;
+	points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
+			totalpages;
+	task_unlock(p);
 
 	/*
-	 * We don't want to kill a process with direct hardware access.
-	 * Not only could that mess up the hardware, but usually users
-	 * tend to only have this flag set on applications they think
-	 * of as important.
+	 * Root processes get 3% bonus, just like the __vm_enough_memory()
+	 * implementation used by LSMs.
 	 */
-	if (has_capability_noaudit(p, CAP_SYS_RAWIO))
-		points /= 4;
+	if (has_capability_noaudit(p, CAP_SYS_ADMIN))
+		points -= 30;
 
 	/*
-	 * Adjust the score by oom_adj.
+	 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
+	 * either completely disable oom killing or always prefer a certain
+	 * task.
 	 */
-	if (oom_adj) {
-		if (oom_adj > 0) {
-			if (!points)
-				points = 1;
-			points <<= oom_adj;
-		} else
-			points >>= -(oom_adj);
-	}
+	points += p->signal->oom_score_adj;
 
-#ifdef DEBUG
-	printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
-		p->pid, p->comm, points);
-#endif
-	return points;
+	if (points < 0)
+		return 0;
+	return (points < 1000) ? points : 1000;
 }
 
 /*
@@ -278,12 +218,20 @@ unsigned long badness(struct task_struct *p, struct mem_cgroup *mem,
  */
 #ifdef CONFIG_NUMA
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-				gfp_t gfp_mask, nodemask_t *nodemask)
+				gfp_t gfp_mask, nodemask_t *nodemask,
+				unsigned long *totalpages)
 {
 	struct zone *zone;
 	struct zoneref *z;
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+	bool cpuset_limited = false;
+	int nid;
 
+	/* Default to all available memory */
+	*totalpages = totalram_pages + total_swap_pages;
+
+	if (!zonelist)
+		return CONSTRAINT_NONE;
 	/*
 	 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
 	 * to kill current.We have to random task kill in this case.
@@ -293,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 		return CONSTRAINT_NONE;
 
 	/*
-	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
-	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
-	 * feature. mempolicy is an only user of nodemask here.
-	 * check mempolicy's nodemask contains all N_HIGH_MEMORY
+	 * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
+	 * the page allocator means a mempolicy is in effect.  Cpuset policy
+	 * is enforced in get_page_from_freelist().
 	 */
-	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
+	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
+		*totalpages = total_swap_pages;
+		for_each_node_mask(nid, *nodemask)
+			*totalpages += node_spanned_pages(nid);
 		return CONSTRAINT_MEMORY_POLICY;
+	}
 
 	/* Check this allocation failure is caused by cpuset's wall function */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 			high_zoneidx, nodemask)
 		if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
-			return CONSTRAINT_CPUSET;
+			cpuset_limited = true;
 
+	if (cpuset_limited) {
+		*totalpages = total_swap_pages;
+		for_each_node_mask(nid, cpuset_current_mems_allowed)
+			*totalpages += node_spanned_pages(nid);
+		return CONSTRAINT_CPUSET;
+	}
 	return CONSTRAINT_NONE;
 }
 #else
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-				gfp_t gfp_mask, nodemask_t *nodemask)
+				gfp_t gfp_mask, nodemask_t *nodemask,
+				unsigned long *totalpages)
 {
+	*totalpages = totalram_pages + total_swap_pages;
 	return CONSTRAINT_NONE;
 }
 #endif
@@ -323,17 +282,16 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned long *ppoints,
-		struct mem_cgroup *mem, const nodemask_t *nodemask)
+static struct task_struct *select_bad_process(unsigned int *ppoints,
+		unsigned long totalpages, struct mem_cgroup *mem,
+		const nodemask_t *nodemask)
 {
 	struct task_struct *p;
 	struct task_struct *chosen = NULL;
-	struct timespec uptime;
 	*ppoints = 0;
 
-	do_posix_clock_monotonic_gettime(&uptime);
 	for_each_process(p) {
-		unsigned long points;
+		unsigned int points;
 
 		if (oom_unkillable_task(p, mem, nodemask))
 			continue;
@@ -365,11 +323,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
 			return ERR_PTR(-1UL);
 
 			chosen = p;
-			*ppoints = ULONG_MAX;
+			*ppoints = 1000;
 		}
 
-		points = badness(p, mem, nodemask, uptime.tv_sec);
-		if (points > *ppoints || !chosen) {
+		points = oom_badness(p, mem, nodemask, totalpages);
+		if (points > *ppoints) {
 			chosen = p;
 			*ppoints = points;
 		}
@@ -384,7 +342,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
  *
  * Dumps the current memory state of all system tasks, excluding kernel threads.
  * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
- * score, and name.
+ * value, oom_score_adj value, and name.
  *
  * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
  * shown.
@@ -396,8 +354,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
 	struct task_struct *p;
 	struct task_struct *task;
 
-	printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
-		"name\n");
+	pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
 	for_each_process(p) {
 		if (p->flags & PF_KTHREAD)
 			continue;
@@ -414,10 +371,11 @@ static void dump_tasks(const struct mem_cgroup *mem)
 			continue;
 		}
 
-		printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3u %3d %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
 			task->pid, __task_cred(task)->uid, task->tgid,
 			task->mm->total_vm, get_mm_rss(task->mm),
-			task_cpu(task), task->signal->oom_adj, task->comm);
+			task_cpu(task), task->signal->oom_adj,
+			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
 	}
 }
@@ -427,8 +385,9 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 {
 	task_lock(current);
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-		"oom_adj=%d\n",
-		current->comm, gfp_mask, order, current->signal->oom_adj);
+		"oom_adj=%d, oom_score_adj=%d\n",
+		current->comm, gfp_mask, order, current->signal->oom_adj,
+		current->signal->oom_score_adj);
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
 	dump_stack();
@@ -468,14 +427,14 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 #undef K
 
 static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-			    unsigned long points, struct mem_cgroup *mem,
-			    nodemask_t *nodemask, const char *message)
+			    unsigned int points, unsigned long totalpages,
+			    struct mem_cgroup *mem, nodemask_t *nodemask,
+			    const char *message)
 {
 	struct task_struct *victim = p;
 	struct task_struct *child;
 	struct task_struct *t = p;
-	unsigned long victim_points = 0;
-	struct timespec uptime;
+	unsigned int victim_points = 0;
 
 	if (printk_ratelimit())
 		dump_header(p, gfp_mask, order, mem);
@@ -491,7 +450,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	}
 
 	task_lock(p);
-	pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n",
+	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
 		message, task_pid_nr(p), p->comm, points);
 	task_unlock(p);
 
@@ -501,14 +460,15 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	 * parent.  This attempts to lose the minimal amount of work done while
 	 * still freeing memory.
 	 */
-	do_posix_clock_monotonic_gettime(&uptime);
 	do {
 		list_for_each_entry(child, &t->children, sibling) {
-			unsigned long child_points;
+			unsigned int child_points;
 
-			/* badness() returns 0 if the thread is unkillable */
-			child_points = badness(child, mem, nodemask,
-						uptime.tv_sec);
+			/*
+			 * oom_badness() returns 0 if the thread is unkillable
+			 */
+			child_points = oom_badness(child, mem, nodemask,
+								totalpages);
 			if (child_points > victim_points) {
 				victim = child;
 				victim_points = child_points;
@@ -546,17 +506,19 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 {
-	unsigned long points = 0;
+	unsigned long limit;
+	unsigned int points = 0;
 	struct task_struct *p;
 
 	check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
+	limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
 	read_lock(&tasklist_lock);
 retry:
-	p = select_bad_process(&points, mem, NULL);
+	p = select_bad_process(&points, limit, mem, NULL);
 	if (!p || PTR_ERR(p) == -1UL)
 		goto out;
 
-	if (oom_kill_process(p, gfp_mask, 0, points, mem, NULL,
+	if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
 				"Memory cgroup out of memory"))
 		goto retry;
 out:
@@ -681,8 +643,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 		int order, nodemask_t *nodemask)
 {
 	struct task_struct *p;
+	unsigned long totalpages;
 	unsigned long freed = 0;
-	unsigned long points;
+	unsigned int points;
 	enum oom_constraint constraint = CONSTRAINT_NONE;
 
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
@@ -705,8 +668,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA) that may require different handling.
 	 */
-	if (zonelist)
-		constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
+	constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
+						&totalpages);
 	check_panic_on_oom(constraint, gfp_mask, order);
 
 	read_lock(&tasklist_lock);
@@ -718,14 +681,14 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 * non-zero, current could not be killed so we must fallback to
 	 * the tasklist scan.
 	 */
-	if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
-				nodemask,
+	if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
+				NULL, nodemask,
 				"Out of memory (oom_kill_allocating_task)"))
 			return;
 	}
 
 retry:
-	p = select_bad_process(&points, NULL,
+	p = select_bad_process(&points, totalpages, NULL,
 			constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
 			NULL);
 	if (PTR_ERR(p) == -1UL)
@@ -738,8 +701,8 @@ retry:
 		panic("Out of memory and no killable processes...\n");
 	}
 
-	if (oom_kill_process(p, gfp_mask, order, points, NULL, nodemask,
-			"Out of memory"))
+	if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+			nodemask, "Out of memory"))
 		goto retry;
 	read_unlock(&tasklist_lock);
 
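Putting the pieces together: constrained_alloc() derives totalpages for the oom context, and oom_badness() scores each task's rss plus swap against it on the 0..1000 scale.  A worked example as a sketch, with invented node and task sizes:

#include <stdio.h>

#define NODE_PAGES	(2UL << 18)	/* 2 GB of 4 KB pages per node */
#define SWAP_PAGES	(2UL << 18)	/* 2 GB of swap */

int main(void)
{
	/* mempolicy bound to 2 nodes: totalpages = swap + spanned node pages */
	unsigned long totalpages = SWAP_PAGES + 2 * NODE_PAGES;
	/* a task with 1.5 GB resident and 0.5 GB in swap */
	unsigned long rss = 393216, swapents = 131072;
	long points = (long)(rss + swapents) * 1000 / totalpages;

	printf("%ld\n", points);	/* 524288 * 1000 / 1572864 = 333 */
	return 0;
}

An oom kill in this mempolicy context would therefore report a score of 333 for the task, before any oom_score_adj polarization is applied.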