diff options
-rw-r--r-- | Documentation/filesystems/proc.txt | 94 | ||||
-rw-r--r-- | fs/proc/base.c | 94 | ||||
-rw-r--r-- | include/linux/memcontrol.h | 8 | ||||
-rw-r--r-- | include/linux/oom.h | 14 | ||||
-rw-r--r-- | include/linux/sched.h | 3 | ||||
-rw-r--r-- | kernel/fork.c | 1 | ||||
-rw-r--r-- | mm/memcontrol.c | 18 | ||||
-rw-r--r-- | mm/oom_kill.c | 259 |
8 files changed, 300 insertions, 191 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 8fe8895894d8..cf1295c2bb66 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -33,7 +33,8 @@ Table of Contents | |||
33 | 2 Modifying System Parameters | 33 | 2 Modifying System Parameters |
34 | 34 | ||
35 | 3 Per-Process Parameters | 35 | 3 Per-Process Parameters |
36 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score | 36 | 3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj - Adjust the oom-killer |
37 | score | ||
37 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score | 38 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score |
38 | 3.3 /proc/<pid>/io - Display the IO accounting fields | 39 | 3.3 /proc/<pid>/io - Display the IO accounting fields |
39 | 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings | 40 | 3.4 /proc/<pid>/coredump_filter - Core dump filtering settings |
@@ -1234,42 +1235,61 @@ of the kernel. | |||
1234 | CHAPTER 3: PER-PROCESS PARAMETERS | 1235 | CHAPTER 3: PER-PROCESS PARAMETERS |
1235 | ------------------------------------------------------------------------------ | 1236 | ------------------------------------------------------------------------------ |
1236 | 1237 | ||
1237 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score | 1238 | 3.1 /proc/<pid>/oom_adj & /proc/<pid>/oom_score_adj- Adjust the oom-killer score |
1238 | ------------------------------------------------------ | 1239 | -------------------------------------------------------------------------------- |
1239 | 1240 | ||
1240 | This file can be used to adjust the score used to select which processes | 1241 | These files can be used to adjust the badness heuristic used to select which |
1241 | should be killed in an out-of-memory situation. Giving it a high score will | 1242 | process gets killed in out of memory conditions. |
1242 | increase the likelihood of this process being killed by the oom-killer. Valid | 1243 | |
1243 | values are in the range -16 to +15, plus the special value -17, which disables | 1244 | The badness heuristic assigns a value to each candidate task ranging from 0 |
1244 | oom-killing altogether for this process. | 1245 | (never kill) to 1000 (always kill) to determine which process is targeted. The |
1245 | 1246 | units are roughly a proportion along that range of allowed memory the process | |
1246 | The process to be killed in an out-of-memory situation is selected among all others | 1247 | may allocate from based on an estimation of its current memory and swap use. |
1247 | based on its badness score. This value equals the original memory size of the process | 1248 | For example, if a task is using all allowed memory, its badness score will be |
1248 | and is then updated according to its CPU time (utime + stime) and the | 1249 | 1000. If it is using half of its allowed memory, its score will be 500. |
1249 | run time (uptime - start time). The longer it runs the smaller is the score. | 1250 | |
1250 | Badness score is divided by the square root of the CPU time and then by | 1251 | There is an additional factor included in the badness score: root |
1251 | the double square root of the run time. | 1252 | processes are given 3% extra memory over other tasks. |
1252 | 1253 | ||
1253 | Swapped out tasks are killed first. Half of each child's memory size is added to | 1254 | The amount of "allowed" memory depends on the context in which the oom killer |
1254 | the parent's score if they do not share the same memory. Thus forking servers | 1255 | was called. If it is due to the memory assigned to the allocating task's cpuset |
1255 | are the prime candidates to be killed. Having only one 'hungry' child will make | 1256 | being exhausted, the allowed memory represents the set of mems assigned to that |
1256 | parent less preferable than the child. | 1257 | cpuset. If it is due to a mempolicy's node(s) being exhausted, the allowed |
1257 | 1258 | memory represents the set of mempolicy nodes. If it is due to a memory | |
1258 | /proc/<pid>/oom_score shows process' current badness score. | 1259 | limit (or swap limit) being reached, the allowed memory is that configured |
1259 | 1260 | limit. Finally, if it is due to the entire system being out of memory, the | |
1260 | The following heuristics are then applied: | 1261 | allowed memory represents all allocatable resources. |
1261 | * if the task was reniced, its score doubles | 1262 | |
1262 | * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE | 1263 | The value of /proc/<pid>/oom_score_adj is added to the badness score before it |
1263 | or CAP_SYS_RAWIO) have their score divided by 4 | 1264 | is used to determine which task to kill. Acceptable values range from -1000 |
1264 | * if oom condition happened in one cpuset and checked process does not belong | 1265 | (OOM_SCORE_ADJ_MIN) to +1000 (OOM_SCORE_ADJ_MAX). This allows userspace to |
1265 | to it, its score is divided by 8 | 1266 | polarize the preference for oom killing either by always preferring a certain |
1266 | * the resulting score is multiplied by two to the power of oom_adj, i.e. | 1267 | task or completely disabling it. The lowest possible value, -1000, is |
1267 | points <<= oom_adj when it is positive and | 1268 | equivalent to disabling oom killing entirely for that task since it will always |
1268 | points >>= -(oom_adj) otherwise | 1269 | report a badness score of 0. |
1269 | 1270 | ||
1270 | The task with the highest badness score is then selected and its children | 1271 | Consequently, it is very simple for userspace to define the amount of memory to |
1271 | are killed, process itself will be killed in an OOM situation when it does | 1272 | consider for each task. Setting a /proc/<pid>/oom_score_adj value of +500, for |
1272 | not have children or some of them disabled oom like described above. | 1273 | example, is roughly equivalent to allowing the remainder of tasks sharing the |
1274 | same system, cpuset, mempolicy, or memory controller resources to use at least | ||
1275 | 50% more memory. A value of -500, on the other hand, would be roughly | ||
1276 | equivalent to discounting 50% of the task's allowed memory from being considered | ||
1277 | as scoring against the task. | ||
1278 | |||
1279 | For backwards compatibility with previous kernels, /proc/<pid>/oom_adj may also | ||
1280 | be used to tune the badness score. Its acceptable values range from -16 | ||
1281 | (OOM_ADJUST_MIN) to +15 (OOM_ADJUST_MAX) and a special value of -17 | ||
1282 | (OOM_DISABLE) to disable oom killing entirely for that task. Its value is | ||
1283 | scaled linearly with /proc/<pid>/oom_score_adj. | ||
1284 | |||
1285 | Writing to /proc/<pid>/oom_score_adj or /proc/<pid>/oom_adj will change the | ||
1286 | other with its scaled value. | ||
1287 | |||
1288 | Caveat: when a parent task is selected, the oom killer will sacrifice any first | ||
1289 | generation children with separate address spaces instead, if possible. This | ||
1290 | prevents servers and important system daemons from being killed and loses the | ||
1291 | minimal amount of work. | ||
1292 | |||
1273 | 1293 | ||
1274 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score | 1294 | 3.2 /proc/<pid>/oom_score - Display current oom-killer score |
1275 | ------------------------------------------------------------- | 1295 | ------------------------------------------------------------- |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 5949d0ac30f2..f923b728388a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -63,6 +63,7 @@ | |||
63 | #include <linux/namei.h> | 63 | #include <linux/namei.h> |
64 | #include <linux/mnt_namespace.h> | 64 | #include <linux/mnt_namespace.h> |
65 | #include <linux/mm.h> | 65 | #include <linux/mm.h> |
66 | #include <linux/swap.h> | ||
66 | #include <linux/rcupdate.h> | 67 | #include <linux/rcupdate.h> |
67 | #include <linux/kallsyms.h> | 68 | #include <linux/kallsyms.h> |
68 | #include <linux/stacktrace.h> | 69 | #include <linux/stacktrace.h> |
@@ -430,12 +431,11 @@ static const struct file_operations proc_lstats_operations = { | |||
430 | static int proc_oom_score(struct task_struct *task, char *buffer) | 431 | static int proc_oom_score(struct task_struct *task, char *buffer) |
431 | { | 432 | { |
432 | unsigned long points = 0; | 433 | unsigned long points = 0; |
433 | struct timespec uptime; | ||
434 | 434 | ||
435 | do_posix_clock_monotonic_gettime(&uptime); | ||
436 | read_lock(&tasklist_lock); | 435 | read_lock(&tasklist_lock); |
437 | if (pid_alive(task)) | 436 | if (pid_alive(task)) |
438 | points = badness(task, NULL, NULL, uptime.tv_sec); | 437 | points = oom_badness(task, NULL, NULL, |
438 | totalram_pages + total_swap_pages); | ||
439 | read_unlock(&tasklist_lock); | 439 | read_unlock(&tasklist_lock); |
440 | return sprintf(buffer, "%lu\n", points); | 440 | return sprintf(buffer, "%lu\n", points); |
441 | } | 441 | } |
@@ -1038,7 +1038,15 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, | |||
1038 | } | 1038 | } |
1039 | 1039 | ||
1040 | task->signal->oom_adj = oom_adjust; | 1040 | task->signal->oom_adj = oom_adjust; |
1041 | 1041 | /* | |
1042 | * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum | ||
1043 | * value is always attainable. | ||
1044 | */ | ||
1045 | if (task->signal->oom_adj == OOM_ADJUST_MAX) | ||
1046 | task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; | ||
1047 | else | ||
1048 | task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / | ||
1049 | -OOM_DISABLE; | ||
1042 | unlock_task_sighand(task, &flags); | 1050 | unlock_task_sighand(task, &flags); |
1043 | put_task_struct(task); | 1051 | put_task_struct(task); |
1044 | 1052 | ||
@@ -1051,6 +1059,82 @@ static const struct file_operations proc_oom_adjust_operations = { | |||
1051 | .llseek = generic_file_llseek, | 1059 | .llseek = generic_file_llseek, |
1052 | }; | 1060 | }; |
1053 | 1061 | ||
1062 | static ssize_t oom_score_adj_read(struct file *file, char __user *buf, | ||
1063 | size_t count, loff_t *ppos) | ||
1064 | { | ||
1065 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); | ||
1066 | char buffer[PROC_NUMBUF]; | ||
1067 | int oom_score_adj = OOM_SCORE_ADJ_MIN; | ||
1068 | unsigned long flags; | ||
1069 | size_t len; | ||
1070 | |||
1071 | if (!task) | ||
1072 | return -ESRCH; | ||
1073 | if (lock_task_sighand(task, &flags)) { | ||
1074 | oom_score_adj = task->signal->oom_score_adj; | ||
1075 | unlock_task_sighand(task, &flags); | ||
1076 | } | ||
1077 | put_task_struct(task); | ||
1078 | len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); | ||
1079 | return simple_read_from_buffer(buf, count, ppos, buffer, len); | ||
1080 | } | ||
1081 | |||
1082 | static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, | ||
1083 | size_t count, loff_t *ppos) | ||
1084 | { | ||
1085 | struct task_struct *task; | ||
1086 | char buffer[PROC_NUMBUF]; | ||
1087 | unsigned long flags; | ||
1088 | long oom_score_adj; | ||
1089 | int err; | ||
1090 | |||
1091 | memset(buffer, 0, sizeof(buffer)); | ||
1092 | if (count > sizeof(buffer) - 1) | ||
1093 | count = sizeof(buffer) - 1; | ||
1094 | if (copy_from_user(buffer, buf, count)) | ||
1095 | return -EFAULT; | ||
1096 | |||
1097 | err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); | ||
1098 | if (err) | ||
1099 | return -EINVAL; | ||
1100 | if (oom_score_adj < OOM_SCORE_ADJ_MIN || | ||
1101 | oom_score_adj > OOM_SCORE_ADJ_MAX) | ||
1102 | return -EINVAL; | ||
1103 | |||
1104 | task = get_proc_task(file->f_path.dentry->d_inode); | ||
1105 | if (!task) | ||
1106 | return -ESRCH; | ||
1107 | if (!lock_task_sighand(task, &flags)) { | ||
1108 | put_task_struct(task); | ||
1109 | return -ESRCH; | ||
1110 | } | ||
1111 | if (oom_score_adj < task->signal->oom_score_adj && | ||
1112 | !capable(CAP_SYS_RESOURCE)) { | ||
1113 | unlock_task_sighand(task, &flags); | ||
1114 | put_task_struct(task); | ||
1115 | return -EACCES; | ||
1116 | } | ||
1117 | |||
1118 | task->signal->oom_score_adj = oom_score_adj; | ||
1119 | /* | ||
1120 | * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is | ||
1121 | * always attainable. | ||
1122 | */ | ||
1123 | if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) | ||
1124 | task->signal->oom_adj = OOM_DISABLE; | ||
1125 | else | ||
1126 | task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / | ||
1127 | OOM_SCORE_ADJ_MAX; | ||
1128 | unlock_task_sighand(task, &flags); | ||
1129 | put_task_struct(task); | ||
1130 | return count; | ||
1131 | } | ||
1132 | |||
1133 | static const struct file_operations proc_oom_score_adj_operations = { | ||
1134 | .read = oom_score_adj_read, | ||
1135 | .write = oom_score_adj_write, | ||
1136 | }; | ||
1137 | |||
1054 | #ifdef CONFIG_AUDITSYSCALL | 1138 | #ifdef CONFIG_AUDITSYSCALL |
1055 | #define TMPBUFLEN 21 | 1139 | #define TMPBUFLEN 21 |
1056 | static ssize_t proc_loginuid_read(struct file * file, char __user * buf, | 1140 | static ssize_t proc_loginuid_read(struct file * file, char __user * buf, |
@@ -2623,6 +2707,7 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
2623 | #endif | 2707 | #endif |
2624 | INF("oom_score", S_IRUGO, proc_oom_score), | 2708 | INF("oom_score", S_IRUGO, proc_oom_score), |
2625 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), | 2709 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), |
2710 | REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), | ||
2626 | #ifdef CONFIG_AUDITSYSCALL | 2711 | #ifdef CONFIG_AUDITSYSCALL |
2627 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), | 2712 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), |
2628 | REG("sessionid", S_IRUGO, proc_sessionid_operations), | 2713 | REG("sessionid", S_IRUGO, proc_sessionid_operations), |
@@ -2957,6 +3042,7 @@ static const struct pid_entry tid_base_stuff[] = { | |||
2957 | #endif | 3042 | #endif |
2958 | INF("oom_score", S_IRUGO, proc_oom_score), | 3043 | INF("oom_score", S_IRUGO, proc_oom_score), |
2959 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), | 3044 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), |
3045 | REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), | ||
2960 | #ifdef CONFIG_AUDITSYSCALL | 3046 | #ifdef CONFIG_AUDITSYSCALL |
2961 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), | 3047 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), |
2962 | REG("sessionid", S_IRUSR, proc_sessionid_operations), | 3048 | REG("sessionid", S_IRUSR, proc_sessionid_operations), |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9f1afd361583..73564cac38c7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
@@ -125,6 +125,8 @@ void mem_cgroup_update_file_mapped(struct page *page, int val); | |||
125 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | 125 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, |
126 | gfp_t gfp_mask, int nid, | 126 | gfp_t gfp_mask, int nid, |
127 | int zid); | 127 | int zid); |
128 | u64 mem_cgroup_get_limit(struct mem_cgroup *mem); | ||
129 | |||
128 | #else /* CONFIG_CGROUP_MEM_RES_CTLR */ | 130 | #else /* CONFIG_CGROUP_MEM_RES_CTLR */ |
129 | struct mem_cgroup; | 131 | struct mem_cgroup; |
130 | 132 | ||
@@ -304,6 +306,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | |||
304 | return 0; | 306 | return 0; |
305 | } | 307 | } |
306 | 308 | ||
309 | static inline | ||
310 | u64 mem_cgroup_get_limit(struct mem_cgroup *mem) | ||
311 | { | ||
312 | return 0; | ||
313 | } | ||
314 | |||
307 | #endif /* CONFIG_CGROUP_MEM_CONT */ | 315 | #endif /* CONFIG_CGROUP_MEM_CONT */ |
308 | 316 | ||
309 | #endif /* _LINUX_MEMCONTROL_H */ | 317 | #endif /* _LINUX_MEMCONTROL_H */ |
diff --git a/include/linux/oom.h b/include/linux/oom.h index 40e5e3a6bc20..73b8d7b6dd19 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h | |||
@@ -1,14 +1,24 @@ | |||
1 | #ifndef __INCLUDE_LINUX_OOM_H | 1 | #ifndef __INCLUDE_LINUX_OOM_H |
2 | #define __INCLUDE_LINUX_OOM_H | 2 | #define __INCLUDE_LINUX_OOM_H |
3 | 3 | ||
4 | /* /proc/<pid>/oom_adj set to -17 protects from the oom-killer */ | 4 | /* |
5 | * /proc/<pid>/oom_adj set to -17 protects from the oom-killer | ||
6 | */ | ||
5 | #define OOM_DISABLE (-17) | 7 | #define OOM_DISABLE (-17) |
6 | /* inclusive */ | 8 | /* inclusive */ |
7 | #define OOM_ADJUST_MIN (-16) | 9 | #define OOM_ADJUST_MIN (-16) |
8 | #define OOM_ADJUST_MAX 15 | 10 | #define OOM_ADJUST_MAX 15 |
9 | 11 | ||
12 | /* | ||
13 | * /proc/<pid>/oom_score_adj set to OOM_SCORE_ADJ_MIN disables oom killing for | ||
14 | * pid. | ||
15 | */ | ||
16 | #define OOM_SCORE_ADJ_MIN (-1000) | ||
17 | #define OOM_SCORE_ADJ_MAX 1000 | ||
18 | |||
10 | #ifdef __KERNEL__ | 19 | #ifdef __KERNEL__ |
11 | 20 | ||
21 | #include <linux/sched.h> | ||
12 | #include <linux/types.h> | 22 | #include <linux/types.h> |
13 | #include <linux/nodemask.h> | 23 | #include <linux/nodemask.h> |
14 | 24 | ||
@@ -27,6 +37,8 @@ enum oom_constraint { | |||
27 | CONSTRAINT_MEMCG, | 37 | CONSTRAINT_MEMCG, |
28 | }; | 38 | }; |
29 | 39 | ||
40 | extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, | ||
41 | const nodemask_t *nodemask, unsigned long totalpages); | ||
30 | extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); | 42 | extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); |
31 | extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); | 43 | extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); |
32 | 44 | ||
diff --git a/include/linux/sched.h b/include/linux/sched.h index 9591907c4f79..ce160d68f5e7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -621,7 +621,8 @@ struct signal_struct { | |||
621 | struct tty_audit_buf *tty_audit_buf; | 621 | struct tty_audit_buf *tty_audit_buf; |
622 | #endif | 622 | #endif |
623 | 623 | ||
624 | int oom_adj; /* OOM kill score adjustment (bit shift) */ | 624 | int oom_adj; /* OOM kill score adjustment (bit shift) */ |
625 | int oom_score_adj; /* OOM kill score adjustment */ | ||
625 | }; | 626 | }; |
626 | 627 | ||
627 | /* Context switch must be unlocked if interrupts are to be enabled */ | 628 | /* Context switch must be unlocked if interrupts are to be enabled */ |
diff --git a/kernel/fork.c b/kernel/fork.c index a82a65cef741..98b450876f93 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -899,6 +899,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
899 | tty_audit_fork(sig); | 899 | tty_audit_fork(sig); |
900 | 900 | ||
901 | sig->oom_adj = current->signal->oom_adj; | 901 | sig->oom_adj = current->signal->oom_adj; |
902 | sig->oom_score_adj = current->signal->oom_score_adj; | ||
902 | 903 | ||
903 | return 0; | 904 | return 0; |
904 | } | 905 | } |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 31abd1c2c0c5..de54ea0094a1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -1127,6 +1127,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem) | |||
1127 | } | 1127 | } |
1128 | 1128 | ||
1129 | /* | 1129 | /* |
1130 | * Return the memory (and swap, if configured) limit for a memcg. | ||
1131 | */ | ||
1132 | u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) | ||
1133 | { | ||
1134 | u64 limit; | ||
1135 | u64 memsw; | ||
1136 | |||
1137 | limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + | ||
1138 | total_swap_pages; | ||
1139 | memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
1140 | /* | ||
1141 | * If memsw is finite and limits the amount of swap space available | ||
1142 | * to this memcg, return that limit. | ||
1143 | */ | ||
1144 | return min(limit, memsw); | ||
1145 | } | ||
1146 | |||
1147 | /* | ||
1130 | * Visit the first child (need not be the first child as per the ordering | 1148 | * Visit the first child (need not be the first child as per the ordering |
1131 | * of the cgroup list, since we track last_scanned_child) of @mem and use | 1149 | * of the cgroup list, since we track last_scanned_child) of @mem and use |
1132 | * that to reclaim free pages from. | 1150 | * that to reclaim free pages from. |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 0a4ca8a0234b..d3def05a33d9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -4,6 +4,8 @@ | |||
4 | * Copyright (C) 1998,2000 Rik van Riel | 4 | * Copyright (C) 1998,2000 Rik van Riel |
5 | * Thanks go out to Claus Fischer for some serious inspiration and | 5 | * Thanks go out to Claus Fischer for some serious inspiration and |
6 | * for goading me into coding this file... | 6 | * for goading me into coding this file... |
7 | * Copyright (C) 2010 Google, Inc. | ||
8 | * Rewritten by David Rientjes | ||
7 | * | 9 | * |
8 | * The routines in this file are used to kill a process when | 10 | * The routines in this file are used to kill a process when |
9 | * we're seriously out of memory. This gets called from __alloc_pages() | 11 | * we're seriously out of memory. This gets called from __alloc_pages() |
@@ -34,7 +36,6 @@ int sysctl_panic_on_oom; | |||
34 | int sysctl_oom_kill_allocating_task; | 36 | int sysctl_oom_kill_allocating_task; |
35 | int sysctl_oom_dump_tasks = 1; | 37 | int sysctl_oom_dump_tasks = 1; |
36 | static DEFINE_SPINLOCK(zone_scan_lock); | 38 | static DEFINE_SPINLOCK(zone_scan_lock); |
37 | /* #define DEBUG */ | ||
38 | 39 | ||
39 | #ifdef CONFIG_NUMA | 40 | #ifdef CONFIG_NUMA |
40 | /** | 41 | /** |
@@ -140,137 +141,76 @@ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem, | |||
140 | } | 141 | } |
141 | 142 | ||
142 | /** | 143 | /** |
143 | * badness - calculate a numeric value for how bad this task has been | 144 | * oom_badness - heuristic function to determine which candidate task to kill |
144 | * @p: task struct of which task we should calculate | 145 | * @p: task struct of which task we should calculate |
145 | * @uptime: current uptime in seconds | 146 | * @totalpages: total present RAM allowed for page allocation |
146 | * | 147 | * |
147 | * The formula used is relatively simple and documented inline in the | 148 | * The heuristic for determining which task to kill is made to be as simple and |
148 | * function. The main rationale is that we want to select a good task | 149 | * predictable as possible. The goal is to return the highest value for the |
149 | * to kill when we run out of memory. | 150 | * task consuming the most memory to avoid subsequent oom failures. |
150 | * | ||
151 | * Good in this context means that: | ||
152 | * 1) we lose the minimum amount of work done | ||
153 | * 2) we recover a large amount of memory | ||
154 | * 3) we don't kill anything innocent of eating tons of memory | ||
155 | * 4) we want to kill the minimum amount of processes (one) | ||
156 | * 5) we try to kill the process the user expects us to kill, this | ||
157 | * algorithm has been meticulously tuned to meet the principle | ||
158 | * of least surprise ... (be careful when you change it) | ||
159 | */ | 151 | */ |
160 | unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, | 152 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, |
161 | const nodemask_t *nodemask, unsigned long uptime) | 153 | const nodemask_t *nodemask, unsigned long totalpages) |
162 | { | 154 | { |
163 | unsigned long points, cpu_time, run_time; | 155 | int points; |
164 | struct task_struct *child; | ||
165 | struct task_struct *c, *t; | ||
166 | int oom_adj = p->signal->oom_adj; | ||
167 | struct task_cputime task_time; | ||
168 | unsigned long utime; | ||
169 | unsigned long stime; | ||
170 | 156 | ||
171 | if (oom_unkillable_task(p, mem, nodemask)) | 157 | if (oom_unkillable_task(p, mem, nodemask)) |
172 | return 0; | 158 | return 0; |
173 | if (oom_adj == OOM_DISABLE) | ||
174 | return 0; | ||
175 | 159 | ||
176 | p = find_lock_task_mm(p); | 160 | p = find_lock_task_mm(p); |
177 | if (!p) | 161 | if (!p) |
178 | return 0; | 162 | return 0; |
179 | 163 | ||
180 | /* | 164 | /* |
181 | * The memory size of the process is the basis for the badness. | 165 | * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't |
182 | */ | 166 | * need to be executed for something that cannot be killed. |
183 | points = p->mm->total_vm; | ||
184 | task_unlock(p); | ||
185 | |||
186 | /* | ||
187 | * swapoff can easily use up all memory, so kill those first. | ||
188 | */ | ||
189 | if (p->flags & PF_OOM_ORIGIN) | ||
190 | return ULONG_MAX; | ||
191 | |||
192 | /* | ||
193 | * Processes which fork a lot of child processes are likely | ||
194 | * a good choice. We add half the vmsize of the children if they | ||
195 | * have an own mm. This prevents forking servers to flood the | ||
196 | * machine with an endless amount of children. In case a single | ||
197 | * child is eating the vast majority of memory, adding only half | ||
198 | * to the parents will make the child our kill candidate of choice. | ||
199 | */ | 167 | */ |
200 | t = p; | 168 | if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { |
201 | do { | 169 | task_unlock(p); |
202 | list_for_each_entry(c, &t->children, sibling) { | 170 | return 0; |
203 | child = find_lock_task_mm(c); | 171 | } |
204 | if (child) { | ||
205 | if (child->mm != p->mm) | ||
206 | points += child->mm->total_vm/2 + 1; | ||
207 | task_unlock(child); | ||
208 | } | ||
209 | } | ||
210 | } while_each_thread(p, t); | ||
211 | 172 | ||
212 | /* | 173 | /* |
213 | * CPU time is in tens of seconds and run time is in thousands | 174 | * When the PF_OOM_ORIGIN bit is set, it indicates the task should have |
214 | * of seconds. There is no particular reason for this other than | 175 | * priority for oom killing. |
215 | * that it turned out to work very well in practice. | ||
216 | */ | 176 | */ |
217 | thread_group_cputime(p, &task_time); | 177 | if (p->flags & PF_OOM_ORIGIN) { |
218 | utime = cputime_to_jiffies(task_time.utime); | 178 | task_unlock(p); |
219 | stime = cputime_to_jiffies(task_time.stime); | 179 | return 1000; |
220 | cpu_time = (utime + stime) >> (SHIFT_HZ + 3); | 180 | } |
221 | |||
222 | |||
223 | if (uptime >= p->start_time.tv_sec) | ||
224 | run_time = (uptime - p->start_time.tv_sec) >> 10; | ||
225 | else | ||
226 | run_time = 0; | ||
227 | |||
228 | if (cpu_time) | ||
229 | points /= int_sqrt(cpu_time); | ||
230 | if (run_time) | ||
231 | points /= int_sqrt(int_sqrt(run_time)); | ||
232 | 181 | ||
233 | /* | 182 | /* |
234 | * Niced processes are most likely less important, so double | 183 | * The memory controller may have a limit of 0 bytes, so avoid a divide |
235 | * their badness points. | 184 | * by zero, if necessary. |
236 | */ | 185 | */ |
237 | if (task_nice(p) > 0) | 186 | if (!totalpages) |
238 | points *= 2; | 187 | totalpages = 1; |
239 | 188 | ||
240 | /* | 189 | /* |
241 | * Superuser processes are usually more important, so we make it | 190 | * The baseline for the badness score is the proportion of RAM that each |
242 | * less likely that we kill those. | 191 | * task's rss and swap space use. |
243 | */ | 192 | */ |
244 | if (has_capability_noaudit(p, CAP_SYS_ADMIN) || | 193 | points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / |
245 | has_capability_noaudit(p, CAP_SYS_RESOURCE)) | 194 | totalpages; |
246 | points /= 4; | 195 | task_unlock(p); |
247 | 196 | ||
248 | /* | 197 | /* |
249 | * We don't want to kill a process with direct hardware access. | 198 | * Root processes get 3% bonus, just like the __vm_enough_memory() |
250 | * Not only could that mess up the hardware, but usually users | 199 | * implementation used by LSMs. |
251 | * tend to only have this flag set on applications they think | ||
252 | * of as important. | ||
253 | */ | 200 | */ |
254 | if (has_capability_noaudit(p, CAP_SYS_RAWIO)) | 201 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) |
255 | points /= 4; | 202 | points -= 30; |
256 | 203 | ||
257 | /* | 204 | /* |
258 | * Adjust the score by oom_adj. | 205 | * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may |
206 | * either completely disable oom killing or always prefer a certain | ||
207 | * task. | ||
259 | */ | 208 | */ |
260 | if (oom_adj) { | 209 | points += p->signal->oom_score_adj; |
261 | if (oom_adj > 0) { | ||
262 | if (!points) | ||
263 | points = 1; | ||
264 | points <<= oom_adj; | ||
265 | } else | ||
266 | points >>= -(oom_adj); | ||
267 | } | ||
268 | 210 | ||
269 | #ifdef DEBUG | 211 | if (points < 0) |
270 | printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", | 212 | return 0; |
271 | p->pid, p->comm, points); | 213 | return (points < 1000) ? points : 1000; |
272 | #endif | ||
273 | return points; | ||
274 | } | 214 | } |
275 | 215 | ||
276 | /* | 216 | /* |
@@ -278,12 +218,20 @@ unsigned long badness(struct task_struct *p, struct mem_cgroup *mem, | |||
278 | */ | 218 | */ |
279 | #ifdef CONFIG_NUMA | 219 | #ifdef CONFIG_NUMA |
280 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | 220 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, |
281 | gfp_t gfp_mask, nodemask_t *nodemask) | 221 | gfp_t gfp_mask, nodemask_t *nodemask, |
222 | unsigned long *totalpages) | ||
282 | { | 223 | { |
283 | struct zone *zone; | 224 | struct zone *zone; |
284 | struct zoneref *z; | 225 | struct zoneref *z; |
285 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); | 226 | enum zone_type high_zoneidx = gfp_zone(gfp_mask); |
227 | bool cpuset_limited = false; | ||
228 | int nid; | ||
286 | 229 | ||
230 | /* Default to all available memory */ | ||
231 | *totalpages = totalram_pages + total_swap_pages; | ||
232 | |||
233 | if (!zonelist) | ||
234 | return CONSTRAINT_NONE; | ||
287 | /* | 235 | /* |
288 | * Reach here only when __GFP_NOFAIL is used. So, we should avoid | 236 | * Reach here only when __GFP_NOFAIL is used. So, we should avoid |
289 | * to kill current.We have to random task kill in this case. | 237 | * to kill current.We have to random task kill in this case. |
@@ -293,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
293 | return CONSTRAINT_NONE; | 241 | return CONSTRAINT_NONE; |
294 | 242 | ||
295 | /* | 243 | /* |
296 | * The nodemask here is a nodemask passed to alloc_pages(). Now, | 244 | * This is not a __GFP_THISNODE allocation, so a truncated nodemask in |
297 | * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy | 245 | * the page allocator means a mempolicy is in effect. Cpuset policy |
298 | * feature. mempolicy is an only user of nodemask here. | 246 | * is enforced in get_page_from_freelist(). |
299 | * check mempolicy's nodemask contains all N_HIGH_MEMORY | ||
300 | */ | 247 | */ |
301 | if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) | 248 | if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { |
249 | *totalpages = total_swap_pages; | ||
250 | for_each_node_mask(nid, *nodemask) | ||
251 | *totalpages += node_spanned_pages(nid); | ||
302 | return CONSTRAINT_MEMORY_POLICY; | 252 | return CONSTRAINT_MEMORY_POLICY; |
253 | } | ||
303 | 254 | ||
304 | /* Check this allocation failure is caused by cpuset's wall function */ | 255 | /* Check this allocation failure is caused by cpuset's wall function */ |
305 | for_each_zone_zonelist_nodemask(zone, z, zonelist, | 256 | for_each_zone_zonelist_nodemask(zone, z, zonelist, |
306 | high_zoneidx, nodemask) | 257 | high_zoneidx, nodemask) |
307 | if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) | 258 | if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) |
308 | return CONSTRAINT_CPUSET; | 259 | cpuset_limited = true; |
309 | 260 | ||
261 | if (cpuset_limited) { | ||
262 | *totalpages = total_swap_pages; | ||
263 | for_each_node_mask(nid, cpuset_current_mems_allowed) | ||
264 | *totalpages += node_spanned_pages(nid); | ||
265 | return CONSTRAINT_CPUSET; | ||
266 | } | ||
310 | return CONSTRAINT_NONE; | 267 | return CONSTRAINT_NONE; |
311 | } | 268 | } |
312 | #else | 269 | #else |
313 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | 270 | static enum oom_constraint constrained_alloc(struct zonelist *zonelist, |
314 | gfp_t gfp_mask, nodemask_t *nodemask) | 271 | gfp_t gfp_mask, nodemask_t *nodemask, |
272 | unsigned long *totalpages) | ||
315 | { | 273 | { |
274 | *totalpages = totalram_pages + total_swap_pages; | ||
316 | return CONSTRAINT_NONE; | 275 | return CONSTRAINT_NONE; |
317 | } | 276 | } |
318 | #endif | 277 | #endif |
@@ -323,17 +282,16 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, | |||
323 | * | 282 | * |
324 | * (not docbooked, we don't want this one cluttering up the manual) | 283 | * (not docbooked, we don't want this one cluttering up the manual) |
325 | */ | 284 | */ |
326 | static struct task_struct *select_bad_process(unsigned long *ppoints, | 285 | static struct task_struct *select_bad_process(unsigned int *ppoints, |
327 | struct mem_cgroup *mem, const nodemask_t *nodemask) | 286 | unsigned long totalpages, struct mem_cgroup *mem, |
287 | const nodemask_t *nodemask) | ||
328 | { | 288 | { |
329 | struct task_struct *p; | 289 | struct task_struct *p; |
330 | struct task_struct *chosen = NULL; | 290 | struct task_struct *chosen = NULL; |
331 | struct timespec uptime; | ||
332 | *ppoints = 0; | 291 | *ppoints = 0; |
333 | 292 | ||
334 | do_posix_clock_monotonic_gettime(&uptime); | ||
335 | for_each_process(p) { | 293 | for_each_process(p) { |
336 | unsigned long points; | 294 | unsigned int points; |
337 | 295 | ||
338 | if (oom_unkillable_task(p, mem, nodemask)) | 296 | if (oom_unkillable_task(p, mem, nodemask)) |
339 | continue; | 297 | continue; |
@@ -365,11 +323,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
365 | return ERR_PTR(-1UL); | 323 | return ERR_PTR(-1UL); |
366 | 324 | ||
367 | chosen = p; | 325 | chosen = p; |
368 | *ppoints = ULONG_MAX; | 326 | *ppoints = 1000; |
369 | } | 327 | } |
370 | 328 | ||
371 | points = badness(p, mem, nodemask, uptime.tv_sec); | 329 | points = oom_badness(p, mem, nodemask, totalpages); |
372 | if (points > *ppoints || !chosen) { | 330 | if (points > *ppoints) { |
373 | chosen = p; | 331 | chosen = p; |
374 | *ppoints = points; | 332 | *ppoints = points; |
375 | } | 333 | } |
@@ -384,7 +342,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
384 | * | 342 | * |
385 | * Dumps the current memory state of all system tasks, excluding kernel threads. | 343 | * Dumps the current memory state of all system tasks, excluding kernel threads. |
386 | * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj | 344 | * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj |
387 | * score, and name. | 345 | * value, oom_score_adj value, and name. |
388 | * | 346 | * |
389 | * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are | 347 | * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are |
390 | * shown. | 348 | * shown. |
@@ -396,8 +354,7 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
396 | struct task_struct *p; | 354 | struct task_struct *p; |
397 | struct task_struct *task; | 355 | struct task_struct *task; |
398 | 356 | ||
399 | printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " | 357 | pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); |
400 | "name\n"); | ||
401 | for_each_process(p) { | 358 | for_each_process(p) { |
402 | if (p->flags & PF_KTHREAD) | 359 | if (p->flags & PF_KTHREAD) |
403 | continue; | 360 | continue; |
@@ -414,10 +371,11 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
414 | continue; | 371 | continue; |
415 | } | 372 | } |
416 | 373 | ||
417 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3u %3d %s\n", | 374 | pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", |
418 | task->pid, __task_cred(task)->uid, task->tgid, | 375 | task->pid, __task_cred(task)->uid, task->tgid, |
419 | task->mm->total_vm, get_mm_rss(task->mm), | 376 | task->mm->total_vm, get_mm_rss(task->mm), |
420 | task_cpu(task), task->signal->oom_adj, task->comm); | 377 | task_cpu(task), task->signal->oom_adj, |
378 | task->signal->oom_score_adj, task->comm); | ||
421 | task_unlock(task); | 379 | task_unlock(task); |
422 | } | 380 | } |
423 | } | 381 | } |
@@ -427,8 +385,9 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, | |||
427 | { | 385 | { |
428 | task_lock(current); | 386 | task_lock(current); |
429 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " | 387 | pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " |
430 | "oom_adj=%d\n", | 388 | "oom_adj=%d, oom_score_adj=%d\n", |
431 | current->comm, gfp_mask, order, current->signal->oom_adj); | 389 | current->comm, gfp_mask, order, current->signal->oom_adj, |
390 | current->signal->oom_score_adj); | ||
432 | cpuset_print_task_mems_allowed(current); | 391 | cpuset_print_task_mems_allowed(current); |
433 | task_unlock(current); | 392 | task_unlock(current); |
434 | dump_stack(); | 393 | dump_stack(); |
@@ -468,14 +427,14 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) | |||
468 | #undef K | 427 | #undef K |
469 | 428 | ||
470 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | 429 | static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, |
471 | unsigned long points, struct mem_cgroup *mem, | 430 | unsigned int points, unsigned long totalpages, |
472 | nodemask_t *nodemask, const char *message) | 431 | struct mem_cgroup *mem, nodemask_t *nodemask, |
432 | const char *message) | ||
473 | { | 433 | { |
474 | struct task_struct *victim = p; | 434 | struct task_struct *victim = p; |
475 | struct task_struct *child; | 435 | struct task_struct *child; |
476 | struct task_struct *t = p; | 436 | struct task_struct *t = p; |
477 | unsigned long victim_points = 0; | 437 | unsigned int victim_points = 0; |
478 | struct timespec uptime; | ||
479 | 438 | ||
480 | if (printk_ratelimit()) | 439 | if (printk_ratelimit()) |
481 | dump_header(p, gfp_mask, order, mem); | 440 | dump_header(p, gfp_mask, order, mem); |
@@ -491,7 +450,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
491 | } | 450 | } |
492 | 451 | ||
493 | task_lock(p); | 452 | task_lock(p); |
494 | pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n", | 453 | pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", |
495 | message, task_pid_nr(p), p->comm, points); | 454 | message, task_pid_nr(p), p->comm, points); |
496 | task_unlock(p); | 455 | task_unlock(p); |
497 | 456 | ||
@@ -501,14 +460,15 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
501 | * parent. This attempts to lose the minimal amount of work done while | 460 | * parent. This attempts to lose the minimal amount of work done while |
502 | * still freeing memory. | 461 | * still freeing memory. |
503 | */ | 462 | */ |
504 | do_posix_clock_monotonic_gettime(&uptime); | ||
505 | do { | 463 | do { |
506 | list_for_each_entry(child, &t->children, sibling) { | 464 | list_for_each_entry(child, &t->children, sibling) { |
507 | unsigned long child_points; | 465 | unsigned int child_points; |
508 | 466 | ||
509 | /* badness() returns 0 if the thread is unkillable */ | 467 | /* |
510 | child_points = badness(child, mem, nodemask, | 468 | * oom_badness() returns 0 if the thread is unkillable |
511 | uptime.tv_sec); | 469 | */ |
470 | child_points = oom_badness(child, mem, nodemask, | ||
471 | totalpages); | ||
512 | if (child_points > victim_points) { | 472 | if (child_points > victim_points) { |
513 | victim = child; | 473 | victim = child; |
514 | victim_points = child_points; | 474 | victim_points = child_points; |
@@ -546,17 +506,19 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, | |||
546 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | 506 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR |
547 | void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | 507 | void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) |
548 | { | 508 | { |
549 | unsigned long points = 0; | 509 | unsigned long limit; |
510 | unsigned int points = 0; | ||
550 | struct task_struct *p; | 511 | struct task_struct *p; |
551 | 512 | ||
552 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0); | 513 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0); |
514 | limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; | ||
553 | read_lock(&tasklist_lock); | 515 | read_lock(&tasklist_lock); |
554 | retry: | 516 | retry: |
555 | p = select_bad_process(&points, mem, NULL); | 517 | p = select_bad_process(&points, limit, mem, NULL); |
556 | if (!p || PTR_ERR(p) == -1UL) | 518 | if (!p || PTR_ERR(p) == -1UL) |
557 | goto out; | 519 | goto out; |
558 | 520 | ||
559 | if (oom_kill_process(p, gfp_mask, 0, points, mem, NULL, | 521 | if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, |
560 | "Memory cgroup out of memory")) | 522 | "Memory cgroup out of memory")) |
561 | goto retry; | 523 | goto retry; |
562 | out: | 524 | out: |
@@ -681,8 +643,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
681 | int order, nodemask_t *nodemask) | 643 | int order, nodemask_t *nodemask) |
682 | { | 644 | { |
683 | struct task_struct *p; | 645 | struct task_struct *p; |
646 | unsigned long totalpages; | ||
684 | unsigned long freed = 0; | 647 | unsigned long freed = 0; |
685 | unsigned long points; | 648 | unsigned int points; |
686 | enum oom_constraint constraint = CONSTRAINT_NONE; | 649 | enum oom_constraint constraint = CONSTRAINT_NONE; |
687 | 650 | ||
688 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); | 651 | blocking_notifier_call_chain(&oom_notify_list, 0, &freed); |
@@ -705,8 +668,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
705 | * Check if there were limitations on the allocation (only relevant for | 668 | * Check if there were limitations on the allocation (only relevant for |
706 | * NUMA) that may require different handling. | 669 | * NUMA) that may require different handling. |
707 | */ | 670 | */ |
708 | if (zonelist) | 671 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask, |
709 | constraint = constrained_alloc(zonelist, gfp_mask, nodemask); | 672 | &totalpages); |
710 | check_panic_on_oom(constraint, gfp_mask, order); | 673 | check_panic_on_oom(constraint, gfp_mask, order); |
711 | 674 | ||
712 | read_lock(&tasklist_lock); | 675 | read_lock(&tasklist_lock); |
@@ -718,14 +681,14 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, | |||
718 | * non-zero, current could not be killed so we must fallback to | 681 | * non-zero, current could not be killed so we must fallback to |
719 | * the tasklist scan. | 682 | * the tasklist scan. |
720 | */ | 683 | */ |
721 | if (!oom_kill_process(current, gfp_mask, order, 0, NULL, | 684 | if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, |
722 | nodemask, | 685 | NULL, nodemask, |
723 | "Out of memory (oom_kill_allocating_task)")) | 686 | "Out of memory (oom_kill_allocating_task)")) |
724 | return; | 687 | return; |
725 | } | 688 | } |
726 | 689 | ||
727 | retry: | 690 | retry: |
728 | p = select_bad_process(&points, NULL, | 691 | p = select_bad_process(&points, totalpages, NULL, |
729 | constraint == CONSTRAINT_MEMORY_POLICY ? nodemask : | 692 | constraint == CONSTRAINT_MEMORY_POLICY ? nodemask : |
730 | NULL); | 693 | NULL); |
731 | if (PTR_ERR(p) == -1UL) | 694 | if (PTR_ERR(p) == -1UL) |
@@ -738,8 +701,8 @@ retry: | |||
738 | panic("Out of memory and no killable processes...\n"); | 701 | panic("Out of memory and no killable processes...\n"); |
739 | } | 702 | } |
740 | 703 | ||
741 | if (oom_kill_process(p, gfp_mask, order, points, NULL, nodemask, | 704 | if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, |
742 | "Out of memory")) | 705 | nodemask, "Out of memory")) |
743 | goto retry; | 706 | goto retry; |
744 | read_unlock(&tasklist_lock); | 707 | read_unlock(&tasklist_lock); |
745 | 708 | ||