diff options
author | David Rientjes <rientjes@google.com> | 2009-06-16 18:32:56 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-16 22:47:43 -0400 |
commit | 2ff05b2b4eac2e63d345fc731ea151a060247f53 (patch) | |
tree | 1840bc2d3b381eca5d39869499339b0fcc6eabbf | |
parent | c9e444103b5e7a5a3519f9913f59767f92e33baf (diff) |
oom: move oom_adj value from task_struct to mm_struct
The per-task oom_adj value is a characteristic of its mm more than the
task itself since it's not possible to oom kill any thread that shares the
mm. If a task were to be killed while attached to an mm that could not be
freed because another thread was set to OOM_DISABLE, it would have
needlessly been terminated since there is no potential for future memory
freeing.
This patch moves oomkilladj (now more appropriately named oom_adj) from
struct task_struct to struct mm_struct. This requires task_lock() on a
task to check its oom_adj value to protect against exec, but it's already
necessary to take the lock when dereferencing the mm to find the total VM
size for the badness heuristic.
This fixes a livelock if the oom killer chooses a task and another thread
sharing the same memory has an oom_adj value of OOM_DISABLE. This occurs
because oom_kill_task() repeatedly returns 1 and refuses to kill the
chosen task while select_bad_process() will repeatedly choose the same
task during the next retry.
Taking task_lock() in select_bad_process() to check for OOM_DISABLE and in
oom_kill_task() to check for threads sharing the same memory will be
removed in the next patch in this series where it will no longer be
necessary.
Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since
these threads are immune from oom killing already. Reading the file for a
kthread simply reports an oom_adj value of OOM_DISABLE.
Cc: Nick Piggin <npiggin@suse.de>
Cc: Rik van Riel <riel@redhat.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/filesystems/proc.txt | 15 | ||||
-rw-r--r-- | fs/proc/base.c | 19 | ||||
-rw-r--r-- | include/linux/mm_types.h | 2 | ||||
-rw-r--r-- | include/linux/sched.h | 1 | ||||
-rw-r--r-- | mm/oom_kill.c | 34 |
5 files changed, 50 insertions, 21 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index cd8717a36271..ebff3c10a07f 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
@@ -1003,11 +1003,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS | |||
1003 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score | 1003 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score |
1004 | ------------------------------------------------------ | 1004 | ------------------------------------------------------ |
1005 | 1005 | ||
1006 | This file can be used to adjust the score used to select which processes | 1006 | This file can be used to adjust the score used to select which processes should |
1007 | should be killed in an out-of-memory situation. Giving it a high score will | 1007 | be killed in an out-of-memory situation. The oom_adj value is a characteristic |
1008 | increase the likelihood of this process being killed by the oom-killer. Valid | 1008 | of the task's mm, so all threads that share an mm with pid will have the same |
1009 | values are in the range -16 to +15, plus the special value -17, which disables | 1009 | oom_adj value. A high value will increase the likelihood of this process being |
1010 | oom-killing altogether for this process. | 1010 | killed by the oom-killer. Valid values are in the range -16 to +15 as |
1011 | explained below and a special value of -17, which disables oom-killing | ||
1012 | altogether for threads sharing pid's mm. | ||
1011 | 1013 | ||
1012 | The process to be killed in an out-of-memory situation is selected among all others | 1014 | The process to be killed in an out-of-memory situation is selected among all others |
1013 | based on its badness score. This value equals the original memory size of the process | 1015 | based on its badness score. This value equals the original memory size of the process |
@@ -1021,6 +1023,9 @@ the parent's score if they do not share the same memory. Thus forking servers | |||
1021 | are the prime candidates to be killed. Having only one 'hungry' child will make | 1023 | are the prime candidates to be killed. Having only one 'hungry' child will make |
1022 | parent less preferable than the child. | 1024 | parent less preferable than the child. |
1023 | 1025 | ||
1026 | /proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from | ||
1027 | oom-killing already. | ||
1028 | |||
1024 | /proc/<pid>/oom_score shows process' current badness score. | 1029 | /proc/<pid>/oom_score shows process' current badness score. |
1025 | 1030 | ||
1026 | The following heuristics are then applied: | 1031 | The following heuristics are then applied: |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 1539e630c47d..3ce5ae9e3d2d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -1006,7 +1006,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, | |||
1006 | 1006 | ||
1007 | if (!task) | 1007 | if (!task) |
1008 | return -ESRCH; | 1008 | return -ESRCH; |
1009 | oom_adjust = task->oomkilladj; | 1009 | task_lock(task); |
1010 | if (task->mm) | ||
1011 | oom_adjust = task->mm->oom_adj; | ||
1012 | else | ||
1013 | oom_adjust = OOM_DISABLE; | ||
1014 | task_unlock(task); | ||
1010 | put_task_struct(task); | 1015 | put_task_struct(task); |
1011 | 1016 | ||
1012 | len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); | 1017 | len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); |
@@ -1035,11 +1040,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, | |||
1035 | task = get_proc_task(file->f_path.dentry->d_inode); | 1040 | task = get_proc_task(file->f_path.dentry->d_inode); |
1036 | if (!task) | 1041 | if (!task) |
1037 | return -ESRCH; | 1042 | return -ESRCH; |
1038 | if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { | 1043 | task_lock(task); |
1044 | if (!task->mm) { | ||
1045 | task_unlock(task); | ||
1046 | put_task_struct(task); | ||
1047 | return -EINVAL; | ||
1048 | } | ||
1049 | if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { | ||
1050 | task_unlock(task); | ||
1039 | put_task_struct(task); | 1051 | put_task_struct(task); |
1040 | return -EACCES; | 1052 | return -EACCES; |
1041 | } | 1053 | } |
1042 | task->oomkilladj = oom_adjust; | 1054 | task->mm->oom_adj = oom_adjust; |
1055 | task_unlock(task); | ||
1043 | put_task_struct(task); | 1056 | put_task_struct(task); |
1044 | if (end - buffer == 0) | 1057 | if (end - buffer == 0) |
1045 | return -EIO; | 1058 | return -EIO; |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e80e26ecf21..f4408106fcbc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -232,6 +232,8 @@ struct mm_struct { | |||
232 | 232 | ||
233 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ | 233 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ |
234 | 234 | ||
235 | s8 oom_adj; /* OOM kill score adjustment (bit shift) */ | ||
236 | |||
235 | cpumask_t cpu_vm_mask; | 237 | cpumask_t cpu_vm_mask; |
236 | 238 | ||
237 | /* Architecture-specific MM context */ | 239 | /* Architecture-specific MM context */ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 1048bf50540a..1bc6fae0c135 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1178,7 +1178,6 @@ struct task_struct { | |||
1178 | * a short time | 1178 | * a short time |
1179 | */ | 1179 | */ |
1180 | unsigned char fpu_counter; | 1180 | unsigned char fpu_counter; |
1181 | s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ | ||
1182 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 1181 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
1183 | unsigned int btrace_seq; | 1182 | unsigned int btrace_seq; |
1184 | #endif | 1183 | #endif |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7b2460e922b..b60913520ef3 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
58 | unsigned long points, cpu_time, run_time; | 58 | unsigned long points, cpu_time, run_time; |
59 | struct mm_struct *mm; | 59 | struct mm_struct *mm; |
60 | struct task_struct *child; | 60 | struct task_struct *child; |
61 | int oom_adj; | ||
61 | 62 | ||
62 | task_lock(p); | 63 | task_lock(p); |
63 | mm = p->mm; | 64 | mm = p->mm; |
@@ -65,6 +66,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
65 | task_unlock(p); | 66 | task_unlock(p); |
66 | return 0; | 67 | return 0; |
67 | } | 68 | } |
69 | oom_adj = mm->oom_adj; | ||
68 | 70 | ||
69 | /* | 71 | /* |
70 | * The memory size of the process is the basis for the badness. | 72 | * The memory size of the process is the basis for the badness. |
@@ -148,15 +150,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
148 | points /= 8; | 150 | points /= 8; |
149 | 151 | ||
150 | /* | 152 | /* |
151 | * Adjust the score by oomkilladj. | 153 | * Adjust the score by oom_adj. |
152 | */ | 154 | */ |
153 | if (p->oomkilladj) { | 155 | if (oom_adj) { |
154 | if (p->oomkilladj > 0) { | 156 | if (oom_adj > 0) { |
155 | if (!points) | 157 | if (!points) |
156 | points = 1; | 158 | points = 1; |
157 | points <<= p->oomkilladj; | 159 | points <<= oom_adj; |
158 | } else | 160 | } else |
159 | points >>= -(p->oomkilladj); | 161 | points >>= -(oom_adj); |
160 | } | 162 | } |
161 | 163 | ||
162 | #ifdef DEBUG | 164 | #ifdef DEBUG |
@@ -251,8 +253,12 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
251 | *ppoints = ULONG_MAX; | 253 | *ppoints = ULONG_MAX; |
252 | } | 254 | } |
253 | 255 | ||
254 | if (p->oomkilladj == OOM_DISABLE) | 256 | task_lock(p); |
257 | if (p->mm && p->mm->oom_adj == OOM_DISABLE) { | ||
258 | task_unlock(p); | ||
255 | continue; | 259 | continue; |
260 | } | ||
261 | task_unlock(p); | ||
256 | 262 | ||
257 | points = badness(p, uptime.tv_sec); | 263 | points = badness(p, uptime.tv_sec); |
258 | if (points > *ppoints || !chosen) { | 264 | if (points > *ppoints || !chosen) { |
@@ -304,8 +310,7 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
304 | } | 310 | } |
305 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | 311 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
306 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, | 312 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, |
307 | get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, | 313 | get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); |
308 | p->comm); | ||
309 | task_unlock(p); | 314 | task_unlock(p); |
310 | } while_each_thread(g, p); | 315 | } while_each_thread(g, p); |
311 | } | 316 | } |
@@ -367,8 +372,12 @@ static int oom_kill_task(struct task_struct *p) | |||
367 | * Don't kill the process if any threads are set to OOM_DISABLE | 372 | * Don't kill the process if any threads are set to OOM_DISABLE |
368 | */ | 373 | */ |
369 | do_each_thread(g, q) { | 374 | do_each_thread(g, q) { |
370 | if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | 375 | task_lock(q); |
376 | if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) { | ||
377 | task_unlock(q); | ||
371 | return 1; | 378 | return 1; |
379 | } | ||
380 | task_unlock(q); | ||
372 | } while_each_thread(g, q); | 381 | } while_each_thread(g, q); |
373 | 382 | ||
374 | __oom_kill_task(p, 1); | 383 | __oom_kill_task(p, 1); |
@@ -393,10 +402,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
393 | struct task_struct *c; | 402 | struct task_struct *c; |
394 | 403 | ||
395 | if (printk_ratelimit()) { | 404 | if (printk_ratelimit()) { |
396 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
397 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | ||
398 | current->comm, gfp_mask, order, current->oomkilladj); | ||
399 | task_lock(current); | 405 | task_lock(current); |
406 | printk(KERN_WARNING "%s invoked oom-killer: " | ||
407 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | ||
408 | current->comm, gfp_mask, order, | ||
409 | current->mm ? current->mm->oom_adj : OOM_DISABLE); | ||
400 | cpuset_print_task_mems_allowed(current); | 410 | cpuset_print_task_mems_allowed(current); |
401 | task_unlock(current); | 411 | task_unlock(current); |
402 | dump_stack(); | 412 | dump_stack(); |