aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDavid Rientjes <rientjes@google.com>2009-06-16 18:32:56 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-06-16 22:47:43 -0400
commit2ff05b2b4eac2e63d345fc731ea151a060247f53 (patch)
tree1840bc2d3b381eca5d39869499339b0fcc6eabbf
parentc9e444103b5e7a5a3519f9913f59767f92e33baf (diff)
oom: move oom_adj value from task_struct to mm_struct
The per-task oom_adj value is a characteristic of its mm more than the task itself since it's not possible to oom kill any thread that shares the mm. If a task were to be killed while attached to an mm that could not be freed because another thread were set to OOM_DISABLE, it would have needlessly been terminated since there is no potential for future memory freeing. This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct mm_struct. This requires task_lock() on a task to check its oom_adj value to protect against exec, but it's already necessary to take the lock when dereferencing the mm to find the total VM size for the badness heuristic. This fixes a livelock if the oom killer chooses a task and another thread sharing the same memory has an oom_adj value of OOM_DISABLE. This occurs because oom_kill_task() repeatedly returns 1 and refuses to kill the chosen task while select_bad_process() will repeatedly choose the same task during the next retry. Taking task_lock() in select_bad_process() to check for OOM_DISABLE and in oom_kill_task() to check for threads sharing the same memory will be removed in the next patch in this series where it will no longer be necessary. Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since these threads are immune from oom killing already. They simply report an oom_adj value of OOM_DISABLE. Cc: Nick Piggin <npiggin@suse.de> Cc: Rik van Riel <riel@redhat.com> Cc: Mel Gorman <mel@csn.ul.ie> Signed-off-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--Documentation/filesystems/proc.txt15
-rw-r--r--fs/proc/base.c19
-rw-r--r--include/linux/mm_types.h2
-rw-r--r--include/linux/sched.h1
-rw-r--r--mm/oom_kill.c34
5 files changed, 50 insertions, 21 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index cd8717a36271..ebff3c10a07f 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1003,11 +1003,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS
10033.1 /proc/<pid>/oom_adj - Adjust the oom-killer score 10033.1 /proc/<pid>/oom_adj - Adjust the oom-killer score
1004------------------------------------------------------ 1004------------------------------------------------------
1005 1005
1006This file can be used to adjust the score used to select which processes 1006This file can be used to adjust the score used to select which processes should
1007should be killed in an out-of-memory situation. Giving it a high score will 1007be killed in an out-of-memory situation. The oom_adj value is a characteristic
1008increase the likelihood of this process being killed by the oom-killer. Valid 1008of the task's mm, so all threads that share an mm with pid will have the same
1009values are in the range -16 to +15, plus the special value -17, which disables 1009oom_adj value. A high value will increase the likelihood of this process being
1010oom-killing altogether for this process. 1010killed by the oom-killer. Valid values are in the range -16 to +15 as
1011explained below and a special value of -17, which disables oom-killing
1012altogether for threads sharing pid's mm.
1011 1013
1012The process to be killed in an out-of-memory situation is selected among all others 1014The process to be killed in an out-of-memory situation is selected among all others
1013based on its badness score. This value equals the original memory size of the process 1015based on its badness score. This value equals the original memory size of the process
@@ -1021,6 +1023,9 @@ the parent's score if they do not share the same memory. Thus forking servers
1021are the prime candidates to be killed. Having only one 'hungry' child will make 1023are the prime candidates to be killed. Having only one 'hungry' child will make
1022parent less preferable than the child. 1024parent less preferable than the child.
1023 1025
1026/proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from
1027oom-killing already.
1028
1024/proc/<pid>/oom_score shows process' current badness score. 1029/proc/<pid>/oom_score shows process' current badness score.
1025 1030
1026The following heuristics are then applied: 1031The following heuristics are then applied:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 1539e630c47d..3ce5ae9e3d2d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1006,7 +1006,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
1006 1006
1007 if (!task) 1007 if (!task)
1008 return -ESRCH; 1008 return -ESRCH;
1009 oom_adjust = task->oomkilladj; 1009 task_lock(task);
1010 if (task->mm)
1011 oom_adjust = task->mm->oom_adj;
1012 else
1013 oom_adjust = OOM_DISABLE;
1014 task_unlock(task);
1010 put_task_struct(task); 1015 put_task_struct(task);
1011 1016
1012 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); 1017 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1035,11 +1040,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1035 task = get_proc_task(file->f_path.dentry->d_inode); 1040 task = get_proc_task(file->f_path.dentry->d_inode);
1036 if (!task) 1041 if (!task)
1037 return -ESRCH; 1042 return -ESRCH;
1038 if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { 1043 task_lock(task);
1044 if (!task->mm) {
1045 task_unlock(task);
1046 put_task_struct(task);
1047 return -EINVAL;
1048 }
1049 if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1050 task_unlock(task);
1039 put_task_struct(task); 1051 put_task_struct(task);
1040 return -EACCES; 1052 return -EACCES;
1041 } 1053 }
1042 task->oomkilladj = oom_adjust; 1054 task->mm->oom_adj = oom_adjust;
1055 task_unlock(task);
1043 put_task_struct(task); 1056 put_task_struct(task);
1044 if (end - buffer == 0) 1057 if (end - buffer == 0)
1045 return -EIO; 1058 return -EIO;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0e80e26ecf21..f4408106fcbc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -232,6 +232,8 @@ struct mm_struct {
232 232
233 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ 233 unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
234 234
235 s8 oom_adj; /* OOM kill score adjustment (bit shift) */
236
235 cpumask_t cpu_vm_mask; 237 cpumask_t cpu_vm_mask;
236 238
237 /* Architecture-specific MM context */ 239 /* Architecture-specific MM context */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1048bf50540a..1bc6fae0c135 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1178,7 +1178,6 @@ struct task_struct {
1178 * a short time 1178 * a short time
1179 */ 1179 */
1180 unsigned char fpu_counter; 1180 unsigned char fpu_counter;
1181 s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
1182#ifdef CONFIG_BLK_DEV_IO_TRACE 1181#ifdef CONFIG_BLK_DEV_IO_TRACE
1183 unsigned int btrace_seq; 1182 unsigned int btrace_seq;
1184#endif 1183#endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index a7b2460e922b..b60913520ef3 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 int oom_adj;
61 62
62 task_lock(p); 63 task_lock(p);
63 mm = p->mm; 64 mm = p->mm;
@@ -65,6 +66,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
65 task_unlock(p); 66 task_unlock(p);
66 return 0; 67 return 0;
67 } 68 }
69 oom_adj = mm->oom_adj;
68 70
69 /* 71 /*
70 * The memory size of the process is the basis for the badness. 72 * The memory size of the process is the basis for the badness.
@@ -148,15 +150,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
148 points /= 8; 150 points /= 8;
149 151
150 /* 152 /*
151 * Adjust the score by oomkilladj. 153 * Adjust the score by oom_adj.
152 */ 154 */
153 if (p->oomkilladj) { 155 if (oom_adj) {
154 if (p->oomkilladj > 0) { 156 if (oom_adj > 0) {
155 if (!points) 157 if (!points)
156 points = 1; 158 points = 1;
157 points <<= p->oomkilladj; 159 points <<= oom_adj;
158 } else 160 } else
159 points >>= -(p->oomkilladj); 161 points >>= -(oom_adj);
160 } 162 }
161 163
162#ifdef DEBUG 164#ifdef DEBUG
@@ -251,8 +253,12 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
251 *ppoints = ULONG_MAX; 253 *ppoints = ULONG_MAX;
252 } 254 }
253 255
254 if (p->oomkilladj == OOM_DISABLE) 256 task_lock(p);
257 if (p->mm && p->mm->oom_adj == OOM_DISABLE) {
258 task_unlock(p);
255 continue; 259 continue;
260 }
261 task_unlock(p);
256 262
257 points = badness(p, uptime.tv_sec); 263 points = badness(p, uptime.tv_sec);
258 if (points > *ppoints || !chosen) { 264 if (points > *ppoints || !chosen) {
@@ -304,8 +310,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
304 } 310 }
305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 311 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 312 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, 313 get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm);
308 p->comm);
309 task_unlock(p); 314 task_unlock(p);
310 } while_each_thread(g, p); 315 } while_each_thread(g, p);
311} 316}
@@ -367,8 +372,12 @@ static int oom_kill_task(struct task_struct *p)
367 * Don't kill the process if any threads are set to OOM_DISABLE 372 * Don't kill the process if any threads are set to OOM_DISABLE
368 */ 373 */
369 do_each_thread(g, q) { 374 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE) 375 task_lock(q);
376 if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) {
377 task_unlock(q);
371 return 1; 378 return 1;
379 }
380 task_unlock(q);
372 } while_each_thread(g, q); 381 } while_each_thread(g, q);
373 382
374 __oom_kill_task(p, 1); 383 __oom_kill_task(p, 1);
@@ -393,10 +402,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
393 struct task_struct *c; 402 struct task_struct *c;
394 403
395 if (printk_ratelimit()) { 404 if (printk_ratelimit()) {
396 printk(KERN_WARNING "%s invoked oom-killer: "
397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
398 current->comm, gfp_mask, order, current->oomkilladj);
399 task_lock(current); 405 task_lock(current);
406 printk(KERN_WARNING "%s invoked oom-killer: "
407 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
408 current->comm, gfp_mask, order,
409 current->mm ? current->mm->oom_adj : OOM_DISABLE);
400 cpuset_print_task_mems_allowed(current); 410 cpuset_print_task_mems_allowed(current);
401 task_unlock(current); 411 task_unlock(current);
402 dump_stack(); 412 dump_stack();