aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorKOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>2009-09-21 20:03:13 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-09-22 10:17:39 -0400
commit28b83c5193e7ab951e402252278f2cc79dc4d298 (patch)
tree10080e8d3957c2a03f8419ab44c9ecb0ffcdaee0
parentf168e1b6390e2d79cf57e48e6ae6d9b0a9e2851a (diff)
oom: move oom_adj value from task_struct to signal_struct
Currently, the OOM logic call flow is as follows: __out_of_memory() calls select_bad_process(), which for each task calls badness() to calculate the badness of one task; then oom_kill_process() searches for a child, and oom_kill_task() kills the target task and the tasks sharing its mm. For example, suppose process-A has two threads, thread-A and thread-B, and it has very fat memory, with each thread having the following oom_adj and oom_score: thread-A: oom_adj = OOM_DISABLE, oom_score = 0; thread-B: oom_adj = 0, oom_score = very-high. Then select_bad_process() selects thread-B, but oom_kill_task() refuses to kill the task because thread-A has OOM_DISABLE. Thus __out_of_memory() calls select_bad_process() again, but select_bad_process() selects the same task. This means the kernel falls into a livelock. The fact is, select_bad_process() must select a killable task; otherwise the OOM logic goes into livelock. And the root cause is that oom_adj shouldn't be a per-thread value; it should be a per-process value, because the OOM killer kills a process, not a thread. Thus this patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct signal_struct. This naturally prevents select_bad_process() from choosing the wrong task. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: David Rientjes <rientjes@google.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Rik van Riel <riel@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/proc/base.c24
-rw-r--r--include/linux/sched.h3
-rw-r--r--kernel/fork.c2
-rw-r--r--mm/oom_kill.c34
4 files changed, 39 insertions, 24 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6f742f6658a9..81cfff82875b 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
999 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); 999 struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
1000 char buffer[PROC_NUMBUF]; 1000 char buffer[PROC_NUMBUF];
1001 size_t len; 1001 size_t len;
1002 int oom_adjust; 1002 int oom_adjust = OOM_DISABLE;
1003 unsigned long flags;
1003 1004
1004 if (!task) 1005 if (!task)
1005 return -ESRCH; 1006 return -ESRCH;
1006 oom_adjust = task->oomkilladj; 1007
1008 if (lock_task_sighand(task, &flags)) {
1009 oom_adjust = task->signal->oom_adj;
1010 unlock_task_sighand(task, &flags);
1011 }
1012
1007 put_task_struct(task); 1013 put_task_struct(task);
1008 1014
1009 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); 1015 len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@@ -1017,6 +1023,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1017 struct task_struct *task; 1023 struct task_struct *task;
1018 char buffer[PROC_NUMBUF], *end; 1024 char buffer[PROC_NUMBUF], *end;
1019 int oom_adjust; 1025 int oom_adjust;
1026 unsigned long flags;
1020 1027
1021 memset(buffer, 0, sizeof(buffer)); 1028 memset(buffer, 0, sizeof(buffer));
1022 if (count > sizeof(buffer) - 1) 1029 if (count > sizeof(buffer) - 1)
@@ -1032,11 +1039,20 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
1032 task = get_proc_task(file->f_path.dentry->d_inode); 1039 task = get_proc_task(file->f_path.dentry->d_inode);
1033 if (!task) 1040 if (!task)
1034 return -ESRCH; 1041 return -ESRCH;
1035 if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { 1042 if (!lock_task_sighand(task, &flags)) {
1043 put_task_struct(task);
1044 return -ESRCH;
1045 }
1046
1047 if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
1048 unlock_task_sighand(task, &flags);
1036 put_task_struct(task); 1049 put_task_struct(task);
1037 return -EACCES; 1050 return -EACCES;
1038 } 1051 }
1039 task->oomkilladj = oom_adjust; 1052
1053 task->signal->oom_adj = oom_adjust;
1054
1055 unlock_task_sighand(task, &flags);
1040 put_task_struct(task); 1056 put_task_struct(task);
1041 if (end - buffer == 0) 1057 if (end - buffer == 0)
1042 return -EIO; 1058 return -EIO;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 899d7304d594..17e9a8e9a51d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -639,6 +639,8 @@ struct signal_struct {
639 unsigned audit_tty; 639 unsigned audit_tty;
640 struct tty_audit_buf *tty_audit_buf; 640 struct tty_audit_buf *tty_audit_buf;
641#endif 641#endif
642
643 int oom_adj; /* OOM kill score adjustment (bit shift) */
642}; 644};
643 645
644/* Context switch must be unlocked if interrupts are to be enabled */ 646/* Context switch must be unlocked if interrupts are to be enabled */
@@ -1221,7 +1223,6 @@ struct task_struct {
1221 * a short time 1223 * a short time
1222 */ 1224 */
1223 unsigned char fpu_counter; 1225 unsigned char fpu_counter;
1224 s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
1225#ifdef CONFIG_BLK_DEV_IO_TRACE 1226#ifdef CONFIG_BLK_DEV_IO_TRACE
1226 unsigned int btrace_seq; 1227 unsigned int btrace_seq;
1227#endif 1228#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 73a442b7be6d..1020977b57ca 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -880,6 +880,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
880 880
881 tty_audit_fork(sig); 881 tty_audit_fork(sig);
882 882
883 sig->oom_adj = current->signal->oom_adj;
884
883 return 0; 885 return 0;
884} 886}
885 887
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index da4c342f2641..630b77fe862f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,6 +58,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 int oom_adj = p->signal->oom_adj;
62
63 if (oom_adj == OOM_DISABLE)
64 return 0;
61 65
62 task_lock(p); 66 task_lock(p);
63 mm = p->mm; 67 mm = p->mm;
@@ -148,15 +152,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
148 points /= 8; 152 points /= 8;
149 153
150 /* 154 /*
151 * Adjust the score by oomkilladj. 155 * Adjust the score by oom_adj.
152 */ 156 */
153 if (p->oomkilladj) { 157 if (oom_adj) {
154 if (p->oomkilladj > 0) { 158 if (oom_adj > 0) {
155 if (!points) 159 if (!points)
156 points = 1; 160 points = 1;
157 points <<= p->oomkilladj; 161 points <<= oom_adj;
158 } else 162 } else
159 points >>= -(p->oomkilladj); 163 points >>= -(oom_adj);
160 } 164 }
161 165
162#ifdef DEBUG 166#ifdef DEBUG
@@ -251,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
251 *ppoints = ULONG_MAX; 255 *ppoints = ULONG_MAX;
252 } 256 }
253 257
254 if (p->oomkilladj == OOM_DISABLE) 258 if (p->signal->oom_adj == OOM_DISABLE)
255 continue; 259 continue;
256 260
257 points = badness(p, uptime.tv_sec); 261 points = badness(p, uptime.tv_sec);
@@ -304,7 +308,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
304 } 308 }
305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 309 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 310 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, 311 get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
308 p->comm); 312 p->comm);
309 task_unlock(p); 313 task_unlock(p);
310 } while_each_thread(g, p); 314 } while_each_thread(g, p);
@@ -359,18 +363,9 @@ static int oom_kill_task(struct task_struct *p)
359 * change to NULL at any time since we do not hold task_lock(p). 363 * change to NULL at any time since we do not hold task_lock(p).
360 * However, this is of no concern to us. 364 * However, this is of no concern to us.
361 */ 365 */
362 366 if (!mm || p->signal->oom_adj == OOM_DISABLE)
363 if (mm == NULL)
364 return 1; 367 return 1;
365 368
366 /*
367 * Don't kill the process if any threads are set to OOM_DISABLE
368 */
369 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
371 return 1;
372 } while_each_thread(g, q);
373
374 __oom_kill_task(p, 1); 369 __oom_kill_task(p, 1);
375 370
376 /* 371 /*
@@ -394,8 +389,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
394 389
395 if (printk_ratelimit()) { 390 if (printk_ratelimit()) {
396 printk(KERN_WARNING "%s invoked oom-killer: " 391 printk(KERN_WARNING "%s invoked oom-killer: "
397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", 392 "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
398 current->comm, gfp_mask, order, current->oomkilladj); 393 current->comm, gfp_mask, order,
394 current->signal->oom_adj);
399 task_lock(current); 395 task_lock(current);
400 cpuset_print_task_mems_allowed(current); 396 cpuset_print_task_mems_allowed(current);
401 task_unlock(current); 397 task_unlock(current);