diff options
author | KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> | 2009-09-21 20:03:13 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-09-22 10:17:39 -0400 |
commit | 28b83c5193e7ab951e402252278f2cc79dc4d298 (patch) | |
tree | 10080e8d3957c2a03f8419ab44c9ecb0ffcdaee0 | |
parent | f168e1b6390e2d79cf57e48e6ae6d9b0a9e2851a (diff) |
oom: move oom_adj value from task_struct to signal_struct
Currently, the OOM logic call flow is as follows:
    __out_of_memory()
        select_bad_process()    - for each task
            badness()           - calculate badness of one task
        oom_kill_process()      - search child
            oom_kill_task()     - kill target task and mm shared tasks with it
For example, process-A has two threads, thread-A and thread-B, it uses a very
large amount of memory, and each thread has the following oom_adj and oom_score:
thread-A: oom_adj = OOM_DISABLE, oom_score = 0
thread-B: oom_adj = 0, oom_score = very-high
Then select_bad_process() selects thread-B, but oom_kill_task() refuses to
kill the task because thread-A has OOM_DISABLE. Thus __out_of_memory()
calls select_bad_process() again, but select_bad_process() selects the same
task. This means the kernel falls into a livelock.
The fact is that select_bad_process() must select a killable task; otherwise
the OOM logic goes into a livelock.
The root cause is that oom_adj shouldn't be a per-thread value. It should be
a per-process value because the OOM killer kills a process, not a thread. Thus
this patch moves oomkilladj (now more appropriately named oom_adj) from
struct task_struct to struct signal_struct. This naturally prevents
select_bad_process() from choosing the wrong task.
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | fs/proc/base.c | 24 | ||||
-rw-r--r-- | include/linux/sched.h | 3 | ||||
-rw-r--r-- | kernel/fork.c | 2 | ||||
-rw-r--r-- | mm/oom_kill.c | 34 |
4 files changed, 39 insertions, 24 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c index 6f742f6658a9..81cfff82875b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, | |||
999 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); | 999 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); |
1000 | char buffer[PROC_NUMBUF]; | 1000 | char buffer[PROC_NUMBUF]; |
1001 | size_t len; | 1001 | size_t len; |
1002 | int oom_adjust; | 1002 | int oom_adjust = OOM_DISABLE; |
1003 | unsigned long flags; | ||
1003 | 1004 | ||
1004 | if (!task) | 1005 | if (!task) |
1005 | return -ESRCH; | 1006 | return -ESRCH; |
1006 | oom_adjust = task->oomkilladj; | 1007 | |
1008 | if (lock_task_sighand(task, &flags)) { | ||
1009 | oom_adjust = task->signal->oom_adj; | ||
1010 | unlock_task_sighand(task, &flags); | ||
1011 | } | ||
1012 | |||
1007 | put_task_struct(task); | 1013 | put_task_struct(task); |
1008 | 1014 | ||
1009 | len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); | 1015 | len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); |
@@ -1017,6 +1023,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, | |||
1017 | struct task_struct *task; | 1023 | struct task_struct *task; |
1018 | char buffer[PROC_NUMBUF], *end; | 1024 | char buffer[PROC_NUMBUF], *end; |
1019 | int oom_adjust; | 1025 | int oom_adjust; |
1026 | unsigned long flags; | ||
1020 | 1027 | ||
1021 | memset(buffer, 0, sizeof(buffer)); | 1028 | memset(buffer, 0, sizeof(buffer)); |
1022 | if (count > sizeof(buffer) - 1) | 1029 | if (count > sizeof(buffer) - 1) |
@@ -1032,11 +1039,20 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, | |||
1032 | task = get_proc_task(file->f_path.dentry->d_inode); | 1039 | task = get_proc_task(file->f_path.dentry->d_inode); |
1033 | if (!task) | 1040 | if (!task) |
1034 | return -ESRCH; | 1041 | return -ESRCH; |
1035 | if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { | 1042 | if (!lock_task_sighand(task, &flags)) { |
1043 | put_task_struct(task); | ||
1044 | return -ESRCH; | ||
1045 | } | ||
1046 | |||
1047 | if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) { | ||
1048 | unlock_task_sighand(task, &flags); | ||
1036 | put_task_struct(task); | 1049 | put_task_struct(task); |
1037 | return -EACCES; | 1050 | return -EACCES; |
1038 | } | 1051 | } |
1039 | task->oomkilladj = oom_adjust; | 1052 | |
1053 | task->signal->oom_adj = oom_adjust; | ||
1054 | |||
1055 | unlock_task_sighand(task, &flags); | ||
1040 | put_task_struct(task); | 1056 | put_task_struct(task); |
1041 | if (end - buffer == 0) | 1057 | if (end - buffer == 0) |
1042 | return -EIO; | 1058 | return -EIO; |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 899d7304d594..17e9a8e9a51d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -639,6 +639,8 @@ struct signal_struct { | |||
639 | unsigned audit_tty; | 639 | unsigned audit_tty; |
640 | struct tty_audit_buf *tty_audit_buf; | 640 | struct tty_audit_buf *tty_audit_buf; |
641 | #endif | 641 | #endif |
642 | |||
643 | int oom_adj; /* OOM kill score adjustment (bit shift) */ | ||
642 | }; | 644 | }; |
643 | 645 | ||
644 | /* Context switch must be unlocked if interrupts are to be enabled */ | 646 | /* Context switch must be unlocked if interrupts are to be enabled */ |
@@ -1221,7 +1223,6 @@ struct task_struct { | |||
1221 | * a short time | 1223 | * a short time |
1222 | */ | 1224 | */ |
1223 | unsigned char fpu_counter; | 1225 | unsigned char fpu_counter; |
1224 | s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ | ||
1225 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 1226 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
1226 | unsigned int btrace_seq; | 1227 | unsigned int btrace_seq; |
1227 | #endif | 1228 | #endif |
diff --git a/kernel/fork.c b/kernel/fork.c index 73a442b7be6d..1020977b57ca 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -880,6 +880,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) | |||
880 | 880 | ||
881 | tty_audit_fork(sig); | 881 | tty_audit_fork(sig); |
882 | 882 | ||
883 | sig->oom_adj = current->signal->oom_adj; | ||
884 | |||
883 | return 0; | 885 | return 0; |
884 | } | 886 | } |
885 | 887 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index da4c342f2641..630b77fe862f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -58,6 +58,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
58 | unsigned long points, cpu_time, run_time; | 58 | unsigned long points, cpu_time, run_time; |
59 | struct mm_struct *mm; | 59 | struct mm_struct *mm; |
60 | struct task_struct *child; | 60 | struct task_struct *child; |
61 | int oom_adj = p->signal->oom_adj; | ||
62 | |||
63 | if (oom_adj == OOM_DISABLE) | ||
64 | return 0; | ||
61 | 65 | ||
62 | task_lock(p); | 66 | task_lock(p); |
63 | mm = p->mm; | 67 | mm = p->mm; |
@@ -148,15 +152,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
148 | points /= 8; | 152 | points /= 8; |
149 | 153 | ||
150 | /* | 154 | /* |
151 | * Adjust the score by oomkilladj. | 155 | * Adjust the score by oom_adj. |
152 | */ | 156 | */ |
153 | if (p->oomkilladj) { | 157 | if (oom_adj) { |
154 | if (p->oomkilladj > 0) { | 158 | if (oom_adj > 0) { |
155 | if (!points) | 159 | if (!points) |
156 | points = 1; | 160 | points = 1; |
157 | points <<= p->oomkilladj; | 161 | points <<= oom_adj; |
158 | } else | 162 | } else |
159 | points >>= -(p->oomkilladj); | 163 | points >>= -(oom_adj); |
160 | } | 164 | } |
161 | 165 | ||
162 | #ifdef DEBUG | 166 | #ifdef DEBUG |
@@ -251,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
251 | *ppoints = ULONG_MAX; | 255 | *ppoints = ULONG_MAX; |
252 | } | 256 | } |
253 | 257 | ||
254 | if (p->oomkilladj == OOM_DISABLE) | 258 | if (p->signal->oom_adj == OOM_DISABLE) |
255 | continue; | 259 | continue; |
256 | 260 | ||
257 | points = badness(p, uptime.tv_sec); | 261 | points = badness(p, uptime.tv_sec); |
@@ -304,7 +308,7 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
304 | } | 308 | } |
305 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | 309 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
306 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, | 310 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, |
307 | get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, | 311 | get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, |
308 | p->comm); | 312 | p->comm); |
309 | task_unlock(p); | 313 | task_unlock(p); |
310 | } while_each_thread(g, p); | 314 | } while_each_thread(g, p); |
@@ -359,18 +363,9 @@ static int oom_kill_task(struct task_struct *p) | |||
359 | * change to NULL at any time since we do not hold task_lock(p). | 363 | * change to NULL at any time since we do not hold task_lock(p). |
360 | * However, this is of no concern to us. | 364 | * However, this is of no concern to us. |
361 | */ | 365 | */ |
362 | 366 | if (!mm || p->signal->oom_adj == OOM_DISABLE) | |
363 | if (mm == NULL) | ||
364 | return 1; | 367 | return 1; |
365 | 368 | ||
366 | /* | ||
367 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
368 | */ | ||
369 | do_each_thread(g, q) { | ||
370 | if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | ||
371 | return 1; | ||
372 | } while_each_thread(g, q); | ||
373 | |||
374 | __oom_kill_task(p, 1); | 369 | __oom_kill_task(p, 1); |
375 | 370 | ||
376 | /* | 371 | /* |
@@ -394,8 +389,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
394 | 389 | ||
395 | if (printk_ratelimit()) { | 390 | if (printk_ratelimit()) { |
396 | printk(KERN_WARNING "%s invoked oom-killer: " | 391 | printk(KERN_WARNING "%s invoked oom-killer: " |
397 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", | 392 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", |
398 | current->comm, gfp_mask, order, current->oomkilladj); | 393 | current->comm, gfp_mask, order, |
394 | current->signal->oom_adj); | ||
399 | task_lock(current); | 395 | task_lock(current); |
400 | cpuset_print_task_mems_allowed(current); | 396 | cpuset_print_task_mems_allowed(current); |
401 | task_unlock(current); | 397 | task_unlock(current); |