diff options
| author | KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> | 2009-08-18 17:11:10 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-08-18 19:31:13 -0400 |
| commit | 0753ba01e126020bf0f8150934903b48935b697d (patch) | |
| tree | fbfd7e2d0abbe724a8c5e0e17fb9af522ed2e097 | |
| parent | 89a4eb4b66e8f4d395e14a14d262dac4d6ca52f0 (diff) | |
mm: revert "oom: move oom_adj value"
The commit 2ff05b2b (oom: move oom_adj value) moved the oom_adj value to
the mm_struct. It was a very good first step toward sanitizing OOM handling.
However, Paul Menage reported that the commit causes a regression in his job
scheduler: the current OOM logic can kill an OOM_DISABLE process.
Why? His program has code similar to the following.
...
set_oom_adj(OOM_DISABLE); /* The job scheduler is never killed by oom */
...
if (vfork() == 0) {
set_oom_adj(0); /* Invoked child can be killed */
execve("foo-bar-cmd");
}
....
The vfork() parent and child share the same mm_struct, so the above
set_oom_adj(0) doesn't only change oom_adj for the vfork() child; it also
changes oom_adj for the vfork() parent. As a result, the vfork() parent (the
job scheduler) lost its OOM immunity and was killed.
Actually, the fork-setting-exec idiom is very frequently used in userland programs.
We must not break this assumption.
Therefore, this patch reverts commit 2ff05b2b and the related commits.
Reverted commit list
---------------------
- commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct)
- commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE)
- commit 8123681022 (oom: only oom kill exiting tasks with attached memory)
- commit 933b787b57 (mm: copy over oom_adj value at fork time)
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
| -rw-r--r-- | Documentation/filesystems/proc.txt | 15 | ||||
| -rw-r--r-- | fs/proc/base.c | 19 | ||||
| -rw-r--r-- | include/linux/mm_types.h | 2 | ||||
| -rw-r--r-- | include/linux/sched.h | 1 | ||||
| -rw-r--r-- | kernel/fork.c | 1 | ||||
| -rw-r--r-- | mm/oom_kill.c | 64 |
6 files changed, 48 insertions, 54 deletions
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fad18f9456e..ffead13f944 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt | |||
| @@ -1167,13 +1167,11 @@ CHAPTER 3: PER-PROCESS PARAMETERS | |||
| 1167 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score | 1167 | 3.1 /proc/<pid>/oom_adj - Adjust the oom-killer score |
| 1168 | ------------------------------------------------------ | 1168 | ------------------------------------------------------ |
| 1169 | 1169 | ||
| 1170 | This file can be used to adjust the score used to select which processes should | 1170 | This file can be used to adjust the score used to select which processes |
| 1171 | be killed in an out-of-memory situation. The oom_adj value is a characteristic | 1171 | should be killed in an out-of-memory situation. Giving it a high score will |
| 1172 | of the task's mm, so all threads that share an mm with pid will have the same | 1172 | increase the likelihood of this process being killed by the oom-killer. Valid |
| 1173 | oom_adj value. A high value will increase the likelihood of this process being | 1173 | values are in the range -16 to +15, plus the special value -17, which disables |
| 1174 | killed by the oom-killer. Valid values are in the range -16 to +15 as | 1174 | oom-killing altogether for this process. |
| 1175 | explained below and a special value of -17, which disables oom-killing | ||
| 1176 | altogether for threads sharing pid's mm. | ||
| 1177 | 1175 | ||
| 1178 | The process to be killed in an out-of-memory situation is selected among all others | 1176 | The process to be killed in an out-of-memory situation is selected among all others |
| 1179 | based on its badness score. This value equals the original memory size of the process | 1177 | based on its badness score. This value equals the original memory size of the process |
| @@ -1187,9 +1185,6 @@ the parent's score if they do not share the same memory. Thus forking servers | |||
| 1187 | are the prime candidates to be killed. Having only one 'hungry' child will make | 1185 | are the prime candidates to be killed. Having only one 'hungry' child will make |
| 1188 | parent less preferable than the child. | 1186 | parent less preferable than the child. |
| 1189 | 1187 | ||
| 1190 | /proc/<pid>/oom_adj cannot be changed for kthreads since they are immune from | ||
| 1191 | oom-killing already. | ||
| 1192 | |||
| 1193 | /proc/<pid>/oom_score shows process' current badness score. | 1188 | /proc/<pid>/oom_score shows process' current badness score. |
| 1194 | 1189 | ||
| 1195 | The following heuristics are then applied: | 1190 | The following heuristics are then applied: |
diff --git a/fs/proc/base.c b/fs/proc/base.c index 175db258942..6f742f6658a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
| @@ -1003,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, | |||
| 1003 | 1003 | ||
| 1004 | if (!task) | 1004 | if (!task) |
| 1005 | return -ESRCH; | 1005 | return -ESRCH; |
| 1006 | task_lock(task); | 1006 | oom_adjust = task->oomkilladj; |
| 1007 | if (task->mm) | ||
| 1008 | oom_adjust = task->mm->oom_adj; | ||
| 1009 | else | ||
| 1010 | oom_adjust = OOM_DISABLE; | ||
| 1011 | task_unlock(task); | ||
| 1012 | put_task_struct(task); | 1007 | put_task_struct(task); |
| 1013 | 1008 | ||
| 1014 | len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); | 1009 | len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); |
| @@ -1037,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, | |||
| 1037 | task = get_proc_task(file->f_path.dentry->d_inode); | 1032 | task = get_proc_task(file->f_path.dentry->d_inode); |
| 1038 | if (!task) | 1033 | if (!task) |
| 1039 | return -ESRCH; | 1034 | return -ESRCH; |
| 1040 | task_lock(task); | 1035 | if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { |
| 1041 | if (!task->mm) { | ||
| 1042 | task_unlock(task); | ||
| 1043 | put_task_struct(task); | ||
| 1044 | return -EINVAL; | ||
| 1045 | } | ||
| 1046 | if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { | ||
| 1047 | task_unlock(task); | ||
| 1048 | put_task_struct(task); | 1036 | put_task_struct(task); |
| 1049 | return -EACCES; | 1037 | return -EACCES; |
| 1050 | } | 1038 | } |
| 1051 | task->mm->oom_adj = oom_adjust; | 1039 | task->oomkilladj = oom_adjust; |
| 1052 | task_unlock(task); | ||
| 1053 | put_task_struct(task); | 1040 | put_task_struct(task); |
| 1054 | if (end - buffer == 0) | 1041 | if (end - buffer == 0) |
| 1055 | return -EIO; | 1042 | return -EIO; |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7acc8439d9b..0042090a4d7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
| @@ -240,8 +240,6 @@ struct mm_struct { | |||
| 240 | 240 | ||
| 241 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ | 241 | unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ |
| 242 | 242 | ||
| 243 | s8 oom_adj; /* OOM kill score adjustment (bit shift) */ | ||
| 244 | |||
| 245 | cpumask_t cpu_vm_mask; | 243 | cpumask_t cpu_vm_mask; |
| 246 | 244 | ||
| 247 | /* Architecture-specific MM context */ | 245 | /* Architecture-specific MM context */ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab08e4bb6b..0f1ea4a6695 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -1198,6 +1198,7 @@ struct task_struct { | |||
| 1198 | * a short time | 1198 | * a short time |
| 1199 | */ | 1199 | */ |
| 1200 | unsigned char fpu_counter; | 1200 | unsigned char fpu_counter; |
| 1201 | s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ | ||
| 1201 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 1202 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
| 1202 | unsigned int btrace_seq; | 1203 | unsigned int btrace_seq; |
| 1203 | #endif | 1204 | #endif |
diff --git a/kernel/fork.c b/kernel/fork.c index 021e1138556..144326b7af5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
| @@ -426,7 +426,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) | |||
| 426 | init_rwsem(&mm->mmap_sem); | 426 | init_rwsem(&mm->mmap_sem); |
| 427 | INIT_LIST_HEAD(&mm->mmlist); | 427 | INIT_LIST_HEAD(&mm->mmlist); |
| 428 | mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; | 428 | mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; |
| 429 | mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; | ||
| 430 | mm->core_state = NULL; | 429 | mm->core_state = NULL; |
| 431 | mm->nr_ptes = 0; | 430 | mm->nr_ptes = 0; |
| 432 | set_mm_counter(mm, file_rss, 0); | 431 | set_mm_counter(mm, file_rss, 0); |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a78a9..a7b2460e922 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 58 | unsigned long points, cpu_time, run_time; | 58 | unsigned long points, cpu_time, run_time; |
| 59 | struct mm_struct *mm; | 59 | struct mm_struct *mm; |
| 60 | struct task_struct *child; | 60 | struct task_struct *child; |
| 61 | int oom_adj; | ||
| 62 | 61 | ||
| 63 | task_lock(p); | 62 | task_lock(p); |
| 64 | mm = p->mm; | 63 | mm = p->mm; |
| @@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 66 | task_unlock(p); | 65 | task_unlock(p); |
| 67 | return 0; | 66 | return 0; |
| 68 | } | 67 | } |
| 69 | oom_adj = mm->oom_adj; | ||
| 70 | if (oom_adj == OOM_DISABLE) { | ||
| 71 | task_unlock(p); | ||
| 72 | return 0; | ||
| 73 | } | ||
| 74 | 68 | ||
| 75 | /* | 69 | /* |
| 76 | * The memory size of the process is the basis for the badness. | 70 | * The memory size of the process is the basis for the badness. |
| @@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) | |||
| 154 | points /= 8; | 148 | points /= 8; |
| 155 | 149 | ||
| 156 | /* | 150 | /* |
| 157 | * Adjust the score by oom_adj. | 151 | * Adjust the score by oomkilladj. |
| 158 | */ | 152 | */ |
| 159 | if (oom_adj) { | 153 | if (p->oomkilladj) { |
| 160 | if (oom_adj > 0) { | 154 | if (p->oomkilladj > 0) { |
| 161 | if (!points) | 155 | if (!points) |
| 162 | points = 1; | 156 | points = 1; |
| 163 | points <<= oom_adj; | 157 | points <<= p->oomkilladj; |
| 164 | } else | 158 | } else |
| 165 | points >>= -(oom_adj); | 159 | points >>= -(p->oomkilladj); |
| 166 | } | 160 | } |
| 167 | 161 | ||
| 168 | #ifdef DEBUG | 162 | #ifdef DEBUG |
| @@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, | |||
| 257 | *ppoints = ULONG_MAX; | 251 | *ppoints = ULONG_MAX; |
| 258 | } | 252 | } |
| 259 | 253 | ||
| 254 | if (p->oomkilladj == OOM_DISABLE) | ||
| 255 | continue; | ||
| 256 | |||
| 260 | points = badness(p, uptime.tv_sec); | 257 | points = badness(p, uptime.tv_sec); |
| 261 | if (points > *ppoints) { | 258 | if (points > *ppoints || !chosen) { |
| 262 | chosen = p; | 259 | chosen = p; |
| 263 | *ppoints = points; | 260 | *ppoints = points; |
| 264 | } | 261 | } |
| @@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem) | |||
| 307 | } | 304 | } |
| 308 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", | 305 | printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", |
| 309 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, | 306 | p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, |
| 310 | get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); | 307 | get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, |
| 308 | p->comm); | ||
| 311 | task_unlock(p); | 309 | task_unlock(p); |
| 312 | } while_each_thread(g, p); | 310 | } while_each_thread(g, p); |
| 313 | } | 311 | } |
| @@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose) | |||
| 325 | return; | 323 | return; |
| 326 | } | 324 | } |
| 327 | 325 | ||
| 328 | if (!p->mm) | 326 | if (!p->mm) { |
| 327 | WARN_ON(1); | ||
| 328 | printk(KERN_WARNING "tried to kill an mm-less task!\n"); | ||
| 329 | return; | 329 | return; |
| 330 | } | ||
| 330 | 331 | ||
| 331 | if (verbose) | 332 | if (verbose) |
| 332 | printk(KERN_ERR "Killed process %d (%s)\n", | 333 | printk(KERN_ERR "Killed process %d (%s)\n", |
| @@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p) | |||
| 348 | struct mm_struct *mm; | 349 | struct mm_struct *mm; |
| 349 | struct task_struct *g, *q; | 350 | struct task_struct *g, *q; |
| 350 | 351 | ||
| 351 | task_lock(p); | ||
| 352 | mm = p->mm; | 352 | mm = p->mm; |
| 353 | if (!mm || mm->oom_adj == OOM_DISABLE) { | 353 | |
| 354 | task_unlock(p); | 354 | /* WARNING: mm may not be dereferenced since we did not obtain its |
| 355 | * value from get_task_mm(p). This is OK since all we need to do is | ||
| 356 | * compare mm to q->mm below. | ||
| 357 | * | ||
| 358 | * Furthermore, even if mm contains a non-NULL value, p->mm may | ||
| 359 | * change to NULL at any time since we do not hold task_lock(p). | ||
| 360 | * However, this is of no concern to us. | ||
| 361 | */ | ||
| 362 | |||
| 363 | if (mm == NULL) | ||
| 355 | return 1; | 364 | return 1; |
| 356 | } | 365 | |
| 357 | task_unlock(p); | 366 | /* |
| 367 | * Don't kill the process if any threads are set to OOM_DISABLE | ||
| 368 | */ | ||
| 369 | do_each_thread(g, q) { | ||
| 370 | if (q->mm == mm && q->oomkilladj == OOM_DISABLE) | ||
| 371 | return 1; | ||
| 372 | } while_each_thread(g, q); | ||
| 373 | |||
| 358 | __oom_kill_task(p, 1); | 374 | __oom_kill_task(p, 1); |
| 359 | 375 | ||
| 360 | /* | 376 | /* |
| @@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 377 | struct task_struct *c; | 393 | struct task_struct *c; |
| 378 | 394 | ||
| 379 | if (printk_ratelimit()) { | 395 | if (printk_ratelimit()) { |
| 380 | task_lock(current); | ||
| 381 | printk(KERN_WARNING "%s invoked oom-killer: " | 396 | printk(KERN_WARNING "%s invoked oom-killer: " |
| 382 | "gfp_mask=0x%x, order=%d, oom_adj=%d\n", | 397 | "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", |
| 383 | current->comm, gfp_mask, order, | 398 | current->comm, gfp_mask, order, current->oomkilladj); |
| 384 | current->mm ? current->mm->oom_adj : OOM_DISABLE); | 399 | task_lock(current); |
| 385 | cpuset_print_task_mems_allowed(current); | 400 | cpuset_print_task_mems_allowed(current); |
| 386 | task_unlock(current); | 401 | task_unlock(current); |
| 387 | dump_stack(); | 402 | dump_stack(); |
| @@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, | |||
| 394 | /* | 409 | /* |
| 395 | * If the task is already exiting, don't alarm the sysadmin or kill | 410 | * If the task is already exiting, don't alarm the sysadmin or kill |
| 396 | * its children or threads, just set TIF_MEMDIE so it can die quickly | 411 | * its children or threads, just set TIF_MEMDIE so it can die quickly |
| 397 | * if its mm is still attached. | ||
| 398 | */ | 412 | */ |
| 399 | if (p->mm && (p->flags & PF_EXITING)) { | 413 | if (p->flags & PF_EXITING) { |
| 400 | __oom_kill_task(p, 0); | 414 | __oom_kill_task(p, 0); |
| 401 | return 0; | 415 | return 0; |
| 402 | } | 416 | } |
