aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorKOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>2009-08-18 17:11:10 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2009-08-18 19:31:13 -0400
commit0753ba01e126020bf0f8150934903b48935b697d (patch)
treefbfd7e2d0abbe724a8c5e0e17fb9af522ed2e097 /mm
parent89a4eb4b66e8f4d395e14a14d262dac4d6ca52f0 (diff)
mm: revert "oom: move oom_adj value"
The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to the mm_struct. It was a very good first step for sanitize OOM. However Paul Menage reported the commit makes regression to his job scheduler. Current OOM logic can kill OOM_DISABLED process. Why? His program has the code of similar to the following. ... set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */ ... if (vfork() == 0) { set_oom_adj(0); /* Invoked child can be killed */ execve("foo-bar-cmd"); } .... vfork() parent and child are shared the same mm_struct. then above set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also change oom_adj for vfork() parent. Then, vfork() parent (job scheduler) lost OOM immune and it was killed. Actually, fork-setting-exec idiom is very frequently used in userland program. We must not break this assumption. Then, this patch revert commit 2ff05b2b and related commit. Reverted commit list --------------------- - commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct) - commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE) - commit 8123681022 (oom: only oom kill exiting tasks with attached memory) - commit 933b787b57 (mm: copy over oom_adj value at fork time) Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: David Rientjes <rientjes@google.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Rik van Riel <riel@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/oom_kill.c64
1 files changed, 39 insertions, 25 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 175a67a78a99..a7b2460e922b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
58 unsigned long points, cpu_time, run_time; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 int oom_adj;
62 61
63 task_lock(p); 62 task_lock(p);
64 mm = p->mm; 63 mm = p->mm;
@@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
66 task_unlock(p); 65 task_unlock(p);
67 return 0; 66 return 0;
68 } 67 }
69 oom_adj = mm->oom_adj;
70 if (oom_adj == OOM_DISABLE) {
71 task_unlock(p);
72 return 0;
73 }
74 68
75 /* 69 /*
76 * The memory size of the process is the basis for the badness. 70 * The memory size of the process is the basis for the badness.
@@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
154 points /= 8; 148 points /= 8;
155 149
156 /* 150 /*
157 * Adjust the score by oom_adj. 151 * Adjust the score by oomkilladj.
158 */ 152 */
159 if (oom_adj) { 153 if (p->oomkilladj) {
160 if (oom_adj > 0) { 154 if (p->oomkilladj > 0) {
161 if (!points) 155 if (!points)
162 points = 1; 156 points = 1;
163 points <<= oom_adj; 157 points <<= p->oomkilladj;
164 } else 158 } else
165 points >>= -(oom_adj); 159 points >>= -(p->oomkilladj);
166 } 160 }
167 161
168#ifdef DEBUG 162#ifdef DEBUG
@@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
257 *ppoints = ULONG_MAX; 251 *ppoints = ULONG_MAX;
258 } 252 }
259 253
254 if (p->oomkilladj == OOM_DISABLE)
255 continue;
256
260 points = badness(p, uptime.tv_sec); 257 points = badness(p, uptime.tv_sec);
261 if (points > *ppoints) { 258 if (points > *ppoints || !chosen) {
262 chosen = p; 259 chosen = p;
263 *ppoints = points; 260 *ppoints = points;
264 } 261 }
@@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
307 } 304 }
308 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 305 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
309 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, 306 p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
310 get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); 307 get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj,
308 p->comm);
311 task_unlock(p); 309 task_unlock(p);
312 } while_each_thread(g, p); 310 } while_each_thread(g, p);
313} 311}
@@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
325 return; 323 return;
326 } 324 }
327 325
328 if (!p->mm) 326 if (!p->mm) {
327 WARN_ON(1);
328 printk(KERN_WARNING "tried to kill an mm-less task!\n");
329 return; 329 return;
330 }
330 331
331 if (verbose) 332 if (verbose)
332 printk(KERN_ERR "Killed process %d (%s)\n", 333 printk(KERN_ERR "Killed process %d (%s)\n",
@@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p)
348 struct mm_struct *mm; 349 struct mm_struct *mm;
349 struct task_struct *g, *q; 350 struct task_struct *g, *q;
350 351
351 task_lock(p);
352 mm = p->mm; 352 mm = p->mm;
353 if (!mm || mm->oom_adj == OOM_DISABLE) { 353
354 task_unlock(p); 354 /* WARNING: mm may not be dereferenced since we did not obtain its
355 * value from get_task_mm(p). This is OK since all we need to do is
356 * compare mm to q->mm below.
357 *
358 * Furthermore, even if mm contains a non-NULL value, p->mm may
359 * change to NULL at any time since we do not hold task_lock(p).
360 * However, this is of no concern to us.
361 */
362
363 if (mm == NULL)
355 return 1; 364 return 1;
356 } 365
357 task_unlock(p); 366 /*
367 * Don't kill the process if any threads are set to OOM_DISABLE
368 */
369 do_each_thread(g, q) {
370 if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
371 return 1;
372 } while_each_thread(g, q);
373
358 __oom_kill_task(p, 1); 374 __oom_kill_task(p, 1);
359 375
360 /* 376 /*
@@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
377 struct task_struct *c; 393 struct task_struct *c;
378 394
379 if (printk_ratelimit()) { 395 if (printk_ratelimit()) {
380 task_lock(current);
381 printk(KERN_WARNING "%s invoked oom-killer: " 396 printk(KERN_WARNING "%s invoked oom-killer: "
382 "gfp_mask=0x%x, order=%d, oom_adj=%d\n", 397 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
383 current->comm, gfp_mask, order, 398 current->comm, gfp_mask, order, current->oomkilladj);
384 current->mm ? current->mm->oom_adj : OOM_DISABLE); 399 task_lock(current);
385 cpuset_print_task_mems_allowed(current); 400 cpuset_print_task_mems_allowed(current);
386 task_unlock(current); 401 task_unlock(current);
387 dump_stack(); 402 dump_stack();
@@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
394 /* 409 /*
395 * If the task is already exiting, don't alarm the sysadmin or kill 410 * If the task is already exiting, don't alarm the sysadmin or kill
396 * its children or threads, just set TIF_MEMDIE so it can die quickly 411 * its children or threads, just set TIF_MEMDIE so it can die quickly
397 * if its mm is still attached.
398 */ 412 */
399 if (p->mm && (p->flags & PF_EXITING)) { 413 if (p->flags & PF_EXITING) {
400 __oom_kill_task(p, 0); 414 __oom_kill_task(p, 0);
401 return 0; 415 return 0;
402 } 416 }