author    David Rientjes <rientjes@google.com>  2010-08-09 20:18:52 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2010-08-09 23:44:56 -0400
commit    6f48d0ebd907ae419387f27b602ee98870cfa7bb (patch)
tree      355bd8b616f5a78d8adabe5b9631d7aad970dbaa /mm
parent    5e9d834a0e0c0485dfa487281ab9650fc37a3bb5 (diff)
oom: select task from tasklist for mempolicy ooms
The oom killer presently kills current whenever there is no more memory
free or reclaimable on its mempolicy's nodes.  There is no guarantee that
current is a memory-hogging task or that killing it will free any
substantial amount of memory, however.

In such situations, it is better to scan the tasklist for tasks that are
allowed to allocate on current's set of nodes and kill the one with the
highest badness() score.  This ensures that the most memory-hogging task,
or the one configured by the user with /proc/pid/oom_adj, is always
selected in such scenarios.

Signed-off-by: David Rientjes <rientjes@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
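At bottom, the patch replaces "always kill current" on mempolicy-constrained
ooms with the same scan-and-score pass used for other constraints.  Below is
a minimal sketch of that selection loop, with tasklist locking and the usual
exclusions (kernel threads, already-exiting tasks, OOM_DISABLE) stripped out;
the function name pick_victim and its uptime argument are illustrative only,
and the real logic lives in select_bad_process() in the diff below:

/* Editorial sketch, not part of the patch. */
static struct task_struct *pick_victim(const nodemask_t *mask,
                                       unsigned long uptime)
{
        struct task_struct *p;
        struct task_struct *chosen = NULL;
        unsigned long chosen_points = 0;

        for_each_process(p) {
                unsigned long points;

                /* Skip tasks whose mempolicy cannot touch the depleted nodes. */
                if (!mempolicy_nodemask_intersects(p, mask))
                        continue;
                /* badness() already accounts for /proc/pid/oom_adj. */
                points = badness(p, uptime);
                if (points > chosen_points) {
                        chosen = p;
                        chosen_points = points;
                }
        }
        return chosen;
}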
Diffstat (limited to 'mm')
-rw-r--r--  mm/mempolicy.c    44
-rw-r--r--  mm/oom_kill.c    104
2 files changed, 112 insertions(+), 36 deletions(-)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 5bc0a96beb51..8a73708d59bb 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1712,6 +1712,50 @@ bool init_nodemask_of_mempolicy(nodemask_t *mask)
 }
 #endif
 
+/*
+ * mempolicy_nodemask_intersects
+ *
+ * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default
+ * policy.  Otherwise, check for intersection between mask and the policy
+ * nodemask for 'bind' or 'interleave' policy.  For 'preferred' or 'local'
+ * policy, always return true since it may allocate elsewhere on fallback.
+ *
+ * Takes task_lock(tsk) to prevent freeing of its mempolicy.
+ */
+bool mempolicy_nodemask_intersects(struct task_struct *tsk,
+                                   const nodemask_t *mask)
+{
+        struct mempolicy *mempolicy;
+        bool ret = true;
+
+        if (!mask)
+                return ret;
+        task_lock(tsk);
+        mempolicy = tsk->mempolicy;
+        if (!mempolicy)
+                goto out;
+
+        switch (mempolicy->mode) {
+        case MPOL_PREFERRED:
+                /*
+                 * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to
+                 * allocate from; they may fall back to other nodes when oom.
+                 * Thus, it's possible for tsk to have allocated memory from
+                 * nodes in mask.
+                 */
+                break;
+        case MPOL_BIND:
+        case MPOL_INTERLEAVE:
+                ret = nodes_intersects(mempolicy->v.nodes, *mask);
+                break;
+        default:
+                BUG();
+        }
+out:
+        task_unlock(tsk);
+        return ret;
+}
+
 /* Allocate a page in interleaved policy.
    Own path because it needs to do special accounting. */
 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
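
Worth restating before the caller below: the helper can rule a task out only
when its policy actually restricts placement.  MPOL_BIND and MPOL_INTERLEAVE
tasks are skipped when their policy nodemask misses the depleted nodes, while
tasks with MPOL_PREFERRED, MPOL_F_LOCAL, or no mempolicy always count as
intersecting, since they may have fallen back to any node.  A minimal caller
sketch (the wrapper name task_may_hold_oom_memory is hypothetical; the patch
itself calls the helper directly from has_intersects_mems_allowed() in
mm/oom_kill.c below):

/* Hypothetical wrapper, not part of the patch. */
static bool task_may_hold_oom_memory(struct task_struct *p,
                                     const nodemask_t *depleted)
{
        /* true unless a bind/interleave policy excludes the depleted nodes */
        return mempolicy_nodemask_intersects(p, depleted);
}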
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 7c8488f6a3f5..13ceed78bc45 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -27,6 +27,7 @@
 #include <linux/module.h>
 #include <linux/notifier.h>
 #include <linux/memcontrol.h>
+#include <linux/mempolicy.h>
 #include <linux/security.h>
 
 int sysctl_panic_on_oom;
@@ -35,23 +36,57 @@ int sysctl_oom_dump_tasks;
 static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
-/*
- * Is all threads of the target process nodes overlap ours?
+#ifdef CONFIG_NUMA
+/**
+ * has_intersects_mems_allowed() - check task eligibility for kill
+ * @tsk: task struct of which task to consider
+ * @mask: nodemask passed to page allocator for mempolicy ooms
+ *
+ * Task eligibility is determined by whether or not a candidate task, @tsk,
+ * shares the same mempolicy nodes as current if it is bound by such a policy
+ * and whether or not it has the same set of allowed cpuset nodes.
  */
-static int has_intersects_mems_allowed(struct task_struct *tsk)
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+                                        const nodemask_t *mask)
 {
-        struct task_struct *t;
+        struct task_struct *start = tsk;
 
-        t = tsk;
         do {
-                if (cpuset_mems_allowed_intersects(current, t))
-                        return 1;
-                t = next_thread(t);
-        } while (t != tsk);
-
-        return 0;
+                if (mask) {
+                        /*
+                         * If this is a mempolicy constrained oom, tsk's
+                         * cpuset is irrelevant.  Only return true if its
+                         * mempolicy intersects current, otherwise it may be
+                         * needlessly killed.
+                         */
+                        if (mempolicy_nodemask_intersects(tsk, mask))
+                                return true;
+                } else {
+                        /*
+                         * This is not a mempolicy constrained oom, so only
+                         * check the mems of tsk's cpuset.
+                         */
+                        if (cpuset_mems_allowed_intersects(current, tsk))
+                                return true;
+                }
+                tsk = next_thread(tsk);
+        } while (tsk != start);
+        return false;
+}
+#else
+static bool has_intersects_mems_allowed(struct task_struct *tsk,
+                                        const nodemask_t *mask)
+{
+        return true;
 }
+#endif /* CONFIG_NUMA */
 
+/*
+ * The process p may have detached its own ->mm while exiting or through
+ * use_mm(), but one or more of its subthreads may still have a valid
+ * pointer.  Return p, or any of its subthreads with a valid ->mm, with
+ * task_lock() held.
+ */
 static struct task_struct *find_lock_task_mm(struct task_struct *p)
 {
         struct task_struct *t = p;
@@ -106,10 +141,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
          * The memory size of the process is the basis for the badness.
          */
         points = p->mm->total_vm;
-
-        /*
-         * After this unlock we can no longer dereference local variable `mm'
-         */
         task_unlock(p);
 
         /*
@@ -253,7 +284,8 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
  * (not docbooked, we don't want this one cluttering up the manual)
  */
 static struct task_struct *select_bad_process(unsigned long *ppoints,
-                struct mem_cgroup *mem)
+                struct mem_cgroup *mem, enum oom_constraint constraint,
+                const nodemask_t *mask)
 {
         struct task_struct *p;
         struct task_struct *chosen = NULL;
@@ -269,7 +301,9 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
                         continue;
                 if (mem && !task_in_mem_cgroup(p, mem))
                         continue;
-                if (!has_intersects_mems_allowed(p))
+                if (!has_intersects_mems_allowed(p,
+                                constraint == CONSTRAINT_MEMORY_POLICY ? mask :
+                                                                         NULL))
                         continue;
 
                 /*
@@ -497,7 +531,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
                 panic("out of memory(memcg). panic_on_oom is selected.\n");
         read_lock(&tasklist_lock);
 retry:
-        p = select_bad_process(&points, mem);
+        p = select_bad_process(&points, mem, CONSTRAINT_NONE, NULL);
         if (!p || PTR_ERR(p) == -1UL)
                 goto out;
 
@@ -576,7 +610,8 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
 /*
  * Must be called with tasklist_lock held for read.
  */
-static void __out_of_memory(gfp_t gfp_mask, int order)
+static void __out_of_memory(gfp_t gfp_mask, int order,
+                        enum oom_constraint constraint, const nodemask_t *mask)
 {
         struct task_struct *p;
         unsigned long points;
@@ -590,7 +625,7 @@ retry:
          * Rambo mode: Shoot down a process and hope it solves whatever
          * issues we may have.
          */
-        p = select_bad_process(&points, NULL);
+        p = select_bad_process(&points, NULL, constraint, mask);
 
         if (PTR_ERR(p) == -1UL)
                 return;
@@ -624,7 +659,8 @@ void pagefault_out_of_memory(void)
                 panic("out of memory from page fault. panic_on_oom is selected.\n");
 
         read_lock(&tasklist_lock);
-        __out_of_memory(0, 0); /* unknown gfp_mask and order */
+        /* unknown gfp_mask and order */
+        __out_of_memory(0, 0, CONSTRAINT_NONE, NULL);
         read_unlock(&tasklist_lock);
 
         /*
@@ -640,6 +676,7 @@ void pagefault_out_of_memory(void)
  * @zonelist: zonelist pointer
  * @gfp_mask: memory allocation flags
  * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
  *
  * If we run out of memory, we have the choice between either
  * killing a random task (bad), letting the system crash (worse)
@@ -678,24 +715,19 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
          */
         constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
         read_lock(&tasklist_lock);
-
-        switch (constraint) {
-        case CONSTRAINT_MEMORY_POLICY:
-                oom_kill_process(current, gfp_mask, order, 0, NULL,
-                                "No available memory (MPOL_BIND)");
-                break;
-
-        case CONSTRAINT_NONE:
-                if (sysctl_panic_on_oom) {
+        if (unlikely(sysctl_panic_on_oom)) {
+                /*
+                 * panic_on_oom only affects CONSTRAINT_NONE, the kernel
+                 * should not panic for cpuset or mempolicy induced memory
+                 * failures.
+                 */
+                if (constraint == CONSTRAINT_NONE) {
                         dump_header(NULL, gfp_mask, order, NULL);
-                        panic("out of memory. panic_on_oom is selected\n");
+                        read_unlock(&tasklist_lock);
+                        panic("Out of memory: panic_on_oom is enabled\n");
                 }
-                /* Fall-through */
-        case CONSTRAINT_CPUSET:
-                __out_of_memory(gfp_mask, order);
-                break;
         }
-
+        __out_of_memory(gfp_mask, order, constraint, nodemask);
         read_unlock(&tasklist_lock);
 
         /*