author		KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2009-12-15 19:45:33 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2009-12-16 10:19:57 -0500
commit		4365a5676fa3aa1d5ae6c90c22a0044f09ba584e (patch)
tree		5b9914ccbdcf2aa695473421e71f6299fbe78cef
parent		3b4798cbc13dd8d1150aa6377f97f0e11450a67d (diff)
oom-kill: fix NUMA constraint check with nodemask
Fix node-oriented allocation handling in oom-kill.c.  I myself think of
this as a bugfix, not as an enhancement.

These days, things have changed:
- alloc_pages() takes a nodemask as its argument, via __alloc_pages_nodemask().
- mempolicy no longer maintains its own private zonelists.
  (And cpuset doesn't use a nodemask for __alloc_pages_nodemask().)

So the current oom-killer's check function is wrong.
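For context only (not part of this patch), the allocator entry point that
carries the nodemask has roughly this 2.6.32-era prototype; mempolicy builds
the mask and passes it down this path, so the oom-killer can now see it:

	/* common entry point; nodemask may be NULL (no restriction) */
	struct page *
	__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
				struct zonelist *zonelist, nodemask_t *nodemask);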
This patch does:
- Check the nodemask: if a nodemask is passed in and it doesn't cover all of
  node_states[N_HIGH_MEMORY], this is CONSTRAINT_MEMORY_POLICY (a small
  sketch follows this list).
- Scan all zones in the zonelist under the nodemask; if the scan hits
  cpuset's wall, the failure came from cpuset.

And:
- Modify the caller of out_of_memory() not to call oom if __GFP_THISNODE is
  set.  This doesn't change current behavior: callers that use __GFP_THISNODE
  should handle "page allocation failure" by themselves.
- Handle the __GFP_NOFAIL + __GFP_THISNODE path.  This is something like a
  FIXME, but this gfp_mask combination is not used now.
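A minimal sketch of the nodemask check (illustrative only, not part of the
patch): assume a hypothetical two-node machine where
node_states[N_HIGH_MEMORY] = {0,1} and the task's mempolicy binds it to
node 0.

	nodemask_t policy_nodes = NODE_MASK_NONE;

	node_set(0, policy_nodes);	/* e.g. MPOL_BIND to node 0 only */

	/*
	 * {0,1} is not a subset of {0}, so the failure was caused by the
	 * mempolicy restriction, not by a system-wide memory shortage.
	 */
	if (!nodes_subset(node_states[N_HIGH_MEMORY], policy_nodes))
		return CONSTRAINT_MEMORY_POLICY;

If the policy mask covers every node that has memory, this check falls
through and the cpuset-wall scan decides instead.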
[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: David Rientjes <rientjes@google.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--	drivers/char/sysrq.c	|  2
-rw-r--r--	include/linux/oom.h	|  4
-rw-r--r--	mm/oom_kill.c		| 46
-rw-r--r--	mm/page_alloc.c		| 22
4 files changed, 53 insertions(+), 21 deletions(-)
diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index 44203ff599da..1ae2de7d8b4f 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -339,7 +339,7 @@ static struct sysrq_key_op sysrq_term_op = {
 
 static void moom_callback(struct work_struct *ignored)
 {
-	out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0);
+	out_of_memory(node_zonelist(0, GFP_KERNEL), GFP_KERNEL, 0, NULL);
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 6aac5fe4f6f1..537662315627 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -10,6 +10,7 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
+#include <linux/nodemask.h>
 
 struct zonelist;
 struct notifier_block;
@@ -26,7 +27,8 @@ enum oom_constraint {
 extern int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
 
-extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order);
+extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *mask);
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 6bb8a7a7ec9a..25c679e0288a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 /*
  * Determine the type of allocation constraint.
  */
-static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-						gfp_t gfp_mask)
-{
 #ifdef CONFIG_NUMA
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+				gfp_t gfp_mask, nodemask_t *nodemask)
+{
 	struct zone *zone;
 	struct zoneref *z;
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	nodemask_t nodes = node_states[N_HIGH_MEMORY];
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		if (cpuset_zone_allowed_softwall(zone, gfp_mask))
-			node_clear(zone_to_nid(zone), nodes);
-		else
-			return CONSTRAINT_CPUSET;
+	/*
+	 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
+	 * to kill current.We have to random task kill in this case.
+	 * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
+	 */
+	if (gfp_mask & __GFP_THISNODE)
+		return CONSTRAINT_NONE;
 
-	if (!nodes_empty(nodes))
+	/*
+	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
+	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
+	 * feature. mempolicy is an only user of nodemask here.
+	 * check mempolicy's nodemask contains all N_HIGH_MEMORY
+	 */
+	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
 		return CONSTRAINT_MEMORY_POLICY;
-#endif
+
+	/* Check this allocation failure is caused by cpuset's wall function */
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+			high_zoneidx, nodemask)
+		if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
+			return CONSTRAINT_CPUSET;
 
 	return CONSTRAINT_NONE;
 }
+#else
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+				gfp_t gfp_mask, nodemask_t *nodemask)
+{
+	return CONSTRAINT_NONE;
+}
+#endif
 
 /*
  * Simple selection loop. We chose the process with the highest
@@ -613,7 +632,8 @@ rest_and_return:
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *nodemask)
 {
 	unsigned long freed = 0;
 	enum oom_constraint constraint;
@@ -632,7 +652,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA) that may require different handling.
 	 */
-	constraint = constrained_alloc(zonelist, gfp_mask);
+	constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
 	read_lock(&tasklist_lock);
 
 	switch (constraint) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 59d2e88fb47c..850c4a7e2fe5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1654,12 +1654,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	if (page)
 		goto out;
 
-	/* The OOM killer will not help higher order allocs */
-	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
-		goto out;
-
+	if (!(gfp_mask & __GFP_NOFAIL)) {
+		/* The OOM killer will not help higher order allocs */
+		if (order > PAGE_ALLOC_COSTLY_ORDER)
+			goto out;
+		/*
+		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
+		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
+		 * The caller should handle page allocation failure by itself if
+		 * it specifies __GFP_THISNODE.
+		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
+		 */
+		if (gfp_mask & __GFP_THISNODE)
+			goto out;
+	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(zonelist, gfp_mask, order);
+	out_of_memory(zonelist, gfp_mask, order, nodemask);
 
 out:
 	clear_zonelist_oom(zonelist, gfp_mask);
@@ -3123,7 +3133,7 @@ static int __cpuinit process_zones(int cpu)
 
 		if (percpu_pagelist_fraction)
 			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			 	(zone->present_pages / percpu_pagelist_fraction));
+				(zone->present_pages / percpu_pagelist_fraction));
 	}
 
 	return 0;