author	Paul Jackson <pj@sgi.com>	2006-12-13 03:34:25 -0500
committer	Linus Torvalds <torvalds@woody.osdl.org>	2006-12-13 12:05:49 -0500
commit	02a0e53d8227aff5e62e0433f82c12c1c2805fd6
tree	fe32435308e5f1afe8bd12357bd8c5ff3b4133c7
parent	55935a34a428a1497e3b37982e2782c09c6f914d
[PATCH] cpuset: rework cpuset_zone_allowed api
Elaborate the API for calling cpuset_zone_allowed(), so that users have to explicitly choose between the two variants:

    cpuset_zone_allowed_hardwall()
    cpuset_zone_allowed_softwall()

Until now, whether or not you got the hardwall flavor depended solely on whether or not you or'd in the __GFP_HARDWALL gfp flag to the gfp_mask argument.  If you didn't specify __GFP_HARDWALL, you implicitly got the softwall version.

Unfortunately, this meant that users would end up with the softwall version without thinking about it.  Since only the softwall version might sleep, this led to bugs with possible sleeping in interrupt context on more than one occasion.

The hardwall version requires that the current task's mems_allowed allows the node of the specified zone (or that you're in interrupt, or that __GFP_THISNODE is set, or that you're on a one-cpuset system).

The softwall version, depending on the gfp_mask, might allow a node if it was allowed in the nearest enclosing cpuset marked mem_exclusive (which requires taking the cpuset lock 'callback_mutex' to evaluate).

This patch removes the cpuset_zone_allowed() call, and forces the caller to explicitly choose between the hardwall and the softwall case.  If the caller wants the gfp_mask to determine this choice, they should (1) be sure they can sleep or that __GFP_HARDWALL is set, and (2) invoke the cpuset_zone_allowed_softwall() routine.

This adds another 100 or 200 bytes to the kernel text space, due to the few lines of nearly duplicate code at the top of both cpuset_zone_allowed_* routines.  It should save a few instructions executed for the calls that turned into calls of cpuset_zone_allowed_hardwall, thanks to not having to set (before the call) then check (within the call) the __GFP_HARDWALL flag.

For the most critical call, from get_page_from_freelist(), the same instructions are executed as before -- the old cpuset_zone_allowed() routine it used to call is the same code as the cpuset_zone_allowed_softwall() routine that it calls now.

Not a perfect win, but it seems worth it, to reduce the chance of hitting a sleeping-with-irqs-off complaint again.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
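To make the caller-side rule concrete, here is a hedged sketch of two hypothetical call sites (the example_* functions are invented for illustration and are not part of this patch); it only assumes the two variants declared in include/linux/cpuset.h below:

/*
 * Illustrative only: hypothetical callers of the reworked API,
 * not code from this patch.
 */
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/mmzone.h>

/*
 * Context that must not sleep: use the hardwall variant, which only
 * checks interrupt/__GFP_THISNODE/current->mems_allowed and never
 * takes callback_mutex (same pattern as the mm/vmscan.c conversions).
 */
static int example_zone_ok_nosleep(struct zone *z)
{
	return cpuset_zone_allowed_hardwall(z, GFP_KERNEL);
}

/*
 * Sleepable allocation path: use the softwall variant and let gfp_mask
 * decide.  With __GFP_HARDWALL set it reduces to the hardwall check;
 * otherwise it may take callback_mutex and scan up to the nearest
 * mem_exclusive ancestor cpuset.
 */
static int example_zone_ok_sleepable(struct zone *z, gfp_t gfp_mask)
{
	return cpuset_zone_allowed_softwall(z, gfp_mask);
}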
-rw-r--r--	include/linux/cpuset.h	22
-rw-r--r--	kernel/cpuset.c	82
-rw-r--r--	mm/hugetlb.c	2
-rw-r--r--	mm/oom_kill.c	2
-rw-r--r--	mm/page_alloc.c	2
-rw-r--r--	mm/slab.c	2
-rw-r--r--	mm/vmscan.c	8
7 files changed, 92 insertions(+), 28 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 8821e1f75b44..826b15e914e2 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -30,10 +30,19 @@ void cpuset_update_task_memory_state(void);
 		nodes_subset((nodes), current->mems_allowed)
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
 
-extern int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
-static int inline cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
+extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
+
+static int inline cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+{
+	return number_of_cpusets <= 1 ||
+		__cpuset_zone_allowed_softwall(z, gfp_mask);
+}
+
+static int inline cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
 {
-	return number_of_cpusets <= 1 || __cpuset_zone_allowed(z, gfp_mask);
+	return number_of_cpusets <= 1 ||
+		__cpuset_zone_allowed_hardwall(z, gfp_mask);
 }
 
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
@@ -94,7 +103,12 @@ static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
 	return 1;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+{
+	return 1;
+}
+
+static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
 {
 	return 1;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2c3b4431472b..232aed2b10f9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2342,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
 }
 
 /**
- * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
+ * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
  * @z: is this zone on an allowed node?
- * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
+ * @gfp_mask: memory allocation flags
  *
- * If we're in interrupt, yes, we can always allocate.  If zone
+ * If we're in interrupt, yes, we can always allocate.  If
+ * __GFP_THISNODE is set, yes, we can always allocate.  If zone
  * z's node is in our tasks mems_allowed, yes.  If it's not a
  * __GFP_HARDWALL request and this zone's nodes is in the nearest
  * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
  * Otherwise, no.
  *
+ * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
+ * reduces to cpuset_zone_allowed_hardwall().  Otherwise,
+ * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
+ * from an enclosing cpuset.
+ *
+ * cpuset_zone_allowed_hardwall() only handles the simpler case of
+ * hardwall cpusets, and never sleeps.
+ *
+ * The __GFP_THISNODE placement logic is really handled elsewhere,
+ * by forcibly using a zonelist starting at a specified node, and by
+ * (in get_page_from_freelist()) refusing to consider the zones for
+ * any node on the zonelist except the first.  By the time any such
+ * calls get to this routine, we should just shut up and say 'yes'.
+ *
  * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
  * and do not allow allocations outside the current tasks cpuset.
  * GFP_KERNEL allocations are not so marked, so can escape to the
- * nearest mem_exclusive ancestor cpuset.
+ * nearest enclosing mem_exclusive ancestor cpuset.
  *
- * Scanning up parent cpusets requires callback_mutex.  The __alloc_pages()
- * routine only calls here with __GFP_HARDWALL bit _not_ set if
- * it's a GFP_KERNEL allocation, and all nodes in the current tasks
- * mems_allowed came up empty on the first pass over the zonelist.
- * So only GFP_KERNEL allocations, if all nodes in the cpuset are
- * short of memory, might require taking the callback_mutex mutex.
+ * Scanning up parent cpusets requires callback_mutex.  The
+ * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
+ * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
+ * current tasks mems_allowed came up empty on the first pass over
+ * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
+ * cpuset are short of memory, might require taking the callback_mutex
+ * mutex.
  *
  * The first call here from mm/page_alloc:get_page_from_freelist()
- * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
- * no allocation on a node outside the cpuset is allowed (unless in
- * interrupt, of course).
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
+ * so no allocation on a node outside the cpuset is allowed (unless
+ * in interrupt, of course).
 *
  * The second pass through get_page_from_freelist() doesn't even call
  * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
@@ -2380,12 +2396,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * GFP_USER     - only nodes in current tasks mems allowed ok.
  *
  * Rule:
- *    Don't call cpuset_zone_allowed() if you can't sleep, unless you
+ *    Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
  *      pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
  *      the code that might scan up ancestor cpusets and sleep.
- **/
+ */
 
-int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 {
 	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
@@ -2415,6 +2431,40 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 	return allowed;
 }
 
+/*
+ * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags
+ *
+ * If we're in interrupt, yes, we can always allocate.
+ * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
+ * z's node is in our tasks mems_allowed, yes.  Otherwise, no.
+ *
+ * The __GFP_THISNODE placement logic is really handled elsewhere,
+ * by forcibly using a zonelist starting at a specified node, and by
+ * (in get_page_from_freelist()) refusing to consider the zones for
+ * any node on the zonelist except the first.  By the time any such
+ * calls get to this routine, we should just shut up and say 'yes'.
+ *
+ * Unlike the cpuset_zone_allowed_softwall() variant, above,
+ * this variant requires that the zone be in the current tasks
+ * mems_allowed or that we're in interrupt.  It does not scan up the
+ * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
+ * It never sleeps.
+ */
+
+int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+{
+	int node;			/* node that zone z is on */
+
+	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+		return 1;
+	node = zone_to_nid(z);
+	if (node_isset(node, current->mems_allowed))
+		return 1;
+	return 0;
+}
+
 /**
  * cpuset_lock - lock out any changes to cpuset structures
  *
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0ccc7f230252..089092d152ab 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 
 	for (z = zonelist->zones; *z; z++) {
 		nid = zone_to_nid(*z);
-		if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+		if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
 		    !list_empty(&hugepage_freelists[nid]))
 			break;
 	}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 223d9ccb7d64..64cf3c214634 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -177,7 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
 	nodemask_t nodes = node_online_map;
 
 	for (z = zonelist->zones; *z; z++)
-		if (cpuset_zone_allowed(*z, gfp_mask))
+		if (cpuset_zone_allowed_softwall(*z, gfp_mask))
 			node_clear(zone_to_nid(*z), nodes);
 		else
 			return CONSTRAINT_CPUSET;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e6b17b2989e0..8c1a116875bc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1162,7 +1162,7 @@ zonelist_scan:
 				zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
 				break;
 		if ((alloc_flags & ALLOC_CPUSET) &&
-			!cpuset_zone_allowed(zone, gfp_mask))
+			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				goto try_next_zone;
 
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
diff --git a/mm/slab.c b/mm/slab.c
index 9d3550086c93..b856786a3a30 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3262,7 +3262,7 @@ retry:
 	for (z = zonelist->zones; *z && !obj; z++) {
 		nid = zone_to_nid(*z);
 
-		if (cpuset_zone_allowed(*z, flags | __GFP_HARDWALL) &&
+		if (cpuset_zone_allowed_hardwall(*z, flags) &&
 			cache->nodelists[nid] &&
 			cache->nodelists[nid]->free_objects)
 				obj = ____cache_alloc_node(cache,
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 093f5fe6dd77..e9813b06c7a3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -984,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
 		if (!populated_zone(zone))
 			continue;
 
-		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
 
 		note_zone_scanning_priority(zone, priority);
@@ -1034,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
 
 		lru_pages += zone->nr_active + zone->nr_inactive;
@@ -1089,7 +1089,7 @@ out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
 
 		zone->prev_priority = priority;
@@ -1354,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 		return;
 	if (!waitqueue_active(&pgdat->kswapd_wait))
 		return;