[PATCH] cpusets: formalize intermediate GFP_KERNEL containment

This patch makes use of the previously underutilized cpuset flag 'mem_exclusive' to provide what amounts to another layer of memory placement resolution. With this patch, there are now the following four layers of memory placement available: 1) The whole system (interrupt and GFP_ATOMIC allocations can use this), 2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use this), 3) The current task's cpuset (GFP_USER allocations constrained to here), and 4) Specific node placement, using mbind and set_mempolicy. These nest - each layer is a subset (same or within) of the previous. Layer (2) above is new, with this patch. The call used to check whether a zone (its node, actually) is in a cpuset (in its mems_allowed, actually) is extended to take a gfp_mask argument, and its logic is extended, in the case that __GFP_HARDWALL is not set in the flag bits, to look up the cpuset hierarchy for the nearest enclosing mem_exclusive cpuset, to determine if placement is allowed. The definition of GFP_USER, which used to be identical to GFP_KERNEL, is changed to also set the __GFP_HARDWALL bit, in the previous cpuset_gfp_hardwall_flag patch. GFP_ATOMIC and GFP_KERNEL allocations will stay within the current task's cpuset, so long as any node therein is not too tight on memory, but will escape to the larger layer, if need be. The intended use is to allow something like a batch manager to handle several jobs, each job in its own cpuset, but using common kernel memory for caches and such. Swapper and oom_kill activity is also constrained to Layer (2). A task in or below one mem_exclusive cpuset should not cause swapping on nodes in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a task in another such cpuset. Heavy use of kernel memory for i/o caching and such by one job should not impact the memory available to jobs in other non-overlapping mem_exclusive cpusets.
This patch enables providing hardwall, inescapable cpusets for memory allocations of each job, while sharing kernel memory allocations between several jobs, in an enclosing mem_exclusive cpuset. Like Dinakar's patch earlier to enable administering sched domains using the cpu_exclusive flag, this patch also provides a useful meaning to a cpuset flag that had previously done nothing much useful other than restrict what cpuset configurations were allowed.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: Paul Jackson <pj@sgi.com> 2005-09-06 18:18:12 -0400
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-09-07 19:57:40 -0400
commit: 9bf2229f8817677127a60c177aefce1badd22d7b (patch)
tree: 06e95863a26b197233081db1dafd869dfd231950 /mm/vmscan.c
parent: f90b1d2f1aaaa40c6519a32e69615edc25bb97d5 (diff)
1 files changed, 4 insertions, 4 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0095533cdde9..a740778f688d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -894,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
                if (zone->present_pages == 0)
                        continue;
-                if (!cpuset_zone_allowed(zone))
+                if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
                        continue;
                zone->temp_priority = sc->priority;
@@ -940,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
        for (i = 0; zones[i] != NULL; i++) {
                struct zone *zone = zones[i];
-                if (!cpuset_zone_allowed(zone))
+                if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
                        continue;
                zone->temp_priority = DEF_PRIORITY;
@@ -986,7 +986,7 @@ out:
        for (i = 0; zones[i] != 0; i++) {
                struct zone *zone = zones[i];
-                if (!cpuset_zone_allowed(zone))
+                if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
                        continue;
                zone->prev_priority = zone->temp_priority;
@@ -1256,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order)
                return;
        if (pgdat->kswapd_max_order < order)
                pgdat->kswapd_max_order = order;
-        if (!cpuset_zone_allowed(zone))
+        if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
                return;
        if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
                return;
author	Paul Jackson <pj@sgi.com>	2005-09-06 18:18:12 -0400
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-09-07 19:57:40 -0400
commit	9bf2229f8817677127a60c177aefce1badd22d7b (patch)
tree	06e95863a26b197233081db1dafd869dfd231950 /mm/vmscan.c
parent	f90b1d2f1aaaa40c6519a32e69615edc25bb97d5 (diff)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0095533cdde9..a740778f688d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -894,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
894	if (zone->present_pages == 0)	894	if (zone->present_pages == 0)
895	continue;	895	continue;
896		896
897	if (!cpuset_zone_allowed(zone))	897	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
898	continue;	898	continue;
899		899
900	zone->temp_priority = sc->priority;	900	zone->temp_priority = sc->priority;
@@ -940,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
940	for (i = 0; zones[i] != NULL; i++) {	940	for (i = 0; zones[i] != NULL; i++) {
941	struct zone *zone = zones[i];	941	struct zone *zone = zones[i];
942		942
943	if (!cpuset_zone_allowed(zone))	943	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
944	continue;	944	continue;
945		945
946	zone->temp_priority = DEF_PRIORITY;	946	zone->temp_priority = DEF_PRIORITY;
@@ -986,7 +986,7 @@ out:
986	for (i = 0; zones[i] != 0; i++) {	986	for (i = 0; zones[i] != 0; i++) {
987	struct zone *zone = zones[i];	987	struct zone *zone = zones[i];
988		988
989	if (!cpuset_zone_allowed(zone))	989	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
990	continue;	990	continue;
991		991
992	zone->prev_priority = zone->temp_priority;	992	zone->prev_priority = zone->temp_priority;
@@ -1256,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order)
1256	return;	1256	return;
1257	if (pgdat->kswapd_max_order < order)	1257	if (pgdat->kswapd_max_order < order)
1258	pgdat->kswapd_max_order = order;	1258	pgdat->kswapd_max_order = order;
1259	if (!cpuset_zone_allowed(zone))	1259	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
1260	return;	1260	return;
1261	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))	1261	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
1262	return;	1262	return;