path: root/mm
author    Paul Jackson <pj@sgi.com>    2005-09-06 18:18:12 -0400
committer Linus Torvalds <torvalds@g5.osdl.org>    2005-09-07 19:57:40 -0400
commit    9bf2229f8817677127a60c177aefce1badd22d7b (patch)
tree      06e95863a26b197233081db1dafd869dfd231950 /mm
parent    f90b1d2f1aaaa40c6519a32e69615edc25bb97d5 (diff)
[PATCH] cpusets: formalize intermediate GFP_KERNEL containment
This patch makes use of the previously underutilized cpuset flag
'mem_exclusive' to provide what amounts to another layer of memory placement
resolution.  With this patch, there are now the following four layers of
memory placement available:

 1) The whole system (interrupt and GFP_ATOMIC allocations can use this),
 2) The nearest enclosing mem_exclusive cpuset (GFP_KERNEL allocations can use),
 3) The current task's cpuset (GFP_USER allocations are constrained to here), and
 4) Specific node placement, using mbind and set_mempolicy.

These nest - each layer is a subset (same or within) of the previous one.
Layer (2) above is new with this patch.

The call used to check whether a zone (its node, actually) is in a cpuset (in
its mems_allowed, actually) is extended to take a gfp_mask argument, and its
logic is extended, in the case that __GFP_HARDWALL is not set in the flag
bits, to look up the cpuset hierarchy for the nearest enclosing mem_exclusive
cpuset, to determine if placement is allowed.  The definition of GFP_USER,
which used to be identical to GFP_KERNEL, was changed to also set the
__GFP_HARDWALL bit, in the previous cpuset_gfp_hardwall_flag patch.

GFP_ATOMIC and GFP_KERNEL allocations will stay within the current task's
cpuset, so long as any node therein is not too tight on memory, but will
escape to the larger layer if need be.

The intended use is to allow something like a batch manager to handle several
jobs, each job in its own cpuset, but using common kernel memory for caches
and such.  Swapper and oom_kill activity is also constrained to layer (2).  A
task in or below one mem_exclusive cpuset should not cause swapping on nodes
in another non-overlapping mem_exclusive cpuset, nor provoke oom_killing of a
task in another such cpuset.  Heavy use of kernel memory for I/O caching and
such by one job should not impact the memory available to jobs in other
non-overlapping mem_exclusive cpusets.

This patch enables providing hardwall, inescapable cpusets for the memory
allocations of each job, while sharing kernel memory allocations between
several jobs, in an enclosing mem_exclusive cpuset.

Like Dinakar's earlier patch enabling the administration of sched domains
using the cpu_exclusive flag, this patch also provides a useful meaning to a
cpuset flag that had previously done little more than restrict which cpuset
configurations were allowed.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
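For orientation while reading the hunks below, here is a minimal C sketch of
the placement check the message describes.  It is an illustration only, not
the code added to kernel/cpuset.c (which lies outside this mm-only diffstat);
the helper nearest_mem_exclusive_allows() and the exact structure are
assumptions made for this sketch.

/*
 * Illustrative sketch only -- not the actual kernel/cpuset.c change.
 * Non-zero means allocating on zone z's node is permitted for the
 * current task under the layered rules described above.
 */
static int cpuset_zone_allowed_sketch(struct zone *z, unsigned int gfp_mask)
{
        int node = z->zone_pgdat->node_id;

        if (in_interrupt())
                return 1;       /* layer 1: the whole system */
        if (node_isset(node, current->mems_allowed))
                return 1;       /* layer 3: the current task's cpuset */
        if (gfp_mask & __GFP_HARDWALL)
                return 0;       /* hardwalled (e.g. GFP_USER): no escape */
        /*
         * Layer 2: without __GFP_HARDWALL, the check may escape up to
         * the nearest enclosing mem_exclusive cpuset.  The helper below
         * is hypothetical, standing in for that hierarchy walk.
         */
        return nearest_mem_exclusive_allows(current, node);
}

Read against the hunks that follow: some call sites pass __GFP_HARDWALL
literally, confining the check to the current task's mems_allowed, while
others forward the caller's gfp_mask, which is what lets GFP_KERNEL
allocations (no __GFP_HARDWALL) fall back to the enclosing mem_exclusive
cpuset described above.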
Diffstat (limited to 'mm')
-rw-r--r--  mm/page_alloc.c  16
-rw-r--r--  mm/vmscan.c       8
2 files changed, 14 insertions, 10 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 14d7032c1d12..3974fd81d27c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -806,11 +806,14 @@ __alloc_pages(unsigned int __nocast gfp_mask, unsigned int order,
 	classzone_idx = zone_idx(zones[0]);
 
 restart:
-	/* Go through the zonelist once, looking for a zone with enough free */
+	/*
+	 * Go through the zonelist once, looking for a zone with enough free.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 			continue;
 
 		/*
@@ -845,6 +848,7 @@ zone_reclaim_retry:
 	 *
 	 * This is the last chance, in general, before the goto nopage.
 	 * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+	 * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
 	 */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
 		if (!zone_watermark_ok(z, order, z->pages_min,
@@ -852,7 +856,7 @@ zone_reclaim_retry:
 				       gfp_mask & __GFP_HIGH))
 			continue;
 
-		if (wait && !cpuset_zone_allowed(z))
+		if (wait && !cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -867,7 +871,7 @@ zone_reclaim_retry:
 	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
 		/* go through the zonelist yet again, ignoring mins */
 		for (i = 0; (z = zones[i]) != NULL; i++) {
-			if (!cpuset_zone_allowed(z))
+			if (!cpuset_zone_allowed(z, gfp_mask))
 				continue;
 			page = buffered_rmqueue(z, order, gfp_mask);
 			if (page)
@@ -903,7 +907,7 @@ rebalance:
 				       gfp_mask & __GFP_HIGH))
 			continue;
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, gfp_mask))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
@@ -922,7 +926,7 @@ rebalance:
 					classzone_idx, 0, 0))
 			continue;
 
-		if (!cpuset_zone_allowed(z))
+		if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
 			continue;
 
 		page = buffered_rmqueue(z, order, gfp_mask);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 0095533cdde9..a740778f688d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -894,7 +894,7 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 		if (zone->present_pages == 0)
 			continue;
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = sc->priority;
@@ -940,7 +940,7 @@ int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
 	for (i = 0; zones[i] != NULL; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->temp_priority = DEF_PRIORITY;
@@ -986,7 +986,7 @@ out:
 	for (i = 0; zones[i] != 0; i++) {
 		struct zone *zone = zones[i];
 
-		if (!cpuset_zone_allowed(zone))
+		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
 		zone->prev_priority = zone->temp_priority;
@@ -1256,7 +1256,7 @@ void wakeup_kswapd(struct zone *zone, int order)
 		return;
 	if (pgdat->kswapd_max_order < order)
 		pgdat->kswapd_max_order = order;
-	if (!cpuset_zone_allowed(zone))
+	if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 		return;
 	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
 		return;