author		Paul Menage <menage@google.com>		2008-04-29 04:00:26 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2008-04-29 11:06:11 -0400
commit		786083667e0ced85ce17c4c0b6c57a9f47c5b9f2 (patch)
tree		4519c51c3ec7d093fe0bafbbd67204a6f7b773b3
parent		addf2c739d9015d3e9c0500b58a3af051cd58ea7 (diff)
Cpuset hardwall flag: add a mem_hardwall flag to cpusets
This flag provides the hardwalling properties of mem_exclusive, without
enforcing the exclusivity.

Either mem_hardwall or mem_exclusive is sufficient to prevent GFP_KERNEL
allocations from passing outside the cpuset's assigned nodes.

Signed-off-by: Paul Menage <menage@google.com>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
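[Editor's note: for readers unfamiliar with the cpuset interface, a minimal usage sketch of the new file (not part of the patch). It assumes the cpuset pseudo-filesystem is mounted at /dev/cpuset as described in Documentation/cpusets.txt; the cpuset name "jobs" and the CPU/node numbers are illustrative only.]

	# mount the cpuset filesystem and create a cpuset (hypothetical names/values)
	mount -t cpuset cpuset /dev/cpuset
	mkdir /dev/cpuset/jobs
	echo 0-3 > /dev/cpuset/jobs/cpus
	echo 0   > /dev/cpuset/jobs/mems
	# hardwall kernel allocations to this cpuset's nodes, without requiring exclusivity
	echo 1 > /dev/cpuset/jobs/mem_hardwall
	cat /dev/cpuset/jobs/mem_hardwall      # reads back 1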
-rw-r--r--	Documentation/cpusets.txt	26
-rw-r--r--	kernel/cpuset.c			48
2 files changed, 46 insertions(+), 28 deletions(-)
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index aa854b9b18cd..fb7b361e6eea 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -171,6 +171,7 @@ files describing that cpuset:
  - memory_migrate flag: if set, move pages to cpusets nodes
  - cpu_exclusive flag: is cpu placement exclusive?
  - mem_exclusive flag: is memory placement exclusive?
+ - mem_hardwall flag:  is memory allocation hardwalled
  - memory_pressure: measure of how much paging pressure in cpuset
 
 In addition, the root cpuset only has the following file:
@@ -222,17 +223,18 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
 a direct ancestor or descendent, may share any of the same CPUs or
 Memory Nodes.
 
-A cpuset that is mem_exclusive restricts kernel allocations for
-page, buffer and other data commonly shared by the kernel across
-multiple users. All cpusets, whether mem_exclusive or not, restrict
-allocations of memory for user space. This enables configuring a
-system so that several independent jobs can share common kernel data,
-such as file system pages, while isolating each jobs user allocation in
-its own cpuset. To do this, construct a large mem_exclusive cpuset to
-hold all the jobs, and construct child, non-mem_exclusive cpusets for
-each individual job. Only a small amount of typical kernel memory,
-such as requests from interrupt handlers, is allowed to be taken
-outside even a mem_exclusive cpuset.
+A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users. All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space. This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset. To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
 
 
 1.5 What is memory_pressure ?
@@ -707,7 +709,7 @@ Now you want to do something with this cpuset.
 
 In this directory you can find several files:
 # ls
-cpus cpu_exclusive mems mem_exclusive tasks
+cpus cpu_exclusive mems mem_exclusive mem_hardwall tasks
 
 Reading them will give you information about the state of this cpuset:
 the CPUs and Memory Nodes it can use, the processes that are using
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fe5407ca2f1e..8da627d33804 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -127,6 +127,7 @@ struct cpuset_hotplug_scanner {
 typedef enum {
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
+	CS_MEM_HARDWALL,
 	CS_MEMORY_MIGRATE,
 	CS_SCHED_LOAD_BALANCE,
 	CS_SPREAD_PAGE,
@@ -144,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
 	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }
 
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+	return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
 static inline int is_sched_load_balance(const struct cpuset *cs)
 {
 	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
@@ -1042,12 +1048,9 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf)
 
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
- * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *				CS_SCHED_LOAD_BALANCE,
- *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
- *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
- * cs:	the cpuset to update
- * buf:	the buffer where we read the 0 or 1
+ * bit:		the bit to update (see cpuset_flagbits_t)
+ * cs:		the cpuset to update
+ * turning_on:	whether the flag is being set or cleared
  *
  * Call with cgroup_mutex held.
  */
@@ -1228,6 +1231,7 @@ typedef enum {
 	FILE_MEMLIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
+	FILE_MEM_HARDWALL,
 	FILE_SCHED_LOAD_BALANCE,
 	FILE_SCHED_RELAX_DOMAIN_LEVEL,
 	FILE_MEMORY_PRESSURE_ENABLED,
@@ -1313,6 +1317,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 	case FILE_MEM_EXCLUSIVE:
 		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
 		break;
+	case FILE_MEM_HARDWALL:
+		retval = update_flag(CS_MEM_HARDWALL, cs, val);
+		break;
 	case FILE_SCHED_LOAD_BALANCE:
 		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
 		break;
@@ -1423,6 +1430,8 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
 		return is_cpu_exclusive(cs);
 	case FILE_MEM_EXCLUSIVE:
 		return is_mem_exclusive(cs);
+	case FILE_MEM_HARDWALL:
+		return is_mem_hardwall(cs);
 	case FILE_SCHED_LOAD_BALANCE:
 		return is_sched_load_balance(cs);
 	case FILE_MEMORY_MIGRATE:
@@ -1475,6 +1484,13 @@ static struct cftype files[] = {
 	},
 
 	{
+		.name = "mem_hardwall",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEM_HARDWALL,
+	},
+
+	{
 		.name = "sched_load_balance",
 		.read_u64 = cpuset_read_u64,
 		.write_u64 = cpuset_write_u64,
@@ -1963,14 +1979,14 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 }
 
 /*
- * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
- * ancestor to the specified cpuset. Call holding callback_mutex.
- * If no ancestor is mem_exclusive (an unusual configuration), then
- * returns the root cpuset.
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
+ * mem_hardwall ancestor to the specified cpuset. Call holding
+ * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * (an unusual configuration), then returns the root cpuset.
  */
-static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 {
-	while (!is_mem_exclusive(cs) && cs->parent)
+	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
 		cs = cs->parent;
 	return cs;
 }
@@ -1984,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * __GFP_THISNODE is set, yes, we can always allocate. If zone
  * z's node is in our tasks mems_allowed, yes. If it's not a
  * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * hardwalled cpuset ancestor to this tasks cpuset, yes.
  * If the task has been OOM killed and has access to memory reserves
  * as specified by the TIF_MEMDIE flag, yes.
  * Otherwise, no.
@@ -2007,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * and do not allow allocations outside the current tasks cpuset
  * unless the task has been OOM killed as is marked TIF_MEMDIE.
  * GFP_KERNEL allocations are not so marked, so can escape to the
- * nearest enclosing mem_exclusive ancestor cpuset.
+ * nearest enclosing hardwalled ancestor cpuset.
  *
  * Scanning up parent cpusets requires callback_mutex. The
  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
@@ -2030,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * in_interrupt - any node ok (current task context irrelevant)
  * GFP_ATOMIC - any node ok
  * TIF_MEMDIE - any node ok
- * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok
+ * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
  * GFP_USER - only nodes in current tasks mems allowed ok.
  *
  * Rule:
@@ -2067,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 	mutex_lock(&callback_mutex);
 
 	task_lock(current);
-	cs = nearest_exclusive_ancestor(task_cs(current));
+	cs = nearest_hardwall_ancestor(task_cs(current));
 	task_unlock(current);
 
 	allowed = node_isset(node, cs->mems_allowed);
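[Editor's note: a hedged sketch of the layout described by the updated cpusets.txt paragraph above: one large hardwalled parent cpuset holding the shared kernel data, with non-exclusive child cpusets per job. The names "big", "job1", "job2" and the CPU/node lists are illustrative only.]

	cd /dev/cpuset
	mkdir big
	echo 0-7 > big/cpus; echo 0-1 > big/mems
	# with mem_hardwall set, GFP_KERNEL allocations stay within big's nodes,
	# yet big need not be mem_exclusive
	echo 1 > big/mem_hardwall
	mkdir big/job1 big/job2
	echo 0-3 > big/job1/cpus; echo 0-1 > big/job1/mems
	echo 4-7 > big/job2/cpus; echo 0-1 > big/job2/mems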