 Documentation/cpusets.txt | 26
 kernel/cpuset.c           | 48
 2 files changed, 46 insertions(+), 28 deletions(-)
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index aa854b9b18cd..fb7b361e6eea 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -171,6 +171,7 @@ files describing that cpuset:
 - memory_migrate flag: if set, move pages to cpusets nodes
 - cpu_exclusive flag: is cpu placement exclusive?
 - mem_exclusive flag: is memory placement exclusive?
+- mem_hardwall flag: is memory allocation hardwalled
 - memory_pressure: measure of how much paging pressure in cpuset
 
 In addition, the root cpuset only has the following file:
@@ -222,17 +223,18 @@ If a cpuset is cpu or mem exclusive, no other cpuset, other than
 a direct ancestor or descendent, may share any of the same CPUs or
 Memory Nodes.
 
-A cpuset that is mem_exclusive restricts kernel allocations for
-page, buffer and other data commonly shared by the kernel across
-multiple users.  All cpusets, whether mem_exclusive or not, restrict
-allocations of memory for user space.  This enables configuring a
-system so that several independent jobs can share common kernel data,
-such as file system pages, while isolating each jobs user allocation in
-its own cpuset.  To do this, construct a large mem_exclusive cpuset to
-hold all the jobs, and construct child, non-mem_exclusive cpusets for
-each individual job.  Only a small amount of typical kernel memory,
-such as requests from interrupt handlers, is allowed to be taken
-outside even a mem_exclusive cpuset.
+A cpuset that is mem_exclusive *or* mem_hardwall is "hardwalled",
+i.e. it restricts kernel allocations for page, buffer and other data
+commonly shared by the kernel across multiple users.  All cpusets,
+whether hardwalled or not, restrict allocations of memory for user
+space.  This enables configuring a system so that several independent
+jobs can share common kernel data, such as file system pages, while
+isolating each job's user allocation in its own cpuset.  To do this,
+construct a large mem_exclusive cpuset to hold all the jobs, and
+construct child, non-mem_exclusive cpusets for each individual job.
+Only a small amount of typical kernel memory, such as requests from
+interrupt handlers, is allowed to be taken outside even a
+mem_exclusive cpuset.
 
 
 1.5 What is memory_pressure ?
@@ -707,7 +709,7 @@ Now you want to do something with this cpuset.
 
 In this directory you can find several files:
 # ls
-cpus cpu_exclusive mems mem_exclusive tasks
+cpus cpu_exclusive mems mem_exclusive mem_hardwall tasks
 
 Reading them will give you information about the state of this cpuset:
 the CPUs and Memory Nodes it can use, the processes that are using
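As a quick illustration of how the new per-cpuset file documented above might be used, here is a minimal userspace sketch that enables the flag by writing "1" to mem_hardwall. The mount point /dev/cpuset and the cpuset name "jobs" are assumptions for the example, not part of this patch:

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/*
 * Sketch: turn on the (new) mem_hardwall flag for an existing cpuset.
 * Assumes the cpuset filesystem is mounted at /dev/cpuset and that a
 * cpuset named "jobs" already exists -- both are illustrative.
 */
int main(void)
{
	const char *path = "/dev/cpuset/jobs/mem_hardwall";
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) != 1) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	printf("hardwall enabled on %s\n", path);
	return 0;
}
```

With the flag set, the cpuset becomes its own nearest hardwalled ancestor, so GFP_KERNEL allocations by its tasks stay confined to its mems_allowed instead of escaping to a parent, without the cpuset having to claim exclusive use of its memory nodes.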
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index fe5407ca2f1e..8da627d33804 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -127,6 +127,7 @@ struct cpuset_hotplug_scanner {
 typedef enum {
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
+	CS_MEM_HARDWALL,
 	CS_MEMORY_MIGRATE,
 	CS_SCHED_LOAD_BALANCE,
 	CS_SPREAD_PAGE,
@@ -144,6 +145,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
 	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }
 
+static inline int is_mem_hardwall(const struct cpuset *cs)
+{
+	return test_bit(CS_MEM_HARDWALL, &cs->flags);
+}
+
 static inline int is_sched_load_balance(const struct cpuset *cs)
 {
 	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
@@ -1042,12 +1048,9 @@ static int update_relax_domain_level(struct cpuset *cs, char *buf)
 
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
- * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *				CS_SCHED_LOAD_BALANCE,
- *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
- *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
- * cs: the cpuset to update
- * buf: the buffer where we read the 0 or 1
+ * bit: the bit to update (see cpuset_flagbits_t)
+ * cs: the cpuset to update
+ * turning_on: whether the flag is being set or cleared
  *
  * Call with cgroup_mutex held.
  */
@@ -1228,6 +1231,7 @@ typedef enum {
 	FILE_MEMLIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
+	FILE_MEM_HARDWALL,
 	FILE_SCHED_LOAD_BALANCE,
 	FILE_SCHED_RELAX_DOMAIN_LEVEL,
 	FILE_MEMORY_PRESSURE_ENABLED,
@@ -1313,6 +1317,9 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 	case FILE_MEM_EXCLUSIVE:
 		retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
 		break;
+	case FILE_MEM_HARDWALL:
+		retval = update_flag(CS_MEM_HARDWALL, cs, val);
+		break;
 	case FILE_SCHED_LOAD_BALANCE:
 		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
 		break;
@@ -1423,6 +1430,8 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
 		return is_cpu_exclusive(cs);
 	case FILE_MEM_EXCLUSIVE:
 		return is_mem_exclusive(cs);
+	case FILE_MEM_HARDWALL:
+		return is_mem_hardwall(cs);
 	case FILE_SCHED_LOAD_BALANCE:
 		return is_sched_load_balance(cs);
 	case FILE_MEMORY_MIGRATE:
@@ -1475,6 +1484,13 @@ static struct cftype files[] = {
 	},
 
 	{
+		.name = "mem_hardwall",
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEM_HARDWALL,
+	},
+
+	{
 		.name = "sched_load_balance",
 		.read_u64 = cpuset_read_u64,
 		.write_u64 = cpuset_write_u64,
@@ -1963,14 +1979,14 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 }
 
 /*
- * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive
- * ancestor to the specified cpuset.  Call holding callback_mutex.
- * If no ancestor is mem_exclusive (an unusual configuration), then
- * returns the root cpuset.
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
+ * mem_hardwall ancestor to the specified cpuset.  Call holding
+ * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
+ * (an unusual configuration), then returns the root cpuset.
  */
-static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
+static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 {
-	while (!is_mem_exclusive(cs) && cs->parent)
+	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
 		cs = cs->parent;
 	return cs;
 }
@@ -1984,7 +2000,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * __GFP_THISNODE is set, yes, we can always allocate.  If zone
  * z's node is in our tasks mems_allowed, yes.  If it's not a
  * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
+ * hardwalled cpuset ancestor to this tasks cpuset, yes.
  * If the task has been OOM killed and has access to memory reserves
  * as specified by the TIF_MEMDIE flag, yes.
  * Otherwise, no.
@@ -2007,7 +2023,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * and do not allow allocations outside the current tasks cpuset
  * unless the task has been OOM killed as is marked TIF_MEMDIE.
  * GFP_KERNEL allocations are not so marked, so can escape to the
- * nearest enclosing mem_exclusive ancestor cpuset.
+ * nearest enclosing hardwalled ancestor cpuset.
  *
  * Scanning up parent cpusets requires callback_mutex.  The
  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
@@ -2030,7 +2046,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * in_interrupt - any node ok (current task context irrelevant)
  * GFP_ATOMIC   - any node ok
  * TIF_MEMDIE   - any node ok
- * GFP_KERNEL   - any node in enclosing mem_exclusive cpuset ok
+ * GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
  * GFP_USER     - only nodes in current tasks mems allowed ok.
  *
  * Rule:
@@ -2067,7 +2083,7 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 	mutex_lock(&callback_mutex);
 
 	task_lock(current);
-	cs = nearest_exclusive_ancestor(task_cs(current));
+	cs = nearest_hardwall_ancestor(task_cs(current));
 	task_unlock(current);
 
 	allowed = node_isset(node, cs->mems_allowed);
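To make the ancestor walk in the last few hunks concrete, here is a small standalone C model of what nearest_hardwall_ancestor() now does: a softwall (GFP_KERNEL-style) check climbs to the nearest cpuset with either mem_exclusive or mem_hardwall set and tests the node against that ancestor's allowed mask. The struct, masks, and node numbers below are simplified stand-ins for the kernel's struct cpuset and nodemask_t, not kernel code:

```c
#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for the kernel's struct cpuset. */
struct model_cpuset {
	struct model_cpuset *parent;
	bool mem_exclusive;
	bool mem_hardwall;
	unsigned long mems_allowed;	/* bitmask of allowed node ids */
};

/* Mirrors nearest_hardwall_ancestor(): stop at the first cpuset that
 * is mem_exclusive or mem_hardwall, falling back to the root. */
static const struct model_cpuset *
nearest_hardwall_ancestor(const struct model_cpuset *cs)
{
	while (!(cs->mem_exclusive || cs->mem_hardwall) && cs->parent)
		cs = cs->parent;
	return cs;
}

/* Softwall check: a GFP_KERNEL-style request may use any node allowed
 * by the nearest hardwalled ancestor, not just the task's own cpuset. */
static bool softwall_node_allowed(const struct model_cpuset *cs, int node)
{
	return nearest_hardwall_ancestor(cs)->mems_allowed & (1UL << node);
}

int main(void)
{
	struct model_cpuset root = { NULL,  false, false, 0xf }; /* nodes 0-3 */
	struct model_cpuset jobs = { &root, false, true,  0x6 }; /* nodes 1-2, hardwalled */
	struct model_cpuset job1 = { &jobs, false, false, 0x2 }; /* node 1 only */

	/* job1's own mems allow only node 1, but a kernel allocation may
	 * escape to the hardwalled "jobs" ancestor, so node 2 is also ok;
	 * node 3 stays forbidden because the walk stops at "jobs". */
	printf("node 2 allowed: %d\n", softwall_node_allowed(&job1, 2)); /* 1 */
	printf("node 3 allowed: %d\n", softwall_node_allowed(&job1, 3)); /* 0 */
	return 0;
}
```

This also shows why the patch is backward compatible: with mem_hardwall clear everywhere, the walk stops only at mem_exclusive cpusets, exactly as the old nearest_exclusive_ancestor() did.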