author     Linus Torvalds <torvalds@linux-foundation.org>  2014-12-11 21:57:19 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>  2014-12-11 21:57:19 -0500
commit     2756d373a3f45a3a9ebf4ac389f9e0e02bd35a93 (patch)
tree       e248c5adccb3045f96b3cfe0a1ffeb37bb81e4cb /kernel/cpuset.c
parent     4e8790f77f051d4cc745a57b48a73052521e8dfc (diff)
parent     eeecbd1971517103e06f11750dd1a9a1dc37e4e6 (diff)
Merge branch 'for-3.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup update from Tejun Heo:
 "cpuset got simplified a bit. cgroup core got a fix on unified
  hierarchy and grew some effective css related interfaces which will
  be used for blkio support for writeback IO traffic which is currently
  being worked on"

* 'for-3.19' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: implement cgroup_get_e_css()
  cgroup: add cgroup_subsys->css_e_css_changed()
  cgroup: add cgroup_subsys->css_released()
  cgroup: fix the async css offline wait logic in cgroup_subtree_control_write()
  cgroup: restructure child_subsys_mask handling in cgroup_subtree_control_write()
  cgroup: separate out cgroup_calc_child_subsys_mask() from cgroup_refresh_child_subsys_mask()
  cpuset: lock vs unlock typo
  cpuset: simplify cpuset_node_allowed API
  cpuset: convert callback_mutex to a spinlock
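The last commit in the list converts callback_mutex into callback_lock, a spinlock nested inside cpuset_mutex. A minimal sketch of the resulting locking pattern, mirroring the hunks below; the helper names example_publish_cpus and example_read_cpus are illustrative only and are not part of the patch:

#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/cpumask.h>

static DEFINE_MUTEX(cpuset_mutex);	/* serializes cpuset modifications */
static DEFINE_SPINLOCK(callback_lock);	/* guards reads of the masks */

/* Hypothetical writer: publish a new effective cpumask under both locks. */
static void example_publish_cpus(struct cpumask *effective_cpus,
				 const struct cpumask *new_cpus)
{
	mutex_lock(&cpuset_mutex);
	spin_lock_irq(&callback_lock);	/* process context, IRQs known enabled */
	cpumask_copy(effective_cpus, new_cpus);
	spin_unlock_irq(&callback_lock);
	mutex_unlock(&cpuset_mutex);
}

/* Hypothetical reader: callers that cannot assume the IRQ state use irqsave. */
static void example_read_cpus(struct cpumask *snapshot,
			      const struct cpumask *effective_cpus)
{
	unsigned long flags;

	spin_lock_irqsave(&callback_lock, flags);
	cpumask_copy(snapshot, effective_cpus);
	spin_unlock_irqrestore(&callback_lock, flags);
}

The irq-disabling writer matches the spin_lock_irq() calls in the update paths of the diff, while the irqsave reader matches cpuset_cpus_allowed() and cpuset_mems_allowed() after the conversion.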
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--  kernel/cpuset.c  162
1 file changed, 57 insertions, 105 deletions
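The 'simplify cpuset_node_allowed API' commit folds the softwall/hardwall pair into a single __cpuset_node_allowed(), with __GFP_HARDWALL in gfp_mask selecting the stricter check (see the final hunks below). A sketch of an allocation-side caller; the wrapper name example_node_usable is hypothetical and the declaration of __cpuset_node_allowed() is assumed to come from <linux/cpuset.h>:

#include <linux/types.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>

/*
 * Hypothetical filter over candidate NUMA nodes.  With the single entry
 * point, allocations with __GFP_HARDWALL set (e.g. GFP_USER) are confined
 * to current->mems_allowed, while GFP_KERNEL ones may fall back to the
 * nearest hardwalled ancestor cpuset, as the retained comment explains.
 */
static bool example_node_usable(int node, gfp_t gfp_mask)
{
	return __cpuset_node_allowed(node, gfp_mask) != 0;
}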
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 723cfc9d0ad7..64b257f6bca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
 		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
 
 /*
- * There are two global mutexes guarding cpuset structures - cpuset_mutex
- * and callback_mutex. The latter may nest inside the former. We also
- * require taking task_lock() when dereferencing a task's cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock. We also require taking task_lock() when dereferencing a
+ * task's cpuset pointer. See "The task_lock() exception", at the end of this
+ * comment.
  *
- * A task must hold both mutexes to modify cpusets. If a task holds
+ * A task must hold both locks to modify cpusets. If a task holds
  * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_mutex and be able to
+ * is the only task able to also acquire callback_lock and be able to
  * modify cpusets. It can perform various checks on the cpuset structure
  * first, knowing nothing will change. It can also allocate memory while
  * just holding cpuset_mutex. While it is performing these checks, various
- * callback routines can briefly acquire callback_mutex to query cpusets.
- * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * callback routines can briefly acquire callback_lock to query cpusets.
+ * Once it is ready to make the changes, it takes callback_lock, blocking
  * everyone else.
  *
  * Calls to the kernel memory allocator can not be made while holding
- * callback_mutex, as that would risk double tripping on callback_mutex
+ * callback_lock, as that would risk double tripping on callback_lock
  * from one of the callbacks into the cpuset code from within
  * __alloc_pages().
  *
- * If a task is only holding callback_mutex, then it has read-only
+ * If a task is only holding callback_lock, then it has read-only
  * access to cpusets.
  *
  * Now, the task_struct fields mems_allowed and mempolicy may be changed
  * by other task, we use alloc_lock in the task_struct fields to protect
  * them.
  *
- * The cpuset_common_file_read() handlers only hold callback_mutex across
+ * The cpuset_common_file_read() handlers only hold callback_lock across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
  *
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
 
 /*
  * CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
  * One way or another, we guarantee to return some non-empty subset
  * of cpu_online_mask.
  *
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
  */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
  * One way or another, we guarantee to return some non-empty subset
  * of node_states[N_MEMORY].
  *
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
  */
 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 {
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 /*
  * update task's spread flag if cpuset's page/slab spread flag is set
  *
- * Called with callback_mutex/cpuset_mutex held
+ * Call with callback_lock or cpuset_mutex held.
  */
 static void cpuset_update_task_spread_flag(struct cpuset *cs,
 					struct task_struct *tsk)
@@ -886,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 			continue;
 		rcu_read_unlock();
 
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		cpumask_copy(cp->effective_cpus, new_cpus);
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -953,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		return retval;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->cpus_allowed as a temp variable */
 	update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1142,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 			continue;
 		rcu_read_unlock();
 
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		cp->effective_mems = *new_mems;
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 			!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1165,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
  * mempolicies and if the cpuset is marked 'memory_migrate',
  * migrate the tasks pages to the new memory.
  *
- * Call with cpuset_mutex held. May take callback_mutex during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
  * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
@@ -1212,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		goto done;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cs->mems_allowed = trialcs->mems_allowed;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->mems_allowed as a temp variable */
 	update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1305,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
 			|| (is_spread_page(cs) != is_spread_page(trialcs)));
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cs->flags = trialcs->flags;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		rebuild_sched_domains_locked();
@@ -1714,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	count = seq_get_buf(sf, &buf);
 	s = buf;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 
 	switch (type) {
 	case FILE_CPULIST:
@@ -1741,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 		seq_commit(sf, -1);
 	}
 out_unlock:
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 	return ret;
 }
 
@@ -1958,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 
 	cpuset_inc();
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	if (cgroup_on_dfl(cs->css.cgroup)) {
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 		cs->effective_mems = parent->effective_mems;
 	}
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
 		goto out_unlock;
@@ -1990,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	}
 	rcu_read_unlock();
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cs->mems_allowed = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return 0;
@@ -2032,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
 	mutex_lock(&cpuset_mutex);
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 
 	if (cgroup_on_dfl(root_css->cgroup)) {
 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2043,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 		top_cpuset.mems_allowed = top_cpuset.effective_mems;
 	}
 
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 	mutex_unlock(&cpuset_mutex);
 }
 
@@ -2128,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 {
 	bool is_empty;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, new_cpus);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->mems_allowed = *new_mems;
 	cs->effective_mems = *new_mems;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	/*
 	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2170,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
 	if (nodes_empty(*new_mems))
 		*new_mems = parent_cs(cs)->effective_mems;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->effective_mems = *new_mems;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	if (cpus_updated)
 		update_tasks_cpumask(cs);
@@ -2259,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 
 	/* synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 		/* we don't mess with cpumasks of tasks in top_cpuset */
 	}
 
 	/* synchronize mems_allowed to N_MEMORY */
 	if (mems_updated) {
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			top_cpuset.mems_allowed = new_mems;
 		top_cpuset.effective_mems = new_mems;
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 		update_tasks_nodemask(&top_cpuset);
 	}
 
@@ -2366,11 +2366,13 @@ void __init cpuset_init_smp(void)
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
-	mutex_lock(&callback_mutex);
+	unsigned long flags;
+
+	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_cpus(task_cs(tsk), pmask);
 	rcu_read_unlock();
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 }
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2416,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
 	nodemask_t mask;
+	unsigned long flags;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_mems(task_cs(tsk), &mask);
 	rcu_read_unlock();
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 
 	return mask;
 }
@@ -2440,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 /*
  * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
  * mem_hardwall ancestor to the specified cpuset. Call holding
- * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * callback_lock. If no ancestor is mem_exclusive or mem_hardwall
  * (an unusual configuration), then returns the root cpuset.
  */
 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2451,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 }
 
 /**
- * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
  * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
@@ -2463,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * flag, yes.
  * Otherwise, no.
  *
- * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
- * might sleep, and might allow a node from an enclosing cpuset.
- *
- * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
- * cpusets, and never sleeps.
- *
  * The __GFP_THISNODE placement logic is really handled elsewhere,
  * by forcibly using a zonelist starting at a specified node, and by
  * (in get_page_from_freelist()) refusing to consider the zones for
@@ -2482,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * GFP_KERNEL allocations are not so marked, so can escape to the
  * nearest enclosing hardwalled ancestor cpuset.
  *
- * Scanning up parent cpusets requires callback_mutex. The
+ * Scanning up parent cpusets requires callback_lock. The
  * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
  * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
  * current tasks mems_allowed came up empty on the first pass over
  * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
- * cpuset are short of memory, might require taking the callback_mutex
- * mutex.
+ * cpuset are short of memory, might require taking the callback_lock.
  *
  * The first call here from mm/page_alloc:get_page_from_freelist()
  * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2505,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
  * TIF_MEMDIE - any node ok
  * GFP_KERNEL - any node in enclosing hardwalled cpuset ok
  * GFP_USER - only nodes in current tasks mems allowed ok.
- *
- * Rule:
- * Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
- * pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
- * the code that might scan up ancestor cpusets and sleep.
  */
-int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+int __cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;		/* is allocation in zone z allowed? */
+	unsigned long flags;
 
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
 	/*
@@ -2534,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 		return 1;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 
 	rcu_read_lock();
 	cs = nearest_hardwall_ancestor(task_cs(current));
 	allowed = node_isset(node, cs->mems_allowed);
 	rcu_read_unlock();
 
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 	return allowed;
 }
 
-/*
- * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
- * @node: is this an allowed node?
- * @gfp_mask: memory allocation flags
- *
- * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
- * set, yes, we can always allocate. If node is in our task's mems_allowed,
- * yes. If the task has been OOM killed and has access to memory reserves as
- * specified by the TIF_MEMDIE flag, yes.
- * Otherwise, no.
- *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first. By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
- * Unlike the cpuset_node_allowed_softwall() variant, above,
- * this variant requires that the node be in the current task's
- * mems_allowed or that we're in interrupt. It does not scan up the
- * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
- * It never sleeps.
- */
-int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
-	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
-		return 1;
-	if (node_isset(node, current->mems_allowed))
-		return 1;
-	/*
-	 * Allow tasks that have access to memory reserves because they have
-	 * been OOM killed to get memory anywhere.
-	 */
-	if (unlikely(test_thread_flag(TIF_MEMDIE)))
-		return 1;
-	return 0;
-}
-
 /**
  * cpuset_mem_spread_node() - On which node to begin search for a file page
  * cpuset_slab_spread_node() - On which node to begin search for a slab page