Diffstat (limited to 'kernel/cpuset.c')
 kernel/cpuset.c | 145
 1 file changed, 108 insertions(+), 37 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec2..02b9611eadde 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
  * In order to avoid seeing no nodes if the old and new nodes are disjoint,
  * we structure updates as setting all new allowed nodes, then clearing newly
  * disallowed ones.
- *
- * Called with task's alloc_lock held
  */
 static void cpuset_change_task_nodemask(struct task_struct *tsk,
                                         nodemask_t *newmems)
 {
+repeat:
+        /*
+         * Allow tasks that have access to memory reserves because they have
+         * been OOM killed to get memory anywhere.
+         */
+        if (unlikely(test_thread_flag(TIF_MEMDIE)))
+                return;
+        if (current->flags & PF_EXITING)        /* Let dying task have memory */
+                return;
+
+        task_lock(tsk);
         nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-        mpol_rebind_task(tsk, &tsk->mems_allowed);
-        mpol_rebind_task(tsk, newmems);
+        mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
+
+
+        /*
+         * Ensure that ->mems_allowed_change_disable is checked only after
+         * all of the new allowed nodes have been set.
+         *
+         * The read-side task can see a nodemask with both the new and the
+         * old allowed nodes, so if it allocates a page while the cpuset is
+         * clearing the newly disallowed ones, it still sees the new bits.
+         *
+         * If the new allowed nodes were instead set after this check, the
+         * setting and the clearing could run back to back, and the
+         * read-side task might find no node to allocate from.
+         */
+        smp_mb();
+
+        /*
+         * Allocation of memory is very fast; we need not sleep while
+         * waiting for the read-side.
+         */
+        while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+                task_unlock(tsk);
+                if (!task_curr(tsk))
+                        yield();
+                goto repeat;
+        }
+
+        /*
+         * Ensure that ->mems_allowed_change_disable is checked before the
+         * newly disallowed nodes are cleared.
+         *
+         * If the newly disallowed bits were cleared before that check, the
+         * read-side task might find no node to allocate from.
+         */
+        smp_mb();
+
+        mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
         tsk->mems_allowed = *newmems;
+        task_unlock(tsk);
 }
 
 /*
@@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
         cs = cgroup_cs(scan->cg);
         guarantee_online_mems(cs, newmems);
 
-        task_lock(p);
         cpuset_change_task_nodemask(p, newmems);
-        task_unlock(p);
 
         NODEMASK_FREE(newmems);
 
@@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
         err = set_cpus_allowed_ptr(tsk, cpus_attach);
         WARN_ON_ONCE(err);
 
-        task_lock(tsk);
         cpuset_change_task_nodemask(tsk, to);
-        task_unlock(tsk);
         cpuset_update_task_spread_flag(cs, tsk);
 
 }
@@ -2182,19 +2224,52 @@ void __init cpuset_init_smp(void)
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
         mutex_lock(&callback_mutex);
-        cpuset_cpus_allowed_locked(tsk, pmask);
+        task_lock(tsk);
+        guarantee_online_cpus(task_cs(tsk), pmask);
+        task_unlock(tsk);
         mutex_unlock(&callback_mutex);
 }
 
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
-        task_lock(tsk);
-        guarantee_online_cpus(task_cs(tsk), pmask);
-        task_unlock(tsk);
+        const struct cpuset *cs;
+        int cpu;
+
+        rcu_read_lock();
+        cs = task_cs(tsk);
+        if (cs)
+                cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+        rcu_read_unlock();
+
+        /*
+         * We own tsk->cpus_allowed; nobody can change it under us.
+         *
+         * But we used cs && cs->cpus_allowed locklessly and thus can
+         * race with cgroup_attach_task() or update_cpumask() and get
+         * the wrong tsk->cpus_allowed. However, both cases imply the
+         * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr(),
+         * which takes task_rq_lock().
+         *
+         * If we are called after it dropped the lock, we must see all
+         * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
+         * set any mask, even if it is not right from the task_cs() point
+         * of view; the pending set_cpus_allowed_ptr() will fix things.
+         */
+
+        cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+        if (cpu >= nr_cpu_ids) {
+                /*
+                 * Either tsk->cpus_allowed is wrong (see above) or it
+                 * is actually empty. The latter case is only possible
+                 * if we are racing with remove_tasks_in_empty_cpuset().
+                 * As above, we can temporarily set any mask and rely on
+                 * set_cpus_allowed_ptr() as the synchronization point.
+                 */
+                cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+                cpu = cpumask_any(cpu_active_mask);
+        }
+
+        return cpu;
 }
 
 void cpuset_init_current_mems_allowed(void)
@@ -2383,22 +2458,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 }
 
 /**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset. Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list. The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
-        mutex_lock(&callback_mutex);
-}
-
-/**
  * cpuset_unlock - release lock on cpuset changes
  *
  * Undo the lock taken in a previous cpuset_lock() call.
@@ -2410,7 +2469,8 @@ void cpuset_unlock(void)
 }
 
 /**
- * cpuset_mem_spread_node() - On which node to begin search for a page
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
  *
  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
  * tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2435,16 +2495,27 @@ void cpuset_unlock(void)
  * See kmem_cache_alloc_node().
  */
 
-int cpuset_mem_spread_node(void)
+static int cpuset_spread_node(int *rotor)
 {
         int node;
 
-        node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+        node = next_node(*rotor, current->mems_allowed);
         if (node == MAX_NUMNODES)
                 node = first_node(current->mems_allowed);
-        current->cpuset_mem_spread_rotor = node;
+        *rotor = node;
         return node;
 }
+
+int cpuset_mem_spread_node(void)
+{
+        return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+int cpuset_slab_spread_node(void)
+{
+        return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
+}
+
 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
 
 /**
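
A note on the synchronization added in cpuset_change_task_nodemask() above: the update loop spins on tsk->mems_allowed_change_disable, a per-task counter that allocator paths are expected to hold non-zero while they read mems_allowed. The sketch below shows the assumed shape of that read-side pairing, along the lines of the get_mems_allowed()/put_mems_allowed() helpers this series introduces in include/linux/cpuset.h; the names, bodies, and comments are an approximation for illustration, not quoted from the patch.

static inline void get_mems_allowed(void)
{
        current->mems_allowed_change_disable++;

        /*
         * Pairs with the first smp_mb() in cpuset_change_task_nodemask():
         * the counter increment must be visible before this task samples
         * current->mems_allowed, so the updater either waits for us or we
         * already see all of the new allowed nodes.
         */
        smp_mb();
}

static inline void put_mems_allowed(void)
{
        /*
         * Pairs with the second smp_mb() in cpuset_change_task_nodemask():
         * all reads of mems_allowed must complete before the updater is
         * allowed to clear the newly disallowed nodes.
         */
        smp_mb();
        current->mems_allowed_change_disable--;
}

An allocation path brackets its use of current->mems_allowed with these two calls, which is why the updater above can simply retry (yielding when the reader is not currently running) instead of sleeping.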