Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--  kernel/cpuset.c  145
1 file changed, 108 insertions, 37 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec2..02b9611eadde 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
  * In order to avoid seeing no nodes if the old and new nodes are disjoint,
  * we structure updates as setting all new allowed nodes, then clearing newly
  * disallowed ones.
- *
- * Called with task's alloc_lock held
  */
 static void cpuset_change_task_nodemask(struct task_struct *tsk,
 					nodemask_t *newmems)
 {
+repeat:
+	/*
+	 * Allow tasks that have access to memory reserves because they have
+	 * been OOM killed to get memory anywhere.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE)))
+		return;
+	if (current->flags & PF_EXITING) /* Let dying task have memory */
+		return;
+
+	task_lock(tsk);
 	nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
-	mpol_rebind_task(tsk, &tsk->mems_allowed);
-	mpol_rebind_task(tsk, newmems);
+	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
+
+
+	/*
+	 * Ensure that ->mems_allowed_change_disable is checked after setting
+	 * all new allowed nodes.
+	 *
+	 * The read-side task can see a nodemask with both new and old allowed
+	 * nodes, so if it allocates a page while cpuset clears the newly
+	 * disallowed ones, it can still see the new allowed bits.
+	 *
+	 * If the new allowed nodes were instead set after this check, setting
+	 * them and clearing the newly disallowed ones could happen back to
+	 * back, and the read-side task might find no node to allocate from.
+	 */
+	smp_mb();
+
+	/*
+	 * Memory allocation is very fast; we needn't sleep while waiting
+	 * for the read-side.
+	 */
+	while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
+		task_unlock(tsk);
+		if (!task_curr(tsk))
+			yield();
+		goto repeat;
+	}
+
+	/*
+	 * Ensure that ->mems_allowed_change_disable is checked before clearing
+	 * the newly disallowed nodes.
+	 *
+	 * If the newly disallowed bits were cleared before the check, the
+	 * read-side task might find no node to allocate a page from.
+	 */
+	smp_mb();
+
+	mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
 	tsk->mems_allowed = *newmems;
+	task_unlock(tsk);
 }
 
 /*
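The retry loop above is only safe because every allocator brackets its reads of mems_allowed with the counter this loop polls. A minimal sketch of that read-side pairing, modeled on the get_mems_allowed()/put_mems_allowed() helpers the same series adds to include/linux/cpuset.h (reproduced here for illustration only; this diffstat is limited to kernel/cpuset.c, so they do not appear above):

	static inline void get_mems_allowed(void)
	{
		current->mems_allowed_change_disable++;

		/*
		 * Pairs with the first smp_mb() in cpuset_change_task_nodemask():
		 * reads of mems_allowed and mempolicy must not be reordered
		 * before the increment, or the writer could miss this reader.
		 */
		smp_mb();
	}

	static inline void put_mems_allowed(void)
	{
		/* Finish all reads of mems_allowed before leaving the section. */
		smp_mb();
		current->mems_allowed_change_disable--;
	}

A reader inside a get/put section always sees at least the union of the old and new masks, because the writer publishes the union, spins until the counter drains, and only then clears the newly disallowed bits.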
@@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
 	cs = cgroup_cs(scan->cg);
 	guarantee_online_mems(cs, newmems);
 
-	task_lock(p);
 	cpuset_change_task_nodemask(p, newmems);
-	task_unlock(p);
 
 	NODEMASK_FREE(newmems);
 
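Stepping back, the "set all new allowed nodes, then clear newly disallowed ones" rule that cpuset_change_task_nodemask() follows reduces to a grow-then-shrink update. A hypothetical helper, for illustration only (real code must also handle the rebind, barriers against the reader count, and locking shown above):

	/* Illustrative: update *cur so readers never see an empty mask. */
	static void nodemask_grow_then_shrink(nodemask_t *cur, const nodemask_t *next)
	{
		nodes_or(*cur, *cur, *next);	/* step 1: cur = old | new */
		smp_mb();			/* publish the union first */
		*cur = *next;			/* step 2: drop old-only bits */
	}

Roughly speaking, a reader sampling *cur sees the old mask, the union, or the new mask; as long as old and new are each non-empty, none of those is empty.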
@@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
 	err = set_cpus_allowed_ptr(tsk, cpus_attach);
 	WARN_ON_ONCE(err);
 
-	task_lock(tsk);
 	cpuset_change_task_nodemask(tsk, to);
-	task_unlock(tsk);
 	cpuset_update_task_spread_flag(cs, tsk);
 
 }
@@ -2182,19 +2224,52 @@ void __init cpuset_init_smp(void)
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
 	mutex_lock(&callback_mutex);
-	cpuset_cpus_allowed_locked(tsk, pmask);
+	task_lock(tsk);
+	guarantee_online_cpus(task_cs(tsk), pmask);
+	task_unlock(tsk);
 	mutex_unlock(&callback_mutex);
 }
 
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
-	task_lock(tsk);
-	guarantee_online_cpus(task_cs(tsk), pmask);
-	task_unlock(tsk);
+	const struct cpuset *cs;
+	int cpu;
+
+	rcu_read_lock();
+	cs = task_cs(tsk);
+	if (cs)
+		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+	rcu_read_unlock();
+
+	/*
+	 * We own tsk->cpus_allowed, nobody can change it under us.
+	 *
+	 * But we used cs && cs->cpus_allowed lockless and thus can
+	 * race with cgroup_attach_task() or update_cpumask() and get
+	 * the wrong tsk->cpus_allowed. However, both cases imply the
+	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+	 * which takes task_rq_lock().
+	 *
+	 * If we are called after it dropped the lock we must see all
+	 * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
+	 * set any mask even if it is not right from task_cs() pov,
+	 * the pending set_cpus_allowed_ptr() will fix things.
+	 */
+
+	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+	if (cpu >= nr_cpu_ids) {
+		/*
+		 * Either tsk->cpus_allowed is wrong (see above) or it
+		 * is actually empty. The latter case is only possible
+		 * if we are racing with remove_tasks_in_empty_cpuset().
+		 * Like above we can temporarily set any mask and rely on
+		 * set_cpus_allowed_ptr() as a synchronization point.
+		 */
+		cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+		cpu = cpumask_any(cpu_active_mask);
+	}
+
+	return cpu;
 }
 
 void cpuset_init_current_mems_allowed(void)
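cpuset_cpus_allowed_fallback() is deliberately tolerant of a transiently wrong mask, per the comment above; its intended caller is the scheduler's select_fallback_rq() in kernel/sched.c, outside this file. A hypothetical caller sketch, with an illustrative function name:

	/* Illustrative: pick somewhere runnable when the current mask is unusable. */
	static int pick_fallback_cpu(struct task_struct *p)
	{
		int dest_cpu;

		/* Prefer any still-active CPU the task is already allowed on. */
		dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
		if (dest_cpu < nr_cpu_ids)
			return dest_cpu;

		/*
		 * No allowed CPU is active: let the cpuset code widen
		 * p->cpus_allowed and pick an active CPU for us.
		 */
		return cpuset_cpus_allowed_fallback(p);
	}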
@@ -2383,22 +2458,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 }
 
 /**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset. Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list. The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
-	mutex_lock(&callback_mutex);
-}
-
-/**
  * cpuset_unlock - release lock on cpuset changes
  *
  * Undo the lock taken in a previous cpuset_lock() call.
@@ -2410,7 +2469,8 @@ void cpuset_unlock(void)
 }
 
 /**
- * cpuset_mem_spread_node() - On which node to begin search for a page
+ * cpuset_mem_spread_node() - On which node to begin search for a file page
+ * cpuset_slab_spread_node() - On which node to begin search for a slab page
  *
  * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
  * tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2435,16 +2495,27 @@ void cpuset_unlock(void)
  * See kmem_cache_alloc_node().
  */
 
-int cpuset_mem_spread_node(void)
+static int cpuset_spread_node(int *rotor)
 {
 	int node;
 
-	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+	node = next_node(*rotor, current->mems_allowed);
 	if (node == MAX_NUMNODES)
 		node = first_node(current->mems_allowed);
-	current->cpuset_mem_spread_rotor = node;
+	*rotor = node;
 	return node;
 }
+
+int cpuset_mem_spread_node(void)
+{
+	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
+}
+
+int cpuset_slab_spread_node(void)
+{
+	return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
+}
+
 EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
 
 /**
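To see the rotor in use: a spread-aware allocator asks for the next node on every allocation, so pages fan out across the cpuset instead of piling up on the local node. A sketch modeled on __page_cache_alloc() in mm/filemap.c, with an illustrative function name:

	/* Illustrative: spread page cache allocations across the cpuset. */
	static struct page *page_cache_alloc_spread(gfp_t gfp)
	{
		if (cpuset_do_page_mem_spread()) {
			int n = cpuset_mem_spread_node();	/* advances the rotor */
			return alloc_pages_exact_node(n, gfp, 0);
		}
		return alloc_pages(gfp, 0);
	}

cpuset_slab_spread_node() works the same way for slab allocations but keeps its own rotor, so the two allocation streams do not interleave on a single cursor.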