aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c1658
1 files changed, 667 insertions, 991 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 312f8b95c2d4..c62acf45d3b9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
231#endif 231#endif
232 232
233/* 233/*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237static DEFINE_MUTEX(sched_domains_mutex); 237static DEFINE_MUTEX(sched_domains_mutex);
@@ -312,6 +312,9 @@ struct cfs_rq {
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315#ifndef CONFIG_64BIT
316 u64 min_vruntime_copy;
317#endif
315 318
316 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
317 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
@@ -325,7 +328,9 @@ struct cfs_rq {
325 */ 328 */
326 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
327 330
331#ifdef CONFIG_SCHED_DEBUG
328 unsigned int nr_spread_over; 332 unsigned int nr_spread_over;
333#endif
329 334
330#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
331 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -417,6 +422,7 @@ struct rt_rq {
417 */ 422 */
418struct root_domain { 423struct root_domain {
419 atomic_t refcount; 424 atomic_t refcount;
425 struct rcu_head rcu;
420 cpumask_var_t span; 426 cpumask_var_t span;
421 cpumask_var_t online; 427 cpumask_var_t online;
422 428
@@ -460,7 +466,7 @@ struct rq {
460 u64 nohz_stamp; 466 u64 nohz_stamp;
461 unsigned char nohz_balance_kick; 467 unsigned char nohz_balance_kick;
462#endif 468#endif
463 unsigned int skip_clock_update; 469 int skip_clock_update;
464 470
465 /* capture load from *all* tasks on this cpu: */ 471 /* capture load from *all* tasks on this cpu: */
466 struct load_weight load; 472 struct load_weight load;
@@ -553,6 +559,10 @@ struct rq {
553 unsigned int ttwu_count; 559 unsigned int ttwu_count;
554 unsigned int ttwu_local; 560 unsigned int ttwu_local;
555#endif 561#endif
562
563#ifdef CONFIG_SMP
564 struct task_struct *wake_list;
565#endif
556}; 566};
557 567
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 568static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
571 581
572#define rcu_dereference_check_sched_domain(p) \ 582#define rcu_dereference_check_sched_domain(p) \
573 rcu_dereference_check((p), \ 583 rcu_dereference_check((p), \
574 rcu_read_lock_sched_held() || \ 584 rcu_read_lock_held() || \
575 lockdep_is_held(&sched_domains_mutex)) 585 lockdep_is_held(&sched_domains_mutex))
576 586
577/* 587/*
@@ -596,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
596 * Return the group to which this tasks belongs. 606 * Return the group to which this tasks belongs.
597 * 607 *
598 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification
599 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 609 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
600 * holds that lock for each task it moves into the cgroup. Therefore 610 * holds that lock for each task it moves into the cgroup. Therefore
601 * by holding that lock, we pin the task to the current cgroup. 611 * by holding that lock, we pin the task to the current cgroup.
602 */ 612 */
@@ -606,7 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
607 617
608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
609 lockdep_is_held(&task_rq(p)->lock)); 619 lockdep_is_held(&p->pi_lock));
610 tg = container_of(css, struct task_group, css); 620 tg = container_of(css, struct task_group, css);
611 621
612 return autogroup_task_group(p, tg); 622 return autogroup_task_group(p, tg);
@@ -642,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
642{ 652{
643 s64 delta; 653 s64 delta;
644 654
645 if (rq->skip_clock_update) 655 if (rq->skip_clock_update > 0)
646 return; 656 return;
647 657
648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 658 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -838,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
838 return rq->curr == p; 848 return rq->curr == p;
839} 849}
840 850
841#ifndef __ARCH_WANT_UNLOCKED_CTXSW
842static inline int task_running(struct rq *rq, struct task_struct *p) 851static inline int task_running(struct rq *rq, struct task_struct *p)
843{ 852{
853#ifdef CONFIG_SMP
854 return p->on_cpu;
855#else
844 return task_current(rq, p); 856 return task_current(rq, p);
857#endif
845} 858}
846 859
860#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 861static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
848{ 862{
863#ifdef CONFIG_SMP
864 /*
865 * We can optimise this out completely for !SMP, because the
866 * SMP rebalancing from interrupt is the only thing that cares
867 * here.
868 */
869 next->on_cpu = 1;
870#endif
849} 871}
850 872
851static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 873static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
852{ 874{
875#ifdef CONFIG_SMP
876 /*
877 * After ->on_cpu is cleared, the task can be moved to a different CPU.
878 * We must ensure this doesn't happen until the switch is completely
879 * finished.
880 */
881 smp_wmb();
882 prev->on_cpu = 0;
883#endif
853#ifdef CONFIG_DEBUG_SPINLOCK 884#ifdef CONFIG_DEBUG_SPINLOCK
854 /* this is a valid case when another task releases the spinlock */ 885 /* this is a valid case when another task releases the spinlock */
855 rq->lock.owner = current; 886 rq->lock.owner = current;
@@ -865,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
865} 896}
866 897
867#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 898#else /* __ARCH_WANT_UNLOCKED_CTXSW */
868static inline int task_running(struct rq *rq, struct task_struct *p)
869{
870#ifdef CONFIG_SMP
871 return p->oncpu;
872#else
873 return task_current(rq, p);
874#endif
875}
876
877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 899static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
878{ 900{
879#ifdef CONFIG_SMP 901#ifdef CONFIG_SMP
@@ -882,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 * SMP rebalancing from interrupt is the only thing that cares 904 * SMP rebalancing from interrupt is the only thing that cares
883 * here. 905 * here.
884 */ 906 */
885 next->oncpu = 1; 907 next->on_cpu = 1;
886#endif 908#endif
887#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 909#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
888 raw_spin_unlock_irq(&rq->lock); 910 raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
895{ 917{
896#ifdef CONFIG_SMP 918#ifdef CONFIG_SMP
897 /* 919 /*
898 * After ->oncpu is cleared, the task can be moved to a different CPU. 920 * After ->on_cpu is cleared, the task can be moved to a different CPU.
899 * We must ensure this doesn't happen until the switch is completely 921 * We must ensure this doesn't happen until the switch is completely
900 * finished. 922 * finished.
901 */ 923 */
902 smp_wmb(); 924 smp_wmb();
903 prev->oncpu = 0; 925 prev->on_cpu = 0;
904#endif 926#endif
905#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 927#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 local_irq_enable(); 928 local_irq_enable();
@@ -909,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
909#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 931#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
910 932
911/* 933/*
912 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 934 * __task_rq_lock - lock the rq @p resides on.
913 * against ttwu().
914 */
915static inline int task_is_waking(struct task_struct *p)
916{
917 return unlikely(p->state == TASK_WAKING);
918}
919
920/*
921 * __task_rq_lock - lock the runqueue a given task resides on.
922 * Must be called interrupts disabled.
923 */ 935 */
924static inline struct rq *__task_rq_lock(struct task_struct *p) 936static inline struct rq *__task_rq_lock(struct task_struct *p)
925 __acquires(rq->lock) 937 __acquires(rq->lock)
926{ 938{
927 struct rq *rq; 939 struct rq *rq;
928 940
941 lockdep_assert_held(&p->pi_lock);
942
929 for (;;) { 943 for (;;) {
930 rq = task_rq(p); 944 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
@@ -936,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936} 950}
937 951
938/* 952/*
939 * task_rq_lock - lock the runqueue a given task resides on and disable 953 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
940 * interrupts. Note the ordering: we can safely lookup the task_rq without
941 * explicitly disabling preemption.
942 */ 954 */
943static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 955static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
956 __acquires(p->pi_lock)
944 __acquires(rq->lock) 957 __acquires(rq->lock)
945{ 958{
946 struct rq *rq; 959 struct rq *rq;
947 960
948 for (;;) { 961 for (;;) {
949 local_irq_save(*flags); 962 raw_spin_lock_irqsave(&p->pi_lock, *flags);
950 rq = task_rq(p); 963 rq = task_rq(p);
951 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p)))
953 return rq; 966 return rq;
954 raw_spin_unlock_irqrestore(&rq->lock, *flags); 967 raw_spin_unlock(&rq->lock);
968 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
955 } 969 }
956} 970}
957 971
@@ -961,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
961 raw_spin_unlock(&rq->lock); 975 raw_spin_unlock(&rq->lock);
962} 976}
963 977
964static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 978static inline void
979task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
965 __releases(rq->lock) 980 __releases(rq->lock)
981 __releases(p->pi_lock)
966{ 982{
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 983 raw_spin_unlock(&rq->lock);
984 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
968} 985}
969 986
970/* 987/*
@@ -1193,11 +1210,17 @@ int get_nohz_timer_target(void)
1193 int i; 1210 int i;
1194 struct sched_domain *sd; 1211 struct sched_domain *sd;
1195 1212
1213 rcu_read_lock();
1196 for_each_domain(cpu, sd) { 1214 for_each_domain(cpu, sd) {
1197 for_each_cpu(i, sched_domain_span(sd)) 1215 for_each_cpu(i, sched_domain_span(sd)) {
1198 if (!idle_cpu(i)) 1216 if (!idle_cpu(i)) {
1199 return i; 1217 cpu = i;
1218 goto unlock;
1219 }
1220 }
1200 } 1221 }
1222unlock:
1223 rcu_read_unlock();
1201 return cpu; 1224 return cpu;
1202} 1225}
1203/* 1226/*
@@ -1307,15 +1330,15 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1307{ 1330{
1308 u64 tmp; 1331 u64 tmp;
1309 1332
1333 tmp = (u64)delta_exec * weight;
1334
1310 if (!lw->inv_weight) { 1335 if (!lw->inv_weight) {
1311 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1336 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1312 lw->inv_weight = 1; 1337 lw->inv_weight = 1;
1313 else 1338 else
1314 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1339 lw->inv_weight = WMULT_CONST / lw->weight;
1315 / (lw->weight+1);
1316 } 1340 }
1317 1341
1318 tmp = (u64)delta_exec * weight;
1319 /* 1342 /*
1320 * Check whether we'd overflow the 64-bit multiplication: 1343 * Check whether we'd overflow the 64-bit multiplication:
1321 */ 1344 */
@@ -1773,7 +1796,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1773 update_rq_clock(rq); 1796 update_rq_clock(rq);
1774 sched_info_queued(p); 1797 sched_info_queued(p);
1775 p->sched_class->enqueue_task(rq, p, flags); 1798 p->sched_class->enqueue_task(rq, p, flags);
1776 p->se.on_rq = 1;
1777} 1799}
1778 1800
1779static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1801static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1803,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1781 update_rq_clock(rq); 1803 update_rq_clock(rq);
1782 sched_info_dequeued(p); 1804 sched_info_dequeued(p);
1783 p->sched_class->dequeue_task(rq, p, flags); 1805 p->sched_class->dequeue_task(rq, p, flags);
1784 p->se.on_rq = 0;
1785} 1806}
1786 1807
1787/* 1808/*
@@ -2116,7 +2137,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2116 * A queue event has occurred, and we're going to schedule. In 2137 * A queue event has occurred, and we're going to schedule. In
2117 * this case, we can save a useless back to back clock update. 2138 * this case, we can save a useless back to back clock update.
2118 */ 2139 */
2119 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2140 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2120 rq->skip_clock_update = 1; 2141 rq->skip_clock_update = 1;
2121} 2142}
2122 2143
@@ -2162,6 +2183,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2162 */ 2183 */
2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2184 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2185 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2186
2187#ifdef CONFIG_LOCKDEP
2188 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2189 lockdep_is_held(&task_rq(p)->lock)));
2190#endif
2165#endif 2191#endif
2166 2192
2167 trace_sched_migrate_task(p, new_cpu); 2193 trace_sched_migrate_task(p, new_cpu);
@@ -2182,19 +2208,6 @@ struct migration_arg {
2182static int migration_cpu_stop(void *data); 2208static int migration_cpu_stop(void *data);
2183 2209
2184/* 2210/*
2185 * The task's runqueue lock must be held.
2186 * Returns true if you have to wait for migration thread.
2187 */
2188static bool migrate_task(struct task_struct *p, struct rq *rq)
2189{
2190 /*
2191 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task.
2193 */
2194 return p->se.on_rq || task_running(rq, p);
2195}
2196
2197/*
2198 * wait_task_inactive - wait for a thread to unschedule. 2211 * wait_task_inactive - wait for a thread to unschedule.
2199 * 2212 *
2200 * If @match_state is nonzero, it's the @p->state value just checked and 2213 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2251,11 +2264,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2251 rq = task_rq_lock(p, &flags); 2264 rq = task_rq_lock(p, &flags);
2252 trace_sched_wait_task(p); 2265 trace_sched_wait_task(p);
2253 running = task_running(rq, p); 2266 running = task_running(rq, p);
2254 on_rq = p->se.on_rq; 2267 on_rq = p->on_rq;
2255 ncsw = 0; 2268 ncsw = 0;
2256 if (!match_state || p->state == match_state) 2269 if (!match_state || p->state == match_state)
2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2270 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2258 task_rq_unlock(rq, &flags); 2271 task_rq_unlock(rq, p, &flags);
2259 2272
2260 /* 2273 /*
2261 * If it changed from the expected state, bail out now. 2274 * If it changed from the expected state, bail out now.
@@ -2330,7 +2343,7 @@ EXPORT_SYMBOL_GPL(kick_process);
2330 2343
2331#ifdef CONFIG_SMP 2344#ifdef CONFIG_SMP
2332/* 2345/*
2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2346 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2334 */ 2347 */
2335static int select_fallback_rq(int cpu, struct task_struct *p) 2348static int select_fallback_rq(int cpu, struct task_struct *p)
2336{ 2349{
@@ -2363,12 +2376,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2363} 2376}
2364 2377
2365/* 2378/*
2366 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2379 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2367 */ 2380 */
2368static inline 2381static inline
2369int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2382int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2370{ 2383{
2371 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2384 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2372 2385
2373 /* 2386 /*
2374 * In order not to call set_task_cpu() on a blocking task we need 2387 * In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2407,62 @@ static void update_avg(u64 *avg, u64 sample)
2394} 2407}
2395#endif 2408#endif
2396 2409
2397static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2410static void
2398 bool is_sync, bool is_migrate, bool is_local, 2411ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2399 unsigned long en_flags)
2400{ 2412{
2413#ifdef CONFIG_SCHEDSTATS
2414 struct rq *rq = this_rq();
2415
2416#ifdef CONFIG_SMP
2417 int this_cpu = smp_processor_id();
2418
2419 if (cpu == this_cpu) {
2420 schedstat_inc(rq, ttwu_local);
2421 schedstat_inc(p, se.statistics.nr_wakeups_local);
2422 } else {
2423 struct sched_domain *sd;
2424
2425 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2426 rcu_read_lock();
2427 for_each_domain(this_cpu, sd) {
2428 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2429 schedstat_inc(sd, ttwu_wake_remote);
2430 break;
2431 }
2432 }
2433 rcu_read_unlock();
2434 }
2435#endif /* CONFIG_SMP */
2436
2437 schedstat_inc(rq, ttwu_count);
2401 schedstat_inc(p, se.statistics.nr_wakeups); 2438 schedstat_inc(p, se.statistics.nr_wakeups);
2402 if (is_sync) 2439
2440 if (wake_flags & WF_SYNC)
2403 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2441 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2404 if (is_migrate) 2442
2443 if (cpu != task_cpu(p))
2405 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2444 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2406 if (is_local)
2407 schedstat_inc(p, se.statistics.nr_wakeups_local);
2408 else
2409 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2410 2445
2446#endif /* CONFIG_SCHEDSTATS */
2447}
2448
2449static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2450{
2411 activate_task(rq, p, en_flags); 2451 activate_task(rq, p, en_flags);
2452 p->on_rq = 1;
2453
2454 /* if a worker is waking up, notify workqueue */
2455 if (p->flags & PF_WQ_WORKER)
2456 wq_worker_waking_up(p, cpu_of(rq));
2412} 2457}
2413 2458
2414static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2459/*
2415 int wake_flags, bool success) 2460 * Mark the task runnable and perform wakeup-preemption.
2461 */
2462static void
2463ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2416{ 2464{
2417 trace_sched_wakeup(p, success); 2465 trace_sched_wakeup(p, true);
2418 check_preempt_curr(rq, p, wake_flags); 2466 check_preempt_curr(rq, p, wake_flags);
2419 2467
2420 p->state = TASK_RUNNING; 2468 p->state = TASK_RUNNING;
@@ -2433,9 +2481,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2433 rq->idle_stamp = 0; 2481 rq->idle_stamp = 0;
2434 } 2482 }
2435#endif 2483#endif
2436 /* if a worker is waking up, notify workqueue */ 2484}
2437 if ((p->flags & PF_WQ_WORKER) && success) 2485
2438 wq_worker_waking_up(p, cpu_of(rq)); 2486static void
2487ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2488{
2489#ifdef CONFIG_SMP
2490 if (p->sched_contributes_to_load)
2491 rq->nr_uninterruptible--;
2492#endif
2493
2494 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2495 ttwu_do_wakeup(rq, p, wake_flags);
2496}
2497
2498/*
2499 * Called in case the task @p isn't fully descheduled from its runqueue,
2500 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2501 * since all we need to do is flip p->state to TASK_RUNNING, since
2502 * the task is still ->on_rq.
2503 */
2504static int ttwu_remote(struct task_struct *p, int wake_flags)
2505{
2506 struct rq *rq;
2507 int ret = 0;
2508
2509 rq = __task_rq_lock(p);
2510 if (p->on_rq) {
2511 ttwu_do_wakeup(rq, p, wake_flags);
2512 ret = 1;
2513 }
2514 __task_rq_unlock(rq);
2515
2516 return ret;
2517}
2518
2519#ifdef CONFIG_SMP
2520static void sched_ttwu_pending(void)
2521{
2522 struct rq *rq = this_rq();
2523 struct task_struct *list = xchg(&rq->wake_list, NULL);
2524
2525 if (!list)
2526 return;
2527
2528 raw_spin_lock(&rq->lock);
2529
2530 while (list) {
2531 struct task_struct *p = list;
2532 list = list->wake_entry;
2533 ttwu_do_activate(rq, p, 0);
2534 }
2535
2536 raw_spin_unlock(&rq->lock);
2537}
2538
2539void scheduler_ipi(void)
2540{
2541 sched_ttwu_pending();
2542}
2543
2544static void ttwu_queue_remote(struct task_struct *p, int cpu)
2545{
2546 struct rq *rq = cpu_rq(cpu);
2547 struct task_struct *next = rq->wake_list;
2548
2549 for (;;) {
2550 struct task_struct *old = next;
2551
2552 p->wake_entry = next;
2553 next = cmpxchg(&rq->wake_list, old, p);
2554 if (next == old)
2555 break;
2556 }
2557
2558 if (!next)
2559 smp_send_reschedule(cpu);
2560}
2561#endif
2562
2563static void ttwu_queue(struct task_struct *p, int cpu)
2564{
2565 struct rq *rq = cpu_rq(cpu);
2566
2567#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
2568 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2569 ttwu_queue_remote(p, cpu);
2570 return;
2571 }
2572#endif
2573
2574 raw_spin_lock(&rq->lock);
2575 ttwu_do_activate(rq, p, 0);
2576 raw_spin_unlock(&rq->lock);
2439} 2577}
2440 2578
2441/** 2579/**
@@ -2453,92 +2591,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2453 * Returns %true if @p was woken up, %false if it was already running 2591 * Returns %true if @p was woken up, %false if it was already running
2454 * or @state didn't match @p's state. 2592 * or @state didn't match @p's state.
2455 */ 2593 */
2456static int try_to_wake_up(struct task_struct *p, unsigned int state, 2594static int
2457 int wake_flags) 2595try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2458{ 2596{
2459 int cpu, orig_cpu, this_cpu, success = 0;
2460 unsigned long flags; 2597 unsigned long flags;
2461 unsigned long en_flags = ENQUEUE_WAKEUP; 2598 int cpu, success = 0;
2462 struct rq *rq;
2463
2464 this_cpu = get_cpu();
2465 2599
2466 smp_wmb(); 2600 smp_wmb();
2467 rq = task_rq_lock(p, &flags); 2601 raw_spin_lock_irqsave(&p->pi_lock, flags);
2468 if (!(p->state & state)) 2602 if (!(p->state & state))
2469 goto out; 2603 goto out;
2470 2604
2471 if (p->se.on_rq) 2605 success = 1; /* we're going to change ->state */
2472 goto out_running;
2473
2474 cpu = task_cpu(p); 2606 cpu = task_cpu(p);
2475 orig_cpu = cpu;
2476 2607
2477#ifdef CONFIG_SMP 2608 if (p->on_rq && ttwu_remote(p, wake_flags))
2478 if (unlikely(task_running(rq, p))) 2609 goto stat;
2479 goto out_activate;
2480 2610
2611#ifdef CONFIG_SMP
2481 /* 2612 /*
2482 * In order to handle concurrent wakeups and release the rq->lock 2613 * If the owning (remote) cpu is still in the middle of schedule() with
2483 * we put the task in TASK_WAKING state. 2614 * this task as prev, wait until its done referencing the task.
2484 *
2485 * First fix up the nr_uninterruptible count:
2486 */ 2615 */
2487 if (task_contributes_to_load(p)) { 2616 while (p->on_cpu) {
2488 if (likely(cpu_online(orig_cpu))) 2617#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2489 rq->nr_uninterruptible--; 2618 /*
2490 else 2619 * If called from interrupt context we could have landed in the
2491 this_rq()->nr_uninterruptible--; 2620 * middle of schedule(), in this case we should take care not
2492 } 2621 * to spin on ->on_cpu if p is current, since that would
2493 p->state = TASK_WAKING; 2622 * deadlock.
2494 2623 */
2495 if (p->sched_class->task_waking) { 2624 if (p == current) {
2496 p->sched_class->task_waking(rq, p); 2625 ttwu_queue(p, cpu);
2497 en_flags |= ENQUEUE_WAKING; 2626 goto stat;
2627 }
2628#endif
2629 cpu_relax();
2498 } 2630 }
2499
2500 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2501 if (cpu != orig_cpu)
2502 set_task_cpu(p, cpu);
2503 __task_rq_unlock(rq);
2504
2505 rq = cpu_rq(cpu);
2506 raw_spin_lock(&rq->lock);
2507
2508 /* 2631 /*
2509 * We migrated the task without holding either rq->lock, however 2632 * Pairs with the smp_wmb() in finish_lock_switch().
2510 * since the task is not on the task list itself, nobody else
2511 * will try and migrate the task, hence the rq should match the
2512 * cpu we just moved it to.
2513 */ 2633 */
2514 WARN_ON(task_cpu(p) != cpu); 2634 smp_rmb();
2515 WARN_ON(p->state != TASK_WAKING);
2516 2635
2517#ifdef CONFIG_SCHEDSTATS 2636 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2518 schedstat_inc(rq, ttwu_count); 2637 p->state = TASK_WAKING;
2519 if (cpu == this_cpu) 2638
2520 schedstat_inc(rq, ttwu_local); 2639 if (p->sched_class->task_waking)
2521 else { 2640 p->sched_class->task_waking(p);
2522 struct sched_domain *sd;
2523 for_each_domain(this_cpu, sd) {
2524 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2525 schedstat_inc(sd, ttwu_wake_remote);
2526 break;
2527 }
2528 }
2529 }
2530#endif /* CONFIG_SCHEDSTATS */
2531 2641
2532out_activate: 2642 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2643 if (task_cpu(p) != cpu)
2644 set_task_cpu(p, cpu);
2533#endif /* CONFIG_SMP */ 2645#endif /* CONFIG_SMP */
2534 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2646
2535 cpu == this_cpu, en_flags); 2647 ttwu_queue(p, cpu);
2536 success = 1; 2648stat:
2537out_running: 2649 ttwu_stat(p, cpu, wake_flags);
2538 ttwu_post_activation(p, rq, wake_flags, success);
2539out: 2650out:
2540 task_rq_unlock(rq, &flags); 2651 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2541 put_cpu();
2542 2652
2543 return success; 2653 return success;
2544} 2654}
@@ -2547,31 +2657,34 @@ out:
2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2657 * try_to_wake_up_local - try to wake up a local task with rq lock held
2548 * @p: the thread to be awakened 2658 * @p: the thread to be awakened
2549 * 2659 *
2550 * Put @p on the run-queue if it's not already there. The caller must 2660 * Put @p on the run-queue if it's not already there. The caller must
2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2661 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2552 * the current task. this_rq() stays locked over invocation. 2662 * the current task.
2553 */ 2663 */
2554static void try_to_wake_up_local(struct task_struct *p) 2664static void try_to_wake_up_local(struct task_struct *p)
2555{ 2665{
2556 struct rq *rq = task_rq(p); 2666 struct rq *rq = task_rq(p);
2557 bool success = false;
2558 2667
2559 BUG_ON(rq != this_rq()); 2668 BUG_ON(rq != this_rq());
2560 BUG_ON(p == current); 2669 BUG_ON(p == current);
2561 lockdep_assert_held(&rq->lock); 2670 lockdep_assert_held(&rq->lock);
2562 2671
2672 if (!raw_spin_trylock(&p->pi_lock)) {
2673 raw_spin_unlock(&rq->lock);
2674 raw_spin_lock(&p->pi_lock);
2675 raw_spin_lock(&rq->lock);
2676 }
2677
2563 if (!(p->state & TASK_NORMAL)) 2678 if (!(p->state & TASK_NORMAL))
2564 return; 2679 goto out;
2565 2680
2566 if (!p->se.on_rq) { 2681 if (!p->on_rq)
2567 if (likely(!task_running(rq, p))) { 2682 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2568 schedstat_inc(rq, ttwu_count); 2683
2569 schedstat_inc(rq, ttwu_local); 2684 ttwu_do_wakeup(rq, p, 0);
2570 } 2685 ttwu_stat(p, smp_processor_id(), 0);
2571 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2686out:
2572 success = true; 2687 raw_spin_unlock(&p->pi_lock);
2573 }
2574 ttwu_post_activation(p, rq, 0, success);
2575} 2688}
2576 2689
2577/** 2690/**
@@ -2604,19 +2717,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2604 */ 2717 */
2605static void __sched_fork(struct task_struct *p) 2718static void __sched_fork(struct task_struct *p)
2606{ 2719{
2720 p->on_rq = 0;
2721
2722 p->se.on_rq = 0;
2607 p->se.exec_start = 0; 2723 p->se.exec_start = 0;
2608 p->se.sum_exec_runtime = 0; 2724 p->se.sum_exec_runtime = 0;
2609 p->se.prev_sum_exec_runtime = 0; 2725 p->se.prev_sum_exec_runtime = 0;
2610 p->se.nr_migrations = 0; 2726 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0; 2727 p->se.vruntime = 0;
2728 INIT_LIST_HEAD(&p->se.group_node);
2612 2729
2613#ifdef CONFIG_SCHEDSTATS 2730#ifdef CONFIG_SCHEDSTATS
2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2731 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2615#endif 2732#endif
2616 2733
2617 INIT_LIST_HEAD(&p->rt.run_list); 2734 INIT_LIST_HEAD(&p->rt.run_list);
2618 p->se.on_rq = 0;
2619 INIT_LIST_HEAD(&p->se.group_node);
2620 2735
2621#ifdef CONFIG_PREEMPT_NOTIFIERS 2736#ifdef CONFIG_PREEMPT_NOTIFIERS
2622 INIT_HLIST_HEAD(&p->preempt_notifiers); 2737 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2626,8 +2741,9 @@ static void __sched_fork(struct task_struct *p)
2626/* 2741/*
2627 * fork()/clone()-time setup: 2742 * fork()/clone()-time setup:
2628 */ 2743 */
2629void sched_fork(struct task_struct *p, int clone_flags) 2744void sched_fork(struct task_struct *p)
2630{ 2745{
2746 unsigned long flags;
2631 int cpu = get_cpu(); 2747 int cpu = get_cpu();
2632 2748
2633 __sched_fork(p); 2749 __sched_fork(p);
@@ -2678,16 +2794,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2678 * 2794 *
2679 * Silence PROVE_RCU. 2795 * Silence PROVE_RCU.
2680 */ 2796 */
2681 rcu_read_lock(); 2797 raw_spin_lock_irqsave(&p->pi_lock, flags);
2682 set_task_cpu(p, cpu); 2798 set_task_cpu(p, cpu);
2683 rcu_read_unlock(); 2799 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2684 2800
2685#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2801#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2686 if (likely(sched_info_on())) 2802 if (likely(sched_info_on()))
2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2803 memset(&p->sched_info, 0, sizeof(p->sched_info));
2688#endif 2804#endif
2689#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2805#if defined(CONFIG_SMP)
2690 p->oncpu = 0; 2806 p->on_cpu = 0;
2691#endif 2807#endif
2692#ifdef CONFIG_PREEMPT 2808#ifdef CONFIG_PREEMPT
2693 /* Want to start with kernel preemption disabled. */ 2809 /* Want to start with kernel preemption disabled. */
@@ -2707,41 +2823,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2707 * that must be done for every newly created context, then puts the task 2823 * that must be done for every newly created context, then puts the task
2708 * on the runqueue and wakes it. 2824 * on the runqueue and wakes it.
2709 */ 2825 */
2710void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2826void wake_up_new_task(struct task_struct *p)
2711{ 2827{
2712 unsigned long flags; 2828 unsigned long flags;
2713 struct rq *rq; 2829 struct rq *rq;
2714 int cpu __maybe_unused = get_cpu();
2715 2830
2831 raw_spin_lock_irqsave(&p->pi_lock, flags);
2716#ifdef CONFIG_SMP 2832#ifdef CONFIG_SMP
2717 rq = task_rq_lock(p, &flags);
2718 p->state = TASK_WAKING;
2719
2720 /* 2833 /*
2721 * Fork balancing, do it here and not earlier because: 2834 * Fork balancing, do it here and not earlier because:
2722 * - cpus_allowed can change in the fork path 2835 * - cpus_allowed can change in the fork path
2723 * - any previously selected cpu might disappear through hotplug 2836 * - any previously selected cpu might disappear through hotplug
2724 *
2725 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2726 * without people poking at ->cpus_allowed.
2727 */ 2837 */
2728 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2838 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2729 set_task_cpu(p, cpu);
2730
2731 p->state = TASK_RUNNING;
2732 task_rq_unlock(rq, &flags);
2733#endif 2839#endif
2734 2840
2735 rq = task_rq_lock(p, &flags); 2841 rq = __task_rq_lock(p);
2736 activate_task(rq, p, 0); 2842 activate_task(rq, p, 0);
2737 trace_sched_wakeup_new(p, 1); 2843 p->on_rq = 1;
2844 trace_sched_wakeup_new(p, true);
2738 check_preempt_curr(rq, p, WF_FORK); 2845 check_preempt_curr(rq, p, WF_FORK);
2739#ifdef CONFIG_SMP 2846#ifdef CONFIG_SMP
2740 if (p->sched_class->task_woken) 2847 if (p->sched_class->task_woken)
2741 p->sched_class->task_woken(rq, p); 2848 p->sched_class->task_woken(rq, p);
2742#endif 2849#endif
2743 task_rq_unlock(rq, &flags); 2850 task_rq_unlock(rq, p, &flags);
2744 put_cpu();
2745} 2851}
2746 2852
2747#ifdef CONFIG_PREEMPT_NOTIFIERS 2853#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3556,22 @@ void sched_exec(void)
3450{ 3556{
3451 struct task_struct *p = current; 3557 struct task_struct *p = current;
3452 unsigned long flags; 3558 unsigned long flags;
3453 struct rq *rq;
3454 int dest_cpu; 3559 int dest_cpu;
3455 3560
3456 rq = task_rq_lock(p, &flags); 3561 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3562 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3458 if (dest_cpu == smp_processor_id()) 3563 if (dest_cpu == smp_processor_id())
3459 goto unlock; 3564 goto unlock;
3460 3565
3461 /* 3566 if (likely(cpu_active(dest_cpu))) {
3462 * select_task_rq() can race against ->cpus_allowed
3463 */
3464 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3465 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3466 struct migration_arg arg = { p, dest_cpu }; 3567 struct migration_arg arg = { p, dest_cpu };
3467 3568
3468 task_rq_unlock(rq, &flags); 3569 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3469 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3570 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3470 return; 3571 return;
3471 } 3572 }
3472unlock: 3573unlock:
3473 task_rq_unlock(rq, &flags); 3574 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3474} 3575}
3475 3576
3476#endif 3577#endif
@@ -3507,7 +3608,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3507 3608
3508 rq = task_rq_lock(p, &flags); 3609 rq = task_rq_lock(p, &flags);
3509 ns = do_task_delta_exec(p, rq); 3610 ns = do_task_delta_exec(p, rq);
3510 task_rq_unlock(rq, &flags); 3611 task_rq_unlock(rq, p, &flags);
3511 3612
3512 return ns; 3613 return ns;
3513} 3614}
@@ -3525,7 +3626,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3525 3626
3526 rq = task_rq_lock(p, &flags); 3627 rq = task_rq_lock(p, &flags);
3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3628 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3528 task_rq_unlock(rq, &flags); 3629 task_rq_unlock(rq, p, &flags);
3529 3630
3530 return ns; 3631 return ns;
3531} 3632}
@@ -3549,7 +3650,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3549 rq = task_rq_lock(p, &flags); 3650 rq = task_rq_lock(p, &flags);
3550 thread_group_cputime(p, &totals); 3651 thread_group_cputime(p, &totals);
3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3652 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3552 task_rq_unlock(rq, &flags); 3653 task_rq_unlock(rq, p, &flags);
3553 3654
3554 return ns; 3655 return ns;
3555} 3656}
@@ -3903,9 +4004,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3903/* 4004/*
3904 * This function gets called by the timer code, with HZ frequency. 4005 * This function gets called by the timer code, with HZ frequency.
3905 * We call it with interrupts disabled. 4006 * We call it with interrupts disabled.
3906 *
3907 * It also gets called by the fork code, when changing the parent's
3908 * timeslices.
3909 */ 4007 */
3910void scheduler_tick(void) 4008void scheduler_tick(void)
3911{ 4009{
@@ -4025,17 +4123,11 @@ static inline void schedule_debug(struct task_struct *prev)
4025 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4123 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4026 4124
4027 schedstat_inc(this_rq(), sched_count); 4125 schedstat_inc(this_rq(), sched_count);
4028#ifdef CONFIG_SCHEDSTATS
4029 if (unlikely(prev->lock_depth >= 0)) {
4030 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4031 schedstat_inc(prev, sched_info.bkl_count);
4032 }
4033#endif
4034} 4126}
4035 4127
4036static void put_prev_task(struct rq *rq, struct task_struct *prev) 4128static void put_prev_task(struct rq *rq, struct task_struct *prev)
4037{ 4129{
4038 if (prev->se.on_rq) 4130 if (prev->on_rq || rq->skip_clock_update < 0)
4039 update_rq_clock(rq); 4131 update_rq_clock(rq);
4040 prev->sched_class->put_prev_task(rq, prev); 4132 prev->sched_class->put_prev_task(rq, prev);
4041} 4133}
@@ -4097,11 +4189,13 @@ need_resched:
4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4189 if (unlikely(signal_pending_state(prev->state, prev))) {
4098 prev->state = TASK_RUNNING; 4190 prev->state = TASK_RUNNING;
4099 } else { 4191 } else {
4192 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4193 prev->on_rq = 0;
4194
4100 /* 4195 /*
4101 * If a worker is going to sleep, notify and 4196 * If a worker went to sleep, notify and ask workqueue
4102 * ask workqueue whether it wants to wake up a 4197 * whether it wants to wake up a task to maintain
4103 * task to maintain concurrency. If so, wake 4198 * concurrency.
4104 * up the task.
4105 */ 4199 */
4106 if (prev->flags & PF_WQ_WORKER) { 4200 if (prev->flags & PF_WQ_WORKER) {
4107 struct task_struct *to_wakeup; 4201 struct task_struct *to_wakeup;
@@ -4110,11 +4204,10 @@ need_resched:
4110 if (to_wakeup) 4204 if (to_wakeup)
4111 try_to_wake_up_local(to_wakeup); 4205 try_to_wake_up_local(to_wakeup);
4112 } 4206 }
4113 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4114 4207
4115 /* 4208 /*
4116 * If we are going to sleep and we have plugged IO queued, make 4209 * If we are going to sleep and we have plugged IO
4117 * sure to submit it to avoid deadlocks. 4210 * queued, make sure to submit it to avoid deadlocks.
4118 */ 4211 */
4119 if (blk_needs_flush_plug(prev)) { 4212 if (blk_needs_flush_plug(prev)) {
4120 raw_spin_unlock(&rq->lock); 4213 raw_spin_unlock(&rq->lock);
@@ -4161,70 +4254,53 @@ need_resched:
4161EXPORT_SYMBOL(schedule); 4254EXPORT_SYMBOL(schedule);
4162 4255
4163#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4256#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4164/*
4165 * Look out! "owner" is an entirely speculative pointer
4166 * access and not reliable.
4167 */
4168int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4169{
4170 unsigned int cpu;
4171 struct rq *rq;
4172 4257
4173 if (!sched_feat(OWNER_SPIN)) 4258static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4174 return 0; 4259{
4260 bool ret = false;
4175 4261
4176#ifdef CONFIG_DEBUG_PAGEALLOC 4262 rcu_read_lock();
4177 /* 4263 if (lock->owner != owner)
4178 * Need to access the cpu field knowing that 4264 goto fail;
4179 * DEBUG_PAGEALLOC could have unmapped it if
4180 * the mutex owner just released it and exited.
4181 */
4182 if (probe_kernel_address(&owner->cpu, cpu))
4183 return 0;
4184#else
4185 cpu = owner->cpu;
4186#endif
4187 4265
4188 /* 4266 /*
4189 * Even if the access succeeded (likely case), 4267 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4190 * the cpu field may no longer be valid. 4268 * lock->owner still matches owner, if that fails, owner might
4269 * point to free()d memory, if it still matches, the rcu_read_lock()
4270 * ensures the memory stays valid.
4191 */ 4271 */
4192 if (cpu >= nr_cpumask_bits) 4272 barrier();
4193 return 0;
4194 4273
4195 /* 4274 ret = owner->on_cpu;
4196 * We need to validate that we can do a 4275fail:
4197 * get_cpu() and that we have the percpu area. 4276 rcu_read_unlock();
4198 */
4199 if (!cpu_online(cpu))
4200 return 0;
4201 4277
4202 rq = cpu_rq(cpu); 4278 return ret;
4279}
4203 4280
4204 for (;;) { 4281/*
4205 /* 4282 * Look out! "owner" is an entirely speculative pointer
4206 * Owner changed, break to re-assess state. 4283 * access and not reliable.
4207 */ 4284 */
4208 if (lock->owner != owner) { 4285int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4209 /* 4286{
4210 * If the lock has switched to a different owner, 4287 if (!sched_feat(OWNER_SPIN))
4211 * we likely have heavy contention. Return 0 to quit 4288 return 0;
4212 * optimistic spinning and not contend further:
4213 */
4214 if (lock->owner)
4215 return 0;
4216 break;
4217 }
4218 4289
4219 /* 4290 while (owner_running(lock, owner)) {
4220 * Is that owner really running on that cpu? 4291 if (need_resched())
4221 */
4222 if (task_thread_info(rq->curr) != owner || need_resched())
4223 return 0; 4292 return 0;
4224 4293
4225 arch_mutex_cpu_relax(); 4294 arch_mutex_cpu_relax();
4226 } 4295 }
4227 4296
4297 /*
4298 * If the owner changed to another task there is likely
4299 * heavy contention, stop spinning.
4300 */
4301 if (lock->owner)
4302 return 0;
4303
4228 return 1; 4304 return 1;
4229} 4305}
4230#endif 4306#endif
@@ -4684,19 +4760,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4684 */ 4760 */
4685void rt_mutex_setprio(struct task_struct *p, int prio) 4761void rt_mutex_setprio(struct task_struct *p, int prio)
4686{ 4762{
4687 unsigned long flags;
4688 int oldprio, on_rq, running; 4763 int oldprio, on_rq, running;
4689 struct rq *rq; 4764 struct rq *rq;
4690 const struct sched_class *prev_class; 4765 const struct sched_class *prev_class;
4691 4766
4692 BUG_ON(prio < 0 || prio > MAX_PRIO); 4767 BUG_ON(prio < 0 || prio > MAX_PRIO);
4693 4768
4694 rq = task_rq_lock(p, &flags); 4769 rq = __task_rq_lock(p);
4695 4770
4696 trace_sched_pi_setprio(p, prio); 4771 trace_sched_pi_setprio(p, prio);
4697 oldprio = p->prio; 4772 oldprio = p->prio;
4698 prev_class = p->sched_class; 4773 prev_class = p->sched_class;
4699 on_rq = p->se.on_rq; 4774 on_rq = p->on_rq;
4700 running = task_current(rq, p); 4775 running = task_current(rq, p);
4701 if (on_rq) 4776 if (on_rq)
4702 dequeue_task(rq, p, 0); 4777 dequeue_task(rq, p, 0);
@@ -4716,7 +4791,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4791 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4717 4792
4718 check_class_changed(rq, p, prev_class, oldprio); 4793 check_class_changed(rq, p, prev_class, oldprio);
4719 task_rq_unlock(rq, &flags); 4794 __task_rq_unlock(rq);
4720} 4795}
4721 4796
4722#endif 4797#endif
@@ -4744,7 +4819,7 @@ void set_user_nice(struct task_struct *p, long nice)
4744 p->static_prio = NICE_TO_PRIO(nice); 4819 p->static_prio = NICE_TO_PRIO(nice);
4745 goto out_unlock; 4820 goto out_unlock;
4746 } 4821 }
4747 on_rq = p->se.on_rq; 4822 on_rq = p->on_rq;
4748 if (on_rq) 4823 if (on_rq)
4749 dequeue_task(rq, p, 0); 4824 dequeue_task(rq, p, 0);
4750 4825
@@ -4764,7 +4839,7 @@ void set_user_nice(struct task_struct *p, long nice)
4764 resched_task(rq->curr); 4839 resched_task(rq->curr);
4765 } 4840 }
4766out_unlock: 4841out_unlock:
4767 task_rq_unlock(rq, &flags); 4842 task_rq_unlock(rq, p, &flags);
4768} 4843}
4769EXPORT_SYMBOL(set_user_nice); 4844EXPORT_SYMBOL(set_user_nice);
4770 4845
@@ -4878,8 +4953,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4878static void 4953static void
4879__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4954__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4880{ 4955{
4881 BUG_ON(p->se.on_rq);
4882
4883 p->policy = policy; 4956 p->policy = policy;
4884 p->rt_priority = prio; 4957 p->rt_priority = prio;
4885 p->normal_prio = normal_prio(p); 4958 p->normal_prio = normal_prio(p);
@@ -4994,20 +5067,17 @@ recheck:
4994 /* 5067 /*
4995 * make sure no PI-waiters arrive (or leave) while we are 5068 * make sure no PI-waiters arrive (or leave) while we are
4996 * changing the priority of the task: 5069 * changing the priority of the task:
4997 */ 5070 *
4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4999 /*
5000 * To be able to change p->policy safely, the appropriate 5071 * To be able to change p->policy safely, the appropriate
5001 * runqueue lock must be held. 5072 * runqueue lock must be held.
5002 */ 5073 */
5003 rq = __task_rq_lock(p); 5074 rq = task_rq_lock(p, &flags);
5004 5075
5005 /* 5076 /*
5006 * Changing the policy of the stop threads its a very bad idea 5077 * Changing the policy of the stop threads its a very bad idea
5007 */ 5078 */
5008 if (p == rq->stop) { 5079 if (p == rq->stop) {
5009 __task_rq_unlock(rq); 5080 task_rq_unlock(rq, p, &flags);
5010 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5011 return -EINVAL; 5081 return -EINVAL;
5012 } 5082 }
5013 5083
@@ -5031,8 +5101,7 @@ recheck:
5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5101 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5102 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5033 !task_group_is_autogroup(task_group(p))) { 5103 !task_group_is_autogroup(task_group(p))) {
5034 __task_rq_unlock(rq); 5104 task_rq_unlock(rq, p, &flags);
5035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5036 return -EPERM; 5105 return -EPERM;
5037 } 5106 }
5038 } 5107 }
@@ -5041,11 +5110,10 @@ recheck:
5041 /* recheck policy now with rq lock held */ 5110 /* recheck policy now with rq lock held */
5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5111 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5043 policy = oldpolicy = -1; 5112 policy = oldpolicy = -1;
5044 __task_rq_unlock(rq); 5113 task_rq_unlock(rq, p, &flags);
5045 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5046 goto recheck; 5114 goto recheck;
5047 } 5115 }
5048 on_rq = p->se.on_rq; 5116 on_rq = p->on_rq;
5049 running = task_current(rq, p); 5117 running = task_current(rq, p);
5050 if (on_rq) 5118 if (on_rq)
5051 deactivate_task(rq, p, 0); 5119 deactivate_task(rq, p, 0);
@@ -5064,8 +5132,7 @@ recheck:
5064 activate_task(rq, p, 0); 5132 activate_task(rq, p, 0);
5065 5133
5066 check_class_changed(rq, p, prev_class, oldprio); 5134 check_class_changed(rq, p, prev_class, oldprio);
5067 __task_rq_unlock(rq); 5135 task_rq_unlock(rq, p, &flags);
5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069 5136
5070 rt_mutex_adjust_pi(p); 5137 rt_mutex_adjust_pi(p);
5071 5138
@@ -5316,7 +5383,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5316{ 5383{
5317 struct task_struct *p; 5384 struct task_struct *p;
5318 unsigned long flags; 5385 unsigned long flags;
5319 struct rq *rq;
5320 int retval; 5386 int retval;
5321 5387
5322 get_online_cpus(); 5388 get_online_cpus();
@@ -5331,9 +5397,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5331 if (retval) 5397 if (retval)
5332 goto out_unlock; 5398 goto out_unlock;
5333 5399
5334 rq = task_rq_lock(p, &flags); 5400 raw_spin_lock_irqsave(&p->pi_lock, flags);
5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5401 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5336 task_rq_unlock(rq, &flags); 5402 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5337 5403
5338out_unlock: 5404out_unlock:
5339 rcu_read_unlock(); 5405 rcu_read_unlock();
@@ -5658,7 +5724,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5658 5724
5659 rq = task_rq_lock(p, &flags); 5725 rq = task_rq_lock(p, &flags);
5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5726 time_slice = p->sched_class->get_rr_interval(rq, p);
5661 task_rq_unlock(rq, &flags); 5727 task_rq_unlock(rq, p, &flags);
5662 5728
5663 rcu_read_unlock(); 5729 rcu_read_unlock();
5664 jiffies_to_timespec(time_slice, &t); 5730 jiffies_to_timespec(time_slice, &t);
@@ -5776,17 +5842,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5776 rcu_read_unlock(); 5842 rcu_read_unlock();
5777 5843
5778 rq->curr = rq->idle = idle; 5844 rq->curr = rq->idle = idle;
5779#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5845#if defined(CONFIG_SMP)
5780 idle->oncpu = 1; 5846 idle->on_cpu = 1;
5781#endif 5847#endif
5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5848 raw_spin_unlock_irqrestore(&rq->lock, flags);
5783 5849
5784 /* Set the preempt count _outside_ the spinlocks! */ 5850 /* Set the preempt count _outside_ the spinlocks! */
5785#if defined(CONFIG_PREEMPT)
5786 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5787#else
5788 task_thread_info(idle)->preempt_count = 0; 5851 task_thread_info(idle)->preempt_count = 0;
5789#endif 5852
5790 /* 5853 /*
5791 * The idle tasks have their own, simple scheduling class: 5854 * The idle tasks have their own, simple scheduling class:
5792 */ 5855 */
@@ -5881,26 +5944,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5881 unsigned int dest_cpu; 5944 unsigned int dest_cpu;
5882 int ret = 0; 5945 int ret = 0;
5883 5946
5884 /*
5885 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5886 * drop the rq->lock and still rely on ->cpus_allowed.
5887 */
5888again:
5889 while (task_is_waking(p))
5890 cpu_relax();
5891 rq = task_rq_lock(p, &flags); 5947 rq = task_rq_lock(p, &flags);
5892 if (task_is_waking(p)) { 5948
5893 task_rq_unlock(rq, &flags); 5949 if (cpumask_equal(&p->cpus_allowed, new_mask))
5894 goto again; 5950 goto out;
5895 }
5896 5951
5897 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5952 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5898 ret = -EINVAL; 5953 ret = -EINVAL;
5899 goto out; 5954 goto out;
5900 } 5955 }
5901 5956
5902 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5957 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5903 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5904 ret = -EINVAL; 5958 ret = -EINVAL;
5905 goto out; 5959 goto out;
5906 } 5960 }
@@ -5917,16 +5971,16 @@ again:
5917 goto out; 5971 goto out;
5918 5972
5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5973 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5920 if (migrate_task(p, rq)) { 5974 if (p->on_rq) {
5921 struct migration_arg arg = { p, dest_cpu }; 5975 struct migration_arg arg = { p, dest_cpu };
5922 /* Need help from migration thread: drop lock and wait. */ 5976 /* Need help from migration thread: drop lock and wait. */
5923 task_rq_unlock(rq, &flags); 5977 task_rq_unlock(rq, p, &flags);
5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5978 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5925 tlb_migrate_finish(p->mm); 5979 tlb_migrate_finish(p->mm);
5926 return 0; 5980 return 0;
5927 } 5981 }
5928out: 5982out:
5929 task_rq_unlock(rq, &flags); 5983 task_rq_unlock(rq, p, &flags);
5930 5984
5931 return ret; 5985 return ret;
5932} 5986}
@@ -5954,6 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5954 rq_src = cpu_rq(src_cpu); 6008 rq_src = cpu_rq(src_cpu);
5955 rq_dest = cpu_rq(dest_cpu); 6009 rq_dest = cpu_rq(dest_cpu);
5956 6010
6011 raw_spin_lock(&p->pi_lock);
5957 double_rq_lock(rq_src, rq_dest); 6012 double_rq_lock(rq_src, rq_dest);
5958 /* Already moved. */ 6013 /* Already moved. */
5959 if (task_cpu(p) != src_cpu) 6014 if (task_cpu(p) != src_cpu)
@@ -5966,7 +6021,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5966 * If we're not on a rq, the next wake-up will ensure we're 6021 * If we're not on a rq, the next wake-up will ensure we're
5967 * placed properly. 6022 * placed properly.
5968 */ 6023 */
5969 if (p->se.on_rq) { 6024 if (p->on_rq) {
5970 deactivate_task(rq_src, p, 0); 6025 deactivate_task(rq_src, p, 0);
5971 set_task_cpu(p, dest_cpu); 6026 set_task_cpu(p, dest_cpu);
5972 activate_task(rq_dest, p, 0); 6027 activate_task(rq_dest, p, 0);
@@ -5976,6 +6031,7 @@ done:
5976 ret = 1; 6031 ret = 1;
5977fail: 6032fail:
5978 double_rq_unlock(rq_src, rq_dest); 6033 double_rq_unlock(rq_src, rq_dest);
6034 raw_spin_unlock(&p->pi_lock);
5979 return ret; 6035 return ret;
5980} 6036}
5981 6037
@@ -6316,6 +6372,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6316 6372
6317#ifdef CONFIG_HOTPLUG_CPU 6373#ifdef CONFIG_HOTPLUG_CPU
6318 case CPU_DYING: 6374 case CPU_DYING:
6375 sched_ttwu_pending();
6319 /* Update our root-domain */ 6376 /* Update our root-domain */
6320 raw_spin_lock_irqsave(&rq->lock, flags); 6377 raw_spin_lock_irqsave(&rq->lock, flags);
6321 if (rq->rd) { 6378 if (rq->rd) {
@@ -6394,6 +6451,8 @@ early_initcall(migration_init);
6394 6451
6395#ifdef CONFIG_SMP 6452#ifdef CONFIG_SMP
6396 6453
6454static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6455
6397#ifdef CONFIG_SCHED_DEBUG 6456#ifdef CONFIG_SCHED_DEBUG
6398 6457
6399static __read_mostly int sched_domain_debug_enabled; 6458static __read_mostly int sched_domain_debug_enabled;
@@ -6489,7 +6548,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6489 6548
6490static void sched_domain_debug(struct sched_domain *sd, int cpu) 6549static void sched_domain_debug(struct sched_domain *sd, int cpu)
6491{ 6550{
6492 cpumask_var_t groupmask;
6493 int level = 0; 6551 int level = 0;
6494 6552
6495 if (!sched_domain_debug_enabled) 6553 if (!sched_domain_debug_enabled)
@@ -6502,20 +6560,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6502 6560
6503 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6561 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6504 6562
6505 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6506 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6507 return;
6508 }
6509
6510 for (;;) { 6563 for (;;) {
6511 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6564 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6512 break; 6565 break;
6513 level++; 6566 level++;
6514 sd = sd->parent; 6567 sd = sd->parent;
6515 if (!sd) 6568 if (!sd)
6516 break; 6569 break;
6517 } 6570 }
6518 free_cpumask_var(groupmask);
6519} 6571}
6520#else /* !CONFIG_SCHED_DEBUG */ 6572#else /* !CONFIG_SCHED_DEBUG */
6521# define sched_domain_debug(sd, cpu) do { } while (0) 6573# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6572,12 +6624,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6572 return 1; 6624 return 1;
6573} 6625}
6574 6626
6575static void free_rootdomain(struct root_domain *rd) 6627static void free_rootdomain(struct rcu_head *rcu)
6576{ 6628{
6577 synchronize_sched(); 6629 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6578 6630
6579 cpupri_cleanup(&rd->cpupri); 6631 cpupri_cleanup(&rd->cpupri);
6580
6581 free_cpumask_var(rd->rto_mask); 6632 free_cpumask_var(rd->rto_mask);
6582 free_cpumask_var(rd->online); 6633 free_cpumask_var(rd->online);
6583 free_cpumask_var(rd->span); 6634 free_cpumask_var(rd->span);
@@ -6618,7 +6669,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6618 raw_spin_unlock_irqrestore(&rq->lock, flags); 6669 raw_spin_unlock_irqrestore(&rq->lock, flags);
6619 6670
6620 if (old_rd) 6671 if (old_rd)
6621 free_rootdomain(old_rd); 6672 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6622} 6673}
6623 6674
6624static int init_rootdomain(struct root_domain *rd) 6675static int init_rootdomain(struct root_domain *rd)
@@ -6669,6 +6720,25 @@ static struct root_domain *alloc_rootdomain(void)
6669 return rd; 6720 return rd;
6670} 6721}
6671 6722
6723static void free_sched_domain(struct rcu_head *rcu)
6724{
6725 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6726 if (atomic_dec_and_test(&sd->groups->ref))
6727 kfree(sd->groups);
6728 kfree(sd);
6729}
6730
6731static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6732{
6733 call_rcu(&sd->rcu, free_sched_domain);
6734}
6735
6736static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6737{
6738 for (; sd; sd = sd->parent)
6739 destroy_sched_domain(sd, cpu);
6740}
6741
6672/* 6742/*
6673 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6743 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6674 * hold the hotplug lock. 6744 * hold the hotplug lock.
@@ -6679,9 +6749,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6679 struct rq *rq = cpu_rq(cpu); 6749 struct rq *rq = cpu_rq(cpu);
6680 struct sched_domain *tmp; 6750 struct sched_domain *tmp;
6681 6751
6682 for (tmp = sd; tmp; tmp = tmp->parent)
6683 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6684
6685 /* Remove the sched domains which do not contribute to scheduling. */ 6752 /* Remove the sched domains which do not contribute to scheduling. */
6686 for (tmp = sd; tmp; ) { 6753 for (tmp = sd; tmp; ) {
6687 struct sched_domain *parent = tmp->parent; 6754 struct sched_domain *parent = tmp->parent;
@@ -6692,12 +6759,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6692 tmp->parent = parent->parent; 6759 tmp->parent = parent->parent;
6693 if (parent->parent) 6760 if (parent->parent)
6694 parent->parent->child = tmp; 6761 parent->parent->child = tmp;
6762 destroy_sched_domain(parent, cpu);
6695 } else 6763 } else
6696 tmp = tmp->parent; 6764 tmp = tmp->parent;
6697 } 6765 }
6698 6766
6699 if (sd && sd_degenerate(sd)) { 6767 if (sd && sd_degenerate(sd)) {
6768 tmp = sd;
6700 sd = sd->parent; 6769 sd = sd->parent;
6770 destroy_sched_domain(tmp, cpu);
6701 if (sd) 6771 if (sd)
6702 sd->child = NULL; 6772 sd->child = NULL;
6703 } 6773 }
@@ -6705,7 +6775,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6705 sched_domain_debug(sd, cpu); 6775 sched_domain_debug(sd, cpu);
6706 6776
6707 rq_attach_root(rq, rd); 6777 rq_attach_root(rq, rd);
6778 tmp = rq->sd;
6708 rcu_assign_pointer(rq->sd, sd); 6779 rcu_assign_pointer(rq->sd, sd);
6780 destroy_sched_domains(tmp, cpu);
6709} 6781}
6710 6782
6711/* cpus with isolated domains */ 6783/* cpus with isolated domains */
@@ -6721,56 +6793,6 @@ static int __init isolated_cpu_setup(char *str)
6721 6793
6722__setup("isolcpus=", isolated_cpu_setup); 6794__setup("isolcpus=", isolated_cpu_setup);
6723 6795
6724/*
6725 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6726 * to a function which identifies what group(along with sched group) a CPU
6727 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6728 * (due to the fact that we keep track of groups covered with a struct cpumask).
6729 *
6730 * init_sched_build_groups will build a circular linked list of the groups
6731 * covered by the given span, and will set each group's ->cpumask correctly,
6732 * and ->cpu_power to 0.
6733 */
6734static void
6735init_sched_build_groups(const struct cpumask *span,
6736 const struct cpumask *cpu_map,
6737 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6738 struct sched_group **sg,
6739 struct cpumask *tmpmask),
6740 struct cpumask *covered, struct cpumask *tmpmask)
6741{
6742 struct sched_group *first = NULL, *last = NULL;
6743 int i;
6744
6745 cpumask_clear(covered);
6746
6747 for_each_cpu(i, span) {
6748 struct sched_group *sg;
6749 int group = group_fn(i, cpu_map, &sg, tmpmask);
6750 int j;
6751
6752 if (cpumask_test_cpu(i, covered))
6753 continue;
6754
6755 cpumask_clear(sched_group_cpus(sg));
6756 sg->cpu_power = 0;
6757
6758 for_each_cpu(j, span) {
6759 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6760 continue;
6761
6762 cpumask_set_cpu(j, covered);
6763 cpumask_set_cpu(j, sched_group_cpus(sg));
6764 }
6765 if (!first)
6766 first = sg;
6767 if (last)
6768 last->next = sg;
6769 last = sg;
6770 }
6771 last->next = first;
6772}
6773
6774#define SD_NODES_PER_DOMAIN 16 6796#define SD_NODES_PER_DOMAIN 16
6775 6797
6776#ifdef CONFIG_NUMA 6798#ifdef CONFIG_NUMA
@@ -6787,7 +6809,7 @@ init_sched_build_groups(const struct cpumask *span,
6787 */ 6809 */
6788static int find_next_best_node(int node, nodemask_t *used_nodes) 6810static int find_next_best_node(int node, nodemask_t *used_nodes)
6789{ 6811{
6790 int i, n, val, min_val, best_node = 0; 6812 int i, n, val, min_val, best_node = -1;
6791 6813
6792 min_val = INT_MAX; 6814 min_val = INT_MAX;
6793 6815
@@ -6811,7 +6833,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6811 } 6833 }
6812 } 6834 }
6813 6835
6814 node_set(best_node, *used_nodes); 6836 if (best_node != -1)
6837 node_set(best_node, *used_nodes);
6815 return best_node; 6838 return best_node;
6816} 6839}
6817 6840
@@ -6837,315 +6860,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6837 6860
6838 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6861 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6839 int next_node = find_next_best_node(node, &used_nodes); 6862 int next_node = find_next_best_node(node, &used_nodes);
6840 6863 if (next_node < 0)
6864 break;
6841 cpumask_or(span, span, cpumask_of_node(next_node)); 6865 cpumask_or(span, span, cpumask_of_node(next_node));
6842 } 6866 }
6843} 6867}
6868
6869static const struct cpumask *cpu_node_mask(int cpu)
6870{
6871 lockdep_assert_held(&sched_domains_mutex);
6872
6873 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6874
6875 return sched_domains_tmpmask;
6876}
6877
6878static const struct cpumask *cpu_allnodes_mask(int cpu)
6879{
6880 return cpu_possible_mask;
6881}
6844#endif /* CONFIG_NUMA */ 6882#endif /* CONFIG_NUMA */
6845 6883
6846int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6884static const struct cpumask *cpu_cpu_mask(int cpu)
6885{
6886 return cpumask_of_node(cpu_to_node(cpu));
6887}
6847 6888
6848/* 6889int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6849 * The cpus mask in sched_group and sched_domain hangs off the end.
6850 *
6851 * ( See the comments in include/linux/sched.h:struct sched_group
6852 * and struct sched_domain. )
6853 */
6854struct static_sched_group {
6855 struct sched_group sg;
6856 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6857};
6858 6890
6859struct static_sched_domain { 6891struct sd_data {
6860 struct sched_domain sd; 6892 struct sched_domain **__percpu sd;
6861 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6893 struct sched_group **__percpu sg;
6862}; 6894};
6863 6895
6864struct s_data { 6896struct s_data {
6865#ifdef CONFIG_NUMA 6897 struct sched_domain ** __percpu sd;
6866 int sd_allnodes;
6867 cpumask_var_t domainspan;
6868 cpumask_var_t covered;
6869 cpumask_var_t notcovered;
6870#endif
6871 cpumask_var_t nodemask;
6872 cpumask_var_t this_sibling_map;
6873 cpumask_var_t this_core_map;
6874 cpumask_var_t this_book_map;
6875 cpumask_var_t send_covered;
6876 cpumask_var_t tmpmask;
6877 struct sched_group **sched_group_nodes;
6878 struct root_domain *rd; 6898 struct root_domain *rd;
6879}; 6899};
6880 6900
6881enum s_alloc { 6901enum s_alloc {
6882 sa_sched_groups = 0,
6883 sa_rootdomain, 6902 sa_rootdomain,
6884 sa_tmpmask, 6903 sa_sd,
6885 sa_send_covered, 6904 sa_sd_storage,
6886 sa_this_book_map,
6887 sa_this_core_map,
6888 sa_this_sibling_map,
6889 sa_nodemask,
6890 sa_sched_group_nodes,
6891#ifdef CONFIG_NUMA
6892 sa_notcovered,
6893 sa_covered,
6894 sa_domainspan,
6895#endif
6896 sa_none, 6905 sa_none,
6897}; 6906};
6898 6907
6899/* 6908struct sched_domain_topology_level;
6900 * SMT sched-domains:
6901 */
6902#ifdef CONFIG_SCHED_SMT
6903static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6904static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6905 6909
6906static int 6910typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6907cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6911typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6908 struct sched_group **sg, struct cpumask *unused)
6909{
6910 if (sg)
6911 *sg = &per_cpu(sched_groups, cpu).sg;
6912 return cpu;
6913}
6914#endif /* CONFIG_SCHED_SMT */
6915 6912
6916/* 6913struct sched_domain_topology_level {
6917 * multi-core sched-domains: 6914 sched_domain_init_f init;
6918 */ 6915 sched_domain_mask_f mask;
6919#ifdef CONFIG_SCHED_MC 6916 struct sd_data data;
6920static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6917};
6921static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6922
6923static int
6924cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6925 struct sched_group **sg, struct cpumask *mask)
6926{
6927 int group;
6928#ifdef CONFIG_SCHED_SMT
6929 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6930 group = cpumask_first(mask);
6931#else
6932 group = cpu;
6933#endif
6934 if (sg)
6935 *sg = &per_cpu(sched_group_core, group).sg;
6936 return group;
6937}
6938#endif /* CONFIG_SCHED_MC */
6939 6918
6940/* 6919/*
6941 * book sched-domains: 6920 * Assumes the sched_domain tree is fully constructed
6942 */ 6921 */
6943#ifdef CONFIG_SCHED_BOOK 6922static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6944static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6945static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6946
6947static int
6948cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6949 struct sched_group **sg, struct cpumask *mask)
6950{ 6923{
6951 int group = cpu; 6924 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6952#ifdef CONFIG_SCHED_MC 6925 struct sched_domain *child = sd->child;
6953 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6954 group = cpumask_first(mask);
6955#elif defined(CONFIG_SCHED_SMT)
6956 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6957 group = cpumask_first(mask);
6958#endif
6959 if (sg)
6960 *sg = &per_cpu(sched_group_book, group).sg;
6961 return group;
6962}
6963#endif /* CONFIG_SCHED_BOOK */
6964 6926
6965static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6927 if (child)
6966static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6928 cpu = cpumask_first(sched_domain_span(child));
6967 6929
6968static int
6969cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6970 struct sched_group **sg, struct cpumask *mask)
6971{
6972 int group;
6973#ifdef CONFIG_SCHED_BOOK
6974 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6975 group = cpumask_first(mask);
6976#elif defined(CONFIG_SCHED_MC)
6977 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6978 group = cpumask_first(mask);
6979#elif defined(CONFIG_SCHED_SMT)
6980 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6981 group = cpumask_first(mask);
6982#else
6983 group = cpu;
6984#endif
6985 if (sg) 6930 if (sg)
6986 *sg = &per_cpu(sched_group_phys, group).sg; 6931 *sg = *per_cpu_ptr(sdd->sg, cpu);
6987 return group; 6932
6933 return cpu;
6988} 6934}
6989 6935
6990#ifdef CONFIG_NUMA
6991/* 6936/*
6992 * The init_sched_build_groups can't handle what we want to do with node 6937 * build_sched_groups takes the cpumask we wish to span, and a pointer
6993 * groups, so roll our own. Now each node has its own list of groups which 6938 * to a function which identifies what group (along with sched group) a CPU
6994 * gets dynamically allocated. 6939 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6940 * (due to the fact that we keep track of groups covered with a struct cpumask).
6941 *
6942 * build_sched_groups will build a circular linked list of the groups
6943 * covered by the given span, and will set each group's ->cpumask correctly,
6944 * and ->cpu_power to 0.
6995 */ 6945 */
6996static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6946static void
6997static struct sched_group ***sched_group_nodes_bycpu; 6947build_sched_groups(struct sched_domain *sd)
6998
6999static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
7000static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7001
7002static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7003 struct sched_group **sg,
7004 struct cpumask *nodemask)
7005{
7006 int group;
7007
7008 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7009 group = cpumask_first(nodemask);
7010
7011 if (sg)
7012 *sg = &per_cpu(sched_group_allnodes, group).sg;
7013 return group;
7014}
7015
7016static void init_numa_sched_groups_power(struct sched_group *group_head)
7017{
7018 struct sched_group *sg = group_head;
7019 int j;
7020
7021 if (!sg)
7022 return;
7023 do {
7024 for_each_cpu(j, sched_group_cpus(sg)) {
7025 struct sched_domain *sd;
7026
7027 sd = &per_cpu(phys_domains, j).sd;
7028 if (j != group_first_cpu(sd->groups)) {
7029 /*
7030 * Only add "power" once for each
7031 * physical package.
7032 */
7033 continue;
7034 }
7035
7036 sg->cpu_power += sd->groups->cpu_power;
7037 }
7038 sg = sg->next;
7039 } while (sg != group_head);
7040}
7041
7042static int build_numa_sched_groups(struct s_data *d,
7043 const struct cpumask *cpu_map, int num)
7044{ 6948{
7045 struct sched_domain *sd; 6949 struct sched_group *first = NULL, *last = NULL;
7046 struct sched_group *sg, *prev; 6950 struct sd_data *sdd = sd->private;
7047 int n, j; 6951 const struct cpumask *span = sched_domain_span(sd);
7048 6952 struct cpumask *covered;
7049 cpumask_clear(d->covered); 6953 int i;
7050 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
7051 if (cpumask_empty(d->nodemask)) {
7052 d->sched_group_nodes[num] = NULL;
7053 goto out;
7054 }
7055
7056 sched_domain_node_span(num, d->domainspan);
7057 cpumask_and(d->domainspan, d->domainspan, cpu_map);
7058
7059 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7060 GFP_KERNEL, num);
7061 if (!sg) {
7062 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
7063 num);
7064 return -ENOMEM;
7065 }
7066 d->sched_group_nodes[num] = sg;
7067
7068 for_each_cpu(j, d->nodemask) {
7069 sd = &per_cpu(node_domains, j).sd;
7070 sd->groups = sg;
7071 }
7072
7073 sg->cpu_power = 0;
7074 cpumask_copy(sched_group_cpus(sg), d->nodemask);
7075 sg->next = sg;
7076 cpumask_or(d->covered, d->covered, d->nodemask);
7077 6954
7078 prev = sg; 6955 lockdep_assert_held(&sched_domains_mutex);
7079 for (j = 0; j < nr_node_ids; j++) { 6956 covered = sched_domains_tmpmask;
7080 n = (num + j) % nr_node_ids;
7081 cpumask_complement(d->notcovered, d->covered);
7082 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7083 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7084 if (cpumask_empty(d->tmpmask))
7085 break;
7086 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7087 if (cpumask_empty(d->tmpmask))
7088 continue;
7089 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7090 GFP_KERNEL, num);
7091 if (!sg) {
7092 printk(KERN_WARNING
7093 "Can not alloc domain group for node %d\n", j);
7094 return -ENOMEM;
7095 }
7096 sg->cpu_power = 0;
7097 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7098 sg->next = prev->next;
7099 cpumask_or(d->covered, d->covered, d->tmpmask);
7100 prev->next = sg;
7101 prev = sg;
7102 }
7103out:
7104 return 0;
7105}
7106#endif /* CONFIG_NUMA */
7107 6957
7108#ifdef CONFIG_NUMA 6958 cpumask_clear(covered);
7109/* Free memory allocated for various sched_group structures */
7110static void free_sched_groups(const struct cpumask *cpu_map,
7111 struct cpumask *nodemask)
7112{
7113 int cpu, i;
7114 6959
7115 for_each_cpu(cpu, cpu_map) { 6960 for_each_cpu(i, span) {
7116 struct sched_group **sched_group_nodes 6961 struct sched_group *sg;
7117 = sched_group_nodes_bycpu[cpu]; 6962 int group = get_group(i, sdd, &sg);
6963 int j;
7118 6964
7119 if (!sched_group_nodes) 6965 if (cpumask_test_cpu(i, covered))
7120 continue; 6966 continue;
7121 6967
7122 for (i = 0; i < nr_node_ids; i++) { 6968 cpumask_clear(sched_group_cpus(sg));
7123 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6969 sg->cpu_power = 0;
7124 6970
7125 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6971 for_each_cpu(j, span) {
7126 if (cpumask_empty(nodemask)) 6972 if (get_group(j, sdd, NULL) != group)
7127 continue; 6973 continue;
7128 6974
7129 if (sg == NULL) 6975 cpumask_set_cpu(j, covered);
7130 continue; 6976 cpumask_set_cpu(j, sched_group_cpus(sg));
7131 sg = sg->next;
7132next_sg:
7133 oldsg = sg;
7134 sg = sg->next;
7135 kfree(oldsg);
7136 if (oldsg != sched_group_nodes[i])
7137 goto next_sg;
7138 } 6977 }
7139 kfree(sched_group_nodes); 6978
7140 sched_group_nodes_bycpu[cpu] = NULL; 6979 if (!first)
6980 first = sg;
6981 if (last)
6982 last->next = sg;
6983 last = sg;
7141 } 6984 }
6985 last->next = first;
7142} 6986}
7143#else /* !CONFIG_NUMA */
7144static void free_sched_groups(const struct cpumask *cpu_map,
7145 struct cpumask *nodemask)
7146{
7147}
7148#endif /* CONFIG_NUMA */
7149 6987
7150/* 6988/*
7151 * Initialize sched groups cpu_power. 6989 * Initialize sched groups cpu_power.
@@ -7159,11 +6997,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
7159 */ 6997 */
7160static void init_sched_groups_power(int cpu, struct sched_domain *sd) 6998static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7161{ 6999{
7162 struct sched_domain *child;
7163 struct sched_group *group;
7164 long power;
7165 int weight;
7166
7167 WARN_ON(!sd || !sd->groups); 7000 WARN_ON(!sd || !sd->groups);
7168 7001
7169 if (cpu != group_first_cpu(sd->groups)) 7002 if (cpu != group_first_cpu(sd->groups))
@@ -7171,36 +7004,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7171 7004
7172 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7005 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7173 7006
7174 child = sd->child; 7007 update_group_power(sd, cpu);
7175
7176 sd->groups->cpu_power = 0;
7177
7178 if (!child) {
7179 power = SCHED_LOAD_SCALE;
7180 weight = cpumask_weight(sched_domain_span(sd));
7181 /*
7182 * SMT siblings share the power of a single core.
7183 * Usually multiple threads get a better yield out of
7184 * that one core than a single thread would have,
7185 * reflect that in sd->smt_gain.
7186 */
7187 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7188 power *= sd->smt_gain;
7189 power /= weight;
7190 power >>= SCHED_LOAD_SHIFT;
7191 }
7192 sd->groups->cpu_power += power;
7193 return;
7194 }
7195
7196 /*
7197 * Add cpu_power of each child group to this groups cpu_power.
7198 */
7199 group = child->groups;
7200 do {
7201 sd->groups->cpu_power += group->cpu_power;
7202 group = group->next;
7203 } while (group != child->groups);
7204} 7008}
7205 7009
7206/* 7010/*
@@ -7214,15 +7018,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7214# define SD_INIT_NAME(sd, type) do { } while (0) 7018# define SD_INIT_NAME(sd, type) do { } while (0)
7215#endif 7019#endif
7216 7020
7217#define SD_INIT(sd, type) sd_init_##type(sd) 7021#define SD_INIT_FUNC(type) \
7218 7022static noinline struct sched_domain * \
7219#define SD_INIT_FUNC(type) \ 7023sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7220static noinline void sd_init_##type(struct sched_domain *sd) \ 7024{ \
7221{ \ 7025 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7222 memset(sd, 0, sizeof(*sd)); \ 7026 *sd = SD_##type##_INIT; \
7223 *sd = SD_##type##_INIT; \ 7027 SD_INIT_NAME(sd, type); \
7224 sd->level = SD_LV_##type; \ 7028 sd->private = &tl->data; \
7225 SD_INIT_NAME(sd, type); \ 7029 return sd; \
7226} 7030}
7227 7031
7228SD_INIT_FUNC(CPU) 7032SD_INIT_FUNC(CPU)
@@ -7241,13 +7045,14 @@ SD_INIT_FUNC(CPU)
7241#endif 7045#endif
7242 7046
7243static int default_relax_domain_level = -1; 7047static int default_relax_domain_level = -1;
7048int sched_domain_level_max;
7244 7049
7245static int __init setup_relax_domain_level(char *str) 7050static int __init setup_relax_domain_level(char *str)
7246{ 7051{
7247 unsigned long val; 7052 unsigned long val;
7248 7053
7249 val = simple_strtoul(str, NULL, 0); 7054 val = simple_strtoul(str, NULL, 0);
7250 if (val < SD_LV_MAX) 7055 if (val < sched_domain_level_max)
7251 default_relax_domain_level = val; 7056 default_relax_domain_level = val;
7252 7057
7253 return 1; 7058 return 1;
@@ -7275,37 +7080,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7275 } 7080 }
7276} 7081}
7277 7082
7083static void __sdt_free(const struct cpumask *cpu_map);
7084static int __sdt_alloc(const struct cpumask *cpu_map);
7085
7278static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7086static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7279 const struct cpumask *cpu_map) 7087 const struct cpumask *cpu_map)
7280{ 7088{
7281 switch (what) { 7089 switch (what) {
7282 case sa_sched_groups:
7283 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7284 d->sched_group_nodes = NULL;
7285 case sa_rootdomain: 7090 case sa_rootdomain:
7286 free_rootdomain(d->rd); /* fall through */ 7091 if (!atomic_read(&d->rd->refcount))
7287 case sa_tmpmask: 7092 free_rootdomain(&d->rd->rcu); /* fall through */
7288 free_cpumask_var(d->tmpmask); /* fall through */ 7093 case sa_sd:
7289 case sa_send_covered: 7094 free_percpu(d->sd); /* fall through */
7290 free_cpumask_var(d->send_covered); /* fall through */ 7095 case sa_sd_storage:
7291 case sa_this_book_map: 7096 __sdt_free(cpu_map); /* fall through */
7292 free_cpumask_var(d->this_book_map); /* fall through */
7293 case sa_this_core_map:
7294 free_cpumask_var(d->this_core_map); /* fall through */
7295 case sa_this_sibling_map:
7296 free_cpumask_var(d->this_sibling_map); /* fall through */
7297 case sa_nodemask:
7298 free_cpumask_var(d->nodemask); /* fall through */
7299 case sa_sched_group_nodes:
7300#ifdef CONFIG_NUMA
7301 kfree(d->sched_group_nodes); /* fall through */
7302 case sa_notcovered:
7303 free_cpumask_var(d->notcovered); /* fall through */
7304 case sa_covered:
7305 free_cpumask_var(d->covered); /* fall through */
7306 case sa_domainspan:
7307 free_cpumask_var(d->domainspan); /* fall through */
7308#endif
7309 case sa_none: 7097 case sa_none:
7310 break; 7098 break;
7311 } 7099 }
@@ -7314,308 +7102,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7314static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7102static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7315 const struct cpumask *cpu_map) 7103 const struct cpumask *cpu_map)
7316{ 7104{
7317#ifdef CONFIG_NUMA 7105 memset(d, 0, sizeof(*d));
7318 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7106
7319 return sa_none; 7107 if (__sdt_alloc(cpu_map))
7320 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7108 return sa_sd_storage;
7321 return sa_domainspan; 7109 d->sd = alloc_percpu(struct sched_domain *);
7322 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7110 if (!d->sd)
7323 return sa_covered; 7111 return sa_sd_storage;
7324 /* Allocate the per-node list of sched groups */
7325 d->sched_group_nodes = kcalloc(nr_node_ids,
7326 sizeof(struct sched_group *), GFP_KERNEL);
7327 if (!d->sched_group_nodes) {
7328 printk(KERN_WARNING "Can not alloc sched group node list\n");
7329 return sa_notcovered;
7330 }
7331 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7332#endif
7333 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7334 return sa_sched_group_nodes;
7335 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7336 return sa_nodemask;
7337 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7338 return sa_this_sibling_map;
7339 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7340 return sa_this_core_map;
7341 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7342 return sa_this_book_map;
7343 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7344 return sa_send_covered;
7345 d->rd = alloc_rootdomain(); 7112 d->rd = alloc_rootdomain();
7346 if (!d->rd) { 7113 if (!d->rd)
7347 printk(KERN_WARNING "Cannot alloc root domain\n"); 7114 return sa_sd;
7348 return sa_tmpmask;
7349 }
7350 return sa_rootdomain; 7115 return sa_rootdomain;
7351} 7116}
7352 7117
7353static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7118/*
7354 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7119 * NULL the sd_data elements we've used to build the sched_domain and
7120 * sched_group structure so that the subsequent __free_domain_allocs()
7121 * will not free the data we're using.
7122 */
7123static void claim_allocations(int cpu, struct sched_domain *sd)
7355{ 7124{
7356 struct sched_domain *sd = NULL; 7125 struct sd_data *sdd = sd->private;
7357#ifdef CONFIG_NUMA 7126 struct sched_group *sg = sd->groups;
7358 struct sched_domain *parent;
7359
7360 d->sd_allnodes = 0;
7361 if (cpumask_weight(cpu_map) >
7362 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7363 sd = &per_cpu(allnodes_domains, i).sd;
7364 SD_INIT(sd, ALLNODES);
7365 set_domain_attribute(sd, attr);
7366 cpumask_copy(sched_domain_span(sd), cpu_map);
7367 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7368 d->sd_allnodes = 1;
7369 }
7370 parent = sd;
7371
7372 sd = &per_cpu(node_domains, i).sd;
7373 SD_INIT(sd, NODE);
7374 set_domain_attribute(sd, attr);
7375 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7376 sd->parent = parent;
7377 if (parent)
7378 parent->child = sd;
7379 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7380#endif
7381 return sd;
7382}
7383 7127
7384static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7128 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7385 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7129 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7386 struct sched_domain *parent, int i)
7387{
7388 struct sched_domain *sd;
7389 sd = &per_cpu(phys_domains, i).sd;
7390 SD_INIT(sd, CPU);
7391 set_domain_attribute(sd, attr);
7392 cpumask_copy(sched_domain_span(sd), d->nodemask);
7393 sd->parent = parent;
7394 if (parent)
7395 parent->child = sd;
7396 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7397 return sd;
7398}
7399 7130
7400static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7131 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7401 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7132 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7402 struct sched_domain *parent, int i) 7133 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7403{ 7134 }
7404 struct sched_domain *sd = parent;
7405#ifdef CONFIG_SCHED_BOOK
7406 sd = &per_cpu(book_domains, i).sd;
7407 SD_INIT(sd, BOOK);
7408 set_domain_attribute(sd, attr);
7409 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7410 sd->parent = parent;
7411 parent->child = sd;
7412 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7413#endif
7414 return sd;
7415} 7135}
7416 7136
7417static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7137#ifdef CONFIG_SCHED_SMT
7418 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7138static const struct cpumask *cpu_smt_mask(int cpu)
7419 struct sched_domain *parent, int i)
7420{ 7139{
7421 struct sched_domain *sd = parent; 7140 return topology_thread_cpumask(cpu);
7422#ifdef CONFIG_SCHED_MC
7423 sd = &per_cpu(core_domains, i).sd;
7424 SD_INIT(sd, MC);
7425 set_domain_attribute(sd, attr);
7426 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7427 sd->parent = parent;
7428 parent->child = sd;
7429 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7430#endif
7431 return sd;
7432} 7141}
7433
7434static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7435 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7436 struct sched_domain *parent, int i)
7437{
7438 struct sched_domain *sd = parent;
7439#ifdef CONFIG_SCHED_SMT
7440 sd = &per_cpu(cpu_domains, i).sd;
7441 SD_INIT(sd, SIBLING);
7442 set_domain_attribute(sd, attr);
7443 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7444 sd->parent = parent;
7445 parent->child = sd;
7446 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7447#endif 7142#endif
7448 return sd;
7449}
7450 7143
7451static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7144/*
7452 const struct cpumask *cpu_map, int cpu) 7145 * Topology list, bottom-up.
7453{ 7146 */
7454 switch (l) { 7147static struct sched_domain_topology_level default_topology[] = {
7455#ifdef CONFIG_SCHED_SMT 7148#ifdef CONFIG_SCHED_SMT
7456 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7149 { sd_init_SIBLING, cpu_smt_mask, },
7457 cpumask_and(d->this_sibling_map, cpu_map,
7458 topology_thread_cpumask(cpu));
7459 if (cpu == cpumask_first(d->this_sibling_map))
7460 init_sched_build_groups(d->this_sibling_map, cpu_map,
7461 &cpu_to_cpu_group,
7462 d->send_covered, d->tmpmask);
7463 break;
7464#endif 7150#endif
7465#ifdef CONFIG_SCHED_MC 7151#ifdef CONFIG_SCHED_MC
7466 case SD_LV_MC: /* set up multi-core groups */ 7152 { sd_init_MC, cpu_coregroup_mask, },
7467 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7468 if (cpu == cpumask_first(d->this_core_map))
7469 init_sched_build_groups(d->this_core_map, cpu_map,
7470 &cpu_to_core_group,
7471 d->send_covered, d->tmpmask);
7472 break;
7473#endif 7153#endif
7474#ifdef CONFIG_SCHED_BOOK 7154#ifdef CONFIG_SCHED_BOOK
7475 case SD_LV_BOOK: /* set up book groups */ 7155 { sd_init_BOOK, cpu_book_mask, },
7476 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7477 if (cpu == cpumask_first(d->this_book_map))
7478 init_sched_build_groups(d->this_book_map, cpu_map,
7479 &cpu_to_book_group,
7480 d->send_covered, d->tmpmask);
7481 break;
7482#endif 7156#endif
7483 case SD_LV_CPU: /* set up physical groups */ 7157 { sd_init_CPU, cpu_cpu_mask, },
7484 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7485 if (!cpumask_empty(d->nodemask))
7486 init_sched_build_groups(d->nodemask, cpu_map,
7487 &cpu_to_phys_group,
7488 d->send_covered, d->tmpmask);
7489 break;
7490#ifdef CONFIG_NUMA 7158#ifdef CONFIG_NUMA
7491 case SD_LV_ALLNODES: 7159 { sd_init_NODE, cpu_node_mask, },
7492 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7160 { sd_init_ALLNODES, cpu_allnodes_mask, },
7493 d->send_covered, d->tmpmask);
7494 break;
7495#endif 7161#endif
7496 default: 7162 { NULL, },
7497 break; 7163};
7164
7165static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7166
7167static int __sdt_alloc(const struct cpumask *cpu_map)
7168{
7169 struct sched_domain_topology_level *tl;
7170 int j;
7171
7172 for (tl = sched_domain_topology; tl->init; tl++) {
7173 struct sd_data *sdd = &tl->data;
7174
7175 sdd->sd = alloc_percpu(struct sched_domain *);
7176 if (!sdd->sd)
7177 return -ENOMEM;
7178
7179 sdd->sg = alloc_percpu(struct sched_group *);
7180 if (!sdd->sg)
7181 return -ENOMEM;
7182
7183 for_each_cpu(j, cpu_map) {
7184 struct sched_domain *sd;
7185 struct sched_group *sg;
7186
7187 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7188 GFP_KERNEL, cpu_to_node(j));
7189 if (!sd)
7190 return -ENOMEM;
7191
7192 *per_cpu_ptr(sdd->sd, j) = sd;
7193
7194 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7195 GFP_KERNEL, cpu_to_node(j));
7196 if (!sg)
7197 return -ENOMEM;
7198
7199 *per_cpu_ptr(sdd->sg, j) = sg;
7200 }
7201 }
7202
7203 return 0;
7204}
7205
7206static void __sdt_free(const struct cpumask *cpu_map)
7207{
7208 struct sched_domain_topology_level *tl;
7209 int j;
7210
7211 for (tl = sched_domain_topology; tl->init; tl++) {
7212 struct sd_data *sdd = &tl->data;
7213
7214 for_each_cpu(j, cpu_map) {
7215 kfree(*per_cpu_ptr(sdd->sd, j));
7216 kfree(*per_cpu_ptr(sdd->sg, j));
7217 }
7218 free_percpu(sdd->sd);
7219 free_percpu(sdd->sg);
7220 }
7221}
7222
7223struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7224 struct s_data *d, const struct cpumask *cpu_map,
7225 struct sched_domain_attr *attr, struct sched_domain *child,
7226 int cpu)
7227{
7228 struct sched_domain *sd = tl->init(tl, cpu);
7229 if (!sd)
7230 return child;
7231
7232 set_domain_attribute(sd, attr);
7233 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7234 if (child) {
7235 sd->level = child->level + 1;
7236 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7237 child->parent = sd;
7498 } 7238 }
7239 sd->child = child;
7240
7241 return sd;
7499} 7242}
7500 7243
7501/* 7244/*
7502 * Build sched domains for a given set of cpus and attach the sched domains 7245 * Build sched domains for a given set of cpus and attach the sched domains
7503 * to the individual cpus 7246 * to the individual cpus
7504 */ 7247 */
7505static int __build_sched_domains(const struct cpumask *cpu_map, 7248static int build_sched_domains(const struct cpumask *cpu_map,
7506 struct sched_domain_attr *attr) 7249 struct sched_domain_attr *attr)
7507{ 7250{
7508 enum s_alloc alloc_state = sa_none; 7251 enum s_alloc alloc_state = sa_none;
7509 struct s_data d;
7510 struct sched_domain *sd; 7252 struct sched_domain *sd;
7511 int i; 7253 struct s_data d;
7512#ifdef CONFIG_NUMA 7254 int i, ret = -ENOMEM;
7513 d.sd_allnodes = 0;
7514#endif
7515 7255
7516 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7256 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7517 if (alloc_state != sa_rootdomain) 7257 if (alloc_state != sa_rootdomain)
7518 goto error; 7258 goto error;
7519 alloc_state = sa_sched_groups;
7520 7259
7521 /* 7260 /* Set up domains for cpus specified by the cpu_map. */
7522 * Set up domains for cpus specified by the cpu_map.
7523 */
7524 for_each_cpu(i, cpu_map) { 7261 for_each_cpu(i, cpu_map) {
7525 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7262 struct sched_domain_topology_level *tl;
7526 cpu_map);
7527 7263
7528 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7264 sd = NULL;
7529 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7265 for (tl = sched_domain_topology; tl->init; tl++)
7530 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); 7266 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7531 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7532 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7533 }
7534
7535 for_each_cpu(i, cpu_map) {
7536 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7537 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7538 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7539 }
7540
7541 /* Set up physical groups */
7542 for (i = 0; i < nr_node_ids; i++)
7543 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7544 7267
7545#ifdef CONFIG_NUMA 7268 while (sd->child)
7546 /* Set up node groups */ 7269 sd = sd->child;
7547 if (d.sd_allnodes)
7548 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7549
7550 for (i = 0; i < nr_node_ids; i++)
7551 if (build_numa_sched_groups(&d, cpu_map, i))
7552 goto error;
7553#endif
7554 7270
7555 /* Calculate CPU power for physical packages and nodes */ 7271 *per_cpu_ptr(d.sd, i) = sd;
7556#ifdef CONFIG_SCHED_SMT
7557 for_each_cpu(i, cpu_map) {
7558 sd = &per_cpu(cpu_domains, i).sd;
7559 init_sched_groups_power(i, sd);
7560 }
7561#endif
7562#ifdef CONFIG_SCHED_MC
7563 for_each_cpu(i, cpu_map) {
7564 sd = &per_cpu(core_domains, i).sd;
7565 init_sched_groups_power(i, sd);
7566 } 7272 }
7567#endif
7568#ifdef CONFIG_SCHED_BOOK
7569 for_each_cpu(i, cpu_map) {
7570 sd = &per_cpu(book_domains, i).sd;
7571 init_sched_groups_power(i, sd);
7572 }
7573#endif
7574 7273
7274 /* Build the groups for the domains */
7575 for_each_cpu(i, cpu_map) { 7275 for_each_cpu(i, cpu_map) {
7576 sd = &per_cpu(phys_domains, i).sd; 7276 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7577 init_sched_groups_power(i, sd); 7277 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7578 } 7278 get_group(i, sd->private, &sd->groups);
7279 atomic_inc(&sd->groups->ref);
7579 7280
7580#ifdef CONFIG_NUMA 7281 if (i != cpumask_first(sched_domain_span(sd)))
7581 for (i = 0; i < nr_node_ids; i++) 7282 continue;
7582 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7583 7283
7584 if (d.sd_allnodes) { 7284 build_sched_groups(sd);
7585 struct sched_group *sg; 7285 }
7286 }
7586 7287
7587 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7288 /* Calculate CPU power for physical packages and nodes */
7588 d.tmpmask); 7289 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7589 init_numa_sched_groups_power(sg); 7290 if (!cpumask_test_cpu(i, cpu_map))
7291 continue;
7292
7293 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7294 claim_allocations(i, sd);
7295 init_sched_groups_power(i, sd);
7296 }
7590 } 7297 }
7591#endif
7592 7298
7593 /* Attach the domains */ 7299 /* Attach the domains */
7300 rcu_read_lock();
7594 for_each_cpu(i, cpu_map) { 7301 for_each_cpu(i, cpu_map) {
7595#ifdef CONFIG_SCHED_SMT 7302 sd = *per_cpu_ptr(d.sd, i);
7596 sd = &per_cpu(cpu_domains, i).sd;
7597#elif defined(CONFIG_SCHED_MC)
7598 sd = &per_cpu(core_domains, i).sd;
7599#elif defined(CONFIG_SCHED_BOOK)
7600 sd = &per_cpu(book_domains, i).sd;
7601#else
7602 sd = &per_cpu(phys_domains, i).sd;
7603#endif
7604 cpu_attach_domain(sd, d.rd, i); 7303 cpu_attach_domain(sd, d.rd, i);
7605 } 7304 }
7305 rcu_read_unlock();
7606 7306
7607 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7307 ret = 0;
7608 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7609 return 0;
7610
7611error: 7308error:
7612 __free_domain_allocs(&d, alloc_state, cpu_map); 7309 __free_domain_allocs(&d, alloc_state, cpu_map);
7613 return -ENOMEM; 7310 return ret;
7614}
7615
7616static int build_sched_domains(const struct cpumask *cpu_map)
7617{
7618 return __build_sched_domains(cpu_map, NULL);
7619} 7311}
7620 7312
7621static cpumask_var_t *doms_cur; /* current sched domains */ 7313static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7670,7 +7362,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7670 * For now this just excludes isolated cpus, but could be used to 7362 * For now this just excludes isolated cpus, but could be used to
7671 * exclude other special cases in the future. 7363 * exclude other special cases in the future.
7672 */ 7364 */
7673static int arch_init_sched_domains(const struct cpumask *cpu_map) 7365static int init_sched_domains(const struct cpumask *cpu_map)
7674{ 7366{
7675 int err; 7367 int err;
7676 7368
@@ -7681,32 +7373,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7681 doms_cur = &fallback_doms; 7373 doms_cur = &fallback_doms;
7682 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7374 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7683 dattr_cur = NULL; 7375 dattr_cur = NULL;
7684 err = build_sched_domains(doms_cur[0]); 7376 err = build_sched_domains(doms_cur[0], NULL);
7685 register_sched_domain_sysctl(); 7377 register_sched_domain_sysctl();
7686 7378
7687 return err; 7379 return err;
7688} 7380}
7689 7381
7690static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7691 struct cpumask *tmpmask)
7692{
7693 free_sched_groups(cpu_map, tmpmask);
7694}
7695
7696/* 7382/*
7697 * Detach sched domains from a group of cpus specified in cpu_map 7383 * Detach sched domains from a group of cpus specified in cpu_map
7698 * These cpus will now be attached to the NULL domain 7384 * These cpus will now be attached to the NULL domain
7699 */ 7385 */
7700static void detach_destroy_domains(const struct cpumask *cpu_map) 7386static void detach_destroy_domains(const struct cpumask *cpu_map)
7701{ 7387{
7702 /* Safe because hotplug lock held. */
7703 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7704 int i; 7388 int i;
7705 7389
7390 rcu_read_lock();
7706 for_each_cpu(i, cpu_map) 7391 for_each_cpu(i, cpu_map)
7707 cpu_attach_domain(NULL, &def_root_domain, i); 7392 cpu_attach_domain(NULL, &def_root_domain, i);
7708 synchronize_sched(); 7393 rcu_read_unlock();
7709 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7710} 7394}
7711 7395
7712/* handle null as "default" */ 7396/* handle null as "default" */
@@ -7795,8 +7479,7 @@ match1:
7795 goto match2; 7479 goto match2;
7796 } 7480 }
7797 /* no match - add a new doms_new */ 7481 /* no match - add a new doms_new */
7798 __build_sched_domains(doms_new[i], 7482 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7799 dattr_new ? dattr_new + i : NULL);
7800match2: 7483match2:
7801 ; 7484 ;
7802 } 7485 }
@@ -7815,7 +7498,7 @@ match2:
7815} 7498}
7816 7499
7817#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7500#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7818static void arch_reinit_sched_domains(void) 7501static void reinit_sched_domains(void)
7819{ 7502{
7820 get_online_cpus(); 7503 get_online_cpus();
7821 7504
@@ -7848,7 +7531,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7848 else 7531 else
7849 sched_mc_power_savings = level; 7532 sched_mc_power_savings = level;
7850 7533
7851 arch_reinit_sched_domains(); 7534 reinit_sched_domains();
7852 7535
7853 return count; 7536 return count;
7854} 7537}
@@ -7967,14 +7650,9 @@ void __init sched_init_smp(void)
7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7650 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7651 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7969 7652
7970#if defined(CONFIG_NUMA)
7971 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7972 GFP_KERNEL);
7973 BUG_ON(sched_group_nodes_bycpu == NULL);
7974#endif
7975 get_online_cpus(); 7653 get_online_cpus();
7976 mutex_lock(&sched_domains_mutex); 7654 mutex_lock(&sched_domains_mutex);
7977 arch_init_sched_domains(cpu_active_mask); 7655 init_sched_domains(cpu_active_mask);
7978 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7656 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7979 if (cpumask_empty(non_isolated_cpus)) 7657 if (cpumask_empty(non_isolated_cpus))
7980 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7658 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8281,6 +7959,7 @@ void __init sched_init(void)
8281 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7959 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8282 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7960 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8283#ifdef CONFIG_SMP 7961#ifdef CONFIG_SMP
7962 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8284#ifdef CONFIG_NO_HZ 7963#ifdef CONFIG_NO_HZ
8285 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 7964 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8286 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 7965 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8340,7 +8019,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8340 int old_prio = p->prio; 8019 int old_prio = p->prio;
8341 int on_rq; 8020 int on_rq;
8342 8021
8343 on_rq = p->se.on_rq; 8022 on_rq = p->on_rq;
8344 if (on_rq) 8023 if (on_rq)
8345 deactivate_task(rq, p, 0); 8024 deactivate_task(rq, p, 0);
8346 __setscheduler(rq, p, SCHED_NORMAL, 0); 8025 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8553,7 +8232,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8553{ 8232{
8554 struct rt_rq *rt_rq; 8233 struct rt_rq *rt_rq;
8555 struct sched_rt_entity *rt_se; 8234 struct sched_rt_entity *rt_se;
8556 struct rq *rq;
8557 int i; 8235 int i;
8558 8236
8559 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8237 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8567,8 +8245,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8567 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8245 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8568 8246
8569 for_each_possible_cpu(i) { 8247 for_each_possible_cpu(i) {
8570 rq = cpu_rq(i);
8571
8572 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8248 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8573 GFP_KERNEL, cpu_to_node(i)); 8249 GFP_KERNEL, cpu_to_node(i));
8574 if (!rt_rq) 8250 if (!rt_rq)
@@ -8683,7 +8359,7 @@ void sched_move_task(struct task_struct *tsk)
8683 rq = task_rq_lock(tsk, &flags); 8359 rq = task_rq_lock(tsk, &flags);
8684 8360
8685 running = task_current(rq, tsk); 8361 running = task_current(rq, tsk);
8686 on_rq = tsk->se.on_rq; 8362 on_rq = tsk->on_rq;
8687 8363
8688 if (on_rq) 8364 if (on_rq)
8689 dequeue_task(rq, tsk, 0); 8365 dequeue_task(rq, tsk, 0);
@@ -8702,7 +8378,7 @@ void sched_move_task(struct task_struct *tsk)
8702 if (on_rq) 8378 if (on_rq)
8703 enqueue_task(rq, tsk, 0); 8379 enqueue_task(rq, tsk, 0);
8704 8380
8705 task_rq_unlock(rq, &flags); 8381 task_rq_unlock(rq, tsk, &flags);
8706} 8382}
8707#endif /* CONFIG_CGROUP_SCHED */ 8383#endif /* CONFIG_CGROUP_SCHED */
8708 8384