Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 1693
1 file changed, 692 insertions(+), 1001 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 312f8b95c2d4..2d12893b8b0f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
231#endif 231#endif
232 232
233/* 233/*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237static DEFINE_MUTEX(sched_domains_mutex); 237static DEFINE_MUTEX(sched_domains_mutex);
@@ -293,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock);
293 * limitation from this.) 293 * limitation from this.)
294 */ 294 */
295#define MIN_SHARES 2 295#define MIN_SHARES 2
296#define MAX_SHARES (1UL << 18) 296#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION))
297 297
298static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 298static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
299#endif 299#endif
@@ -312,6 +312,9 @@ struct cfs_rq {
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315#ifndef CONFIG_64BIT
316 u64 min_vruntime_copy;
317#endif
315 318
316 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
317 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
@@ -325,7 +328,9 @@ struct cfs_rq {
325 */ 328 */
326 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
327 330
331#ifdef CONFIG_SCHED_DEBUG
328 unsigned int nr_spread_over; 332 unsigned int nr_spread_over;
333#endif
329 334
330#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
331 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -417,6 +422,7 @@ struct rt_rq {
417 */ 422 */
418struct root_domain { 423struct root_domain {
419 atomic_t refcount; 424 atomic_t refcount;
425 struct rcu_head rcu;
420 cpumask_var_t span; 426 cpumask_var_t span;
421 cpumask_var_t online; 427 cpumask_var_t online;
422 428
@@ -460,7 +466,7 @@ struct rq {
460 u64 nohz_stamp; 466 u64 nohz_stamp;
461 unsigned char nohz_balance_kick; 467 unsigned char nohz_balance_kick;
462#endif 468#endif
463 unsigned int skip_clock_update; 469 int skip_clock_update;
464 470
465 /* capture load from *all* tasks on this cpu: */ 471 /* capture load from *all* tasks on this cpu: */
466 struct load_weight load; 472 struct load_weight load;
@@ -553,6 +559,10 @@ struct rq {
553 unsigned int ttwu_count; 559 unsigned int ttwu_count;
554 unsigned int ttwu_local; 560 unsigned int ttwu_local;
555#endif 561#endif
562
563#ifdef CONFIG_SMP
564 struct task_struct *wake_list;
565#endif
556}; 566};
557 567
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 568static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
571 581
572#define rcu_dereference_check_sched_domain(p) \ 582#define rcu_dereference_check_sched_domain(p) \
573 rcu_dereference_check((p), \ 583 rcu_dereference_check((p), \
574 rcu_read_lock_sched_held() || \ 584 rcu_read_lock_held() || \
575 lockdep_is_held(&sched_domains_mutex)) 585 lockdep_is_held(&sched_domains_mutex))
576 586
577/* 587/*
@@ -596,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
596 * Return the group to which this tasks belongs. 606 * Return the group to which this tasks belongs.
597 * 607 *
598 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification
599 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 609 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
600 * holds that lock for each task it moves into the cgroup. Therefore 610 * holds that lock for each task it moves into the cgroup. Therefore
601 * by holding that lock, we pin the task to the current cgroup. 611 * by holding that lock, we pin the task to the current cgroup.
602 */ 612 */
@@ -606,7 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
607 617
608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
609 lockdep_is_held(&task_rq(p)->lock)); 619 lockdep_is_held(&p->pi_lock));
610 tg = container_of(css, struct task_group, css); 620 tg = container_of(css, struct task_group, css);
611 621
612 return autogroup_task_group(p, tg); 622 return autogroup_task_group(p, tg);
@@ -642,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
642{ 652{
643 s64 delta; 653 s64 delta;
644 654
645 if (rq->skip_clock_update) 655 if (rq->skip_clock_update > 0)
646 return; 656 return;
647 657
648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 658 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -838,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
838 return rq->curr == p; 848 return rq->curr == p;
839} 849}
840 850
841#ifndef __ARCH_WANT_UNLOCKED_CTXSW
842static inline int task_running(struct rq *rq, struct task_struct *p) 851static inline int task_running(struct rq *rq, struct task_struct *p)
843{ 852{
853#ifdef CONFIG_SMP
854 return p->on_cpu;
855#else
844 return task_current(rq, p); 856 return task_current(rq, p);
857#endif
845} 858}
846 859
860#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 861static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
848{ 862{
863#ifdef CONFIG_SMP
864 /*
865 * We can optimise this out completely for !SMP, because the
866 * SMP rebalancing from interrupt is the only thing that cares
867 * here.
868 */
869 next->on_cpu = 1;
870#endif
849} 871}
850 872
851static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 873static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
852{ 874{
875#ifdef CONFIG_SMP
876 /*
877 * After ->on_cpu is cleared, the task can be moved to a different CPU.
878 * We must ensure this doesn't happen until the switch is completely
879 * finished.
880 */
881 smp_wmb();
882 prev->on_cpu = 0;
883#endif
853#ifdef CONFIG_DEBUG_SPINLOCK 884#ifdef CONFIG_DEBUG_SPINLOCK
854 /* this is a valid case when another task releases the spinlock */ 885 /* this is a valid case when another task releases the spinlock */
855 rq->lock.owner = current; 886 rq->lock.owner = current;
@@ -865,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
865} 896}
866 897
867#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 898#else /* __ARCH_WANT_UNLOCKED_CTXSW */
868static inline int task_running(struct rq *rq, struct task_struct *p)
869{
870#ifdef CONFIG_SMP
871 return p->oncpu;
872#else
873 return task_current(rq, p);
874#endif
875}
876
877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 899static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
878{ 900{
879#ifdef CONFIG_SMP 901#ifdef CONFIG_SMP
@@ -882,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 * SMP rebalancing from interrupt is the only thing that cares 904 * SMP rebalancing from interrupt is the only thing that cares
883 * here. 905 * here.
884 */ 906 */
885 next->oncpu = 1; 907 next->on_cpu = 1;
886#endif 908#endif
887#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 909#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
888 raw_spin_unlock_irq(&rq->lock); 910 raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
895{ 917{
896#ifdef CONFIG_SMP 918#ifdef CONFIG_SMP
897 /* 919 /*
898 * After ->oncpu is cleared, the task can be moved to a different CPU. 920 * After ->on_cpu is cleared, the task can be moved to a different CPU.
899 * We must ensure this doesn't happen until the switch is completely 921 * We must ensure this doesn't happen until the switch is completely
900 * finished. 922 * finished.
901 */ 923 */
902 smp_wmb(); 924 smp_wmb();
903 prev->oncpu = 0; 925 prev->on_cpu = 0;
904#endif 926#endif
905#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 927#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 local_irq_enable(); 928 local_irq_enable();
@@ -909,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
909#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 931#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
910 932
911/* 933/*
912 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 934 * __task_rq_lock - lock the rq @p resides on.
913 * against ttwu().
914 */
915static inline int task_is_waking(struct task_struct *p)
916{
917 return unlikely(p->state == TASK_WAKING);
918}
919
920/*
921 * __task_rq_lock - lock the runqueue a given task resides on.
922 * Must be called interrupts disabled.
923 */ 935 */
924static inline struct rq *__task_rq_lock(struct task_struct *p) 936static inline struct rq *__task_rq_lock(struct task_struct *p)
925 __acquires(rq->lock) 937 __acquires(rq->lock)
926{ 938{
927 struct rq *rq; 939 struct rq *rq;
928 940
941 lockdep_assert_held(&p->pi_lock);
942
929 for (;;) { 943 for (;;) {
930 rq = task_rq(p); 944 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
@@ -936,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936} 950}
937 951
938/* 952/*
939 * task_rq_lock - lock the runqueue a given task resides on and disable 953 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
940 * interrupts. Note the ordering: we can safely lookup the task_rq without
941 * explicitly disabling preemption.
942 */ 954 */
943static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 955static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
956 __acquires(p->pi_lock)
944 __acquires(rq->lock) 957 __acquires(rq->lock)
945{ 958{
946 struct rq *rq; 959 struct rq *rq;
947 960
948 for (;;) { 961 for (;;) {
949 local_irq_save(*flags); 962 raw_spin_lock_irqsave(&p->pi_lock, *flags);
950 rq = task_rq(p); 963 rq = task_rq(p);
951 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p)))
953 return rq; 966 return rq;
954 raw_spin_unlock_irqrestore(&rq->lock, *flags); 967 raw_spin_unlock(&rq->lock);
968 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
955 } 969 }
956} 970}
957 971
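The pattern introduced above, take p->pi_lock first, then the runqueue lock, and retry if the task migrated in between, can be shown with a minimal user-space sketch. The types and pthread mutexes below are stand-ins for illustration only, not the kernel's actual definitions:

#include <pthread.h>

/* Stand-in types; the real struct rq and task_struct use raw spinlocks. */
struct rq { pthread_mutex_t lock; };
struct task { pthread_mutex_t pi_lock; struct rq *rq; };

/* Same ordering as the rewritten task_rq_lock(): p->pi_lock first, then the
 * runqueue lock, retrying if the task moved to another runqueue meanwhile. */
static struct rq *task_rq_lock_sketch(struct task *p)
{
	struct rq *rq;

	for (;;) {
		pthread_mutex_lock(&p->pi_lock);
		rq = p->rq;
		pthread_mutex_lock(&rq->lock);
		if (rq == p->rq)	/* still on the same runqueue? */
			return rq;	/* both locks held, in that order */
		pthread_mutex_unlock(&rq->lock);
		pthread_mutex_unlock(&p->pi_lock);
	}
}

Unlocking goes in the reverse order, which is what the reworked task_rq_unlock() above does: rq->lock first, then p->pi_lock.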
@@ -961,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
961 raw_spin_unlock(&rq->lock); 975 raw_spin_unlock(&rq->lock);
962} 976}
963 977
964static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 978static inline void
979task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
965 __releases(rq->lock) 980 __releases(rq->lock)
981 __releases(p->pi_lock)
966{ 982{
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 983 raw_spin_unlock(&rq->lock);
984 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
968} 985}
969 986
970/* 987/*
@@ -1193,11 +1210,17 @@ int get_nohz_timer_target(void)
1193 int i; 1210 int i;
1194 struct sched_domain *sd; 1211 struct sched_domain *sd;
1195 1212
1213 rcu_read_lock();
1196 for_each_domain(cpu, sd) { 1214 for_each_domain(cpu, sd) {
1197 for_each_cpu(i, sched_domain_span(sd)) 1215 for_each_cpu(i, sched_domain_span(sd)) {
1198 if (!idle_cpu(i)) 1216 if (!idle_cpu(i)) {
1199 return i; 1217 cpu = i;
1218 goto unlock;
1219 }
1220 }
1200 } 1221 }
1222unlock:
1223 rcu_read_unlock();
1201 return cpu; 1224 return cpu;
1202} 1225}
1203/* 1226/*
@@ -1307,15 +1330,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1307{ 1330{
1308 u64 tmp; 1331 u64 tmp;
1309 1332
1333 /*
1334 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1335 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1336 * 2^SCHED_LOAD_RESOLUTION.
1337 */
1338 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1339 tmp = (u64)delta_exec * scale_load_down(weight);
1340 else
1341 tmp = (u64)delta_exec;
1342
1310 if (!lw->inv_weight) { 1343 if (!lw->inv_weight) {
1311 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1344 unsigned long w = scale_load_down(lw->weight);
1345
1346 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1312 lw->inv_weight = 1; 1347 lw->inv_weight = 1;
1348 else if (unlikely(!w))
1349 lw->inv_weight = WMULT_CONST;
1313 else 1350 else
1314 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1351 lw->inv_weight = WMULT_CONST / w;
1315 / (lw->weight+1);
1316 } 1352 }
1317 1353
1318 tmp = (u64)delta_exec * weight;
1319 /* 1354 /*
1320 * Check whether we'd overflow the 64-bit multiplication: 1355 * Check whether we'd overflow the 64-bit multiplication:
1321 */ 1356 */
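As a rough illustration of the fixed-point math above, here is a stand-alone sketch. SKETCH_WMULT_CONST and SKETCH_WMULT_SHIFT are illustrative stand-ins for the WMULT_* constants defined earlier in this file, and the 64-bit overflow guard of the real calc_delta_mine() is omitted:

#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-ins for the kernel's WMULT_CONST / WMULT_SHIFT. */
#define SKETCH_WMULT_CONST	(~0U)
#define SKETCH_WMULT_SHIFT	32

/* Approximate delta_exec * weight / lw_weight with a precomputed
 * fixed-point inverse, the same shape as calc_delta_mine() above
 * (minus the overflow handling). */
static uint64_t calc_delta_sketch(uint64_t delta_exec, unsigned long weight,
				  unsigned long lw_weight)
{
	uint32_t inv_weight = lw_weight ? SKETCH_WMULT_CONST / lw_weight
					: SKETCH_WMULT_CONST;

	return (delta_exec * weight * (uint64_t)inv_weight) >> SKETCH_WMULT_SHIFT;
}

int main(void)
{
	/* a 6ms slice (in ns) at nice-0 weight 1024 on a runqueue
	 * weighing 3072 scales to roughly 2ms */
	printf("%llu\n",
	       (unsigned long long)calc_delta_sketch(6000000ULL, 1024, 3072));
	return 0;
}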
@@ -1755,17 +1790,20 @@ static void dec_nr_running(struct rq *rq)
1755 1790
1756static void set_load_weight(struct task_struct *p) 1791static void set_load_weight(struct task_struct *p)
1757{ 1792{
1793 int prio = p->static_prio - MAX_RT_PRIO;
1794 struct load_weight *load = &p->se.load;
1795
1758 /* 1796 /*
1759 * SCHED_IDLE tasks get minimal weight: 1797 * SCHED_IDLE tasks get minimal weight:
1760 */ 1798 */
1761 if (p->policy == SCHED_IDLE) { 1799 if (p->policy == SCHED_IDLE) {
1762 p->se.load.weight = WEIGHT_IDLEPRIO; 1800 load->weight = scale_load(WEIGHT_IDLEPRIO);
1763 p->se.load.inv_weight = WMULT_IDLEPRIO; 1801 load->inv_weight = WMULT_IDLEPRIO;
1764 return; 1802 return;
1765 } 1803 }
1766 1804
1767 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1805 load->weight = scale_load(prio_to_weight[prio]);
1768 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1806 load->inv_weight = prio_to_wmult[prio];
1769} 1807}
1770 1808
1771static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1809static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1773,7 +1811,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1773 update_rq_clock(rq); 1811 update_rq_clock(rq);
1774 sched_info_queued(p); 1812 sched_info_queued(p);
1775 p->sched_class->enqueue_task(rq, p, flags); 1813 p->sched_class->enqueue_task(rq, p, flags);
1776 p->se.on_rq = 1;
1777} 1814}
1778 1815
1779static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1816static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1818,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1781 update_rq_clock(rq); 1818 update_rq_clock(rq);
1782 sched_info_dequeued(p); 1819 sched_info_dequeued(p);
1783 p->sched_class->dequeue_task(rq, p, flags); 1820 p->sched_class->dequeue_task(rq, p, flags);
1784 p->se.on_rq = 0;
1785} 1821}
1786 1822
1787/* 1823/*
@@ -2116,7 +2152,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2116 * A queue event has occurred, and we're going to schedule. In 2152 * A queue event has occurred, and we're going to schedule. In
2117 * this case, we can save a useless back to back clock update. 2153 * this case, we can save a useless back to back clock update.
2118 */ 2154 */
2119 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2155 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2120 rq->skip_clock_update = 1; 2156 rq->skip_clock_update = 1;
2121} 2157}
2122 2158
@@ -2162,6 +2198,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2162 */ 2198 */
2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2199 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2200 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2201
2202#ifdef CONFIG_LOCKDEP
2203 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2204 lockdep_is_held(&task_rq(p)->lock)));
2205#endif
2165#endif 2206#endif
2166 2207
2167 trace_sched_migrate_task(p, new_cpu); 2208 trace_sched_migrate_task(p, new_cpu);
@@ -2182,19 +2223,6 @@ struct migration_arg {
2182static int migration_cpu_stop(void *data); 2223static int migration_cpu_stop(void *data);
2183 2224
2184/* 2225/*
2185 * The task's runqueue lock must be held.
2186 * Returns true if you have to wait for migration thread.
2187 */
2188static bool migrate_task(struct task_struct *p, struct rq *rq)
2189{
2190 /*
2191 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task.
2193 */
2194 return p->se.on_rq || task_running(rq, p);
2195}
2196
2197/*
2198 * wait_task_inactive - wait for a thread to unschedule. 2226 * wait_task_inactive - wait for a thread to unschedule.
2199 * 2227 *
2200 * If @match_state is nonzero, it's the @p->state value just checked and 2228 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2251,11 +2279,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2251 rq = task_rq_lock(p, &flags); 2279 rq = task_rq_lock(p, &flags);
2252 trace_sched_wait_task(p); 2280 trace_sched_wait_task(p);
2253 running = task_running(rq, p); 2281 running = task_running(rq, p);
2254 on_rq = p->se.on_rq; 2282 on_rq = p->on_rq;
2255 ncsw = 0; 2283 ncsw = 0;
2256 if (!match_state || p->state == match_state) 2284 if (!match_state || p->state == match_state)
2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2285 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2258 task_rq_unlock(rq, &flags); 2286 task_rq_unlock(rq, p, &flags);
2259 2287
2260 /* 2288 /*
2261 * If it changed from the expected state, bail out now. 2289 * If it changed from the expected state, bail out now.
@@ -2330,7 +2358,7 @@ EXPORT_SYMBOL_GPL(kick_process);
2330 2358
2331#ifdef CONFIG_SMP 2359#ifdef CONFIG_SMP
2332/* 2360/*
2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2361 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2334 */ 2362 */
2335static int select_fallback_rq(int cpu, struct task_struct *p) 2363static int select_fallback_rq(int cpu, struct task_struct *p)
2336{ 2364{
@@ -2363,12 +2391,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2363} 2391}
2364 2392
2365/* 2393/*
2366 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2394 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2367 */ 2395 */
2368static inline 2396static inline
2369int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2397int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2370{ 2398{
2371 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2399 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2372 2400
2373 /* 2401 /*
2374 * In order not to call set_task_cpu() on a blocking task we need 2402 * In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2422,62 @@ static void update_avg(u64 *avg, u64 sample)
2394} 2422}
2395#endif 2423#endif
2396 2424
2397static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2425static void
2398 bool is_sync, bool is_migrate, bool is_local, 2426ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2399 unsigned long en_flags)
2400{ 2427{
2428#ifdef CONFIG_SCHEDSTATS
2429 struct rq *rq = this_rq();
2430
2431#ifdef CONFIG_SMP
2432 int this_cpu = smp_processor_id();
2433
2434 if (cpu == this_cpu) {
2435 schedstat_inc(rq, ttwu_local);
2436 schedstat_inc(p, se.statistics.nr_wakeups_local);
2437 } else {
2438 struct sched_domain *sd;
2439
2440 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2441 rcu_read_lock();
2442 for_each_domain(this_cpu, sd) {
2443 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2444 schedstat_inc(sd, ttwu_wake_remote);
2445 break;
2446 }
2447 }
2448 rcu_read_unlock();
2449 }
2450#endif /* CONFIG_SMP */
2451
2452 schedstat_inc(rq, ttwu_count);
2401 schedstat_inc(p, se.statistics.nr_wakeups); 2453 schedstat_inc(p, se.statistics.nr_wakeups);
2402 if (is_sync) 2454
2455 if (wake_flags & WF_SYNC)
2403 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2456 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2404 if (is_migrate) 2457
2458 if (cpu != task_cpu(p))
2405 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2459 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2406 if (is_local)
2407 schedstat_inc(p, se.statistics.nr_wakeups_local);
2408 else
2409 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2410 2460
2461#endif /* CONFIG_SCHEDSTATS */
2462}
2463
2464static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2465{
2411 activate_task(rq, p, en_flags); 2466 activate_task(rq, p, en_flags);
2467 p->on_rq = 1;
2468
2469 /* if a worker is waking up, notify workqueue */
2470 if (p->flags & PF_WQ_WORKER)
2471 wq_worker_waking_up(p, cpu_of(rq));
2412} 2472}
2413 2473
2414static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2474/*
2415 int wake_flags, bool success) 2475 * Mark the task runnable and perform wakeup-preemption.
2476 */
2477static void
2478ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2416{ 2479{
2417 trace_sched_wakeup(p, success); 2480 trace_sched_wakeup(p, true);
2418 check_preempt_curr(rq, p, wake_flags); 2481 check_preempt_curr(rq, p, wake_flags);
2419 2482
2420 p->state = TASK_RUNNING; 2483 p->state = TASK_RUNNING;
@@ -2433,9 +2496,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2433 rq->idle_stamp = 0; 2496 rq->idle_stamp = 0;
2434 } 2497 }
2435#endif 2498#endif
2436 /* if a worker is waking up, notify workqueue */ 2499}
2437 if ((p->flags & PF_WQ_WORKER) && success) 2500
2438 wq_worker_waking_up(p, cpu_of(rq)); 2501static void
2502ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2503{
2504#ifdef CONFIG_SMP
2505 if (p->sched_contributes_to_load)
2506 rq->nr_uninterruptible--;
2507#endif
2508
2509 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2510 ttwu_do_wakeup(rq, p, wake_flags);
2511}
2512
2513/*
2514 * Called in case the task @p isn't fully descheduled from its runqueue,
2515 * in this case we must do a remote wakeup. It's a 'light' wakeup though,
2516 * since all we need to do is flip p->state to TASK_RUNNING;
2517 * the task is still ->on_rq.
2518 */
2519static int ttwu_remote(struct task_struct *p, int wake_flags)
2520{
2521 struct rq *rq;
2522 int ret = 0;
2523
2524 rq = __task_rq_lock(p);
2525 if (p->on_rq) {
2526 ttwu_do_wakeup(rq, p, wake_flags);
2527 ret = 1;
2528 }
2529 __task_rq_unlock(rq);
2530
2531 return ret;
2532}
2533
2534#ifdef CONFIG_SMP
2535static void sched_ttwu_pending(void)
2536{
2537 struct rq *rq = this_rq();
2538 struct task_struct *list = xchg(&rq->wake_list, NULL);
2539
2540 if (!list)
2541 return;
2542
2543 raw_spin_lock(&rq->lock);
2544
2545 while (list) {
2546 struct task_struct *p = list;
2547 list = list->wake_entry;
2548 ttwu_do_activate(rq, p, 0);
2549 }
2550
2551 raw_spin_unlock(&rq->lock);
2552}
2553
2554void scheduler_ipi(void)
2555{
2556 sched_ttwu_pending();
2557}
2558
2559static void ttwu_queue_remote(struct task_struct *p, int cpu)
2560{
2561 struct rq *rq = cpu_rq(cpu);
2562 struct task_struct *next = rq->wake_list;
2563
2564 for (;;) {
2565 struct task_struct *old = next;
2566
2567 p->wake_entry = next;
2568 next = cmpxchg(&rq->wake_list, old, p);
2569 if (next == old)
2570 break;
2571 }
2572
2573 if (!next)
2574 smp_send_reschedule(cpu);
2575}
2576#endif
2577
2578static void ttwu_queue(struct task_struct *p, int cpu)
2579{
2580 struct rq *rq = cpu_rq(cpu);
2581
2582#if defined(CONFIG_SMP)
2583 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2584 ttwu_queue_remote(p, cpu);
2585 return;
2586 }
2587#endif
2588
2589 raw_spin_lock(&rq->lock);
2590 ttwu_do_activate(rq, p, 0);
2591 raw_spin_unlock(&rq->lock);
2439} 2592}
2440 2593
2441/** 2594/**
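The cmpxchg() loop in ttwu_queue_remote() above is a lock-free singly-linked push. A minimal user-space sketch using C11 atomics as a stand-in for the kernel's cmpxchg(), with an illustrative task type, might look like this:

#include <stdatomic.h>
#include <stddef.h>

/* Stand-in for struct task_struct with its wake_entry link. */
struct task {
	struct task *wake_entry;
};

/* Push @p onto a wake list head with a CAS loop, the same shape as
 * ttwu_queue_remote() above.  Returns nonzero if the list was empty
 * before the push, i.e. the caller should send the reschedule IPI. */
static int wake_list_push(_Atomic(struct task *) *head, struct task *p)
{
	struct task *next = atomic_load(head);

	for (;;) {
		p->wake_entry = next;
		if (atomic_compare_exchange_weak(head, &next, p))
			break;
		/* CAS failed: @next now holds the current head; retry. */
	}

	return next == NULL;
}

On the receiving side, sched_ttwu_pending() above takes the whole list at once with xchg(&rq->wake_list, NULL) and activates each entry under rq->lock.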
@@ -2453,92 +2606,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2453 * Returns %true if @p was woken up, %false if it was already running 2606 * Returns %true if @p was woken up, %false if it was already running
2454 * or @state didn't match @p's state. 2607 * or @state didn't match @p's state.
2455 */ 2608 */
2456static int try_to_wake_up(struct task_struct *p, unsigned int state, 2609static int
2457 int wake_flags) 2610try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2458{ 2611{
2459 int cpu, orig_cpu, this_cpu, success = 0;
2460 unsigned long flags; 2612 unsigned long flags;
2461 unsigned long en_flags = ENQUEUE_WAKEUP; 2613 int cpu, success = 0;
2462 struct rq *rq;
2463
2464 this_cpu = get_cpu();
2465 2614
2466 smp_wmb(); 2615 smp_wmb();
2467 rq = task_rq_lock(p, &flags); 2616 raw_spin_lock_irqsave(&p->pi_lock, flags);
2468 if (!(p->state & state)) 2617 if (!(p->state & state))
2469 goto out; 2618 goto out;
2470 2619
2471 if (p->se.on_rq) 2620 success = 1; /* we're going to change ->state */
2472 goto out_running;
2473
2474 cpu = task_cpu(p); 2621 cpu = task_cpu(p);
2475 orig_cpu = cpu;
2476 2622
2477#ifdef CONFIG_SMP 2623 if (p->on_rq && ttwu_remote(p, wake_flags))
2478 if (unlikely(task_running(rq, p))) 2624 goto stat;
2479 goto out_activate;
2480 2625
2626#ifdef CONFIG_SMP
2481 /* 2627 /*
2482 * In order to handle concurrent wakeups and release the rq->lock 2628 * If the owning (remote) cpu is still in the middle of schedule() with
2483 * we put the task in TASK_WAKING state. 2629 * this task as prev, wait until its done referencing the task.
2484 *
2485 * First fix up the nr_uninterruptible count:
2486 */ 2630 */
2487 if (task_contributes_to_load(p)) { 2631 while (p->on_cpu) {
2488 if (likely(cpu_online(orig_cpu))) 2632#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2489 rq->nr_uninterruptible--; 2633 /*
2490 else 2634 * If called from interrupt context we could have landed in the
2491 this_rq()->nr_uninterruptible--; 2635 * middle of schedule(), in this case we should take care not
2492 } 2636 * to spin on ->on_cpu if p is current, since that would
2493 p->state = TASK_WAKING; 2637 * deadlock.
2494 2638 */
2495 if (p->sched_class->task_waking) { 2639 if (p == current) {
2496 p->sched_class->task_waking(rq, p); 2640 ttwu_queue(p, cpu);
2497 en_flags |= ENQUEUE_WAKING; 2641 goto stat;
2642 }
2643#endif
2644 cpu_relax();
2498 } 2645 }
2499
2500 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2501 if (cpu != orig_cpu)
2502 set_task_cpu(p, cpu);
2503 __task_rq_unlock(rq);
2504
2505 rq = cpu_rq(cpu);
2506 raw_spin_lock(&rq->lock);
2507
2508 /* 2646 /*
2509 * We migrated the task without holding either rq->lock, however 2647 * Pairs with the smp_wmb() in finish_lock_switch().
2510 * since the task is not on the task list itself, nobody else
2511 * will try and migrate the task, hence the rq should match the
2512 * cpu we just moved it to.
2513 */ 2648 */
2514 WARN_ON(task_cpu(p) != cpu); 2649 smp_rmb();
2515 WARN_ON(p->state != TASK_WAKING);
2516 2650
2517#ifdef CONFIG_SCHEDSTATS 2651 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2518 schedstat_inc(rq, ttwu_count); 2652 p->state = TASK_WAKING;
2519 if (cpu == this_cpu)
2520 schedstat_inc(rq, ttwu_local);
2521 else {
2522 struct sched_domain *sd;
2523 for_each_domain(this_cpu, sd) {
2524 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2525 schedstat_inc(sd, ttwu_wake_remote);
2526 break;
2527 }
2528 }
2529 }
2530#endif /* CONFIG_SCHEDSTATS */
2531 2653
2532out_activate: 2654 if (p->sched_class->task_waking)
2655 p->sched_class->task_waking(p);
2656
2657 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2658 if (task_cpu(p) != cpu)
2659 set_task_cpu(p, cpu);
2533#endif /* CONFIG_SMP */ 2660#endif /* CONFIG_SMP */
2534 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2661
2535 cpu == this_cpu, en_flags); 2662 ttwu_queue(p, cpu);
2536 success = 1; 2663stat:
2537out_running: 2664 ttwu_stat(p, cpu, wake_flags);
2538 ttwu_post_activation(p, rq, wake_flags, success);
2539out: 2665out:
2540 task_rq_unlock(rq, &flags); 2666 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2541 put_cpu();
2542 2667
2543 return success; 2668 return success;
2544} 2669}
@@ -2547,31 +2672,34 @@ out:
2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2672 * try_to_wake_up_local - try to wake up a local task with rq lock held
2548 * @p: the thread to be awakened 2673 * @p: the thread to be awakened
2549 * 2674 *
2550 * Put @p on the run-queue if it's not already there. The caller must 2675 * Put @p on the run-queue if it's not already there. The caller must
2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2676 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2552 * the current task. this_rq() stays locked over invocation. 2677 * the current task.
2553 */ 2678 */
2554static void try_to_wake_up_local(struct task_struct *p) 2679static void try_to_wake_up_local(struct task_struct *p)
2555{ 2680{
2556 struct rq *rq = task_rq(p); 2681 struct rq *rq = task_rq(p);
2557 bool success = false;
2558 2682
2559 BUG_ON(rq != this_rq()); 2683 BUG_ON(rq != this_rq());
2560 BUG_ON(p == current); 2684 BUG_ON(p == current);
2561 lockdep_assert_held(&rq->lock); 2685 lockdep_assert_held(&rq->lock);
2562 2686
2687 if (!raw_spin_trylock(&p->pi_lock)) {
2688 raw_spin_unlock(&rq->lock);
2689 raw_spin_lock(&p->pi_lock);
2690 raw_spin_lock(&rq->lock);
2691 }
2692
2563 if (!(p->state & TASK_NORMAL)) 2693 if (!(p->state & TASK_NORMAL))
2564 return; 2694 goto out;
2565 2695
2566 if (!p->se.on_rq) { 2696 if (!p->on_rq)
2567 if (likely(!task_running(rq, p))) { 2697 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2568 schedstat_inc(rq, ttwu_count); 2698
2569 schedstat_inc(rq, ttwu_local); 2699 ttwu_do_wakeup(rq, p, 0);
2570 } 2700 ttwu_stat(p, smp_processor_id(), 0);
2571 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2701out:
2572 success = true; 2702 raw_spin_unlock(&p->pi_lock);
2573 }
2574 ttwu_post_activation(p, rq, 0, success);
2575} 2703}
2576 2704
2577/** 2705/**
@@ -2604,19 +2732,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2604 */ 2732 */
2605static void __sched_fork(struct task_struct *p) 2733static void __sched_fork(struct task_struct *p)
2606{ 2734{
2735 p->on_rq = 0;
2736
2737 p->se.on_rq = 0;
2607 p->se.exec_start = 0; 2738 p->se.exec_start = 0;
2608 p->se.sum_exec_runtime = 0; 2739 p->se.sum_exec_runtime = 0;
2609 p->se.prev_sum_exec_runtime = 0; 2740 p->se.prev_sum_exec_runtime = 0;
2610 p->se.nr_migrations = 0; 2741 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0; 2742 p->se.vruntime = 0;
2743 INIT_LIST_HEAD(&p->se.group_node);
2612 2744
2613#ifdef CONFIG_SCHEDSTATS 2745#ifdef CONFIG_SCHEDSTATS
2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2746 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2615#endif 2747#endif
2616 2748
2617 INIT_LIST_HEAD(&p->rt.run_list); 2749 INIT_LIST_HEAD(&p->rt.run_list);
2618 p->se.on_rq = 0;
2619 INIT_LIST_HEAD(&p->se.group_node);
2620 2750
2621#ifdef CONFIG_PREEMPT_NOTIFIERS 2751#ifdef CONFIG_PREEMPT_NOTIFIERS
2622 INIT_HLIST_HEAD(&p->preempt_notifiers); 2752 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2626,8 +2756,9 @@ static void __sched_fork(struct task_struct *p)
2626/* 2756/*
2627 * fork()/clone()-time setup: 2757 * fork()/clone()-time setup:
2628 */ 2758 */
2629void sched_fork(struct task_struct *p, int clone_flags) 2759void sched_fork(struct task_struct *p)
2630{ 2760{
2761 unsigned long flags;
2631 int cpu = get_cpu(); 2762 int cpu = get_cpu();
2632 2763
2633 __sched_fork(p); 2764 __sched_fork(p);
@@ -2678,16 +2809,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2678 * 2809 *
2679 * Silence PROVE_RCU. 2810 * Silence PROVE_RCU.
2680 */ 2811 */
2681 rcu_read_lock(); 2812 raw_spin_lock_irqsave(&p->pi_lock, flags);
2682 set_task_cpu(p, cpu); 2813 set_task_cpu(p, cpu);
2683 rcu_read_unlock(); 2814 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2684 2815
2685#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2816#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2686 if (likely(sched_info_on())) 2817 if (likely(sched_info_on()))
2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2818 memset(&p->sched_info, 0, sizeof(p->sched_info));
2688#endif 2819#endif
2689#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2820#if defined(CONFIG_SMP)
2690 p->oncpu = 0; 2821 p->on_cpu = 0;
2691#endif 2822#endif
2692#ifdef CONFIG_PREEMPT 2823#ifdef CONFIG_PREEMPT
2693 /* Want to start with kernel preemption disabled. */ 2824 /* Want to start with kernel preemption disabled. */
@@ -2707,41 +2838,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2707 * that must be done for every newly created context, then puts the task 2838 * that must be done for every newly created context, then puts the task
2708 * on the runqueue and wakes it. 2839 * on the runqueue and wakes it.
2709 */ 2840 */
2710void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2841void wake_up_new_task(struct task_struct *p)
2711{ 2842{
2712 unsigned long flags; 2843 unsigned long flags;
2713 struct rq *rq; 2844 struct rq *rq;
2714 int cpu __maybe_unused = get_cpu();
2715 2845
2846 raw_spin_lock_irqsave(&p->pi_lock, flags);
2716#ifdef CONFIG_SMP 2847#ifdef CONFIG_SMP
2717 rq = task_rq_lock(p, &flags);
2718 p->state = TASK_WAKING;
2719
2720 /* 2848 /*
2721 * Fork balancing, do it here and not earlier because: 2849 * Fork balancing, do it here and not earlier because:
2722 * - cpus_allowed can change in the fork path 2850 * - cpus_allowed can change in the fork path
2723 * - any previously selected cpu might disappear through hotplug 2851 * - any previously selected cpu might disappear through hotplug
2724 *
2725 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2726 * without people poking at ->cpus_allowed.
2727 */ 2852 */
2728 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2853 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2729 set_task_cpu(p, cpu);
2730
2731 p->state = TASK_RUNNING;
2732 task_rq_unlock(rq, &flags);
2733#endif 2854#endif
2734 2855
2735 rq = task_rq_lock(p, &flags); 2856 rq = __task_rq_lock(p);
2736 activate_task(rq, p, 0); 2857 activate_task(rq, p, 0);
2737 trace_sched_wakeup_new(p, 1); 2858 p->on_rq = 1;
2859 trace_sched_wakeup_new(p, true);
2738 check_preempt_curr(rq, p, WF_FORK); 2860 check_preempt_curr(rq, p, WF_FORK);
2739#ifdef CONFIG_SMP 2861#ifdef CONFIG_SMP
2740 if (p->sched_class->task_woken) 2862 if (p->sched_class->task_woken)
2741 p->sched_class->task_woken(rq, p); 2863 p->sched_class->task_woken(rq, p);
2742#endif 2864#endif
2743 task_rq_unlock(rq, &flags); 2865 task_rq_unlock(rq, p, &flags);
2744 put_cpu();
2745} 2866}
2746 2867
2747#ifdef CONFIG_PREEMPT_NOTIFIERS 2868#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3571,22 @@ void sched_exec(void)
3450{ 3571{
3451 struct task_struct *p = current; 3572 struct task_struct *p = current;
3452 unsigned long flags; 3573 unsigned long flags;
3453 struct rq *rq;
3454 int dest_cpu; 3574 int dest_cpu;
3455 3575
3456 rq = task_rq_lock(p, &flags); 3576 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3577 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3458 if (dest_cpu == smp_processor_id()) 3578 if (dest_cpu == smp_processor_id())
3459 goto unlock; 3579 goto unlock;
3460 3580
3461 /* 3581 if (likely(cpu_active(dest_cpu))) {
3462 * select_task_rq() can race against ->cpus_allowed
3463 */
3464 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3465 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3466 struct migration_arg arg = { p, dest_cpu }; 3582 struct migration_arg arg = { p, dest_cpu };
3467 3583
3468 task_rq_unlock(rq, &flags); 3584 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3469 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3585 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3470 return; 3586 return;
3471 } 3587 }
3472unlock: 3588unlock:
3473 task_rq_unlock(rq, &flags); 3589 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3474} 3590}
3475 3591
3476#endif 3592#endif
@@ -3507,7 +3623,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3507 3623
3508 rq = task_rq_lock(p, &flags); 3624 rq = task_rq_lock(p, &flags);
3509 ns = do_task_delta_exec(p, rq); 3625 ns = do_task_delta_exec(p, rq);
3510 task_rq_unlock(rq, &flags); 3626 task_rq_unlock(rq, p, &flags);
3511 3627
3512 return ns; 3628 return ns;
3513} 3629}
@@ -3525,7 +3641,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3525 3641
3526 rq = task_rq_lock(p, &flags); 3642 rq = task_rq_lock(p, &flags);
3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3643 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3528 task_rq_unlock(rq, &flags); 3644 task_rq_unlock(rq, p, &flags);
3529 3645
3530 return ns; 3646 return ns;
3531} 3647}
@@ -3549,7 +3665,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3549 rq = task_rq_lock(p, &flags); 3665 rq = task_rq_lock(p, &flags);
3550 thread_group_cputime(p, &totals); 3666 thread_group_cputime(p, &totals);
3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3667 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3552 task_rq_unlock(rq, &flags); 3668 task_rq_unlock(rq, p, &flags);
3553 3669
3554 return ns; 3670 return ns;
3555} 3671}
@@ -3903,9 +4019,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3903/* 4019/*
3904 * This function gets called by the timer code, with HZ frequency. 4020 * This function gets called by the timer code, with HZ frequency.
3905 * We call it with interrupts disabled. 4021 * We call it with interrupts disabled.
3906 *
3907 * It also gets called by the fork code, when changing the parent's
3908 * timeslices.
3909 */ 4022 */
3910void scheduler_tick(void) 4023void scheduler_tick(void)
3911{ 4024{
@@ -4025,17 +4138,11 @@ static inline void schedule_debug(struct task_struct *prev)
4025 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4138 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4026 4139
4027 schedstat_inc(this_rq(), sched_count); 4140 schedstat_inc(this_rq(), sched_count);
4028#ifdef CONFIG_SCHEDSTATS
4029 if (unlikely(prev->lock_depth >= 0)) {
4030 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4031 schedstat_inc(prev, sched_info.bkl_count);
4032 }
4033#endif
4034} 4141}
4035 4142
4036static void put_prev_task(struct rq *rq, struct task_struct *prev) 4143static void put_prev_task(struct rq *rq, struct task_struct *prev)
4037{ 4144{
4038 if (prev->se.on_rq) 4145 if (prev->on_rq || rq->skip_clock_update < 0)
4039 update_rq_clock(rq); 4146 update_rq_clock(rq);
4040 prev->sched_class->put_prev_task(rq, prev); 4147 prev->sched_class->put_prev_task(rq, prev);
4041} 4148}
@@ -4097,11 +4204,13 @@ need_resched:
4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4204 if (unlikely(signal_pending_state(prev->state, prev))) {
4098 prev->state = TASK_RUNNING; 4205 prev->state = TASK_RUNNING;
4099 } else { 4206 } else {
4207 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4208 prev->on_rq = 0;
4209
4100 /* 4210 /*
4101 * If a worker is going to sleep, notify and 4211 * If a worker went to sleep, notify and ask workqueue
4102 * ask workqueue whether it wants to wake up a 4212 * whether it wants to wake up a task to maintain
4103 * task to maintain concurrency. If so, wake 4213 * concurrency.
4104 * up the task.
4105 */ 4214 */
4106 if (prev->flags & PF_WQ_WORKER) { 4215 if (prev->flags & PF_WQ_WORKER) {
4107 struct task_struct *to_wakeup; 4216 struct task_struct *to_wakeup;
@@ -4110,11 +4219,10 @@ need_resched:
4110 if (to_wakeup) 4219 if (to_wakeup)
4111 try_to_wake_up_local(to_wakeup); 4220 try_to_wake_up_local(to_wakeup);
4112 } 4221 }
4113 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4114 4222
4115 /* 4223 /*
4116 * If we are going to sleep and we have plugged IO queued, make 4224 * If we are going to sleep and we have plugged IO
4117 * sure to submit it to avoid deadlocks. 4225 * queued, make sure to submit it to avoid deadlocks.
4118 */ 4226 */
4119 if (blk_needs_flush_plug(prev)) { 4227 if (blk_needs_flush_plug(prev)) {
4120 raw_spin_unlock(&rq->lock); 4228 raw_spin_unlock(&rq->lock);
@@ -4161,70 +4269,53 @@ need_resched:
4161EXPORT_SYMBOL(schedule); 4269EXPORT_SYMBOL(schedule);
4162 4270
4163#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4271#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4164/*
4165 * Look out! "owner" is an entirely speculative pointer
4166 * access and not reliable.
4167 */
4168int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4169{
4170 unsigned int cpu;
4171 struct rq *rq;
4172 4272
4173 if (!sched_feat(OWNER_SPIN)) 4273static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4174 return 0; 4274{
4275 bool ret = false;
4175 4276
4176#ifdef CONFIG_DEBUG_PAGEALLOC 4277 rcu_read_lock();
4177 /* 4278 if (lock->owner != owner)
4178 * Need to access the cpu field knowing that 4279 goto fail;
4179 * DEBUG_PAGEALLOC could have unmapped it if
4180 * the mutex owner just released it and exited.
4181 */
4182 if (probe_kernel_address(&owner->cpu, cpu))
4183 return 0;
4184#else
4185 cpu = owner->cpu;
4186#endif
4187 4280
4188 /* 4281 /*
4189 * Even if the access succeeded (likely case), 4282 * Ensure we emit the owner->on_cpu dereference _after_ checking
4190 * the cpu field may no longer be valid. 4283 * lock->owner still matches owner, if that fails, owner might
4284 * point to free()d memory, if it still matches, the rcu_read_lock()
4285 * ensures the memory stays valid.
4191 */ 4286 */
4192 if (cpu >= nr_cpumask_bits) 4287 barrier();
4193 return 0;
4194 4288
4195 /* 4289 ret = owner->on_cpu;
4196 * We need to validate that we can do a 4290fail:
4197 * get_cpu() and that we have the percpu area. 4291 rcu_read_unlock();
4198 */
4199 if (!cpu_online(cpu))
4200 return 0;
4201 4292
4202 rq = cpu_rq(cpu); 4293 return ret;
4294}
4203 4295
4204 for (;;) { 4296/*
4205 /* 4297 * Look out! "owner" is an entirely speculative pointer
4206 * Owner changed, break to re-assess state. 4298 * access and not reliable.
4207 */ 4299 */
4208 if (lock->owner != owner) { 4300int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4209 /* 4301{
4210 * If the lock has switched to a different owner, 4302 if (!sched_feat(OWNER_SPIN))
4211 * we likely have heavy contention. Return 0 to quit 4303 return 0;
4212 * optimistic spinning and not contend further:
4213 */
4214 if (lock->owner)
4215 return 0;
4216 break;
4217 }
4218 4304
4219 /* 4305 while (owner_running(lock, owner)) {
4220 * Is that owner really running on that cpu? 4306 if (need_resched())
4221 */
4222 if (task_thread_info(rq->curr) != owner || need_resched())
4223 return 0; 4307 return 0;
4224 4308
4225 arch_mutex_cpu_relax(); 4309 arch_mutex_cpu_relax();
4226 } 4310 }
4227 4311
4312 /*
4313 * If the owner changed to another task there is likely
4314 * heavy contention, stop spinning.
4315 */
4316 if (lock->owner)
4317 return 0;
4318
4228 return 1; 4319 return 1;
4229} 4320}
4230#endif 4321#endif
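The rewritten loop above keeps spinning only while the same owner still holds the lock and is running on a CPU. A simplified user-space sketch of that shape, where volatile stand-ins take the place of the kernel's RCU protection, barriers and need_resched() check:

#include <stdbool.h>
#include <stddef.h>

/* Illustrative stand-ins; the kernel protects the owner dereference with
 * rcu_read_lock() and a compiler barrier rather than volatile. */
struct task { volatile int on_cpu; };
struct mutex_sketch { struct task * volatile owner; };

/* Spin while @owner still holds @lock and is on a CPU.  Returns true if
 * the lock appears released (worth trying to acquire it), false if it is
 * still owned, in which case the caller stops optimistic spinning. */
static bool spin_on_owner_sketch(struct mutex_sketch *lock, struct task *owner)
{
	while (lock->owner == owner && owner->on_cpu)
		;	/* the kernel also bails out here on need_resched() */

	return lock->owner == NULL;
}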
@@ -4684,19 +4775,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4684 */ 4775 */
4685void rt_mutex_setprio(struct task_struct *p, int prio) 4776void rt_mutex_setprio(struct task_struct *p, int prio)
4686{ 4777{
4687 unsigned long flags;
4688 int oldprio, on_rq, running; 4778 int oldprio, on_rq, running;
4689 struct rq *rq; 4779 struct rq *rq;
4690 const struct sched_class *prev_class; 4780 const struct sched_class *prev_class;
4691 4781
4692 BUG_ON(prio < 0 || prio > MAX_PRIO); 4782 BUG_ON(prio < 0 || prio > MAX_PRIO);
4693 4783
4694 rq = task_rq_lock(p, &flags); 4784 rq = __task_rq_lock(p);
4695 4785
4696 trace_sched_pi_setprio(p, prio); 4786 trace_sched_pi_setprio(p, prio);
4697 oldprio = p->prio; 4787 oldprio = p->prio;
4698 prev_class = p->sched_class; 4788 prev_class = p->sched_class;
4699 on_rq = p->se.on_rq; 4789 on_rq = p->on_rq;
4700 running = task_current(rq, p); 4790 running = task_current(rq, p);
4701 if (on_rq) 4791 if (on_rq)
4702 dequeue_task(rq, p, 0); 4792 dequeue_task(rq, p, 0);
@@ -4716,7 +4806,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4806 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4717 4807
4718 check_class_changed(rq, p, prev_class, oldprio); 4808 check_class_changed(rq, p, prev_class, oldprio);
4719 task_rq_unlock(rq, &flags); 4809 __task_rq_unlock(rq);
4720} 4810}
4721 4811
4722#endif 4812#endif
@@ -4744,7 +4834,7 @@ void set_user_nice(struct task_struct *p, long nice)
4744 p->static_prio = NICE_TO_PRIO(nice); 4834 p->static_prio = NICE_TO_PRIO(nice);
4745 goto out_unlock; 4835 goto out_unlock;
4746 } 4836 }
4747 on_rq = p->se.on_rq; 4837 on_rq = p->on_rq;
4748 if (on_rq) 4838 if (on_rq)
4749 dequeue_task(rq, p, 0); 4839 dequeue_task(rq, p, 0);
4750 4840
@@ -4764,7 +4854,7 @@ void set_user_nice(struct task_struct *p, long nice)
4764 resched_task(rq->curr); 4854 resched_task(rq->curr);
4765 } 4855 }
4766out_unlock: 4856out_unlock:
4767 task_rq_unlock(rq, &flags); 4857 task_rq_unlock(rq, p, &flags);
4768} 4858}
4769EXPORT_SYMBOL(set_user_nice); 4859EXPORT_SYMBOL(set_user_nice);
4770 4860
@@ -4878,8 +4968,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4878static void 4968static void
4879__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4969__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4880{ 4970{
4881 BUG_ON(p->se.on_rq);
4882
4883 p->policy = policy; 4971 p->policy = policy;
4884 p->rt_priority = prio; 4972 p->rt_priority = prio;
4885 p->normal_prio = normal_prio(p); 4973 p->normal_prio = normal_prio(p);
@@ -4994,20 +5082,17 @@ recheck:
4994 /* 5082 /*
4995 * make sure no PI-waiters arrive (or leave) while we are 5083 * make sure no PI-waiters arrive (or leave) while we are
4996 * changing the priority of the task: 5084 * changing the priority of the task:
4997 */ 5085 *
4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4999 /*
5000 * To be able to change p->policy safely, the appropriate 5086 * To be able to change p->policy safely, the appropriate
5001 * runqueue lock must be held. 5087 * runqueue lock must be held.
5002 */ 5088 */
5003 rq = __task_rq_lock(p); 5089 rq = task_rq_lock(p, &flags);
5004 5090
5005 /* 5091 /*
5006 * Changing the policy of the stop threads its a very bad idea 5092 * Changing the policy of the stop threads its a very bad idea
5007 */ 5093 */
5008 if (p == rq->stop) { 5094 if (p == rq->stop) {
5009 __task_rq_unlock(rq); 5095 task_rq_unlock(rq, p, &flags);
5010 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5011 return -EINVAL; 5096 return -EINVAL;
5012 } 5097 }
5013 5098
@@ -5031,8 +5116,7 @@ recheck:
5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5116 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5117 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5033 !task_group_is_autogroup(task_group(p))) { 5118 !task_group_is_autogroup(task_group(p))) {
5034 __task_rq_unlock(rq); 5119 task_rq_unlock(rq, p, &flags);
5035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5036 return -EPERM; 5120 return -EPERM;
5037 } 5121 }
5038 } 5122 }
@@ -5041,11 +5125,10 @@ recheck:
5041 /* recheck policy now with rq lock held */ 5125 /* recheck policy now with rq lock held */
5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5126 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5043 policy = oldpolicy = -1; 5127 policy = oldpolicy = -1;
5044 __task_rq_unlock(rq); 5128 task_rq_unlock(rq, p, &flags);
5045 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5046 goto recheck; 5129 goto recheck;
5047 } 5130 }
5048 on_rq = p->se.on_rq; 5131 on_rq = p->on_rq;
5049 running = task_current(rq, p); 5132 running = task_current(rq, p);
5050 if (on_rq) 5133 if (on_rq)
5051 deactivate_task(rq, p, 0); 5134 deactivate_task(rq, p, 0);
@@ -5064,8 +5147,7 @@ recheck:
5064 activate_task(rq, p, 0); 5147 activate_task(rq, p, 0);
5065 5148
5066 check_class_changed(rq, p, prev_class, oldprio); 5149 check_class_changed(rq, p, prev_class, oldprio);
5067 __task_rq_unlock(rq); 5150 task_rq_unlock(rq, p, &flags);
5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069 5151
5070 rt_mutex_adjust_pi(p); 5152 rt_mutex_adjust_pi(p);
5071 5153
@@ -5316,7 +5398,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5316{ 5398{
5317 struct task_struct *p; 5399 struct task_struct *p;
5318 unsigned long flags; 5400 unsigned long flags;
5319 struct rq *rq;
5320 int retval; 5401 int retval;
5321 5402
5322 get_online_cpus(); 5403 get_online_cpus();
@@ -5331,9 +5412,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5331 if (retval) 5412 if (retval)
5332 goto out_unlock; 5413 goto out_unlock;
5333 5414
5334 rq = task_rq_lock(p, &flags); 5415 raw_spin_lock_irqsave(&p->pi_lock, flags);
5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5416 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5336 task_rq_unlock(rq, &flags); 5417 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5337 5418
5338out_unlock: 5419out_unlock:
5339 rcu_read_unlock(); 5420 rcu_read_unlock();
@@ -5658,7 +5739,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5658 5739
5659 rq = task_rq_lock(p, &flags); 5740 rq = task_rq_lock(p, &flags);
5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5741 time_slice = p->sched_class->get_rr_interval(rq, p);
5661 task_rq_unlock(rq, &flags); 5742 task_rq_unlock(rq, p, &flags);
5662 5743
5663 rcu_read_unlock(); 5744 rcu_read_unlock();
5664 jiffies_to_timespec(time_slice, &t); 5745 jiffies_to_timespec(time_slice, &t);
@@ -5776,17 +5857,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5776 rcu_read_unlock(); 5857 rcu_read_unlock();
5777 5858
5778 rq->curr = rq->idle = idle; 5859 rq->curr = rq->idle = idle;
5779#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5860#if defined(CONFIG_SMP)
5780 idle->oncpu = 1; 5861 idle->on_cpu = 1;
5781#endif 5862#endif
5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5863 raw_spin_unlock_irqrestore(&rq->lock, flags);
5783 5864
5784 /* Set the preempt count _outside_ the spinlocks! */ 5865 /* Set the preempt count _outside_ the spinlocks! */
5785#if defined(CONFIG_PREEMPT)
5786 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5787#else
5788 task_thread_info(idle)->preempt_count = 0; 5866 task_thread_info(idle)->preempt_count = 0;
5789#endif 5867
5790 /* 5868 /*
5791 * The idle tasks have their own, simple scheduling class: 5869 * The idle tasks have their own, simple scheduling class:
5792 */ 5870 */
@@ -5881,26 +5959,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5881 unsigned int dest_cpu; 5959 unsigned int dest_cpu;
5882 int ret = 0; 5960 int ret = 0;
5883 5961
5884 /*
5885 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5886 * drop the rq->lock and still rely on ->cpus_allowed.
5887 */
5888again:
5889 while (task_is_waking(p))
5890 cpu_relax();
5891 rq = task_rq_lock(p, &flags); 5962 rq = task_rq_lock(p, &flags);
5892 if (task_is_waking(p)) { 5963
5893 task_rq_unlock(rq, &flags); 5964 if (cpumask_equal(&p->cpus_allowed, new_mask))
5894 goto again; 5965 goto out;
5895 }
5896 5966
5897 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5967 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5898 ret = -EINVAL; 5968 ret = -EINVAL;
5899 goto out; 5969 goto out;
5900 } 5970 }
5901 5971
5902 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5972 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5903 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5904 ret = -EINVAL; 5973 ret = -EINVAL;
5905 goto out; 5974 goto out;
5906 } 5975 }
@@ -5917,16 +5986,16 @@ again:
5917 goto out; 5986 goto out;
5918 5987
5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5988 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5920 if (migrate_task(p, rq)) { 5989 if (p->on_rq) {
5921 struct migration_arg arg = { p, dest_cpu }; 5990 struct migration_arg arg = { p, dest_cpu };
5922 /* Need help from migration thread: drop lock and wait. */ 5991 /* Need help from migration thread: drop lock and wait. */
5923 task_rq_unlock(rq, &flags); 5992 task_rq_unlock(rq, p, &flags);
5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5993 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5925 tlb_migrate_finish(p->mm); 5994 tlb_migrate_finish(p->mm);
5926 return 0; 5995 return 0;
5927 } 5996 }
5928out: 5997out:
5929 task_rq_unlock(rq, &flags); 5998 task_rq_unlock(rq, p, &flags);
5930 5999
5931 return ret; 6000 return ret;
5932} 6001}
@@ -5954,6 +6023,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5954 rq_src = cpu_rq(src_cpu); 6023 rq_src = cpu_rq(src_cpu);
5955 rq_dest = cpu_rq(dest_cpu); 6024 rq_dest = cpu_rq(dest_cpu);
5956 6025
6026 raw_spin_lock(&p->pi_lock);
5957 double_rq_lock(rq_src, rq_dest); 6027 double_rq_lock(rq_src, rq_dest);
5958 /* Already moved. */ 6028 /* Already moved. */
5959 if (task_cpu(p) != src_cpu) 6029 if (task_cpu(p) != src_cpu)
@@ -5966,7 +6036,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5966 * If we're not on a rq, the next wake-up will ensure we're 6036 * If we're not on a rq, the next wake-up will ensure we're
5967 * placed properly. 6037 * placed properly.
5968 */ 6038 */
5969 if (p->se.on_rq) { 6039 if (p->on_rq) {
5970 deactivate_task(rq_src, p, 0); 6040 deactivate_task(rq_src, p, 0);
5971 set_task_cpu(p, dest_cpu); 6041 set_task_cpu(p, dest_cpu);
5972 activate_task(rq_dest, p, 0); 6042 activate_task(rq_dest, p, 0);
@@ -5976,6 +6046,7 @@ done:
5976 ret = 1; 6046 ret = 1;
5977fail: 6047fail:
5978 double_rq_unlock(rq_src, rq_dest); 6048 double_rq_unlock(rq_src, rq_dest);
6049 raw_spin_unlock(&p->pi_lock);
5979 return ret; 6050 return ret;
5980} 6051}
5981 6052
@@ -6316,6 +6387,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6316 6387
6317#ifdef CONFIG_HOTPLUG_CPU 6388#ifdef CONFIG_HOTPLUG_CPU
6318 case CPU_DYING: 6389 case CPU_DYING:
6390 sched_ttwu_pending();
6319 /* Update our root-domain */ 6391 /* Update our root-domain */
6320 raw_spin_lock_irqsave(&rq->lock, flags); 6392 raw_spin_lock_irqsave(&rq->lock, flags);
6321 if (rq->rd) { 6393 if (rq->rd) {
@@ -6394,6 +6466,8 @@ early_initcall(migration_init);
6394 6466
6395#ifdef CONFIG_SMP 6467#ifdef CONFIG_SMP
6396 6468
6469static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6470
6397#ifdef CONFIG_SCHED_DEBUG 6471#ifdef CONFIG_SCHED_DEBUG
6398 6472
6399static __read_mostly int sched_domain_debug_enabled; 6473static __read_mostly int sched_domain_debug_enabled;
@@ -6468,7 +6542,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6468 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6542 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6469 6543
6470 printk(KERN_CONT " %s", str); 6544 printk(KERN_CONT " %s", str);
6471 if (group->cpu_power != SCHED_LOAD_SCALE) { 6545 if (group->cpu_power != SCHED_POWER_SCALE) {
6472 printk(KERN_CONT " (cpu_power = %d)", 6546 printk(KERN_CONT " (cpu_power = %d)",
6473 group->cpu_power); 6547 group->cpu_power);
6474 } 6548 }
@@ -6489,7 +6563,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6489 6563
6490static void sched_domain_debug(struct sched_domain *sd, int cpu) 6564static void sched_domain_debug(struct sched_domain *sd, int cpu)
6491{ 6565{
6492 cpumask_var_t groupmask;
6493 int level = 0; 6566 int level = 0;
6494 6567
6495 if (!sched_domain_debug_enabled) 6568 if (!sched_domain_debug_enabled)
@@ -6502,20 +6575,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6502 6575
6503 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6576 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6504 6577
6505 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6506 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6507 return;
6508 }
6509
6510 for (;;) { 6578 for (;;) {
6511 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6579 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6512 break; 6580 break;
6513 level++; 6581 level++;
6514 sd = sd->parent; 6582 sd = sd->parent;
6515 if (!sd) 6583 if (!sd)
6516 break; 6584 break;
6517 } 6585 }
6518 free_cpumask_var(groupmask);
6519} 6586}
6520#else /* !CONFIG_SCHED_DEBUG */ 6587#else /* !CONFIG_SCHED_DEBUG */
6521# define sched_domain_debug(sd, cpu) do { } while (0) 6588# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6572,12 +6639,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6572 return 1; 6639 return 1;
6573} 6640}
6574 6641
6575static void free_rootdomain(struct root_domain *rd) 6642static void free_rootdomain(struct rcu_head *rcu)
6576{ 6643{
6577 synchronize_sched(); 6644 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6578 6645
6579 cpupri_cleanup(&rd->cpupri); 6646 cpupri_cleanup(&rd->cpupri);
6580
6581 free_cpumask_var(rd->rto_mask); 6647 free_cpumask_var(rd->rto_mask);
6582 free_cpumask_var(rd->online); 6648 free_cpumask_var(rd->online);
6583 free_cpumask_var(rd->span); 6649 free_cpumask_var(rd->span);
@@ -6618,7 +6684,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6618 raw_spin_unlock_irqrestore(&rq->lock, flags); 6684 raw_spin_unlock_irqrestore(&rq->lock, flags);
6619 6685
6620 if (old_rd) 6686 if (old_rd)
6621 free_rootdomain(old_rd); 6687 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6622} 6688}
6623 6689
6624static int init_rootdomain(struct root_domain *rd) 6690static int init_rootdomain(struct root_domain *rd)
@@ -6669,6 +6735,25 @@ static struct root_domain *alloc_rootdomain(void)
6669 return rd; 6735 return rd;
6670} 6736}
6671 6737
6738static void free_sched_domain(struct rcu_head *rcu)
6739{
6740 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6741 if (atomic_dec_and_test(&sd->groups->ref))
6742 kfree(sd->groups);
6743 kfree(sd);
6744}
6745
6746static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6747{
6748 call_rcu(&sd->rcu, free_sched_domain);
6749}
6750
6751static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6752{
6753 for (; sd; sd = sd->parent)
6754 destroy_sched_domain(sd, cpu);
6755}
6756
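The new free_sched_domain()/destroy_sched_domain() pair above defers the actual kfree() until after an RCU grace period, so lockless readers walking the domain tree under rcu_read_lock() can never touch freed memory. A minimal sketch of that embedded-rcu_head idiom, using hypothetical names rather than code from this patch:

struct my_obj {
        int data;
        struct rcu_head rcu;            /* storage for the deferred callback */
};

static void my_obj_free(struct rcu_head *rcu)
{
        /* recover the enclosing object from its embedded rcu_head */
        struct my_obj *obj = container_of(rcu, struct my_obj, rcu);

        kfree(obj);
}

static void my_obj_destroy(struct my_obj *obj)
{
        /* runs my_obj_free() only after all current RCU readers finish */
        call_rcu(&obj->rcu, my_obj_free);
}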
6672/* 6757/*
6673 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6758 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6674 * hold the hotplug lock. 6759 * hold the hotplug lock.
@@ -6679,9 +6764,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6679 struct rq *rq = cpu_rq(cpu); 6764 struct rq *rq = cpu_rq(cpu);
6680 struct sched_domain *tmp; 6765 struct sched_domain *tmp;
6681 6766
6682 for (tmp = sd; tmp; tmp = tmp->parent)
6683 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6684
6685 /* Remove the sched domains which do not contribute to scheduling. */ 6767 /* Remove the sched domains which do not contribute to scheduling. */
6686 for (tmp = sd; tmp; ) { 6768 for (tmp = sd; tmp; ) {
6687 struct sched_domain *parent = tmp->parent; 6769 struct sched_domain *parent = tmp->parent;
@@ -6692,12 +6774,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6692 tmp->parent = parent->parent; 6774 tmp->parent = parent->parent;
6693 if (parent->parent) 6775 if (parent->parent)
6694 parent->parent->child = tmp; 6776 parent->parent->child = tmp;
6777 destroy_sched_domain(parent, cpu);
6695 } else 6778 } else
6696 tmp = tmp->parent; 6779 tmp = tmp->parent;
6697 } 6780 }
6698 6781
6699 if (sd && sd_degenerate(sd)) { 6782 if (sd && sd_degenerate(sd)) {
6783 tmp = sd;
6700 sd = sd->parent; 6784 sd = sd->parent;
6785 destroy_sched_domain(tmp, cpu);
6701 if (sd) 6786 if (sd)
6702 sd->child = NULL; 6787 sd->child = NULL;
6703 } 6788 }
@@ -6705,7 +6790,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6705 sched_domain_debug(sd, cpu); 6790 sched_domain_debug(sd, cpu);
6706 6791
6707 rq_attach_root(rq, rd); 6792 rq_attach_root(rq, rd);
6793 tmp = rq->sd;
6708 rcu_assign_pointer(rq->sd, sd); 6794 rcu_assign_pointer(rq->sd, sd);
6795 destroy_sched_domains(tmp, cpu);
6709} 6796}
6710 6797
6711/* cpus with isolated domains */ 6798/* cpus with isolated domains */
@@ -6721,56 +6808,6 @@ static int __init isolated_cpu_setup(char *str)
6721 6808
6722__setup("isolcpus=", isolated_cpu_setup); 6809__setup("isolcpus=", isolated_cpu_setup);
6723 6810
6724/*
6725 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6726 * to a function which identifies what group(along with sched group) a CPU
6727 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6728 * (due to the fact that we keep track of groups covered with a struct cpumask).
6729 *
6730 * init_sched_build_groups will build a circular linked list of the groups
6731 * covered by the given span, and will set each group's ->cpumask correctly,
6732 * and ->cpu_power to 0.
6733 */
6734static void
6735init_sched_build_groups(const struct cpumask *span,
6736 const struct cpumask *cpu_map,
6737 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6738 struct sched_group **sg,
6739 struct cpumask *tmpmask),
6740 struct cpumask *covered, struct cpumask *tmpmask)
6741{
6742 struct sched_group *first = NULL, *last = NULL;
6743 int i;
6744
6745 cpumask_clear(covered);
6746
6747 for_each_cpu(i, span) {
6748 struct sched_group *sg;
6749 int group = group_fn(i, cpu_map, &sg, tmpmask);
6750 int j;
6751
6752 if (cpumask_test_cpu(i, covered))
6753 continue;
6754
6755 cpumask_clear(sched_group_cpus(sg));
6756 sg->cpu_power = 0;
6757
6758 for_each_cpu(j, span) {
6759 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6760 continue;
6761
6762 cpumask_set_cpu(j, covered);
6763 cpumask_set_cpu(j, sched_group_cpus(sg));
6764 }
6765 if (!first)
6766 first = sg;
6767 if (last)
6768 last->next = sg;
6769 last = sg;
6770 }
6771 last->next = first;
6772}
6773
6774#define SD_NODES_PER_DOMAIN 16 6811#define SD_NODES_PER_DOMAIN 16
6775 6812
6776#ifdef CONFIG_NUMA 6813#ifdef CONFIG_NUMA
@@ -6787,7 +6824,7 @@ init_sched_build_groups(const struct cpumask *span,
6787 */ 6824 */
6788static int find_next_best_node(int node, nodemask_t *used_nodes) 6825static int find_next_best_node(int node, nodemask_t *used_nodes)
6789{ 6826{
6790 int i, n, val, min_val, best_node = 0; 6827 int i, n, val, min_val, best_node = -1;
6791 6828
6792 min_val = INT_MAX; 6829 min_val = INT_MAX;
6793 6830
@@ -6811,7 +6848,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6811 } 6848 }
6812 } 6849 }
6813 6850
6814 node_set(best_node, *used_nodes); 6851 if (best_node != -1)
6852 node_set(best_node, *used_nodes);
6815 return best_node; 6853 return best_node;
6816} 6854}
6817 6855
@@ -6837,315 +6875,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6837 6875
6838 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6876 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6839 int next_node = find_next_best_node(node, &used_nodes); 6877 int next_node = find_next_best_node(node, &used_nodes);
6840 6878 if (next_node < 0)
6879 break;
6841 cpumask_or(span, span, cpumask_of_node(next_node)); 6880 cpumask_or(span, span, cpumask_of_node(next_node));
6842 } 6881 }
6843} 6882}
6883
6884static const struct cpumask *cpu_node_mask(int cpu)
6885{
6886 lockdep_assert_held(&sched_domains_mutex);
6887
6888 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6889
6890 return sched_domains_tmpmask;
6891}
6892
6893static const struct cpumask *cpu_allnodes_mask(int cpu)
6894{
6895 return cpu_possible_mask;
6896}
6844#endif /* CONFIG_NUMA */ 6897#endif /* CONFIG_NUMA */
6845 6898
6846int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6899static const struct cpumask *cpu_cpu_mask(int cpu)
6900{
6901 return cpumask_of_node(cpu_to_node(cpu));
6902}
6847 6903
6848/* 6904int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6849 * The cpus mask in sched_group and sched_domain hangs off the end.
6850 *
6851 * ( See the the comments in include/linux/sched.h:struct sched_group
6852 * and struct sched_domain. )
6853 */
6854struct static_sched_group {
6855 struct sched_group sg;
6856 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6857};
6858 6905
6859struct static_sched_domain { 6906struct sd_data {
6860 struct sched_domain sd; 6907 struct sched_domain **__percpu sd;
6861 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6908 struct sched_group **__percpu sg;
6862}; 6909};
6863 6910
6864struct s_data { 6911struct s_data {
6865#ifdef CONFIG_NUMA 6912 struct sched_domain ** __percpu sd;
6866 int sd_allnodes;
6867 cpumask_var_t domainspan;
6868 cpumask_var_t covered;
6869 cpumask_var_t notcovered;
6870#endif
6871 cpumask_var_t nodemask;
6872 cpumask_var_t this_sibling_map;
6873 cpumask_var_t this_core_map;
6874 cpumask_var_t this_book_map;
6875 cpumask_var_t send_covered;
6876 cpumask_var_t tmpmask;
6877 struct sched_group **sched_group_nodes;
6878 struct root_domain *rd; 6913 struct root_domain *rd;
6879}; 6914};
6880 6915
6881enum s_alloc { 6916enum s_alloc {
6882 sa_sched_groups = 0,
6883 sa_rootdomain, 6917 sa_rootdomain,
6884 sa_tmpmask, 6918 sa_sd,
6885 sa_send_covered, 6919 sa_sd_storage,
6886 sa_this_book_map,
6887 sa_this_core_map,
6888 sa_this_sibling_map,
6889 sa_nodemask,
6890 sa_sched_group_nodes,
6891#ifdef CONFIG_NUMA
6892 sa_notcovered,
6893 sa_covered,
6894 sa_domainspan,
6895#endif
6896 sa_none, 6920 sa_none,
6897}; 6921};
6898 6922
6899/* 6923struct sched_domain_topology_level;
6900 * SMT sched-domains:
6901 */
6902#ifdef CONFIG_SCHED_SMT
6903static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6904static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6905 6924
6906static int 6925typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6907cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6926typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6908 struct sched_group **sg, struct cpumask *unused)
6909{
6910 if (sg)
6911 *sg = &per_cpu(sched_groups, cpu).sg;
6912 return cpu;
6913}
6914#endif /* CONFIG_SCHED_SMT */
6915 6927
6916/* 6928struct sched_domain_topology_level {
6917 * multi-core sched-domains: 6929 sched_domain_init_f init;
6918 */ 6930 sched_domain_mask_f mask;
6919#ifdef CONFIG_SCHED_MC 6931 struct sd_data data;
6920static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6932};
6921static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6922
6923static int
6924cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6925 struct sched_group **sg, struct cpumask *mask)
6926{
6927 int group;
6928#ifdef CONFIG_SCHED_SMT
6929 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6930 group = cpumask_first(mask);
6931#else
6932 group = cpu;
6933#endif
6934 if (sg)
6935 *sg = &per_cpu(sched_group_core, group).sg;
6936 return group;
6937}
6938#endif /* CONFIG_SCHED_MC */
6939 6933
6940/* 6934/*
6941 * book sched-domains: 6935 * Assumes the sched_domain tree is fully constructed
6942 */ 6936 */
6943#ifdef CONFIG_SCHED_BOOK 6937static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6944static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6945static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6946
6947static int
6948cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6949 struct sched_group **sg, struct cpumask *mask)
6950{ 6938{
6951 int group = cpu; 6939 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6952#ifdef CONFIG_SCHED_MC 6940 struct sched_domain *child = sd->child;
6953 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6954 group = cpumask_first(mask);
6955#elif defined(CONFIG_SCHED_SMT)
6956 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6957 group = cpumask_first(mask);
6958#endif
6959 if (sg)
6960 *sg = &per_cpu(sched_group_book, group).sg;
6961 return group;
6962}
6963#endif /* CONFIG_SCHED_BOOK */
6964 6941
6965static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6942 if (child)
6966static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6943 cpu = cpumask_first(sched_domain_span(child));
6967 6944
6968static int
6969cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6970 struct sched_group **sg, struct cpumask *mask)
6971{
6972 int group;
6973#ifdef CONFIG_SCHED_BOOK
6974 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6975 group = cpumask_first(mask);
6976#elif defined(CONFIG_SCHED_MC)
6977 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6978 group = cpumask_first(mask);
6979#elif defined(CONFIG_SCHED_SMT)
6980 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6981 group = cpumask_first(mask);
6982#else
6983 group = cpu;
6984#endif
6985 if (sg) 6945 if (sg)
6986 *sg = &per_cpu(sched_group_phys, group).sg; 6946 *sg = *per_cpu_ptr(sdd->sg, cpu);
6987 return group; 6947
6948 return cpu;
6988} 6949}
6989 6950
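get_group() above picks one representative CPU per group: the first CPU of the child domain's span, or the CPU itself at the bottom level where there is no child. A hypothetical debug helper (not part of the patch) that prints the representative at each level of one CPU's domain chain, assuming the for_each_domain() iterator and RCU-protected rq->sd:

static void print_group_reps(int cpu)
{
        struct sched_domain *sd;

        rcu_read_lock();                /* rq->sd is RCU protected */
        for_each_domain(cpu, sd) {
                int rep = sd->child ?
                        cpumask_first(sched_domain_span(sd->child)) : cpu;

                printk(KERN_DEBUG "cpu%d level %d: group rep is cpu%d\n",
                       cpu, sd->level, rep);
        }
        rcu_read_unlock();
}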
6990#ifdef CONFIG_NUMA
6991/* 6951/*
6992 * The init_sched_build_groups can't handle what we want to do with node 6952 * build_sched_groups takes the sched_domain whose span we wish to cover
6993 * groups, so roll our own. Now each node has its own list of groups which 6953 * and uses get_group() to identify which group (along with its sched_group)
6994 * gets dynamically allocated. 6954 * each CPU belongs to; get_group() returns a value >= 0 and < nr_cpu_ids
6955 * (we keep track of the groups already covered with a struct cpumask).
6956 *
6957 * build_sched_groups will build a circular linked list of the groups
6958 * covered by the domain's span, and will set each group's ->cpumask
6959 * correctly, and ->cpu_power to 0.
6995 */ 6960 */
6996static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6961static void
6997static struct sched_group ***sched_group_nodes_bycpu; 6962build_sched_groups(struct sched_domain *sd)
6998
6999static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
7000static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7001
7002static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7003 struct sched_group **sg,
7004 struct cpumask *nodemask)
7005{
7006 int group;
7007
7008 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7009 group = cpumask_first(nodemask);
7010
7011 if (sg)
7012 *sg = &per_cpu(sched_group_allnodes, group).sg;
7013 return group;
7014}
7015
7016static void init_numa_sched_groups_power(struct sched_group *group_head)
7017{
7018 struct sched_group *sg = group_head;
7019 int j;
7020
7021 if (!sg)
7022 return;
7023 do {
7024 for_each_cpu(j, sched_group_cpus(sg)) {
7025 struct sched_domain *sd;
7026
7027 sd = &per_cpu(phys_domains, j).sd;
7028 if (j != group_first_cpu(sd->groups)) {
7029 /*
7030 * Only add "power" once for each
7031 * physical package.
7032 */
7033 continue;
7034 }
7035
7036 sg->cpu_power += sd->groups->cpu_power;
7037 }
7038 sg = sg->next;
7039 } while (sg != group_head);
7040}
7041
7042static int build_numa_sched_groups(struct s_data *d,
7043 const struct cpumask *cpu_map, int num)
7044{ 6963{
7045 struct sched_domain *sd; 6964 struct sched_group *first = NULL, *last = NULL;
7046 struct sched_group *sg, *prev; 6965 struct sd_data *sdd = sd->private;
7047 int n, j; 6966 const struct cpumask *span = sched_domain_span(sd);
7048 6967 struct cpumask *covered;
7049 cpumask_clear(d->covered); 6968 int i;
7050 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
7051 if (cpumask_empty(d->nodemask)) {
7052 d->sched_group_nodes[num] = NULL;
7053 goto out;
7054 }
7055
7056 sched_domain_node_span(num, d->domainspan);
7057 cpumask_and(d->domainspan, d->domainspan, cpu_map);
7058
7059 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7060 GFP_KERNEL, num);
7061 if (!sg) {
7062 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
7063 num);
7064 return -ENOMEM;
7065 }
7066 d->sched_group_nodes[num] = sg;
7067
7068 for_each_cpu(j, d->nodemask) {
7069 sd = &per_cpu(node_domains, j).sd;
7070 sd->groups = sg;
7071 }
7072
7073 sg->cpu_power = 0;
7074 cpumask_copy(sched_group_cpus(sg), d->nodemask);
7075 sg->next = sg;
7076 cpumask_or(d->covered, d->covered, d->nodemask);
7077 6969
7078 prev = sg; 6970 lockdep_assert_held(&sched_domains_mutex);
7079 for (j = 0; j < nr_node_ids; j++) { 6971 covered = sched_domains_tmpmask;
7080 n = (num + j) % nr_node_ids;
7081 cpumask_complement(d->notcovered, d->covered);
7082 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7083 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7084 if (cpumask_empty(d->tmpmask))
7085 break;
7086 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7087 if (cpumask_empty(d->tmpmask))
7088 continue;
7089 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7090 GFP_KERNEL, num);
7091 if (!sg) {
7092 printk(KERN_WARNING
7093 "Can not alloc domain group for node %d\n", j);
7094 return -ENOMEM;
7095 }
7096 sg->cpu_power = 0;
7097 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7098 sg->next = prev->next;
7099 cpumask_or(d->covered, d->covered, d->tmpmask);
7100 prev->next = sg;
7101 prev = sg;
7102 }
7103out:
7104 return 0;
7105}
7106#endif /* CONFIG_NUMA */
7107 6972
7108#ifdef CONFIG_NUMA 6973 cpumask_clear(covered);
7109/* Free memory allocated for various sched_group structures */
7110static void free_sched_groups(const struct cpumask *cpu_map,
7111 struct cpumask *nodemask)
7112{
7113 int cpu, i;
7114 6974
7115 for_each_cpu(cpu, cpu_map) { 6975 for_each_cpu(i, span) {
7116 struct sched_group **sched_group_nodes 6976 struct sched_group *sg;
7117 = sched_group_nodes_bycpu[cpu]; 6977 int group = get_group(i, sdd, &sg);
6978 int j;
7118 6979
7119 if (!sched_group_nodes) 6980 if (cpumask_test_cpu(i, covered))
7120 continue; 6981 continue;
7121 6982
7122 for (i = 0; i < nr_node_ids; i++) { 6983 cpumask_clear(sched_group_cpus(sg));
7123 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6984 sg->cpu_power = 0;
7124 6985
7125 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6986 for_each_cpu(j, span) {
7126 if (cpumask_empty(nodemask)) 6987 if (get_group(j, sdd, NULL) != group)
7127 continue; 6988 continue;
7128 6989
7129 if (sg == NULL) 6990 cpumask_set_cpu(j, covered);
7130 continue; 6991 cpumask_set_cpu(j, sched_group_cpus(sg));
7131 sg = sg->next;
7132next_sg:
7133 oldsg = sg;
7134 sg = sg->next;
7135 kfree(oldsg);
7136 if (oldsg != sched_group_nodes[i])
7137 goto next_sg;
7138 } 6992 }
7139 kfree(sched_group_nodes); 6993
7140 sched_group_nodes_bycpu[cpu] = NULL; 6994 if (!first)
6995 first = sg;
6996 if (last)
6997 last->next = sg;
6998 last = sg;
7141 } 6999 }
7000 last->next = first;
7142} 7001}
7143#else /* !CONFIG_NUMA */
7144static void free_sched_groups(const struct cpumask *cpu_map,
7145 struct cpumask *nodemask)
7146{
7147}
7148#endif /* CONFIG_NUMA */
7149 7002
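The groups built above hang off sd->groups as a circular, singly linked list, so consumers iterate with a do/while that stops once they wrap back to the first group. A small hypothetical helper showing that traversal (illustration only, not from the patch):

static unsigned long domain_total_power(struct sched_domain *sd)
{
        struct sched_group *first = sd->groups, *sg = first;
        unsigned long power = 0;

        do {
                power += sg->cpu_power; /* visit each group exactly once */
                sg = sg->next;
        } while (sg != first);

        return power;
}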
7150/* 7003/*
7151 * Initialize sched groups cpu_power. 7004 * Initialize sched groups cpu_power.
@@ -7159,11 +7012,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
7159 */ 7012 */
7160static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7013static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7161{ 7014{
7162 struct sched_domain *child;
7163 struct sched_group *group;
7164 long power;
7165 int weight;
7166
7167 WARN_ON(!sd || !sd->groups); 7015 WARN_ON(!sd || !sd->groups);
7168 7016
7169 if (cpu != group_first_cpu(sd->groups)) 7017 if (cpu != group_first_cpu(sd->groups))
@@ -7171,36 +7019,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7171 7019
7172 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7020 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7173 7021
7174 child = sd->child; 7022 update_group_power(sd, cpu);
7175
7176 sd->groups->cpu_power = 0;
7177
7178 if (!child) {
7179 power = SCHED_LOAD_SCALE;
7180 weight = cpumask_weight(sched_domain_span(sd));
7181 /*
7182 * SMT siblings share the power of a single core.
7183 * Usually multiple threads get a better yield out of
7184 * that one core than a single thread would have,
7185 * reflect that in sd->smt_gain.
7186 */
7187 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7188 power *= sd->smt_gain;
7189 power /= weight;
7190 power >>= SCHED_LOAD_SHIFT;
7191 }
7192 sd->groups->cpu_power += power;
7193 return;
7194 }
7195
7196 /*
7197 * Add cpu_power of each child group to this groups cpu_power.
7198 */
7199 group = child->groups;
7200 do {
7201 sd->groups->cpu_power += group->cpu_power;
7202 group = group->next;
7203 } while (group != child->groups);
7204} 7023}
7205 7024
7206/* 7025/*
@@ -7214,15 +7033,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7214# define SD_INIT_NAME(sd, type) do { } while (0) 7033# define SD_INIT_NAME(sd, type) do { } while (0)
7215#endif 7034#endif
7216 7035
7217#define SD_INIT(sd, type) sd_init_##type(sd) 7036#define SD_INIT_FUNC(type) \
7218 7037static noinline struct sched_domain * \
7219#define SD_INIT_FUNC(type) \ 7038sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7220static noinline void sd_init_##type(struct sched_domain *sd) \ 7039{ \
7221{ \ 7040 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7222 memset(sd, 0, sizeof(*sd)); \ 7041 *sd = SD_##type##_INIT; \
7223 *sd = SD_##type##_INIT; \ 7042 SD_INIT_NAME(sd, type); \
7224 sd->level = SD_LV_##type; \ 7043 sd->private = &tl->data; \
7225 SD_INIT_NAME(sd, type); \ 7044 return sd; \
7226} 7045}
7227 7046
7228SD_INIT_FUNC(CPU) 7047SD_INIT_FUNC(CPU)
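Each topology level's init hook is generated by the SD_INIT_FUNC() macro above; for reference, SD_INIT_FUNC(CPU) expands to roughly the following (SD_CPU_INIT being the per-type template from the topology headers):

static noinline struct sched_domain *
sd_init_CPU(struct sched_domain_topology_level *tl, int cpu)
{
        /* per-cpu backing storage was set up earlier by __sdt_alloc() */
        struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);

        *sd = SD_CPU_INIT;              /* copy in the static template */
        SD_INIT_NAME(sd, CPU);          /* sets sd->name under SCHED_DEBUG */
        sd->private = &tl->data;        /* remember which level owns us */
        return sd;
}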
@@ -7241,13 +7060,14 @@ SD_INIT_FUNC(CPU)
7241#endif 7060#endif
7242 7061
7243static int default_relax_domain_level = -1; 7062static int default_relax_domain_level = -1;
7063int sched_domain_level_max;
7244 7064
7245static int __init setup_relax_domain_level(char *str) 7065static int __init setup_relax_domain_level(char *str)
7246{ 7066{
7247 unsigned long val; 7067 unsigned long val;
7248 7068
7249 val = simple_strtoul(str, NULL, 0); 7069 val = simple_strtoul(str, NULL, 0);
7250 if (val < SD_LV_MAX) 7070 if (val < sched_domain_level_max)
7251 default_relax_domain_level = val; 7071 default_relax_domain_level = val;
7252 7072
7253 return 1; 7073 return 1;
@@ -7275,37 +7095,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7275 } 7095 }
7276} 7096}
7277 7097
7098static void __sdt_free(const struct cpumask *cpu_map);
7099static int __sdt_alloc(const struct cpumask *cpu_map);
7100
7278static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7101static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7279 const struct cpumask *cpu_map) 7102 const struct cpumask *cpu_map)
7280{ 7103{
7281 switch (what) { 7104 switch (what) {
7282 case sa_sched_groups:
7283 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7284 d->sched_group_nodes = NULL;
7285 case sa_rootdomain: 7105 case sa_rootdomain:
7286 free_rootdomain(d->rd); /* fall through */ 7106 if (!atomic_read(&d->rd->refcount))
7287 case sa_tmpmask: 7107 free_rootdomain(&d->rd->rcu); /* fall through */
7288 free_cpumask_var(d->tmpmask); /* fall through */ 7108 case sa_sd:
7289 case sa_send_covered: 7109 free_percpu(d->sd); /* fall through */
7290 free_cpumask_var(d->send_covered); /* fall through */ 7110 case sa_sd_storage:
7291 case sa_this_book_map: 7111 __sdt_free(cpu_map); /* fall through */
7292 free_cpumask_var(d->this_book_map); /* fall through */
7293 case sa_this_core_map:
7294 free_cpumask_var(d->this_core_map); /* fall through */
7295 case sa_this_sibling_map:
7296 free_cpumask_var(d->this_sibling_map); /* fall through */
7297 case sa_nodemask:
7298 free_cpumask_var(d->nodemask); /* fall through */
7299 case sa_sched_group_nodes:
7300#ifdef CONFIG_NUMA
7301 kfree(d->sched_group_nodes); /* fall through */
7302 case sa_notcovered:
7303 free_cpumask_var(d->notcovered); /* fall through */
7304 case sa_covered:
7305 free_cpumask_var(d->covered); /* fall through */
7306 case sa_domainspan:
7307 free_cpumask_var(d->domainspan); /* fall through */
7308#endif
7309 case sa_none: 7112 case sa_none:
7310 break; 7113 break;
7311 } 7114 }
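__free_domain_allocs() above keeps the classic staged-unwind shape: the enum s_alloc value names how far allocation got, and the fall-through switch releases that stage plus every earlier one. The same idiom in a tiny hypothetical form (names invented for illustration):

struct two_bufs { void *a, *b; };

enum two_alloc { two_all, two_b, two_a, two_none };

static enum two_alloc two_alloc_all(struct two_bufs *x)
{
        x->a = kzalloc(32, GFP_KERNEL);
        if (!x->a)
                return two_a;           /* nothing to undo */
        x->b = kzalloc(32, GFP_KERNEL);
        if (!x->b)
                return two_b;           /* only 'a' needs undoing */
        return two_all;
}

static void two_free(struct two_bufs *x, enum two_alloc what)
{
        switch (what) {
        case two_all:
                kfree(x->b);            /* fall through */
        case two_b:
                kfree(x->a);            /* fall through */
        case two_a:
        case two_none:
                break;
        }
}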
@@ -7314,308 +7117,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7314static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7117static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7315 const struct cpumask *cpu_map) 7118 const struct cpumask *cpu_map)
7316{ 7119{
7317#ifdef CONFIG_NUMA 7120 memset(d, 0, sizeof(*d));
7318 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7121
7319 return sa_none; 7122 if (__sdt_alloc(cpu_map))
7320 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7123 return sa_sd_storage;
7321 return sa_domainspan; 7124 d->sd = alloc_percpu(struct sched_domain *);
7322 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7125 if (!d->sd)
7323 return sa_covered; 7126 return sa_sd_storage;
7324 /* Allocate the per-node list of sched groups */
7325 d->sched_group_nodes = kcalloc(nr_node_ids,
7326 sizeof(struct sched_group *), GFP_KERNEL);
7327 if (!d->sched_group_nodes) {
7328 printk(KERN_WARNING "Can not alloc sched group node list\n");
7329 return sa_notcovered;
7330 }
7331 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7332#endif
7333 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7334 return sa_sched_group_nodes;
7335 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7336 return sa_nodemask;
7337 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7338 return sa_this_sibling_map;
7339 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7340 return sa_this_core_map;
7341 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7342 return sa_this_book_map;
7343 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7344 return sa_send_covered;
7345 d->rd = alloc_rootdomain(); 7127 d->rd = alloc_rootdomain();
7346 if (!d->rd) { 7128 if (!d->rd)
7347 printk(KERN_WARNING "Cannot alloc root domain\n"); 7129 return sa_sd;
7348 return sa_tmpmask;
7349 }
7350 return sa_rootdomain; 7130 return sa_rootdomain;
7351} 7131}
7352 7132
7353static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7133/*
7354 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7134 * NULL the sd_data elements we've used to build the sched_domain and
7135 * sched_group structures so that the subsequent __free_domain_allocs()
7136 * will not free the data we're using.
7137 */
7138static void claim_allocations(int cpu, struct sched_domain *sd)
7355{ 7139{
7356 struct sched_domain *sd = NULL; 7140 struct sd_data *sdd = sd->private;
7357#ifdef CONFIG_NUMA 7141 struct sched_group *sg = sd->groups;
7358 struct sched_domain *parent;
7359
7360 d->sd_allnodes = 0;
7361 if (cpumask_weight(cpu_map) >
7362 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7363 sd = &per_cpu(allnodes_domains, i).sd;
7364 SD_INIT(sd, ALLNODES);
7365 set_domain_attribute(sd, attr);
7366 cpumask_copy(sched_domain_span(sd), cpu_map);
7367 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7368 d->sd_allnodes = 1;
7369 }
7370 parent = sd;
7371
7372 sd = &per_cpu(node_domains, i).sd;
7373 SD_INIT(sd, NODE);
7374 set_domain_attribute(sd, attr);
7375 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7376 sd->parent = parent;
7377 if (parent)
7378 parent->child = sd;
7379 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7380#endif
7381 return sd;
7382}
7383 7142
7384static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7143 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7385 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7144 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7386 struct sched_domain *parent, int i)
7387{
7388 struct sched_domain *sd;
7389 sd = &per_cpu(phys_domains, i).sd;
7390 SD_INIT(sd, CPU);
7391 set_domain_attribute(sd, attr);
7392 cpumask_copy(sched_domain_span(sd), d->nodemask);
7393 sd->parent = parent;
7394 if (parent)
7395 parent->child = sd;
7396 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7397 return sd;
7398}
7399 7145
7400static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7146 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7401 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7147 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7402 struct sched_domain *parent, int i) 7148 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7403{ 7149 }
7404 struct sched_domain *sd = parent;
7405#ifdef CONFIG_SCHED_BOOK
7406 sd = &per_cpu(book_domains, i).sd;
7407 SD_INIT(sd, BOOK);
7408 set_domain_attribute(sd, attr);
7409 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7410 sd->parent = parent;
7411 parent->child = sd;
7412 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7413#endif
7414 return sd;
7415} 7150}
7416 7151
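claim_allocations() above transfers ownership by NULLing the sd_data staging slots: whatever the domain tree now owns is skipped by the later bulk __sdt_free() pass, since kfree(NULL) is a no-op. The idiom in its smallest hypothetical form:

/* hand the staged object to its long-term owner and clear the slot so
 * the bulk teardown, which does kfree(*slot), leaves it alone */
static void claim_slot(void **slot, void **owner)
{
        *owner = *slot;
        *slot = NULL;
}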
7417static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7152#ifdef CONFIG_SCHED_SMT
7418 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7153static const struct cpumask *cpu_smt_mask(int cpu)
7419 struct sched_domain *parent, int i)
7420{ 7154{
7421 struct sched_domain *sd = parent; 7155 return topology_thread_cpumask(cpu);
7422#ifdef CONFIG_SCHED_MC
7423 sd = &per_cpu(core_domains, i).sd;
7424 SD_INIT(sd, MC);
7425 set_domain_attribute(sd, attr);
7426 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7427 sd->parent = parent;
7428 parent->child = sd;
7429 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7430#endif
7431 return sd;
7432} 7156}
7433
7434static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7435 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7436 struct sched_domain *parent, int i)
7437{
7438 struct sched_domain *sd = parent;
7439#ifdef CONFIG_SCHED_SMT
7440 sd = &per_cpu(cpu_domains, i).sd;
7441 SD_INIT(sd, SIBLING);
7442 set_domain_attribute(sd, attr);
7443 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7444 sd->parent = parent;
7445 parent->child = sd;
7446 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7447#endif 7157#endif
7448 return sd;
7449}
7450 7158
7451static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7159/*
7452 const struct cpumask *cpu_map, int cpu) 7160 * Topology list, bottom-up.
7453{ 7161 */
7454 switch (l) { 7162static struct sched_domain_topology_level default_topology[] = {
7455#ifdef CONFIG_SCHED_SMT 7163#ifdef CONFIG_SCHED_SMT
7456 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7164 { sd_init_SIBLING, cpu_smt_mask, },
7457 cpumask_and(d->this_sibling_map, cpu_map,
7458 topology_thread_cpumask(cpu));
7459 if (cpu == cpumask_first(d->this_sibling_map))
7460 init_sched_build_groups(d->this_sibling_map, cpu_map,
7461 &cpu_to_cpu_group,
7462 d->send_covered, d->tmpmask);
7463 break;
7464#endif 7165#endif
7465#ifdef CONFIG_SCHED_MC 7166#ifdef CONFIG_SCHED_MC
7466 case SD_LV_MC: /* set up multi-core groups */ 7167 { sd_init_MC, cpu_coregroup_mask, },
7467 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7468 if (cpu == cpumask_first(d->this_core_map))
7469 init_sched_build_groups(d->this_core_map, cpu_map,
7470 &cpu_to_core_group,
7471 d->send_covered, d->tmpmask);
7472 break;
7473#endif 7168#endif
7474#ifdef CONFIG_SCHED_BOOK 7169#ifdef CONFIG_SCHED_BOOK
7475 case SD_LV_BOOK: /* set up book groups */ 7170 { sd_init_BOOK, cpu_book_mask, },
7476 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7477 if (cpu == cpumask_first(d->this_book_map))
7478 init_sched_build_groups(d->this_book_map, cpu_map,
7479 &cpu_to_book_group,
7480 d->send_covered, d->tmpmask);
7481 break;
7482#endif 7171#endif
7483 case SD_LV_CPU: /* set up physical groups */ 7172 { sd_init_CPU, cpu_cpu_mask, },
7484 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7485 if (!cpumask_empty(d->nodemask))
7486 init_sched_build_groups(d->nodemask, cpu_map,
7487 &cpu_to_phys_group,
7488 d->send_covered, d->tmpmask);
7489 break;
7490#ifdef CONFIG_NUMA 7173#ifdef CONFIG_NUMA
7491 case SD_LV_ALLNODES: 7174 { sd_init_NODE, cpu_node_mask, },
7492 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7175 { sd_init_ALLNODES, cpu_allnodes_mask, },
7493 d->send_covered, d->tmpmask);
7494 break;
7495#endif 7176#endif
7496 default: 7177 { NULL, },
7497 break; 7178};
7179
7180static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7181
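With the table above, the old per-config #ifdef ladders collapse into data: build_sched_domains() simply walks the {init, mask} pairs bottom-up. As a purely hypothetical illustration of the shape (not code from this patch), an architecture that only cared about SMT siblings and the package level could describe itself as:

static struct sched_domain_topology_level my_arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
        { sd_init_SIBLING, cpu_smt_mask, },     /* hardware threads */
#endif
        { sd_init_CPU, cpu_cpu_mask, },         /* the whole package/node */
        { NULL, },                              /* terminator */
};

/* and, hypothetically, install it before domains are (re)built: */
/*      sched_domain_topology = my_arch_topology;      */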
7182static int __sdt_alloc(const struct cpumask *cpu_map)
7183{
7184 struct sched_domain_topology_level *tl;
7185 int j;
7186
7187 for (tl = sched_domain_topology; tl->init; tl++) {
7188 struct sd_data *sdd = &tl->data;
7189
7190 sdd->sd = alloc_percpu(struct sched_domain *);
7191 if (!sdd->sd)
7192 return -ENOMEM;
7193
7194 sdd->sg = alloc_percpu(struct sched_group *);
7195 if (!sdd->sg)
7196 return -ENOMEM;
7197
7198 for_each_cpu(j, cpu_map) {
7199 struct sched_domain *sd;
7200 struct sched_group *sg;
7201
7202 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7203 GFP_KERNEL, cpu_to_node(j));
7204 if (!sd)
7205 return -ENOMEM;
7206
7207 *per_cpu_ptr(sdd->sd, j) = sd;
7208
7209 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7210 GFP_KERNEL, cpu_to_node(j));
7211 if (!sg)
7212 return -ENOMEM;
7213
7214 *per_cpu_ptr(sdd->sg, j) = sg;
7215 }
7216 }
7217
7218 return 0;
7219}
7220
7221static void __sdt_free(const struct cpumask *cpu_map)
7222{
7223 struct sched_domain_topology_level *tl;
7224 int j;
7225
7226 for (tl = sched_domain_topology; tl->init; tl++) {
7227 struct sd_data *sdd = &tl->data;
7228
7229 for_each_cpu(j, cpu_map) {
7230 kfree(*per_cpu_ptr(sdd->sd, j));
7231 kfree(*per_cpu_ptr(sdd->sg, j));
7232 }
7233 free_percpu(sdd->sd);
7234 free_percpu(sdd->sg);
7235 }
7236}
7237
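__sdt_alloc()/__sdt_free() above replace the big static per-cpu structs with per-cpu arrays of pointers, one slot per CPU per level, each object allocated on its CPU's node. A reduced sketch of that allocation pattern under hypothetical names; alloc_percpu() zeroes the area, so unfilled slots stay NULL and kfree(NULL) in the teardown is harmless:

struct item { int val; };

static struct item **__percpu slots;    /* one pointer slot per CPU */

static int slots_alloc(const struct cpumask *cpu_map)
{
        int cpu;

        slots = alloc_percpu(struct item *);
        if (!slots)
                return -ENOMEM;

        for_each_cpu(cpu, cpu_map) {
                struct item *it = kzalloc_node(sizeof(*it), GFP_KERNEL,
                                               cpu_to_node(cpu));
                if (!it)
                        return -ENOMEM; /* caller unwinds via slots_free() */
                *per_cpu_ptr(slots, cpu) = it;
        }
        return 0;
}

static void slots_free(const struct cpumask *cpu_map)
{
        int cpu;

        for_each_cpu(cpu, cpu_map)
                kfree(*per_cpu_ptr(slots, cpu));
        free_percpu(slots);
}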
7238struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7239 struct s_data *d, const struct cpumask *cpu_map,
7240 struct sched_domain_attr *attr, struct sched_domain *child,
7241 int cpu)
7242{
7243 struct sched_domain *sd = tl->init(tl, cpu);
7244 if (!sd)
7245 return child;
7246
7247 set_domain_attribute(sd, attr);
7248 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7249 if (child) {
7250 sd->level = child->level + 1;
7251 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7252 child->parent = sd;
7498 } 7253 }
7254 sd->child = child;
7255
7256 return sd;
7499} 7257}
7500 7258
7501/* 7259/*
7502 * Build sched domains for a given set of cpus and attach the sched domains 7260 * Build sched domains for a given set of cpus and attach the sched domains
7503 * to the individual cpus 7261 * to the individual cpus
7504 */ 7262 */
7505static int __build_sched_domains(const struct cpumask *cpu_map, 7263static int build_sched_domains(const struct cpumask *cpu_map,
7506 struct sched_domain_attr *attr) 7264 struct sched_domain_attr *attr)
7507{ 7265{
7508 enum s_alloc alloc_state = sa_none; 7266 enum s_alloc alloc_state = sa_none;
7509 struct s_data d;
7510 struct sched_domain *sd; 7267 struct sched_domain *sd;
7511 int i; 7268 struct s_data d;
7512#ifdef CONFIG_NUMA 7269 int i, ret = -ENOMEM;
7513 d.sd_allnodes = 0;
7514#endif
7515 7270
7516 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7271 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7517 if (alloc_state != sa_rootdomain) 7272 if (alloc_state != sa_rootdomain)
7518 goto error; 7273 goto error;
7519 alloc_state = sa_sched_groups;
7520 7274
7521 /* 7275 /* Set up domains for cpus specified by the cpu_map. */
7522 * Set up domains for cpus specified by the cpu_map.
7523 */
7524 for_each_cpu(i, cpu_map) { 7276 for_each_cpu(i, cpu_map) {
7525 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7277 struct sched_domain_topology_level *tl;
7526 cpu_map);
7527 7278
7528 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7279 sd = NULL;
7529 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7280 for (tl = sched_domain_topology; tl->init; tl++)
7530 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); 7281 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7531 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7532 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7533 }
7534
7535 for_each_cpu(i, cpu_map) {
7536 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7537 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7538 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7539 }
7540
7541 /* Set up physical groups */
7542 for (i = 0; i < nr_node_ids; i++)
7543 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7544 7282
7545#ifdef CONFIG_NUMA 7283 while (sd->child)
7546 /* Set up node groups */ 7284 sd = sd->child;
7547 if (d.sd_allnodes)
7548 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7549
7550 for (i = 0; i < nr_node_ids; i++)
7551 if (build_numa_sched_groups(&d, cpu_map, i))
7552 goto error;
7553#endif
7554 7285
7555 /* Calculate CPU power for physical packages and nodes */ 7286 *per_cpu_ptr(d.sd, i) = sd;
7556#ifdef CONFIG_SCHED_SMT
7557 for_each_cpu(i, cpu_map) {
7558 sd = &per_cpu(cpu_domains, i).sd;
7559 init_sched_groups_power(i, sd);
7560 }
7561#endif
7562#ifdef CONFIG_SCHED_MC
7563 for_each_cpu(i, cpu_map) {
7564 sd = &per_cpu(core_domains, i).sd;
7565 init_sched_groups_power(i, sd);
7566 } 7287 }
7567#endif
7568#ifdef CONFIG_SCHED_BOOK
7569 for_each_cpu(i, cpu_map) {
7570 sd = &per_cpu(book_domains, i).sd;
7571 init_sched_groups_power(i, sd);
7572 }
7573#endif
7574 7288
7289 /* Build the groups for the domains */
7575 for_each_cpu(i, cpu_map) { 7290 for_each_cpu(i, cpu_map) {
7576 sd = &per_cpu(phys_domains, i).sd; 7291 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7577 init_sched_groups_power(i, sd); 7292 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7578 } 7293 get_group(i, sd->private, &sd->groups);
7294 atomic_inc(&sd->groups->ref);
7579 7295
7580#ifdef CONFIG_NUMA 7296 if (i != cpumask_first(sched_domain_span(sd)))
7581 for (i = 0; i < nr_node_ids; i++) 7297 continue;
7582 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7583 7298
7584 if (d.sd_allnodes) { 7299 build_sched_groups(sd);
7585 struct sched_group *sg; 7300 }
7301 }
7586 7302
7587 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7303 /* Calculate CPU power for physical packages and nodes */
7588 d.tmpmask); 7304 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7589 init_numa_sched_groups_power(sg); 7305 if (!cpumask_test_cpu(i, cpu_map))
7306 continue;
7307
7308 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7309 claim_allocations(i, sd);
7310 init_sched_groups_power(i, sd);
7311 }
7590 } 7312 }
7591#endif
7592 7313
7593 /* Attach the domains */ 7314 /* Attach the domains */
7315 rcu_read_lock();
7594 for_each_cpu(i, cpu_map) { 7316 for_each_cpu(i, cpu_map) {
7595#ifdef CONFIG_SCHED_SMT 7317 sd = *per_cpu_ptr(d.sd, i);
7596 sd = &per_cpu(cpu_domains, i).sd;
7597#elif defined(CONFIG_SCHED_MC)
7598 sd = &per_cpu(core_domains, i).sd;
7599#elif defined(CONFIG_SCHED_BOOK)
7600 sd = &per_cpu(book_domains, i).sd;
7601#else
7602 sd = &per_cpu(phys_domains, i).sd;
7603#endif
7604 cpu_attach_domain(sd, d.rd, i); 7318 cpu_attach_domain(sd, d.rd, i);
7605 } 7319 }
7320 rcu_read_unlock();
7606 7321
7607 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7322 ret = 0;
7608 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7609 return 0;
7610
7611error: 7323error:
7612 __free_domain_allocs(&d, alloc_state, cpu_map); 7324 __free_domain_allocs(&d, alloc_state, cpu_map);
7613 return -ENOMEM; 7325 return ret;
7614}
7615
7616static int build_sched_domains(const struct cpumask *cpu_map)
7617{
7618 return __build_sched_domains(cpu_map, NULL);
7619} 7326}
7620 7327
7621static cpumask_var_t *doms_cur; /* current sched domains */ 7328static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7670,7 +7377,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7670 * For now this just excludes isolated cpus, but could be used to 7377 * For now this just excludes isolated cpus, but could be used to
7671 * exclude other special cases in the future. 7378 * exclude other special cases in the future.
7672 */ 7379 */
7673static int arch_init_sched_domains(const struct cpumask *cpu_map) 7380static int init_sched_domains(const struct cpumask *cpu_map)
7674{ 7381{
7675 int err; 7382 int err;
7676 7383
@@ -7681,32 +7388,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7681 doms_cur = &fallback_doms; 7388 doms_cur = &fallback_doms;
7682 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7389 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7683 dattr_cur = NULL; 7390 dattr_cur = NULL;
7684 err = build_sched_domains(doms_cur[0]); 7391 err = build_sched_domains(doms_cur[0], NULL);
7685 register_sched_domain_sysctl(); 7392 register_sched_domain_sysctl();
7686 7393
7687 return err; 7394 return err;
7688} 7395}
7689 7396
7690static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7691 struct cpumask *tmpmask)
7692{
7693 free_sched_groups(cpu_map, tmpmask);
7694}
7695
7696/* 7397/*
7697 * Detach sched domains from a group of cpus specified in cpu_map 7398 * Detach sched domains from a group of cpus specified in cpu_map
7698 * These cpus will now be attached to the NULL domain 7399 * These cpus will now be attached to the NULL domain
7699 */ 7400 */
7700static void detach_destroy_domains(const struct cpumask *cpu_map) 7401static void detach_destroy_domains(const struct cpumask *cpu_map)
7701{ 7402{
7702 /* Save because hotplug lock held. */
7703 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7704 int i; 7403 int i;
7705 7404
7405 rcu_read_lock();
7706 for_each_cpu(i, cpu_map) 7406 for_each_cpu(i, cpu_map)
7707 cpu_attach_domain(NULL, &def_root_domain, i); 7407 cpu_attach_domain(NULL, &def_root_domain, i);
7708 synchronize_sched(); 7408 rcu_read_unlock();
7709 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7710} 7409}
7711 7410
7712/* handle null as "default" */ 7411/* handle null as "default" */
@@ -7795,8 +7494,7 @@ match1:
7795 goto match2; 7494 goto match2;
7796 } 7495 }
7797 /* no match - add a new doms_new */ 7496 /* no match - add a new doms_new */
7798 __build_sched_domains(doms_new[i], 7497 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7799 dattr_new ? dattr_new + i : NULL);
7800match2: 7498match2:
7801 ; 7499 ;
7802 } 7500 }
@@ -7815,7 +7513,7 @@ match2:
7815} 7513}
7816 7514
7817#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7515#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7818static void arch_reinit_sched_domains(void) 7516static void reinit_sched_domains(void)
7819{ 7517{
7820 get_online_cpus(); 7518 get_online_cpus();
7821 7519
@@ -7848,7 +7546,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7848 else 7546 else
7849 sched_mc_power_savings = level; 7547 sched_mc_power_savings = level;
7850 7548
7851 arch_reinit_sched_domains(); 7549 reinit_sched_domains();
7852 7550
7853 return count; 7551 return count;
7854} 7552}
@@ -7967,14 +7665,9 @@ void __init sched_init_smp(void)
7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7665 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7666 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7969 7667
7970#if defined(CONFIG_NUMA)
7971 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7972 GFP_KERNEL);
7973 BUG_ON(sched_group_nodes_bycpu == NULL);
7974#endif
7975 get_online_cpus(); 7668 get_online_cpus();
7976 mutex_lock(&sched_domains_mutex); 7669 mutex_lock(&sched_domains_mutex);
7977 arch_init_sched_domains(cpu_active_mask); 7670 init_sched_domains(cpu_active_mask);
7978 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7671 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7979 if (cpumask_empty(non_isolated_cpus)) 7672 if (cpumask_empty(non_isolated_cpus))
7980 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7673 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8224,7 +7917,7 @@ void __init sched_init(void)
8224#ifdef CONFIG_SMP 7917#ifdef CONFIG_SMP
8225 rq->sd = NULL; 7918 rq->sd = NULL;
8226 rq->rd = NULL; 7919 rq->rd = NULL;
8227 rq->cpu_power = SCHED_LOAD_SCALE; 7920 rq->cpu_power = SCHED_POWER_SCALE;
8228 rq->post_schedule = 0; 7921 rq->post_schedule = 0;
8229 rq->active_balance = 0; 7922 rq->active_balance = 0;
8230 rq->next_balance = jiffies; 7923 rq->next_balance = jiffies;
@@ -8281,6 +7974,7 @@ void __init sched_init(void)
8281 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7974 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8282 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7975 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8283#ifdef CONFIG_SMP 7976#ifdef CONFIG_SMP
7977 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8284#ifdef CONFIG_NO_HZ 7978#ifdef CONFIG_NO_HZ
8285 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 7979 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8286 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 7980 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8340,7 +8034,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8340 int old_prio = p->prio; 8034 int old_prio = p->prio;
8341 int on_rq; 8035 int on_rq;
8342 8036
8343 on_rq = p->se.on_rq; 8037 on_rq = p->on_rq;
8344 if (on_rq) 8038 if (on_rq)
8345 deactivate_task(rq, p, 0); 8039 deactivate_task(rq, p, 0);
8346 __setscheduler(rq, p, SCHED_NORMAL, 0); 8040 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8553,7 +8247,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8553{ 8247{
8554 struct rt_rq *rt_rq; 8248 struct rt_rq *rt_rq;
8555 struct sched_rt_entity *rt_se; 8249 struct sched_rt_entity *rt_se;
8556 struct rq *rq;
8557 int i; 8250 int i;
8558 8251
8559 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8252 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8567,8 +8260,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8567 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8260 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8568 8261
8569 for_each_possible_cpu(i) { 8262 for_each_possible_cpu(i) {
8570 rq = cpu_rq(i);
8571
8572 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8263 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8573 GFP_KERNEL, cpu_to_node(i)); 8264 GFP_KERNEL, cpu_to_node(i));
8574 if (!rt_rq) 8265 if (!rt_rq)
@@ -8683,7 +8374,7 @@ void sched_move_task(struct task_struct *tsk)
8683 rq = task_rq_lock(tsk, &flags); 8374 rq = task_rq_lock(tsk, &flags);
8684 8375
8685 running = task_current(rq, tsk); 8376 running = task_current(rq, tsk);
8686 on_rq = tsk->se.on_rq; 8377 on_rq = tsk->on_rq;
8687 8378
8688 if (on_rq) 8379 if (on_rq)
8689 dequeue_task(rq, tsk, 0); 8380 dequeue_task(rq, tsk, 0);
@@ -8702,7 +8393,7 @@ void sched_move_task(struct task_struct *tsk)
8702 if (on_rq) 8393 if (on_rq)
8703 enqueue_task(rq, tsk, 0); 8394 enqueue_task(rq, tsk, 0);
8704 8395
8705 task_rq_unlock(rq, &flags); 8396 task_rq_unlock(rq, tsk, &flags);
8706} 8397}
8707#endif /* CONFIG_CGROUP_SCHED */ 8398#endif /* CONFIG_CGROUP_SCHED */
8708 8399
@@ -9130,14 +8821,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9130static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8821static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9131 u64 shareval) 8822 u64 shareval)
9132{ 8823{
9133 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8824 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
9134} 8825}
9135 8826
9136static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8827static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9137{ 8828{
9138 struct task_group *tg = cgroup_tg(cgrp); 8829 struct task_group *tg = cgroup_tg(cgrp);
9139 8830
9140 return (u64) tg->shares; 8831 return (u64) scale_load_down(tg->shares);
9141} 8832}
9142#endif /* CONFIG_FAIR_GROUP_SCHED */ 8833#endif /* CONFIG_FAIR_GROUP_SCHED */
9143 8834