Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  1785
1 file changed, 734 insertions(+), 1051 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 48013633d792..cbb3a0eee58e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
231#endif 231#endif
232 232
233/* 233/*
234 * sched_domains_mutex serializes calls to arch_init_sched_domains, 234 * sched_domains_mutex serializes calls to init_sched_domains,
235 * detach_destroy_domains and partition_sched_domains. 235 * detach_destroy_domains and partition_sched_domains.
236 */ 236 */
237static DEFINE_MUTEX(sched_domains_mutex); 237static DEFINE_MUTEX(sched_domains_mutex);
@@ -293,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock);
293 * limitation from this.) 293 * limitation from this.)
294 */ 294 */
295#define MIN_SHARES 2 295#define MIN_SHARES 2
296#define MAX_SHARES (1UL << 18) 296#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION))
297 297
298static int root_task_group_load = ROOT_TASK_GROUP_LOAD; 298static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
299#endif 299#endif
@@ -312,6 +312,9 @@ struct cfs_rq {
312 312
313 u64 exec_clock; 313 u64 exec_clock;
314 u64 min_vruntime; 314 u64 min_vruntime;
315#ifndef CONFIG_64BIT
316 u64 min_vruntime_copy;
317#endif
315 318
316 struct rb_root tasks_timeline; 319 struct rb_root tasks_timeline;
317 struct rb_node *rb_leftmost; 320 struct rb_node *rb_leftmost;
@@ -325,7 +328,9 @@ struct cfs_rq {
325 */ 328 */
326 struct sched_entity *curr, *next, *last, *skip; 329 struct sched_entity *curr, *next, *last, *skip;
327 330
331#ifdef CONFIG_SCHED_DEBUG
328 unsigned int nr_spread_over; 332 unsigned int nr_spread_over;
333#endif
329 334
330#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
331 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 336 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -417,6 +422,7 @@ struct rt_rq {
417 */ 422 */
418struct root_domain { 423struct root_domain {
419 atomic_t refcount; 424 atomic_t refcount;
425 struct rcu_head rcu;
420 cpumask_var_t span; 426 cpumask_var_t span;
421 cpumask_var_t online; 427 cpumask_var_t online;
422 428
@@ -460,7 +466,7 @@ struct rq {
460 u64 nohz_stamp; 466 u64 nohz_stamp;
461 unsigned char nohz_balance_kick; 467 unsigned char nohz_balance_kick;
462#endif 468#endif
463 unsigned int skip_clock_update; 469 int skip_clock_update;
464 470
465 /* capture load from *all* tasks on this cpu: */ 471 /* capture load from *all* tasks on this cpu: */
466 struct load_weight load; 472 struct load_weight load;
@@ -553,6 +559,10 @@ struct rq {
553 unsigned int ttwu_count; 559 unsigned int ttwu_count;
554 unsigned int ttwu_local; 560 unsigned int ttwu_local;
555#endif 561#endif
562
563#ifdef CONFIG_SMP
564 struct task_struct *wake_list;
565#endif
556}; 566};
557 567
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 568static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -571,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
571 581
572#define rcu_dereference_check_sched_domain(p) \ 582#define rcu_dereference_check_sched_domain(p) \
573 rcu_dereference_check((p), \ 583 rcu_dereference_check((p), \
574 rcu_read_lock_sched_held() || \ 584 rcu_read_lock_held() || \
575 lockdep_is_held(&sched_domains_mutex)) 585 lockdep_is_held(&sched_domains_mutex))
576 586
577/* 587/*
@@ -596,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
596 * Return the group to which this tasks belongs. 606 * Return the group to which this tasks belongs.
597 * 607 *
598 * We use task_subsys_state_check() and extend the RCU verification 608 * We use task_subsys_state_check() and extend the RCU verification
599 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() 609 * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
600 * holds that lock for each task it moves into the cgroup. Therefore 610 * holds that lock for each task it moves into the cgroup. Therefore
601 * by holding that lock, we pin the task to the current cgroup. 611 * by holding that lock, we pin the task to the current cgroup.
602 */ 612 */
@@ -606,7 +616,7 @@ static inline struct task_group *task_group(struct task_struct *p)
606 struct cgroup_subsys_state *css; 616 struct cgroup_subsys_state *css;
607 617
608 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 618 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
609 lockdep_is_held(&task_rq(p)->lock)); 619 lockdep_is_held(&p->pi_lock));
610 tg = container_of(css, struct task_group, css); 620 tg = container_of(css, struct task_group, css);
611 621
612 return autogroup_task_group(p, tg); 622 return autogroup_task_group(p, tg);
@@ -642,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
642{ 652{
643 s64 delta; 653 s64 delta;
644 654
645 if (rq->skip_clock_update) 655 if (rq->skip_clock_update > 0)
646 return; 656 return;
647 657
648 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; 658 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -838,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
838 return rq->curr == p; 848 return rq->curr == p;
839} 849}
840 850
841#ifndef __ARCH_WANT_UNLOCKED_CTXSW
842static inline int task_running(struct rq *rq, struct task_struct *p) 851static inline int task_running(struct rq *rq, struct task_struct *p)
843{ 852{
853#ifdef CONFIG_SMP
854 return p->on_cpu;
855#else
844 return task_current(rq, p); 856 return task_current(rq, p);
857#endif
845} 858}
846 859
860#ifndef __ARCH_WANT_UNLOCKED_CTXSW
847static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 861static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
848{ 862{
863#ifdef CONFIG_SMP
864 /*
865 * We can optimise this out completely for !SMP, because the
866 * SMP rebalancing from interrupt is the only thing that cares
867 * here.
868 */
869 next->on_cpu = 1;
870#endif
849} 871}
850 872
851static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) 873static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
852{ 874{
875#ifdef CONFIG_SMP
876 /*
877 * After ->on_cpu is cleared, the task can be moved to a different CPU.
878 * We must ensure this doesn't happen until the switch is completely
879 * finished.
880 */
881 smp_wmb();
882 prev->on_cpu = 0;
883#endif
853#ifdef CONFIG_DEBUG_SPINLOCK 884#ifdef CONFIG_DEBUG_SPINLOCK
854 /* this is a valid case when another task releases the spinlock */ 885 /* this is a valid case when another task releases the spinlock */
855 rq->lock.owner = current; 886 rq->lock.owner = current;
@@ -865,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
865} 896}
866 897
867#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 898#else /* __ARCH_WANT_UNLOCKED_CTXSW */
868static inline int task_running(struct rq *rq, struct task_struct *p)
869{
870#ifdef CONFIG_SMP
871 return p->oncpu;
872#else
873 return task_current(rq, p);
874#endif
875}
876
877static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 899static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
878{ 900{
879#ifdef CONFIG_SMP 901#ifdef CONFIG_SMP
@@ -882,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
882 * SMP rebalancing from interrupt is the only thing that cares 904 * SMP rebalancing from interrupt is the only thing that cares
883 * here. 905 * here.
884 */ 906 */
885 next->oncpu = 1; 907 next->on_cpu = 1;
886#endif 908#endif
887#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 909#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
888 raw_spin_unlock_irq(&rq->lock); 910 raw_spin_unlock_irq(&rq->lock);
@@ -895,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
895{ 917{
896#ifdef CONFIG_SMP 918#ifdef CONFIG_SMP
897 /* 919 /*
898 * After ->oncpu is cleared, the task can be moved to a different CPU. 920 * After ->on_cpu is cleared, the task can be moved to a different CPU.
899 * We must ensure this doesn't happen until the switch is completely 921 * We must ensure this doesn't happen until the switch is completely
900 * finished. 922 * finished.
901 */ 923 */
902 smp_wmb(); 924 smp_wmb();
903 prev->oncpu = 0; 925 prev->on_cpu = 0;
904#endif 926#endif
905#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW 927#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
906 local_irq_enable(); 928 local_irq_enable();
@@ -909,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
909#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 931#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
910 932
911/* 933/*
912 * Check whether the task is waking, we use this to synchronize ->cpus_allowed 934 * __task_rq_lock - lock the rq @p resides on.
913 * against ttwu().
914 */
915static inline int task_is_waking(struct task_struct *p)
916{
917 return unlikely(p->state == TASK_WAKING);
918}
919
920/*
921 * __task_rq_lock - lock the runqueue a given task resides on.
922 * Must be called interrupts disabled.
923 */ 935 */
924static inline struct rq *__task_rq_lock(struct task_struct *p) 936static inline struct rq *__task_rq_lock(struct task_struct *p)
925 __acquires(rq->lock) 937 __acquires(rq->lock)
926{ 938{
927 struct rq *rq; 939 struct rq *rq;
928 940
941 lockdep_assert_held(&p->pi_lock);
942
929 for (;;) { 943 for (;;) {
930 rq = task_rq(p); 944 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
@@ -936,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936} 950}
937 951
938/* 952/*
939 * task_rq_lock - lock the runqueue a given task resides on and disable 953 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
940 * interrupts. Note the ordering: we can safely lookup the task_rq without
941 * explicitly disabling preemption.
942 */ 954 */
943static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 955static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
956 __acquires(p->pi_lock)
944 __acquires(rq->lock) 957 __acquires(rq->lock)
945{ 958{
946 struct rq *rq; 959 struct rq *rq;
947 960
948 for (;;) { 961 for (;;) {
949 local_irq_save(*flags); 962 raw_spin_lock_irqsave(&p->pi_lock, *flags);
950 rq = task_rq(p); 963 rq = task_rq(p);
951 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 965 if (likely(rq == task_rq(p)))
953 return rq; 966 return rq;
954 raw_spin_unlock_irqrestore(&rq->lock, *flags); 967 raw_spin_unlock(&rq->lock);
968 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
955 } 969 }
956} 970}
957 971
@@ -961,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
961 raw_spin_unlock(&rq->lock); 975 raw_spin_unlock(&rq->lock);
962} 976}
963 977
964static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 978static inline void
979task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
965 __releases(rq->lock) 980 __releases(rq->lock)
981 __releases(p->pi_lock)
966{ 982{
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 983 raw_spin_unlock(&rq->lock);
984 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
968} 985}
969 986
970/* 987/*
@@ -1193,11 +1210,17 @@ int get_nohz_timer_target(void)
1193 int i; 1210 int i;
1194 struct sched_domain *sd; 1211 struct sched_domain *sd;
1195 1212
1213 rcu_read_lock();
1196 for_each_domain(cpu, sd) { 1214 for_each_domain(cpu, sd) {
1197 for_each_cpu(i, sched_domain_span(sd)) 1215 for_each_cpu(i, sched_domain_span(sd)) {
1198 if (!idle_cpu(i)) 1216 if (!idle_cpu(i)) {
1199 return i; 1217 cpu = i;
1218 goto unlock;
1219 }
1220 }
1200 } 1221 }
1222unlock:
1223 rcu_read_unlock();
1201 return cpu; 1224 return cpu;
1202} 1225}
1203/* 1226/*
@@ -1307,15 +1330,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1307{ 1330{
1308 u64 tmp; 1331 u64 tmp;
1309 1332
1333 /*
1334 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1335 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1336 * 2^SCHED_LOAD_RESOLUTION.
1337 */
1338 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1339 tmp = (u64)delta_exec * scale_load_down(weight);
1340 else
1341 tmp = (u64)delta_exec;
1342
1310 if (!lw->inv_weight) { 1343 if (!lw->inv_weight) {
1311 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) 1344 unsigned long w = scale_load_down(lw->weight);
1345
1346 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1312 lw->inv_weight = 1; 1347 lw->inv_weight = 1;
1348 else if (unlikely(!w))
1349 lw->inv_weight = WMULT_CONST;
1313 else 1350 else
1314 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) 1351 lw->inv_weight = WMULT_CONST / w;
1315 / (lw->weight+1);
1316 } 1352 }
1317 1353
1318 tmp = (u64)delta_exec * weight;
1319 /* 1354 /*
1320 * Check whether we'd overflow the 64-bit multiplication: 1355 * Check whether we'd overflow the 64-bit multiplication:
1321 */ 1356 */
@@ -1755,17 +1790,20 @@ static void dec_nr_running(struct rq *rq)
1755 1790
1756static void set_load_weight(struct task_struct *p) 1791static void set_load_weight(struct task_struct *p)
1757{ 1792{
1793 int prio = p->static_prio - MAX_RT_PRIO;
1794 struct load_weight *load = &p->se.load;
1795
1758 /* 1796 /*
1759 * SCHED_IDLE tasks get minimal weight: 1797 * SCHED_IDLE tasks get minimal weight:
1760 */ 1798 */
1761 if (p->policy == SCHED_IDLE) { 1799 if (p->policy == SCHED_IDLE) {
1762 p->se.load.weight = WEIGHT_IDLEPRIO; 1800 load->weight = scale_load(WEIGHT_IDLEPRIO);
1763 p->se.load.inv_weight = WMULT_IDLEPRIO; 1801 load->inv_weight = WMULT_IDLEPRIO;
1764 return; 1802 return;
1765 } 1803 }
1766 1804
1767 p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO]; 1805 load->weight = scale_load(prio_to_weight[prio]);
1768 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1806 load->inv_weight = prio_to_wmult[prio];
1769} 1807}
1770 1808
1771static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1809static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1773,7 +1811,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1773 update_rq_clock(rq); 1811 update_rq_clock(rq);
1774 sched_info_queued(p); 1812 sched_info_queued(p);
1775 p->sched_class->enqueue_task(rq, p, flags); 1813 p->sched_class->enqueue_task(rq, p, flags);
1776 p->se.on_rq = 1;
1777} 1814}
1778 1815
1779static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 1816static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1781,7 +1818,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1781 update_rq_clock(rq); 1818 update_rq_clock(rq);
1782 sched_info_dequeued(p); 1819 sched_info_dequeued(p);
1783 p->sched_class->dequeue_task(rq, p, flags); 1820 p->sched_class->dequeue_task(rq, p, flags);
1784 p->se.on_rq = 0;
1785} 1821}
1786 1822
1787/* 1823/*
@@ -2116,7 +2152,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2116 * A queue event has occurred, and we're going to schedule. In 2152 * A queue event has occurred, and we're going to schedule. In
2117 * this case, we can save a useless back to back clock update. 2153 * this case, we can save a useless back to back clock update.
2118 */ 2154 */
2119 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) 2155 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
2120 rq->skip_clock_update = 1; 2156 rq->skip_clock_update = 1;
2121} 2157}
2122 2158
@@ -2162,6 +2198,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2162 */ 2198 */
2163 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 2199 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2164 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 2200 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2201
2202#ifdef CONFIG_LOCKDEP
2203 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2204 lockdep_is_held(&task_rq(p)->lock)));
2205#endif
2165#endif 2206#endif
2166 2207
2167 trace_sched_migrate_task(p, new_cpu); 2208 trace_sched_migrate_task(p, new_cpu);
@@ -2182,19 +2223,6 @@ struct migration_arg {
2182static int migration_cpu_stop(void *data); 2223static int migration_cpu_stop(void *data);
2183 2224
2184/* 2225/*
2185 * The task's runqueue lock must be held.
2186 * Returns true if you have to wait for migration thread.
2187 */
2188static bool migrate_task(struct task_struct *p, struct rq *rq)
2189{
2190 /*
2191 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task.
2193 */
2194 return p->se.on_rq || task_running(rq, p);
2195}
2196
2197/*
2198 * wait_task_inactive - wait for a thread to unschedule. 2226 * wait_task_inactive - wait for a thread to unschedule.
2199 * 2227 *
2200 * If @match_state is nonzero, it's the @p->state value just checked and 2228 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2251,11 +2279,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2251 rq = task_rq_lock(p, &flags); 2279 rq = task_rq_lock(p, &flags);
2252 trace_sched_wait_task(p); 2280 trace_sched_wait_task(p);
2253 running = task_running(rq, p); 2281 running = task_running(rq, p);
2254 on_rq = p->se.on_rq; 2282 on_rq = p->on_rq;
2255 ncsw = 0; 2283 ncsw = 0;
2256 if (!match_state || p->state == match_state) 2284 if (!match_state || p->state == match_state)
2257 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 2285 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2258 task_rq_unlock(rq, &flags); 2286 task_rq_unlock(rq, p, &flags);
2259 2287
2260 /* 2288 /*
2261 * If it changed from the expected state, bail out now. 2289 * If it changed from the expected state, bail out now.
@@ -2330,7 +2358,7 @@ EXPORT_SYMBOL_GPL(kick_process);
2330 2358
2331#ifdef CONFIG_SMP 2359#ifdef CONFIG_SMP
2332/* 2360/*
2333 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held. 2361 * ->cpus_allowed is protected by both rq->lock and p->pi_lock
2334 */ 2362 */
2335static int select_fallback_rq(int cpu, struct task_struct *p) 2363static int select_fallback_rq(int cpu, struct task_struct *p)
2336{ 2364{
@@ -2363,12 +2391,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2363} 2391}
2364 2392
2365/* 2393/*
2366 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable. 2394 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
2367 */ 2395 */
2368static inline 2396static inline
2369int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags) 2397int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2370{ 2398{
2371 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags); 2399 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2372 2400
2373 /* 2401 /*
2374 * In order not to call set_task_cpu() on a blocking task we need 2402 * In order not to call set_task_cpu() on a blocking task we need
@@ -2394,27 +2422,62 @@ static void update_avg(u64 *avg, u64 sample)
2394} 2422}
2395#endif 2423#endif
2396 2424
2397static inline void ttwu_activate(struct task_struct *p, struct rq *rq, 2425static void
2398 bool is_sync, bool is_migrate, bool is_local, 2426ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2399 unsigned long en_flags)
2400{ 2427{
2428#ifdef CONFIG_SCHEDSTATS
2429 struct rq *rq = this_rq();
2430
2431#ifdef CONFIG_SMP
2432 int this_cpu = smp_processor_id();
2433
2434 if (cpu == this_cpu) {
2435 schedstat_inc(rq, ttwu_local);
2436 schedstat_inc(p, se.statistics.nr_wakeups_local);
2437 } else {
2438 struct sched_domain *sd;
2439
2440 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2441 rcu_read_lock();
2442 for_each_domain(this_cpu, sd) {
2443 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2444 schedstat_inc(sd, ttwu_wake_remote);
2445 break;
2446 }
2447 }
2448 rcu_read_unlock();
2449 }
2450#endif /* CONFIG_SMP */
2451
2452 schedstat_inc(rq, ttwu_count);
2401 schedstat_inc(p, se.statistics.nr_wakeups); 2453 schedstat_inc(p, se.statistics.nr_wakeups);
2402 if (is_sync) 2454
2455 if (wake_flags & WF_SYNC)
2403 schedstat_inc(p, se.statistics.nr_wakeups_sync); 2456 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2404 if (is_migrate) 2457
2458 if (cpu != task_cpu(p))
2405 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 2459 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2406 if (is_local)
2407 schedstat_inc(p, se.statistics.nr_wakeups_local);
2408 else
2409 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2410 2460
2461#endif /* CONFIG_SCHEDSTATS */
2462}
2463
2464static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
2465{
2411 activate_task(rq, p, en_flags); 2466 activate_task(rq, p, en_flags);
2467 p->on_rq = 1;
2468
2469 /* if a worker is waking up, notify workqueue */
2470 if (p->flags & PF_WQ_WORKER)
2471 wq_worker_waking_up(p, cpu_of(rq));
2412} 2472}
2413 2473
2414static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, 2474/*
2415 int wake_flags, bool success) 2475 * Mark the task runnable and perform wakeup-preemption.
2476 */
2477static void
2478ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
2416{ 2479{
2417 trace_sched_wakeup(p, success); 2480 trace_sched_wakeup(p, true);
2418 check_preempt_curr(rq, p, wake_flags); 2481 check_preempt_curr(rq, p, wake_flags);
2419 2482
2420 p->state = TASK_RUNNING; 2483 p->state = TASK_RUNNING;
@@ -2433,9 +2496,118 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2433 rq->idle_stamp = 0; 2496 rq->idle_stamp = 0;
2434 } 2497 }
2435#endif 2498#endif
2436 /* if a worker is waking up, notify workqueue */ 2499}
2437 if ((p->flags & PF_WQ_WORKER) && success) 2500
2438 wq_worker_waking_up(p, cpu_of(rq)); 2501static void
2502ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
2503{
2504#ifdef CONFIG_SMP
2505 if (p->sched_contributes_to_load)
2506 rq->nr_uninterruptible--;
2507#endif
2508
2509 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
2510 ttwu_do_wakeup(rq, p, wake_flags);
2511}
2512
2513/*
2514 * Called in case the task @p isn't fully descheduled from its runqueue,
2515 * in this case we must do a remote wakeup. Its a 'light' wakeup though,
2516 * since all we need to do is flip p->state to TASK_RUNNING, since
2517 * the task is still ->on_rq.
2518 */
2519static int ttwu_remote(struct task_struct *p, int wake_flags)
2520{
2521 struct rq *rq;
2522 int ret = 0;
2523
2524 rq = __task_rq_lock(p);
2525 if (p->on_rq) {
2526 ttwu_do_wakeup(rq, p, wake_flags);
2527 ret = 1;
2528 }
2529 __task_rq_unlock(rq);
2530
2531 return ret;
2532}
2533
2534#ifdef CONFIG_SMP
2535static void sched_ttwu_pending(void)
2536{
2537 struct rq *rq = this_rq();
2538 struct task_struct *list = xchg(&rq->wake_list, NULL);
2539
2540 if (!list)
2541 return;
2542
2543 raw_spin_lock(&rq->lock);
2544
2545 while (list) {
2546 struct task_struct *p = list;
2547 list = list->wake_entry;
2548 ttwu_do_activate(rq, p, 0);
2549 }
2550
2551 raw_spin_unlock(&rq->lock);
2552}
2553
2554void scheduler_ipi(void)
2555{
2556 sched_ttwu_pending();
2557}
2558
2559static void ttwu_queue_remote(struct task_struct *p, int cpu)
2560{
2561 struct rq *rq = cpu_rq(cpu);
2562 struct task_struct *next = rq->wake_list;
2563
2564 for (;;) {
2565 struct task_struct *old = next;
2566
2567 p->wake_entry = next;
2568 next = cmpxchg(&rq->wake_list, old, p);
2569 if (next == old)
2570 break;
2571 }
2572
2573 if (!next)
2574 smp_send_reschedule(cpu);
2575}
2576
2577#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2578static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2579{
2580 struct rq *rq;
2581 int ret = 0;
2582
2583 rq = __task_rq_lock(p);
2584 if (p->on_cpu) {
2585 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2586 ttwu_do_wakeup(rq, p, wake_flags);
2587 ret = 1;
2588 }
2589 __task_rq_unlock(rq);
2590
2591 return ret;
2592
2593}
2594#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2595#endif /* CONFIG_SMP */
2596
2597static void ttwu_queue(struct task_struct *p, int cpu)
2598{
2599 struct rq *rq = cpu_rq(cpu);
2600
2601#if defined(CONFIG_SMP)
2602 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
2603 ttwu_queue_remote(p, cpu);
2604 return;
2605 }
2606#endif
2607
2608 raw_spin_lock(&rq->lock);
2609 ttwu_do_activate(rq, p, 0);
2610 raw_spin_unlock(&rq->lock);
2439} 2611}
2440 2612
2441/** 2613/**
@@ -2453,92 +2625,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2453 * Returns %true if @p was woken up, %false if it was already running 2625 * Returns %true if @p was woken up, %false if it was already running
2454 * or @state didn't match @p's state. 2626 * or @state didn't match @p's state.
2455 */ 2627 */
2456static int try_to_wake_up(struct task_struct *p, unsigned int state, 2628static int
2457 int wake_flags) 2629try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2458{ 2630{
2459 int cpu, orig_cpu, this_cpu, success = 0;
2460 unsigned long flags; 2631 unsigned long flags;
2461 unsigned long en_flags = ENQUEUE_WAKEUP; 2632 int cpu, success = 0;
2462 struct rq *rq;
2463
2464 this_cpu = get_cpu();
2465 2633
2466 smp_wmb(); 2634 smp_wmb();
2467 rq = task_rq_lock(p, &flags); 2635 raw_spin_lock_irqsave(&p->pi_lock, flags);
2468 if (!(p->state & state)) 2636 if (!(p->state & state))
2469 goto out; 2637 goto out;
2470 2638
2471 if (p->se.on_rq) 2639 success = 1; /* we're going to change ->state */
2472 goto out_running;
2473
2474 cpu = task_cpu(p); 2640 cpu = task_cpu(p);
2475 orig_cpu = cpu;
2476 2641
2477#ifdef CONFIG_SMP 2642 if (p->on_rq && ttwu_remote(p, wake_flags))
2478 if (unlikely(task_running(rq, p))) 2643 goto stat;
2479 goto out_activate;
2480 2644
2645#ifdef CONFIG_SMP
2481 /* 2646 /*
2482 * In order to handle concurrent wakeups and release the rq->lock 2647 * If the owning (remote) cpu is still in the middle of schedule() with
2483 * we put the task in TASK_WAKING state. 2648 * this task as prev, wait until its done referencing the task.
2484 *
2485 * First fix up the nr_uninterruptible count:
2486 */ 2649 */
2487 if (task_contributes_to_load(p)) { 2650 while (p->on_cpu) {
2488 if (likely(cpu_online(orig_cpu))) 2651#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2489 rq->nr_uninterruptible--; 2652 /*
2490 else 2653 * In case the architecture enables interrupts in
2491 this_rq()->nr_uninterruptible--; 2654 * context_switch(), we cannot busy wait, since that
2492 } 2655 * would lead to deadlocks when an interrupt hits and
2493 p->state = TASK_WAKING; 2656 * tries to wake up @prev. So bail and do a complete
2494 2657 * remote wakeup.
2495 if (p->sched_class->task_waking) { 2658 */
2496 p->sched_class->task_waking(rq, p); 2659 if (ttwu_activate_remote(p, wake_flags))
2497 en_flags |= ENQUEUE_WAKING; 2660 goto stat;
2661#else
2662 cpu_relax();
2663#endif
2498 } 2664 }
2499
2500 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2501 if (cpu != orig_cpu)
2502 set_task_cpu(p, cpu);
2503 __task_rq_unlock(rq);
2504
2505 rq = cpu_rq(cpu);
2506 raw_spin_lock(&rq->lock);
2507
2508 /* 2665 /*
2509 * We migrated the task without holding either rq->lock, however 2666 * Pairs with the smp_wmb() in finish_lock_switch().
2510 * since the task is not on the task list itself, nobody else
2511 * will try and migrate the task, hence the rq should match the
2512 * cpu we just moved it to.
2513 */ 2667 */
2514 WARN_ON(task_cpu(p) != cpu); 2668 smp_rmb();
2515 WARN_ON(p->state != TASK_WAKING);
2516 2669
2517#ifdef CONFIG_SCHEDSTATS 2670 p->sched_contributes_to_load = !!task_contributes_to_load(p);
2518 schedstat_inc(rq, ttwu_count); 2671 p->state = TASK_WAKING;
2519 if (cpu == this_cpu) 2672
2520 schedstat_inc(rq, ttwu_local); 2673 if (p->sched_class->task_waking)
2521 else { 2674 p->sched_class->task_waking(p);
2522 struct sched_domain *sd;
2523 for_each_domain(this_cpu, sd) {
2524 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2525 schedstat_inc(sd, ttwu_wake_remote);
2526 break;
2527 }
2528 }
2529 }
2530#endif /* CONFIG_SCHEDSTATS */
2531 2675
2532out_activate: 2676 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2677 if (task_cpu(p) != cpu)
2678 set_task_cpu(p, cpu);
2533#endif /* CONFIG_SMP */ 2679#endif /* CONFIG_SMP */
2534 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu, 2680
2535 cpu == this_cpu, en_flags); 2681 ttwu_queue(p, cpu);
2536 success = 1; 2682stat:
2537out_running: 2683 ttwu_stat(p, cpu, wake_flags);
2538 ttwu_post_activation(p, rq, wake_flags, success);
2539out: 2684out:
2540 task_rq_unlock(rq, &flags); 2685 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2541 put_cpu();
2542 2686
2543 return success; 2687 return success;
2544} 2688}
@@ -2547,31 +2691,34 @@ out:
2547 * try_to_wake_up_local - try to wake up a local task with rq lock held 2691 * try_to_wake_up_local - try to wake up a local task with rq lock held
2548 * @p: the thread to be awakened 2692 * @p: the thread to be awakened
2549 * 2693 *
2550 * Put @p on the run-queue if it's not already there. The caller must 2694 * Put @p on the run-queue if it's not already there. The caller must
2551 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2695 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2552 * the current task. this_rq() stays locked over invocation. 2696 * the current task.
2553 */ 2697 */
2554static void try_to_wake_up_local(struct task_struct *p) 2698static void try_to_wake_up_local(struct task_struct *p)
2555{ 2699{
2556 struct rq *rq = task_rq(p); 2700 struct rq *rq = task_rq(p);
2557 bool success = false;
2558 2701
2559 BUG_ON(rq != this_rq()); 2702 BUG_ON(rq != this_rq());
2560 BUG_ON(p == current); 2703 BUG_ON(p == current);
2561 lockdep_assert_held(&rq->lock); 2704 lockdep_assert_held(&rq->lock);
2562 2705
2706 if (!raw_spin_trylock(&p->pi_lock)) {
2707 raw_spin_unlock(&rq->lock);
2708 raw_spin_lock(&p->pi_lock);
2709 raw_spin_lock(&rq->lock);
2710 }
2711
2563 if (!(p->state & TASK_NORMAL)) 2712 if (!(p->state & TASK_NORMAL))
2564 return; 2713 goto out;
2565 2714
2566 if (!p->se.on_rq) { 2715 if (!p->on_rq)
2567 if (likely(!task_running(rq, p))) { 2716 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2568 schedstat_inc(rq, ttwu_count); 2717
2569 schedstat_inc(rq, ttwu_local); 2718 ttwu_do_wakeup(rq, p, 0);
2570 } 2719 ttwu_stat(p, smp_processor_id(), 0);
2571 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP); 2720out:
2572 success = true; 2721 raw_spin_unlock(&p->pi_lock);
2573 }
2574 ttwu_post_activation(p, rq, 0, success);
2575} 2722}
2576 2723
2577/** 2724/**
@@ -2604,19 +2751,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
2604 */ 2751 */
2605static void __sched_fork(struct task_struct *p) 2752static void __sched_fork(struct task_struct *p)
2606{ 2753{
2754 p->on_rq = 0;
2755
2756 p->se.on_rq = 0;
2607 p->se.exec_start = 0; 2757 p->se.exec_start = 0;
2608 p->se.sum_exec_runtime = 0; 2758 p->se.sum_exec_runtime = 0;
2609 p->se.prev_sum_exec_runtime = 0; 2759 p->se.prev_sum_exec_runtime = 0;
2610 p->se.nr_migrations = 0; 2760 p->se.nr_migrations = 0;
2611 p->se.vruntime = 0; 2761 p->se.vruntime = 0;
2762 INIT_LIST_HEAD(&p->se.group_node);
2612 2763
2613#ifdef CONFIG_SCHEDSTATS 2764#ifdef CONFIG_SCHEDSTATS
2614 memset(&p->se.statistics, 0, sizeof(p->se.statistics)); 2765 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2615#endif 2766#endif
2616 2767
2617 INIT_LIST_HEAD(&p->rt.run_list); 2768 INIT_LIST_HEAD(&p->rt.run_list);
2618 p->se.on_rq = 0;
2619 INIT_LIST_HEAD(&p->se.group_node);
2620 2769
2621#ifdef CONFIG_PREEMPT_NOTIFIERS 2770#ifdef CONFIG_PREEMPT_NOTIFIERS
2622 INIT_HLIST_HEAD(&p->preempt_notifiers); 2771 INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2626,8 +2775,9 @@ static void __sched_fork(struct task_struct *p)
2626/* 2775/*
2627 * fork()/clone()-time setup: 2776 * fork()/clone()-time setup:
2628 */ 2777 */
2629void sched_fork(struct task_struct *p, int clone_flags) 2778void sched_fork(struct task_struct *p)
2630{ 2779{
2780 unsigned long flags;
2631 int cpu = get_cpu(); 2781 int cpu = get_cpu();
2632 2782
2633 __sched_fork(p); 2783 __sched_fork(p);
@@ -2678,16 +2828,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2678 * 2828 *
2679 * Silence PROVE_RCU. 2829 * Silence PROVE_RCU.
2680 */ 2830 */
2681 rcu_read_lock(); 2831 raw_spin_lock_irqsave(&p->pi_lock, flags);
2682 set_task_cpu(p, cpu); 2832 set_task_cpu(p, cpu);
2683 rcu_read_unlock(); 2833 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2684 2834
2685#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2835#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2686 if (likely(sched_info_on())) 2836 if (likely(sched_info_on()))
2687 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2837 memset(&p->sched_info, 0, sizeof(p->sched_info));
2688#endif 2838#endif
2689#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 2839#if defined(CONFIG_SMP)
2690 p->oncpu = 0; 2840 p->on_cpu = 0;
2691#endif 2841#endif
2692#ifdef CONFIG_PREEMPT 2842#ifdef CONFIG_PREEMPT
2693 /* Want to start with kernel preemption disabled. */ 2843 /* Want to start with kernel preemption disabled. */
@@ -2707,41 +2857,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
2707 * that must be done for every newly created context, then puts the task 2857 * that must be done for every newly created context, then puts the task
2708 * on the runqueue and wakes it. 2858 * on the runqueue and wakes it.
2709 */ 2859 */
2710void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) 2860void wake_up_new_task(struct task_struct *p)
2711{ 2861{
2712 unsigned long flags; 2862 unsigned long flags;
2713 struct rq *rq; 2863 struct rq *rq;
2714 int cpu __maybe_unused = get_cpu();
2715 2864
2865 raw_spin_lock_irqsave(&p->pi_lock, flags);
2716#ifdef CONFIG_SMP 2866#ifdef CONFIG_SMP
2717 rq = task_rq_lock(p, &flags);
2718 p->state = TASK_WAKING;
2719
2720 /* 2867 /*
2721 * Fork balancing, do it here and not earlier because: 2868 * Fork balancing, do it here and not earlier because:
2722 * - cpus_allowed can change in the fork path 2869 * - cpus_allowed can change in the fork path
2723 * - any previously selected cpu might disappear through hotplug 2870 * - any previously selected cpu might disappear through hotplug
2724 *
2725 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2726 * without people poking at ->cpus_allowed.
2727 */ 2871 */
2728 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0); 2872 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
2729 set_task_cpu(p, cpu);
2730
2731 p->state = TASK_RUNNING;
2732 task_rq_unlock(rq, &flags);
2733#endif 2873#endif
2734 2874
2735 rq = task_rq_lock(p, &flags); 2875 rq = __task_rq_lock(p);
2736 activate_task(rq, p, 0); 2876 activate_task(rq, p, 0);
2737 trace_sched_wakeup_new(p, 1); 2877 p->on_rq = 1;
2878 trace_sched_wakeup_new(p, true);
2738 check_preempt_curr(rq, p, WF_FORK); 2879 check_preempt_curr(rq, p, WF_FORK);
2739#ifdef CONFIG_SMP 2880#ifdef CONFIG_SMP
2740 if (p->sched_class->task_woken) 2881 if (p->sched_class->task_woken)
2741 p->sched_class->task_woken(rq, p); 2882 p->sched_class->task_woken(rq, p);
2742#endif 2883#endif
2743 task_rq_unlock(rq, &flags); 2884 task_rq_unlock(rq, p, &flags);
2744 put_cpu();
2745} 2885}
2746 2886
2747#ifdef CONFIG_PREEMPT_NOTIFIERS 2887#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -3450,27 +3590,22 @@ void sched_exec(void)
3450{ 3590{
3451 struct task_struct *p = current; 3591 struct task_struct *p = current;
3452 unsigned long flags; 3592 unsigned long flags;
3453 struct rq *rq;
3454 int dest_cpu; 3593 int dest_cpu;
3455 3594
3456 rq = task_rq_lock(p, &flags); 3595 raw_spin_lock_irqsave(&p->pi_lock, flags);
3457 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0); 3596 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
3458 if (dest_cpu == smp_processor_id()) 3597 if (dest_cpu == smp_processor_id())
3459 goto unlock; 3598 goto unlock;
3460 3599
3461 /* 3600 if (likely(cpu_active(dest_cpu))) {
3462 * select_task_rq() can race against ->cpus_allowed
3463 */
3464 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3465 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3466 struct migration_arg arg = { p, dest_cpu }; 3601 struct migration_arg arg = { p, dest_cpu };
3467 3602
3468 task_rq_unlock(rq, &flags); 3603 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3469 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 3604 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
3470 return; 3605 return;
3471 } 3606 }
3472unlock: 3607unlock:
3473 task_rq_unlock(rq, &flags); 3608 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3474} 3609}
3475 3610
3476#endif 3611#endif
@@ -3507,7 +3642,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
3507 3642
3508 rq = task_rq_lock(p, &flags); 3643 rq = task_rq_lock(p, &flags);
3509 ns = do_task_delta_exec(p, rq); 3644 ns = do_task_delta_exec(p, rq);
3510 task_rq_unlock(rq, &flags); 3645 task_rq_unlock(rq, p, &flags);
3511 3646
3512 return ns; 3647 return ns;
3513} 3648}
@@ -3525,7 +3660,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3525 3660
3526 rq = task_rq_lock(p, &flags); 3661 rq = task_rq_lock(p, &flags);
3527 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); 3662 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
3528 task_rq_unlock(rq, &flags); 3663 task_rq_unlock(rq, p, &flags);
3529 3664
3530 return ns; 3665 return ns;
3531} 3666}
@@ -3549,7 +3684,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
3549 rq = task_rq_lock(p, &flags); 3684 rq = task_rq_lock(p, &flags);
3550 thread_group_cputime(p, &totals); 3685 thread_group_cputime(p, &totals);
3551 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq); 3686 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
3552 task_rq_unlock(rq, &flags); 3687 task_rq_unlock(rq, p, &flags);
3553 3688
3554 return ns; 3689 return ns;
3555} 3690}
@@ -3903,9 +4038,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3903/* 4038/*
3904 * This function gets called by the timer code, with HZ frequency. 4039 * This function gets called by the timer code, with HZ frequency.
3905 * We call it with interrupts disabled. 4040 * We call it with interrupts disabled.
3906 *
3907 * It also gets called by the fork code, when changing the parent's
3908 * timeslices.
3909 */ 4041 */
3910void scheduler_tick(void) 4042void scheduler_tick(void)
3911{ 4043{
@@ -4025,17 +4157,11 @@ static inline void schedule_debug(struct task_struct *prev)
4025 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4157 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4026 4158
4027 schedstat_inc(this_rq(), sched_count); 4159 schedstat_inc(this_rq(), sched_count);
4028#ifdef CONFIG_SCHEDSTATS
4029 if (unlikely(prev->lock_depth >= 0)) {
4030 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
4031 schedstat_inc(prev, sched_info.bkl_count);
4032 }
4033#endif
4034} 4160}
4035 4161
4036static void put_prev_task(struct rq *rq, struct task_struct *prev) 4162static void put_prev_task(struct rq *rq, struct task_struct *prev)
4037{ 4163{
4038 if (prev->se.on_rq) 4164 if (prev->on_rq || rq->skip_clock_update < 0)
4039 update_rq_clock(rq); 4165 update_rq_clock(rq);
4040 prev->sched_class->put_prev_task(rq, prev); 4166 prev->sched_class->put_prev_task(rq, prev);
4041} 4167}
@@ -4097,11 +4223,13 @@ need_resched:
4097 if (unlikely(signal_pending_state(prev->state, prev))) { 4223 if (unlikely(signal_pending_state(prev->state, prev))) {
4098 prev->state = TASK_RUNNING; 4224 prev->state = TASK_RUNNING;
4099 } else { 4225 } else {
4226 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4227 prev->on_rq = 0;
4228
4100 /* 4229 /*
4101 * If a worker is going to sleep, notify and 4230 * If a worker went to sleep, notify and ask workqueue
4102 * ask workqueue whether it wants to wake up a 4231 * whether it wants to wake up a task to maintain
4103 * task to maintain concurrency. If so, wake 4232 * concurrency.
4104 * up the task.
4105 */ 4233 */
4106 if (prev->flags & PF_WQ_WORKER) { 4234 if (prev->flags & PF_WQ_WORKER) {
4107 struct task_struct *to_wakeup; 4235 struct task_struct *to_wakeup;
@@ -4110,21 +4238,20 @@ need_resched:
4110 if (to_wakeup) 4238 if (to_wakeup)
4111 try_to_wake_up_local(to_wakeup); 4239 try_to_wake_up_local(to_wakeup);
4112 } 4240 }
4113 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4241
4242 /*
4243 * If we are going to sleep and we have plugged IO
4244 * queued, make sure to submit it to avoid deadlocks.
4245 */
4246 if (blk_needs_flush_plug(prev)) {
4247 raw_spin_unlock(&rq->lock);
4248 blk_schedule_flush_plug(prev);
4249 raw_spin_lock(&rq->lock);
4250 }
4114 } 4251 }
4115 switch_count = &prev->nvcsw; 4252 switch_count = &prev->nvcsw;
4116 } 4253 }
4117 4254
4118 /*
4119 * If we are going to sleep and we have plugged IO queued, make
4120 * sure to submit it to avoid deadlocks.
4121 */
4122 if (prev->state != TASK_RUNNING && blk_needs_flush_plug(prev)) {
4123 raw_spin_unlock(&rq->lock);
4124 blk_flush_plug(prev);
4125 raw_spin_lock(&rq->lock);
4126 }
4127
4128 pre_schedule(rq, prev); 4255 pre_schedule(rq, prev);
4129 4256
4130 if (unlikely(!rq->nr_running)) 4257 if (unlikely(!rq->nr_running))
@@ -4161,70 +4288,53 @@ need_resched:
4161EXPORT_SYMBOL(schedule); 4288EXPORT_SYMBOL(schedule);
4162 4289
4163#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4290#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4164/*
4165 * Look out! "owner" is an entirely speculative pointer
4166 * access and not reliable.
4167 */
4168int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4169{
4170 unsigned int cpu;
4171 struct rq *rq;
4172 4291
4173 if (!sched_feat(OWNER_SPIN)) 4292static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4174 return 0; 4293{
4294 bool ret = false;
4175 4295
4176#ifdef CONFIG_DEBUG_PAGEALLOC 4296 rcu_read_lock();
4177 /* 4297 if (lock->owner != owner)
4178 * Need to access the cpu field knowing that 4298 goto fail;
4179 * DEBUG_PAGEALLOC could have unmapped it if
4180 * the mutex owner just released it and exited.
4181 */
4182 if (probe_kernel_address(&owner->cpu, cpu))
4183 return 0;
4184#else
4185 cpu = owner->cpu;
4186#endif
4187 4299
4188 /* 4300 /*
4189 * Even if the access succeeded (likely case), 4301 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4190 * the cpu field may no longer be valid. 4302 * lock->owner still matches owner, if that fails, owner might
4303 * point to free()d memory, if it still matches, the rcu_read_lock()
4304 * ensures the memory stays valid.
4191 */ 4305 */
4192 if (cpu >= nr_cpumask_bits) 4306 barrier();
4193 return 0;
4194 4307
4195 /* 4308 ret = owner->on_cpu;
4196 * We need to validate that we can do a 4309fail:
4197 * get_cpu() and that we have the percpu area. 4310 rcu_read_unlock();
4198 */
4199 if (!cpu_online(cpu))
4200 return 0;
4201 4311
4202 rq = cpu_rq(cpu); 4312 return ret;
4313}
4203 4314
4204 for (;;) { 4315/*
4205 /* 4316 * Look out! "owner" is an entirely speculative pointer
4206 * Owner changed, break to re-assess state. 4317 * access and not reliable.
4207 */ 4318 */
4208 if (lock->owner != owner) { 4319int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4209 /* 4320{
4210 * If the lock has switched to a different owner, 4321 if (!sched_feat(OWNER_SPIN))
4211 * we likely have heavy contention. Return 0 to quit 4322 return 0;
4212 * optimistic spinning and not contend further:
4213 */
4214 if (lock->owner)
4215 return 0;
4216 break;
4217 }
4218 4323
4219 /* 4324 while (owner_running(lock, owner)) {
4220 * Is that owner really running on that cpu? 4325 if (need_resched())
4221 */
4222 if (task_thread_info(rq->curr) != owner || need_resched())
4223 return 0; 4326 return 0;
4224 4327
4225 arch_mutex_cpu_relax(); 4328 arch_mutex_cpu_relax();
4226 } 4329 }
4227 4330
4331 /*
4332 * If the owner changed to another task there is likely
4333 * heavy contention, stop spinning.
4334 */
4335 if (lock->owner)
4336 return 0;
4337
4228 return 1; 4338 return 1;
4229} 4339}
4230#endif 4340#endif
@@ -4684,19 +4794,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4684 */ 4794 */
4685void rt_mutex_setprio(struct task_struct *p, int prio) 4795void rt_mutex_setprio(struct task_struct *p, int prio)
4686{ 4796{
4687 unsigned long flags;
4688 int oldprio, on_rq, running; 4797 int oldprio, on_rq, running;
4689 struct rq *rq; 4798 struct rq *rq;
4690 const struct sched_class *prev_class; 4799 const struct sched_class *prev_class;
4691 4800
4692 BUG_ON(prio < 0 || prio > MAX_PRIO); 4801 BUG_ON(prio < 0 || prio > MAX_PRIO);
4693 4802
4694 rq = task_rq_lock(p, &flags); 4803 rq = __task_rq_lock(p);
4695 4804
4696 trace_sched_pi_setprio(p, prio); 4805 trace_sched_pi_setprio(p, prio);
4697 oldprio = p->prio; 4806 oldprio = p->prio;
4698 prev_class = p->sched_class; 4807 prev_class = p->sched_class;
4699 on_rq = p->se.on_rq; 4808 on_rq = p->on_rq;
4700 running = task_current(rq, p); 4809 running = task_current(rq, p);
4701 if (on_rq) 4810 if (on_rq)
4702 dequeue_task(rq, p, 0); 4811 dequeue_task(rq, p, 0);
@@ -4716,7 +4825,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4716 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4825 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4717 4826
4718 check_class_changed(rq, p, prev_class, oldprio); 4827 check_class_changed(rq, p, prev_class, oldprio);
4719 task_rq_unlock(rq, &flags); 4828 __task_rq_unlock(rq);
4720} 4829}
4721 4830
4722#endif 4831#endif
@@ -4744,7 +4853,7 @@ void set_user_nice(struct task_struct *p, long nice)
4744 p->static_prio = NICE_TO_PRIO(nice); 4853 p->static_prio = NICE_TO_PRIO(nice);
4745 goto out_unlock; 4854 goto out_unlock;
4746 } 4855 }
4747 on_rq = p->se.on_rq; 4856 on_rq = p->on_rq;
4748 if (on_rq) 4857 if (on_rq)
4749 dequeue_task(rq, p, 0); 4858 dequeue_task(rq, p, 0);
4750 4859
@@ -4764,7 +4873,7 @@ void set_user_nice(struct task_struct *p, long nice)
4764 resched_task(rq->curr); 4873 resched_task(rq->curr);
4765 } 4874 }
4766out_unlock: 4875out_unlock:
4767 task_rq_unlock(rq, &flags); 4876 task_rq_unlock(rq, p, &flags);
4768} 4877}
4769EXPORT_SYMBOL(set_user_nice); 4878EXPORT_SYMBOL(set_user_nice);
4770 4879
@@ -4878,8 +4987,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4878static void 4987static void
4879__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4988__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4880{ 4989{
4881 BUG_ON(p->se.on_rq);
4882
4883 p->policy = policy; 4990 p->policy = policy;
4884 p->rt_priority = prio; 4991 p->rt_priority = prio;
4885 p->normal_prio = normal_prio(p); 4992 p->normal_prio = normal_prio(p);
@@ -4994,20 +5101,17 @@ recheck:
4994 /* 5101 /*
4995 * make sure no PI-waiters arrive (or leave) while we are 5102 * make sure no PI-waiters arrive (or leave) while we are
4996 * changing the priority of the task: 5103 * changing the priority of the task:
4997 */ 5104 *
4998 raw_spin_lock_irqsave(&p->pi_lock, flags);
4999 /*
5000 * To be able to change p->policy safely, the appropriate 5105 * To be able to change p->policy safely, the appropriate
5001 * runqueue lock must be held. 5106 * runqueue lock must be held.
5002 */ 5107 */
5003 rq = __task_rq_lock(p); 5108 rq = task_rq_lock(p, &flags);
5004 5109
5005 /* 5110 /*
5006 * Changing the policy of the stop threads its a very bad idea 5111 * Changing the policy of the stop threads its a very bad idea
5007 */ 5112 */
5008 if (p == rq->stop) { 5113 if (p == rq->stop) {
5009 __task_rq_unlock(rq); 5114 task_rq_unlock(rq, p, &flags);
5010 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5011 return -EINVAL; 5115 return -EINVAL;
5012 } 5116 }
5013 5117
@@ -5031,8 +5135,7 @@ recheck:
5031 if (rt_bandwidth_enabled() && rt_policy(policy) && 5135 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5032 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5136 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5033 !task_group_is_autogroup(task_group(p))) { 5137 !task_group_is_autogroup(task_group(p))) {
5034 __task_rq_unlock(rq); 5138 task_rq_unlock(rq, p, &flags);
5035 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5036 return -EPERM; 5139 return -EPERM;
5037 } 5140 }
5038 } 5141 }
@@ -5041,11 +5144,10 @@ recheck:
5041 /* recheck policy now with rq lock held */ 5144 /* recheck policy now with rq lock held */
5042 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5145 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5043 policy = oldpolicy = -1; 5146 policy = oldpolicy = -1;
5044 __task_rq_unlock(rq); 5147 task_rq_unlock(rq, p, &flags);
5045 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5046 goto recheck; 5148 goto recheck;
5047 } 5149 }
5048 on_rq = p->se.on_rq; 5150 on_rq = p->on_rq;
5049 running = task_current(rq, p); 5151 running = task_current(rq, p);
5050 if (on_rq) 5152 if (on_rq)
5051 deactivate_task(rq, p, 0); 5153 deactivate_task(rq, p, 0);
@@ -5064,8 +5166,7 @@ recheck:
5064 activate_task(rq, p, 0); 5166 activate_task(rq, p, 0);
5065 5167
5066 check_class_changed(rq, p, prev_class, oldprio); 5168 check_class_changed(rq, p, prev_class, oldprio);
5067 __task_rq_unlock(rq); 5169 task_rq_unlock(rq, p, &flags);
5068 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5069 5170
5070 rt_mutex_adjust_pi(p); 5171 rt_mutex_adjust_pi(p);
5071 5172
@@ -5316,7 +5417,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5316{ 5417{
5317 struct task_struct *p; 5418 struct task_struct *p;
5318 unsigned long flags; 5419 unsigned long flags;
5319 struct rq *rq;
5320 int retval; 5420 int retval;
5321 5421
5322 get_online_cpus(); 5422 get_online_cpus();
@@ -5331,9 +5431,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5331 if (retval) 5431 if (retval)
5332 goto out_unlock; 5432 goto out_unlock;
5333 5433
5334 rq = task_rq_lock(p, &flags); 5434 raw_spin_lock_irqsave(&p->pi_lock, flags);
5335 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5435 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5336 task_rq_unlock(rq, &flags); 5436 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5337 5437
5338out_unlock: 5438out_unlock:
5339 rcu_read_unlock(); 5439 rcu_read_unlock();
@@ -5658,7 +5758,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5658 5758
5659 rq = task_rq_lock(p, &flags); 5759 rq = task_rq_lock(p, &flags);
5660 time_slice = p->sched_class->get_rr_interval(rq, p); 5760 time_slice = p->sched_class->get_rr_interval(rq, p);
5661 task_rq_unlock(rq, &flags); 5761 task_rq_unlock(rq, p, &flags);
5662 5762
5663 rcu_read_unlock(); 5763 rcu_read_unlock();
5664 jiffies_to_timespec(time_slice, &t); 5764 jiffies_to_timespec(time_slice, &t);
@@ -5760,7 +5860,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5760 idle->state = TASK_RUNNING; 5860 idle->state = TASK_RUNNING;
5761 idle->se.exec_start = sched_clock(); 5861 idle->se.exec_start = sched_clock();
5762 5862
5763 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5863 do_set_cpus_allowed(idle, cpumask_of(cpu));
5764 /* 5864 /*
5765 * We're having a chicken and egg problem, even though we are 5865 * We're having a chicken and egg problem, even though we are
5766 * holding rq->lock, the cpu isn't yet set to this cpu so the 5866 * holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5776,17 +5876,14 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5776 rcu_read_unlock(); 5876 rcu_read_unlock();
5777 5877
5778 rq->curr = rq->idle = idle; 5878 rq->curr = rq->idle = idle;
5779#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5879#if defined(CONFIG_SMP)
5780 idle->oncpu = 1; 5880 idle->on_cpu = 1;
5781#endif 5881#endif
5782 raw_spin_unlock_irqrestore(&rq->lock, flags); 5882 raw_spin_unlock_irqrestore(&rq->lock, flags);
5783 5883
5784 /* Set the preempt count _outside_ the spinlocks! */ 5884 /* Set the preempt count _outside_ the spinlocks! */
5785#if defined(CONFIG_PREEMPT)
5786 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5787#else
5788 task_thread_info(idle)->preempt_count = 0; 5885 task_thread_info(idle)->preempt_count = 0;
5789#endif 5886
5790 /* 5887 /*
5791 * The idle tasks have their own, simple scheduling class: 5888 * The idle tasks have their own, simple scheduling class:
5792 */ 5889 */
@@ -5851,6 +5948,16 @@ static inline void sched_init_granularity(void)
5851} 5948}
5852 5949
5853#ifdef CONFIG_SMP 5950#ifdef CONFIG_SMP
5951void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
5952{
5953 if (p->sched_class && p->sched_class->set_cpus_allowed)
5954 p->sched_class->set_cpus_allowed(p, new_mask);
5955 else {
5956 cpumask_copy(&p->cpus_allowed, new_mask);
5957 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5958 }
5959}
5960
5854/* 5961/*
5855 * This is how migration works: 5962 * This is how migration works:
5856 * 5963 *
@@ -5881,52 +5988,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5881 unsigned int dest_cpu; 5988 unsigned int dest_cpu;
5882 int ret = 0; 5989 int ret = 0;
5883 5990
5884 /*
5885 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5886 * drop the rq->lock and still rely on ->cpus_allowed.
5887 */
5888again:
5889 while (task_is_waking(p))
5890 cpu_relax();
5891 rq = task_rq_lock(p, &flags); 5991 rq = task_rq_lock(p, &flags);
5892 if (task_is_waking(p)) { 5992
5893 task_rq_unlock(rq, &flags); 5993 if (cpumask_equal(&p->cpus_allowed, new_mask))
5894 goto again; 5994 goto out;
5895 }
5896 5995
5897 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5996 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5898 ret = -EINVAL; 5997 ret = -EINVAL;
5899 goto out; 5998 goto out;
5900 } 5999 }
5901 6000
5902 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 6001 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5903 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5904 ret = -EINVAL; 6002 ret = -EINVAL;
5905 goto out; 6003 goto out;
5906 } 6004 }
5907 6005
5908 if (p->sched_class->set_cpus_allowed) 6006 do_set_cpus_allowed(p, new_mask);
5909 p->sched_class->set_cpus_allowed(p, new_mask);
5910 else {
5911 cpumask_copy(&p->cpus_allowed, new_mask);
5912 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
5913 }
5914 6007
5915 /* Can the task run on the task's current CPU? If so, we're done */ 6008 /* Can the task run on the task's current CPU? If so, we're done */
5916 if (cpumask_test_cpu(task_cpu(p), new_mask)) 6009 if (cpumask_test_cpu(task_cpu(p), new_mask))
5917 goto out; 6010 goto out;
5918 6011
5919 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 6012 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5920 if (migrate_task(p, rq)) { 6013 if (p->on_rq) {
5921 struct migration_arg arg = { p, dest_cpu }; 6014 struct migration_arg arg = { p, dest_cpu };
5922 /* Need help from migration thread: drop lock and wait. */ 6015 /* Need help from migration thread: drop lock and wait. */
5923 task_rq_unlock(rq, &flags); 6016 task_rq_unlock(rq, p, &flags);
5924 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 6017 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5925 tlb_migrate_finish(p->mm); 6018 tlb_migrate_finish(p->mm);
5926 return 0; 6019 return 0;
5927 } 6020 }
5928out: 6021out:
5929 task_rq_unlock(rq, &flags); 6022 task_rq_unlock(rq, p, &flags);
5930 6023
5931 return ret; 6024 return ret;
5932} 6025}
@@ -5954,6 +6047,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5954 rq_src = cpu_rq(src_cpu); 6047 rq_src = cpu_rq(src_cpu);
5955 rq_dest = cpu_rq(dest_cpu); 6048 rq_dest = cpu_rq(dest_cpu);
5956 6049
6050 raw_spin_lock(&p->pi_lock);
5957 double_rq_lock(rq_src, rq_dest); 6051 double_rq_lock(rq_src, rq_dest);
5958 /* Already moved. */ 6052 /* Already moved. */
5959 if (task_cpu(p) != src_cpu) 6053 if (task_cpu(p) != src_cpu)
@@ -5966,7 +6060,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5966 * If we're not on a rq, the next wake-up will ensure we're 6060 * If we're not on a rq, the next wake-up will ensure we're
5967 * placed properly. 6061 * placed properly.
5968 */ 6062 */
5969 if (p->se.on_rq) { 6063 if (p->on_rq) {
5970 deactivate_task(rq_src, p, 0); 6064 deactivate_task(rq_src, p, 0);
5971 set_task_cpu(p, dest_cpu); 6065 set_task_cpu(p, dest_cpu);
5972 activate_task(rq_dest, p, 0); 6066 activate_task(rq_dest, p, 0);
@@ -5976,6 +6070,7 @@ done:
5976 ret = 1; 6070 ret = 1;
5977fail: 6071fail:
5978 double_rq_unlock(rq_src, rq_dest); 6072 double_rq_unlock(rq_src, rq_dest);
6073 raw_spin_unlock(&p->pi_lock);
5979 return ret; 6074 return ret;
5980} 6075}
5981 6076
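Taking p->pi_lock around double_rq_lock() in __migrate_task(), as the hunk above adds, keeps the task's wakeup path out of the way while both runqueues are held. The pthread sketch below models only the lock-ordering part (all names invented): the two queue locks are always taken in a fixed order, nested inside an outer per-task lock, which is what keeps two concurrent migrations from deadlocking.

#include <pthread.h>

/* Simplified model of pi_lock + double_rq_lock() ordering; illustrative only. */
struct toy_rq {
        pthread_mutex_t lock;
};

static pthread_mutex_t toy_pi_lock = PTHREAD_MUTEX_INITIALIZER;

/* Acquire both queue locks in a fixed (address) order, the way
 * double_rq_lock() orders by runqueue, so A->B and B->A migrations
 * cannot deadlock on each other. */
static void toy_double_rq_lock(struct toy_rq *a, struct toy_rq *b)
{
        if (a == b) {
                pthread_mutex_lock(&a->lock);
                return;
        }
        if (a < b) {
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        } else {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        }
}

static void toy_double_rq_unlock(struct toy_rq *a, struct toy_rq *b)
{
        pthread_mutex_unlock(&a->lock);
        if (a != b)
                pthread_mutex_unlock(&b->lock);
}

static void toy_migrate(struct toy_rq *src, struct toy_rq *dst)
{
        pthread_mutex_lock(&toy_pi_lock);       /* models raw_spin_lock(&p->pi_lock) */
        toy_double_rq_lock(src, dst);
        /* ... deactivate on src, set_task_cpu(), activate on dst ... */
        toy_double_rq_unlock(src, dst);
        pthread_mutex_unlock(&toy_pi_lock);
}

int main(void)
{
        struct toy_rq rq0, rq1;

        pthread_mutex_init(&rq0.lock, NULL);
        pthread_mutex_init(&rq1.lock, NULL);
        toy_migrate(&rq0, &rq1);
        return 0;
}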
@@ -6316,6 +6411,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6316 6411
6317#ifdef CONFIG_HOTPLUG_CPU 6412#ifdef CONFIG_HOTPLUG_CPU
6318 case CPU_DYING: 6413 case CPU_DYING:
6414 sched_ttwu_pending();
6319 /* Update our root-domain */ 6415 /* Update our root-domain */
6320 raw_spin_lock_irqsave(&rq->lock, flags); 6416 raw_spin_lock_irqsave(&rq->lock, flags);
6321 if (rq->rd) { 6417 if (rq->rd) {
@@ -6394,6 +6490,8 @@ early_initcall(migration_init);
6394 6490
6395#ifdef CONFIG_SMP 6491#ifdef CONFIG_SMP
6396 6492
6493static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6494
6397#ifdef CONFIG_SCHED_DEBUG 6495#ifdef CONFIG_SCHED_DEBUG
6398 6496
6399static __read_mostly int sched_domain_debug_enabled; 6497static __read_mostly int sched_domain_debug_enabled;
@@ -6468,7 +6566,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6468 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6566 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6469 6567
6470 printk(KERN_CONT " %s", str); 6568 printk(KERN_CONT " %s", str);
6471 if (group->cpu_power != SCHED_LOAD_SCALE) { 6569 if (group->cpu_power != SCHED_POWER_SCALE) {
6472 printk(KERN_CONT " (cpu_power = %d)", 6570 printk(KERN_CONT " (cpu_power = %d)",
6473 group->cpu_power); 6571 group->cpu_power);
6474 } 6572 }
@@ -6489,7 +6587,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6489 6587
6490static void sched_domain_debug(struct sched_domain *sd, int cpu) 6588static void sched_domain_debug(struct sched_domain *sd, int cpu)
6491{ 6589{
6492 cpumask_var_t groupmask;
6493 int level = 0; 6590 int level = 0;
6494 6591
6495 if (!sched_domain_debug_enabled) 6592 if (!sched_domain_debug_enabled)
@@ -6502,20 +6599,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6502 6599
6503 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6600 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6504 6601
6505 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6506 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6507 return;
6508 }
6509
6510 for (;;) { 6602 for (;;) {
6511 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6603 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6512 break; 6604 break;
6513 level++; 6605 level++;
6514 sd = sd->parent; 6606 sd = sd->parent;
6515 if (!sd) 6607 if (!sd)
6516 break; 6608 break;
6517 } 6609 }
6518 free_cpumask_var(groupmask);
6519} 6610}
6520#else /* !CONFIG_SCHED_DEBUG */ 6611#else /* !CONFIG_SCHED_DEBUG */
6521# define sched_domain_debug(sd, cpu) do { } while (0) 6612# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6572,12 +6663,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6572 return 1; 6663 return 1;
6573} 6664}
6574 6665
6575static void free_rootdomain(struct root_domain *rd) 6666static void free_rootdomain(struct rcu_head *rcu)
6576{ 6667{
6577 synchronize_sched(); 6668 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6578 6669
6579 cpupri_cleanup(&rd->cpupri); 6670 cpupri_cleanup(&rd->cpupri);
6580
6581 free_cpumask_var(rd->rto_mask); 6671 free_cpumask_var(rd->rto_mask);
6582 free_cpumask_var(rd->online); 6672 free_cpumask_var(rd->online);
6583 free_cpumask_var(rd->span); 6673 free_cpumask_var(rd->span);
@@ -6618,7 +6708,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6618 raw_spin_unlock_irqrestore(&rq->lock, flags); 6708 raw_spin_unlock_irqrestore(&rq->lock, flags);
6619 6709
6620 if (old_rd) 6710 if (old_rd)
6621 free_rootdomain(old_rd); 6711 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6622} 6712}
6623 6713
6624static int init_rootdomain(struct root_domain *rd) 6714static int init_rootdomain(struct root_domain *rd)
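free_rootdomain() is now an RCU callback: the rcu_head embedded in struct root_domain is passed to call_rcu_sched(), and container_of() recovers the enclosing object once the grace period has elapsed. The userspace model below shows only the shape of that pattern; the grace period is collapsed into an immediate callback, so it is not an RCU implementation.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Deferred free via an embedded callback head; all names are illustrative. */
struct toy_rcu_head {
        void (*func)(struct toy_rcu_head *);
};

struct toy_root_domain {
        int refcount;
        struct toy_rcu_head rcu;        /* embedded, like root_domain::rcu */
};

#define toy_container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

static void toy_free_rootdomain(struct toy_rcu_head *rcu)
{
        struct toy_root_domain *rd =
                toy_container_of(rcu, struct toy_root_domain, rcu);

        printf("freeing root_domain %p\n", (void *)rd);
        free(rd);
}

/* Stand-in for call_rcu_sched(): real code defers the callback until all
 * pre-existing readers are done; here it simply runs at once. */
static void toy_call_rcu(struct toy_rcu_head *head,
                         void (*func)(struct toy_rcu_head *))
{
        head->func = func;
        head->func(head);
}

int main(void)
{
        struct toy_root_domain *old_rd = calloc(1, sizeof(*old_rd));

        toy_call_rcu(&old_rd->rcu, toy_free_rootdomain);
        return 0;
}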
@@ -6669,6 +6759,25 @@ static struct root_domain *alloc_rootdomain(void)
6669 return rd; 6759 return rd;
6670} 6760}
6671 6761
6762static void free_sched_domain(struct rcu_head *rcu)
6763{
6764 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6765 if (atomic_dec_and_test(&sd->groups->ref))
6766 kfree(sd->groups);
6767 kfree(sd);
6768}
6769
6770static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6771{
6772 call_rcu(&sd->rcu, free_sched_domain);
6773}
6774
6775static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6776{
6777 for (; sd; sd = sd->parent)
6778 destroy_sched_domain(sd, cpu);
6779}
6780
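free_sched_domain() above only frees the shared sched_group when the last domain referencing it drops its count, and destroy_sched_domains() walks the whole parent chain. A small sketch of that refcounted teardown, with a plain int standing in for the atomic_t and the RCU deferral left out:

#include <stdio.h>
#include <stdlib.h>

/* Refcounted teardown of a parent-linked chain; illustrative names only. */
struct toy_group {
        int ref;
};

struct toy_domain {
        struct toy_domain *parent;
        struct toy_group *groups;       /* possibly shared between levels */
};

static void toy_destroy_domain(struct toy_domain *sd)
{
        if (--sd->groups->ref == 0)     /* models atomic_dec_and_test() */
                free(sd->groups);
        free(sd);
}

static void toy_destroy_domains(struct toy_domain *sd)
{
        while (sd) {
                struct toy_domain *parent = sd->parent;

                toy_destroy_domain(sd);
                sd = parent;
        }
}

int main(void)
{
        struct toy_group *g = calloc(1, sizeof(*g));
        struct toy_domain *child = calloc(1, sizeof(*child));
        struct toy_domain *top = calloc(1, sizeof(*top));

        g->ref = 2;                     /* both levels point at the same group */
        child->groups = top->groups = g;
        child->parent = top;

        toy_destroy_domains(child);     /* frees child, top, and g exactly once */
        printf("chain torn down\n");
        return 0;
}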
6672/* 6781/*
6673 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6782 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6674 * hold the hotplug lock. 6783 * hold the hotplug lock.
@@ -6679,9 +6788,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6679 struct rq *rq = cpu_rq(cpu); 6788 struct rq *rq = cpu_rq(cpu);
6680 struct sched_domain *tmp; 6789 struct sched_domain *tmp;
6681 6790
6682 for (tmp = sd; tmp; tmp = tmp->parent)
6683 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6684
6685 /* Remove the sched domains which do not contribute to scheduling. */ 6791 /* Remove the sched domains which do not contribute to scheduling. */
6686 for (tmp = sd; tmp; ) { 6792 for (tmp = sd; tmp; ) {
6687 struct sched_domain *parent = tmp->parent; 6793 struct sched_domain *parent = tmp->parent;
@@ -6692,12 +6798,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6692 tmp->parent = parent->parent; 6798 tmp->parent = parent->parent;
6693 if (parent->parent) 6799 if (parent->parent)
6694 parent->parent->child = tmp; 6800 parent->parent->child = tmp;
6801 destroy_sched_domain(parent, cpu);
6695 } else 6802 } else
6696 tmp = tmp->parent; 6803 tmp = tmp->parent;
6697 } 6804 }
6698 6805
6699 if (sd && sd_degenerate(sd)) { 6806 if (sd && sd_degenerate(sd)) {
6807 tmp = sd;
6700 sd = sd->parent; 6808 sd = sd->parent;
6809 destroy_sched_domain(tmp, cpu);
6701 if (sd) 6810 if (sd)
6702 sd->child = NULL; 6811 sd->child = NULL;
6703 } 6812 }
@@ -6705,7 +6814,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6705 sched_domain_debug(sd, cpu); 6814 sched_domain_debug(sd, cpu);
6706 6815
6707 rq_attach_root(rq, rd); 6816 rq_attach_root(rq, rd);
6817 tmp = rq->sd;
6708 rcu_assign_pointer(rq->sd, sd); 6818 rcu_assign_pointer(rq->sd, sd);
6819 destroy_sched_domains(tmp, cpu);
6709} 6820}
6710 6821
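cpu_attach_domain() now publishes the new tree with rcu_assign_pointer() and only afterwards hands the old chain to destroy_sched_domains(), so a reader under rcu_read_lock() sees either the old or the new tree, never a freed one. The sketch below models only the publish step with a C11 release store; it frees the old pointer immediately because the example has no concurrent readers, which real code must not do.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Publish-then-retire pattern for a shared pointer; illustrative only. */
struct toy_domain {
        int level;
};

static _Atomic(struct toy_domain *) rq_sd;

static void toy_attach_domain(struct toy_domain *new_sd)
{
        struct toy_domain *old = atomic_load_explicit(&rq_sd, memory_order_relaxed);

        /* the release store plays the role of rcu_assign_pointer(rq->sd, sd) */
        atomic_store_explicit(&rq_sd, new_sd, memory_order_release);

        /* real code: destroy_sched_domains(old, cpu), deferred by RCU */
        free(old);
}

int main(void)
{
        struct toy_domain *a = calloc(1, sizeof(*a));
        struct toy_domain *b = calloc(1, sizeof(*b));

        b->level = 1;
        atomic_store(&rq_sd, a);
        toy_attach_domain(b);

        /* the acquire load stands in for rcu_dereference() on the reader side */
        printf("attached level %d\n",
               atomic_load_explicit(&rq_sd, memory_order_acquire)->level);
        free(b);
        return 0;
}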
6711/* cpus with isolated domains */ 6822/* cpus with isolated domains */
@@ -6721,56 +6832,6 @@ static int __init isolated_cpu_setup(char *str)
6721 6832
6722__setup("isolcpus=", isolated_cpu_setup); 6833__setup("isolcpus=", isolated_cpu_setup);
6723 6834
6724/*
6725 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6724 6726 * to a function which identifies what group (along with sched group) a CPU
6724 6727 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6728 * (due to the fact that we keep track of groups covered with a struct cpumask).
6729 *
6730 * init_sched_build_groups will build a circular linked list of the groups
6731 * covered by the given span, and will set each group's ->cpumask correctly,
6732 * and ->cpu_power to 0.
6733 */
6734static void
6735init_sched_build_groups(const struct cpumask *span,
6736 const struct cpumask *cpu_map,
6737 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6738 struct sched_group **sg,
6739 struct cpumask *tmpmask),
6740 struct cpumask *covered, struct cpumask *tmpmask)
6741{
6742 struct sched_group *first = NULL, *last = NULL;
6743 int i;
6744
6745 cpumask_clear(covered);
6746
6747 for_each_cpu(i, span) {
6748 struct sched_group *sg;
6749 int group = group_fn(i, cpu_map, &sg, tmpmask);
6750 int j;
6751
6752 if (cpumask_test_cpu(i, covered))
6753 continue;
6754
6755 cpumask_clear(sched_group_cpus(sg));
6756 sg->cpu_power = 0;
6757
6758 for_each_cpu(j, span) {
6759 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6760 continue;
6761
6762 cpumask_set_cpu(j, covered);
6763 cpumask_set_cpu(j, sched_group_cpus(sg));
6764 }
6765 if (!first)
6766 first = sg;
6767 if (last)
6768 last->next = sg;
6769 last = sg;
6770 }
6771 last->next = first;
6772}
6773
6774#define SD_NODES_PER_DOMAIN 16 6835#define SD_NODES_PER_DOMAIN 16
6775 6836
6776#ifdef CONFIG_NUMA 6837#ifdef CONFIG_NUMA
@@ -6787,7 +6848,7 @@ init_sched_build_groups(const struct cpumask *span,
6787 */ 6848 */
6788static int find_next_best_node(int node, nodemask_t *used_nodes) 6849static int find_next_best_node(int node, nodemask_t *used_nodes)
6789{ 6850{
6790 int i, n, val, min_val, best_node = 0; 6851 int i, n, val, min_val, best_node = -1;
6791 6852
6792 min_val = INT_MAX; 6853 min_val = INT_MAX;
6793 6854
@@ -6811,7 +6872,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6811 } 6872 }
6812 } 6873 }
6813 6874
6814 node_set(best_node, *used_nodes); 6875 if (best_node != -1)
6876 node_set(best_node, *used_nodes);
6815 return best_node; 6877 return best_node;
6816} 6878}
6817 6879
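find_next_best_node() can now signal exhaustion with -1, and sched_domain_node_span() stops growing the span at that point instead of falling back to node 0 again and again. A toy version of the greedy selection, over an invented 4-node distance table:

#include <limits.h>
#include <stdio.h>

#define TOY_NODES 4

/* Invented SLIT-style distances: lower means closer. */
static const int toy_dist[TOY_NODES][TOY_NODES] = {
        { 10, 20, 30, 30 },
        { 20, 10, 30, 30 },
        { 30, 30, 10, 20 },
        { 30, 30, 20, 10 },
};

/* Return the closest node not yet used, or -1 once every node is consumed --
 * the -1 case is exactly what the hunk above starts checking for. */
static int toy_find_next_best_node(int node, unsigned int *used)
{
        int i, best = -1, min = INT_MAX;

        for (i = 0; i < TOY_NODES; i++) {
                if (*used & (1u << i))
                        continue;
                if (toy_dist[node][i] < min) {
                        min = toy_dist[node][i];
                        best = i;
                }
        }
        if (best != -1)
                *used |= 1u << best;
        return best;
}

int main(void)
{
        unsigned int used = 1u << 0;    /* the span starts with node 0 itself */
        int n;

        while ((n = toy_find_next_best_node(0, &used)) >= 0)
                printf("span grows to include node %d\n", n);
        return 0;
}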
@@ -6837,315 +6899,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6837 6899
6838 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6900 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6839 int next_node = find_next_best_node(node, &used_nodes); 6901 int next_node = find_next_best_node(node, &used_nodes);
6840 6902 if (next_node < 0)
6903 break;
6841 cpumask_or(span, span, cpumask_of_node(next_node)); 6904 cpumask_or(span, span, cpumask_of_node(next_node));
6842 } 6905 }
6843} 6906}
6907
6908static const struct cpumask *cpu_node_mask(int cpu)
6909{
6910 lockdep_assert_held(&sched_domains_mutex);
6911
6912 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6913
6914 return sched_domains_tmpmask;
6915}
6916
6917static const struct cpumask *cpu_allnodes_mask(int cpu)
6918{
6919 return cpu_possible_mask;
6920}
6844#endif /* CONFIG_NUMA */ 6921#endif /* CONFIG_NUMA */
6845 6922
6846int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6923static const struct cpumask *cpu_cpu_mask(int cpu)
6924{
6925 return cpumask_of_node(cpu_to_node(cpu));
6926}
6847 6927
6848/* 6928int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6849 * The cpus mask in sched_group and sched_domain hangs off the end.
6850 *
6851 * ( See the comments in include/linux/sched.h:struct sched_group
6852 * and struct sched_domain. )
6853 */
6854struct static_sched_group {
6855 struct sched_group sg;
6856 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6857};
6858 6929
6859struct static_sched_domain { 6930struct sd_data {
6860 struct sched_domain sd; 6931 struct sched_domain **__percpu sd;
6861 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6932 struct sched_group **__percpu sg;
6862}; 6933};
6863 6934
6864struct s_data { 6935struct s_data {
6865#ifdef CONFIG_NUMA 6936 struct sched_domain ** __percpu sd;
6866 int sd_allnodes;
6867 cpumask_var_t domainspan;
6868 cpumask_var_t covered;
6869 cpumask_var_t notcovered;
6870#endif
6871 cpumask_var_t nodemask;
6872 cpumask_var_t this_sibling_map;
6873 cpumask_var_t this_core_map;
6874 cpumask_var_t this_book_map;
6875 cpumask_var_t send_covered;
6876 cpumask_var_t tmpmask;
6877 struct sched_group **sched_group_nodes;
6878 struct root_domain *rd; 6937 struct root_domain *rd;
6879}; 6938};
6880 6939
6881enum s_alloc { 6940enum s_alloc {
6882 sa_sched_groups = 0,
6883 sa_rootdomain, 6941 sa_rootdomain,
6884 sa_tmpmask, 6942 sa_sd,
6885 sa_send_covered, 6943 sa_sd_storage,
6886 sa_this_book_map,
6887 sa_this_core_map,
6888 sa_this_sibling_map,
6889 sa_nodemask,
6890 sa_sched_group_nodes,
6891#ifdef CONFIG_NUMA
6892 sa_notcovered,
6893 sa_covered,
6894 sa_domainspan,
6895#endif
6896 sa_none, 6944 sa_none,
6897}; 6945};
6898 6946
6899/* 6947struct sched_domain_topology_level;
6900 * SMT sched-domains:
6901 */
6902#ifdef CONFIG_SCHED_SMT
6903static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6904static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6905
6906static int
6907cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6908 struct sched_group **sg, struct cpumask *unused)
6909{
6910 if (sg)
6911 *sg = &per_cpu(sched_groups, cpu).sg;
6912 return cpu;
6913}
6914#endif /* CONFIG_SCHED_SMT */
6915 6948
6916/* 6949typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6917 * multi-core sched-domains: 6950typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6918 */
6919#ifdef CONFIG_SCHED_MC
6920static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6921static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6922 6951
6923static int 6952struct sched_domain_topology_level {
6924cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6953 sched_domain_init_f init;
6925 struct sched_group **sg, struct cpumask *mask) 6954 sched_domain_mask_f mask;
6926{ 6955 struct sd_data data;
6927 int group; 6956};
6928#ifdef CONFIG_SCHED_SMT
6929 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6930 group = cpumask_first(mask);
6931#else
6932 group = cpu;
6933#endif
6934 if (sg)
6935 *sg = &per_cpu(sched_group_core, group).sg;
6936 return group;
6937}
6938#endif /* CONFIG_SCHED_MC */
6939 6957
6940/* 6958/*
6941 * book sched-domains: 6959 * Assumes the sched_domain tree is fully constructed
6942 */ 6960 */
6943#ifdef CONFIG_SCHED_BOOK 6961static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6944static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6945static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6946
6947static int
6948cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6949 struct sched_group **sg, struct cpumask *mask)
6950{ 6962{
6951 int group = cpu; 6963 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6952#ifdef CONFIG_SCHED_MC 6964 struct sched_domain *child = sd->child;
6953 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6954 group = cpumask_first(mask);
6955#elif defined(CONFIG_SCHED_SMT)
6956 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6957 group = cpumask_first(mask);
6958#endif
6959 if (sg)
6960 *sg = &per_cpu(sched_group_book, group).sg;
6961 return group;
6962}
6963#endif /* CONFIG_SCHED_BOOK */
6964 6965
6965static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6966 if (child)
6966static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6967 cpu = cpumask_first(sched_domain_span(child));
6967 6968
6968static int
6969cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6970 struct sched_group **sg, struct cpumask *mask)
6971{
6972 int group;
6973#ifdef CONFIG_SCHED_BOOK
6974 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6975 group = cpumask_first(mask);
6976#elif defined(CONFIG_SCHED_MC)
6977 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6978 group = cpumask_first(mask);
6979#elif defined(CONFIG_SCHED_SMT)
6980 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6981 group = cpumask_first(mask);
6982#else
6983 group = cpu;
6984#endif
6985 if (sg) 6969 if (sg)
6986 *sg = &per_cpu(sched_group_phys, group).sg; 6970 *sg = *per_cpu_ptr(sdd->sg, cpu);
6987 return group; 6971
6972 return cpu;
6988} 6973}
6989 6974
6990#ifdef CONFIG_NUMA
6991/* 6975/*
6992 * The init_sched_build_groups can't handle what we want to do with node 6976 * build_sched_groups takes the cpumask we wish to span, and a pointer
6993 * groups, so roll our own. Now each node has its own list of groups which 6977 * to a function which identifies what group (along with sched group) a CPU
6994 * gets dynamically allocated. 6978 * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
6979 * (due to the fact that we keep track of groups covered with a struct cpumask).
6980 *
6981 * build_sched_groups will build a circular linked list of the groups
6982 * covered by the given span, and will set each group's ->cpumask correctly,
6983 * and ->cpu_power to 0.
6995 */ 6984 */
6996static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6985static void
6997static struct sched_group ***sched_group_nodes_bycpu; 6986build_sched_groups(struct sched_domain *sd)
6998
6999static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
7000static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
7001
7002static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
7003 struct sched_group **sg,
7004 struct cpumask *nodemask)
7005{
7006 int group;
7007
7008 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
7009 group = cpumask_first(nodemask);
7010
7011 if (sg)
7012 *sg = &per_cpu(sched_group_allnodes, group).sg;
7013 return group;
7014}
7015
7016static void init_numa_sched_groups_power(struct sched_group *group_head)
7017{
7018 struct sched_group *sg = group_head;
7019 int j;
7020
7021 if (!sg)
7022 return;
7023 do {
7024 for_each_cpu(j, sched_group_cpus(sg)) {
7025 struct sched_domain *sd;
7026
7027 sd = &per_cpu(phys_domains, j).sd;
7028 if (j != group_first_cpu(sd->groups)) {
7029 /*
7030 * Only add "power" once for each
7031 * physical package.
7032 */
7033 continue;
7034 }
7035
7036 sg->cpu_power += sd->groups->cpu_power;
7037 }
7038 sg = sg->next;
7039 } while (sg != group_head);
7040}
7041
7042static int build_numa_sched_groups(struct s_data *d,
7043 const struct cpumask *cpu_map, int num)
7044{ 6987{
7045 struct sched_domain *sd; 6988 struct sched_group *first = NULL, *last = NULL;
7046 struct sched_group *sg, *prev; 6989 struct sd_data *sdd = sd->private;
7047 int n, j; 6990 const struct cpumask *span = sched_domain_span(sd);
7048 6991 struct cpumask *covered;
7049 cpumask_clear(d->covered); 6992 int i;
7050 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
7051 if (cpumask_empty(d->nodemask)) {
7052 d->sched_group_nodes[num] = NULL;
7053 goto out;
7054 }
7055
7056 sched_domain_node_span(num, d->domainspan);
7057 cpumask_and(d->domainspan, d->domainspan, cpu_map);
7058
7059 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7060 GFP_KERNEL, num);
7061 if (!sg) {
7062 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
7063 num);
7064 return -ENOMEM;
7065 }
7066 d->sched_group_nodes[num] = sg;
7067
7068 for_each_cpu(j, d->nodemask) {
7069 sd = &per_cpu(node_domains, j).sd;
7070 sd->groups = sg;
7071 }
7072 6993
7073 sg->cpu_power = 0; 6994 lockdep_assert_held(&sched_domains_mutex);
7074 cpumask_copy(sched_group_cpus(sg), d->nodemask); 6995 covered = sched_domains_tmpmask;
7075 sg->next = sg;
7076 cpumask_or(d->covered, d->covered, d->nodemask);
7077 6996
7078 prev = sg; 6997 cpumask_clear(covered);
7079 for (j = 0; j < nr_node_ids; j++) {
7080 n = (num + j) % nr_node_ids;
7081 cpumask_complement(d->notcovered, d->covered);
7082 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
7083 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
7084 if (cpumask_empty(d->tmpmask))
7085 break;
7086 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
7087 if (cpumask_empty(d->tmpmask))
7088 continue;
7089 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
7090 GFP_KERNEL, num);
7091 if (!sg) {
7092 printk(KERN_WARNING
7093 "Can not alloc domain group for node %d\n", j);
7094 return -ENOMEM;
7095 }
7096 sg->cpu_power = 0;
7097 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
7098 sg->next = prev->next;
7099 cpumask_or(d->covered, d->covered, d->tmpmask);
7100 prev->next = sg;
7101 prev = sg;
7102 }
7103out:
7104 return 0;
7105}
7106#endif /* CONFIG_NUMA */
7107
7108#ifdef CONFIG_NUMA
7109/* Free memory allocated for various sched_group structures */
7110static void free_sched_groups(const struct cpumask *cpu_map,
7111 struct cpumask *nodemask)
7112{
7113 int cpu, i;
7114 6998
7115 for_each_cpu(cpu, cpu_map) { 6999 for_each_cpu(i, span) {
7116 struct sched_group **sched_group_nodes 7000 struct sched_group *sg;
7117 = sched_group_nodes_bycpu[cpu]; 7001 int group = get_group(i, sdd, &sg);
7002 int j;
7118 7003
7119 if (!sched_group_nodes) 7004 if (cpumask_test_cpu(i, covered))
7120 continue; 7005 continue;
7121 7006
7122 for (i = 0; i < nr_node_ids; i++) { 7007 cpumask_clear(sched_group_cpus(sg));
7123 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7008 sg->cpu_power = 0;
7124 7009
7125 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 7010 for_each_cpu(j, span) {
7126 if (cpumask_empty(nodemask)) 7011 if (get_group(j, sdd, NULL) != group)
7127 continue; 7012 continue;
7128 7013
7129 if (sg == NULL) 7014 cpumask_set_cpu(j, covered);
7130 continue; 7015 cpumask_set_cpu(j, sched_group_cpus(sg));
7131 sg = sg->next;
7132next_sg:
7133 oldsg = sg;
7134 sg = sg->next;
7135 kfree(oldsg);
7136 if (oldsg != sched_group_nodes[i])
7137 goto next_sg;
7138 } 7016 }
7139 kfree(sched_group_nodes); 7017
7140 sched_group_nodes_bycpu[cpu] = NULL; 7018 if (!first)
7019 first = sg;
7020 if (last)
7021 last->next = sg;
7022 last = sg;
7141 } 7023 }
7024 last->next = first;
7142} 7025}
7143#else /* !CONFIG_NUMA */
7144static void free_sched_groups(const struct cpumask *cpu_map,
7145 struct cpumask *nodemask)
7146{
7147}
7148#endif /* CONFIG_NUMA */
7149 7026
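build_sched_groups() keeps the old ring-building algorithm — walk the span, skip CPUs already covered, collect each group's members, link the groups into a circular list — but now resolves groups through get_group() and the per-cpu sd_data storage instead of static arrays. The userspace sketch below reproduces just the ring construction; toy_get_group(), which pairs SMT-style siblings, is an invented stand-in for the real mapping, and the span is assumed non-empty.

#include <stdio.h>
#include <stdlib.h>

#define TOY_CPUS 4

struct toy_group {
        unsigned int cpus;              /* member bitmask */
        struct toy_group *next;         /* circular list */
};

/* Invented mapping: even/odd siblings share one group leader. */
static int toy_get_group(int cpu)
{
        return cpu & ~1;
}

static struct toy_group *toy_build_groups(unsigned int span)
{
        struct toy_group *first = NULL, *last = NULL;
        unsigned int covered = 0;
        int i, j;

        for (i = 0; i < TOY_CPUS; i++) {
                struct toy_group *sg;

                if (!(span & (1u << i)) || (covered & (1u << i)))
                        continue;

                sg = calloc(1, sizeof(*sg));
                for (j = 0; j < TOY_CPUS; j++) {
                        if ((span & (1u << j)) &&
                            toy_get_group(j) == toy_get_group(i)) {
                                sg->cpus |= 1u << j;
                                covered |= 1u << j;
                        }
                }
                if (!first)
                        first = sg;
                if (last)
                        last->next = sg;
                last = sg;
        }
        last->next = first;             /* close the ring, as the kernel code does */
        return first;
}

int main(void)
{
        struct toy_group *g = toy_build_groups(0xf), *p = g;

        do {
                printf("group covers cpus 0x%x\n", p->cpus);
                p = p->next;
        } while (p != g);
        return 0;
}

With the 0xf span and paired siblings this prints two groups, 0x3 and 0xc, chained into a ring.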
7150/* 7027/*
7151 * Initialize sched groups cpu_power. 7028 * Initialize sched groups cpu_power.
@@ -7159,11 +7036,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
7159 */ 7036 */
7160static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7037static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7161{ 7038{
7162 struct sched_domain *child;
7163 struct sched_group *group;
7164 long power;
7165 int weight;
7166
7167 WARN_ON(!sd || !sd->groups); 7039 WARN_ON(!sd || !sd->groups);
7168 7040
7169 if (cpu != group_first_cpu(sd->groups)) 7041 if (cpu != group_first_cpu(sd->groups))
@@ -7171,36 +7043,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7171 7043
7172 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7044 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
7173 7045
7174 child = sd->child; 7046 update_group_power(sd, cpu);
7175
7176 sd->groups->cpu_power = 0;
7177
7178 if (!child) {
7179 power = SCHED_LOAD_SCALE;
7180 weight = cpumask_weight(sched_domain_span(sd));
7181 /*
7182 * SMT siblings share the power of a single core.
7183 * Usually multiple threads get a better yield out of
7184 * that one core than a single thread would have,
7185 * reflect that in sd->smt_gain.
7186 */
7187 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
7188 power *= sd->smt_gain;
7189 power /= weight;
7190 power >>= SCHED_LOAD_SHIFT;
7191 }
7192 sd->groups->cpu_power += power;
7193 return;
7194 }
7195
7196 /*
7197 * Add cpu_power of each child group to this groups cpu_power.
7198 */
7199 group = child->groups;
7200 do {
7201 sd->groups->cpu_power += group->cpu_power;
7202 group = group->next;
7203 } while (group != child->groups);
7204} 7047}
7205 7048
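The open-coded power setup removed above is now done by update_group_power(), but the arithmetic it used to perform is worth a worked example: SMT siblings split one core's capacity, slightly inflated by smt_gain. The constants below are assumptions for illustration (a 1024 power scale, a 10-bit shift and an smt_gain of 1178), not values taken from this diff.

#include <stdio.h>

#define TOY_POWER_SCALE 1024UL          /* nominal capacity of one CPU */
#define TOY_POWER_SHIFT 10

/* Capacity credited to each SMT sibling of a core with 'weight' threads. */
static unsigned long toy_smt_sibling_power(unsigned long smt_gain,
                                           unsigned int weight)
{
        unsigned long power = TOY_POWER_SCALE;

        if (weight > 1) {
                power *= smt_gain;              /* the whole core is worth ~smt_gain */
                power /= weight;                /* split across the siblings */
                power >>= TOY_POWER_SHIFT;      /* back to the 1024 scale */
        }
        return power;
}

int main(void)
{
        /* Two siblings: 1024 * 1178 / 2 >> 10 = 589 each, so the pair sums to
         * 1178 -- a bit more than one nominal CPU, far less than two. */
        printf("per-sibling power: %lu\n", toy_smt_sibling_power(1178, 2));
        return 0;
}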
7206/* 7049/*
@@ -7214,15 +7057,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7214# define SD_INIT_NAME(sd, type) do { } while (0) 7057# define SD_INIT_NAME(sd, type) do { } while (0)
7215#endif 7058#endif
7216 7059
7217#define SD_INIT(sd, type) sd_init_##type(sd) 7060#define SD_INIT_FUNC(type) \
7218 7061static noinline struct sched_domain * \
7219#define SD_INIT_FUNC(type) \ 7062sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
7220static noinline void sd_init_##type(struct sched_domain *sd) \ 7063{ \
7221{ \ 7064 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
7222 memset(sd, 0, sizeof(*sd)); \ 7065 *sd = SD_##type##_INIT; \
7223 *sd = SD_##type##_INIT; \ 7066 SD_INIT_NAME(sd, type); \
7224 sd->level = SD_LV_##type; \ 7067 sd->private = &tl->data; \
7225 SD_INIT_NAME(sd, type); \ 7068 return sd; \
7226} 7069}
7227 7070
7228SD_INIT_FUNC(CPU) 7071SD_INIT_FUNC(CPU)
@@ -7241,13 +7084,14 @@ SD_INIT_FUNC(CPU)
7241#endif 7084#endif
7242 7085
7243static int default_relax_domain_level = -1; 7086static int default_relax_domain_level = -1;
7087int sched_domain_level_max;
7244 7088
7245static int __init setup_relax_domain_level(char *str) 7089static int __init setup_relax_domain_level(char *str)
7246{ 7090{
7247 unsigned long val; 7091 unsigned long val;
7248 7092
7249 val = simple_strtoul(str, NULL, 0); 7093 val = simple_strtoul(str, NULL, 0);
7250 if (val < SD_LV_MAX) 7094 if (val < sched_domain_level_max)
7251 default_relax_domain_level = val; 7095 default_relax_domain_level = val;
7252 7096
7253 return 1; 7097 return 1;
@@ -7275,37 +7119,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7275 } 7119 }
7276} 7120}
7277 7121
7122static void __sdt_free(const struct cpumask *cpu_map);
7123static int __sdt_alloc(const struct cpumask *cpu_map);
7124
7278static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7125static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7279 const struct cpumask *cpu_map) 7126 const struct cpumask *cpu_map)
7280{ 7127{
7281 switch (what) { 7128 switch (what) {
7282 case sa_sched_groups:
7283 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7284 d->sched_group_nodes = NULL;
7285 case sa_rootdomain: 7129 case sa_rootdomain:
7286 free_rootdomain(d->rd); /* fall through */ 7130 if (!atomic_read(&d->rd->refcount))
7287 case sa_tmpmask: 7131 free_rootdomain(&d->rd->rcu); /* fall through */
7288 free_cpumask_var(d->tmpmask); /* fall through */ 7132 case sa_sd:
7289 case sa_send_covered: 7133 free_percpu(d->sd); /* fall through */
7290 free_cpumask_var(d->send_covered); /* fall through */ 7134 case sa_sd_storage:
7291 case sa_this_book_map: 7135 __sdt_free(cpu_map); /* fall through */
7292 free_cpumask_var(d->this_book_map); /* fall through */
7293 case sa_this_core_map:
7294 free_cpumask_var(d->this_core_map); /* fall through */
7295 case sa_this_sibling_map:
7296 free_cpumask_var(d->this_sibling_map); /* fall through */
7297 case sa_nodemask:
7298 free_cpumask_var(d->nodemask); /* fall through */
7299 case sa_sched_group_nodes:
7300#ifdef CONFIG_NUMA
7301 kfree(d->sched_group_nodes); /* fall through */
7302 case sa_notcovered:
7303 free_cpumask_var(d->notcovered); /* fall through */
7304 case sa_covered:
7305 free_cpumask_var(d->covered); /* fall through */
7306 case sa_domainspan:
7307 free_cpumask_var(d->domainspan); /* fall through */
7308#endif
7309 case sa_none: 7136 case sa_none:
7310 break; 7137 break;
7311 } 7138 }
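__free_domain_allocs() keeps its switch-with-fall-through shape, only with far fewer states now that the per-mask allocations are gone: the enum records how far allocation got, and each case falls through so every earlier stage is undone as well. A generic sketch of that rollback idiom, with invented names:

#include <stdio.h>
#include <stdlib.h>

/* Staged allocation with fall-through rollback; illustrative names only. */
enum toy_alloc { toy_all, toy_second, toy_first, toy_none };

struct toy_state {
        void *first;
        void *second;
};

static void toy_free(struct toy_state *s, enum toy_alloc how_far)
{
        switch (how_far) {
        case toy_all:
        case toy_second:
                free(s->second);        /* fall through */
        case toy_first:
                free(s->first);         /* fall through */
        case toy_none:
                break;
        }
}

static enum toy_alloc toy_alloc_all(struct toy_state *s)
{
        s->first = malloc(16);
        if (!s->first)
                return toy_none;        /* nothing to undo */
        s->second = malloc(16);
        if (!s->second)
                return toy_first;       /* undo only the first allocation */
        return toy_all;
}

int main(void)
{
        struct toy_state s = { NULL, NULL };
        enum toy_alloc got = toy_alloc_all(&s);

        if (got != toy_all) {
                toy_free(&s, got);      /* mirrors __free_domain_allocs(d, what, ...) */
                return 1;
        }
        printf("all stages allocated\n");
        toy_free(&s, toy_all);
        return 0;
}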
@@ -7314,308 +7141,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7314static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7141static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7315 const struct cpumask *cpu_map) 7142 const struct cpumask *cpu_map)
7316{ 7143{
7317#ifdef CONFIG_NUMA 7144 memset(d, 0, sizeof(*d));
7318 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7145
7319 return sa_none; 7146 if (__sdt_alloc(cpu_map))
7320 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7147 return sa_sd_storage;
7321 return sa_domainspan; 7148 d->sd = alloc_percpu(struct sched_domain *);
7322 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7149 if (!d->sd)
7323 return sa_covered; 7150 return sa_sd_storage;
7324 /* Allocate the per-node list of sched groups */
7325 d->sched_group_nodes = kcalloc(nr_node_ids,
7326 sizeof(struct sched_group *), GFP_KERNEL);
7327 if (!d->sched_group_nodes) {
7328 printk(KERN_WARNING "Can not alloc sched group node list\n");
7329 return sa_notcovered;
7330 }
7331 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7332#endif
7333 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7334 return sa_sched_group_nodes;
7335 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7336 return sa_nodemask;
7337 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7338 return sa_this_sibling_map;
7339 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7340 return sa_this_core_map;
7341 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7342 return sa_this_book_map;
7343 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7344 return sa_send_covered;
7345 d->rd = alloc_rootdomain(); 7151 d->rd = alloc_rootdomain();
7346 if (!d->rd) { 7152 if (!d->rd)
7347 printk(KERN_WARNING "Cannot alloc root domain\n"); 7153 return sa_sd;
7348 return sa_tmpmask;
7349 }
7350 return sa_rootdomain; 7154 return sa_rootdomain;
7351} 7155}
7352 7156
7353static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7157/*
7354 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7158 * NULL the sd_data elements we've used to build the sched_domain and
7159 * sched_group structure so that the subsequent __free_domain_allocs()
7160 * will not free the data we're using.
7161 */
7162static void claim_allocations(int cpu, struct sched_domain *sd)
7355{ 7163{
7356 struct sched_domain *sd = NULL; 7164 struct sd_data *sdd = sd->private;
7357#ifdef CONFIG_NUMA 7165 struct sched_group *sg = sd->groups;
7358 struct sched_domain *parent;
7359
7360 d->sd_allnodes = 0;
7361 if (cpumask_weight(cpu_map) >
7362 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7363 sd = &per_cpu(allnodes_domains, i).sd;
7364 SD_INIT(sd, ALLNODES);
7365 set_domain_attribute(sd, attr);
7366 cpumask_copy(sched_domain_span(sd), cpu_map);
7367 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7368 d->sd_allnodes = 1;
7369 }
7370 parent = sd;
7371
7372 sd = &per_cpu(node_domains, i).sd;
7373 SD_INIT(sd, NODE);
7374 set_domain_attribute(sd, attr);
7375 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7376 sd->parent = parent;
7377 if (parent)
7378 parent->child = sd;
7379 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7380#endif
7381 return sd;
7382}
7383 7166
7384static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7167 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7385 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7168 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7386 struct sched_domain *parent, int i)
7387{
7388 struct sched_domain *sd;
7389 sd = &per_cpu(phys_domains, i).sd;
7390 SD_INIT(sd, CPU);
7391 set_domain_attribute(sd, attr);
7392 cpumask_copy(sched_domain_span(sd), d->nodemask);
7393 sd->parent = parent;
7394 if (parent)
7395 parent->child = sd;
7396 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7397 return sd;
7398}
7399 7169
7400static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7170 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7401 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7171 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7402 struct sched_domain *parent, int i) 7172 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7403{ 7173 }
7404 struct sched_domain *sd = parent;
7405#ifdef CONFIG_SCHED_BOOK
7406 sd = &per_cpu(book_domains, i).sd;
7407 SD_INIT(sd, BOOK);
7408 set_domain_attribute(sd, attr);
7409 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7410 sd->parent = parent;
7411 parent->child = sd;
7412 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7413#endif
7414 return sd;
7415} 7174}
7416 7175
7417static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7176#ifdef CONFIG_SCHED_SMT
7418 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7177static const struct cpumask *cpu_smt_mask(int cpu)
7419 struct sched_domain *parent, int i)
7420{ 7178{
7421 struct sched_domain *sd = parent; 7179 return topology_thread_cpumask(cpu);
7422#ifdef CONFIG_SCHED_MC
7423 sd = &per_cpu(core_domains, i).sd;
7424 SD_INIT(sd, MC);
7425 set_domain_attribute(sd, attr);
7426 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7427 sd->parent = parent;
7428 parent->child = sd;
7429 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7430#endif
7431 return sd;
7432} 7180}
7433
7434static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7435 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7436 struct sched_domain *parent, int i)
7437{
7438 struct sched_domain *sd = parent;
7439#ifdef CONFIG_SCHED_SMT
7440 sd = &per_cpu(cpu_domains, i).sd;
7441 SD_INIT(sd, SIBLING);
7442 set_domain_attribute(sd, attr);
7443 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7444 sd->parent = parent;
7445 parent->child = sd;
7446 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7447#endif 7181#endif
7448 return sd;
7449}
7450 7182
7451static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7183/*
7452 const struct cpumask *cpu_map, int cpu) 7184 * Topology list, bottom-up.
7453{ 7185 */
7454 switch (l) { 7186static struct sched_domain_topology_level default_topology[] = {
7455#ifdef CONFIG_SCHED_SMT 7187#ifdef CONFIG_SCHED_SMT
7456 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7188 { sd_init_SIBLING, cpu_smt_mask, },
7457 cpumask_and(d->this_sibling_map, cpu_map,
7458 topology_thread_cpumask(cpu));
7459 if (cpu == cpumask_first(d->this_sibling_map))
7460 init_sched_build_groups(d->this_sibling_map, cpu_map,
7461 &cpu_to_cpu_group,
7462 d->send_covered, d->tmpmask);
7463 break;
7464#endif 7189#endif
7465#ifdef CONFIG_SCHED_MC 7190#ifdef CONFIG_SCHED_MC
7466 case SD_LV_MC: /* set up multi-core groups */ 7191 { sd_init_MC, cpu_coregroup_mask, },
7467 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7468 if (cpu == cpumask_first(d->this_core_map))
7469 init_sched_build_groups(d->this_core_map, cpu_map,
7470 &cpu_to_core_group,
7471 d->send_covered, d->tmpmask);
7472 break;
7473#endif 7192#endif
7474#ifdef CONFIG_SCHED_BOOK 7193#ifdef CONFIG_SCHED_BOOK
7475 case SD_LV_BOOK: /* set up book groups */ 7194 { sd_init_BOOK, cpu_book_mask, },
7476 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7477 if (cpu == cpumask_first(d->this_book_map))
7478 init_sched_build_groups(d->this_book_map, cpu_map,
7479 &cpu_to_book_group,
7480 d->send_covered, d->tmpmask);
7481 break;
7482#endif 7195#endif
7483 case SD_LV_CPU: /* set up physical groups */ 7196 { sd_init_CPU, cpu_cpu_mask, },
7484 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7485 if (!cpumask_empty(d->nodemask))
7486 init_sched_build_groups(d->nodemask, cpu_map,
7487 &cpu_to_phys_group,
7488 d->send_covered, d->tmpmask);
7489 break;
7490#ifdef CONFIG_NUMA 7197#ifdef CONFIG_NUMA
7491 case SD_LV_ALLNODES: 7198 { sd_init_NODE, cpu_node_mask, },
7492 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7199 { sd_init_ALLNODES, cpu_allnodes_mask, },
7493 d->send_covered, d->tmpmask);
7494 break;
7495#endif 7200#endif
7496 default: 7201 { NULL, },
7497 break; 7202};
7203
7204static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7205
7206static int __sdt_alloc(const struct cpumask *cpu_map)
7207{
7208 struct sched_domain_topology_level *tl;
7209 int j;
7210
7211 for (tl = sched_domain_topology; tl->init; tl++) {
7212 struct sd_data *sdd = &tl->data;
7213
7214 sdd->sd = alloc_percpu(struct sched_domain *);
7215 if (!sdd->sd)
7216 return -ENOMEM;
7217
7218 sdd->sg = alloc_percpu(struct sched_group *);
7219 if (!sdd->sg)
7220 return -ENOMEM;
7221
7222 for_each_cpu(j, cpu_map) {
7223 struct sched_domain *sd;
7224 struct sched_group *sg;
7225
7226 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7227 GFP_KERNEL, cpu_to_node(j));
7228 if (!sd)
7229 return -ENOMEM;
7230
7231 *per_cpu_ptr(sdd->sd, j) = sd;
7232
7233 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7234 GFP_KERNEL, cpu_to_node(j));
7235 if (!sg)
7236 return -ENOMEM;
7237
7238 *per_cpu_ptr(sdd->sg, j) = sg;
7239 }
7240 }
7241
7242 return 0;
7243}
7244
7245static void __sdt_free(const struct cpumask *cpu_map)
7246{
7247 struct sched_domain_topology_level *tl;
7248 int j;
7249
7250 for (tl = sched_domain_topology; tl->init; tl++) {
7251 struct sd_data *sdd = &tl->data;
7252
7253 for_each_cpu(j, cpu_map) {
7254 kfree(*per_cpu_ptr(sdd->sd, j));
7255 kfree(*per_cpu_ptr(sdd->sg, j));
7256 }
7257 free_percpu(sdd->sd);
7258 free_percpu(sdd->sg);
7498 } 7259 }
7499} 7260}
7500 7261
7262struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7263 struct s_data *d, const struct cpumask *cpu_map,
7264 struct sched_domain_attr *attr, struct sched_domain *child,
7265 int cpu)
7266{
7267 struct sched_domain *sd = tl->init(tl, cpu);
7268 if (!sd)
7269 return child;
7270
7271 set_domain_attribute(sd, attr);
7272 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7273 if (child) {
7274 sd->level = child->level + 1;
7275 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7276 child->parent = sd;
7277 }
7278 sd->child = child;
7279
7280 return sd;
7281}
7282
7501/* 7283/*
7502 * Build sched domains for a given set of cpus and attach the sched domains 7284 * Build sched domains for a given set of cpus and attach the sched domains
7503 * to the individual cpus 7285 * to the individual cpus
7504 */ 7286 */
7505static int __build_sched_domains(const struct cpumask *cpu_map, 7287static int build_sched_domains(const struct cpumask *cpu_map,
7506 struct sched_domain_attr *attr) 7288 struct sched_domain_attr *attr)
7507{ 7289{
7508 enum s_alloc alloc_state = sa_none; 7290 enum s_alloc alloc_state = sa_none;
7509 struct s_data d;
7510 struct sched_domain *sd; 7291 struct sched_domain *sd;
7511 int i; 7292 struct s_data d;
7512#ifdef CONFIG_NUMA 7293 int i, ret = -ENOMEM;
7513 d.sd_allnodes = 0;
7514#endif
7515 7294
7516 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7295 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7517 if (alloc_state != sa_rootdomain) 7296 if (alloc_state != sa_rootdomain)
7518 goto error; 7297 goto error;
7519 alloc_state = sa_sched_groups;
7520
7521 /*
7522 * Set up domains for cpus specified by the cpu_map.
7523 */
7524 for_each_cpu(i, cpu_map) {
7525 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
7526 cpu_map);
7527
7528 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7529 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7530 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7531 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7532 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7533 }
7534 7298
7299 /* Set up domains for cpus specified by the cpu_map. */
7535 for_each_cpu(i, cpu_map) { 7300 for_each_cpu(i, cpu_map) {
7536 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7301 struct sched_domain_topology_level *tl;
7537 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7538 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7539 }
7540
7541 /* Set up physical groups */
7542 for (i = 0; i < nr_node_ids; i++)
7543 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
7544 7302
7545#ifdef CONFIG_NUMA 7303 sd = NULL;
7546 /* Set up node groups */ 7304 for (tl = sched_domain_topology; tl->init; tl++)
7547 if (d.sd_allnodes) 7305 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7548 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7549 7306
7550 for (i = 0; i < nr_node_ids; i++) 7307 while (sd->child)
7551 if (build_numa_sched_groups(&d, cpu_map, i)) 7308 sd = sd->child;
7552 goto error;
7553#endif
7554 7309
7555 /* Calculate CPU power for physical packages and nodes */ 7310 *per_cpu_ptr(d.sd, i) = sd;
7556#ifdef CONFIG_SCHED_SMT
7557 for_each_cpu(i, cpu_map) {
7558 sd = &per_cpu(cpu_domains, i).sd;
7559 init_sched_groups_power(i, sd);
7560 } 7311 }
7561#endif
7562#ifdef CONFIG_SCHED_MC
7563 for_each_cpu(i, cpu_map) {
7564 sd = &per_cpu(core_domains, i).sd;
7565 init_sched_groups_power(i, sd);
7566 }
7567#endif
7568#ifdef CONFIG_SCHED_BOOK
7569 for_each_cpu(i, cpu_map) {
7570 sd = &per_cpu(book_domains, i).sd;
7571 init_sched_groups_power(i, sd);
7572 }
7573#endif
7574 7312
7313 /* Build the groups for the domains */
7575 for_each_cpu(i, cpu_map) { 7314 for_each_cpu(i, cpu_map) {
7576 sd = &per_cpu(phys_domains, i).sd; 7315 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7577 init_sched_groups_power(i, sd); 7316 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7578 } 7317 get_group(i, sd->private, &sd->groups);
7318 atomic_inc(&sd->groups->ref);
7579 7319
7580#ifdef CONFIG_NUMA 7320 if (i != cpumask_first(sched_domain_span(sd)))
7581 for (i = 0; i < nr_node_ids; i++) 7321 continue;
7582 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7583 7322
7584 if (d.sd_allnodes) { 7323 build_sched_groups(sd);
7585 struct sched_group *sg; 7324 }
7325 }
7326
7327 /* Calculate CPU power for physical packages and nodes */
7328 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7329 if (!cpumask_test_cpu(i, cpu_map))
7330 continue;
7586 7331
7587 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7332 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7588 d.tmpmask); 7333 claim_allocations(i, sd);
7589 init_numa_sched_groups_power(sg); 7334 init_sched_groups_power(i, sd);
7335 }
7590 } 7336 }
7591#endif
7592 7337
7593 /* Attach the domains */ 7338 /* Attach the domains */
7339 rcu_read_lock();
7594 for_each_cpu(i, cpu_map) { 7340 for_each_cpu(i, cpu_map) {
7595#ifdef CONFIG_SCHED_SMT 7341 sd = *per_cpu_ptr(d.sd, i);
7596 sd = &per_cpu(cpu_domains, i).sd;
7597#elif defined(CONFIG_SCHED_MC)
7598 sd = &per_cpu(core_domains, i).sd;
7599#elif defined(CONFIG_SCHED_BOOK)
7600 sd = &per_cpu(book_domains, i).sd;
7601#else
7602 sd = &per_cpu(phys_domains, i).sd;
7603#endif
7604 cpu_attach_domain(sd, d.rd, i); 7342 cpu_attach_domain(sd, d.rd, i);
7605 } 7343 }
7344 rcu_read_unlock();
7606 7345
7607 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7346 ret = 0;
7608 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7609 return 0;
7610
7611error: 7347error:
7612 __free_domain_allocs(&d, alloc_state, cpu_map); 7348 __free_domain_allocs(&d, alloc_state, cpu_map);
7613 return -ENOMEM; 7349 return ret;
7614}
7615
7616static int build_sched_domains(const struct cpumask *cpu_map)
7617{
7618 return __build_sched_domains(cpu_map, NULL);
7619} 7350}
7620 7351
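build_sched_domains() now makes a single pass over the NULL-terminated topology table for each CPU, and build_sched_domain() stacks every new level on top of the previous one through the child/parent pointers while tracking sched_domain_level_max. A userspace sketch of that bottom-up chaining, with invented level names:

#include <stdio.h>
#include <stdlib.h>

struct toy_domain {
        const char *name;
        int level;
        struct toy_domain *parent;
        struct toy_domain *child;
};

struct toy_level {
        const char *name;
};

/* Bottom-up and NULL-terminated, like default_topology[] above. */
static const struct toy_level toy_topology[] = {
        { "SMT" }, { "MC" }, { "CPU" }, { NULL },
};

static struct toy_domain *toy_build_chain(void)
{
        struct toy_domain *child = NULL;
        const struct toy_level *tl;

        for (tl = toy_topology; tl->name; tl++) {
                struct toy_domain *sd = calloc(1, sizeof(*sd));

                sd->name = tl->name;
                if (child) {
                        sd->level = child->level + 1;   /* feeds sched_domain_level_max */
                        child->parent = sd;
                }
                sd->child = child;
                child = sd;             /* the next level wraps around this one */
        }
        return child;                   /* topmost level */
}

int main(void)
{
        struct toy_domain *sd;

        for (sd = toy_build_chain(); sd; sd = sd->child)
                printf("level %d: %s\n", sd->level, sd->name);
        return 0;
}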
7621static cpumask_var_t *doms_cur; /* current sched domains */ 7352static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7670,7 +7401,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7670 * For now this just excludes isolated cpus, but could be used to 7401 * For now this just excludes isolated cpus, but could be used to
7671 * exclude other special cases in the future. 7402 * exclude other special cases in the future.
7672 */ 7403 */
7673static int arch_init_sched_domains(const struct cpumask *cpu_map) 7404static int init_sched_domains(const struct cpumask *cpu_map)
7674{ 7405{
7675 int err; 7406 int err;
7676 7407
@@ -7681,32 +7412,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7681 doms_cur = &fallback_doms; 7412 doms_cur = &fallback_doms;
7682 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7413 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7683 dattr_cur = NULL; 7414 dattr_cur = NULL;
7684 err = build_sched_domains(doms_cur[0]); 7415 err = build_sched_domains(doms_cur[0], NULL);
7685 register_sched_domain_sysctl(); 7416 register_sched_domain_sysctl();
7686 7417
7687 return err; 7418 return err;
7688} 7419}
7689 7420
7690static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7691 struct cpumask *tmpmask)
7692{
7693 free_sched_groups(cpu_map, tmpmask);
7694}
7695
7696/* 7421/*
7697 * Detach sched domains from a group of cpus specified in cpu_map 7422 * Detach sched domains from a group of cpus specified in cpu_map
7698 * These cpus will now be attached to the NULL domain 7423 * These cpus will now be attached to the NULL domain
7699 */ 7424 */
7700static void detach_destroy_domains(const struct cpumask *cpu_map) 7425static void detach_destroy_domains(const struct cpumask *cpu_map)
7701{ 7426{
7702 /* Save because hotplug lock held. */
7703 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7704 int i; 7427 int i;
7705 7428
7429 rcu_read_lock();
7706 for_each_cpu(i, cpu_map) 7430 for_each_cpu(i, cpu_map)
7707 cpu_attach_domain(NULL, &def_root_domain, i); 7431 cpu_attach_domain(NULL, &def_root_domain, i);
7708 synchronize_sched(); 7432 rcu_read_unlock();
7709 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7710} 7433}
7711 7434
7712/* handle null as "default" */ 7435/* handle null as "default" */
@@ -7795,8 +7518,7 @@ match1:
7795 goto match2; 7518 goto match2;
7796 } 7519 }
7797 /* no match - add a new doms_new */ 7520 /* no match - add a new doms_new */
7798 __build_sched_domains(doms_new[i], 7521 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7799 dattr_new ? dattr_new + i : NULL);
7800match2: 7522match2:
7801 ; 7523 ;
7802 } 7524 }
@@ -7815,7 +7537,7 @@ match2:
7815} 7537}
7816 7538
7817#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7539#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7818static void arch_reinit_sched_domains(void) 7540static void reinit_sched_domains(void)
7819{ 7541{
7820 get_online_cpus(); 7542 get_online_cpus();
7821 7543
@@ -7848,7 +7570,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7848 else 7570 else
7849 sched_mc_power_savings = level; 7571 sched_mc_power_savings = level;
7850 7572
7851 arch_reinit_sched_domains(); 7573 reinit_sched_domains();
7852 7574
7853 return count; 7575 return count;
7854} 7576}
@@ -7967,14 +7689,9 @@ void __init sched_init_smp(void)
7967 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7689 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7968 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7690 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7969 7691
7970#if defined(CONFIG_NUMA)
7971 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7972 GFP_KERNEL);
7973 BUG_ON(sched_group_nodes_bycpu == NULL);
7974#endif
7975 get_online_cpus(); 7692 get_online_cpus();
7976 mutex_lock(&sched_domains_mutex); 7693 mutex_lock(&sched_domains_mutex);
7977 arch_init_sched_domains(cpu_active_mask); 7694 init_sched_domains(cpu_active_mask);
7978 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7695 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7979 if (cpumask_empty(non_isolated_cpus)) 7696 if (cpumask_empty(non_isolated_cpus))
7980 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7697 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -8224,7 +7941,7 @@ void __init sched_init(void)
8224#ifdef CONFIG_SMP 7941#ifdef CONFIG_SMP
8225 rq->sd = NULL; 7942 rq->sd = NULL;
8226 rq->rd = NULL; 7943 rq->rd = NULL;
8227 rq->cpu_power = SCHED_LOAD_SCALE; 7944 rq->cpu_power = SCHED_POWER_SCALE;
8228 rq->post_schedule = 0; 7945 rq->post_schedule = 0;
8229 rq->active_balance = 0; 7946 rq->active_balance = 0;
8230 rq->next_balance = jiffies; 7947 rq->next_balance = jiffies;
@@ -8281,6 +7998,7 @@ void __init sched_init(void)
8281 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7998 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8282 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7999 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8283#ifdef CONFIG_SMP 8000#ifdef CONFIG_SMP
8001 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8284#ifdef CONFIG_NO_HZ 8002#ifdef CONFIG_NO_HZ
8285 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 8003 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8286 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 8004 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8340,7 +8058,7 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8340 int old_prio = p->prio; 8058 int old_prio = p->prio;
8341 int on_rq; 8059 int on_rq;
8342 8060
8343 on_rq = p->se.on_rq; 8061 on_rq = p->on_rq;
8344 if (on_rq) 8062 if (on_rq)
8345 deactivate_task(rq, p, 0); 8063 deactivate_task(rq, p, 0);
8346 __setscheduler(rq, p, SCHED_NORMAL, 0); 8064 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8553,7 +8271,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8553{ 8271{
8554 struct rt_rq *rt_rq; 8272 struct rt_rq *rt_rq;
8555 struct sched_rt_entity *rt_se; 8273 struct sched_rt_entity *rt_se;
8556 struct rq *rq;
8557 int i; 8274 int i;
8558 8275
8559 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8276 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8567,8 +8284,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8567 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8284 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8568 8285
8569 for_each_possible_cpu(i) { 8286 for_each_possible_cpu(i) {
8570 rq = cpu_rq(i);
8571
8572 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8287 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8573 GFP_KERNEL, cpu_to_node(i)); 8288 GFP_KERNEL, cpu_to_node(i));
8574 if (!rt_rq) 8289 if (!rt_rq)
@@ -8683,7 +8398,7 @@ void sched_move_task(struct task_struct *tsk)
8683 rq = task_rq_lock(tsk, &flags); 8398 rq = task_rq_lock(tsk, &flags);
8684 8399
8685 running = task_current(rq, tsk); 8400 running = task_current(rq, tsk);
8686 on_rq = tsk->se.on_rq; 8401 on_rq = tsk->on_rq;
8687 8402
8688 if (on_rq) 8403 if (on_rq)
8689 dequeue_task(rq, tsk, 0); 8404 dequeue_task(rq, tsk, 0);
@@ -8702,7 +8417,7 @@ void sched_move_task(struct task_struct *tsk)
8702 if (on_rq) 8417 if (on_rq)
8703 enqueue_task(rq, tsk, 0); 8418 enqueue_task(rq, tsk, 0);
8704 8419
8705 task_rq_unlock(rq, &flags); 8420 task_rq_unlock(rq, tsk, &flags);
8706} 8421}
8707#endif /* CONFIG_CGROUP_SCHED */ 8422#endif /* CONFIG_CGROUP_SCHED */
8708 8423
@@ -9073,42 +8788,10 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9073 return 0; 8788 return 0;
9074} 8789}
9075 8790
9076static int
9077cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9078 struct task_struct *tsk, bool threadgroup)
9079{
9080 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
9081 if (retval)
9082 return retval;
9083 if (threadgroup) {
9084 struct task_struct *c;
9085 rcu_read_lock();
9086 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9087 retval = cpu_cgroup_can_attach_task(cgrp, c);
9088 if (retval) {
9089 rcu_read_unlock();
9090 return retval;
9091 }
9092 }
9093 rcu_read_unlock();
9094 }
9095 return 0;
9096}
9097
9098static void 8791static void
9099cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8792cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
9100 struct cgroup *old_cont, struct task_struct *tsk,
9101 bool threadgroup)
9102{ 8793{
9103 sched_move_task(tsk); 8794 sched_move_task(tsk);
9104 if (threadgroup) {
9105 struct task_struct *c;
9106 rcu_read_lock();
9107 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
9108 sched_move_task(c);
9109 }
9110 rcu_read_unlock();
9111 }
9112} 8795}
9113 8796
9114static void 8797static void
@@ -9130,14 +8813,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
9130static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8813static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9131 u64 shareval) 8814 u64 shareval)
9132{ 8815{
9133 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8816 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
9134} 8817}
9135 8818
9136static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8819static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9137{ 8820{
9138 struct task_group *tg = cgroup_tg(cgrp); 8821 struct task_group *tg = cgroup_tg(cgrp);
9139 8822
9140 return (u64) tg->shares; 8823 return (u64) scale_load_down(tg->shares);
9141} 8824}
9142#endif /* CONFIG_FAIR_GROUP_SCHED */ 8825#endif /* CONFIG_FAIR_GROUP_SCHED */
9143 8826
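cpu.shares is now converted with scale_load() on write and scale_load_down() on read, so group weights carry extra fixed-point resolution inside the scheduler while user space keeps seeing the value it wrote. A sketch assuming a 10-bit resolution shift (an assumption; the real shift depends on the configuration):

#include <stdio.h>

#define TOY_LOAD_RESOLUTION 10
#define toy_scale_load(w)       ((unsigned long)(w) << TOY_LOAD_RESOLUTION)
#define toy_scale_load_down(w)  ((unsigned long)(w) >> TOY_LOAD_RESOLUTION)

int main(void)
{
        unsigned long user_shares = 1024;       /* value written to cpu.shares */
        unsigned long stored = toy_scale_load(user_shares);

        printf("stored weight:  %lu\n", stored);                        /* 1048576 */
        printf("reported back:  %lu\n", toy_scale_load_down(stored));   /* 1024 */
        return 0;
}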
@@ -9196,8 +8879,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9196 .name = "cpu", 8879 .name = "cpu",
9197 .create = cpu_cgroup_create, 8880 .create = cpu_cgroup_create,
9198 .destroy = cpu_cgroup_destroy, 8881 .destroy = cpu_cgroup_destroy,
9199 .can_attach = cpu_cgroup_can_attach, 8882 .can_attach_task = cpu_cgroup_can_attach_task,
9200 .attach = cpu_cgroup_attach, 8883 .attach_task = cpu_cgroup_attach_task,
9201 .exit = cpu_cgroup_exit, 8884 .exit = cpu_cgroup_exit,
9202 .populate = cpu_cgroup_populate, 8885 .populate = cpu_cgroup_populate,
9203 .subsys_id = cpu_cgroup_subsys_id, 8886 .subsys_id = cpu_cgroup_subsys_id,