Diffstat (limited to 'kernel/sched.c')
-rw-r--r--    kernel/sched.c    1016
1 file changed, 591 insertions, 425 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 76c0e9691fc0..3a8fb30a91b1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -141,7 +141,7 @@ struct rt_prio_array {
141 141
142struct rt_bandwidth { 142struct rt_bandwidth {
143 /* nests inside the rq lock: */ 143 /* nests inside the rq lock: */
144 spinlock_t rt_runtime_lock; 144 raw_spinlock_t rt_runtime_lock;
145 ktime_t rt_period; 145 ktime_t rt_period;
146 u64 rt_runtime; 146 u64 rt_runtime;
147 struct hrtimer rt_period_timer; 147 struct hrtimer rt_period_timer;
@@ -178,7 +178,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
178 rt_b->rt_period = ns_to_ktime(period); 178 rt_b->rt_period = ns_to_ktime(period);
179 rt_b->rt_runtime = runtime; 179 rt_b->rt_runtime = runtime;
180 180
181 spin_lock_init(&rt_b->rt_runtime_lock); 181 raw_spin_lock_init(&rt_b->rt_runtime_lock);
182 182
183 hrtimer_init(&rt_b->rt_period_timer, 183 hrtimer_init(&rt_b->rt_period_timer,
184 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 184 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -200,7 +200,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
200 if (hrtimer_active(&rt_b->rt_period_timer)) 200 if (hrtimer_active(&rt_b->rt_period_timer))
201 return; 201 return;
202 202
203 spin_lock(&rt_b->rt_runtime_lock); 203 raw_spin_lock(&rt_b->rt_runtime_lock);
204 for (;;) { 204 for (;;) {
205 unsigned long delta; 205 unsigned long delta;
206 ktime_t soft, hard; 206 ktime_t soft, hard;
@@ -217,7 +217,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
218 HRTIMER_MODE_ABS_PINNED, 0); 218 HRTIMER_MODE_ABS_PINNED, 0);
219 } 219 }
220 spin_unlock(&rt_b->rt_runtime_lock); 220 raw_spin_unlock(&rt_b->rt_runtime_lock);
221} 221}
222 222
223#ifdef CONFIG_RT_GROUP_SCHED 223#ifdef CONFIG_RT_GROUP_SCHED
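Most of the locking changes in this diff repeat one mechanical conversion: rq->lock and the rt_runtime locks become raw_spinlock_t, with matching raw_spin_* calls, so they remain true spinning locks even in configurations where plain spinlock_t may become preemptible. A minimal, hedged sketch of the converted API as it is used in these hunks (the struct and function names below are illustrative, not from sched.c):

#include <linux/spinlock.h>
#include <linux/types.h>

/* Illustrative only: mirrors the spinlock_t -> raw_spinlock_t pattern above. */
struct demo_bandwidth {
        raw_spinlock_t  lock;           /* was: spinlock_t */
        u64             runtime;
};

static void demo_init(struct demo_bandwidth *b)
{
        raw_spin_lock_init(&b->lock);   /* was: spin_lock_init() */
        b->runtime = 0;
}

static void demo_charge(struct demo_bandwidth *b, u64 delta)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&b->lock, flags);         /* was: spin_lock_irqsave() */
        b->runtime += delta;
        raw_spin_unlock_irqrestore(&b->lock, flags);    /* was: spin_unlock_irqrestore() */
}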
@@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
298 298
299#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
309 */ 309 */
310static DEFINE_SPINLOCK(task_group_lock); 310static DEFINE_SPINLOCK(task_group_lock);
311 311
312#ifdef CONFIG_FAIR_GROUP_SCHED
313
312#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
313static int root_task_group_empty(void) 315static int root_task_group_empty(void)
314{ 316{
@@ -316,7 +318,6 @@ static int root_task_group_empty(void)
316} 318}
317#endif 319#endif
318 320
319#ifdef CONFIG_FAIR_GROUP_SCHED
320#ifdef CONFIG_USER_SCHED 321#ifdef CONFIG_USER_SCHED
321# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
322#else /* !CONFIG_USER_SCHED */ 323#else /* !CONFIG_USER_SCHED */
@@ -469,7 +470,7 @@ struct rt_rq {
469 u64 rt_time; 470 u64 rt_time;
470 u64 rt_runtime; 471 u64 rt_runtime;
471 /* Nests inside the rq lock: */ 472 /* Nests inside the rq lock: */
472 spinlock_t rt_runtime_lock; 473 raw_spinlock_t rt_runtime_lock;
473 474
474#ifdef CONFIG_RT_GROUP_SCHED 475#ifdef CONFIG_RT_GROUP_SCHED
475 unsigned long rt_nr_boosted; 476 unsigned long rt_nr_boosted;
@@ -524,7 +525,7 @@ static struct root_domain def_root_domain;
524 */ 525 */
525struct rq { 526struct rq {
526 /* runqueue lock: */ 527 /* runqueue lock: */
527 spinlock_t lock; 528 raw_spinlock_t lock;
528 529
529 /* 530 /*
530 * nr_running and cpu_load should be in the same cacheline because 531 * nr_running and cpu_load should be in the same cacheline because
@@ -534,14 +535,12 @@ struct rq {
534 #define CPU_LOAD_IDX_MAX 5 535 #define CPU_LOAD_IDX_MAX 5
535 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
536#ifdef CONFIG_NO_HZ 537#ifdef CONFIG_NO_HZ
537 unsigned long last_tick_seen;
538 unsigned char in_nohz_recently; 538 unsigned char in_nohz_recently;
539#endif 539#endif
540 /* capture load from *all* tasks on this cpu: */ 540 /* capture load from *all* tasks on this cpu: */
541 struct load_weight load; 541 struct load_weight load;
542 unsigned long nr_load_updates; 542 unsigned long nr_load_updates;
543 u64 nr_switches; 543 u64 nr_switches;
544 u64 nr_migrations_in;
545 544
546 struct cfs_rq cfs; 545 struct cfs_rq cfs;
547 struct rt_rq rt; 546 struct rt_rq rt;
@@ -590,6 +589,8 @@ struct rq {
590 589
591 u64 rt_avg; 590 u64 rt_avg;
592 u64 age_stamp; 591 u64 age_stamp;
592 u64 idle_stamp;
593 u64 avg_idle;
593#endif 594#endif
594 595
595 /* calc_load related fields */ 596 /* calc_load related fields */
@@ -676,6 +677,7 @@ inline void update_rq_clock(struct rq *rq)
676 677
677/** 678/**
678 * runqueue_is_locked 679 * runqueue_is_locked
680 * @cpu: the processor in question.
679 * 681 *
680 * Returns true if the current cpu runqueue is locked. 682 * Returns true if the current cpu runqueue is locked.
681 * This interface allows printk to be called with the runqueue lock 683 * This interface allows printk to be called with the runqueue lock
@@ -683,7 +685,7 @@ inline void update_rq_clock(struct rq *rq)
683 */ 685 */
684int runqueue_is_locked(int cpu) 686int runqueue_is_locked(int cpu)
685{ 687{
686 return spin_is_locked(&cpu_rq(cpu)->lock); 688 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
687} 689}
688 690
689/* 691/*
@@ -770,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
770 if (!sched_feat_names[i]) 772 if (!sched_feat_names[i])
771 return -EINVAL; 773 return -EINVAL;
772 774
773 filp->f_pos += cnt; 775 *ppos += cnt;
774 776
775 return cnt; 777 return cnt;
776} 778}
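The sched_feat_write() change above swaps filp->f_pos for *ppos: the VFS hands a write handler its position through *ppos (for pwrite() that is not f_pos at all) and syncs f_pos itself afterwards, so a handler should only advance *ppos. A hedged sketch of the expected shape of such a handler (names are made up for illustration):

#include <linux/fs.h>
#include <linux/uaccess.h>

/* Illustrative handler, not the sched_feat code itself. */
static ssize_t demo_write(struct file *filp, const char __user *ubuf,
                          size_t cnt, loff_t *ppos)
{
        char buf[64];

        if (cnt > sizeof(buf) - 1)
                return -EINVAL;
        if (copy_from_user(buf, ubuf, cnt))
                return -EFAULT;
        buf[cnt] = 0;

        /* ... parse buf and apply the setting ... */

        *ppos += cnt;   /* advance the caller's position, never filp->f_pos */
        return cnt;
}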
@@ -812,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
812 * default: 0.25ms 814 * default: 0.25ms
813 */ 815 */
814unsigned int sysctl_sched_shares_ratelimit = 250000; 816unsigned int sysctl_sched_shares_ratelimit = 250000;
817unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815 818
816/* 819/*
817 * Inject some fuzzyness into changing the per-cpu group shares 820 * Inject some fuzzyness into changing the per-cpu group shares
@@ -890,7 +893,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
890 */ 893 */
891 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 894 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
892 895
893 spin_unlock_irq(&rq->lock); 896 raw_spin_unlock_irq(&rq->lock);
894} 897}
895 898
896#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 899#else /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -914,9 +917,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
914 next->oncpu = 1; 917 next->oncpu = 1;
915#endif 918#endif
916#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 919#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
917 spin_unlock_irq(&rq->lock); 920 raw_spin_unlock_irq(&rq->lock);
918#else 921#else
919 spin_unlock(&rq->lock); 922 raw_spin_unlock(&rq->lock);
920#endif 923#endif
921} 924}
922 925
@@ -946,10 +949,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
946{ 949{
947 for (;;) { 950 for (;;) {
948 struct rq *rq = task_rq(p); 951 struct rq *rq = task_rq(p);
949 spin_lock(&rq->lock); 952 raw_spin_lock(&rq->lock);
950 if (likely(rq == task_rq(p))) 953 if (likely(rq == task_rq(p)))
951 return rq; 954 return rq;
952 spin_unlock(&rq->lock); 955 raw_spin_unlock(&rq->lock);
953 } 956 }
954} 957}
955 958
@@ -966,10 +969,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
966 for (;;) { 969 for (;;) {
967 local_irq_save(*flags); 970 local_irq_save(*flags);
968 rq = task_rq(p); 971 rq = task_rq(p);
969 spin_lock(&rq->lock); 972 raw_spin_lock(&rq->lock);
970 if (likely(rq == task_rq(p))) 973 if (likely(rq == task_rq(p)))
971 return rq; 974 return rq;
972 spin_unlock_irqrestore(&rq->lock, *flags); 975 raw_spin_unlock_irqrestore(&rq->lock, *flags);
973 } 976 }
974} 977}
975 978
@@ -978,19 +981,19 @@ void task_rq_unlock_wait(struct task_struct *p)
978 struct rq *rq = task_rq(p); 981 struct rq *rq = task_rq(p);
979 982
980 smp_mb(); /* spin-unlock-wait is not a full memory barrier */ 983 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
981 spin_unlock_wait(&rq->lock); 984 raw_spin_unlock_wait(&rq->lock);
982} 985}
983 986
984static void __task_rq_unlock(struct rq *rq) 987static void __task_rq_unlock(struct rq *rq)
985 __releases(rq->lock) 988 __releases(rq->lock)
986{ 989{
987 spin_unlock(&rq->lock); 990 raw_spin_unlock(&rq->lock);
988} 991}
989 992
990static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 993static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
991 __releases(rq->lock) 994 __releases(rq->lock)
992{ 995{
993 spin_unlock_irqrestore(&rq->lock, *flags); 996 raw_spin_unlock_irqrestore(&rq->lock, *flags);
994} 997}
995 998
996/* 999/*
@@ -1003,7 +1006,7 @@ static struct rq *this_rq_lock(void)
1003 1006
1004 local_irq_disable(); 1007 local_irq_disable();
1005 rq = this_rq(); 1008 rq = this_rq();
1006 spin_lock(&rq->lock); 1009 raw_spin_lock(&rq->lock);
1007 1010
1008 return rq; 1011 return rq;
1009} 1012}
@@ -1050,10 +1053,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1050 1053
1051 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1054 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1052 1055
1053 spin_lock(&rq->lock); 1056 raw_spin_lock(&rq->lock);
1054 update_rq_clock(rq); 1057 update_rq_clock(rq);
1055 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1058 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1056 spin_unlock(&rq->lock); 1059 raw_spin_unlock(&rq->lock);
1057 1060
1058 return HRTIMER_NORESTART; 1061 return HRTIMER_NORESTART;
1059} 1062}
@@ -1066,10 +1069,10 @@ static void __hrtick_start(void *arg)
1066{ 1069{
1067 struct rq *rq = arg; 1070 struct rq *rq = arg;
1068 1071
1069 spin_lock(&rq->lock); 1072 raw_spin_lock(&rq->lock);
1070 hrtimer_restart(&rq->hrtick_timer); 1073 hrtimer_restart(&rq->hrtick_timer);
1071 rq->hrtick_csd_pending = 0; 1074 rq->hrtick_csd_pending = 0;
1072 spin_unlock(&rq->lock); 1075 raw_spin_unlock(&rq->lock);
1073} 1076}
1074 1077
1075/* 1078/*
@@ -1176,7 +1179,7 @@ static void resched_task(struct task_struct *p)
1176{ 1179{
1177 int cpu; 1180 int cpu;
1178 1181
1179 assert_spin_locked(&task_rq(p)->lock); 1182 assert_raw_spin_locked(&task_rq(p)->lock);
1180 1183
1181 if (test_tsk_need_resched(p)) 1184 if (test_tsk_need_resched(p))
1182 return; 1185 return;
@@ -1198,10 +1201,10 @@ static void resched_cpu(int cpu)
1198 struct rq *rq = cpu_rq(cpu); 1201 struct rq *rq = cpu_rq(cpu);
1199 unsigned long flags; 1202 unsigned long flags;
1200 1203
1201 if (!spin_trylock_irqsave(&rq->lock, flags)) 1204 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1202 return; 1205 return;
1203 resched_task(cpu_curr(cpu)); 1206 resched_task(cpu_curr(cpu));
1204 spin_unlock_irqrestore(&rq->lock, flags); 1207 raw_spin_unlock_irqrestore(&rq->lock, flags);
1205} 1208}
1206 1209
1207#ifdef CONFIG_NO_HZ 1210#ifdef CONFIG_NO_HZ
@@ -1270,7 +1273,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1270#else /* !CONFIG_SMP */ 1273#else /* !CONFIG_SMP */
1271static void resched_task(struct task_struct *p) 1274static void resched_task(struct task_struct *p)
1272{ 1275{
1273 assert_spin_locked(&task_rq(p)->lock); 1276 assert_raw_spin_locked(&task_rq(p)->lock);
1274 set_tsk_need_resched(p); 1277 set_tsk_need_resched(p);
1275} 1278}
1276 1279
@@ -1563,11 +1566,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1563 1566
1564#ifdef CONFIG_FAIR_GROUP_SCHED 1567#ifdef CONFIG_FAIR_GROUP_SCHED
1565 1568
1566struct update_shares_data { 1569static __read_mostly unsigned long *update_shares_data;
1567 unsigned long rq_weight[NR_CPUS];
1568};
1569
1570static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1571 1570
1572static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1571static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1573 1572
@@ -1577,12 +1576,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1577static void update_group_shares_cpu(struct task_group *tg, int cpu, 1576static void update_group_shares_cpu(struct task_group *tg, int cpu,
1578 unsigned long sd_shares, 1577 unsigned long sd_shares,
1579 unsigned long sd_rq_weight, 1578 unsigned long sd_rq_weight,
1580 struct update_shares_data *usd) 1579 unsigned long *usd_rq_weight)
1581{ 1580{
1582 unsigned long shares, rq_weight; 1581 unsigned long shares, rq_weight;
1583 int boost = 0; 1582 int boost = 0;
1584 1583
1585 rq_weight = usd->rq_weight[cpu]; 1584 rq_weight = usd_rq_weight[cpu];
1586 if (!rq_weight) { 1585 if (!rq_weight) {
1587 boost = 1; 1586 boost = 1;
1588 rq_weight = NICE_0_LOAD; 1587 rq_weight = NICE_0_LOAD;
@@ -1601,11 +1600,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1601 struct rq *rq = cpu_rq(cpu); 1600 struct rq *rq = cpu_rq(cpu);
1602 unsigned long flags; 1601 unsigned long flags;
1603 1602
1604 spin_lock_irqsave(&rq->lock, flags); 1603 raw_spin_lock_irqsave(&rq->lock, flags);
1605 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1604 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1606 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1605 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1607 __set_se_shares(tg->se[cpu], shares); 1606 __set_se_shares(tg->se[cpu], shares);
1608 spin_unlock_irqrestore(&rq->lock, flags); 1607 raw_spin_unlock_irqrestore(&rq->lock, flags);
1609 } 1608 }
1610} 1609}
1611 1610
@@ -1616,8 +1615,8 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1616 */ 1615 */
1617static int tg_shares_up(struct task_group *tg, void *data) 1616static int tg_shares_up(struct task_group *tg, void *data)
1618{ 1617{
1619 unsigned long weight, rq_weight = 0, shares = 0; 1618 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1620 struct update_shares_data *usd; 1619 unsigned long *usd_rq_weight;
1621 struct sched_domain *sd = data; 1620 struct sched_domain *sd = data;
1622 unsigned long flags; 1621 unsigned long flags;
1623 int i; 1622 int i;
@@ -1626,12 +1625,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1626 return 0; 1625 return 0;
1627 1626
1628 local_irq_save(flags); 1627 local_irq_save(flags);
1629 usd = &__get_cpu_var(update_shares_data); 1628 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1630 1629
1631 for_each_cpu(i, sched_domain_span(sd)) { 1630 for_each_cpu(i, sched_domain_span(sd)) {
1632 weight = tg->cfs_rq[i]->load.weight; 1631 weight = tg->cfs_rq[i]->load.weight;
1633 usd->rq_weight[i] = weight; 1632 usd_rq_weight[i] = weight;
1634 1633
1634 rq_weight += weight;
1635 /* 1635 /*
1636 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1637 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
@@ -1640,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1640 if (!weight) 1640 if (!weight)
1641 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1642 1642
1643 rq_weight += weight; 1643 sum_weight += weight;
1644 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1645 } 1645 }
1646 1646
1647 if (!rq_weight)
1648 rq_weight = sum_weight;
1649
1647 if ((!shares && rq_weight) || shares > tg->shares) 1650 if ((!shares && rq_weight) || shares > tg->shares)
1648 shares = tg->shares; 1651 shares = tg->shares;
1649 1652
@@ -1651,7 +1654,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1651 shares = tg->shares; 1654 shares = tg->shares;
1652 1655
1653 for_each_cpu(i, sched_domain_span(sd)) 1656 for_each_cpu(i, sched_domain_span(sd))
1654 update_group_shares_cpu(tg, i, shares, rq_weight, usd); 1657 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1655 1658
1656 local_irq_restore(flags); 1659 local_irq_restore(flags);
1657 1660
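tg_shares_up() now fills a flat per-cpu array of rq weights (usd_rq_weight) instead of a DEFINE_PER_CPU struct carrying an NR_CPUS-sized member, so the memory scales with nr_cpu_ids. The allocation itself happens elsewhere in the patch and is not part of these hunks; a plausible sketch of the assumed pattern, with illustrative names:

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/cpumask.h>
#include <linux/errno.h>

static __read_mostly unsigned long *demo_shares_data;

/* Presumably done once during scheduler init in the real patch. */
static int demo_alloc_shares(void)
{
        demo_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
                                          __alignof__(unsigned long));
        return demo_shares_data ? 0 : -ENOMEM;
}

/* Each CPU gets its own nr_cpu_ids-long scratch array; IRQs are off here,
 * as they are in tg_shares_up() under local_irq_save(). */
static void demo_fill(unsigned long weight)
{
        unsigned long *usd = per_cpu_ptr(demo_shares_data, smp_processor_id());
        int i;

        for_each_possible_cpu(i)
                usd[i] = weight;
}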
@@ -1703,9 +1706,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1703 if (root_task_group_empty()) 1706 if (root_task_group_empty())
1704 return; 1707 return;
1705 1708
1706 spin_unlock(&rq->lock); 1709 raw_spin_unlock(&rq->lock);
1707 update_shares(sd); 1710 update_shares(sd);
1708 spin_lock(&rq->lock); 1711 raw_spin_lock(&rq->lock);
1709} 1712}
1710 1713
1711static void update_h_load(long cpu) 1714static void update_h_load(long cpu)
@@ -1745,7 +1748,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1745 __acquires(busiest->lock) 1748 __acquires(busiest->lock)
1746 __acquires(this_rq->lock) 1749 __acquires(this_rq->lock)
1747{ 1750{
1748 spin_unlock(&this_rq->lock); 1751 raw_spin_unlock(&this_rq->lock);
1749 double_rq_lock(this_rq, busiest); 1752 double_rq_lock(this_rq, busiest);
1750 1753
1751 return 1; 1754 return 1;
@@ -1766,14 +1769,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1766{ 1769{
1767 int ret = 0; 1770 int ret = 0;
1768 1771
1769 if (unlikely(!spin_trylock(&busiest->lock))) { 1772 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1770 if (busiest < this_rq) { 1773 if (busiest < this_rq) {
1771 spin_unlock(&this_rq->lock); 1774 raw_spin_unlock(&this_rq->lock);
1772 spin_lock(&busiest->lock); 1775 raw_spin_lock(&busiest->lock);
1773 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1776 raw_spin_lock_nested(&this_rq->lock,
1777 SINGLE_DEPTH_NESTING);
1774 ret = 1; 1778 ret = 1;
1775 } else 1779 } else
1776 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1780 raw_spin_lock_nested(&busiest->lock,
1781 SINGLE_DEPTH_NESTING);
1777 } 1782 }
1778 return ret; 1783 return ret;
1779} 1784}
@@ -1787,7 +1792,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1787{ 1792{
1788 if (unlikely(!irqs_disabled())) { 1793 if (unlikely(!irqs_disabled())) {
1789 /* printk() doesn't work good under rq->lock */ 1794 /* printk() doesn't work good under rq->lock */
1790 spin_unlock(&this_rq->lock); 1795 raw_spin_unlock(&this_rq->lock);
1791 BUG_ON(1); 1796 BUG_ON(1);
1792 } 1797 }
1793 1798
@@ -1797,7 +1802,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1797static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1802static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1798 __releases(busiest->lock) 1803 __releases(busiest->lock)
1799{ 1804{
1800 spin_unlock(&busiest->lock); 1805 raw_spin_unlock(&busiest->lock);
1801 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1802} 1807}
1803#endif 1808#endif
@@ -1812,6 +1817,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1812#endif 1817#endif
1813 1818
1814static void calc_load_account_active(struct rq *this_rq); 1819static void calc_load_account_active(struct rq *this_rq);
1820static void update_sysctl(void);
1821static int get_update_sysctl_factor(void);
1822
1823static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1824{
1825 set_task_rq(p, cpu);
1826#ifdef CONFIG_SMP
1827 /*
1828 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1829 * successfuly executed on another CPU. We must ensure that updates of
1830 * per-task data have been completed by this moment.
1831 */
1832 smp_wmb();
1833 task_thread_info(p)->cpu = cpu;
1834#endif
1835}
1815 1836
1816#include "sched_stats.h" 1837#include "sched_stats.h"
1817#include "sched_idletask.c" 1838#include "sched_idletask.c"
@@ -1969,20 +1990,6 @@ inline int task_curr(const struct task_struct *p)
1969 return cpu_curr(task_cpu(p)) == p; 1990 return cpu_curr(task_cpu(p)) == p;
1970} 1991}
1971 1992
1972static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1973{
1974 set_task_rq(p, cpu);
1975#ifdef CONFIG_SMP
1976 /*
1977 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1978 * successfuly executed on another CPU. We must ensure that updates of
1979 * per-task data have been completed by this moment.
1980 */
1981 smp_wmb();
1982 task_thread_info(p)->cpu = cpu;
1983#endif
1984}
1985
1986static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1993static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1987 const struct sched_class *prev_class, 1994 const struct sched_class *prev_class,
1988 int oldprio, int running) 1995 int oldprio, int running)
@@ -2004,17 +2011,17 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2004{ 2011{
2005 s64 delta; 2012 s64 delta;
2006 2013
2014 if (p->sched_class != &fair_sched_class)
2015 return 0;
2016
2007 /* 2017 /*
2008 * Buddy candidates are cache hot: 2018 * Buddy candidates are cache hot:
2009 */ 2019 */
2010 if (sched_feat(CACHE_HOT_BUDDY) && 2020 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2011 (&p->se == cfs_rq_of(&p->se)->next || 2021 (&p->se == cfs_rq_of(&p->se)->next ||
2012 &p->se == cfs_rq_of(&p->se)->last)) 2022 &p->se == cfs_rq_of(&p->se)->last))
2013 return 1; 2023 return 1;
2014 2024
2015 if (p->sched_class != &fair_sched_class)
2016 return 0;
2017
2018 if (sysctl_sched_migration_cost == -1) 2025 if (sysctl_sched_migration_cost == -1)
2019 return 1; 2026 return 1;
2020 if (sysctl_sched_migration_cost == 0) 2027 if (sysctl_sched_migration_cost == 0)
@@ -2025,39 +2032,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2025 return delta < (s64)sysctl_sched_migration_cost; 2032 return delta < (s64)sysctl_sched_migration_cost;
2026} 2033}
2027 2034
2028
2029void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2035void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2030{ 2036{
2031 int old_cpu = task_cpu(p); 2037#ifdef CONFIG_SCHED_DEBUG
2032 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 2038 /*
2033 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2039 * We should never call set_task_cpu() on a blocked task,
2034 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2040 * ttwu() will sort out the placement.
2035 u64 clock_offset; 2041 */
2036 2042 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2037 clock_offset = old_rq->clock - new_rq->clock; 2043 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2044#endif
2038 2045
2039 trace_sched_migrate_task(p, new_cpu); 2046 trace_sched_migrate_task(p, new_cpu);
2040 2047
2041#ifdef CONFIG_SCHEDSTATS 2048 if (task_cpu(p) != new_cpu) {
2042 if (p->se.wait_start)
2043 p->se.wait_start -= clock_offset;
2044 if (p->se.sleep_start)
2045 p->se.sleep_start -= clock_offset;
2046 if (p->se.block_start)
2047 p->se.block_start -= clock_offset;
2048#endif
2049 if (old_cpu != new_cpu) {
2050 p->se.nr_migrations++; 2049 p->se.nr_migrations++;
2051 new_rq->nr_migrations_in++; 2050 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2052#ifdef CONFIG_SCHEDSTATS
2053 if (task_hot(p, old_rq->clock, NULL))
2054 schedstat_inc(p, se.nr_forced2_migrations);
2055#endif
2056 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2057 1, 1, NULL, 0);
2058 } 2051 }
2059 p->se.vruntime -= old_cfsrq->min_vruntime -
2060 new_cfsrq->min_vruntime;
2061 2052
2062 __set_task_cpu(p, new_cpu); 2053 __set_task_cpu(p, new_cpu);
2063} 2054}
@@ -2082,12 +2073,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2082 2073
2083 /* 2074 /*
2084 * If the task is not on a runqueue (and not running), then 2075 * If the task is not on a runqueue (and not running), then
2085 * it is sufficient to simply update the task's cpu field. 2076 * the next wake-up will properly place the task.
2086 */ 2077 */
2087 if (!p->se.on_rq && !task_running(rq, p)) { 2078 if (!p->se.on_rq && !task_running(rq, p))
2088 set_task_cpu(p, dest_cpu);
2089 return 0; 2079 return 0;
2090 }
2091 2080
2092 init_completion(&req->done); 2081 init_completion(&req->done);
2093 req->task = p; 2082 req->task = p;
@@ -2292,6 +2281,75 @@ void task_oncpu_function_call(struct task_struct *p,
2292 preempt_enable(); 2281 preempt_enable();
2293} 2282}
2294 2283
2284#ifdef CONFIG_SMP
2285static int select_fallback_rq(int cpu, struct task_struct *p)
2286{
2287 int dest_cpu;
2288 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2289
2290 /* Look for allowed, online CPU in same node. */
2291 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2292 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2293 return dest_cpu;
2294
2295 /* Any allowed, online CPU? */
2296 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2297 if (dest_cpu < nr_cpu_ids)
2298 return dest_cpu;
2299
2300 /* No more Mr. Nice Guy. */
2301 if (dest_cpu >= nr_cpu_ids) {
2302 rcu_read_lock();
2303 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2304 rcu_read_unlock();
2305 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2306
2307 /*
2308 * Don't tell them about moving exiting tasks or
2309 * kernel threads (both mm NULL), since they never
2310 * leave kernel.
2311 */
2312 if (p->mm && printk_ratelimit()) {
2313 printk(KERN_INFO "process %d (%s) no "
2314 "longer affine to cpu%d\n",
2315 task_pid_nr(p), p->comm, cpu);
2316 }
2317 }
2318
2319 return dest_cpu;
2320}
2321
2322/*
2323 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2324 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2325 * by:
2326 *
2327 * exec: is unstable, retry loop
2328 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2329 */
2330static inline
2331int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2332{
2333 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2334
2335 /*
2336 * In order not to call set_task_cpu() on a blocking task we need
2337 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2338 * cpu.
2339 *
2340 * Since this is common to all placement strategies, this lives here.
2341 *
2342 * [ this allows ->select_task() to simply return task_cpu(p) and
2343 * not worry about this generic constraint ]
2344 */
2345 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2346 !cpu_online(cpu)))
2347 cpu = select_fallback_rq(task_cpu(p), p);
2348
2349 return cpu;
2350}
2351#endif
2352
2295/*** 2353/***
2296 * try_to_wake_up - wake up a thread 2354 * try_to_wake_up - wake up a thread
2297 * @p: the to-be-woken-up thread 2355 * @p: the to-be-woken-up thread
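select_fallback_rq() above is built from a handful of cpumask primitives; a simplified, hedged restatement of the same "same node, then any allowed active CPU, then ignore the mask" ordering (this is not the scheduler's code and omits the cpuset widening and the ratelimited printk):

#include <linux/cpumask.h>
#include <linux/topology.h>

static int demo_pick_fallback_cpu(int cpu, const struct cpumask *allowed)
{
        const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
        int dest;

        /* 1) an allowed, active CPU on the same node */
        for_each_cpu_and(dest, nodemask, cpu_active_mask)
                if (cpumask_test_cpu(dest, allowed))
                        return dest;

        /* 2) any allowed, active CPU (>= nr_cpu_ids means none found) */
        dest = cpumask_any_and(allowed, cpu_active_mask);
        if (dest < nr_cpu_ids)
                return dest;

        /* 3) last resort: any active CPU at all */
        return cpumask_any(cpu_active_mask);
}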
@@ -2311,7 +2369,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2311{ 2369{
2312 int cpu, orig_cpu, this_cpu, success = 0; 2370 int cpu, orig_cpu, this_cpu, success = 0;
2313 unsigned long flags; 2371 unsigned long flags;
2314 struct rq *rq; 2372 struct rq *rq, *orig_rq;
2315 2373
2316 if (!sched_feat(SYNC_WAKEUPS)) 2374 if (!sched_feat(SYNC_WAKEUPS))
2317 wake_flags &= ~WF_SYNC; 2375 wake_flags &= ~WF_SYNC;
@@ -2319,7 +2377,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2319 this_cpu = get_cpu(); 2377 this_cpu = get_cpu();
2320 2378
2321 smp_wmb(); 2379 smp_wmb();
2322 rq = task_rq_lock(p, &flags); 2380 rq = orig_rq = task_rq_lock(p, &flags);
2323 update_rq_clock(rq); 2381 update_rq_clock(rq);
2324 if (!(p->state & state)) 2382 if (!(p->state & state))
2325 goto out; 2383 goto out;
@@ -2343,13 +2401,19 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2343 if (task_contributes_to_load(p)) 2401 if (task_contributes_to_load(p))
2344 rq->nr_uninterruptible--; 2402 rq->nr_uninterruptible--;
2345 p->state = TASK_WAKING; 2403 p->state = TASK_WAKING;
2346 task_rq_unlock(rq, &flags);
2347 2404
2348 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2405 if (p->sched_class->task_waking)
2406 p->sched_class->task_waking(rq, p);
2407
2408 __task_rq_unlock(rq);
2409
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2349 if (cpu != orig_cpu) 2411 if (cpu != orig_cpu)
2350 set_task_cpu(p, cpu); 2412 set_task_cpu(p, cpu);
2351 2413
2352 rq = task_rq_lock(p, &flags); 2414 rq = __task_rq_lock(p);
2415 update_rq_clock(rq);
2416
2353 WARN_ON(p->state != TASK_WAKING); 2417 WARN_ON(p->state != TASK_WAKING);
2354 cpu = task_cpu(p); 2418 cpu = task_cpu(p);
2355 2419
@@ -2404,8 +2468,19 @@ out_running:
2404 2468
2405 p->state = TASK_RUNNING; 2469 p->state = TASK_RUNNING;
2406#ifdef CONFIG_SMP 2470#ifdef CONFIG_SMP
2407 if (p->sched_class->task_wake_up) 2471 if (p->sched_class->task_woken)
2408 p->sched_class->task_wake_up(rq, p); 2472 p->sched_class->task_woken(rq, p);
2473
2474 if (unlikely(rq->idle_stamp)) {
2475 u64 delta = rq->clock - rq->idle_stamp;
2476 u64 max = 2*sysctl_sched_migration_cost;
2477
2478 if (delta > max)
2479 rq->avg_idle = max;
2480 else
2481 update_avg(&rq->avg_idle, delta);
2482 rq->idle_stamp = 0;
2483 }
2409#endif 2484#endif
2410out: 2485out:
2411 task_rq_unlock(rq, &flags); 2486 task_rq_unlock(rq, &flags);
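The new idle_stamp/avg_idle pair gives each runqueue an estimate of how long it typically stays idle: idle_balance() (later in this diff) records idle_stamp and skips new-idle balancing when avg_idle is already below sysctl_sched_migration_cost, while the wakeup path above folds the measured idle span into avg_idle, clamped at twice the migration cost. A hedged sketch of that arithmetic, assuming update_avg() is the existing 1/8-weight running average in sched.c:

#include <linux/types.h>

/* Assumption: update_avg() keeps a running average with 1/8 weight. */
static void demo_update_avg(u64 *avg, u64 sample)
{
        s64 diff = sample - *avg;

        *avg += diff >> 3;
}

/* Mirrors the wakeup-side update shown above; values are illustrative. */
static void demo_note_idle_end(u64 *avg_idle, u64 idle_delta, u64 migration_cost)
{
        u64 max = 2 * migration_cost;

        if (idle_delta > max)
                *avg_idle = max;        /* clamp very long idle periods */
        else
                demo_update_avg(avg_idle, idle_delta);
}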
@@ -2452,7 +2527,6 @@ static void __sched_fork(struct task_struct *p)
2452 p->se.avg_overlap = 0; 2527 p->se.avg_overlap = 0;
2453 p->se.start_runtime = 0; 2528 p->se.start_runtime = 0;
2454 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2529 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2455 p->se.avg_running = 0;
2456 2530
2457#ifdef CONFIG_SCHEDSTATS 2531#ifdef CONFIG_SCHEDSTATS
2458 p->se.wait_start = 0; 2532 p->se.wait_start = 0;
@@ -2474,7 +2548,6 @@ static void __sched_fork(struct task_struct *p)
2474 p->se.nr_failed_migrations_running = 0; 2548 p->se.nr_failed_migrations_running = 0;
2475 p->se.nr_failed_migrations_hot = 0; 2549 p->se.nr_failed_migrations_hot = 0;
2476 p->se.nr_forced_migrations = 0; 2550 p->se.nr_forced_migrations = 0;
2477 p->se.nr_forced2_migrations = 0;
2478 2551
2479 p->se.nr_wakeups = 0; 2552 p->se.nr_wakeups = 0;
2480 p->se.nr_wakeups_sync = 0; 2553 p->se.nr_wakeups_sync = 0;
@@ -2495,14 +2568,6 @@ static void __sched_fork(struct task_struct *p)
2495#ifdef CONFIG_PREEMPT_NOTIFIERS 2568#ifdef CONFIG_PREEMPT_NOTIFIERS
2496 INIT_HLIST_HEAD(&p->preempt_notifiers); 2569 INIT_HLIST_HEAD(&p->preempt_notifiers);
2497#endif 2570#endif
2498
2499 /*
2500 * We mark the process as running here, but have not actually
2501 * inserted it onto the runqueue yet. This guarantees that
2502 * nobody will actually run it, and a signal or other external
2503 * event cannot wake it up and insert it on the runqueue either.
2504 */
2505 p->state = TASK_RUNNING;
2506} 2571}
2507 2572
2508/* 2573/*
@@ -2513,6 +2578,12 @@ void sched_fork(struct task_struct *p, int clone_flags)
2513 int cpu = get_cpu(); 2578 int cpu = get_cpu();
2514 2579
2515 __sched_fork(p); 2580 __sched_fork(p);
2581 /*
2582 * We mark the process as waking here. This guarantees that
2583 * nobody will actually run it, and a signal or other external
2584 * event cannot wake it up and insert it on the runqueue either.
2585 */
2586 p->state = TASK_WAKING;
2516 2587
2517 /* 2588 /*
2518 * Revert to default priority/policy on fork if requested. 2589 * Revert to default priority/policy on fork if requested.
@@ -2544,9 +2615,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2544 if (!rt_prio(p->prio)) 2615 if (!rt_prio(p->prio))
2545 p->sched_class = &fair_sched_class; 2616 p->sched_class = &fair_sched_class;
2546 2617
2547#ifdef CONFIG_SMP 2618 if (p->sched_class->task_fork)
2548 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2619 p->sched_class->task_fork(p);
2549#endif 2620
2550 set_task_cpu(p, cpu); 2621 set_task_cpu(p, cpu);
2551 2622
2552#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2623#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2576,28 +2647,35 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2576{ 2647{
2577 unsigned long flags; 2648 unsigned long flags;
2578 struct rq *rq; 2649 struct rq *rq;
2650 int cpu = get_cpu();
2651
2652#ifdef CONFIG_SMP
2653 /*
2654 * Fork balancing, do it here and not earlier because:
2655 * - cpus_allowed can change in the fork path
2656 * - any previously selected cpu might disappear through hotplug
2657 *
2658 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2659 * ->cpus_allowed is stable, we have preemption disabled, meaning
2660 * cpu_online_mask is stable.
2661 */
2662 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2663 set_task_cpu(p, cpu);
2664#endif
2579 2665
2580 rq = task_rq_lock(p, &flags); 2666 rq = task_rq_lock(p, &flags);
2581 BUG_ON(p->state != TASK_RUNNING); 2667 BUG_ON(p->state != TASK_WAKING);
2668 p->state = TASK_RUNNING;
2582 update_rq_clock(rq); 2669 update_rq_clock(rq);
2583 2670 activate_task(rq, p, 0);
2584 if (!p->sched_class->task_new || !current->se.on_rq) {
2585 activate_task(rq, p, 0);
2586 } else {
2587 /*
2588 * Let the scheduling class do new task startup
2589 * management (if any):
2590 */
2591 p->sched_class->task_new(rq, p);
2592 inc_nr_running(rq);
2593 }
2594 trace_sched_wakeup_new(rq, p, 1); 2671 trace_sched_wakeup_new(rq, p, 1);
2595 check_preempt_curr(rq, p, WF_FORK); 2672 check_preempt_curr(rq, p, WF_FORK);
2596#ifdef CONFIG_SMP 2673#ifdef CONFIG_SMP
2597 if (p->sched_class->task_wake_up) 2674 if (p->sched_class->task_woken)
2598 p->sched_class->task_wake_up(rq, p); 2675 p->sched_class->task_woken(rq, p);
2599#endif 2676#endif
2600 task_rq_unlock(rq, &flags); 2677 task_rq_unlock(rq, &flags);
2678 put_cpu();
2601} 2679}
2602 2680
2603#ifdef CONFIG_PREEMPT_NOTIFIERS 2681#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2747,10 +2825,10 @@ static inline void post_schedule(struct rq *rq)
2747 if (rq->post_schedule) { 2825 if (rq->post_schedule) {
2748 unsigned long flags; 2826 unsigned long flags;
2749 2827
2750 spin_lock_irqsave(&rq->lock, flags); 2828 raw_spin_lock_irqsave(&rq->lock, flags);
2751 if (rq->curr->sched_class->post_schedule) 2829 if (rq->curr->sched_class->post_schedule)
2752 rq->curr->sched_class->post_schedule(rq); 2830 rq->curr->sched_class->post_schedule(rq);
2753 spin_unlock_irqrestore(&rq->lock, flags); 2831 raw_spin_unlock_irqrestore(&rq->lock, flags);
2754 2832
2755 rq->post_schedule = 0; 2833 rq->post_schedule = 0;
2756 } 2834 }
@@ -2814,14 +2892,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2814 */ 2892 */
2815 arch_start_context_switch(prev); 2893 arch_start_context_switch(prev);
2816 2894
2817 if (unlikely(!mm)) { 2895 if (likely(!mm)) {
2818 next->active_mm = oldmm; 2896 next->active_mm = oldmm;
2819 atomic_inc(&oldmm->mm_count); 2897 atomic_inc(&oldmm->mm_count);
2820 enter_lazy_tlb(oldmm, next); 2898 enter_lazy_tlb(oldmm, next);
2821 } else 2899 } else
2822 switch_mm(oldmm, mm, next); 2900 switch_mm(oldmm, mm, next);
2823 2901
2824 if (unlikely(!prev->mm)) { 2902 if (likely(!prev->mm)) {
2825 prev->active_mm = NULL; 2903 prev->active_mm = NULL;
2826 rq->prev_mm = oldmm; 2904 rq->prev_mm = oldmm;
2827 } 2905 }
@@ -2984,15 +3062,6 @@ static void calc_load_account_active(struct rq *this_rq)
2984} 3062}
2985 3063
2986/* 3064/*
2987 * Externally visible per-cpu scheduler statistics:
2988 * cpu_nr_migrations(cpu) - number of migrations into that cpu
2989 */
2990u64 cpu_nr_migrations(int cpu)
2991{
2992 return cpu_rq(cpu)->nr_migrations_in;
2993}
2994
2995/*
2996 * Update rq->cpu_load[] statistics. This function is usually called every 3065 * Update rq->cpu_load[] statistics. This function is usually called every
2997 * scheduler tick (TICK_NSEC). 3066 * scheduler tick (TICK_NSEC).
2998 */ 3067 */
@@ -3041,15 +3110,15 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3041{ 3110{
3042 BUG_ON(!irqs_disabled()); 3111 BUG_ON(!irqs_disabled());
3043 if (rq1 == rq2) { 3112 if (rq1 == rq2) {
3044 spin_lock(&rq1->lock); 3113 raw_spin_lock(&rq1->lock);
3045 __acquire(rq2->lock); /* Fake it out ;) */ 3114 __acquire(rq2->lock); /* Fake it out ;) */
3046 } else { 3115 } else {
3047 if (rq1 < rq2) { 3116 if (rq1 < rq2) {
3048 spin_lock(&rq1->lock); 3117 raw_spin_lock(&rq1->lock);
3049 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 3118 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3050 } else { 3119 } else {
3051 spin_lock(&rq2->lock); 3120 raw_spin_lock(&rq2->lock);
3052 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 3121 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3053 } 3122 }
3054 } 3123 }
3055 update_rq_clock(rq1); 3124 update_rq_clock(rq1);
@@ -3066,29 +3135,44 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3066 __releases(rq1->lock) 3135 __releases(rq1->lock)
3067 __releases(rq2->lock) 3136 __releases(rq2->lock)
3068{ 3137{
3069 spin_unlock(&rq1->lock); 3138 raw_spin_unlock(&rq1->lock);
3070 if (rq1 != rq2) 3139 if (rq1 != rq2)
3071 spin_unlock(&rq2->lock); 3140 raw_spin_unlock(&rq2->lock);
3072 else 3141 else
3073 __release(rq2->lock); 3142 __release(rq2->lock);
3074} 3143}
3075 3144
3076/* 3145/*
3077 * If dest_cpu is allowed for this process, migrate the task to it. 3146 * sched_exec - execve() is a valuable balancing opportunity, because at
3078 * This is accomplished by forcing the cpu_allowed mask to only 3147 * this point the task has the smallest effective memory and cache footprint.
3079 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3080 * the cpu_allowed mask is restored.
3081 */ 3148 */
3082static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3149void sched_exec(void)
3083{ 3150{
3151 struct task_struct *p = current;
3084 struct migration_req req; 3152 struct migration_req req;
3153 int dest_cpu, this_cpu;
3085 unsigned long flags; 3154 unsigned long flags;
3086 struct rq *rq; 3155 struct rq *rq;
3087 3156
3157again:
3158 this_cpu = get_cpu();
3159 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3160 if (dest_cpu == this_cpu) {
3161 put_cpu();
3162 return;
3163 }
3164
3088 rq = task_rq_lock(p, &flags); 3165 rq = task_rq_lock(p, &flags);
3166 put_cpu();
3167
3168 /*
3169 * select_task_rq() can race against ->cpus_allowed
3170 */
3089 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3171 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3090 || unlikely(!cpu_active(dest_cpu))) 3172 || unlikely(!cpu_active(dest_cpu))) {
3091 goto out; 3173 task_rq_unlock(rq, &flags);
3174 goto again;
3175 }
3092 3176
3093 /* force the process onto the specified CPU */ 3177 /* force the process onto the specified CPU */
3094 if (migrate_task(p, dest_cpu, &req)) { 3178 if (migrate_task(p, dest_cpu, &req)) {
@@ -3103,24 +3187,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3103 3187
3104 return; 3188 return;
3105 } 3189 }
3106out:
3107 task_rq_unlock(rq, &flags); 3190 task_rq_unlock(rq, &flags);
3108} 3191}
3109 3192
3110/* 3193/*
3111 * sched_exec - execve() is a valuable balancing opportunity, because at
3112 * this point the task has the smallest effective memory and cache footprint.
3113 */
3114void sched_exec(void)
3115{
3116 int new_cpu, this_cpu = get_cpu();
3117 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3118 put_cpu();
3119 if (new_cpu != this_cpu)
3120 sched_migrate_task(current, new_cpu);
3121}
3122
3123/*
3124 * pull_task - move a task from a remote runqueue to the local runqueue. 3194 * pull_task - move a task from a remote runqueue to the local runqueue.
3125 * Both runqueues must be locked. 3195 * Both runqueues must be locked.
3126 */ 3196 */
@@ -3130,10 +3200,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3130 deactivate_task(src_rq, p, 0); 3200 deactivate_task(src_rq, p, 0);
3131 set_task_cpu(p, this_cpu); 3201 set_task_cpu(p, this_cpu);
3132 activate_task(this_rq, p, 0); 3202 activate_task(this_rq, p, 0);
3133 /*
3134 * Note that idle threads have a prio of MAX_PRIO, for this test
3135 * to be always true for them.
3136 */
3137 check_preempt_curr(this_rq, p, 0); 3203 check_preempt_curr(this_rq, p, 0);
3138} 3204}
3139 3205
@@ -3656,6 +3722,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
3656 3722
3657/** 3723/**
3658 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3724 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3725 * @sd: The sched_domain whose statistics are to be updated.
3659 * @group: sched_group whose statistics are to be updated. 3726 * @group: sched_group whose statistics are to be updated.
3660 * @this_cpu: Cpu for which load balance is currently performed. 3727 * @this_cpu: Cpu for which load balance is currently performed.
3661 * @idle: Idle status of this_cpu 3728 * @idle: Idle status of this_cpu
@@ -4091,7 +4158,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4091 unsigned long flags; 4158 unsigned long flags;
4092 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4159 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4093 4160
4094 cpumask_setall(cpus); 4161 cpumask_copy(cpus, cpu_active_mask);
4095 4162
4096 /* 4163 /*
4097 * When power savings policy is enabled for the parent domain, idle 4164 * When power savings policy is enabled for the parent domain, idle
@@ -4164,14 +4231,15 @@ redo:
4164 4231
4165 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 4232 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4166 4233
4167 spin_lock_irqsave(&busiest->lock, flags); 4234 raw_spin_lock_irqsave(&busiest->lock, flags);
4168 4235
4169 /* don't kick the migration_thread, if the curr 4236 /* don't kick the migration_thread, if the curr
4170 * task on busiest cpu can't be moved to this_cpu 4237 * task on busiest cpu can't be moved to this_cpu
4171 */ 4238 */
4172 if (!cpumask_test_cpu(this_cpu, 4239 if (!cpumask_test_cpu(this_cpu,
4173 &busiest->curr->cpus_allowed)) { 4240 &busiest->curr->cpus_allowed)) {
4174 spin_unlock_irqrestore(&busiest->lock, flags); 4241 raw_spin_unlock_irqrestore(&busiest->lock,
4242 flags);
4175 all_pinned = 1; 4243 all_pinned = 1;
4176 goto out_one_pinned; 4244 goto out_one_pinned;
4177 } 4245 }
@@ -4181,7 +4249,7 @@ redo:
4181 busiest->push_cpu = this_cpu; 4249 busiest->push_cpu = this_cpu;
4182 active_balance = 1; 4250 active_balance = 1;
4183 } 4251 }
4184 spin_unlock_irqrestore(&busiest->lock, flags); 4252 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4185 if (active_balance) 4253 if (active_balance)
4186 wake_up_process(busiest->migration_thread); 4254 wake_up_process(busiest->migration_thread);
4187 4255
@@ -4254,7 +4322,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4254 int all_pinned = 0; 4322 int all_pinned = 0;
4255 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4323 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4256 4324
4257 cpumask_setall(cpus); 4325 cpumask_copy(cpus, cpu_active_mask);
4258 4326
4259 /* 4327 /*
4260 * When power savings policy is enabled for the parent domain, idle 4328 * When power savings policy is enabled for the parent domain, idle
@@ -4363,10 +4431,10 @@ redo:
4363 /* 4431 /*
4364 * Should not call ttwu while holding a rq->lock 4432 * Should not call ttwu while holding a rq->lock
4365 */ 4433 */
4366 spin_unlock(&this_rq->lock); 4434 raw_spin_unlock(&this_rq->lock);
4367 if (active_balance) 4435 if (active_balance)
4368 wake_up_process(busiest->migration_thread); 4436 wake_up_process(busiest->migration_thread);
4369 spin_lock(&this_rq->lock); 4437 raw_spin_lock(&this_rq->lock);
4370 4438
4371 } else 4439 } else
4372 sd->nr_balance_failed = 0; 4440 sd->nr_balance_failed = 0;
@@ -4394,6 +4462,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4394 int pulled_task = 0; 4462 int pulled_task = 0;
4395 unsigned long next_balance = jiffies + HZ; 4463 unsigned long next_balance = jiffies + HZ;
4396 4464
4465 this_rq->idle_stamp = this_rq->clock;
4466
4467 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4468 return;
4469
4397 for_each_domain(this_cpu, sd) { 4470 for_each_domain(this_cpu, sd) {
4398 unsigned long interval; 4471 unsigned long interval;
4399 4472
@@ -4408,8 +4481,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4408 interval = msecs_to_jiffies(sd->balance_interval); 4481 interval = msecs_to_jiffies(sd->balance_interval);
4409 if (time_after(next_balance, sd->last_balance + interval)) 4482 if (time_after(next_balance, sd->last_balance + interval))
4410 next_balance = sd->last_balance + interval; 4483 next_balance = sd->last_balance + interval;
4411 if (pulled_task) 4484 if (pulled_task) {
4485 this_rq->idle_stamp = 0;
4412 break; 4486 break;
4487 }
4413 } 4488 }
4414 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4489 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4415 /* 4490 /*
@@ -4644,7 +4719,7 @@ int select_nohz_load_balancer(int stop_tick)
4644 cpumask_set_cpu(cpu, nohz.cpu_mask); 4719 cpumask_set_cpu(cpu, nohz.cpu_mask);
4645 4720
4646 /* time for ilb owner also to sleep */ 4721 /* time for ilb owner also to sleep */
4647 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4722 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4648 if (atomic_read(&nohz.load_balancer) == cpu) 4723 if (atomic_read(&nohz.load_balancer) == cpu)
4649 atomic_set(&nohz.load_balancer, -1); 4724 atomic_set(&nohz.load_balancer, -1);
4650 return 0; 4725 return 0;
@@ -5011,8 +5086,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
5011 p->gtime = cputime_add(p->gtime, cputime); 5086 p->gtime = cputime_add(p->gtime, cputime);
5012 5087
5013 /* Add guest time to cpustat. */ 5088 /* Add guest time to cpustat. */
5014 cpustat->user = cputime64_add(cpustat->user, tmp); 5089 if (TASK_NICE(p) > 0) {
5015 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5090 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5091 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5092 } else {
5093 cpustat->user = cputime64_add(cpustat->user, tmp);
5094 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5095 }
5016} 5096}
5017 5097
5018/* 5098/*
@@ -5127,60 +5207,86 @@ void account_idle_ticks(unsigned long ticks)
5127 * Use precise platform statistics if available: 5207 * Use precise platform statistics if available:
5128 */ 5208 */
5129#ifdef CONFIG_VIRT_CPU_ACCOUNTING 5209#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5130cputime_t task_utime(struct task_struct *p) 5210void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5131{ 5211{
5132 return p->utime; 5212 *ut = p->utime;
5213 *st = p->stime;
5133} 5214}
5134 5215
5135cputime_t task_stime(struct task_struct *p) 5216void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5136{ 5217{
5137 return p->stime; 5218 struct task_cputime cputime;
5219
5220 thread_group_cputime(p, &cputime);
5221
5222 *ut = cputime.utime;
5223 *st = cputime.stime;
5138} 5224}
5139#else 5225#else
5140cputime_t task_utime(struct task_struct *p) 5226
5227#ifndef nsecs_to_cputime
5228# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5229#endif
5230
5231void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5141{ 5232{
5142 clock_t utime = cputime_to_clock_t(p->utime), 5233 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5143 total = utime + cputime_to_clock_t(p->stime);
5144 u64 temp;
5145 5234
5146 /* 5235 /*
5147 * Use CFS's precise accounting: 5236 * Use CFS's precise accounting:
5148 */ 5237 */
5149 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5238 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5150 5239
5151 if (total) { 5240 if (total) {
5152 temp *= utime; 5241 u64 temp;
5242
5243 temp = (u64)(rtime * utime);
5153 do_div(temp, total); 5244 do_div(temp, total);
5154 } 5245 utime = (cputime_t)temp;
5155 utime = (clock_t)temp; 5246 } else
5247 utime = rtime;
5248
5249 /*
5250 * Compare with previous values, to keep monotonicity:
5251 */
5252 p->prev_utime = max(p->prev_utime, utime);
5253 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5156 5254
5157 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5255 *ut = p->prev_utime;
5158 return p->prev_utime; 5256 *st = p->prev_stime;
5159} 5257}
5160 5258
5161cputime_t task_stime(struct task_struct *p) 5259/*
5260 * Must be called with siglock held.
5261 */
5262void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5162{ 5263{
5163 clock_t stime; 5264 struct signal_struct *sig = p->signal;
5265 struct task_cputime cputime;
5266 cputime_t rtime, utime, total;
5164 5267
5165 /* 5268 thread_group_cputime(p, &cputime);
5166 * Use CFS's precise accounting. (we subtract utime from
5167 * the total, to make sure the total observed by userspace
5168 * grows monotonically - apps rely on that):
5169 */
5170 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5171 cputime_to_clock_t(task_utime(p));
5172 5269
5173 if (stime >= 0) 5270 total = cputime_add(cputime.utime, cputime.stime);
5174 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5271 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5175 5272
5176 return p->prev_stime; 5273 if (total) {
5177} 5274 u64 temp;
5178#endif
5179 5275
5180inline cputime_t task_gtime(struct task_struct *p) 5276 temp = (u64)(rtime * cputime.utime);
5181{ 5277 do_div(temp, total);
5182 return p->gtime; 5278 utime = (cputime_t)temp;
5279 } else
5280 utime = rtime;
5281
5282 sig->prev_utime = max(sig->prev_utime, utime);
5283 sig->prev_stime = max(sig->prev_stime,
5284 cputime_sub(rtime, sig->prev_utime));
5285
5286 *ut = sig->prev_utime;
5287 *st = sig->prev_stime;
5183} 5288}
5289#endif
5184 5290
5185/* 5291/*
5186 * This function gets called by the timer code, with HZ frequency. 5292 * This function gets called by the timer code, with HZ frequency.
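The rewritten task_times() splits the precisely accounted runtime (rtime, from se.sum_exec_runtime) between user and system time in the ratio of the tick-sampled utime/stime, then keeps the reported values monotonic via prev_utime/prev_stime. A small hedged sketch of just that arithmetic in plain integer types (no cputime_t wrappers); for example rtime=900 with utime=2 ticks and stime=1 tick yields ut=600, st=300 when the previous values were smaller:

#include <linux/kernel.h>
#include <linux/types.h>
#include <asm/div64.h>

/* Illustrative: rtime split in the utime:stime ratio, kept monotonic. */
static void demo_task_times(u64 rtime, u32 utime_ticks, u32 stime_ticks,
                            u64 *prev_utime, u64 *prev_stime,
                            u64 *ut, u64 *st)
{
        u32 total = utime_ticks + stime_ticks;
        u64 utime;

        if (total) {
                u64 temp = rtime * utime_ticks;

                do_div(temp, total);    /* utime = rtime * utime / (utime + stime) */
                utime = temp;
        } else {
                utime = rtime;          /* no samples yet: attribute it all to user */
        }

        /* Never report less than before; stime is whatever remains of rtime. */
        *prev_utime = max(*prev_utime, utime);
        *prev_stime = max(*prev_stime, rtime - *prev_utime);

        *ut = *prev_utime;
        *st = *prev_stime;
}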
@@ -5197,11 +5303,11 @@ void scheduler_tick(void)
5197 5303
5198 sched_clock_tick(); 5304 sched_clock_tick();
5199 5305
5200 spin_lock(&rq->lock); 5306 raw_spin_lock(&rq->lock);
5201 update_rq_clock(rq); 5307 update_rq_clock(rq);
5202 update_cpu_load(rq); 5308 update_cpu_load(rq);
5203 curr->sched_class->task_tick(rq, curr, 0); 5309 curr->sched_class->task_tick(rq, curr, 0);
5204 spin_unlock(&rq->lock); 5310 raw_spin_unlock(&rq->lock);
5205 5311
5206 perf_event_task_tick(curr, cpu); 5312 perf_event_task_tick(curr, cpu);
5207 5313
@@ -5315,13 +5421,14 @@ static inline void schedule_debug(struct task_struct *prev)
5315#endif 5421#endif
5316} 5422}
5317 5423
5318static void put_prev_task(struct rq *rq, struct task_struct *p) 5424static void put_prev_task(struct rq *rq, struct task_struct *prev)
5319{ 5425{
5320 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5426 if (prev->state == TASK_RUNNING) {
5427 u64 runtime = prev->se.sum_exec_runtime;
5321 5428
5322 update_avg(&p->se.avg_running, runtime); 5429 runtime -= prev->se.prev_sum_exec_runtime;
5430 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5323 5431
5324 if (p->state == TASK_RUNNING) {
5325 /* 5432 /*
5326 * In order to avoid avg_overlap growing stale when we are 5433 * In order to avoid avg_overlap growing stale when we are
5327 * indeed overlapping and hence not getting put to sleep, grow 5434 * indeed overlapping and hence not getting put to sleep, grow
@@ -5331,12 +5438,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5331 * correlates to the amount of cache footprint a task can 5438 * correlates to the amount of cache footprint a task can
5332 * build up. 5439 * build up.
5333 */ 5440 */
5334 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5441 update_avg(&prev->se.avg_overlap, runtime);
5335 update_avg(&p->se.avg_overlap, runtime);
5336 } else {
5337 update_avg(&p->se.avg_running, 0);
5338 } 5442 }
5339 p->sched_class->put_prev_task(rq, p); 5443 prev->sched_class->put_prev_task(rq, prev);
5340} 5444}
5341 5445
5342/* 5446/*
@@ -5397,7 +5501,7 @@ need_resched_nonpreemptible:
5397 if (sched_feat(HRTICK)) 5501 if (sched_feat(HRTICK))
5398 hrtick_clear(rq); 5502 hrtick_clear(rq);
5399 5503
5400 spin_lock_irq(&rq->lock); 5504 raw_spin_lock_irq(&rq->lock);
5401 update_rq_clock(rq); 5505 update_rq_clock(rq);
5402 clear_tsk_need_resched(prev); 5506 clear_tsk_need_resched(prev);
5403 5507
@@ -5433,12 +5537,15 @@ need_resched_nonpreemptible:
5433 cpu = smp_processor_id(); 5537 cpu = smp_processor_id();
5434 rq = cpu_rq(cpu); 5538 rq = cpu_rq(cpu);
5435 } else 5539 } else
5436 spin_unlock_irq(&rq->lock); 5540 raw_spin_unlock_irq(&rq->lock);
5437 5541
5438 post_schedule(rq); 5542 post_schedule(rq);
5439 5543
5440 if (unlikely(reacquire_kernel_lock(current) < 0)) 5544 if (unlikely(reacquire_kernel_lock(current) < 0)) {
5545 prev = rq->curr;
5546 switch_count = &prev->nivcsw;
5441 goto need_resched_nonpreemptible; 5547 goto need_resched_nonpreemptible;
5548 }
5442 5549
5443 preempt_enable_no_resched(); 5550 preempt_enable_no_resched();
5444 if (need_resched()) 5551 if (need_resched())
@@ -5446,7 +5553,7 @@ need_resched_nonpreemptible:
5446} 5553}
5447EXPORT_SYMBOL(schedule); 5554EXPORT_SYMBOL(schedule);
5448 5555
5449#ifdef CONFIG_SMP 5556#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5450/* 5557/*
5451 * Look out! "owner" is an entirely speculative pointer 5558 * Look out! "owner" is an entirely speculative pointer
5452 * access and not reliable. 5559 * access and not reliable.
@@ -5850,14 +5957,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
5850 */ 5957 */
5851bool try_wait_for_completion(struct completion *x) 5958bool try_wait_for_completion(struct completion *x)
5852{ 5959{
5960 unsigned long flags;
5853 int ret = 1; 5961 int ret = 1;
5854 5962
5855 spin_lock_irq(&x->wait.lock); 5963 spin_lock_irqsave(&x->wait.lock, flags);
5856 if (!x->done) 5964 if (!x->done)
5857 ret = 0; 5965 ret = 0;
5858 else 5966 else
5859 x->done--; 5967 x->done--;
5860 spin_unlock_irq(&x->wait.lock); 5968 spin_unlock_irqrestore(&x->wait.lock, flags);
5861 return ret; 5969 return ret;
5862} 5970}
5863EXPORT_SYMBOL(try_wait_for_completion); 5971EXPORT_SYMBOL(try_wait_for_completion);
@@ -5872,12 +5980,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
5872 */ 5980 */
5873bool completion_done(struct completion *x) 5981bool completion_done(struct completion *x)
5874{ 5982{
5983 unsigned long flags;
5875 int ret = 1; 5984 int ret = 1;
5876 5985
5877 spin_lock_irq(&x->wait.lock); 5986 spin_lock_irqsave(&x->wait.lock, flags);
5878 if (!x->done) 5987 if (!x->done)
5879 ret = 0; 5988 ret = 0;
5880 spin_unlock_irq(&x->wait.lock); 5989 spin_unlock_irqrestore(&x->wait.lock, flags);
5881 return ret; 5990 return ret;
5882} 5991}
5883EXPORT_SYMBOL(completion_done); 5992EXPORT_SYMBOL(completion_done);
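try_wait_for_completion() and completion_done() now take the wait-queue lock with the irqsave/irqrestore variants, so they stay usable from callers that already run with interrupts disabled (spin_unlock_irq() would re-enable IRQs unconditionally). A hedged usage sketch of the kind of caller this enables, with illustrative names:

#include <linux/completion.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(demo_lock);
static DECLARE_COMPLETION(demo_done);

/* Safe whether or not the caller already had interrupts disabled. */
static bool demo_poll_done(void)
{
        unsigned long flags;
        bool done;

        spin_lock_irqsave(&demo_lock, flags);
        done = completion_done(&demo_done);     /* no longer re-enables IRQs */
        spin_unlock_irqrestore(&demo_lock, flags);

        return done;
}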
@@ -6140,22 +6249,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6140 BUG_ON(p->se.on_rq); 6249 BUG_ON(p->se.on_rq);
6141 6250
6142 p->policy = policy; 6251 p->policy = policy;
6143 switch (p->policy) {
6144 case SCHED_NORMAL:
6145 case SCHED_BATCH:
6146 case SCHED_IDLE:
6147 p->sched_class = &fair_sched_class;
6148 break;
6149 case SCHED_FIFO:
6150 case SCHED_RR:
6151 p->sched_class = &rt_sched_class;
6152 break;
6153 }
6154
6155 p->rt_priority = prio; 6252 p->rt_priority = prio;
6156 p->normal_prio = normal_prio(p); 6253 p->normal_prio = normal_prio(p);
6157 /* we are holding p->pi_lock already */ 6254 /* we are holding p->pi_lock already */
6158 p->prio = rt_mutex_getprio(p); 6255 p->prio = rt_mutex_getprio(p);
6256 if (rt_prio(p->prio))
6257 p->sched_class = &rt_sched_class;
6258 else
6259 p->sched_class = &fair_sched_class;
6159 set_load_weight(p); 6260 set_load_weight(p);
6160} 6261}
6161 6262
@@ -6270,7 +6371,7 @@ recheck:
6270 * make sure no PI-waiters arrive (or leave) while we are 6371 * make sure no PI-waiters arrive (or leave) while we are
6271 * changing the priority of the task: 6372 * changing the priority of the task:
6272 */ 6373 */
6273 spin_lock_irqsave(&p->pi_lock, flags); 6374 raw_spin_lock_irqsave(&p->pi_lock, flags);
6274 /* 6375 /*
6275 * To be able to change p->policy safely, the apropriate 6376 * To be able to change p->policy safely, the apropriate
6276 * runqueue lock must be held. 6377 * runqueue lock must be held.
@@ -6280,7 +6381,7 @@ recheck:
6280 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 6381 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6281 policy = oldpolicy = -1; 6382 policy = oldpolicy = -1;
6282 __task_rq_unlock(rq); 6383 __task_rq_unlock(rq);
6283 spin_unlock_irqrestore(&p->pi_lock, flags); 6384 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6284 goto recheck; 6385 goto recheck;
6285 } 6386 }
6286 update_rq_clock(rq); 6387 update_rq_clock(rq);
@@ -6304,7 +6405,7 @@ recheck:
6304 check_class_changed(rq, p, prev_class, oldprio, running); 6405 check_class_changed(rq, p, prev_class, oldprio, running);
6305 } 6406 }
6306 __task_rq_unlock(rq); 6407 __task_rq_unlock(rq);
6307 spin_unlock_irqrestore(&p->pi_lock, flags); 6408 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6308 6409
6309 rt_mutex_adjust_pi(p); 6410 rt_mutex_adjust_pi(p);
6310 6411
@@ -6404,7 +6505,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6404 return -EINVAL; 6505 return -EINVAL;
6405 6506
6406 retval = -ESRCH; 6507 retval = -ESRCH;
6407 read_lock(&tasklist_lock); 6508 rcu_read_lock();
6408 p = find_process_by_pid(pid); 6509 p = find_process_by_pid(pid);
6409 if (p) { 6510 if (p) {
6410 retval = security_task_getscheduler(p); 6511 retval = security_task_getscheduler(p);
@@ -6412,7 +6513,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6412 retval = p->policy 6513 retval = p->policy
6413 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 6514 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6414 } 6515 }
6415 read_unlock(&tasklist_lock); 6516 rcu_read_unlock();
6416 return retval; 6517 return retval;
6417} 6518}
6418 6519
@@ -6430,7 +6531,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6430 if (!param || pid < 0) 6531 if (!param || pid < 0)
6431 return -EINVAL; 6532 return -EINVAL;
6432 6533
6433 read_lock(&tasklist_lock); 6534 rcu_read_lock();
6434 p = find_process_by_pid(pid); 6535 p = find_process_by_pid(pid);
6435 retval = -ESRCH; 6536 retval = -ESRCH;
6436 if (!p) 6537 if (!p)
@@ -6441,7 +6542,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6441 goto out_unlock; 6542 goto out_unlock;
6442 6543
6443 lp.sched_priority = p->rt_priority; 6544 lp.sched_priority = p->rt_priority;
6444 read_unlock(&tasklist_lock); 6545 rcu_read_unlock();
6445 6546
6446 /* 6547 /*
6447 * This one might sleep, we cannot do it with a spinlock held ... 6548 * This one might sleep, we cannot do it with a spinlock held ...
@@ -6451,7 +6552,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6451 return retval; 6552 return retval;
6452 6553
6453out_unlock: 6554out_unlock:
6454 read_unlock(&tasklist_lock); 6555 rcu_read_unlock();
6455 return retval; 6556 return retval;
6456} 6557}
6457 6558
@@ -6462,22 +6563,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6462 int retval; 6563 int retval;
6463 6564
6464 get_online_cpus(); 6565 get_online_cpus();
6465 read_lock(&tasklist_lock); 6566 rcu_read_lock();
6466 6567
6467 p = find_process_by_pid(pid); 6568 p = find_process_by_pid(pid);
6468 if (!p) { 6569 if (!p) {
6469 read_unlock(&tasklist_lock); 6570 rcu_read_unlock();
6470 put_online_cpus(); 6571 put_online_cpus();
6471 return -ESRCH; 6572 return -ESRCH;
6472 } 6573 }
6473 6574
6474 /* 6575 /* Prevent p going away */
6475 * It is not safe to call set_cpus_allowed with the
6476 * tasklist_lock held. We will bump the task_struct's
6477 * usage count and then drop tasklist_lock.
6478 */
6479 get_task_struct(p); 6576 get_task_struct(p);
6480 read_unlock(&tasklist_lock); 6577 rcu_read_unlock();
6481 6578
6482 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 6579 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6483 retval = -ENOMEM; 6580 retval = -ENOMEM;
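The getscheduler/getparam/setaffinity paths above all drop read_lock(&tasklist_lock) in favour of rcu_read_lock(): the pid-to-task lookup only needs the RCU read side, and where the task has to outlive that critical section (as in sched_setaffinity()) a reference is taken with get_task_struct() before unlocking. A minimal sketch of the same pattern using the exported find_task_by_vpid() helper; demo_get_task() is a hypothetical wrapper, not part of the patch:

/* Look up a task by pid under RCU and pin it with a reference. */
static struct task_struct *demo_get_task(pid_t pid)
{
	struct task_struct *p;

	rcu_read_lock();
	p = find_task_by_vpid(pid);	/* pointer valid only inside the RCU section */
	if (p)
		get_task_struct(p);	/* pin it before leaving the RCU section */
	rcu_read_unlock();

	return p;			/* caller drops it with put_task_struct() */
}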
@@ -6558,10 +6655,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6558long sched_getaffinity(pid_t pid, struct cpumask *mask) 6655long sched_getaffinity(pid_t pid, struct cpumask *mask)
6559{ 6656{
6560 struct task_struct *p; 6657 struct task_struct *p;
6658 unsigned long flags;
6659 struct rq *rq;
6561 int retval; 6660 int retval;
6562 6661
6563 get_online_cpus(); 6662 get_online_cpus();
6564 read_lock(&tasklist_lock); 6663 rcu_read_lock();
6565 6664
6566 retval = -ESRCH; 6665 retval = -ESRCH;
6567 p = find_process_by_pid(pid); 6666 p = find_process_by_pid(pid);
@@ -6572,10 +6671,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6572 if (retval) 6671 if (retval)
6573 goto out_unlock; 6672 goto out_unlock;
6574 6673
6674 rq = task_rq_lock(p, &flags);
6575 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6675 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6676 task_rq_unlock(rq, &flags);
6576 6677
6577out_unlock: 6678out_unlock:
6578 read_unlock(&tasklist_lock); 6679 rcu_read_unlock();
6579 put_online_cpus(); 6680 put_online_cpus();
6580 6681
6581 return retval; 6682 return retval;
@@ -6630,7 +6731,7 @@ SYSCALL_DEFINE0(sched_yield)
6630 */ 6731 */
6631 __release(rq->lock); 6732 __release(rq->lock);
6632 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 6733 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6633 _raw_spin_unlock(&rq->lock); 6734 do_raw_spin_unlock(&rq->lock);
6634 preempt_enable_no_resched(); 6735 preempt_enable_no_resched();
6635 6736
6636 schedule(); 6737 schedule();
@@ -6718,9 +6819,6 @@ EXPORT_SYMBOL(yield);
6718/* 6819/*
6719 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6820 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6720 * that process accounting knows that this is a task in IO wait state. 6821 * that process accounting knows that this is a task in IO wait state.
6721 *
6722 * But don't do that if it is a deliberate, throttling IO wait (this task
6723 * has set its backing_dev_info: the queue against which it should throttle)
6724 */ 6822 */
6725void __sched io_schedule(void) 6823void __sched io_schedule(void)
6726{ 6824{
@@ -6813,6 +6911,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6813{ 6911{
6814 struct task_struct *p; 6912 struct task_struct *p;
6815 unsigned int time_slice; 6913 unsigned int time_slice;
6914 unsigned long flags;
6915 struct rq *rq;
6816 int retval; 6916 int retval;
6817 struct timespec t; 6917 struct timespec t;
6818 6918
@@ -6820,7 +6920,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6820 return -EINVAL; 6920 return -EINVAL;
6821 6921
6822 retval = -ESRCH; 6922 retval = -ESRCH;
6823 read_lock(&tasklist_lock); 6923 rcu_read_lock();
6824 p = find_process_by_pid(pid); 6924 p = find_process_by_pid(pid);
6825 if (!p) 6925 if (!p)
6826 goto out_unlock; 6926 goto out_unlock;
@@ -6829,15 +6929,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6829 if (retval) 6929 if (retval)
6830 goto out_unlock; 6930 goto out_unlock;
6831 6931
6832 time_slice = p->sched_class->get_rr_interval(p); 6932 rq = task_rq_lock(p, &flags);
6933 time_slice = p->sched_class->get_rr_interval(rq, p);
6934 task_rq_unlock(rq, &flags);
6833 6935
6834 read_unlock(&tasklist_lock); 6936 rcu_read_unlock();
6835 jiffies_to_timespec(time_slice, &t); 6937 jiffies_to_timespec(time_slice, &t);
6836 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6938 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6837 return retval; 6939 return retval;
6838 6940
6839out_unlock: 6941out_unlock:
6840 read_unlock(&tasklist_lock); 6942 rcu_read_unlock();
6841 return retval; 6943 return retval;
6842} 6944}
6843 6945
@@ -6903,7 +7005,7 @@ void show_state_filter(unsigned long state_filter)
6903 /* 7005 /*
6904 * Only show locks if all tasks are dumped: 7006 * Only show locks if all tasks are dumped:
6905 */ 7007 */
6906 if (state_filter == -1) 7008 if (!state_filter)
6907 debug_show_all_locks(); 7009 debug_show_all_locks();
6908} 7010}
6909 7011
@@ -6925,12 +7027,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6925 struct rq *rq = cpu_rq(cpu); 7027 struct rq *rq = cpu_rq(cpu);
6926 unsigned long flags; 7028 unsigned long flags;
6927 7029
6928 spin_lock_irqsave(&rq->lock, flags); 7030 raw_spin_lock_irqsave(&rq->lock, flags);
6929 7031
6930 __sched_fork(idle); 7032 __sched_fork(idle);
7033 idle->state = TASK_RUNNING;
6931 idle->se.exec_start = sched_clock(); 7034 idle->se.exec_start = sched_clock();
6932 7035
6933 idle->prio = idle->normal_prio = MAX_PRIO;
6934 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 7036 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
6935 __set_task_cpu(idle, cpu); 7037 __set_task_cpu(idle, cpu);
6936 7038
@@ -6938,7 +7040,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6938#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 7040#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
6939 idle->oncpu = 1; 7041 idle->oncpu = 1;
6940#endif 7042#endif
6941 spin_unlock_irqrestore(&rq->lock, flags); 7043 raw_spin_unlock_irqrestore(&rq->lock, flags);
6942 7044
6943 /* Set the preempt count _outside_ the spinlocks! */ 7045 /* Set the preempt count _outside_ the spinlocks! */
6944#if defined(CONFIG_PREEMPT) 7046#if defined(CONFIG_PREEMPT)
@@ -6971,22 +7073,43 @@ cpumask_var_t nohz_cpu_mask;
6971 * 7073 *
6972 * This idea comes from the SD scheduler of Con Kolivas: 7074 * This idea comes from the SD scheduler of Con Kolivas:
6973 */ 7075 */
6974static inline void sched_init_granularity(void) 7076static int get_update_sysctl_factor(void)
6975{ 7077{
6976 unsigned int factor = 1 + ilog2(num_online_cpus()); 7078 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6977 const unsigned long limit = 200000000; 7079 unsigned int factor;
6978 7080
6979 sysctl_sched_min_granularity *= factor; 7081 switch (sysctl_sched_tunable_scaling) {
6980 if (sysctl_sched_min_granularity > limit) 7082 case SCHED_TUNABLESCALING_NONE:
6981 sysctl_sched_min_granularity = limit; 7083 factor = 1;
7084 break;
7085 case SCHED_TUNABLESCALING_LINEAR:
7086 factor = cpus;
7087 break;
7088 case SCHED_TUNABLESCALING_LOG:
7089 default:
7090 factor = 1 + ilog2(cpus);
7091 break;
7092 }
6982 7093
6983 sysctl_sched_latency *= factor; 7094 return factor;
6984 if (sysctl_sched_latency > limit) 7095}
6985 sysctl_sched_latency = limit;
6986 7096
6987 sysctl_sched_wakeup_granularity *= factor; 7097static void update_sysctl(void)
7098{
7099 unsigned int factor = get_update_sysctl_factor();
7100
7101#define SET_SYSCTL(name) \
7102 (sysctl_##name = (factor) * normalized_sysctl_##name)
7103 SET_SYSCTL(sched_min_granularity);
7104 SET_SYSCTL(sched_latency);
7105 SET_SYSCTL(sched_wakeup_granularity);
7106 SET_SYSCTL(sched_shares_ratelimit);
7107#undef SET_SYSCTL
7108}
6988 7109
6989 sysctl_sched_shares_ratelimit *= factor; 7110static inline void sched_init_granularity(void)
7111{
7112 update_sysctl();
6990} 7113}
6991 7114
6992#ifdef CONFIG_SMP 7115#ifdef CONFIG_SMP
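sched_init_granularity() is reduced to a call to update_sysctl(): the tunables keep normalized (single-CPU) values and are rescaled by a factor chosen through sysctl_sched_tunable_scaling, either 1 (none), the online CPU count (linear) or 1 + ilog2(cpus) (logarithmic, the default), with the CPU count capped at 8. As a worked example under the logarithmic default, an 8-CPU machine gets factor = 1 + ilog2(8) = 4, so a normalized latency of 6 ms (an assumed value, for illustration only) becomes a 24 ms sysctl_sched_latency. A sketch of roughly what one SET_SYSCTL() expansion amounts to; demo_scale_latency() and the 6 ms constant are illustrative:

/* Roughly what SET_SYSCTL(sched_latency) expands to for one tunable. */
static void demo_scale_latency(unsigned int factor)
{
	unsigned int normalized_latency_ns = 6000000;	/* assumed 6 ms value */

	sysctl_sched_latency = factor * normalized_latency_ns;
}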
@@ -7022,8 +7145,28 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7022 struct rq *rq; 7145 struct rq *rq;
7023 int ret = 0; 7146 int ret = 0;
7024 7147
7148 /*
7149 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7150 * the ->cpus_allowed mask from under waking tasks, which would be
7151 * possible when we change rq->lock in ttwu(), so synchronize against
7152 * TASK_WAKING to avoid that.
7153 *
7154 * Make an exception for freshly cloned tasks, since cpuset namespaces
7155 * might move the task about, we have to validate the target in
7156 * wake_up_new_task() anyway since the cpu might have gone away.
7157 */
7158again:
7159 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7160 cpu_relax();
7161
7025 rq = task_rq_lock(p, &flags); 7162 rq = task_rq_lock(p, &flags);
7026 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7163
7164 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7165 task_rq_unlock(rq, &flags);
7166 goto again;
7167 }
7168
7169 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7027 ret = -EINVAL; 7170 ret = -EINVAL;
7028 goto out; 7171 goto out;
7029 } 7172 }
@@ -7045,7 +7188,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7045 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7188 if (cpumask_test_cpu(task_cpu(p), new_mask))
7046 goto out; 7189 goto out;
7047 7190
7048 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7191 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7049 /* Need help from migration thread: drop lock and wait. */ 7192 /* Need help from migration thread: drop lock and wait. */
7050 struct task_struct *mt = rq->migration_thread; 7193 struct task_struct *mt = rq->migration_thread;
7051 7194
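The two set_cpus_allowed_ptr() hunks above add a spin-and-recheck against TASK_WAKING (so the affinity mask cannot change under a task that is between run-queues during wakeup) and validate the new mask against cpu_active_mask instead of cpu_online_mask. For reference, a minimal caller sketch showing the usual allocate/fill/apply pattern; demo_pin_task() and the single-CPU policy are made up:

/* Pin a task to one caller-chosen CPU. May sleep. */
static int demo_pin_task(struct task_struct *p, int cpu)
{
	cpumask_var_t mask;
	int ret;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	cpumask_set_cpu(cpu, mask);

	ret = set_cpus_allowed_ptr(p, mask);	/* may wait on the migration thread */

	free_cpumask_var(mask);
	return ret;
}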
@@ -7078,7 +7221,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7078static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 7221static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7079{ 7222{
7080 struct rq *rq_dest, *rq_src; 7223 struct rq *rq_dest, *rq_src;
7081 int ret = 0, on_rq; 7224 int ret = 0;
7082 7225
7083 if (unlikely(!cpu_active(dest_cpu))) 7226 if (unlikely(!cpu_active(dest_cpu)))
7084 return ret; 7227 return ret;
@@ -7094,12 +7237,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7094 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7237 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7095 goto fail; 7238 goto fail;
7096 7239
7097 on_rq = p->se.on_rq; 7240 /*
7098 if (on_rq) 7241 * If we're not on a rq, the next wake-up will ensure we're
7242 * placed properly.
7243 */
7244 if (p->se.on_rq) {
7099 deactivate_task(rq_src, p, 0); 7245 deactivate_task(rq_src, p, 0);
7100 7246 set_task_cpu(p, dest_cpu);
7101 set_task_cpu(p, dest_cpu);
7102 if (on_rq) {
7103 activate_task(rq_dest, p, 0); 7247 activate_task(rq_dest, p, 0);
7104 check_preempt_curr(rq_dest, p, 0); 7248 check_preempt_curr(rq_dest, p, 0);
7105 } 7249 }
@@ -7134,10 +7278,10 @@ static int migration_thread(void *data)
7134 struct migration_req *req; 7278 struct migration_req *req;
7135 struct list_head *head; 7279 struct list_head *head;
7136 7280
7137 spin_lock_irq(&rq->lock); 7281 raw_spin_lock_irq(&rq->lock);
7138 7282
7139 if (cpu_is_offline(cpu)) { 7283 if (cpu_is_offline(cpu)) {
7140 spin_unlock_irq(&rq->lock); 7284 raw_spin_unlock_irq(&rq->lock);
7141 break; 7285 break;
7142 } 7286 }
7143 7287
@@ -7149,7 +7293,7 @@ static int migration_thread(void *data)
7149 head = &rq->migration_queue; 7293 head = &rq->migration_queue;
7150 7294
7151 if (list_empty(head)) { 7295 if (list_empty(head)) {
7152 spin_unlock_irq(&rq->lock); 7296 raw_spin_unlock_irq(&rq->lock);
7153 schedule(); 7297 schedule();
7154 set_current_state(TASK_INTERRUPTIBLE); 7298 set_current_state(TASK_INTERRUPTIBLE);
7155 continue; 7299 continue;
@@ -7158,14 +7302,14 @@ static int migration_thread(void *data)
7158 list_del_init(head->next); 7302 list_del_init(head->next);
7159 7303
7160 if (req->task != NULL) { 7304 if (req->task != NULL) {
7161 spin_unlock(&rq->lock); 7305 raw_spin_unlock(&rq->lock);
7162 __migrate_task(req->task, cpu, req->dest_cpu); 7306 __migrate_task(req->task, cpu, req->dest_cpu);
7163 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 7307 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7164 req->dest_cpu = RCU_MIGRATION_GOT_QS; 7308 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7165 spin_unlock(&rq->lock); 7309 raw_spin_unlock(&rq->lock);
7166 } else { 7310 } else {
7167 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 7311 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7168 spin_unlock(&rq->lock); 7312 raw_spin_unlock(&rq->lock);
7169 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 7313 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7170 } 7314 }
7171 local_irq_enable(); 7315 local_irq_enable();
@@ -7195,37 +7339,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7195static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 7339static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7196{ 7340{
7197 int dest_cpu; 7341 int dest_cpu;
7198 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7199 7342
7200again: 7343again:
7201 /* Look for allowed, online CPU in same node. */ 7344 dest_cpu = select_fallback_rq(dead_cpu, p);
7202 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
7203 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7204 goto move;
7205
7206 /* Any allowed, online CPU? */
7207 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
7208 if (dest_cpu < nr_cpu_ids)
7209 goto move;
7210
7211 /* No more Mr. Nice Guy. */
7212 if (dest_cpu >= nr_cpu_ids) {
7213 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7214 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
7215 7345
7216 /*
7217 * Don't tell them about moving exiting tasks or
7218 * kernel threads (both mm NULL), since they never
7219 * leave kernel.
7220 */
7221 if (p->mm && printk_ratelimit()) {
7222 printk(KERN_INFO "process %d (%s) no "
7223 "longer affine to cpu%d\n",
7224 task_pid_nr(p), p->comm, dead_cpu);
7225 }
7226 }
7227
7228move:
7229 /* It can have affinity changed while we were choosing. */ 7346 /* It can have affinity changed while we were choosing. */
7230 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 7347 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7231 goto again; 7348 goto again;
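move_task_off_dead_cpu() now delegates the choice of a destination to select_fallback_rq(). Judging from the code it replaces, the fallback order is: an allowed online CPU in the dead CPU's node, then any allowed online CPU, then the cpuset-widened mask as a last resort. A condensed sketch of that ordering, written against the helpers visible in the removed lines (the real select_fallback_rq() lives elsewhere in this file and may differ in detail):

/* Rough shape of the fallback ordering the removed code implemented. */
static int demo_fallback_cpu(int dead_cpu, struct task_struct *p)
{
	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
	int cpu;

	/* 1) An allowed, online CPU in the same node. */
	for_each_cpu_and(cpu, nodemask, cpu_online_mask)
		if (cpumask_test_cpu(cpu, &p->cpus_allowed))
			return cpu;

	/* 2) Any allowed, online CPU. */
	cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
	if (cpu < nr_cpu_ids)
		return cpu;

	/* 3) No more Mr. Nice Guy: widen to the cpuset's CPUs. */
	cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
	return cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
}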
@@ -7240,7 +7357,7 @@ move:
7240 */ 7357 */
7241static void migrate_nr_uninterruptible(struct rq *rq_src) 7358static void migrate_nr_uninterruptible(struct rq *rq_src)
7242{ 7359{
7243 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7360 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7244 unsigned long flags; 7361 unsigned long flags;
7245 7362
7246 local_irq_save(flags); 7363 local_irq_save(flags);
@@ -7288,14 +7405,14 @@ void sched_idle_next(void)
7288 * Strictly not necessary since rest of the CPUs are stopped by now 7405 * Strictly not necessary since rest of the CPUs are stopped by now
7289 * and interrupts disabled on the current cpu. 7406 * and interrupts disabled on the current cpu.
7290 */ 7407 */
7291 spin_lock_irqsave(&rq->lock, flags); 7408 raw_spin_lock_irqsave(&rq->lock, flags);
7292 7409
7293 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7410 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7294 7411
7295 update_rq_clock(rq); 7412 update_rq_clock(rq);
7296 activate_task(rq, p, 0); 7413 activate_task(rq, p, 0);
7297 7414
7298 spin_unlock_irqrestore(&rq->lock, flags); 7415 raw_spin_unlock_irqrestore(&rq->lock, flags);
7299} 7416}
7300 7417
7301/* 7418/*
@@ -7331,9 +7448,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7331 * that's OK. No task can be added to this CPU, so iteration is 7448 * that's OK. No task can be added to this CPU, so iteration is
7332 * fine. 7449 * fine.
7333 */ 7450 */
7334 spin_unlock_irq(&rq->lock); 7451 raw_spin_unlock_irq(&rq->lock);
7335 move_task_off_dead_cpu(dead_cpu, p); 7452 move_task_off_dead_cpu(dead_cpu, p);
7336 spin_lock_irq(&rq->lock); 7453 raw_spin_lock_irq(&rq->lock);
7337 7454
7338 put_task_struct(p); 7455 put_task_struct(p);
7339} 7456}
@@ -7374,17 +7491,16 @@ static struct ctl_table sd_ctl_dir[] = {
7374 .procname = "sched_domain", 7491 .procname = "sched_domain",
7375 .mode = 0555, 7492 .mode = 0555,
7376 }, 7493 },
7377 {0, }, 7494 {}
7378}; 7495};
7379 7496
7380static struct ctl_table sd_ctl_root[] = { 7497static struct ctl_table sd_ctl_root[] = {
7381 { 7498 {
7382 .ctl_name = CTL_KERN,
7383 .procname = "kernel", 7499 .procname = "kernel",
7384 .mode = 0555, 7500 .mode = 0555,
7385 .child = sd_ctl_dir, 7501 .child = sd_ctl_dir,
7386 }, 7502 },
7387 {0, }, 7503 {}
7388}; 7504};
7389 7505
7390static struct ctl_table *sd_alloc_ctl_entry(int n) 7506static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -7494,7 +7610,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7494static struct ctl_table_header *sd_sysctl_header; 7610static struct ctl_table_header *sd_sysctl_header;
7495static void register_sched_domain_sysctl(void) 7611static void register_sched_domain_sysctl(void)
7496{ 7612{
7497 int i, cpu_num = num_online_cpus(); 7613 int i, cpu_num = num_possible_cpus();
7498 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7614 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7499 char buf[32]; 7615 char buf[32];
7500 7616
@@ -7504,7 +7620,7 @@ static void register_sched_domain_sysctl(void)
7504 if (entry == NULL) 7620 if (entry == NULL)
7505 return; 7621 return;
7506 7622
7507 for_each_online_cpu(i) { 7623 for_each_possible_cpu(i) {
7508 snprintf(buf, 32, "cpu%d", i); 7624 snprintf(buf, 32, "cpu%d", i);
7509 entry->procname = kstrdup(buf, GFP_KERNEL); 7625 entry->procname = kstrdup(buf, GFP_KERNEL);
7510 entry->mode = 0555; 7626 entry->mode = 0555;
@@ -7600,13 +7716,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7600 7716
7601 /* Update our root-domain */ 7717 /* Update our root-domain */
7602 rq = cpu_rq(cpu); 7718 rq = cpu_rq(cpu);
7603 spin_lock_irqsave(&rq->lock, flags); 7719 raw_spin_lock_irqsave(&rq->lock, flags);
7604 if (rq->rd) { 7720 if (rq->rd) {
7605 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7721 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7606 7722
7607 set_rq_online(rq); 7723 set_rq_online(rq);
7608 } 7724 }
7609 spin_unlock_irqrestore(&rq->lock, flags); 7725 raw_spin_unlock_irqrestore(&rq->lock, flags);
7610 break; 7726 break;
7611 7727
7612#ifdef CONFIG_HOTPLUG_CPU 7728#ifdef CONFIG_HOTPLUG_CPU
@@ -7631,14 +7747,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7631 put_task_struct(rq->migration_thread); 7747 put_task_struct(rq->migration_thread);
7632 rq->migration_thread = NULL; 7748 rq->migration_thread = NULL;
7633 /* Idle task back to normal (off runqueue, low prio) */ 7749 /* Idle task back to normal (off runqueue, low prio) */
7634 spin_lock_irq(&rq->lock); 7750 raw_spin_lock_irq(&rq->lock);
7635 update_rq_clock(rq); 7751 update_rq_clock(rq);
7636 deactivate_task(rq, rq->idle, 0); 7752 deactivate_task(rq, rq->idle, 0);
7637 rq->idle->static_prio = MAX_PRIO;
7638 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7753 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7639 rq->idle->sched_class = &idle_sched_class; 7754 rq->idle->sched_class = &idle_sched_class;
7640 migrate_dead_tasks(cpu); 7755 migrate_dead_tasks(cpu);
7641 spin_unlock_irq(&rq->lock); 7756 raw_spin_unlock_irq(&rq->lock);
7642 cpuset_unlock(); 7757 cpuset_unlock();
7643 migrate_nr_uninterruptible(rq); 7758 migrate_nr_uninterruptible(rq);
7644 BUG_ON(rq->nr_running != 0); 7759 BUG_ON(rq->nr_running != 0);
@@ -7648,30 +7763,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7648 * they didn't take sched_hotcpu_mutex. Just wake up 7763 * they didn't take sched_hotcpu_mutex. Just wake up
7649 * the requestors. 7764 * the requestors.
7650 */ 7765 */
7651 spin_lock_irq(&rq->lock); 7766 raw_spin_lock_irq(&rq->lock);
7652 while (!list_empty(&rq->migration_queue)) { 7767 while (!list_empty(&rq->migration_queue)) {
7653 struct migration_req *req; 7768 struct migration_req *req;
7654 7769
7655 req = list_entry(rq->migration_queue.next, 7770 req = list_entry(rq->migration_queue.next,
7656 struct migration_req, list); 7771 struct migration_req, list);
7657 list_del_init(&req->list); 7772 list_del_init(&req->list);
7658 spin_unlock_irq(&rq->lock); 7773 raw_spin_unlock_irq(&rq->lock);
7659 complete(&req->done); 7774 complete(&req->done);
7660 spin_lock_irq(&rq->lock); 7775 raw_spin_lock_irq(&rq->lock);
7661 } 7776 }
7662 spin_unlock_irq(&rq->lock); 7777 raw_spin_unlock_irq(&rq->lock);
7663 break; 7778 break;
7664 7779
7665 case CPU_DYING: 7780 case CPU_DYING:
7666 case CPU_DYING_FROZEN: 7781 case CPU_DYING_FROZEN:
7667 /* Update our root-domain */ 7782 /* Update our root-domain */
7668 rq = cpu_rq(cpu); 7783 rq = cpu_rq(cpu);
7669 spin_lock_irqsave(&rq->lock, flags); 7784 raw_spin_lock_irqsave(&rq->lock, flags);
7670 if (rq->rd) { 7785 if (rq->rd) {
7671 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7786 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7672 set_rq_offline(rq); 7787 set_rq_offline(rq);
7673 } 7788 }
7674 spin_unlock_irqrestore(&rq->lock, flags); 7789 raw_spin_unlock_irqrestore(&rq->lock, flags);
7675 break; 7790 break;
7676#endif 7791#endif
7677 } 7792 }
@@ -7708,6 +7823,16 @@ early_initcall(migration_init);
7708 7823
7709#ifdef CONFIG_SCHED_DEBUG 7824#ifdef CONFIG_SCHED_DEBUG
7710 7825
7826static __read_mostly int sched_domain_debug_enabled;
7827
7828static int __init sched_domain_debug_setup(char *str)
7829{
7830 sched_domain_debug_enabled = 1;
7831
7832 return 0;
7833}
7834early_param("sched_debug", sched_domain_debug_setup);
7835
7711static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7836static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7712 struct cpumask *groupmask) 7837 struct cpumask *groupmask)
7713{ 7838{
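The sched_domain_debug() dump that follows is now off by default: the new early_param only sets sched_domain_debug_enabled when "sched_debug" appears on the kernel command line, so the verbose per-CPU domain/group printout at attach time becomes opt-in rather than tied solely to CONFIG_SCHED_DEBUG.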
@@ -7794,6 +7919,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7794 cpumask_var_t groupmask; 7919 cpumask_var_t groupmask;
7795 int level = 0; 7920 int level = 0;
7796 7921
7922 if (!sched_domain_debug_enabled)
7923 return;
7924
7797 if (!sd) { 7925 if (!sd) {
7798 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7926 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7799 return; 7927 return;
@@ -7873,6 +8001,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7873 8001
7874static void free_rootdomain(struct root_domain *rd) 8002static void free_rootdomain(struct root_domain *rd)
7875{ 8003{
8004 synchronize_sched();
8005
7876 cpupri_cleanup(&rd->cpupri); 8006 cpupri_cleanup(&rd->cpupri);
7877 8007
7878 free_cpumask_var(rd->rto_mask); 8008 free_cpumask_var(rd->rto_mask);
@@ -7886,7 +8016,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7886 struct root_domain *old_rd = NULL; 8016 struct root_domain *old_rd = NULL;
7887 unsigned long flags; 8017 unsigned long flags;
7888 8018
7889 spin_lock_irqsave(&rq->lock, flags); 8019 raw_spin_lock_irqsave(&rq->lock, flags);
7890 8020
7891 if (rq->rd) { 8021 if (rq->rd) {
7892 old_rd = rq->rd; 8022 old_rd = rq->rd;
@@ -7912,7 +8042,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7912 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 8042 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7913 set_rq_online(rq); 8043 set_rq_online(rq);
7914 8044
7915 spin_unlock_irqrestore(&rq->lock, flags); 8045 raw_spin_unlock_irqrestore(&rq->lock, flags);
7916 8046
7917 if (old_rd) 8047 if (old_rd)
7918 free_rootdomain(old_rd); 8048 free_rootdomain(old_rd);
@@ -8013,6 +8143,7 @@ static cpumask_var_t cpu_isolated_map;
8013/* Setup the mask of cpus configured for isolated domains */ 8143/* Setup the mask of cpus configured for isolated domains */
8014static int __init isolated_cpu_setup(char *str) 8144static int __init isolated_cpu_setup(char *str)
8015{ 8145{
8146 alloc_bootmem_cpumask_var(&cpu_isolated_map);
8016 cpulist_parse(str, cpu_isolated_map); 8147 cpulist_parse(str, cpu_isolated_map);
8017 return 1; 8148 return 1;
8018} 8149}
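isolated_cpu_setup() now allocates cpu_isolated_map itself from bootmem, since this command-line hook runs before sched_init(); the matching sched_init() hunk further down only allocates the mask when the command line did not (see the "May be allocated at isolcpus cmdline parse time" comment there). The parameter itself is unchanged: booting with, for example, isolcpus=2,3 keeps those CPUs out of the general load-balancing domains.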
@@ -8197,14 +8328,14 @@ enum s_alloc {
8197 */ 8328 */
8198#ifdef CONFIG_SCHED_SMT 8329#ifdef CONFIG_SCHED_SMT
8199static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 8330static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8200static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 8331static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
8201 8332
8202static int 8333static int
8203cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 8334cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8204 struct sched_group **sg, struct cpumask *unused) 8335 struct sched_group **sg, struct cpumask *unused)
8205{ 8336{
8206 if (sg) 8337 if (sg)
8207 *sg = &per_cpu(sched_group_cpus, cpu).sg; 8338 *sg = &per_cpu(sched_groups, cpu).sg;
8208 return cpu; 8339 return cpu;
8209} 8340}
8210#endif /* CONFIG_SCHED_SMT */ 8341#endif /* CONFIG_SCHED_SMT */
@@ -8849,7 +8980,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
8849 return __build_sched_domains(cpu_map, NULL); 8980 return __build_sched_domains(cpu_map, NULL);
8850} 8981}
8851 8982
8852static struct cpumask *doms_cur; /* current sched domains */ 8983static cpumask_var_t *doms_cur; /* current sched domains */
8853static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8984static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8854static struct sched_domain_attr *dattr_cur; 8985static struct sched_domain_attr *dattr_cur;
8855 /* attribues of custom domains in 'doms_cur' */ 8986 /* attribues of custom domains in 'doms_cur' */
@@ -8871,6 +9002,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
8871 return 0; 9002 return 0;
8872} 9003}
8873 9004
9005cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
9006{
9007 int i;
9008 cpumask_var_t *doms;
9009
9010 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
9011 if (!doms)
9012 return NULL;
9013 for (i = 0; i < ndoms; i++) {
9014 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
9015 free_sched_domains(doms, i);
9016 return NULL;
9017 }
9018 }
9019 return doms;
9020}
9021
9022void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
9023{
9024 unsigned int i;
9025 for (i = 0; i < ndoms; i++)
9026 free_cpumask_var(doms[i]);
9027 kfree(doms);
9028}
9029
8874/* 9030/*
8875 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 9031 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8876 * For now this just excludes isolated cpus, but could be used to 9032 * For now this just excludes isolated cpus, but could be used to
@@ -8882,12 +9038,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
8882 9038
8883 arch_update_cpu_topology(); 9039 arch_update_cpu_topology();
8884 ndoms_cur = 1; 9040 ndoms_cur = 1;
8885 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 9041 doms_cur = alloc_sched_domains(ndoms_cur);
8886 if (!doms_cur) 9042 if (!doms_cur)
8887 doms_cur = fallback_doms; 9043 doms_cur = &fallback_doms;
8888 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 9044 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
8889 dattr_cur = NULL; 9045 dattr_cur = NULL;
8890 err = build_sched_domains(doms_cur); 9046 err = build_sched_domains(doms_cur[0]);
8891 register_sched_domain_sysctl(); 9047 register_sched_domain_sysctl();
8892 9048
8893 return err; 9049 return err;
@@ -8937,19 +9093,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8937 * doms_new[] to the current sched domain partitioning, doms_cur[]. 9093 * doms_new[] to the current sched domain partitioning, doms_cur[].
8938 * It destroys each deleted domain and builds each new domain. 9094 * It destroys each deleted domain and builds each new domain.
8939 * 9095 *
8940 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 9096 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
8941 * The masks don't intersect (don't overlap.) We should setup one 9097 * The masks don't intersect (don't overlap.) We should setup one
8942 * sched domain for each mask. CPUs not in any of the cpumasks will 9098 * sched domain for each mask. CPUs not in any of the cpumasks will
8943 * not be load balanced. If the same cpumask appears both in the 9099 * not be load balanced. If the same cpumask appears both in the
8944 * current 'doms_cur' domains and in the new 'doms_new', we can leave 9100 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8945 * it as it is. 9101 * it as it is.
8946 * 9102 *
8947 * The passed in 'doms_new' should be kmalloc'd. This routine takes 9103 * The passed in 'doms_new' should be allocated using
8948 * ownership of it and will kfree it when done with it. If the caller 9104 * alloc_sched_domains. This routine takes ownership of it and will
8949 * failed the kmalloc call, then it can pass in doms_new == NULL && 9105 * free_sched_domains it when done with it. If the caller failed the
8950 * ndoms_new == 1, and partition_sched_domains() will fallback to 9106 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
8951 * the single partition 'fallback_doms', it also forces the domains 9107 * and partition_sched_domains() will fallback to the single partition
8952 * to be rebuilt. 9108 * 'fallback_doms', it also forces the domains to be rebuilt.
8953 * 9109 *
8954 * If doms_new == NULL it will be replaced with cpu_online_mask. 9110 * If doms_new == NULL it will be replaced with cpu_online_mask.
8955 * ndoms_new == 0 is a special case for destroying existing domains, 9111 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8957,8 +9113,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8957 * 9113 *
8958 * Call with hotplug lock held 9114 * Call with hotplug lock held
8959 */ 9115 */
8960/* FIXME: Change to struct cpumask *doms_new[] */ 9116void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
8961void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8962 struct sched_domain_attr *dattr_new) 9117 struct sched_domain_attr *dattr_new)
8963{ 9118{
8964 int i, j, n; 9119 int i, j, n;
@@ -8977,40 +9132,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8977 /* Destroy deleted domains */ 9132 /* Destroy deleted domains */
8978 for (i = 0; i < ndoms_cur; i++) { 9133 for (i = 0; i < ndoms_cur; i++) {
8979 for (j = 0; j < n && !new_topology; j++) { 9134 for (j = 0; j < n && !new_topology; j++) {
8980 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 9135 if (cpumask_equal(doms_cur[i], doms_new[j])
8981 && dattrs_equal(dattr_cur, i, dattr_new, j)) 9136 && dattrs_equal(dattr_cur, i, dattr_new, j))
8982 goto match1; 9137 goto match1;
8983 } 9138 }
8984 /* no match - a current sched domain not in new doms_new[] */ 9139 /* no match - a current sched domain not in new doms_new[] */
8985 detach_destroy_domains(doms_cur + i); 9140 detach_destroy_domains(doms_cur[i]);
8986match1: 9141match1:
8987 ; 9142 ;
8988 } 9143 }
8989 9144
8990 if (doms_new == NULL) { 9145 if (doms_new == NULL) {
8991 ndoms_cur = 0; 9146 ndoms_cur = 0;
8992 doms_new = fallback_doms; 9147 doms_new = &fallback_doms;
8993 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 9148 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
8994 WARN_ON_ONCE(dattr_new); 9149 WARN_ON_ONCE(dattr_new);
8995 } 9150 }
8996 9151
8997 /* Build new domains */ 9152 /* Build new domains */
8998 for (i = 0; i < ndoms_new; i++) { 9153 for (i = 0; i < ndoms_new; i++) {
8999 for (j = 0; j < ndoms_cur && !new_topology; j++) { 9154 for (j = 0; j < ndoms_cur && !new_topology; j++) {
9000 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 9155 if (cpumask_equal(doms_new[i], doms_cur[j])
9001 && dattrs_equal(dattr_new, i, dattr_cur, j)) 9156 && dattrs_equal(dattr_new, i, dattr_cur, j))
9002 goto match2; 9157 goto match2;
9003 } 9158 }
9004 /* no match - add a new doms_new */ 9159 /* no match - add a new doms_new */
9005 __build_sched_domains(doms_new + i, 9160 __build_sched_domains(doms_new[i],
9006 dattr_new ? dattr_new + i : NULL); 9161 dattr_new ? dattr_new + i : NULL);
9007match2: 9162match2:
9008 ; 9163 ;
9009 } 9164 }
9010 9165
9011 /* Remember the new sched domains */ 9166 /* Remember the new sched domains */
9012 if (doms_cur != fallback_doms) 9167 if (doms_cur != &fallback_doms)
9013 kfree(doms_cur); 9168 free_sched_domains(doms_cur, ndoms_cur);
9014 kfree(dattr_cur); /* kfree(NULL) is safe */ 9169 kfree(dattr_cur); /* kfree(NULL) is safe */
9015 doms_cur = doms_new; 9170 doms_cur = doms_new;
9016 dattr_cur = dattr_new; 9171 dattr_cur = dattr_new;
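partition_sched_domains() and its callers now traffic in cpumask_var_t arrays built with the new alloc_sched_domains()/free_sched_domains() helpers, and ownership still transfers to the scheduler as the updated comment block above describes. A sketch of a caller handing over a made-up two-way partition; in the tree the real caller is the cpuset code:

/* Build a hypothetical 2-way partition and hand ownership to the scheduler. */
static int demo_repartition(const struct cpumask *set_a, const struct cpumask *set_b)
{
	cpumask_var_t *doms = alloc_sched_domains(2);

	if (!doms)
		return -ENOMEM;

	cpumask_copy(doms[0], set_a);
	cpumask_copy(doms[1], set_b);

	get_online_cpus();	/* partition_sched_domains() wants the hotplug lock */
	/* The scheduler now owns 'doms' and will free_sched_domains() it later. */
	partition_sched_domains(2, doms, NULL);
	put_online_cpus();

	return 0;
}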
@@ -9121,8 +9276,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9121 switch (action) { 9276 switch (action) {
9122 case CPU_ONLINE: 9277 case CPU_ONLINE:
9123 case CPU_ONLINE_FROZEN: 9278 case CPU_ONLINE_FROZEN:
9124 case CPU_DEAD: 9279 case CPU_DOWN_PREPARE:
9125 case CPU_DEAD_FROZEN: 9280 case CPU_DOWN_PREPARE_FROZEN:
9281 case CPU_DOWN_FAILED:
9282 case CPU_DOWN_FAILED_FROZEN:
9126 partition_sched_domains(1, NULL, NULL); 9283 partition_sched_domains(1, NULL, NULL);
9127 return NOTIFY_OK; 9284 return NOTIFY_OK;
9128 9285
@@ -9169,7 +9326,7 @@ void __init sched_init_smp(void)
9169#endif 9326#endif
9170 get_online_cpus(); 9327 get_online_cpus();
9171 mutex_lock(&sched_domains_mutex); 9328 mutex_lock(&sched_domains_mutex);
9172 arch_init_sched_domains(cpu_online_mask); 9329 arch_init_sched_domains(cpu_active_mask);
9173 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9330 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9174 if (cpumask_empty(non_isolated_cpus)) 9331 if (cpumask_empty(non_isolated_cpus))
9175 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9332 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9242,13 +9399,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9242#ifdef CONFIG_SMP 9399#ifdef CONFIG_SMP
9243 rt_rq->rt_nr_migratory = 0; 9400 rt_rq->rt_nr_migratory = 0;
9244 rt_rq->overloaded = 0; 9401 rt_rq->overloaded = 0;
9245 plist_head_init(&rt_rq->pushable_tasks, &rq->lock); 9402 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
9246#endif 9403#endif
9247 9404
9248 rt_rq->rt_time = 0; 9405 rt_rq->rt_time = 0;
9249 rt_rq->rt_throttled = 0; 9406 rt_rq->rt_throttled = 0;
9250 rt_rq->rt_runtime = 0; 9407 rt_rq->rt_runtime = 0;
9251 spin_lock_init(&rt_rq->rt_runtime_lock); 9408 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
9252 9409
9253#ifdef CONFIG_RT_GROUP_SCHED 9410#ifdef CONFIG_RT_GROUP_SCHED
9254 rt_rq->rt_nr_boosted = 0; 9411 rt_rq->rt_nr_boosted = 0;
@@ -9332,10 +9489,6 @@ void __init sched_init(void)
9332#ifdef CONFIG_CPUMASK_OFFSTACK 9489#ifdef CONFIG_CPUMASK_OFFSTACK
9333 alloc_size += num_possible_cpus() * cpumask_size(); 9490 alloc_size += num_possible_cpus() * cpumask_size();
9334#endif 9491#endif
9335 /*
9336 * As sched_init() is called before page_alloc is setup,
9337 * we use alloc_bootmem().
9338 */
9339 if (alloc_size) { 9492 if (alloc_size) {
9340 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9493 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9341 9494
@@ -9404,11 +9557,15 @@ void __init sched_init(void)
9404#endif /* CONFIG_USER_SCHED */ 9557#endif /* CONFIG_USER_SCHED */
9405#endif /* CONFIG_GROUP_SCHED */ 9558#endif /* CONFIG_GROUP_SCHED */
9406 9559
9560#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9561 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
9562 __alignof__(unsigned long));
9563#endif
9407 for_each_possible_cpu(i) { 9564 for_each_possible_cpu(i) {
9408 struct rq *rq; 9565 struct rq *rq;
9409 9566
9410 rq = cpu_rq(i); 9567 rq = cpu_rq(i);
9411 spin_lock_init(&rq->lock); 9568 raw_spin_lock_init(&rq->lock);
9412 rq->nr_running = 0; 9569 rq->nr_running = 0;
9413 rq->calc_load_active = 0; 9570 rq->calc_load_active = 0;
9414 rq->calc_load_update = jiffies + LOAD_FREQ; 9571 rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -9468,7 +9625,7 @@ void __init sched_init(void)
9468#elif defined CONFIG_USER_SCHED 9625#elif defined CONFIG_USER_SCHED
9469 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); 9626 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9470 init_tg_rt_entry(&init_task_group, 9627 init_tg_rt_entry(&init_task_group,
9471 &per_cpu(init_rt_rq, i), 9628 &per_cpu(init_rt_rq_var, i),
9472 &per_cpu(init_sched_rt_entity, i), i, 1, 9629 &per_cpu(init_sched_rt_entity, i), i, 1,
9473 root_task_group.rt_se[i]); 9630 root_task_group.rt_se[i]);
9474#endif 9631#endif
@@ -9486,6 +9643,8 @@ void __init sched_init(void)
9486 rq->cpu = i; 9643 rq->cpu = i;
9487 rq->online = 0; 9644 rq->online = 0;
9488 rq->migration_thread = NULL; 9645 rq->migration_thread = NULL;
9646 rq->idle_stamp = 0;
9647 rq->avg_idle = 2*sysctl_sched_migration_cost;
9489 INIT_LIST_HEAD(&rq->migration_queue); 9648 INIT_LIST_HEAD(&rq->migration_queue);
9490 rq_attach_root(rq, &def_root_domain); 9649 rq_attach_root(rq, &def_root_domain);
9491#endif 9650#endif
@@ -9504,7 +9663,7 @@ void __init sched_init(void)
9504#endif 9663#endif
9505 9664
9506#ifdef CONFIG_RT_MUTEXES 9665#ifdef CONFIG_RT_MUTEXES
9507 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 9666 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
9508#endif 9667#endif
9509 9668
9510 /* 9669 /*
@@ -9529,13 +9688,15 @@ void __init sched_init(void)
9529 current->sched_class = &fair_sched_class; 9688 current->sched_class = &fair_sched_class;
9530 9689
9531 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9690 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9532 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 9691 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9533#ifdef CONFIG_SMP 9692#ifdef CONFIG_SMP
9534#ifdef CONFIG_NO_HZ 9693#ifdef CONFIG_NO_HZ
9535 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9694 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9536 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9695 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9537#endif 9696#endif
9538 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9697 /* May be allocated at isolcpus cmdline parse time */
9698 if (cpu_isolated_map == NULL)
9699 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9539#endif /* SMP */ 9700#endif /* SMP */
9540 9701
9541 perf_event_init(); 9702 perf_event_init();
@@ -9546,7 +9707,7 @@ void __init sched_init(void)
9546#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9707#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9547static inline int preempt_count_equals(int preempt_offset) 9708static inline int preempt_count_equals(int preempt_offset)
9548{ 9709{
9549 int nested = preempt_count() & ~PREEMPT_ACTIVE; 9710 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
9550 9711
9551 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 9712 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9552} 9713}
@@ -9627,13 +9788,13 @@ void normalize_rt_tasks(void)
9627 continue; 9788 continue;
9628 } 9789 }
9629 9790
9630 spin_lock(&p->pi_lock); 9791 raw_spin_lock(&p->pi_lock);
9631 rq = __task_rq_lock(p); 9792 rq = __task_rq_lock(p);
9632 9793
9633 normalize_task(rq, p); 9794 normalize_task(rq, p);
9634 9795
9635 __task_rq_unlock(rq); 9796 __task_rq_unlock(rq);
9636 spin_unlock(&p->pi_lock); 9797 raw_spin_unlock(&p->pi_lock);
9637 } while_each_thread(g, p); 9798 } while_each_thread(g, p);
9638 9799
9639 read_unlock_irqrestore(&tasklist_lock, flags); 9800 read_unlock_irqrestore(&tasklist_lock, flags);
@@ -9729,13 +9890,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9729 se = kzalloc_node(sizeof(struct sched_entity), 9890 se = kzalloc_node(sizeof(struct sched_entity),
9730 GFP_KERNEL, cpu_to_node(i)); 9891 GFP_KERNEL, cpu_to_node(i));
9731 if (!se) 9892 if (!se)
9732 goto err; 9893 goto err_free_rq;
9733 9894
9734 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9895 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9735 } 9896 }
9736 9897
9737 return 1; 9898 return 1;
9738 9899
9900 err_free_rq:
9901 kfree(cfs_rq);
9739 err: 9902 err:
9740 return 0; 9903 return 0;
9741} 9904}
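The new err_free_rq label closes a leak: if the per-CPU sched_entity allocation fails, the cfs_rq allocated just above in the same loop iteration has not yet been linked into the task group, so the generic group teardown cannot find it and it must be freed right here (the rt_rq counterpart below gets the same treatment). A generic sketch of this staged-unwind idiom, with made-up structures:

/*
 * Free what this step allocated but has not yet published, then fall
 * through to the common error return.
 */
struct demo_pair {
	void *a;
	void *b;
};

static int demo_alloc_pair(struct demo_pair *p)
{
	p->a = kzalloc(64, GFP_KERNEL);
	if (!p->a)
		goto err;

	p->b = kzalloc(64, GFP_KERNEL);
	if (!p->b)
		goto err_free_a;	/* nobody else can see 'a' yet */

	return 1;

err_free_a:
	kfree(p->a);
err:
	return 0;
}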
@@ -9817,13 +9980,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9817 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9980 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9818 GFP_KERNEL, cpu_to_node(i)); 9981 GFP_KERNEL, cpu_to_node(i));
9819 if (!rt_se) 9982 if (!rt_se)
9820 goto err; 9983 goto err_free_rq;
9821 9984
9822 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9985 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9823 } 9986 }
9824 9987
9825 return 1; 9988 return 1;
9826 9989
9990 err_free_rq:
9991 kfree(rt_rq);
9827 err: 9992 err:
9828 return 0; 9993 return 0;
9829} 9994}
@@ -9957,7 +10122,7 @@ void sched_move_task(struct task_struct *tsk)
9957 10122
9958#ifdef CONFIG_FAIR_GROUP_SCHED 10123#ifdef CONFIG_FAIR_GROUP_SCHED
9959 if (tsk->sched_class->moved_group) 10124 if (tsk->sched_class->moved_group)
9960 tsk->sched_class->moved_group(tsk); 10125 tsk->sched_class->moved_group(tsk, on_rq);
9961#endif 10126#endif
9962 10127
9963 if (unlikely(running)) 10128 if (unlikely(running))
@@ -9992,9 +10157,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
9992 struct rq *rq = cfs_rq->rq; 10157 struct rq *rq = cfs_rq->rq;
9993 unsigned long flags; 10158 unsigned long flags;
9994 10159
9995 spin_lock_irqsave(&rq->lock, flags); 10160 raw_spin_lock_irqsave(&rq->lock, flags);
9996 __set_se_shares(se, shares); 10161 __set_se_shares(se, shares);
9997 spin_unlock_irqrestore(&rq->lock, flags); 10162 raw_spin_unlock_irqrestore(&rq->lock, flags);
9998} 10163}
9999 10164
10000static DEFINE_MUTEX(shares_mutex); 10165static DEFINE_MUTEX(shares_mutex);
@@ -10179,18 +10344,18 @@ static int tg_set_bandwidth(struct task_group *tg,
10179 if (err) 10344 if (err)
10180 goto unlock; 10345 goto unlock;
10181 10346
10182 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10347 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10183 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 10348 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10184 tg->rt_bandwidth.rt_runtime = rt_runtime; 10349 tg->rt_bandwidth.rt_runtime = rt_runtime;
10185 10350
10186 for_each_possible_cpu(i) { 10351 for_each_possible_cpu(i) {
10187 struct rt_rq *rt_rq = tg->rt_rq[i]; 10352 struct rt_rq *rt_rq = tg->rt_rq[i];
10188 10353
10189 spin_lock(&rt_rq->rt_runtime_lock); 10354 raw_spin_lock(&rt_rq->rt_runtime_lock);
10190 rt_rq->rt_runtime = rt_runtime; 10355 rt_rq->rt_runtime = rt_runtime;
10191 spin_unlock(&rt_rq->rt_runtime_lock); 10356 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10192 } 10357 }
10193 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10358 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10194 unlock: 10359 unlock:
10195 read_unlock(&tasklist_lock); 10360 read_unlock(&tasklist_lock);
10196 mutex_unlock(&rt_constraints_mutex); 10361 mutex_unlock(&rt_constraints_mutex);
@@ -10295,15 +10460,15 @@ static int sched_rt_global_constraints(void)
10295 if (sysctl_sched_rt_runtime == 0) 10460 if (sysctl_sched_rt_runtime == 0)
10296 return -EBUSY; 10461 return -EBUSY;
10297 10462
10298 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10463 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10299 for_each_possible_cpu(i) { 10464 for_each_possible_cpu(i) {
10300 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10465 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10301 10466
10302 spin_lock(&rt_rq->rt_runtime_lock); 10467 raw_spin_lock(&rt_rq->rt_runtime_lock);
10303 rt_rq->rt_runtime = global_rt_runtime(); 10468 rt_rq->rt_runtime = global_rt_runtime();
10304 spin_unlock(&rt_rq->rt_runtime_lock); 10469 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10305 } 10470 }
10306 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 10471 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10307 10472
10308 return 0; 10473 return 0;
10309} 10474}
@@ -10594,9 +10759,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10594 /* 10759 /*
10595 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 10760 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10596 */ 10761 */
10597 spin_lock_irq(&cpu_rq(cpu)->lock); 10762 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10598 data = *cpuusage; 10763 data = *cpuusage;
10599 spin_unlock_irq(&cpu_rq(cpu)->lock); 10764 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10600#else 10765#else
10601 data = *cpuusage; 10766 data = *cpuusage;
10602#endif 10767#endif
@@ -10612,9 +10777,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10612 /* 10777 /*
10613 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 10778 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10614 */ 10779 */
10615 spin_lock_irq(&cpu_rq(cpu)->lock); 10780 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10616 *cpuusage = val; 10781 *cpuusage = val;
10617 spin_unlock_irq(&cpu_rq(cpu)->lock); 10782 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10618#else 10783#else
10619 *cpuusage = val; 10784 *cpuusage = val;
10620#endif 10785#endif
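Both cpuusage accessors keep the long-standing trick of taking the owning CPU's rq->lock (now a raw spinlock) so the 64-bit counter cannot be read or written in two halves on 32-bit SMP; on 64-bit the access is naturally atomic and the locking compiles away. A minimal sketch of the same guard around a hypothetical per-CPU u64; demo_counter and demo_read_counter are not in the tree:

/* Read a per-CPU u64 whose owner updates it under that CPU's rq->lock. */
static DEFINE_PER_CPU(u64, demo_counter);

static u64 demo_read_counter(int cpu)
{
	u64 val;

#ifndef CONFIG_64BIT
	/* 32-bit: the load could tear, so serialize against the writer. */
	raw_spin_lock_irq(&cpu_rq(cpu)->lock);
	val = per_cpu(demo_counter, cpu);
	raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
#else
	val = per_cpu(demo_counter, cpu);
#endif

	return val;
}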
@@ -10848,9 +11013,9 @@ void synchronize_sched_expedited(void)
10848 init_completion(&req->done); 11013 init_completion(&req->done);
10849 req->task = NULL; 11014 req->task = NULL;
10850 req->dest_cpu = RCU_MIGRATION_NEED_QS; 11015 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10851 spin_lock_irqsave(&rq->lock, flags); 11016 raw_spin_lock_irqsave(&rq->lock, flags);
10852 list_add(&req->list, &rq->migration_queue); 11017 list_add(&req->list, &rq->migration_queue);
10853 spin_unlock_irqrestore(&rq->lock, flags); 11018 raw_spin_unlock_irqrestore(&rq->lock, flags);
10854 wake_up_process(rq->migration_thread); 11019 wake_up_process(rq->migration_thread);
10855 } 11020 }
10856 for_each_online_cpu(cpu) { 11021 for_each_online_cpu(cpu) {
@@ -10858,13 +11023,14 @@ void synchronize_sched_expedited(void)
10858 req = &per_cpu(rcu_migration_req, cpu); 11023 req = &per_cpu(rcu_migration_req, cpu);
10859 rq = cpu_rq(cpu); 11024 rq = cpu_rq(cpu);
10860 wait_for_completion(&req->done); 11025 wait_for_completion(&req->done);
10861 spin_lock_irqsave(&rq->lock, flags); 11026 raw_spin_lock_irqsave(&rq->lock, flags);
10862 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 11027 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10863 need_full_sync = 1; 11028 need_full_sync = 1;
10864 req->dest_cpu = RCU_MIGRATION_IDLE; 11029 req->dest_cpu = RCU_MIGRATION_IDLE;
10865 spin_unlock_irqrestore(&rq->lock, flags); 11030 raw_spin_unlock_irqrestore(&rq->lock, flags);
10866 } 11031 }
10867 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 11032 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
11033 synchronize_sched_expedited_count++;
10868 mutex_unlock(&rcu_sched_expedited_mutex); 11034 mutex_unlock(&rcu_sched_expedited_mutex);
10869 put_online_cpus(); 11035 put_online_cpus();
10870 if (need_full_sync) 11036 if (need_full_sync)