path: root/kernel/sched.c
author     Len Brown <len.brown@intel.com>  2009-09-19 00:11:26 -0400
committer  Len Brown <len.brown@intel.com>  2009-09-19 00:11:26 -0400
commit     c602c65b2f81d14456771d1e3f15d1381f4b7efa (patch)
tree       f1f833c8dd6c1519eeb101be32f7fe54a9605af5 /kernel/sched.c
parent     3834f47291df475be3f0f0fb7ccaa098967cc054 (diff)
parent     78f28b7c555359c67c2a0d23f7436e915329421e (diff)
Merge branch 'linus' into sfi-release
Conflicts:
        arch/x86/kernel/setup.c
        drivers/acpi/power.c
        init/main.c

Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  1632
1 file changed, 953 insertions(+), 679 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..faf4d463bbff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -64,7 +64,6 @@
64#include <linux/tsacct_kern.h> 64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 65#include <linux/kprobes.h>
66#include <linux/delayacct.h> 66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h> 67#include <linux/unistd.h>
69#include <linux/pagemap.h> 68#include <linux/pagemap.h>
70#include <linux/hrtimer.h> 69#include <linux/hrtimer.h>
@@ -120,30 +119,6 @@
120 */ 119 */
121#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
122 121
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126
127/*
128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
129 * Since cpu_power is a 'constant', we can use a reciprocal divide.
130 */
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136/*
137 * Each time a sched group cpu_power is changed,
138 * we must compute its reciprocal value
139 */
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
148{ 123{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
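The block removed above, together with the linux/reciprocal_div.h include dropped in the first hunk, is the old reciprocal-divide machinery: because a sched group's cpu_power changes rarely, dividing by it on every balance pass was turned into a multiply with a precomputed reciprocal. Later hunks now divide by group->cpu_power directly, so the helpers become dead code. A minimal user-space sketch of the reciprocal-divide idea, using the common 32.32 fixed-point formulation (illustrative only, not the kernel's exact implementation):

    #include <stdint.h>
    #include <stdio.h>

    /* Precompute roughly 2^32 / d once for a rarely-changing divisor d. */
    static uint32_t reciprocal_value(uint32_t d)
    {
        return (uint32_t)(((1ULL << 32) + d - 1) / d);
    }

    /* Divide by d using only a multiply and a shift. */
    static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
    {
        return (uint32_t)(((uint64_t)a * r) >> 32);
    }

    int main(void)
    {
        uint32_t cpu_power = 1024;                  /* SCHED_LOAD_SCALE-like value */
        uint32_t recip = reciprocal_value(cpu_power);
        uint32_t load = 4096;

        printf("plain: %u  reciprocal: %u\n",
               load / cpu_power, reciprocal_divide(load, recip));
        return 0;
    }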
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user)
309 284
310/* 285/*
311 * Root task group. 286 * Root task group.
312 * Every UID task group (including init_task_group aka UID-0) will 287 * Every UID task group (including init_task_group aka UID-0) will
313 * be a child to this group. 288 * be a child to this group.
314 */ 289 */
315struct task_group root_task_group; 290struct task_group root_task_group;
316 291
@@ -318,12 +293,12 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 293/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 295/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 297#endif /* CONFIG_FAIR_GROUP_SCHED */
323 298
324#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
326static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
327#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
328#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
329#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
401 376
402#else 377#else
403 378
404#ifdef CONFIG_SMP
405static int root_task_group_empty(void)
406{
407 return 1;
408}
409#endif
410
411static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 379static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
412static inline struct task_group *task_group(struct task_struct *p) 380static inline struct task_group *task_group(struct task_struct *p)
413{ 381{
@@ -537,14 +505,6 @@ struct root_domain {
537#ifdef CONFIG_SMP 505#ifdef CONFIG_SMP
538 struct cpupri cpupri; 506 struct cpupri cpupri;
539#endif 507#endif
540#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
541 /*
542 * Preferred wake up cpu nominated by sched_mc balance that will be
543 * used when most cpus are idle in the system indicating overall very
544 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
545 */
546 unsigned int sched_mc_preferred_wakeup_cpu;
547#endif
548}; 508};
549 509
550/* 510/*
@@ -616,6 +576,7 @@ struct rq {
616 576
617 unsigned char idle_at_tick; 577 unsigned char idle_at_tick;
618 /* For active balancing */ 578 /* For active balancing */
579 int post_schedule;
619 int active_balance; 580 int active_balance;
620 int push_cpu; 581 int push_cpu;
621 /* cpu of this runqueue: */ 582 /* cpu of this runqueue: */
@@ -626,6 +587,9 @@ struct rq {
626 587
627 struct task_struct *migration_thread; 588 struct task_struct *migration_thread;
628 struct list_head migration_queue; 589 struct list_head migration_queue;
590
591 u64 rt_avg;
592 u64 age_stamp;
629#endif 593#endif
630 594
631 /* calc_load related fields */ 595 /* calc_load related fields */
@@ -665,9 +629,10 @@ struct rq {
665 629
666static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 630static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
667 631
668static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 632static inline
633void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
669{ 634{
670 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 635 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
671} 636}
672 637
673static inline int cpu_of(struct rq *rq) 638static inline int cpu_of(struct rq *rq)
@@ -693,6 +658,7 @@ static inline int cpu_of(struct rq *rq)
693#define this_rq() (&__get_cpu_var(runqueues)) 658#define this_rq() (&__get_cpu_var(runqueues))
694#define task_rq(p) cpu_rq(task_cpu(p)) 659#define task_rq(p) cpu_rq(task_cpu(p))
695#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 660#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
661#define raw_rq() (&__raw_get_cpu_var(runqueues))
696 662
697inline void update_rq_clock(struct rq *rq) 663inline void update_rq_clock(struct rq *rq)
698{ 664{
@@ -861,6 +827,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
861unsigned int sysctl_sched_shares_thresh = 4; 827unsigned int sysctl_sched_shares_thresh = 4;
862 828
863/* 829/*
830 * period over which we average the RT time consumption, measured
831 * in ms.
832 *
833 * default: 1s
834 */
835const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
836
837/*
864 * period over which we measure -rt task cpu usage in us. 838 * period over which we measure -rt task cpu usage in us.
865 * default: 1s 839 * default: 1s
866 */ 840 */
@@ -1278,12 +1252,37 @@ void wake_up_idle_cpu(int cpu)
1278} 1252}
1279#endif /* CONFIG_NO_HZ */ 1253#endif /* CONFIG_NO_HZ */
1280 1254
1255static u64 sched_avg_period(void)
1256{
1257 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1258}
1259
1260static void sched_avg_update(struct rq *rq)
1261{
1262 s64 period = sched_avg_period();
1263
1264 while ((s64)(rq->clock - rq->age_stamp) > period) {
1265 rq->age_stamp += period;
1266 rq->rt_avg /= 2;
1267 }
1268}
1269
1270static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1271{
1272 rq->rt_avg += rt_delta;
1273 sched_avg_update(rq);
1274}
1275
1281#else /* !CONFIG_SMP */ 1276#else /* !CONFIG_SMP */
1282static void resched_task(struct task_struct *p) 1277static void resched_task(struct task_struct *p)
1283{ 1278{
1284 assert_spin_locked(&task_rq(p)->lock); 1279 assert_spin_locked(&task_rq(p)->lock);
1285 set_tsk_need_resched(p); 1280 set_tsk_need_resched(p);
1286} 1281}
1282
1283static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1284{
1285}
1287#endif /* CONFIG_SMP */ 1286#endif /* CONFIG_SMP */
1288 1287
1289#if BITS_PER_LONG == 32 1288#if BITS_PER_LONG == 32
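The new rt_avg/age_stamp fields give each runqueue a decaying measure of recent real-time CPU consumption: rt_delta accumulates into rt_avg, and sched_avg_update() halves the total once per elapsed half-period of sysctl_sched_time_avg (1s by default). This average later feeds scale_rt_power() to shrink the CPU power seen by the fair-class load balancer. A rough user-space model of the decay, with hypothetical struct and field names mirroring the hunk above:

    #include <stdint.h>
    #include <stdio.h>

    #define NSEC_PER_MSEC 1000000ULL
    static const uint64_t sched_time_avg_ms = 1000;   /* default: 1s */

    struct rq_model {
        uint64_t clock;      /* current time, ns */
        uint64_t age_stamp;  /* start of the current averaging window */
        uint64_t rt_avg;     /* decayed RT runtime, ns */
    };

    static uint64_t sched_avg_period(void)
    {
        return sched_time_avg_ms * NSEC_PER_MSEC / 2;
    }

    static void sched_avg_update(struct rq_model *rq)
    {
        uint64_t period = sched_avg_period();

        /* Halve rt_avg once per elapsed half-period. */
        while (rq->clock - rq->age_stamp > period) {
            rq->age_stamp += period;
            rq->rt_avg /= 2;
        }
    }

    static void sched_rt_avg_update(struct rq_model *rq, uint64_t rt_delta)
    {
        rq->rt_avg += rt_delta;
        sched_avg_update(rq);
    }

    int main(void)
    {
        struct rq_model rq = { .clock = 0, .age_stamp = 0, .rt_avg = 0 };

        rq.clock = 200 * NSEC_PER_MSEC;
        sched_rt_avg_update(&rq, 100 * NSEC_PER_MSEC);  /* 100ms of RT work */
        rq.clock = 1200 * NSEC_PER_MSEC;                /* two half-periods later */
        sched_avg_update(&rq);
        printf("rt_avg now %llu ms\n",
               (unsigned long long)(rq.rt_avg / NSEC_PER_MSEC));
        return 0;
    }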
@@ -1494,8 +1493,65 @@ static int tg_nop(struct task_group *tg, void *data)
1494#endif 1493#endif
1495 1494
1496#ifdef CONFIG_SMP 1495#ifdef CONFIG_SMP
1497static unsigned long source_load(int cpu, int type); 1496/* Used instead of source_load when we know the type == 0 */
1498static unsigned long target_load(int cpu, int type); 1497static unsigned long weighted_cpuload(const int cpu)
1498{
1499 return cpu_rq(cpu)->load.weight;
1500}
1501
1502/*
1503 * Return a low guess at the load of a migration-source cpu weighted
1504 * according to the scheduling class and "nice" value.
1505 *
1506 * We want to under-estimate the load of migration sources, to
1507 * balance conservatively.
1508 */
1509static unsigned long source_load(int cpu, int type)
1510{
1511 struct rq *rq = cpu_rq(cpu);
1512 unsigned long total = weighted_cpuload(cpu);
1513
1514 if (type == 0 || !sched_feat(LB_BIAS))
1515 return total;
1516
1517 return min(rq->cpu_load[type-1], total);
1518}
1519
1520/*
1521 * Return a high guess at the load of a migration-target cpu weighted
1522 * according to the scheduling class and "nice" value.
1523 */
1524static unsigned long target_load(int cpu, int type)
1525{
1526 struct rq *rq = cpu_rq(cpu);
1527 unsigned long total = weighted_cpuload(cpu);
1528
1529 if (type == 0 || !sched_feat(LB_BIAS))
1530 return total;
1531
1532 return max(rq->cpu_load[type-1], total);
1533}
1534
1535static struct sched_group *group_of(int cpu)
1536{
1537 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1538
1539 if (!sd)
1540 return NULL;
1541
1542 return sd->groups;
1543}
1544
1545static unsigned long power_of(int cpu)
1546{
1547 struct sched_group *group = group_of(cpu);
1548
1549 if (!group)
1550 return SCHED_LOAD_SCALE;
1551
1552 return group->cpu_power;
1553}
1554
1499static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1555static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1500 1556
1501static unsigned long cpu_avg_load_per_task(int cpu) 1557static unsigned long cpu_avg_load_per_task(int cpu)
@@ -1513,28 +1569,35 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1513 1569
1514#ifdef CONFIG_FAIR_GROUP_SCHED 1570#ifdef CONFIG_FAIR_GROUP_SCHED
1515 1571
1572struct update_shares_data {
1573 unsigned long rq_weight[NR_CPUS];
1574};
1575
1576static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
1577
1516static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1578static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1517 1579
1518/* 1580/*
1519 * Calculate and set the cpu's group shares. 1581 * Calculate and set the cpu's group shares.
1520 */ 1582 */
1521static void 1583static void update_group_shares_cpu(struct task_group *tg, int cpu,
1522update_group_shares_cpu(struct task_group *tg, int cpu, 1584 unsigned long sd_shares,
1523 unsigned long sd_shares, unsigned long sd_rq_weight) 1585 unsigned long sd_rq_weight,
1586 struct update_shares_data *usd)
1524{ 1587{
1525 unsigned long shares; 1588 unsigned long shares, rq_weight;
1526 unsigned long rq_weight; 1589 int boost = 0;
1527
1528 if (!tg->se[cpu])
1529 return;
1530 1590
1531 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1591 rq_weight = usd->rq_weight[cpu];
1592 if (!rq_weight) {
1593 boost = 1;
1594 rq_weight = NICE_0_LOAD;
1595 }
1532 1596
1533 /* 1597 /*
1534 * \Sum shares * rq_weight 1598 * \Sum_j shares_j * rq_weight_i
1535 * shares = ----------------------- 1599 * shares_i = -----------------------------
1536 * \Sum rq_weight 1600 * \Sum_j rq_weight_j
1537 *
1538 */ 1601 */
1539 shares = (sd_shares * rq_weight) / sd_rq_weight; 1602 shares = (sd_shares * rq_weight) / sd_rq_weight;
1540 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1603 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1608,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1545 unsigned long flags; 1608 unsigned long flags;
1546 1609
1547 spin_lock_irqsave(&rq->lock, flags); 1610 spin_lock_irqsave(&rq->lock, flags);
1548 tg->cfs_rq[cpu]->shares = shares; 1611 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1549 1612 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1550 __set_se_shares(tg->se[cpu], shares); 1613 __set_se_shares(tg->se[cpu], shares);
1551 spin_unlock_irqrestore(&rq->lock, flags); 1614 spin_unlock_irqrestore(&rq->lock, flags);
1552 } 1615 }
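update_group_shares_cpu() distributes a group's shares across CPUs in proportion to per-CPU runqueue weight: shares_i = sd_shares * rq_weight_i / \Sum_j rq_weight_j, clamped to [MIN_SHARES, MAX_SHARES], with an idle CPU treated as carrying NICE_0_LOAD (the "boost" case) so a task waking there is not starved. A small stand-alone sketch of that proportioning (constants are illustrative, user-space only):

    #include <stdio.h>

    #define MIN_SHARES   2UL            /* illustrative bounds */
    #define MAX_SHARES   (1UL << 18)
    #define NICE_0_LOAD  1024UL

    static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    /* shares_i = sd_shares * rq_weight_i / sum_j rq_weight_j */
    static unsigned long group_shares_cpu(unsigned long sd_shares,
                                          unsigned long rq_weight,
                                          unsigned long sd_rq_weight)
    {
        if (!rq_weight)
            rq_weight = NICE_0_LOAD;    /* "boost": pretend one average task */

        return clamp_ul(sd_shares * rq_weight / sd_rq_weight,
                        MIN_SHARES, MAX_SHARES);
    }

    int main(void)
    {
        unsigned long w[3] = { 2048, 1024, 0 };          /* per-CPU rq weights */
        unsigned long total = 2048 + 1024 + NICE_0_LOAD; /* idle CPU counted as NICE_0 */

        for (int i = 0; i < 3; i++)
            printf("cpu%d shares = %lu\n", i, group_shares_cpu(1024, w[i], total));
        return 0;
    }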
@@ -1559,22 +1622,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1559 */ 1622 */
1560static int tg_shares_up(struct task_group *tg, void *data) 1623static int tg_shares_up(struct task_group *tg, void *data)
1561{ 1624{
1562 unsigned long weight, rq_weight = 0; 1625 unsigned long weight, rq_weight = 0, shares = 0;
1563 unsigned long shares = 0; 1626 struct update_shares_data *usd;
1564 struct sched_domain *sd = data; 1627 struct sched_domain *sd = data;
1628 unsigned long flags;
1565 int i; 1629 int i;
1566 1630
1631 if (!tg->se[0])
1632 return 0;
1633
1634 local_irq_save(flags);
1635 usd = &__get_cpu_var(update_shares_data);
1636
1567 for_each_cpu(i, sched_domain_span(sd)) { 1637 for_each_cpu(i, sched_domain_span(sd)) {
1638 weight = tg->cfs_rq[i]->load.weight;
1639 usd->rq_weight[i] = weight;
1640
1568 /* 1641 /*
1569 * If there are currently no tasks on the cpu pretend there 1642 * If there are currently no tasks on the cpu pretend there
1570 * is one of average load so that when a new task gets to 1643 * is one of average load so that when a new task gets to
1571 * run here it will not get delayed by group starvation. 1644 * run here it will not get delayed by group starvation.
1572 */ 1645 */
1573 weight = tg->cfs_rq[i]->load.weight;
1574 if (!weight) 1646 if (!weight)
1575 weight = NICE_0_LOAD; 1647 weight = NICE_0_LOAD;
1576 1648
1577 tg->cfs_rq[i]->rq_weight = weight;
1578 rq_weight += weight; 1649 rq_weight += weight;
1579 shares += tg->cfs_rq[i]->shares; 1650 shares += tg->cfs_rq[i]->shares;
1580 } 1651 }
@@ -1586,7 +1657,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
1586 shares = tg->shares; 1657 shares = tg->shares;
1587 1658
1588 for_each_cpu(i, sched_domain_span(sd)) 1659 for_each_cpu(i, sched_domain_span(sd))
1589 update_group_shares_cpu(tg, i, shares, rq_weight); 1660 update_group_shares_cpu(tg, i, shares, rq_weight, usd);
1661
1662 local_irq_restore(flags);
1590 1663
1591 return 0; 1664 return 0;
1592} 1665}
@@ -1616,8 +1689,14 @@ static int tg_load_down(struct task_group *tg, void *data)
1616 1689
1617static void update_shares(struct sched_domain *sd) 1690static void update_shares(struct sched_domain *sd)
1618{ 1691{
1619 u64 now = cpu_clock(raw_smp_processor_id()); 1692 s64 elapsed;
1620 s64 elapsed = now - sd->last_update; 1693 u64 now;
1694
1695 if (root_task_group_empty())
1696 return;
1697
1698 now = cpu_clock(raw_smp_processor_id());
1699 elapsed = now - sd->last_update;
1621 1700
1622 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1701 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1623 sd->last_update = now; 1702 sd->last_update = now;
@@ -1627,6 +1706,9 @@ static void update_shares(struct sched_domain *sd)
1627 1706
1628static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1707static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1629{ 1708{
1709 if (root_task_group_empty())
1710 return;
1711
1630 spin_unlock(&rq->lock); 1712 spin_unlock(&rq->lock);
1631 update_shares(sd); 1713 update_shares(sd);
1632 spin_lock(&rq->lock); 1714 spin_lock(&rq->lock);
@@ -1634,6 +1716,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1634 1716
1635static void update_h_load(long cpu) 1717static void update_h_load(long cpu)
1636{ 1718{
1719 if (root_task_group_empty())
1720 return;
1721
1637 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1722 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1638} 1723}
1639 1724
@@ -1651,6 +1736,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1651 1736
1652#ifdef CONFIG_PREEMPT 1737#ifdef CONFIG_PREEMPT
1653 1738
1739static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1740
1654/* 1741/*
1655 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1742 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1656 * way at the expense of forcing extra atomic operations in all 1743 * way at the expense of forcing extra atomic operations in all
@@ -1915,13 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1915} 2002}
1916 2003
1917#ifdef CONFIG_SMP 2004#ifdef CONFIG_SMP
1918
1919/* Used instead of source_load when we know the type == 0 */
1920static unsigned long weighted_cpuload(const int cpu)
1921{
1922 return cpu_rq(cpu)->load.weight;
1923}
1924
1925/* 2005/*
1926 * Is this task likely cache-hot: 2006 * Is this task likely cache-hot:
1927 */ 2007 */
@@ -2195,186 +2275,6 @@ void kick_process(struct task_struct *p)
2195 preempt_enable(); 2275 preempt_enable();
2196} 2276}
2197EXPORT_SYMBOL_GPL(kick_process); 2277EXPORT_SYMBOL_GPL(kick_process);
2198
2199/*
2200 * Return a low guess at the load of a migration-source cpu weighted
2201 * according to the scheduling class and "nice" value.
2202 *
2203 * We want to under-estimate the load of migration sources, to
2204 * balance conservatively.
2205 */
2206static unsigned long source_load(int cpu, int type)
2207{
2208 struct rq *rq = cpu_rq(cpu);
2209 unsigned long total = weighted_cpuload(cpu);
2210
2211 if (type == 0 || !sched_feat(LB_BIAS))
2212 return total;
2213
2214 return min(rq->cpu_load[type-1], total);
2215}
2216
2217/*
2218 * Return a high guess at the load of a migration-target cpu weighted
2219 * according to the scheduling class and "nice" value.
2220 */
2221static unsigned long target_load(int cpu, int type)
2222{
2223 struct rq *rq = cpu_rq(cpu);
2224 unsigned long total = weighted_cpuload(cpu);
2225
2226 if (type == 0 || !sched_feat(LB_BIAS))
2227 return total;
2228
2229 return max(rq->cpu_load[type-1], total);
2230}
2231
2232/*
2233 * find_idlest_group finds and returns the least busy CPU group within the
2234 * domain.
2235 */
2236static struct sched_group *
2237find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2238{
2239 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2240 unsigned long min_load = ULONG_MAX, this_load = 0;
2241 int load_idx = sd->forkexec_idx;
2242 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2243
2244 do {
2245 unsigned long load, avg_load;
2246 int local_group;
2247 int i;
2248
2249 /* Skip over this group if it has no CPUs allowed */
2250 if (!cpumask_intersects(sched_group_cpus(group),
2251 &p->cpus_allowed))
2252 continue;
2253
2254 local_group = cpumask_test_cpu(this_cpu,
2255 sched_group_cpus(group));
2256
2257 /* Tally up the load of all CPUs in the group */
2258 avg_load = 0;
2259
2260 for_each_cpu(i, sched_group_cpus(group)) {
2261 /* Bias balancing toward cpus of our domain */
2262 if (local_group)
2263 load = source_load(i, load_idx);
2264 else
2265 load = target_load(i, load_idx);
2266
2267 avg_load += load;
2268 }
2269
2270 /* Adjust by relative CPU power of the group */
2271 avg_load = sg_div_cpu_power(group,
2272 avg_load * SCHED_LOAD_SCALE);
2273
2274 if (local_group) {
2275 this_load = avg_load;
2276 this = group;
2277 } else if (avg_load < min_load) {
2278 min_load = avg_load;
2279 idlest = group;
2280 }
2281 } while (group = group->next, group != sd->groups);
2282
2283 if (!idlest || 100*this_load < imbalance*min_load)
2284 return NULL;
2285 return idlest;
2286}
2287
2288/*
2289 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2290 */
2291static int
2292find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2293{
2294 unsigned long load, min_load = ULONG_MAX;
2295 int idlest = -1;
2296 int i;
2297
2298 /* Traverse only the allowed CPUs */
2299 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2300 load = weighted_cpuload(i);
2301
2302 if (load < min_load || (load == min_load && i == this_cpu)) {
2303 min_load = load;
2304 idlest = i;
2305 }
2306 }
2307
2308 return idlest;
2309}
2310
2311/*
2312 * sched_balance_self: balance the current task (running on cpu) in domains
2313 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2314 * SD_BALANCE_EXEC.
2315 *
2316 * Balance, ie. select the least loaded group.
2317 *
2318 * Returns the target CPU number, or the same CPU if no balancing is needed.
2319 *
2320 * preempt must be disabled.
2321 */
2322static int sched_balance_self(int cpu, int flag)
2323{
2324 struct task_struct *t = current;
2325 struct sched_domain *tmp, *sd = NULL;
2326
2327 for_each_domain(cpu, tmp) {
2328 /*
2329 * If power savings logic is enabled for a domain, stop there.
2330 */
2331 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2332 break;
2333 if (tmp->flags & flag)
2334 sd = tmp;
2335 }
2336
2337 if (sd)
2338 update_shares(sd);
2339
2340 while (sd) {
2341 struct sched_group *group;
2342 int new_cpu, weight;
2343
2344 if (!(sd->flags & flag)) {
2345 sd = sd->child;
2346 continue;
2347 }
2348
2349 group = find_idlest_group(sd, t, cpu);
2350 if (!group) {
2351 sd = sd->child;
2352 continue;
2353 }
2354
2355 new_cpu = find_idlest_cpu(group, t, cpu);
2356 if (new_cpu == -1 || new_cpu == cpu) {
2357 /* Now try balancing at a lower domain level of cpu */
2358 sd = sd->child;
2359 continue;
2360 }
2361
2362 /* Now try balancing at a lower domain level of new_cpu */
2363 cpu = new_cpu;
2364 weight = cpumask_weight(sched_domain_span(sd));
2365 sd = NULL;
2366 for_each_domain(cpu, tmp) {
2367 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2368 break;
2369 if (tmp->flags & flag)
2370 sd = tmp;
2371 }
2372 /* while loop will break here if sd == NULL */
2373 }
2374
2375 return cpu;
2376}
2377
2378#endif /* CONFIG_SMP */ 2278#endif /* CONFIG_SMP */
2379 2279
2380/** 2280/**
@@ -2412,37 +2312,22 @@ void task_oncpu_function_call(struct task_struct *p,
2412 * 2312 *
2413 * returns failure only if the task is already active. 2313 * returns failure only if the task is already active.
2414 */ 2314 */
2415static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2315static int try_to_wake_up(struct task_struct *p, unsigned int state,
2316 int wake_flags)
2416{ 2317{
2417 int cpu, orig_cpu, this_cpu, success = 0; 2318 int cpu, orig_cpu, this_cpu, success = 0;
2418 unsigned long flags; 2319 unsigned long flags;
2419 long old_state;
2420 struct rq *rq; 2320 struct rq *rq;
2421 2321
2422 if (!sched_feat(SYNC_WAKEUPS)) 2322 if (!sched_feat(SYNC_WAKEUPS))
2423 sync = 0; 2323 wake_flags &= ~WF_SYNC;
2424
2425#ifdef CONFIG_SMP
2426 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2427 struct sched_domain *sd;
2428
2429 this_cpu = raw_smp_processor_id();
2430 cpu = task_cpu(p);
2431 2324
2432 for_each_domain(this_cpu, sd) { 2325 this_cpu = get_cpu();
2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2434 update_shares(sd);
2435 break;
2436 }
2437 }
2438 }
2439#endif
2440 2326
2441 smp_wmb(); 2327 smp_wmb();
2442 rq = task_rq_lock(p, &flags); 2328 rq = task_rq_lock(p, &flags);
2443 update_rq_clock(rq); 2329 update_rq_clock(rq);
2444 old_state = p->state; 2330 if (!(p->state & state))
2445 if (!(old_state & state))
2446 goto out; 2331 goto out;
2447 2332
2448 if (p->se.on_rq) 2333 if (p->se.on_rq)
@@ -2450,27 +2335,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2450 2335
2451 cpu = task_cpu(p); 2336 cpu = task_cpu(p);
2452 orig_cpu = cpu; 2337 orig_cpu = cpu;
2453 this_cpu = smp_processor_id();
2454 2338
2455#ifdef CONFIG_SMP 2339#ifdef CONFIG_SMP
2456 if (unlikely(task_running(rq, p))) 2340 if (unlikely(task_running(rq, p)))
2457 goto out_activate; 2341 goto out_activate;
2458 2342
2459 cpu = p->sched_class->select_task_rq(p, sync); 2343 /*
2460 if (cpu != orig_cpu) { 2344 * In order to handle concurrent wakeups and release the rq->lock
2345 * we put the task in TASK_WAKING state.
2346 *
2347 * First fix up the nr_uninterruptible count:
2348 */
2349 if (task_contributes_to_load(p))
2350 rq->nr_uninterruptible--;
2351 p->state = TASK_WAKING;
2352 task_rq_unlock(rq, &flags);
2353
2354 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2355 if (cpu != orig_cpu)
2461 set_task_cpu(p, cpu); 2356 set_task_cpu(p, cpu);
2462 task_rq_unlock(rq, &flags);
2463 /* might preempt at this point */
2464 rq = task_rq_lock(p, &flags);
2465 old_state = p->state;
2466 if (!(old_state & state))
2467 goto out;
2468 if (p->se.on_rq)
2469 goto out_running;
2470 2357
2471 this_cpu = smp_processor_id(); 2358 rq = task_rq_lock(p, &flags);
2472 cpu = task_cpu(p); 2359 WARN_ON(p->state != TASK_WAKING);
2473 } 2360 cpu = task_cpu(p);
2474 2361
2475#ifdef CONFIG_SCHEDSTATS 2362#ifdef CONFIG_SCHEDSTATS
2476 schedstat_inc(rq, ttwu_count); 2363 schedstat_inc(rq, ttwu_count);
@@ -2490,7 +2377,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2490out_activate: 2377out_activate:
2491#endif /* CONFIG_SMP */ 2378#endif /* CONFIG_SMP */
2492 schedstat_inc(p, se.nr_wakeups); 2379 schedstat_inc(p, se.nr_wakeups);
2493 if (sync) 2380 if (wake_flags & WF_SYNC)
2494 schedstat_inc(p, se.nr_wakeups_sync); 2381 schedstat_inc(p, se.nr_wakeups_sync);
2495 if (orig_cpu != cpu) 2382 if (orig_cpu != cpu)
2496 schedstat_inc(p, se.nr_wakeups_migrate); 2383 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2519,7 +2406,7 @@ out_activate:
2519 2406
2520out_running: 2407out_running:
2521 trace_sched_wakeup(rq, p, success); 2408 trace_sched_wakeup(rq, p, success);
2522 check_preempt_curr(rq, p, sync); 2409 check_preempt_curr(rq, p, wake_flags);
2523 2410
2524 p->state = TASK_RUNNING; 2411 p->state = TASK_RUNNING;
2525#ifdef CONFIG_SMP 2412#ifdef CONFIG_SMP
@@ -2528,6 +2415,7 @@ out_running:
2528#endif 2415#endif
2529out: 2416out:
2530 task_rq_unlock(rq, &flags); 2417 task_rq_unlock(rq, &flags);
2418 put_cpu();
2531 2419
2532 return success; 2420 return success;
2533} 2421}
@@ -2570,6 +2458,7 @@ static void __sched_fork(struct task_struct *p)
2570 p->se.avg_overlap = 0; 2458 p->se.avg_overlap = 0;
2571 p->se.start_runtime = 0; 2459 p->se.start_runtime = 0;
2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2460 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2461 p->se.avg_running = 0;
2573 2462
2574#ifdef CONFIG_SCHEDSTATS 2463#ifdef CONFIG_SCHEDSTATS
2575 p->se.wait_start = 0; 2464 p->se.wait_start = 0;
@@ -2631,18 +2520,41 @@ void sched_fork(struct task_struct *p, int clone_flags)
2631 2520
2632 __sched_fork(p); 2521 __sched_fork(p);
2633 2522
2634#ifdef CONFIG_SMP
2635 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
2636#endif
2637 set_task_cpu(p, cpu);
2638
2639 /* 2523 /*
2640 * Make sure we do not leak PI boosting priority to the child: 2524 * Make sure we do not leak PI boosting priority to the child.
2641 */ 2525 */
2642 p->prio = current->normal_prio; 2526 p->prio = current->normal_prio;
2527
2528 /*
2529 * Revert to default priority/policy on fork if requested.
2530 */
2531 if (unlikely(p->sched_reset_on_fork)) {
2532 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
2533 p->policy = SCHED_NORMAL;
2534
2535 if (p->normal_prio < DEFAULT_PRIO)
2536 p->prio = DEFAULT_PRIO;
2537
2538 if (PRIO_TO_NICE(p->static_prio) < 0) {
2539 p->static_prio = NICE_TO_PRIO(0);
2540 set_load_weight(p);
2541 }
2542
2543 /*
2544 * We don't need the reset flag anymore after the fork. It has
2545 * fulfilled its duty:
2546 */
2547 p->sched_reset_on_fork = 0;
2548 }
2549
2643 if (!rt_prio(p->prio)) 2550 if (!rt_prio(p->prio))
2644 p->sched_class = &fair_sched_class; 2551 p->sched_class = &fair_sched_class;
2645 2552
2553#ifdef CONFIG_SMP
2554 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2555#endif
2556 set_task_cpu(p, cpu);
2557
2646#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2558#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2647 if (likely(sched_info_on())) 2559 if (likely(sched_info_on()))
2648 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2560 memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2688,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2688 inc_nr_running(rq); 2600 inc_nr_running(rq);
2689 } 2601 }
2690 trace_sched_wakeup_new(rq, p, 1); 2602 trace_sched_wakeup_new(rq, p, 1);
2691 check_preempt_curr(rq, p, 0); 2603 check_preempt_curr(rq, p, WF_FORK);
2692#ifdef CONFIG_SMP 2604#ifdef CONFIG_SMP
2693 if (p->sched_class->task_wake_up) 2605 if (p->sched_class->task_wake_up)
2694 p->sched_class->task_wake_up(rq, p); 2606 p->sched_class->task_wake_up(rq, p);
@@ -2796,12 +2708,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2796{ 2708{
2797 struct mm_struct *mm = rq->prev_mm; 2709 struct mm_struct *mm = rq->prev_mm;
2798 long prev_state; 2710 long prev_state;
2799#ifdef CONFIG_SMP
2800 int post_schedule = 0;
2801
2802 if (current->sched_class->needs_post_schedule)
2803 post_schedule = current->sched_class->needs_post_schedule(rq);
2804#endif
2805 2711
2806 rq->prev_mm = NULL; 2712 rq->prev_mm = NULL;
2807 2713
@@ -2820,10 +2726,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2820 finish_arch_switch(prev); 2726 finish_arch_switch(prev);
2821 perf_counter_task_sched_in(current, cpu_of(rq)); 2727 perf_counter_task_sched_in(current, cpu_of(rq));
2822 finish_lock_switch(rq, prev); 2728 finish_lock_switch(rq, prev);
2823#ifdef CONFIG_SMP
2824 if (post_schedule)
2825 current->sched_class->post_schedule(rq);
2826#endif
2827 2729
2828 fire_sched_in_preempt_notifiers(current); 2730 fire_sched_in_preempt_notifiers(current);
2829 if (mm) 2731 if (mm)
@@ -2838,6 +2740,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2838 } 2740 }
2839} 2741}
2840 2742
2743#ifdef CONFIG_SMP
2744
2745/* assumes rq->lock is held */
2746static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2747{
2748 if (prev->sched_class->pre_schedule)
2749 prev->sched_class->pre_schedule(rq, prev);
2750}
2751
2752/* rq->lock is NOT held, but preemption is disabled */
2753static inline void post_schedule(struct rq *rq)
2754{
2755 if (rq->post_schedule) {
2756 unsigned long flags;
2757
2758 spin_lock_irqsave(&rq->lock, flags);
2759 if (rq->curr->sched_class->post_schedule)
2760 rq->curr->sched_class->post_schedule(rq);
2761 spin_unlock_irqrestore(&rq->lock, flags);
2762
2763 rq->post_schedule = 0;
2764 }
2765}
2766
2767#else
2768
2769static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2770{
2771}
2772
2773static inline void post_schedule(struct rq *rq)
2774{
2775}
2776
2777#endif
2778
2841/** 2779/**
2842 * schedule_tail - first thing a freshly forked thread must call. 2780 * schedule_tail - first thing a freshly forked thread must call.
2843 * @prev: the thread we just switched away from. 2781 * @prev: the thread we just switched away from.
@@ -2848,6 +2786,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2848 struct rq *rq = this_rq(); 2786 struct rq *rq = this_rq();
2849 2787
2850 finish_task_switch(rq, prev); 2788 finish_task_switch(rq, prev);
2789
2790 /*
2791 * FIXME: do we need to worry about rq being invalidated by the
2792 * task_switch?
2793 */
2794 post_schedule(rq);
2795
2851#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2796#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2852 /* In this case, finish_task_switch does not reenable preemption */ 2797 /* In this case, finish_task_switch does not reenable preemption */
2853 preempt_enable(); 2798 preempt_enable();
@@ -3164,7 +3109,7 @@ out:
3164void sched_exec(void) 3109void sched_exec(void)
3165{ 3110{
3166 int new_cpu, this_cpu = get_cpu(); 3111 int new_cpu, this_cpu = get_cpu();
3167 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3112 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3168 put_cpu(); 3113 put_cpu();
3169 if (new_cpu != this_cpu) 3114 if (new_cpu != this_cpu)
3170 sched_migrate_task(current, new_cpu); 3115 sched_migrate_task(current, new_cpu);
@@ -3379,9 +3324,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3379{ 3324{
3380 const struct sched_class *class; 3325 const struct sched_class *class;
3381 3326
3382 for (class = sched_class_highest; class; class = class->next) 3327 for_each_class(class) {
3383 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3328 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3384 return 1; 3329 return 1;
3330 }
3385 3331
3386 return 0; 3332 return 0;
3387} 3333}
@@ -3544,7 +3490,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3544 * capacity but still has some space to pick up some load 3490 * capacity but still has some space to pick up some load
3545 * from other group and save more power 3491 * from other group and save more power
3546 */ 3492 */
3547 if (sgs->sum_nr_running > sgs->group_capacity - 1) 3493 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3548 return; 3494 return;
3549 3495
3550 if (sgs->sum_nr_running > sds->leader_nr_running || 3496 if (sgs->sum_nr_running > sds->leader_nr_running ||
@@ -3583,11 +3529,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3583 *imbalance = sds->min_load_per_task; 3529 *imbalance = sds->min_load_per_task;
3584 sds->busiest = sds->group_min; 3530 sds->busiest = sds->group_min;
3585 3531
3586 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3587 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3588 group_first_cpu(sds->group_leader);
3589 }
3590
3591 return 1; 3532 return 1;
3592 3533
3593} 3534}
@@ -3612,6 +3553,102 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3612#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3553#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3613 3554
3614 3555
3556unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3557{
3558 return SCHED_LOAD_SCALE;
3559}
3560
3561unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3562{
3563 return default_scale_freq_power(sd, cpu);
3564}
3565
3566unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3567{
3568 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3569 unsigned long smt_gain = sd->smt_gain;
3570
3571 smt_gain /= weight;
3572
3573 return smt_gain;
3574}
3575
3576unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3577{
3578 return default_scale_smt_power(sd, cpu);
3579}
3580
3581unsigned long scale_rt_power(int cpu)
3582{
3583 struct rq *rq = cpu_rq(cpu);
3584 u64 total, available;
3585
3586 sched_avg_update(rq);
3587
3588 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3589 available = total - rq->rt_avg;
3590
3591 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3592 total = SCHED_LOAD_SCALE;
3593
3594 total >>= SCHED_LOAD_SHIFT;
3595
3596 return div_u64(available, total);
3597}
3598
3599static void update_cpu_power(struct sched_domain *sd, int cpu)
3600{
3601 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3602 unsigned long power = SCHED_LOAD_SCALE;
3603 struct sched_group *sdg = sd->groups;
3604
3605 if (sched_feat(ARCH_POWER))
3606 power *= arch_scale_freq_power(sd, cpu);
3607 else
3608 power *= default_scale_freq_power(sd, cpu);
3609
3610 power >>= SCHED_LOAD_SHIFT;
3611
3612 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3613 if (sched_feat(ARCH_POWER))
3614 power *= arch_scale_smt_power(sd, cpu);
3615 else
3616 power *= default_scale_smt_power(sd, cpu);
3617
3618 power >>= SCHED_LOAD_SHIFT;
3619 }
3620
3621 power *= scale_rt_power(cpu);
3622 power >>= SCHED_LOAD_SHIFT;
3623
3624 if (!power)
3625 power = 1;
3626
3627 sdg->cpu_power = power;
3628}
3629
3630static void update_group_power(struct sched_domain *sd, int cpu)
3631{
3632 struct sched_domain *child = sd->child;
3633 struct sched_group *group, *sdg = sd->groups;
3634 unsigned long power;
3635
3636 if (!child) {
3637 update_cpu_power(sd, cpu);
3638 return;
3639 }
3640
3641 power = 0;
3642
3643 group = child->groups;
3644 do {
3645 power += group->cpu_power;
3646 group = group->next;
3647 } while (group != child->groups);
3648
3649 sdg->cpu_power = power;
3650}
3651
3615/** 3652/**
3616 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3653 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3617 * @group: sched_group whose statistics are to be updated. 3654 * @group: sched_group whose statistics are to be updated.
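update_cpu_power() replaces the old __cpu_power bookkeeping: a CPU's effective power starts at SCHED_LOAD_SCALE and is successively scaled by the frequency factor, the SMT factor (smt_gain spread over the siblings, only when SD_SHARE_CPUPOWER applies), and the fraction of time left over after real-time work (scale_rt_power()), each step being a multiply followed by a >> SCHED_LOAD_SHIFT renormalisation. A compact user-space rendering of that composition, with fixed-point factor values chosen purely for illustration:

    #include <stdio.h>

    #define SCHED_LOAD_SHIFT 10
    #define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

    /* Apply one fixed-point scale factor (factor is in units of SCHED_LOAD_SCALE). */
    static unsigned long scale(unsigned long power, unsigned long factor)
    {
        return (power * factor) >> SCHED_LOAD_SHIFT;
    }

    int main(void)
    {
        unsigned long power = SCHED_LOAD_SCALE;

        unsigned long freq_factor = SCHED_LOAD_SCALE;   /* arch_scale_freq_power(): no scaling */
        unsigned long smt_factor  = 1178 / 2;           /* smt_gain spread over 2 siblings */
        unsigned long rt_factor   = 900;                /* ~88% of time left for CFS */

        power = scale(power, freq_factor);
        power = scale(power, smt_factor);
        power = scale(power, rt_factor);
        if (!power)
            power = 1;                                  /* never advertise zero power */

        printf("effective cpu_power = %lu\n", power);
        return 0;
    }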
@@ -3624,7 +3661,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 * @balance: Should we balance. 3661 * @balance: Should we balance.
3625 * @sgs: variable to hold the statistics for this group. 3662 * @sgs: variable to hold the statistics for this group.
3626 */ 3663 */
3627static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, 3664static inline void update_sg_lb_stats(struct sched_domain *sd,
3665 struct sched_group *group, int this_cpu,
3628 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3666 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3629 int local_group, const struct cpumask *cpus, 3667 int local_group, const struct cpumask *cpus,
3630 int *balance, struct sg_lb_stats *sgs) 3668 int *balance, struct sg_lb_stats *sgs)
@@ -3635,8 +3673,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3635 unsigned long sum_avg_load_per_task; 3673 unsigned long sum_avg_load_per_task;
3636 unsigned long avg_load_per_task; 3674 unsigned long avg_load_per_task;
3637 3675
3638 if (local_group) 3676 if (local_group) {
3639 balance_cpu = group_first_cpu(group); 3677 balance_cpu = group_first_cpu(group);
3678 if (balance_cpu == this_cpu)
3679 update_group_power(sd, this_cpu);
3680 }
3640 3681
3641 /* Tally up the load of all CPUs in the group */ 3682 /* Tally up the load of all CPUs in the group */
3642 sum_avg_load_per_task = avg_load_per_task = 0; 3683 sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3685,8 +3726,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3685 } 3726 }
3686 3727
3687 /* Adjust by relative CPU power of the group */ 3728 /* Adjust by relative CPU power of the group */
3688 sgs->avg_load = sg_div_cpu_power(group, 3729 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3689 sgs->group_load * SCHED_LOAD_SCALE);
3690 3730
3691 3731
3692 /* 3732 /*
@@ -3698,14 +3738,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3698 * normalized nr_running number somewhere that negates 3738 * normalized nr_running number somewhere that negates
3699 * the hierarchy? 3739 * the hierarchy?
3700 */ 3740 */
3701 avg_load_per_task = sg_div_cpu_power(group, 3741 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3702 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3742 group->cpu_power;
3703 3743
3704 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3744 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3705 sgs->group_imb = 1; 3745 sgs->group_imb = 1;
3706 3746
3707 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3747 sgs->group_capacity =
3708 3748 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3709} 3749}
3710 3750
3711/** 3751/**
@@ -3723,9 +3763,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3723 const struct cpumask *cpus, int *balance, 3763 const struct cpumask *cpus, int *balance,
3724 struct sd_lb_stats *sds) 3764 struct sd_lb_stats *sds)
3725{ 3765{
3766 struct sched_domain *child = sd->child;
3726 struct sched_group *group = sd->groups; 3767 struct sched_group *group = sd->groups;
3727 struct sg_lb_stats sgs; 3768 struct sg_lb_stats sgs;
3728 int load_idx; 3769 int load_idx, prefer_sibling = 0;
3770
3771 if (child && child->flags & SD_PREFER_SIBLING)
3772 prefer_sibling = 1;
3729 3773
3730 init_sd_power_savings_stats(sd, sds, idle); 3774 init_sd_power_savings_stats(sd, sds, idle);
3731 load_idx = get_sd_load_idx(sd, idle); 3775 load_idx = get_sd_load_idx(sd, idle);
@@ -3736,14 +3780,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3736 local_group = cpumask_test_cpu(this_cpu, 3780 local_group = cpumask_test_cpu(this_cpu,
3737 sched_group_cpus(group)); 3781 sched_group_cpus(group));
3738 memset(&sgs, 0, sizeof(sgs)); 3782 memset(&sgs, 0, sizeof(sgs));
3739 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, 3783 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3740 local_group, cpus, balance, &sgs); 3784 local_group, cpus, balance, &sgs);
3741 3785
3742 if (local_group && balance && !(*balance)) 3786 if (local_group && balance && !(*balance))
3743 return; 3787 return;
3744 3788
3745 sds->total_load += sgs.group_load; 3789 sds->total_load += sgs.group_load;
3746 sds->total_pwr += group->__cpu_power; 3790 sds->total_pwr += group->cpu_power;
3791
3792 /*
3793 * In case the child domain prefers tasks go to siblings
3794 * first, lower the group capacity to one so that we'll try
3795 * and move all the excess tasks away.
3796 */
3797 if (prefer_sibling)
3798 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3747 3799
3748 if (local_group) { 3800 if (local_group) {
3749 sds->this_load = sgs.avg_load; 3801 sds->this_load = sgs.avg_load;
@@ -3763,7 +3815,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3763 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3815 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3764 group = group->next; 3816 group = group->next;
3765 } while (group != sd->groups); 3817 } while (group != sd->groups);
3766
3767} 3818}
3768 3819
3769/** 3820/**
@@ -3801,28 +3852,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3801 * moving them. 3852 * moving them.
3802 */ 3853 */
3803 3854
3804 pwr_now += sds->busiest->__cpu_power * 3855 pwr_now += sds->busiest->cpu_power *
3805 min(sds->busiest_load_per_task, sds->max_load); 3856 min(sds->busiest_load_per_task, sds->max_load);
3806 pwr_now += sds->this->__cpu_power * 3857 pwr_now += sds->this->cpu_power *
3807 min(sds->this_load_per_task, sds->this_load); 3858 min(sds->this_load_per_task, sds->this_load);
3808 pwr_now /= SCHED_LOAD_SCALE; 3859 pwr_now /= SCHED_LOAD_SCALE;
3809 3860
3810 /* Amount of load we'd subtract */ 3861 /* Amount of load we'd subtract */
3811 tmp = sg_div_cpu_power(sds->busiest, 3862 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3812 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3863 sds->busiest->cpu_power;
3813 if (sds->max_load > tmp) 3864 if (sds->max_load > tmp)
3814 pwr_move += sds->busiest->__cpu_power * 3865 pwr_move += sds->busiest->cpu_power *
3815 min(sds->busiest_load_per_task, sds->max_load - tmp); 3866 min(sds->busiest_load_per_task, sds->max_load - tmp);
3816 3867
3817 /* Amount of load we'd add */ 3868 /* Amount of load we'd add */
3818 if (sds->max_load * sds->busiest->__cpu_power < 3869 if (sds->max_load * sds->busiest->cpu_power <
3819 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3870 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3820 tmp = sg_div_cpu_power(sds->this, 3871 tmp = (sds->max_load * sds->busiest->cpu_power) /
3821 sds->max_load * sds->busiest->__cpu_power); 3872 sds->this->cpu_power;
3822 else 3873 else
3823 tmp = sg_div_cpu_power(sds->this, 3874 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3824 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3875 sds->this->cpu_power;
3825 pwr_move += sds->this->__cpu_power * 3876 pwr_move += sds->this->cpu_power *
3826 min(sds->this_load_per_task, sds->this_load + tmp); 3877 min(sds->this_load_per_task, sds->this_load + tmp);
3827 pwr_move /= SCHED_LOAD_SCALE; 3878 pwr_move /= SCHED_LOAD_SCALE;
3828 3879
@@ -3857,8 +3908,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3857 sds->max_load - sds->busiest_load_per_task); 3908 sds->max_load - sds->busiest_load_per_task);
3858 3909
3859 /* How much load to actually move to equalise the imbalance */ 3910 /* How much load to actually move to equalise the imbalance */
3860 *imbalance = min(max_pull * sds->busiest->__cpu_power, 3911 *imbalance = min(max_pull * sds->busiest->cpu_power,
3861 (sds->avg_load - sds->this_load) * sds->this->__cpu_power) 3912 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3862 / SCHED_LOAD_SCALE; 3913 / SCHED_LOAD_SCALE;
3863 3914
3864 /* 3915 /*
@@ -3988,15 +4039,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3988 int i; 4039 int i;
3989 4040
3990 for_each_cpu(i, sched_group_cpus(group)) { 4041 for_each_cpu(i, sched_group_cpus(group)) {
4042 unsigned long power = power_of(i);
4043 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3991 unsigned long wl; 4044 unsigned long wl;
3992 4045
3993 if (!cpumask_test_cpu(i, cpus)) 4046 if (!cpumask_test_cpu(i, cpus))
3994 continue; 4047 continue;
3995 4048
3996 rq = cpu_rq(i); 4049 rq = cpu_rq(i);
3997 wl = weighted_cpuload(i); 4050 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4051 wl /= power;
3998 4052
3999 if (rq->nr_running == 1 && wl > imbalance) 4053 if (capacity && rq->nr_running == 1 && wl > imbalance)
4000 continue; 4054 continue;
4001 4055
4002 if (wl > max_load) { 4056 if (wl > max_load) {
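find_busiest_queue() now normalises each CPU's weighted load by its effective power before comparing it against the imbalance, so a CPU whose capacity has been reduced (heavy RT activity, SMT sharing) looks proportionally busier: wl = weighted_cpuload(i) * SCHED_LOAD_SCALE / power_of(i). A tiny numeric illustration with made-up values:

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 1024UL

    int main(void)
    {
        /* Two CPUs with the same raw load, but cpu1 has only half the power. */
        unsigned long load[2]  = { 2048, 2048 };
        unsigned long power[2] = { 1024, 512 };

        for (int i = 0; i < 2; i++) {
            unsigned long wl = load[i] * SCHED_LOAD_SCALE / power[i];
            printf("cpu%d: raw load %lu -> power-scaled load %lu\n",
                   i, load[i], wl);
        }
        return 0;
    }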
@@ -5257,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev)
5257#endif 5311#endif
5258} 5312}
5259 5313
5260static void put_prev_task(struct rq *rq, struct task_struct *prev) 5314static void put_prev_task(struct rq *rq, struct task_struct *p)
5261{ 5315{
5262 if (prev->state == TASK_RUNNING) { 5316 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5263 u64 runtime = prev->se.sum_exec_runtime;
5264 5317
5265 runtime -= prev->se.prev_sum_exec_runtime; 5318 update_avg(&p->se.avg_running, runtime);
5266 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5267 5319
5320 if (p->state == TASK_RUNNING) {
5268 /* 5321 /*
5269 * In order to avoid avg_overlap growing stale when we are 5322 * In order to avoid avg_overlap growing stale when we are
5270 * indeed overlapping and hence not getting put to sleep, grow 5323 * indeed overlapping and hence not getting put to sleep, grow
@@ -5274,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5274 * correlates to the amount of cache footprint a task can 5327 * correlates to the amount of cache footprint a task can
5275 * build up. 5328 * build up.
5276 */ 5329 */
5277 update_avg(&prev->se.avg_overlap, runtime); 5330 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5331 update_avg(&p->se.avg_overlap, runtime);
5332 } else {
5333 update_avg(&p->se.avg_running, 0);
5278 } 5334 }
5279 prev->sched_class->put_prev_task(rq, prev); 5335 p->sched_class->put_prev_task(rq, p);
5280} 5336}
5281 5337
5282/* 5338/*
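put_prev_task() now also maintains se.avg_running, an estimate of how long the task typically runs before being switched out: the latest run length is folded in while the task is still runnable, and a zero sample is folded in when it blocks. Assuming update_avg() is the usual exponentially weighted average with 1/8 gain, a user-space sketch of how the estimate evolves:

    #include <stdint.h>
    #include <stdio.h>

    /* EWMA with 1/8 gain, mirroring sched.c's update_avg() (assumed definition). */
    static void update_avg(uint64_t *avg, uint64_t sample)
    {
        int64_t diff = (int64_t)(sample - *avg);
        *avg += diff >> 3;
    }

    int main(void)
    {
        uint64_t avg_running = 0;
        uint64_t samples[] = { 800, 1200, 1000, 0 };   /* run lengths in us; 0 = task slept */

        for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
            update_avg(&avg_running, samples[i]);
            printf("after sample %llu: avg_running = %llu\n",
                   (unsigned long long)samples[i],
                   (unsigned long long)avg_running);
        }
        return 0;
    }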
@@ -5325,7 +5381,7 @@ need_resched:
5325 preempt_disable(); 5381 preempt_disable();
5326 cpu = smp_processor_id(); 5382 cpu = smp_processor_id();
5327 rq = cpu_rq(cpu); 5383 rq = cpu_rq(cpu);
5328 rcu_qsctr_inc(cpu); 5384 rcu_sched_qs(cpu);
5329 prev = rq->curr; 5385 prev = rq->curr;
5330 switch_count = &prev->nivcsw; 5386 switch_count = &prev->nivcsw;
5331 5387
@@ -5349,10 +5405,7 @@ need_resched_nonpreemptible:
5349 switch_count = &prev->nvcsw; 5405 switch_count = &prev->nvcsw;
5350 } 5406 }
5351 5407
5352#ifdef CONFIG_SMP 5408 pre_schedule(rq, prev);
5353 if (prev->sched_class->pre_schedule)
5354 prev->sched_class->pre_schedule(rq, prev);
5355#endif
5356 5409
5357 if (unlikely(!rq->nr_running)) 5410 if (unlikely(!rq->nr_running))
5358 idle_balance(cpu, rq); 5411 idle_balance(cpu, rq);
@@ -5378,6 +5431,8 @@ need_resched_nonpreemptible:
5378 } else 5431 } else
5379 spin_unlock_irq(&rq->lock); 5432 spin_unlock_irq(&rq->lock);
5380 5433
5434 post_schedule(rq);
5435
5381 if (unlikely(reacquire_kernel_lock(current) < 0)) 5436 if (unlikely(reacquire_kernel_lock(current) < 0))
5382 goto need_resched_nonpreemptible; 5437 goto need_resched_nonpreemptible;
5383 5438
@@ -5509,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5509 5564
5510#endif /* CONFIG_PREEMPT */ 5565#endif /* CONFIG_PREEMPT */
5511 5566
5512int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5567int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5513 void *key) 5568 void *key)
5514{ 5569{
5515 return try_to_wake_up(curr->private, mode, sync); 5570 return try_to_wake_up(curr->private, mode, wake_flags);
5516} 5571}
5517EXPORT_SYMBOL(default_wake_function); 5572EXPORT_SYMBOL(default_wake_function);
5518 5573
@@ -5526,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function);
5526 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5581 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5527 */ 5582 */
5528static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5583static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5529 int nr_exclusive, int sync, void *key) 5584 int nr_exclusive, int wake_flags, void *key)
5530{ 5585{
5531 wait_queue_t *curr, *next; 5586 wait_queue_t *curr, *next;
5532 5587
5533 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5588 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5534 unsigned flags = curr->flags; 5589 unsigned flags = curr->flags;
5535 5590
5536 if (curr->func(curr, mode, sync, key) && 5591 if (curr->func(curr, mode, wake_flags, key) &&
5537 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5592 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5538 break; 5593 break;
5539 } 5594 }
@@ -5594,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5594 int nr_exclusive, void *key) 5649 int nr_exclusive, void *key)
5595{ 5650{
5596 unsigned long flags; 5651 unsigned long flags;
5597 int sync = 1; 5652 int wake_flags = WF_SYNC;
5598 5653
5599 if (unlikely(!q)) 5654 if (unlikely(!q))
5600 return; 5655 return;
5601 5656
5602 if (unlikely(!nr_exclusive)) 5657 if (unlikely(!nr_exclusive))
5603 sync = 0; 5658 wake_flags = 0;
5604 5659
5605 spin_lock_irqsave(&q->lock, flags); 5660 spin_lock_irqsave(&q->lock, flags);
5606 __wake_up_common(q, mode, nr_exclusive, sync, key); 5661 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5607 spin_unlock_irqrestore(&q->lock, flags); 5662 spin_unlock_irqrestore(&q->lock, flags);
5608} 5663}
5609EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5664EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -6123,17 +6178,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6123 unsigned long flags; 6178 unsigned long flags;
6124 const struct sched_class *prev_class = p->sched_class; 6179 const struct sched_class *prev_class = p->sched_class;
6125 struct rq *rq; 6180 struct rq *rq;
6181 int reset_on_fork;
6126 6182
6127 /* may grab non-irq protected spin_locks */ 6183 /* may grab non-irq protected spin_locks */
6128 BUG_ON(in_interrupt()); 6184 BUG_ON(in_interrupt());
6129recheck: 6185recheck:
6130 /* double check policy once rq lock held */ 6186 /* double check policy once rq lock held */
6131 if (policy < 0) 6187 if (policy < 0) {
6188 reset_on_fork = p->sched_reset_on_fork;
6132 policy = oldpolicy = p->policy; 6189 policy = oldpolicy = p->policy;
6133 else if (policy != SCHED_FIFO && policy != SCHED_RR && 6190 } else {
6134 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6191 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6135 policy != SCHED_IDLE) 6192 policy &= ~SCHED_RESET_ON_FORK;
6136 return -EINVAL; 6193
6194 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6195 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6196 policy != SCHED_IDLE)
6197 return -EINVAL;
6198 }
6199
6137 /* 6200 /*
6138 * Valid priorities for SCHED_FIFO and SCHED_RR are 6201 * Valid priorities for SCHED_FIFO and SCHED_RR are
6139 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6202 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6240,10 @@ recheck:
6177 /* can't change other user's priorities */ 6240 /* can't change other user's priorities */
6178 if (!check_same_owner(p)) 6241 if (!check_same_owner(p))
6179 return -EPERM; 6242 return -EPERM;
6243
6244 /* Normal users shall not reset the sched_reset_on_fork flag */
6245 if (p->sched_reset_on_fork && !reset_on_fork)
6246 return -EPERM;
6180 } 6247 }
6181 6248
6182 if (user) { 6249 if (user) {
@@ -6220,6 +6287,8 @@ recheck:
6220 if (running) 6287 if (running)
6221 p->sched_class->put_prev_task(rq, p); 6288 p->sched_class->put_prev_task(rq, p);
6222 6289
6290 p->sched_reset_on_fork = reset_on_fork;
6291
6223 oldprio = p->prio; 6292 oldprio = p->prio;
6224 __setscheduler(rq, p, policy, param->sched_priority); 6293 __setscheduler(rq, p, policy, param->sched_priority);
6225 6294
@@ -6336,14 +6405,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6336 if (p) { 6405 if (p) {
6337 retval = security_task_getscheduler(p); 6406 retval = security_task_getscheduler(p);
6338 if (!retval) 6407 if (!retval)
6339 retval = p->policy; 6408 retval = p->policy
6409 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6340 } 6410 }
6341 read_unlock(&tasklist_lock); 6411 read_unlock(&tasklist_lock);
6342 return retval; 6412 return retval;
6343} 6413}
6344 6414
6345/** 6415/**
6346 * sys_sched_getscheduler - get the RT priority of a thread 6416 * sys_sched_getparam - get the RT priority of a thread
6347 * @pid: the pid in question. 6417 * @pid: the pid in question.
6348 * @param: structure containing the RT priority. 6418 * @param: structure containing the RT priority.
6349 */ 6419 */
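With these changes the sched_setscheduler() ABI grows a SCHED_RESET_ON_FORK bit that can be OR-ed into the policy argument; it is sticky for the caller (sched_getscheduler() reports it back, and unprivileged users may not clear it), while children forked afterwards revert to SCHED_NORMAL with default priority, as implemented in the sched_fork() hunk earlier. A hedged user-space usage sketch, assuming a kernel that accepts the flag (the #define fallback covers libcs that predate it):

    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    #ifndef SCHED_RESET_ON_FORK
    #define SCHED_RESET_ON_FORK 0x40000000
    #endif

    int main(void)
    {
        struct sched_param sp = { .sched_priority = 50 };

        /* Run this task as SCHED_FIFO, but let children revert to SCHED_NORMAL. */
        if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp) == -1) {
            perror("sched_setscheduler");
            return 1;
        }

        if (fork() == 0) {
            /* Child: policy has been reset to SCHED_NORMAL, nice 0. */
            printf("child policy: %d\n", sched_getscheduler(0));
            _exit(0);
        }

        printf("parent policy: %d\n", sched_getscheduler(0));
        return 0;
    }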
@@ -6571,19 +6641,9 @@ static inline int should_resched(void)
6571 6641
6572static void __cond_resched(void) 6642static void __cond_resched(void)
6573{ 6643{
6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6644 add_preempt_count(PREEMPT_ACTIVE);
6575 __might_sleep(__FILE__, __LINE__); 6645 schedule();
6576#endif 6646 sub_preempt_count(PREEMPT_ACTIVE);
6577 /*
6578 * The BKS might be reacquired before we have dropped
6579 * PREEMPT_ACTIVE, which could trigger a second
6580 * cond_resched() call.
6581 */
6582 do {
6583 add_preempt_count(PREEMPT_ACTIVE);
6584 schedule();
6585 sub_preempt_count(PREEMPT_ACTIVE);
6586 } while (need_resched());
6587} 6647}
6588 6648
6589int __sched _cond_resched(void) 6649int __sched _cond_resched(void)
@@ -6597,18 +6657,20 @@ int __sched _cond_resched(void)
6597EXPORT_SYMBOL(_cond_resched); 6657EXPORT_SYMBOL(_cond_resched);
6598 6658
6599/* 6659/*
6600 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 6660 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6601 * call schedule, and on return reacquire the lock. 6661 * call schedule, and on return reacquire the lock.
6602 * 6662 *
6603 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6663 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6604 * operations here to prevent schedule() from being called twice (once via 6664 * operations here to prevent schedule() from being called twice (once via
6605 * spin_unlock(), once by hand). 6665 * spin_unlock(), once by hand).
6606 */ 6666 */
6607int cond_resched_lock(spinlock_t *lock) 6667int __cond_resched_lock(spinlock_t *lock)
6608{ 6668{
6609 int resched = should_resched(); 6669 int resched = should_resched();
6610 int ret = 0; 6670 int ret = 0;
6611 6671
6672 lockdep_assert_held(lock);
6673
6612 if (spin_needbreak(lock) || resched) { 6674 if (spin_needbreak(lock) || resched) {
6613 spin_unlock(lock); 6675 spin_unlock(lock);
6614 if (resched) 6676 if (resched)
@@ -6620,9 +6682,9 @@ int cond_resched_lock(spinlock_t *lock)
6620 } 6682 }
6621 return ret; 6683 return ret;
6622} 6684}
6623EXPORT_SYMBOL(cond_resched_lock); 6685EXPORT_SYMBOL(__cond_resched_lock);
6624 6686
6625int __sched cond_resched_softirq(void) 6687int __sched __cond_resched_softirq(void)
6626{ 6688{
6627 BUG_ON(!in_softirq()); 6689 BUG_ON(!in_softirq());
6628 6690
@@ -6634,7 +6696,7 @@ int __sched cond_resched_softirq(void)
6634 } 6696 }
6635 return 0; 6697 return 0;
6636} 6698}
6637EXPORT_SYMBOL(cond_resched_softirq); 6699EXPORT_SYMBOL(__cond_resched_softirq);
6638 6700
6639/** 6701/**
6640 * yield - yield the current processor to other threads. 6702 * yield - yield the current processor to other threads.
@@ -6658,11 +6720,13 @@ EXPORT_SYMBOL(yield);
6658 */ 6720 */
6659void __sched io_schedule(void) 6721void __sched io_schedule(void)
6660{ 6722{
6661 struct rq *rq = &__raw_get_cpu_var(runqueues); 6723 struct rq *rq = raw_rq();
6662 6724
6663 delayacct_blkio_start(); 6725 delayacct_blkio_start();
6664 atomic_inc(&rq->nr_iowait); 6726 atomic_inc(&rq->nr_iowait);
6727 current->in_iowait = 1;
6665 schedule(); 6728 schedule();
6729 current->in_iowait = 0;
6666 atomic_dec(&rq->nr_iowait); 6730 atomic_dec(&rq->nr_iowait);
6667 delayacct_blkio_end(); 6731 delayacct_blkio_end();
6668} 6732}
@@ -6670,12 +6734,14 @@ EXPORT_SYMBOL(io_schedule);
6670 6734
6671long __sched io_schedule_timeout(long timeout) 6735long __sched io_schedule_timeout(long timeout)
6672{ 6736{
6673 struct rq *rq = &__raw_get_cpu_var(runqueues); 6737 struct rq *rq = raw_rq();
6674 long ret; 6738 long ret;
6675 6739
6676 delayacct_blkio_start(); 6740 delayacct_blkio_start();
6677 atomic_inc(&rq->nr_iowait); 6741 atomic_inc(&rq->nr_iowait);
6742 current->in_iowait = 1;
6678 ret = schedule_timeout(timeout); 6743 ret = schedule_timeout(timeout);
6744 current->in_iowait = 0;
6679 atomic_dec(&rq->nr_iowait); 6745 atomic_dec(&rq->nr_iowait);
6680 delayacct_blkio_end(); 6746 delayacct_blkio_end();
6681 return ret; 6747 return ret;
@@ -6992,8 +7058,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6992 7058
6993 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7059 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6994 /* Need help from migration thread: drop lock and wait. */ 7060 /* Need help from migration thread: drop lock and wait. */
7061 struct task_struct *mt = rq->migration_thread;
7062
7063 get_task_struct(mt);
6995 task_rq_unlock(rq, &flags); 7064 task_rq_unlock(rq, &flags);
6996 wake_up_process(rq->migration_thread); 7065 wake_up_process(rq->migration_thread);
7066 put_task_struct(mt);
6997 wait_for_completion(&req.done); 7067 wait_for_completion(&req.done);
6998 tlb_migrate_finish(p->mm); 7068 tlb_migrate_finish(p->mm);
6999 return 0; 7069 return 0;
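The added get_task_struct()/put_task_struct() pair pins the migration thread across the unlock, so the task cannot go away (for example during CPU hotplug) between dropping the runqueue lock and waking it. A standalone sketch of the general take-a-reference-under-the-lock pattern, using hypothetical names (struct obj, obj_get, obj_put) rather than kernel APIs:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct obj {
    atomic_int refcount;
    int payload;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *shared;  /* protected by 'lock'; others may drop it at any time */

static struct obj *obj_get(struct obj *o)
{
    atomic_fetch_add(&o->refcount, 1);
    return o;
}

static void obj_put(struct obj *o)
{
    if (atomic_fetch_sub(&o->refcount, 1) == 1)
        free(o);    /* last reference gone */
}

static void use_shared_safely(void)
{
    struct obj *o;

    pthread_mutex_lock(&lock);
    o = obj_get(shared);            /* pin it while still holding the lock */
    pthread_mutex_unlock(&lock);    /* others may now drop their reference */

    printf("payload=%d\n", o->payload); /* safe: we hold our own reference */
    obj_put(o);
}

int main(void)
{
    shared = malloc(sizeof(*shared));
    if (!shared)
        return 1;
    atomic_init(&shared->refcount, 1);
    shared->payload = 42;

    use_shared_safely();
    obj_put(shared);    /* drop the original reference */
    return 0;
}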
@@ -7051,6 +7121,11 @@ fail:
7051 return ret; 7121 return ret;
7052} 7122}
7053 7123
7124#define RCU_MIGRATION_IDLE 0
7125#define RCU_MIGRATION_NEED_QS 1
7126#define RCU_MIGRATION_GOT_QS 2
7127#define RCU_MIGRATION_MUST_SYNC 3
7128
7054/* 7129/*
7055 * migration_thread - this is a highprio system thread that performs 7130 * migration_thread - this is a highprio system thread that performs
7056 * thread migration by bumping thread off CPU then 'pushing' onto 7131 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7058,6 +7133,7 @@ fail:
7058 */ 7133 */
7059static int migration_thread(void *data) 7134static int migration_thread(void *data)
7060{ 7135{
7136 int badcpu;
7061 int cpu = (long)data; 7137 int cpu = (long)data;
7062 struct rq *rq; 7138 struct rq *rq;
7063 7139
@@ -7092,8 +7168,17 @@ static int migration_thread(void *data)
7092 req = list_entry(head->next, struct migration_req, list); 7168 req = list_entry(head->next, struct migration_req, list);
7093 list_del_init(head->next); 7169 list_del_init(head->next);
7094 7170
7095 spin_unlock(&rq->lock); 7171 if (req->task != NULL) {
7096 __migrate_task(req->task, cpu, req->dest_cpu); 7172 spin_unlock(&rq->lock);
7173 __migrate_task(req->task, cpu, req->dest_cpu);
7174 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7175 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7176 spin_unlock(&rq->lock);
7177 } else {
7178 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7179 spin_unlock(&rq->lock);
7180 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7181 }
7097 local_irq_enable(); 7182 local_irq_enable();
7098 7183
7099 complete(&req->done); 7184 complete(&req->done);
@@ -7625,7 +7710,7 @@ static int __init migration_init(void)
7625 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7710 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7626 register_cpu_notifier(&migration_notifier); 7711 register_cpu_notifier(&migration_notifier);
7627 7712
7628 return err; 7713 return 0;
7629} 7714}
7630early_initcall(migration_init); 7715early_initcall(migration_init);
7631#endif 7716#endif
@@ -7672,7 +7757,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7672 break; 7757 break;
7673 } 7758 }
7674 7759
7675 if (!group->__cpu_power) { 7760 if (!group->cpu_power) {
7676 printk(KERN_CONT "\n"); 7761 printk(KERN_CONT "\n");
7677 printk(KERN_ERR "ERROR: domain->cpu_power not " 7762 printk(KERN_ERR "ERROR: domain->cpu_power not "
7678 "set\n"); 7763 "set\n");
@@ -7696,9 +7781,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7696 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7781 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7697 7782
7698 printk(KERN_CONT " %s", str); 7783 printk(KERN_CONT " %s", str);
7699 if (group->__cpu_power != SCHED_LOAD_SCALE) { 7784 if (group->cpu_power != SCHED_LOAD_SCALE) {
7700 printk(KERN_CONT " (__cpu_power = %d)", 7785 printk(KERN_CONT " (cpu_power = %d)",
7701 group->__cpu_power); 7786 group->cpu_power);
7702 } 7787 }
7703 7788
7704 group = group->next; 7789 group = group->next;
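With the reciprocal-divide helpers gone, group->cpu_power is used directly, and the debug output above prints it against SCHED_LOAD_SCALE. The load balancer normalizes a group's load as roughly load * SCHED_LOAD_SCALE / cpu_power. A standalone arithmetic sketch assuming SCHED_LOAD_SCALE == 1024 as in this kernel series; the load figures are invented for illustration:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL /* 1.0 in the scheduler's fixed-point load units */

int main(void)
{
    /* Hypothetical example: a 2-CPU group carrying three nice-0 tasks' worth of load. */
    unsigned long cpu_power  = 2 * SCHED_LOAD_SCALE;    /* two full CPUs */
    unsigned long group_load = 3 * SCHED_LOAD_SCALE;

    /* Load normalized to one SCHED_LOAD_SCALE of capacity. */
    unsigned long avg_load = group_load * SCHED_LOAD_SCALE / cpu_power;

    printf("group_load=%lu cpu_power=%lu avg_load=%lu (%.2fx of one full CPU)\n",
           group_load, cpu_power, avg_load, (double)avg_load / SCHED_LOAD_SCALE);
    return 0;
}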
@@ -7763,9 +7848,7 @@ static int sd_degenerate(struct sched_domain *sd)
7763 } 7848 }
7764 7849
7765 /* Following flags don't use groups */ 7850 /* Following flags don't use groups */
7766 if (sd->flags & (SD_WAKE_IDLE | 7851 if (sd->flags & (SD_WAKE_AFFINE))
7767 SD_WAKE_AFFINE |
7768 SD_WAKE_BALANCE))
7769 return 0; 7852 return 0;
7770 7853
7771 return 1; 7854 return 1;
@@ -7782,10 +7865,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7782 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7865 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7783 return 0; 7866 return 0;
7784 7867
7785 /* Does parent contain flags not in child? */
7786 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
7787 if (cflags & SD_WAKE_AFFINE)
7788 pflags &= ~SD_WAKE_BALANCE;
7789 /* Flags needing groups don't count if only 1 group in parent */ 7868 /* Flags needing groups don't count if only 1 group in parent */
7790 if (parent->groups == parent->groups->next) { 7869 if (parent->groups == parent->groups->next) {
7791 pflags &= ~(SD_LOAD_BALANCE | 7870 pflags &= ~(SD_LOAD_BALANCE |
@@ -7841,7 +7920,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7841 rq->rd = rd; 7920 rq->rd = rd;
7842 7921
7843 cpumask_set_cpu(rq->cpu, rd->span); 7922 cpumask_set_cpu(rq->cpu, rd->span);
7844 if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) 7923 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7845 set_rq_online(rq); 7924 set_rq_online(rq);
7846 7925
7847 spin_unlock_irqrestore(&rq->lock, flags); 7926 spin_unlock_irqrestore(&rq->lock, flags);
@@ -7983,7 +8062,7 @@ init_sched_build_groups(const struct cpumask *span,
7983 continue; 8062 continue;
7984 8063
7985 cpumask_clear(sched_group_cpus(sg)); 8064 cpumask_clear(sched_group_cpus(sg));
7986 sg->__cpu_power = 0; 8065 sg->cpu_power = 0;
7987 8066
7988 for_each_cpu(j, span) { 8067 for_each_cpu(j, span) {
7989 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8068 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8091,6 +8170,39 @@ struct static_sched_domain {
8091 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8170 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8092}; 8171};
8093 8172
8173struct s_data {
8174#ifdef CONFIG_NUMA
8175 int sd_allnodes;
8176 cpumask_var_t domainspan;
8177 cpumask_var_t covered;
8178 cpumask_var_t notcovered;
8179#endif
8180 cpumask_var_t nodemask;
8181 cpumask_var_t this_sibling_map;
8182 cpumask_var_t this_core_map;
8183 cpumask_var_t send_covered;
8184 cpumask_var_t tmpmask;
8185 struct sched_group **sched_group_nodes;
8186 struct root_domain *rd;
8187};
8188
8189enum s_alloc {
8190 sa_sched_groups = 0,
8191 sa_rootdomain,
8192 sa_tmpmask,
8193 sa_send_covered,
8194 sa_this_core_map,
8195 sa_this_sibling_map,
8196 sa_nodemask,
8197 sa_sched_group_nodes,
8198#ifdef CONFIG_NUMA
8199 sa_notcovered,
8200 sa_covered,
8201 sa_domainspan,
8202#endif
8203 sa_none,
8204};
8205
8094/* 8206/*
8095 * SMT sched-domains: 8207 * SMT sched-domains:
8096 */ 8208 */
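struct s_data collects the scratch masks that __build_sched_domains() used to keep as locals, and enum s_alloc records how far allocation got; __free_domain_allocs() further down in this diff starts a switch at that state and falls through, freeing only what was actually allocated. A standalone sketch of the same idiom under hypothetical names (enum stage, struct ctx):

#include <stdio.h>
#include <stdlib.h>

/* Most-complete state first, so the teardown switch can fall through downward. */
enum stage { st_all, st_b, st_a, st_none };

struct ctx {
    int *a;
    int *b;
    int *c;
};

static void free_ctx(struct ctx *ctx, enum stage how_far)
{
    switch (how_far) {
    case st_all:
        free(ctx->c);   /* fall through */
    case st_b:
        free(ctx->b);   /* fall through */
    case st_a:
        free(ctx->a);   /* fall through */
    case st_none:
        break;
    }
}

/* Returns the deepest state successfully reached; caller passes it to free_ctx(). */
static enum stage alloc_ctx(struct ctx *ctx)
{
    if (!(ctx->a = malloc(sizeof(int))))
        return st_none;
    if (!(ctx->b = malloc(sizeof(int))))
        return st_a;
    if (!(ctx->c = malloc(sizeof(int))))
        return st_b;
    return st_all;
}

int main(void)
{
    struct ctx ctx = { 0 };
    enum stage s = alloc_ctx(&ctx);

    if (s != st_all) {
        free_ctx(&ctx, s);  /* unwind exactly what was allocated */
        return 1;
    }
    *ctx.a = *ctx.b = *ctx.c = 0;
    free_ctx(&ctx, st_all);
    return 0;
}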
@@ -8208,11 +8320,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
8208 continue; 8320 continue;
8209 } 8321 }
8210 8322
8211 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 8323 sg->cpu_power += sd->groups->cpu_power;
8212 } 8324 }
8213 sg = sg->next; 8325 sg = sg->next;
8214 } while (sg != group_head); 8326 } while (sg != group_head);
8215} 8327}
8328
8329static int build_numa_sched_groups(struct s_data *d,
8330 const struct cpumask *cpu_map, int num)
8331{
8332 struct sched_domain *sd;
8333 struct sched_group *sg, *prev;
8334 int n, j;
8335
8336 cpumask_clear(d->covered);
8337 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8338 if (cpumask_empty(d->nodemask)) {
8339 d->sched_group_nodes[num] = NULL;
8340 goto out;
8341 }
8342
8343 sched_domain_node_span(num, d->domainspan);
8344 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8345
8346 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8347 GFP_KERNEL, num);
8348 if (!sg) {
8349 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8350 num);
8351 return -ENOMEM;
8352 }
8353 d->sched_group_nodes[num] = sg;
8354
8355 for_each_cpu(j, d->nodemask) {
8356 sd = &per_cpu(node_domains, j).sd;
8357 sd->groups = sg;
8358 }
8359
8360 sg->cpu_power = 0;
8361 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8362 sg->next = sg;
8363 cpumask_or(d->covered, d->covered, d->nodemask);
8364
8365 prev = sg;
8366 for (j = 0; j < nr_node_ids; j++) {
8367 n = (num + j) % nr_node_ids;
8368 cpumask_complement(d->notcovered, d->covered);
8369 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8370 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8371 if (cpumask_empty(d->tmpmask))
8372 break;
8373 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8374 if (cpumask_empty(d->tmpmask))
8375 continue;
8376 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8377 GFP_KERNEL, num);
8378 if (!sg) {
8379 printk(KERN_WARNING
8380 "Can not alloc domain group for node %d\n", j);
8381 return -ENOMEM;
8382 }
8383 sg->cpu_power = 0;
8384 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8385 sg->next = prev->next;
8386 cpumask_or(d->covered, d->covered, d->tmpmask);
8387 prev->next = sg;
8388 prev = sg;
8389 }
8390out:
8391 return 0;
8392}
8216#endif /* CONFIG_NUMA */ 8393#endif /* CONFIG_NUMA */
8217 8394
8218#ifdef CONFIG_NUMA 8395#ifdef CONFIG_NUMA
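build_numa_sched_groups() links the per-node groups into a circular singly linked list (the last sg->next points back at the head), which is why init_numa_sched_groups_power() above walks it with a do/while until it returns to group_head. A standalone sketch of building and walking such a ring, with a hypothetical struct group standing in for struct sched_group:

#include <stdio.h>
#include <stdlib.h>

struct group {
    unsigned long power;
    struct group *next; /* circular: the last group points back at the first */
};

static struct group *ring_append(struct group *head, unsigned long power)
{
    struct group *g = calloc(1, sizeof(*g));
    struct group *prev;

    if (!g)
        abort();
    g->power = power;
    if (!head) {
        g->next = g;    /* a single-entry ring points at itself */
        return g;
    }
    prev = head;
    while (prev->next != head)  /* find the current tail */
        prev = prev->next;
    g->next = head;
    prev->next = g;
    return head;
}

int main(void)
{
    struct group *head = NULL, *g;
    unsigned long total = 0;

    head = ring_append(head, 1024);
    head = ring_append(head, 2048);
    head = ring_append(head, 1024);

    /* Same walking pattern as the scheduler uses: do/while until back at the head. */
    g = head;
    do {
        total += g->power;
        g = g->next;
    } while (g != head);

    printf("total power of ring: %lu\n", total);
    return 0;
}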
@@ -8266,15 +8443,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
8266 * there are asymmetries in the topology. If there are asymmetries, group 8443 * there are asymmetries in the topology. If there are asymmetries, group
8267 * having more cpu_power will pickup more load compared to the group having 8444 * having more cpu_power will pickup more load compared to the group having
8268 * less cpu_power. 8445 * less cpu_power.
8269 *
8270 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
8271 * the maximum number of tasks a group can handle in the presence of other idle
8272 * or lightly loaded groups in the same sched domain.
8273 */ 8446 */
8274static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8447static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8275{ 8448{
8276 struct sched_domain *child; 8449 struct sched_domain *child;
8277 struct sched_group *group; 8450 struct sched_group *group;
8451 long power;
8452 int weight;
8278 8453
8279 WARN_ON(!sd || !sd->groups); 8454 WARN_ON(!sd || !sd->groups);
8280 8455
@@ -8283,28 +8458,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8283 8458
8284 child = sd->child; 8459 child = sd->child;
8285 8460
8286 sd->groups->__cpu_power = 0; 8461 sd->groups->cpu_power = 0;
8287 8462
8288 /* 8463 if (!child) {
8289 * For perf policy, if the groups in child domain share resources 8464 power = SCHED_LOAD_SCALE;
8290 * (for example cores sharing some portions of the cache hierarchy 8465 weight = cpumask_weight(sched_domain_span(sd));
8291 * or SMT), then set this domain groups cpu_power such that each group 8466 /*
8292 * can handle only one task, when there are other idle groups in the 8467 * SMT siblings share the power of a single core.
8293 * same sched domain. 8468 * Usually multiple threads get a better yield out of
8294 */ 8469 * that one core than a single thread would have,
8295 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 8470 * reflect that in sd->smt_gain.
8296 (child->flags & 8471 */
8297 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 8472 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8298 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 8473 power *= sd->smt_gain;
8474 power /= weight;
8475 power >>= SCHED_LOAD_SHIFT;
8476 }
8477 sd->groups->cpu_power += power;
8299 return; 8478 return;
8300 } 8479 }
8301 8480
8302 /* 8481 /*
8303 * add cpu_power of each child group to this groups cpu_power 8482 * Add cpu_power of each child group to this groups cpu_power.
8304 */ 8483 */
8305 group = child->groups; 8484 group = child->groups;
8306 do { 8485 do {
8307 sg_inc_cpu_power(sd->groups, group->__cpu_power); 8486 sd->groups->cpu_power += group->cpu_power;
8308 group = group->next; 8487 group = group->next;
8309 } while (group != child->groups); 8488 } while (group != child->groups);
8310} 8489}
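The rewritten base case of init_sched_groups_power() computes a leaf group's power directly: a non-SMT CPU contributes SCHED_LOAD_SCALE, while SMT siblings split one core's power scaled up by sd->smt_gain. A standalone arithmetic sketch assuming SCHED_LOAD_SCALE == 1024, SCHED_LOAD_SHIFT == 10 and an smt_gain of 1178 (roughly the 15% default used for the sibling domain in this kernel series; the exact value is an assumption):

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)  /* 1024: one full CPU */

/* Mirrors the !child branch above for a leaf (SMT sibling) domain. */
static unsigned long sibling_power(unsigned int weight, unsigned long smt_gain)
{
    unsigned long power = SCHED_LOAD_SCALE;

    if (weight > 1) {       /* siblings share one core's power */
        power *= smt_gain;
        power /= weight;
        power >>= SCHED_LOAD_SHIFT;
    }
    return power;
}

int main(void)
{
    unsigned long per_thread = sibling_power(2, 1178);  /* 2-way SMT, ~15% gain */

    printf("per-thread power: %lu\n", per_thread);      /* 589 */
    printf("core total (2 threads): %lu vs single thread %lu\n",
           2 * per_thread, SCHED_LOAD_SCALE);           /* 1178 vs 1024 */
    return 0;
}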
@@ -8371,287 +8550,292 @@ static void set_domain_attribute(struct sched_domain *sd,
8371 request = attr->relax_domain_level; 8550 request = attr->relax_domain_level;
8372 if (request < sd->level) { 8551 if (request < sd->level) {
8373 /* turn off idle balance on this domain */ 8552 /* turn off idle balance on this domain */
8374 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8553 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8375 } else { 8554 } else {
8376 /* turn on idle balance on this domain */ 8555 /* turn on idle balance on this domain */
8377 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8556 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8557 }
8558}
8559
8560static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8561 const struct cpumask *cpu_map)
8562{
8563 switch (what) {
8564 case sa_sched_groups:
8565 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8566 d->sched_group_nodes = NULL;
8567 case sa_rootdomain:
8568 free_rootdomain(d->rd); /* fall through */
8569 case sa_tmpmask:
8570 free_cpumask_var(d->tmpmask); /* fall through */
8571 case sa_send_covered:
8572 free_cpumask_var(d->send_covered); /* fall through */
8573 case sa_this_core_map:
8574 free_cpumask_var(d->this_core_map); /* fall through */
8575 case sa_this_sibling_map:
8576 free_cpumask_var(d->this_sibling_map); /* fall through */
8577 case sa_nodemask:
8578 free_cpumask_var(d->nodemask); /* fall through */
8579 case sa_sched_group_nodes:
8580#ifdef CONFIG_NUMA
8581 kfree(d->sched_group_nodes); /* fall through */
8582 case sa_notcovered:
8583 free_cpumask_var(d->notcovered); /* fall through */
8584 case sa_covered:
8585 free_cpumask_var(d->covered); /* fall through */
8586 case sa_domainspan:
8587 free_cpumask_var(d->domainspan); /* fall through */
8588#endif
8589 case sa_none:
8590 break;
8378 } 8591 }
8379} 8592}
8380 8593
8381/* 8594static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8382 * Build sched domains for a given set of cpus and attach the sched domains 8595 const struct cpumask *cpu_map)
8383 * to the individual cpus
8384 */
8385static int __build_sched_domains(const struct cpumask *cpu_map,
8386 struct sched_domain_attr *attr)
8387{ 8596{
8388 int i, err = -ENOMEM;
8389 struct root_domain *rd;
8390 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
8391 tmpmask;
8392#ifdef CONFIG_NUMA
8393 cpumask_var_t domainspan, covered, notcovered;
8394 struct sched_group **sched_group_nodes = NULL;
8395 int sd_allnodes = 0;
8396
8397 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
8398 goto out;
8399 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
8400 goto free_domainspan;
8401 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
8402 goto free_covered;
8403#endif
8404
8405 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8406 goto free_notcovered;
8407 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8408 goto free_nodemask;
8409 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8410 goto free_this_sibling_map;
8411 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8412 goto free_this_core_map;
8413 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8414 goto free_send_covered;
8415
8416#ifdef CONFIG_NUMA 8597#ifdef CONFIG_NUMA
8417 /* 8598 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8418 * Allocate the per-node list of sched groups 8599 return sa_none;
8419 */ 8600 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8420 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), 8601 return sa_domainspan;
8421 GFP_KERNEL); 8602 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8422 if (!sched_group_nodes) { 8603 return sa_covered;
8604 /* Allocate the per-node list of sched groups */
8605 d->sched_group_nodes = kcalloc(nr_node_ids,
8606 sizeof(struct sched_group *), GFP_KERNEL);
8607 if (!d->sched_group_nodes) {
8423 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8608 printk(KERN_WARNING "Can not alloc sched group node list\n");
8424 goto free_tmpmask; 8609 return sa_notcovered;
8425 } 8610 }
8426#endif 8611 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8427 8612#endif
8428 rd = alloc_rootdomain(); 8613 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8429 if (!rd) { 8614 return sa_sched_group_nodes;
8615 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8616 return sa_nodemask;
8617 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8618 return sa_this_sibling_map;
8619 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8620 return sa_this_core_map;
8621 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8622 return sa_send_covered;
8623 d->rd = alloc_rootdomain();
8624 if (!d->rd) {
8430 printk(KERN_WARNING "Cannot alloc root domain\n"); 8625 printk(KERN_WARNING "Cannot alloc root domain\n");
8431 goto free_sched_groups; 8626 return sa_tmpmask;
8432 } 8627 }
8628 return sa_rootdomain;
8629}
8433 8630
8631static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8632 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8633{
8634 struct sched_domain *sd = NULL;
8434#ifdef CONFIG_NUMA 8635#ifdef CONFIG_NUMA
8435 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; 8636 struct sched_domain *parent;
8436#endif
8437
8438 /*
8439 * Set up domains for cpus specified by the cpu_map.
8440 */
8441 for_each_cpu(i, cpu_map) {
8442 struct sched_domain *sd = NULL, *p;
8443
8444 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
8445
8446#ifdef CONFIG_NUMA
8447 if (cpumask_weight(cpu_map) >
8448 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
8449 sd = &per_cpu(allnodes_domains, i).sd;
8450 SD_INIT(sd, ALLNODES);
8451 set_domain_attribute(sd, attr);
8452 cpumask_copy(sched_domain_span(sd), cpu_map);
8453 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8454 p = sd;
8455 sd_allnodes = 1;
8456 } else
8457 p = NULL;
8458 8637
8459 sd = &per_cpu(node_domains, i).sd; 8638 d->sd_allnodes = 0;
8460 SD_INIT(sd, NODE); 8639 if (cpumask_weight(cpu_map) >
8640 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8641 sd = &per_cpu(allnodes_domains, i).sd;
8642 SD_INIT(sd, ALLNODES);
8461 set_domain_attribute(sd, attr); 8643 set_domain_attribute(sd, attr);
8462 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8644 cpumask_copy(sched_domain_span(sd), cpu_map);
8463 sd->parent = p; 8645 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8464 if (p) 8646 d->sd_allnodes = 1;
8465 p->child = sd; 8647 }
8466 cpumask_and(sched_domain_span(sd), 8648 parent = sd;
8467 sched_domain_span(sd), cpu_map); 8649
8650 sd = &per_cpu(node_domains, i).sd;
8651 SD_INIT(sd, NODE);
8652 set_domain_attribute(sd, attr);
8653 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8654 sd->parent = parent;
8655 if (parent)
8656 parent->child = sd;
8657 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8468#endif 8658#endif
8659 return sd;
8660}
8469 8661
8470 p = sd; 8662static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8471 sd = &per_cpu(phys_domains, i).sd; 8663 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8472 SD_INIT(sd, CPU); 8664 struct sched_domain *parent, int i)
8473 set_domain_attribute(sd, attr); 8665{
8474 cpumask_copy(sched_domain_span(sd), nodemask); 8666 struct sched_domain *sd;
8475 sd->parent = p; 8667 sd = &per_cpu(phys_domains, i).sd;
8476 if (p) 8668 SD_INIT(sd, CPU);
8477 p->child = sd; 8669 set_domain_attribute(sd, attr);
8478 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 8670 cpumask_copy(sched_domain_span(sd), d->nodemask);
8671 sd->parent = parent;
8672 if (parent)
8673 parent->child = sd;
8674 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8675 return sd;
8676}
8479 8677
8678static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8679 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8680 struct sched_domain *parent, int i)
8681{
8682 struct sched_domain *sd = parent;
8480#ifdef CONFIG_SCHED_MC 8683#ifdef CONFIG_SCHED_MC
8481 p = sd; 8684 sd = &per_cpu(core_domains, i).sd;
8482 sd = &per_cpu(core_domains, i).sd; 8685 SD_INIT(sd, MC);
8483 SD_INIT(sd, MC); 8686 set_domain_attribute(sd, attr);
8484 set_domain_attribute(sd, attr); 8687 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8485 cpumask_and(sched_domain_span(sd), cpu_map, 8688 sd->parent = parent;
8486 cpu_coregroup_mask(i)); 8689 parent->child = sd;
8487 sd->parent = p; 8690 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8488 p->child = sd;
8489 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8490#endif 8691#endif
8692 return sd;
8693}
8491 8694
8695static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8696 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8697 struct sched_domain *parent, int i)
8698{
8699 struct sched_domain *sd = parent;
8492#ifdef CONFIG_SCHED_SMT 8700#ifdef CONFIG_SCHED_SMT
8493 p = sd; 8701 sd = &per_cpu(cpu_domains, i).sd;
8494 sd = &per_cpu(cpu_domains, i).sd; 8702 SD_INIT(sd, SIBLING);
8495 SD_INIT(sd, SIBLING); 8703 set_domain_attribute(sd, attr);
8496 set_domain_attribute(sd, attr); 8704 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8497 cpumask_and(sched_domain_span(sd), 8705 sd->parent = parent;
8498 topology_thread_cpumask(i), cpu_map); 8706 parent->child = sd;
8499 sd->parent = p; 8707 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8500 p->child = sd;
8501 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8502#endif 8708#endif
8503 } 8709 return sd;
8710}
8504 8711
8712static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8713 const struct cpumask *cpu_map, int cpu)
8714{
8715 switch (l) {
8505#ifdef CONFIG_SCHED_SMT 8716#ifdef CONFIG_SCHED_SMT
8506 /* Set up CPU (sibling) groups */ 8717 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8507 for_each_cpu(i, cpu_map) { 8718 cpumask_and(d->this_sibling_map, cpu_map,
8508 cpumask_and(this_sibling_map, 8719 topology_thread_cpumask(cpu));
8509 topology_thread_cpumask(i), cpu_map); 8720 if (cpu == cpumask_first(d->this_sibling_map))
8510 if (i != cpumask_first(this_sibling_map)) 8721 init_sched_build_groups(d->this_sibling_map, cpu_map,
8511 continue; 8722 &cpu_to_cpu_group,
8512 8723 d->send_covered, d->tmpmask);
8513 init_sched_build_groups(this_sibling_map, cpu_map, 8724 break;
8514 &cpu_to_cpu_group,
8515 send_covered, tmpmask);
8516 }
8517#endif 8725#endif
8518
8519#ifdef CONFIG_SCHED_MC 8726#ifdef CONFIG_SCHED_MC
8520 /* Set up multi-core groups */ 8727 case SD_LV_MC: /* set up multi-core groups */
8521 for_each_cpu(i, cpu_map) { 8728 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8522 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); 8729 if (cpu == cpumask_first(d->this_core_map))
8523 if (i != cpumask_first(this_core_map)) 8730 init_sched_build_groups(d->this_core_map, cpu_map,
8524 continue; 8731 &cpu_to_core_group,
8525 8732 d->send_covered, d->tmpmask);
8526 init_sched_build_groups(this_core_map, cpu_map, 8733 break;
8527 &cpu_to_core_group,
8528 send_covered, tmpmask);
8529 }
8530#endif 8734#endif
8531 8735 case SD_LV_CPU: /* set up physical groups */
8532 /* Set up physical groups */ 8736 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8533 for (i = 0; i < nr_node_ids; i++) { 8737 if (!cpumask_empty(d->nodemask))
8534 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8738 init_sched_build_groups(d->nodemask, cpu_map,
8535 if (cpumask_empty(nodemask)) 8739 &cpu_to_phys_group,
8536 continue; 8740 d->send_covered, d->tmpmask);
8537 8741 break;
8538 init_sched_build_groups(nodemask, cpu_map,
8539 &cpu_to_phys_group,
8540 send_covered, tmpmask);
8541 }
8542
8543#ifdef CONFIG_NUMA 8742#ifdef CONFIG_NUMA
8544 /* Set up node groups */ 8743 case SD_LV_ALLNODES:
8545 if (sd_allnodes) { 8744 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8546 init_sched_build_groups(cpu_map, cpu_map, 8745 d->send_covered, d->tmpmask);
8547 &cpu_to_allnodes_group, 8746 break;
8548 send_covered, tmpmask); 8747#endif
8748 default:
8749 break;
8549 } 8750 }
8751}
8550 8752
8551 for (i = 0; i < nr_node_ids; i++) { 8753/*
8552 /* Set up node groups */ 8754 * Build sched domains for a given set of cpus and attach the sched domains
8553 struct sched_group *sg, *prev; 8755 * to the individual cpus
8554 int j; 8756 */
8555 8757static int __build_sched_domains(const struct cpumask *cpu_map,
8556 cpumask_clear(covered); 8758 struct sched_domain_attr *attr)
8557 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8759{
8558 if (cpumask_empty(nodemask)) { 8760 enum s_alloc alloc_state = sa_none;
8559 sched_group_nodes[i] = NULL; 8761 struct s_data d;
8560 continue; 8762 struct sched_domain *sd;
8561 } 8763 int i;
8764#ifdef CONFIG_NUMA
8765 d.sd_allnodes = 0;
8766#endif
8562 8767
8563 sched_domain_node_span(i, domainspan); 8768 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8564 cpumask_and(domainspan, domainspan, cpu_map); 8769 if (alloc_state != sa_rootdomain)
8770 goto error;
8771 alloc_state = sa_sched_groups;
8565 8772
8566 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8773 /*
8567 GFP_KERNEL, i); 8774 * Set up domains for cpus specified by the cpu_map.
8568 if (!sg) { 8775 */
8569 printk(KERN_WARNING "Can not alloc domain group for " 8776 for_each_cpu(i, cpu_map) {
8570 "node %d\n", i); 8777 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8571 goto error; 8778 cpu_map);
8572 }
8573 sched_group_nodes[i] = sg;
8574 for_each_cpu(j, nodemask) {
8575 struct sched_domain *sd;
8576 8779
8577 sd = &per_cpu(node_domains, j).sd; 8780 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8578 sd->groups = sg; 8781 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8579 } 8782 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8580 sg->__cpu_power = 0; 8783 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8581 cpumask_copy(sched_group_cpus(sg), nodemask); 8784 }
8582 sg->next = sg;
8583 cpumask_or(covered, covered, nodemask);
8584 prev = sg;
8585 8785
8586 for (j = 0; j < nr_node_ids; j++) { 8786 for_each_cpu(i, cpu_map) {
8587 int n = (i + j) % nr_node_ids; 8787 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8788 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8789 }
8588 8790
8589 cpumask_complement(notcovered, covered); 8791 /* Set up physical groups */
8590 cpumask_and(tmpmask, notcovered, cpu_map); 8792 for (i = 0; i < nr_node_ids; i++)
8591 cpumask_and(tmpmask, tmpmask, domainspan); 8793 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8592 if (cpumask_empty(tmpmask))
8593 break;
8594 8794
8595 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); 8795#ifdef CONFIG_NUMA
8596 if (cpumask_empty(tmpmask)) 8796 /* Set up node groups */
8597 continue; 8797 if (d.sd_allnodes)
8798 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8598 8799
8599 sg = kmalloc_node(sizeof(struct sched_group) + 8800 for (i = 0; i < nr_node_ids; i++)
8600 cpumask_size(), 8801 if (build_numa_sched_groups(&d, cpu_map, i))
8601 GFP_KERNEL, i); 8802 goto error;
8602 if (!sg) {
8603 printk(KERN_WARNING
8604 "Can not alloc domain group for node %d\n", j);
8605 goto error;
8606 }
8607 sg->__cpu_power = 0;
8608 cpumask_copy(sched_group_cpus(sg), tmpmask);
8609 sg->next = prev->next;
8610 cpumask_or(covered, covered, tmpmask);
8611 prev->next = sg;
8612 prev = sg;
8613 }
8614 }
8615#endif 8803#endif
8616 8804
8617 /* Calculate CPU power for physical packages and nodes */ 8805 /* Calculate CPU power for physical packages and nodes */
8618#ifdef CONFIG_SCHED_SMT 8806#ifdef CONFIG_SCHED_SMT
8619 for_each_cpu(i, cpu_map) { 8807 for_each_cpu(i, cpu_map) {
8620 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; 8808 sd = &per_cpu(cpu_domains, i).sd;
8621
8622 init_sched_groups_power(i, sd); 8809 init_sched_groups_power(i, sd);
8623 } 8810 }
8624#endif 8811#endif
8625#ifdef CONFIG_SCHED_MC 8812#ifdef CONFIG_SCHED_MC
8626 for_each_cpu(i, cpu_map) { 8813 for_each_cpu(i, cpu_map) {
8627 struct sched_domain *sd = &per_cpu(core_domains, i).sd; 8814 sd = &per_cpu(core_domains, i).sd;
8628
8629 init_sched_groups_power(i, sd); 8815 init_sched_groups_power(i, sd);
8630 } 8816 }
8631#endif 8817#endif
8632 8818
8633 for_each_cpu(i, cpu_map) { 8819 for_each_cpu(i, cpu_map) {
8634 struct sched_domain *sd = &per_cpu(phys_domains, i).sd; 8820 sd = &per_cpu(phys_domains, i).sd;
8635
8636 init_sched_groups_power(i, sd); 8821 init_sched_groups_power(i, sd);
8637 } 8822 }
8638 8823
8639#ifdef CONFIG_NUMA 8824#ifdef CONFIG_NUMA
8640 for (i = 0; i < nr_node_ids; i++) 8825 for (i = 0; i < nr_node_ids; i++)
8641 init_numa_sched_groups_power(sched_group_nodes[i]); 8826 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8642 8827
8643 if (sd_allnodes) { 8828 if (d.sd_allnodes) {
8644 struct sched_group *sg; 8829 struct sched_group *sg;
8645 8830
8646 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8831 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8647 tmpmask); 8832 d.tmpmask);
8648 init_numa_sched_groups_power(sg); 8833 init_numa_sched_groups_power(sg);
8649 } 8834 }
8650#endif 8835#endif
8651 8836
8652 /* Attach the domains */ 8837 /* Attach the domains */
8653 for_each_cpu(i, cpu_map) { 8838 for_each_cpu(i, cpu_map) {
8654 struct sched_domain *sd;
8655#ifdef CONFIG_SCHED_SMT 8839#ifdef CONFIG_SCHED_SMT
8656 sd = &per_cpu(cpu_domains, i).sd; 8840 sd = &per_cpu(cpu_domains, i).sd;
8657#elif defined(CONFIG_SCHED_MC) 8841#elif defined(CONFIG_SCHED_MC)
@@ -8659,44 +8843,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8659#else 8843#else
8660 sd = &per_cpu(phys_domains, i).sd; 8844 sd = &per_cpu(phys_domains, i).sd;
8661#endif 8845#endif
8662 cpu_attach_domain(sd, rd, i); 8846 cpu_attach_domain(sd, d.rd, i);
8663 } 8847 }
8664 8848
8665 err = 0; 8849 d.sched_group_nodes = NULL; /* don't free this we still need it */
8666 8850 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8667free_tmpmask: 8851 return 0;
8668 free_cpumask_var(tmpmask);
8669free_send_covered:
8670 free_cpumask_var(send_covered);
8671free_this_core_map:
8672 free_cpumask_var(this_core_map);
8673free_this_sibling_map:
8674 free_cpumask_var(this_sibling_map);
8675free_nodemask:
8676 free_cpumask_var(nodemask);
8677free_notcovered:
8678#ifdef CONFIG_NUMA
8679 free_cpumask_var(notcovered);
8680free_covered:
8681 free_cpumask_var(covered);
8682free_domainspan:
8683 free_cpumask_var(domainspan);
8684out:
8685#endif
8686 return err;
8687
8688free_sched_groups:
8689#ifdef CONFIG_NUMA
8690 kfree(sched_group_nodes);
8691#endif
8692 goto free_tmpmask;
8693 8852
8694#ifdef CONFIG_NUMA
8695error: 8853error:
8696 free_sched_groups(cpu_map, tmpmask); 8854 __free_domain_allocs(&d, alloc_state, cpu_map);
8697 free_rootdomain(rd); 8855 return -ENOMEM;
8698 goto free_tmpmask;
8699#endif
8700} 8856}
8701 8857
8702static int build_sched_domains(const struct cpumask *cpu_map) 8858static int build_sched_domains(const struct cpumask *cpu_map)
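The monolithic body of __build_sched_domains() is now a chain of small builders: each __build_*_sched_domain() helper hangs one level under the parent it is handed and returns the new bottom of the hierarchy, so the per-CPU loop reads as a straight pipeline. A standalone sketch of that shape with hypothetical level names, not the real SD_INIT() machinery:

#include <stdio.h>
#include <stdlib.h>

struct domain {
    char name[16];
    struct domain *parent;
    struct domain *child;
};

/* Each builder hangs one level below 'parent' and returns the new bottom. */
static struct domain *build_level(const char *name, struct domain *parent)
{
    struct domain *d = calloc(1, sizeof(*d));

    if (!d)
        abort();
    snprintf(d->name, sizeof(d->name), "%s", name);
    d->parent = parent;
    if (parent)
        parent->child = d;
    return d;
}

int main(void)
{
    struct domain *sd = NULL;

    /* Same pipeline shape as the per-CPU loop above: NUMA -> CPU -> MC -> SMT. */
    sd = build_level("NODE", sd);
    sd = build_level("CPU", sd);
    sd = build_level("MC", sd);
    sd = build_level("SIBLING", sd);

    /* Walk from the bottom level back up to the root. */
    for (; sd; sd = sd->parent)
        printf("%s%s", sd->name, sd->parent ? " -> " : "\n");
    return 0;
}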
@@ -9304,11 +9460,11 @@ void __init sched_init(void)
9304 * system cpu resource, based on the weight assigned to root 9460 * system cpu resource, based on the weight assigned to root
9305 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9461 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9306 * by letting tasks of init_task_group sit in a separate cfs_rq 9462 * by letting tasks of init_task_group sit in a separate cfs_rq
9307 * (init_cfs_rq) and having one entity represent this group of 9463 * (init_tg_cfs_rq) and having one entity represent this group of
9308 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9464 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9309 */ 9465 */
9310 init_tg_cfs_entry(&init_task_group, 9466 init_tg_cfs_entry(&init_task_group,
9311 &per_cpu(init_cfs_rq, i), 9467 &per_cpu(init_tg_cfs_rq, i),
9312 &per_cpu(init_sched_entity, i), i, 1, 9468 &per_cpu(init_sched_entity, i), i, 1,
9313 root_task_group.se[i]); 9469 root_task_group.se[i]);
9314 9470
@@ -9334,6 +9490,7 @@ void __init sched_init(void)
9334#ifdef CONFIG_SMP 9490#ifdef CONFIG_SMP
9335 rq->sd = NULL; 9491 rq->sd = NULL;
9336 rq->rd = NULL; 9492 rq->rd = NULL;
9493 rq->post_schedule = 0;
9337 rq->active_balance = 0; 9494 rq->active_balance = 0;
9338 rq->next_balance = jiffies; 9495 rq->next_balance = jiffies;
9339 rq->push_cpu = 0; 9496 rq->push_cpu = 0;
@@ -9398,13 +9555,20 @@ void __init sched_init(void)
9398} 9555}
9399 9556
9400#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9557#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9401void __might_sleep(char *file, int line) 9558static inline int preempt_count_equals(int preempt_offset)
9559{
9560 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9561
9562 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9563}
9564
9565void __might_sleep(char *file, int line, int preempt_offset)
9402{ 9566{
9403#ifdef in_atomic 9567#ifdef in_atomic
9404 static unsigned long prev_jiffy; /* ratelimiting */ 9568 static unsigned long prev_jiffy; /* ratelimiting */
9405 9569
9406 if ((!in_atomic() && !irqs_disabled()) || 9570 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9407 system_state != SYSTEM_RUNNING || oops_in_progress) 9571 system_state != SYSTEM_RUNNING || oops_in_progress)
9408 return; 9572 return;
9409 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9573 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9410 return; 9574 return;
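__might_sleep() now takes a preempt_offset and compares the in-atomic part of preempt_count() (with PREEMPT_ACTIVE masked off) against an allowed baseline, which is what lets callers such as the renamed __cond_resched_lock() declare how deeply they may legitimately be nested. A standalone sketch of the surrounding idiom, a wrapper macro that captures the call site and an allowed offset before invoking the checker; all names and the flag value below are stand-ins, not the kernel's:

#include <stdio.h>

#define ACTIVE_FLAG 0x10000000U     /* stand-in for a high "being preempted" bit */

static unsigned int fake_preempt_count; /* stand-in for preempt_count() */

static void might_sleep_check(const char *file, int line, unsigned int allowed)
{
    unsigned int nested = fake_preempt_count & ~ACTIVE_FLAG;

    if (nested != allowed)
        fprintf(stderr, "BUG: sleeping with count %u (allowed %u) at %s:%d\n",
                nested, allowed, file, line);
}

/* The wrapper captures the caller's file/line and how deep it may legally be. */
#define might_sleep_holding(allowed) \
    might_sleep_check(__FILE__, __LINE__, (allowed))

int main(void)
{
    might_sleep_holding(0);     /* fine: not nested at all */

    fake_preempt_count = 1;     /* pretend we hold one spinlock-like level */
    might_sleep_holding(1);     /* fine: caller declared one level is OK */
    might_sleep_holding(0);     /* complains, with the caller's file:line */
    return 0;
}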
@@ -10581,3 +10745,113 @@ struct cgroup_subsys cpuacct_subsys = {
10581 .subsys_id = cpuacct_subsys_id, 10745 .subsys_id = cpuacct_subsys_id,
10582}; 10746};
10583#endif /* CONFIG_CGROUP_CPUACCT */ 10747#endif /* CONFIG_CGROUP_CPUACCT */
10748
10749#ifndef CONFIG_SMP
10750
10751int rcu_expedited_torture_stats(char *page)
10752{
10753 return 0;
10754}
10755EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10756
10757void synchronize_sched_expedited(void)
10758{
10759}
10760EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10761
10762#else /* #ifndef CONFIG_SMP */
10763
10764static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10765static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10766
10767#define RCU_EXPEDITED_STATE_POST -2
10768#define RCU_EXPEDITED_STATE_IDLE -1
10769
10770static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10771
10772int rcu_expedited_torture_stats(char *page)
10773{
10774 int cnt = 0;
10775 int cpu;
10776
10777 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10778 for_each_online_cpu(cpu) {
10779 cnt += sprintf(&page[cnt], " %d:%d",
10780 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10781 }
10782 cnt += sprintf(&page[cnt], "\n");
10783 return cnt;
10784}
10785EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10786
10787static long synchronize_sched_expedited_count;
10788
10789/*
10790 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
10791 * approach to force grace period to end quickly. This consumes
10792 * significant time on all CPUs, and is thus not recommended for
10793 * any sort of common-case code.
10794 *
10795 * Note that it is illegal to call this function while holding any
10796 * lock that is acquired by a CPU-hotplug notifier. Failing to
10797 * observe this restriction will result in deadlock.
10798 */
10799void synchronize_sched_expedited(void)
10800{
10801 int cpu;
10802 unsigned long flags;
10803 bool need_full_sync = 0;
10804 struct rq *rq;
10805 struct migration_req *req;
10806 long snap;
10807 int trycount = 0;
10808
10809 smp_mb(); /* ensure prior mod happens before capturing snap. */
10810 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10811 get_online_cpus();
10812 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10813 put_online_cpus();
10814 if (trycount++ < 10)
10815 udelay(trycount * num_online_cpus());
10816 else {
10817 synchronize_sched();
10818 return;
10819 }
10820 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10821 smp_mb(); /* ensure test happens before caller kfree */
10822 return;
10823 }
10824 get_online_cpus();
10825 }
10826 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10827 for_each_online_cpu(cpu) {
10828 rq = cpu_rq(cpu);
10829 req = &per_cpu(rcu_migration_req, cpu);
10830 init_completion(&req->done);
10831 req->task = NULL;
10832 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10833 spin_lock_irqsave(&rq->lock, flags);
10834 list_add(&req->list, &rq->migration_queue);
10835 spin_unlock_irqrestore(&rq->lock, flags);
10836 wake_up_process(rq->migration_thread);
10837 }
10838 for_each_online_cpu(cpu) {
10839 rcu_expedited_state = cpu;
10840 req = &per_cpu(rcu_migration_req, cpu);
10841 rq = cpu_rq(cpu);
10842 wait_for_completion(&req->done);
10843 spin_lock_irqsave(&rq->lock, flags);
10844 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10845 need_full_sync = 1;
10846 req->dest_cpu = RCU_MIGRATION_IDLE;
10847 spin_unlock_irqrestore(&rq->lock, flags);
10848 }
10849 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10850 mutex_unlock(&rcu_sched_expedited_mutex);
10851 put_online_cpus();
10852 if (need_full_sync)
10853 synchronize_sched();
10854}
10855EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10856
10857#endif /* #else #ifndef CONFIG_SMP */
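The trylock loop at the top of synchronize_sched_expedited() is a piggyback pattern: snapshot a completion counter, and if the mutex is busy either back off briefly or fall back to synchronize_sched(); if the counter has advanced past the snapshot in the meantime, an expedited pass that began after we did has already completed and covers us. A standalone pthread sketch of the counter-snapshot part, with hypothetical names (do_expensive_flush stands in for the real work):

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t flush_mutex = PTHREAD_MUTEX_INITIALIZER;
static atomic_long flush_count;     /* bumped after each completed flush */

/* Hypothetical expensive operation whose effect benefits every waiter at once. */
static void do_expensive_flush(void)
{
    /* ... the real work would go here ... */
}

static void flush_expedited(void)
{
    long snap = atomic_load(&flush_count) + 1;

    while (pthread_mutex_trylock(&flush_mutex) != 0) {
        /*
         * A count beyond 'snap' means a flush both started and finished
         * entirely after we got here, so it covers us too.
         */
        if (atomic_load(&flush_count) - snap > 0)
            return;
        sched_yield();  /* brief backoff before trying the lock again */
    }
    do_expensive_flush();
    atomic_fetch_add(&flush_count, 1);  /* publish completion to concurrent callers */
    pthread_mutex_unlock(&flush_mutex);
}

int main(void)
{
    flush_expedited();
    printf("flush_count=%ld\n", (long)atomic_load(&flush_count));
    return 0;
}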