author     Len Brown <len.brown@intel.com>    2009-09-19 00:11:26 -0400
committer  Len Brown <len.brown@intel.com>    2009-09-19 00:11:26 -0400
commit     c602c65b2f81d14456771d1e3f15d1381f4b7efa (patch)
tree       f1f833c8dd6c1519eeb101be32f7fe54a9605af5 /kernel/sched.c
parent     3834f47291df475be3f0f0fb7ccaa098967cc054 (diff)
parent     78f28b7c555359c67c2a0d23f7436e915329421e (diff)
Merge branch 'linus' into sfi-release
Conflicts:
arch/x86/kernel/setup.c
drivers/acpi/power.c
init/main.c
Signed-off-by: Len Brown <len.brown@intel.com>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--    kernel/sched.c    1632
1 files changed, 953 insertions, 679 deletions
diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e265273b..faf4d463bbff 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -64,7 +64,6 @@ | |||
64 | #include <linux/tsacct_kern.h> | 64 | #include <linux/tsacct_kern.h> |
65 | #include <linux/kprobes.h> | 65 | #include <linux/kprobes.h> |
66 | #include <linux/delayacct.h> | 66 | #include <linux/delayacct.h> |
67 | #include <linux/reciprocal_div.h> | ||
68 | #include <linux/unistd.h> | 67 | #include <linux/unistd.h> |
69 | #include <linux/pagemap.h> | 68 | #include <linux/pagemap.h> |
70 | #include <linux/hrtimer.h> | 69 | #include <linux/hrtimer.h> |
@@ -120,30 +119,6 @@ | |||
120 | */ | 119 | */ |
121 | #define RUNTIME_INF ((u64)~0ULL) | 120 | #define RUNTIME_INF ((u64)~0ULL) |
122 | 121 | ||
123 | #ifdef CONFIG_SMP | ||
124 | |||
125 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
126 | |||
127 | /* | ||
128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | ||
129 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | ||
130 | */ | ||
131 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | ||
132 | { | ||
133 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * Each time a sched group cpu_power is changed, | ||
138 | * we must compute its reciprocal value | ||
139 | */ | ||
140 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | ||
141 | { | ||
142 | sg->__cpu_power += val; | ||
143 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | ||
144 | } | ||
145 | #endif | ||
146 | |||
147 | static inline int rt_policy(int policy) | 122 | static inline int rt_policy(int policy) |
148 | { | 123 | { |
149 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 124 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user) | |||
309 | 284 | ||
310 | /* | 285 | /* |
311 | * Root task group. | 286 | * Root task group. |
312 | * Every UID task group (including init_task_group aka UID-0) will | 287 | * Every UID task group (including init_task_group aka UID-0) will |
313 | * be a child to this group. | 288 | * be a child to this group. |
314 | */ | 289 | */ |
315 | struct task_group root_task_group; | 290 | struct task_group root_task_group; |
316 | 291 | ||
@@ -318,12 +293,12 @@ struct task_group root_task_group; | |||
318 | /* Default task group's sched entity on each cpu */ | 293 | /* Default task group's sched entity on each cpu */ |
319 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 294 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
320 | /* Default task group's cfs_rq on each cpu */ | 295 | /* Default task group's cfs_rq on each cpu */ |
321 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 296 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); |
322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 297 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
323 | 298 | ||
324 | #ifdef CONFIG_RT_GROUP_SCHED | 299 | #ifdef CONFIG_RT_GROUP_SCHED |
325 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 300 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
326 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 301 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); |
327 | #endif /* CONFIG_RT_GROUP_SCHED */ | 302 | #endif /* CONFIG_RT_GROUP_SCHED */ |
328 | #else /* !CONFIG_USER_SCHED */ | 303 | #else /* !CONFIG_USER_SCHED */ |
329 | #define root_task_group init_task_group | 304 | #define root_task_group init_task_group |
@@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
401 | 376 | ||
402 | #else | 377 | #else |
403 | 378 | ||
404 | #ifdef CONFIG_SMP | ||
405 | static int root_task_group_empty(void) | ||
406 | { | ||
407 | return 1; | ||
408 | } | ||
409 | #endif | ||
410 | |||
411 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 379 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
412 | static inline struct task_group *task_group(struct task_struct *p) | 380 | static inline struct task_group *task_group(struct task_struct *p) |
413 | { | 381 | { |
@@ -537,14 +505,6 @@ struct root_domain { | |||
537 | #ifdef CONFIG_SMP | 505 | #ifdef CONFIG_SMP |
538 | struct cpupri cpupri; | 506 | struct cpupri cpupri; |
539 | #endif | 507 | #endif |
540 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
541 | /* | ||
542 | * Preferred wake up cpu nominated by sched_mc balance that will be | ||
543 | * used when most cpus are idle in the system indicating overall very | ||
544 | * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) | ||
545 | */ | ||
546 | unsigned int sched_mc_preferred_wakeup_cpu; | ||
547 | #endif | ||
548 | }; | 508 | }; |
549 | 509 | ||
550 | /* | 510 | /* |
@@ -616,6 +576,7 @@ struct rq { | |||
616 | 576 | ||
617 | unsigned char idle_at_tick; | 577 | unsigned char idle_at_tick; |
618 | /* For active balancing */ | 578 | /* For active balancing */ |
579 | int post_schedule; | ||
619 | int active_balance; | 580 | int active_balance; |
620 | int push_cpu; | 581 | int push_cpu; |
621 | /* cpu of this runqueue: */ | 582 | /* cpu of this runqueue: */ |
@@ -626,6 +587,9 @@ struct rq { | |||
626 | 587 | ||
627 | struct task_struct *migration_thread; | 588 | struct task_struct *migration_thread; |
628 | struct list_head migration_queue; | 589 | struct list_head migration_queue; |
590 | |||
591 | u64 rt_avg; | ||
592 | u64 age_stamp; | ||
629 | #endif | 593 | #endif |
630 | 594 | ||
631 | /* calc_load related fields */ | 595 | /* calc_load related fields */ |
@@ -665,9 +629,10 @@ struct rq { | |||
665 | 629 | ||
666 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 630 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
667 | 631 | ||
668 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) | 632 | static inline |
633 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
669 | { | 634 | { |
670 | rq->curr->sched_class->check_preempt_curr(rq, p, sync); | 635 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); |
671 | } | 636 | } |
672 | 637 | ||
673 | static inline int cpu_of(struct rq *rq) | 638 | static inline int cpu_of(struct rq *rq) |
@@ -693,6 +658,7 @@ static inline int cpu_of(struct rq *rq) | |||
693 | #define this_rq() (&__get_cpu_var(runqueues)) | 658 | #define this_rq() (&__get_cpu_var(runqueues)) |
694 | #define task_rq(p) cpu_rq(task_cpu(p)) | 659 | #define task_rq(p) cpu_rq(task_cpu(p)) |
695 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 660 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
661 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
696 | 662 | ||
697 | inline void update_rq_clock(struct rq *rq) | 663 | inline void update_rq_clock(struct rq *rq) |
698 | { | 664 | { |
@@ -861,6 +827,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; | |||
861 | unsigned int sysctl_sched_shares_thresh = 4; | 827 | unsigned int sysctl_sched_shares_thresh = 4; |
862 | 828 | ||
863 | /* | 829 | /* |
830 | * period over which we average the RT time consumption, measured | ||
831 | * in ms. | ||
832 | * | ||
833 | * default: 1s | ||
834 | */ | ||
835 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | ||
836 | |||
837 | /* | ||
864 | * period over which we measure -rt task cpu usage in us. | 838 | * period over which we measure -rt task cpu usage in us. |
865 | * default: 1s | 839 | * default: 1s |
866 | */ | 840 | */ |
@@ -1278,12 +1252,37 @@ void wake_up_idle_cpu(int cpu) | |||
1278 | } | 1252 | } |
1279 | #endif /* CONFIG_NO_HZ */ | 1253 | #endif /* CONFIG_NO_HZ */ |
1280 | 1254 | ||
1255 | static u64 sched_avg_period(void) | ||
1256 | { | ||
1257 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
1258 | } | ||
1259 | |||
1260 | static void sched_avg_update(struct rq *rq) | ||
1261 | { | ||
1262 | s64 period = sched_avg_period(); | ||
1263 | |||
1264 | while ((s64)(rq->clock - rq->age_stamp) > period) { | ||
1265 | rq->age_stamp += period; | ||
1266 | rq->rt_avg /= 2; | ||
1267 | } | ||
1268 | } | ||
1269 | |||
1270 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1271 | { | ||
1272 | rq->rt_avg += rt_delta; | ||
1273 | sched_avg_update(rq); | ||
1274 | } | ||
1275 | |||
1281 | #else /* !CONFIG_SMP */ | 1276 | #else /* !CONFIG_SMP */ |
1282 | static void resched_task(struct task_struct *p) | 1277 | static void resched_task(struct task_struct *p) |
1283 | { | 1278 | { |
1284 | assert_spin_locked(&task_rq(p)->lock); | 1279 | assert_spin_locked(&task_rq(p)->lock); |
1285 | set_tsk_need_resched(p); | 1280 | set_tsk_need_resched(p); |
1286 | } | 1281 | } |
1282 | |||
1283 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1284 | { | ||
1285 | } | ||
1287 | #endif /* CONFIG_SMP */ | 1286 | #endif /* CONFIG_SMP */ |
1288 | 1287 | ||
1289 | #if BITS_PER_LONG == 32 | 1288 | #if BITS_PER_LONG == 32 |
@@ -1494,8 +1493,65 @@ static int tg_nop(struct task_group *tg, void *data) | |||
1494 | #endif | 1493 | #endif |
1495 | 1494 | ||
1496 | #ifdef CONFIG_SMP | 1495 | #ifdef CONFIG_SMP |
1497 | static unsigned long source_load(int cpu, int type); | 1496 | /* Used instead of source_load when we know the type == 0 */ |
1498 | static unsigned long target_load(int cpu, int type); | 1497 | static unsigned long weighted_cpuload(const int cpu) |
1498 | { | ||
1499 | return cpu_rq(cpu)->load.weight; | ||
1500 | } | ||
1501 | |||
1502 | /* | ||
1503 | * Return a low guess at the load of a migration-source cpu weighted | ||
1504 | * according to the scheduling class and "nice" value. | ||
1505 | * | ||
1506 | * We want to under-estimate the load of migration sources, to | ||
1507 | * balance conservatively. | ||
1508 | */ | ||
1509 | static unsigned long source_load(int cpu, int type) | ||
1510 | { | ||
1511 | struct rq *rq = cpu_rq(cpu); | ||
1512 | unsigned long total = weighted_cpuload(cpu); | ||
1513 | |||
1514 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1515 | return total; | ||
1516 | |||
1517 | return min(rq->cpu_load[type-1], total); | ||
1518 | } | ||
1519 | |||
1520 | /* | ||
1521 | * Return a high guess at the load of a migration-target cpu weighted | ||
1522 | * according to the scheduling class and "nice" value. | ||
1523 | */ | ||
1524 | static unsigned long target_load(int cpu, int type) | ||
1525 | { | ||
1526 | struct rq *rq = cpu_rq(cpu); | ||
1527 | unsigned long total = weighted_cpuload(cpu); | ||
1528 | |||
1529 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1530 | return total; | ||
1531 | |||
1532 | return max(rq->cpu_load[type-1], total); | ||
1533 | } | ||
1534 | |||
1535 | static struct sched_group *group_of(int cpu) | ||
1536 | { | ||
1537 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | ||
1538 | |||
1539 | if (!sd) | ||
1540 | return NULL; | ||
1541 | |||
1542 | return sd->groups; | ||
1543 | } | ||
1544 | |||
1545 | static unsigned long power_of(int cpu) | ||
1546 | { | ||
1547 | struct sched_group *group = group_of(cpu); | ||
1548 | |||
1549 | if (!group) | ||
1550 | return SCHED_LOAD_SCALE; | ||
1551 | |||
1552 | return group->cpu_power; | ||
1553 | } | ||
1554 | |||
1499 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1555 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1500 | 1556 | ||
1501 | static unsigned long cpu_avg_load_per_task(int cpu) | 1557 | static unsigned long cpu_avg_load_per_task(int cpu) |
@@ -1513,28 +1569,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1513 | 1569 | ||
1514 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1570 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1515 | 1571 | ||
1572 | struct update_shares_data { | ||
1573 | unsigned long rq_weight[NR_CPUS]; | ||
1574 | }; | ||
1575 | |||
1576 | static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); | ||
1577 | |||
1516 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1578 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1517 | 1579 | ||
1518 | /* | 1580 | /* |
1519 | * Calculate and set the cpu's group shares. | 1581 | * Calculate and set the cpu's group shares. |
1520 | */ | 1582 | */ |
1521 | static void | 1583 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
1522 | update_group_shares_cpu(struct task_group *tg, int cpu, | 1584 | unsigned long sd_shares, |
1523 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1585 | unsigned long sd_rq_weight, |
1586 | struct update_shares_data *usd) | ||
1524 | { | 1587 | { |
1525 | unsigned long shares; | 1588 | unsigned long shares, rq_weight; |
1526 | unsigned long rq_weight; | 1589 | int boost = 0; |
1527 | |||
1528 | if (!tg->se[cpu]) | ||
1529 | return; | ||
1530 | 1590 | ||
1531 | rq_weight = tg->cfs_rq[cpu]->rq_weight; | 1591 | rq_weight = usd->rq_weight[cpu]; |
1592 | if (!rq_weight) { | ||
1593 | boost = 1; | ||
1594 | rq_weight = NICE_0_LOAD; | ||
1595 | } | ||
1532 | 1596 | ||
1533 | /* | 1597 | /* |
1534 | * \Sum shares * rq_weight | 1598 | * \Sum_j shares_j * rq_weight_i |
1535 | * shares = ----------------------- | 1599 | * shares_i = ----------------------------- |
1536 | * \Sum rq_weight | 1600 | * \Sum_j rq_weight_j |
1537 | * | ||
1538 | */ | 1601 | */ |
1539 | shares = (sd_shares * rq_weight) / sd_rq_weight; | 1602 | shares = (sd_shares * rq_weight) / sd_rq_weight; |
1540 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | 1603 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); |
@@ -1545,8 +1608,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1545 | unsigned long flags; | 1608 | unsigned long flags; |
1546 | 1609 | ||
1547 | spin_lock_irqsave(&rq->lock, flags); | 1610 | spin_lock_irqsave(&rq->lock, flags); |
1548 | tg->cfs_rq[cpu]->shares = shares; | 1611 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; |
1549 | 1612 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | |
1550 | __set_se_shares(tg->se[cpu], shares); | 1613 | __set_se_shares(tg->se[cpu], shares); |
1551 | spin_unlock_irqrestore(&rq->lock, flags); | 1614 | spin_unlock_irqrestore(&rq->lock, flags); |
1552 | } | 1615 | } |
@@ -1559,22 +1622,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1559 | */ | 1622 | */ |
1560 | static int tg_shares_up(struct task_group *tg, void *data) | 1623 | static int tg_shares_up(struct task_group *tg, void *data) |
1561 | { | 1624 | { |
1562 | unsigned long weight, rq_weight = 0; | 1625 | unsigned long weight, rq_weight = 0, shares = 0; |
1563 | unsigned long shares = 0; | 1626 | struct update_shares_data *usd; |
1564 | struct sched_domain *sd = data; | 1627 | struct sched_domain *sd = data; |
1628 | unsigned long flags; | ||
1565 | int i; | 1629 | int i; |
1566 | 1630 | ||
1631 | if (!tg->se[0]) | ||
1632 | return 0; | ||
1633 | |||
1634 | local_irq_save(flags); | ||
1635 | usd = &__get_cpu_var(update_shares_data); | ||
1636 | |||
1567 | for_each_cpu(i, sched_domain_span(sd)) { | 1637 | for_each_cpu(i, sched_domain_span(sd)) { |
1638 | weight = tg->cfs_rq[i]->load.weight; | ||
1639 | usd->rq_weight[i] = weight; | ||
1640 | |||
1568 | /* | 1641 | /* |
1569 | * If there are currently no tasks on the cpu pretend there | 1642 | * If there are currently no tasks on the cpu pretend there |
1570 | * is one of average load so that when a new task gets to | 1643 | * is one of average load so that when a new task gets to |
1571 | * run here it will not get delayed by group starvation. | 1644 | * run here it will not get delayed by group starvation. |
1572 | */ | 1645 | */ |
1573 | weight = tg->cfs_rq[i]->load.weight; | ||
1574 | if (!weight) | 1646 | if (!weight) |
1575 | weight = NICE_0_LOAD; | 1647 | weight = NICE_0_LOAD; |
1576 | 1648 | ||
1577 | tg->cfs_rq[i]->rq_weight = weight; | ||
1578 | rq_weight += weight; | 1649 | rq_weight += weight; |
1579 | shares += tg->cfs_rq[i]->shares; | 1650 | shares += tg->cfs_rq[i]->shares; |
1580 | } | 1651 | } |
@@ -1586,7 +1657,9 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1586 | shares = tg->shares; | 1657 | shares = tg->shares; |
1587 | 1658 | ||
1588 | for_each_cpu(i, sched_domain_span(sd)) | 1659 | for_each_cpu(i, sched_domain_span(sd)) |
1589 | update_group_shares_cpu(tg, i, shares, rq_weight); | 1660 | update_group_shares_cpu(tg, i, shares, rq_weight, usd); |
1661 | |||
1662 | local_irq_restore(flags); | ||
1590 | 1663 | ||
1591 | return 0; | 1664 | return 0; |
1592 | } | 1665 | } |
@@ -1616,8 +1689,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1616 | 1689 | ||
1617 | static void update_shares(struct sched_domain *sd) | 1690 | static void update_shares(struct sched_domain *sd) |
1618 | { | 1691 | { |
1619 | u64 now = cpu_clock(raw_smp_processor_id()); | 1692 | s64 elapsed; |
1620 | s64 elapsed = now - sd->last_update; | 1693 | u64 now; |
1694 | |||
1695 | if (root_task_group_empty()) | ||
1696 | return; | ||
1697 | |||
1698 | now = cpu_clock(raw_smp_processor_id()); | ||
1699 | elapsed = now - sd->last_update; | ||
1621 | 1700 | ||
1622 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1701 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
1623 | sd->last_update = now; | 1702 | sd->last_update = now; |
@@ -1627,6 +1706,9 @@ static void update_shares(struct sched_domain *sd) | |||
1627 | 1706 | ||
1628 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | 1707 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
1629 | { | 1708 | { |
1709 | if (root_task_group_empty()) | ||
1710 | return; | ||
1711 | |||
1630 | spin_unlock(&rq->lock); | 1712 | spin_unlock(&rq->lock); |
1631 | update_shares(sd); | 1713 | update_shares(sd); |
1632 | spin_lock(&rq->lock); | 1714 | spin_lock(&rq->lock); |
@@ -1634,6 +1716,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1634 | 1716 | ||
1635 | static void update_h_load(long cpu) | 1717 | static void update_h_load(long cpu) |
1636 | { | 1718 | { |
1719 | if (root_task_group_empty()) | ||
1720 | return; | ||
1721 | |||
1637 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1722 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1638 | } | 1723 | } |
1639 | 1724 | ||
@@ -1651,6 +1736,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1651 | 1736 | ||
1652 | #ifdef CONFIG_PREEMPT | 1737 | #ifdef CONFIG_PREEMPT |
1653 | 1738 | ||
1739 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
1740 | |||
1654 | /* | 1741 | /* |
1655 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | 1742 | * fair double_lock_balance: Safely acquires both rq->locks in a fair |
1656 | * way at the expense of forcing extra atomic operations in all | 1743 | * way at the expense of forcing extra atomic operations in all |
@@ -1915,13 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1915 | } | 2002 | } |
1916 | 2003 | ||
1917 | #ifdef CONFIG_SMP | 2004 | #ifdef CONFIG_SMP |
1918 | |||
1919 | /* Used instead of source_load when we know the type == 0 */ | ||
1920 | static unsigned long weighted_cpuload(const int cpu) | ||
1921 | { | ||
1922 | return cpu_rq(cpu)->load.weight; | ||
1923 | } | ||
1924 | |||
1925 | /* | 2005 | /* |
1926 | * Is this task likely cache-hot: | 2006 | * Is this task likely cache-hot: |
1927 | */ | 2007 | */ |
@@ -2195,186 +2275,6 @@ void kick_process(struct task_struct *p) | |||
2195 | preempt_enable(); | 2275 | preempt_enable(); |
2196 | } | 2276 | } |
2197 | EXPORT_SYMBOL_GPL(kick_process); | 2277 | EXPORT_SYMBOL_GPL(kick_process); |
2198 | |||
2199 | /* | ||
2200 | * Return a low guess at the load of a migration-source cpu weighted | ||
2201 | * according to the scheduling class and "nice" value. | ||
2202 | * | ||
2203 | * We want to under-estimate the load of migration sources, to | ||
2204 | * balance conservatively. | ||
2205 | */ | ||
2206 | static unsigned long source_load(int cpu, int type) | ||
2207 | { | ||
2208 | struct rq *rq = cpu_rq(cpu); | ||
2209 | unsigned long total = weighted_cpuload(cpu); | ||
2210 | |||
2211 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2212 | return total; | ||
2213 | |||
2214 | return min(rq->cpu_load[type-1], total); | ||
2215 | } | ||
2216 | |||
2217 | /* | ||
2218 | * Return a high guess at the load of a migration-target cpu weighted | ||
2219 | * according to the scheduling class and "nice" value. | ||
2220 | */ | ||
2221 | static unsigned long target_load(int cpu, int type) | ||
2222 | { | ||
2223 | struct rq *rq = cpu_rq(cpu); | ||
2224 | unsigned long total = weighted_cpuload(cpu); | ||
2225 | |||
2226 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2227 | return total; | ||
2228 | |||
2229 | return max(rq->cpu_load[type-1], total); | ||
2230 | } | ||
2231 | |||
2232 | /* | ||
2233 | * find_idlest_group finds and returns the least busy CPU group within the | ||
2234 | * domain. | ||
2235 | */ | ||
2236 | static struct sched_group * | ||
2237 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||
2238 | { | ||
2239 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | ||
2240 | unsigned long min_load = ULONG_MAX, this_load = 0; | ||
2241 | int load_idx = sd->forkexec_idx; | ||
2242 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | ||
2243 | |||
2244 | do { | ||
2245 | unsigned long load, avg_load; | ||
2246 | int local_group; | ||
2247 | int i; | ||
2248 | |||
2249 | /* Skip over this group if it has no CPUs allowed */ | ||
2250 | if (!cpumask_intersects(sched_group_cpus(group), | ||
2251 | &p->cpus_allowed)) | ||
2252 | continue; | ||
2253 | |||
2254 | local_group = cpumask_test_cpu(this_cpu, | ||
2255 | sched_group_cpus(group)); | ||
2256 | |||
2257 | /* Tally up the load of all CPUs in the group */ | ||
2258 | avg_load = 0; | ||
2259 | |||
2260 | for_each_cpu(i, sched_group_cpus(group)) { | ||
2261 | /* Bias balancing toward cpus of our domain */ | ||
2262 | if (local_group) | ||
2263 | load = source_load(i, load_idx); | ||
2264 | else | ||
2265 | load = target_load(i, load_idx); | ||
2266 | |||
2267 | avg_load += load; | ||
2268 | } | ||
2269 | |||
2270 | /* Adjust by relative CPU power of the group */ | ||
2271 | avg_load = sg_div_cpu_power(group, | ||
2272 | avg_load * SCHED_LOAD_SCALE); | ||
2273 | |||
2274 | if (local_group) { | ||
2275 | this_load = avg_load; | ||
2276 | this = group; | ||
2277 | } else if (avg_load < min_load) { | ||
2278 | min_load = avg_load; | ||
2279 | idlest = group; | ||
2280 | } | ||
2281 | } while (group = group->next, group != sd->groups); | ||
2282 | |||
2283 | if (!idlest || 100*this_load < imbalance*min_load) | ||
2284 | return NULL; | ||
2285 | return idlest; | ||
2286 | } | ||
2287 | |||
2288 | /* | ||
2289 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | ||
2290 | */ | ||
2291 | static int | ||
2292 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | ||
2293 | { | ||
2294 | unsigned long load, min_load = ULONG_MAX; | ||
2295 | int idlest = -1; | ||
2296 | int i; | ||
2297 | |||
2298 | /* Traverse only the allowed CPUs */ | ||
2299 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | ||
2300 | load = weighted_cpuload(i); | ||
2301 | |||
2302 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
2303 | min_load = load; | ||
2304 | idlest = i; | ||
2305 | } | ||
2306 | } | ||
2307 | |||
2308 | return idlest; | ||
2309 | } | ||
2310 | |||
2311 | /* | ||
2312 | * sched_balance_self: balance the current task (running on cpu) in domains | ||
2313 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | ||
2314 | * SD_BALANCE_EXEC. | ||
2315 | * | ||
2316 | * Balance, ie. select the least loaded group. | ||
2317 | * | ||
2318 | * Returns the target CPU number, or the same CPU if no balancing is needed. | ||
2319 | * | ||
2320 | * preempt must be disabled. | ||
2321 | */ | ||
2322 | static int sched_balance_self(int cpu, int flag) | ||
2323 | { | ||
2324 | struct task_struct *t = current; | ||
2325 | struct sched_domain *tmp, *sd = NULL; | ||
2326 | |||
2327 | for_each_domain(cpu, tmp) { | ||
2328 | /* | ||
2329 | * If power savings logic is enabled for a domain, stop there. | ||
2330 | */ | ||
2331 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
2332 | break; | ||
2333 | if (tmp->flags & flag) | ||
2334 | sd = tmp; | ||
2335 | } | ||
2336 | |||
2337 | if (sd) | ||
2338 | update_shares(sd); | ||
2339 | |||
2340 | while (sd) { | ||
2341 | struct sched_group *group; | ||
2342 | int new_cpu, weight; | ||
2343 | |||
2344 | if (!(sd->flags & flag)) { | ||
2345 | sd = sd->child; | ||
2346 | continue; | ||
2347 | } | ||
2348 | |||
2349 | group = find_idlest_group(sd, t, cpu); | ||
2350 | if (!group) { | ||
2351 | sd = sd->child; | ||
2352 | continue; | ||
2353 | } | ||
2354 | |||
2355 | new_cpu = find_idlest_cpu(group, t, cpu); | ||
2356 | if (new_cpu == -1 || new_cpu == cpu) { | ||
2357 | /* Now try balancing at a lower domain level of cpu */ | ||
2358 | sd = sd->child; | ||
2359 | continue; | ||
2360 | } | ||
2361 | |||
2362 | /* Now try balancing at a lower domain level of new_cpu */ | ||
2363 | cpu = new_cpu; | ||
2364 | weight = cpumask_weight(sched_domain_span(sd)); | ||
2365 | sd = NULL; | ||
2366 | for_each_domain(cpu, tmp) { | ||
2367 | if (weight <= cpumask_weight(sched_domain_span(tmp))) | ||
2368 | break; | ||
2369 | if (tmp->flags & flag) | ||
2370 | sd = tmp; | ||
2371 | } | ||
2372 | /* while loop will break here if sd == NULL */ | ||
2373 | } | ||
2374 | |||
2375 | return cpu; | ||
2376 | } | ||
2377 | |||
2378 | #endif /* CONFIG_SMP */ | 2278 | #endif /* CONFIG_SMP */ |
2379 | 2279 | ||
2380 | /** | 2280 | /** |
@@ -2412,37 +2312,22 @@ void task_oncpu_function_call(struct task_struct *p, | |||
2412 | * | 2312 | * |
2413 | * returns failure only if the task is already active. | 2313 | * returns failure only if the task is already active. |
2414 | */ | 2314 | */ |
2415 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 2315 | static int try_to_wake_up(struct task_struct *p, unsigned int state, |
2316 | int wake_flags) | ||
2416 | { | 2317 | { |
2417 | int cpu, orig_cpu, this_cpu, success = 0; | 2318 | int cpu, orig_cpu, this_cpu, success = 0; |
2418 | unsigned long flags; | 2319 | unsigned long flags; |
2419 | long old_state; | ||
2420 | struct rq *rq; | 2320 | struct rq *rq; |
2421 | 2321 | ||
2422 | if (!sched_feat(SYNC_WAKEUPS)) | 2322 | if (!sched_feat(SYNC_WAKEUPS)) |
2423 | sync = 0; | 2323 | wake_flags &= ~WF_SYNC; |
2424 | |||
2425 | #ifdef CONFIG_SMP | ||
2426 | if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { | ||
2427 | struct sched_domain *sd; | ||
2428 | |||
2429 | this_cpu = raw_smp_processor_id(); | ||
2430 | cpu = task_cpu(p); | ||
2431 | 2324 | ||
2432 | for_each_domain(this_cpu, sd) { | 2325 | this_cpu = get_cpu(); |
2433 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2434 | update_shares(sd); | ||
2435 | break; | ||
2436 | } | ||
2437 | } | ||
2438 | } | ||
2439 | #endif | ||
2440 | 2326 | ||
2441 | smp_wmb(); | 2327 | smp_wmb(); |
2442 | rq = task_rq_lock(p, &flags); | 2328 | rq = task_rq_lock(p, &flags); |
2443 | update_rq_clock(rq); | 2329 | update_rq_clock(rq); |
2444 | old_state = p->state; | 2330 | if (!(p->state & state)) |
2445 | if (!(old_state & state)) | ||
2446 | goto out; | 2331 | goto out; |
2447 | 2332 | ||
2448 | if (p->se.on_rq) | 2333 | if (p->se.on_rq) |
@@ -2450,27 +2335,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2450 | 2335 | ||
2451 | cpu = task_cpu(p); | 2336 | cpu = task_cpu(p); |
2452 | orig_cpu = cpu; | 2337 | orig_cpu = cpu; |
2453 | this_cpu = smp_processor_id(); | ||
2454 | 2338 | ||
2455 | #ifdef CONFIG_SMP | 2339 | #ifdef CONFIG_SMP |
2456 | if (unlikely(task_running(rq, p))) | 2340 | if (unlikely(task_running(rq, p))) |
2457 | goto out_activate; | 2341 | goto out_activate; |
2458 | 2342 | ||
2459 | cpu = p->sched_class->select_task_rq(p, sync); | 2343 | /* |
2460 | if (cpu != orig_cpu) { | 2344 | * In order to handle concurrent wakeups and release the rq->lock |
2345 | * we put the task in TASK_WAKING state. | ||
2346 | * | ||
2347 | * First fix up the nr_uninterruptible count: | ||
2348 | */ | ||
2349 | if (task_contributes_to_load(p)) | ||
2350 | rq->nr_uninterruptible--; | ||
2351 | p->state = TASK_WAKING; | ||
2352 | task_rq_unlock(rq, &flags); | ||
2353 | |||
2354 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | ||
2355 | if (cpu != orig_cpu) | ||
2461 | set_task_cpu(p, cpu); | 2356 | set_task_cpu(p, cpu); |
2462 | task_rq_unlock(rq, &flags); | ||
2463 | /* might preempt at this point */ | ||
2464 | rq = task_rq_lock(p, &flags); | ||
2465 | old_state = p->state; | ||
2466 | if (!(old_state & state)) | ||
2467 | goto out; | ||
2468 | if (p->se.on_rq) | ||
2469 | goto out_running; | ||
2470 | 2357 | ||
2471 | this_cpu = smp_processor_id(); | 2358 | rq = task_rq_lock(p, &flags); |
2472 | cpu = task_cpu(p); | 2359 | WARN_ON(p->state != TASK_WAKING); |
2473 | } | 2360 | cpu = task_cpu(p); |
2474 | 2361 | ||
2475 | #ifdef CONFIG_SCHEDSTATS | 2362 | #ifdef CONFIG_SCHEDSTATS |
2476 | schedstat_inc(rq, ttwu_count); | 2363 | schedstat_inc(rq, ttwu_count); |
@@ -2490,7 +2377,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2490 | out_activate: | 2377 | out_activate: |
2491 | #endif /* CONFIG_SMP */ | 2378 | #endif /* CONFIG_SMP */ |
2492 | schedstat_inc(p, se.nr_wakeups); | 2379 | schedstat_inc(p, se.nr_wakeups); |
2493 | if (sync) | 2380 | if (wake_flags & WF_SYNC) |
2494 | schedstat_inc(p, se.nr_wakeups_sync); | 2381 | schedstat_inc(p, se.nr_wakeups_sync); |
2495 | if (orig_cpu != cpu) | 2382 | if (orig_cpu != cpu) |
2496 | schedstat_inc(p, se.nr_wakeups_migrate); | 2383 | schedstat_inc(p, se.nr_wakeups_migrate); |
@@ -2519,7 +2406,7 @@ out_activate: | |||
2519 | 2406 | ||
2520 | out_running: | 2407 | out_running: |
2521 | trace_sched_wakeup(rq, p, success); | 2408 | trace_sched_wakeup(rq, p, success); |
2522 | check_preempt_curr(rq, p, sync); | 2409 | check_preempt_curr(rq, p, wake_flags); |
2523 | 2410 | ||
2524 | p->state = TASK_RUNNING; | 2411 | p->state = TASK_RUNNING; |
2525 | #ifdef CONFIG_SMP | 2412 | #ifdef CONFIG_SMP |
@@ -2528,6 +2415,7 @@ out_running: | |||
2528 | #endif | 2415 | #endif |
2529 | out: | 2416 | out: |
2530 | task_rq_unlock(rq, &flags); | 2417 | task_rq_unlock(rq, &flags); |
2418 | put_cpu(); | ||
2531 | 2419 | ||
2532 | return success; | 2420 | return success; |
2533 | } | 2421 | } |
@@ -2570,6 +2458,7 @@ static void __sched_fork(struct task_struct *p) | |||
2570 | p->se.avg_overlap = 0; | 2458 | p->se.avg_overlap = 0; |
2571 | p->se.start_runtime = 0; | 2459 | p->se.start_runtime = 0; |
2572 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; | 2460 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; |
2461 | p->se.avg_running = 0; | ||
2573 | 2462 | ||
2574 | #ifdef CONFIG_SCHEDSTATS | 2463 | #ifdef CONFIG_SCHEDSTATS |
2575 | p->se.wait_start = 0; | 2464 | p->se.wait_start = 0; |
@@ -2631,18 +2520,41 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2631 | 2520 | ||
2632 | __sched_fork(p); | 2521 | __sched_fork(p); |
2633 | 2522 | ||
2634 | #ifdef CONFIG_SMP | ||
2635 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | ||
2636 | #endif | ||
2637 | set_task_cpu(p, cpu); | ||
2638 | |||
2639 | /* | 2523 | /* |
2640 | * Make sure we do not leak PI boosting priority to the child: | 2524 | * Make sure we do not leak PI boosting priority to the child. |
2641 | */ | 2525 | */ |
2642 | p->prio = current->normal_prio; | 2526 | p->prio = current->normal_prio; |
2527 | |||
2528 | /* | ||
2529 | * Revert to default priority/policy on fork if requested. | ||
2530 | */ | ||
2531 | if (unlikely(p->sched_reset_on_fork)) { | ||
2532 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) | ||
2533 | p->policy = SCHED_NORMAL; | ||
2534 | |||
2535 | if (p->normal_prio < DEFAULT_PRIO) | ||
2536 | p->prio = DEFAULT_PRIO; | ||
2537 | |||
2538 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2539 | p->static_prio = NICE_TO_PRIO(0); | ||
2540 | set_load_weight(p); | ||
2541 | } | ||
2542 | |||
2543 | /* | ||
2544 | * We don't need the reset flag anymore after the fork. It has | ||
2545 | * fulfilled its duty: | ||
2546 | */ | ||
2547 | p->sched_reset_on_fork = 0; | ||
2548 | } | ||
2549 | |||
2643 | if (!rt_prio(p->prio)) | 2550 | if (!rt_prio(p->prio)) |
2644 | p->sched_class = &fair_sched_class; | 2551 | p->sched_class = &fair_sched_class; |
2645 | 2552 | ||
2553 | #ifdef CONFIG_SMP | ||
2554 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); | ||
2555 | #endif | ||
2556 | set_task_cpu(p, cpu); | ||
2557 | |||
2646 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2558 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2647 | if (likely(sched_info_on())) | 2559 | if (likely(sched_info_on())) |
2648 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2560 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
@@ -2688,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2688 | inc_nr_running(rq); | 2600 | inc_nr_running(rq); |
2689 | } | 2601 | } |
2690 | trace_sched_wakeup_new(rq, p, 1); | 2602 | trace_sched_wakeup_new(rq, p, 1); |
2691 | check_preempt_curr(rq, p, 0); | 2603 | check_preempt_curr(rq, p, WF_FORK); |
2692 | #ifdef CONFIG_SMP | 2604 | #ifdef CONFIG_SMP |
2693 | if (p->sched_class->task_wake_up) | 2605 | if (p->sched_class->task_wake_up) |
2694 | p->sched_class->task_wake_up(rq, p); | 2606 | p->sched_class->task_wake_up(rq, p); |
@@ -2796,12 +2708,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2796 | { | 2708 | { |
2797 | struct mm_struct *mm = rq->prev_mm; | 2709 | struct mm_struct *mm = rq->prev_mm; |
2798 | long prev_state; | 2710 | long prev_state; |
2799 | #ifdef CONFIG_SMP | ||
2800 | int post_schedule = 0; | ||
2801 | |||
2802 | if (current->sched_class->needs_post_schedule) | ||
2803 | post_schedule = current->sched_class->needs_post_schedule(rq); | ||
2804 | #endif | ||
2805 | 2711 | ||
2806 | rq->prev_mm = NULL; | 2712 | rq->prev_mm = NULL; |
2807 | 2713 | ||
@@ -2820,10 +2726,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2820 | finish_arch_switch(prev); | 2726 | finish_arch_switch(prev); |
2821 | perf_counter_task_sched_in(current, cpu_of(rq)); | 2727 | perf_counter_task_sched_in(current, cpu_of(rq)); |
2822 | finish_lock_switch(rq, prev); | 2728 | finish_lock_switch(rq, prev); |
2823 | #ifdef CONFIG_SMP | ||
2824 | if (post_schedule) | ||
2825 | current->sched_class->post_schedule(rq); | ||
2826 | #endif | ||
2827 | 2729 | ||
2828 | fire_sched_in_preempt_notifiers(current); | 2730 | fire_sched_in_preempt_notifiers(current); |
2829 | if (mm) | 2731 | if (mm) |
@@ -2838,6 +2740,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2838 | } | 2740 | } |
2839 | } | 2741 | } |
2840 | 2742 | ||
2743 | #ifdef CONFIG_SMP | ||
2744 | |||
2745 | /* assumes rq->lock is held */ | ||
2746 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | ||
2747 | { | ||
2748 | if (prev->sched_class->pre_schedule) | ||
2749 | prev->sched_class->pre_schedule(rq, prev); | ||
2750 | } | ||
2751 | |||
2752 | /* rq->lock is NOT held, but preemption is disabled */ | ||
2753 | static inline void post_schedule(struct rq *rq) | ||
2754 | { | ||
2755 | if (rq->post_schedule) { | ||
2756 | unsigned long flags; | ||
2757 | |||
2758 | spin_lock_irqsave(&rq->lock, flags); | ||
2759 | if (rq->curr->sched_class->post_schedule) | ||
2760 | rq->curr->sched_class->post_schedule(rq); | ||
2761 | spin_unlock_irqrestore(&rq->lock, flags); | ||
2762 | |||
2763 | rq->post_schedule = 0; | ||
2764 | } | ||
2765 | } | ||
2766 | |||
2767 | #else | ||
2768 | |||
2769 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | ||
2770 | { | ||
2771 | } | ||
2772 | |||
2773 | static inline void post_schedule(struct rq *rq) | ||
2774 | { | ||
2775 | } | ||
2776 | |||
2777 | #endif | ||
2778 | |||
2841 | /** | 2779 | /** |
2842 | * schedule_tail - first thing a freshly forked thread must call. | 2780 | * schedule_tail - first thing a freshly forked thread must call. |
2843 | * @prev: the thread we just switched away from. | 2781 | * @prev: the thread we just switched away from. |
@@ -2848,6 +2786,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
2848 | struct rq *rq = this_rq(); | 2786 | struct rq *rq = this_rq(); |
2849 | 2787 | ||
2850 | finish_task_switch(rq, prev); | 2788 | finish_task_switch(rq, prev); |
2789 | |||
2790 | /* | ||
2791 | * FIXME: do we need to worry about rq being invalidated by the | ||
2792 | * task_switch? | ||
2793 | */ | ||
2794 | post_schedule(rq); | ||
2795 | |||
2851 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 2796 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
2852 | /* In this case, finish_task_switch does not reenable preemption */ | 2797 | /* In this case, finish_task_switch does not reenable preemption */ |
2853 | preempt_enable(); | 2798 | preempt_enable(); |
@@ -3164,7 +3109,7 @@ out: | |||
3164 | void sched_exec(void) | 3109 | void sched_exec(void) |
3165 | { | 3110 | { |
3166 | int new_cpu, this_cpu = get_cpu(); | 3111 | int new_cpu, this_cpu = get_cpu(); |
3167 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | 3112 | new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); |
3168 | put_cpu(); | 3113 | put_cpu(); |
3169 | if (new_cpu != this_cpu) | 3114 | if (new_cpu != this_cpu) |
3170 | sched_migrate_task(current, new_cpu); | 3115 | sched_migrate_task(current, new_cpu); |
@@ -3379,9 +3324,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3379 | { | 3324 | { |
3380 | const struct sched_class *class; | 3325 | const struct sched_class *class; |
3381 | 3326 | ||
3382 | for (class = sched_class_highest; class; class = class->next) | 3327 | for_each_class(class) { |
3383 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | 3328 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) |
3384 | return 1; | 3329 | return 1; |
3330 | } | ||
3385 | 3331 | ||
3386 | return 0; | 3332 | return 0; |
3387 | } | 3333 | } |
@@ -3544,7 +3490,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, | |||
3544 | * capacity but still has some space to pick up some load | 3490 | * capacity but still has some space to pick up some load |
3545 | * from other group and save more power | 3491 | * from other group and save more power |
3546 | */ | 3492 | */ |
3547 | if (sgs->sum_nr_running > sgs->group_capacity - 1) | 3493 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) |
3548 | return; | 3494 | return; |
3549 | 3495 | ||
3550 | if (sgs->sum_nr_running > sds->leader_nr_running || | 3496 | if (sgs->sum_nr_running > sds->leader_nr_running || |
@@ -3583,11 +3529,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3583 | *imbalance = sds->min_load_per_task; | 3529 | *imbalance = sds->min_load_per_task; |
3584 | sds->busiest = sds->group_min; | 3530 | sds->busiest = sds->group_min; |
3585 | 3531 | ||
3586 | if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { | ||
3587 | cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = | ||
3588 | group_first_cpu(sds->group_leader); | ||
3589 | } | ||
3590 | |||
3591 | return 1; | 3532 | return 1; |
3592 | 3533 | ||
3593 | } | 3534 | } |
@@ -3612,6 +3553,102 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3612 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 3553 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
3613 | 3554 | ||
3614 | 3555 | ||
3556 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3557 | { | ||
3558 | return SCHED_LOAD_SCALE; | ||
3559 | } | ||
3560 | |||
3561 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3562 | { | ||
3563 | return default_scale_freq_power(sd, cpu); | ||
3564 | } | ||
3565 | |||
3566 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3567 | { | ||
3568 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3569 | unsigned long smt_gain = sd->smt_gain; | ||
3570 | |||
3571 | smt_gain /= weight; | ||
3572 | |||
3573 | return smt_gain; | ||
3574 | } | ||
3575 | |||
3576 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3577 | { | ||
3578 | return default_scale_smt_power(sd, cpu); | ||
3579 | } | ||
3580 | |||
3581 | unsigned long scale_rt_power(int cpu) | ||
3582 | { | ||
3583 | struct rq *rq = cpu_rq(cpu); | ||
3584 | u64 total, available; | ||
3585 | |||
3586 | sched_avg_update(rq); | ||
3587 | |||
3588 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
3589 | available = total - rq->rt_avg; | ||
3590 | |||
3591 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
3592 | total = SCHED_LOAD_SCALE; | ||
3593 | |||
3594 | total >>= SCHED_LOAD_SHIFT; | ||
3595 | |||
3596 | return div_u64(available, total); | ||
3597 | } | ||
3598 | |||
3599 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
3600 | { | ||
3601 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3602 | unsigned long power = SCHED_LOAD_SCALE; | ||
3603 | struct sched_group *sdg = sd->groups; | ||
3604 | |||
3605 | if (sched_feat(ARCH_POWER)) | ||
3606 | power *= arch_scale_freq_power(sd, cpu); | ||
3607 | else | ||
3608 | power *= default_scale_freq_power(sd, cpu); | ||
3609 | |||
3610 | power >>= SCHED_LOAD_SHIFT; | ||
3611 | |||
3612 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
3613 | if (sched_feat(ARCH_POWER)) | ||
3614 | power *= arch_scale_smt_power(sd, cpu); | ||
3615 | else | ||
3616 | power *= default_scale_smt_power(sd, cpu); | ||
3617 | |||
3618 | power >>= SCHED_LOAD_SHIFT; | ||
3619 | } | ||
3620 | |||
3621 | power *= scale_rt_power(cpu); | ||
3622 | power >>= SCHED_LOAD_SHIFT; | ||
3623 | |||
3624 | if (!power) | ||
3625 | power = 1; | ||
3626 | |||
3627 | sdg->cpu_power = power; | ||
3628 | } | ||
3629 | |||
3630 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
3631 | { | ||
3632 | struct sched_domain *child = sd->child; | ||
3633 | struct sched_group *group, *sdg = sd->groups; | ||
3634 | unsigned long power; | ||
3635 | |||
3636 | if (!child) { | ||
3637 | update_cpu_power(sd, cpu); | ||
3638 | return; | ||
3639 | } | ||
3640 | |||
3641 | power = 0; | ||
3642 | |||
3643 | group = child->groups; | ||
3644 | do { | ||
3645 | power += group->cpu_power; | ||
3646 | group = group->next; | ||
3647 | } while (group != child->groups); | ||
3648 | |||
3649 | sdg->cpu_power = power; | ||
3650 | } | ||
3651 | |||
3615 | /** | 3652 | /** |
3616 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3653 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
3617 | * @group: sched_group whose statistics are to be updated. | 3654 | * @group: sched_group whose statistics are to be updated. |
@@ -3624,7 +3661,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3624 | * @balance: Should we balance. | 3661 | * @balance: Should we balance. |
3625 | * @sgs: variable to hold the statistics for this group. | 3662 | * @sgs: variable to hold the statistics for this group. |
3626 | */ | 3663 | */ |
3627 | static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | 3664 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
3665 | struct sched_group *group, int this_cpu, | ||
3628 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 3666 | enum cpu_idle_type idle, int load_idx, int *sd_idle, |
3629 | int local_group, const struct cpumask *cpus, | 3667 | int local_group, const struct cpumask *cpus, |
3630 | int *balance, struct sg_lb_stats *sgs) | 3668 | int *balance, struct sg_lb_stats *sgs) |
@@ -3635,8 +3673,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3635 | unsigned long sum_avg_load_per_task; | 3673 | unsigned long sum_avg_load_per_task; |
3636 | unsigned long avg_load_per_task; | 3674 | unsigned long avg_load_per_task; |
3637 | 3675 | ||
3638 | if (local_group) | 3676 | if (local_group) { |
3639 | balance_cpu = group_first_cpu(group); | 3677 | balance_cpu = group_first_cpu(group); |
3678 | if (balance_cpu == this_cpu) | ||
3679 | update_group_power(sd, this_cpu); | ||
3680 | } | ||
3640 | 3681 | ||
3641 | /* Tally up the load of all CPUs in the group */ | 3682 | /* Tally up the load of all CPUs in the group */ |
3642 | sum_avg_load_per_task = avg_load_per_task = 0; | 3683 | sum_avg_load_per_task = avg_load_per_task = 0; |
@@ -3685,8 +3726,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3685 | } | 3726 | } |
3686 | 3727 | ||
3687 | /* Adjust by relative CPU power of the group */ | 3728 | /* Adjust by relative CPU power of the group */ |
3688 | sgs->avg_load = sg_div_cpu_power(group, | 3729 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
3689 | sgs->group_load * SCHED_LOAD_SCALE); | ||
3690 | 3730 | ||
3691 | 3731 | ||
3692 | /* | 3732 | /* |
@@ -3698,14 +3738,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3698 | * normalized nr_running number somewhere that negates | 3738 | * normalized nr_running number somewhere that negates |
3699 | * the hierarchy? | 3739 | * the hierarchy? |
3700 | */ | 3740 | */ |
3701 | avg_load_per_task = sg_div_cpu_power(group, | 3741 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / |
3702 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | 3742 | group->cpu_power; |
3703 | 3743 | ||
3704 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 3744 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) |
3705 | sgs->group_imb = 1; | 3745 | sgs->group_imb = 1; |
3706 | 3746 | ||
3707 | sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3747 | sgs->group_capacity = |
3708 | 3748 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | |
3709 | } | 3749 | } |
3710 | 3750 | ||
3711 | /** | 3751 | /** |
@@ -3723,9 +3763,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3723 | const struct cpumask *cpus, int *balance, | 3763 | const struct cpumask *cpus, int *balance, |
3724 | struct sd_lb_stats *sds) | 3764 | struct sd_lb_stats *sds) |
3725 | { | 3765 | { |
3766 | struct sched_domain *child = sd->child; | ||
3726 | struct sched_group *group = sd->groups; | 3767 | struct sched_group *group = sd->groups; |
3727 | struct sg_lb_stats sgs; | 3768 | struct sg_lb_stats sgs; |
3728 | int load_idx; | 3769 | int load_idx, prefer_sibling = 0; |
3770 | |||
3771 | if (child && child->flags & SD_PREFER_SIBLING) | ||
3772 | prefer_sibling = 1; | ||
3729 | 3773 | ||
3730 | init_sd_power_savings_stats(sd, sds, idle); | 3774 | init_sd_power_savings_stats(sd, sds, idle); |
3731 | load_idx = get_sd_load_idx(sd, idle); | 3775 | load_idx = get_sd_load_idx(sd, idle); |
@@ -3736,14 +3780,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3736 | local_group = cpumask_test_cpu(this_cpu, | 3780 | local_group = cpumask_test_cpu(this_cpu, |
3737 | sched_group_cpus(group)); | 3781 | sched_group_cpus(group)); |
3738 | memset(&sgs, 0, sizeof(sgs)); | 3782 | memset(&sgs, 0, sizeof(sgs)); |
3739 | update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, | 3783 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, |
3740 | local_group, cpus, balance, &sgs); | 3784 | local_group, cpus, balance, &sgs); |
3741 | 3785 | ||
3742 | if (local_group && balance && !(*balance)) | 3786 | if (local_group && balance && !(*balance)) |
3743 | return; | 3787 | return; |
3744 | 3788 | ||
3745 | sds->total_load += sgs.group_load; | 3789 | sds->total_load += sgs.group_load; |
3746 | sds->total_pwr += group->__cpu_power; | 3790 | sds->total_pwr += group->cpu_power; |
3791 | |||
3792 | /* | ||
3793 | * In case the child domain prefers tasks go to siblings | ||
3794 | * first, lower the group capacity to one so that we'll try | ||
3795 | * and move all the excess tasks away. | ||
3796 | */ | ||
3797 | if (prefer_sibling) | ||
3798 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
3747 | 3799 | ||
3748 | if (local_group) { | 3800 | if (local_group) { |
3749 | sds->this_load = sgs.avg_load; | 3801 | sds->this_load = sgs.avg_load; |
@@ -3763,7 +3815,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3763 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 3815 | update_sd_power_savings_stats(group, sds, local_group, &sgs); |
3764 | group = group->next; | 3816 | group = group->next; |
3765 | } while (group != sd->groups); | 3817 | } while (group != sd->groups); |
3766 | |||
3767 | } | 3818 | } |
3768 | 3819 | ||
3769 | /** | 3820 | /** |
@@ -3801,28 +3852,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
3801 | * moving them. | 3852 | * moving them. |
3802 | */ | 3853 | */ |
3803 | 3854 | ||
3804 | pwr_now += sds->busiest->__cpu_power * | 3855 | pwr_now += sds->busiest->cpu_power * |
3805 | min(sds->busiest_load_per_task, sds->max_load); | 3856 | min(sds->busiest_load_per_task, sds->max_load); |
3806 | pwr_now += sds->this->__cpu_power * | 3857 | pwr_now += sds->this->cpu_power * |
3807 | min(sds->this_load_per_task, sds->this_load); | 3858 | min(sds->this_load_per_task, sds->this_load); |
3808 | pwr_now /= SCHED_LOAD_SCALE; | 3859 | pwr_now /= SCHED_LOAD_SCALE; |
3809 | 3860 | ||
3810 | /* Amount of load we'd subtract */ | 3861 | /* Amount of load we'd subtract */ |
3811 | tmp = sg_div_cpu_power(sds->busiest, | 3862 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
3812 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3863 | sds->busiest->cpu_power; |
3813 | if (sds->max_load > tmp) | 3864 | if (sds->max_load > tmp) |
3814 | pwr_move += sds->busiest->__cpu_power * | 3865 | pwr_move += sds->busiest->cpu_power * |
3815 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 3866 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
3816 | 3867 | ||
3817 | /* Amount of load we'd add */ | 3868 | /* Amount of load we'd add */ |
3818 | if (sds->max_load * sds->busiest->__cpu_power < | 3869 | if (sds->max_load * sds->busiest->cpu_power < |
3819 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 3870 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) |
3820 | tmp = sg_div_cpu_power(sds->this, | 3871 | tmp = (sds->max_load * sds->busiest->cpu_power) / |
3821 | sds->max_load * sds->busiest->__cpu_power); | 3872 | sds->this->cpu_power; |
3822 | else | 3873 | else |
3823 | tmp = sg_div_cpu_power(sds->this, | 3874 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
3824 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3875 | sds->this->cpu_power; |
3825 | pwr_move += sds->this->__cpu_power * | 3876 | pwr_move += sds->this->cpu_power * |
3826 | min(sds->this_load_per_task, sds->this_load + tmp); | 3877 | min(sds->this_load_per_task, sds->this_load + tmp); |
3827 | pwr_move /= SCHED_LOAD_SCALE; | 3878 | pwr_move /= SCHED_LOAD_SCALE; |
3828 | 3879 | ||
@@ -3857,8 +3908,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3857 | sds->max_load - sds->busiest_load_per_task); | 3908 | sds->max_load - sds->busiest_load_per_task); |
3858 | 3909 | ||
3859 | /* How much load to actually move to equalise the imbalance */ | 3910 | /* How much load to actually move to equalise the imbalance */ |
3860 | *imbalance = min(max_pull * sds->busiest->__cpu_power, | 3911 | *imbalance = min(max_pull * sds->busiest->cpu_power, |
3861 | (sds->avg_load - sds->this_load) * sds->this->__cpu_power) | 3912 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) |
3862 | / SCHED_LOAD_SCALE; | 3913 | / SCHED_LOAD_SCALE; |
3863 | 3914 | ||
3864 | /* | 3915 | /* |
@@ -3988,15 +4039,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
3988 | int i; | 4039 | int i; |
3989 | 4040 | ||
3990 | for_each_cpu(i, sched_group_cpus(group)) { | 4041 | for_each_cpu(i, sched_group_cpus(group)) { |
4042 | unsigned long power = power_of(i); | ||
4043 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
3991 | unsigned long wl; | 4044 | unsigned long wl; |
3992 | 4045 | ||
3993 | if (!cpumask_test_cpu(i, cpus)) | 4046 | if (!cpumask_test_cpu(i, cpus)) |
3994 | continue; | 4047 | continue; |
3995 | 4048 | ||
3996 | rq = cpu_rq(i); | 4049 | rq = cpu_rq(i); |
3997 | wl = weighted_cpuload(i); | 4050 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; |
4051 | wl /= power; | ||
3998 | 4052 | ||
3999 | if (rq->nr_running == 1 && wl > imbalance) | 4053 | if (capacity && rq->nr_running == 1 && wl > imbalance) |
4000 | continue; | 4054 | continue; |
4001 | 4055 | ||
4002 | if (wl > max_load) { | 4056 | if (wl > max_load) { |
@@ -5257,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev) | |||
5257 | #endif | 5311 | #endif |
5258 | } | 5312 | } |
5259 | 5313 | ||
5260 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 5314 | static void put_prev_task(struct rq *rq, struct task_struct *p) |
5261 | { | 5315 | { |
5262 | if (prev->state == TASK_RUNNING) { | 5316 | u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; |
5263 | u64 runtime = prev->se.sum_exec_runtime; | ||
5264 | 5317 | ||
5265 | runtime -= prev->se.prev_sum_exec_runtime; | 5318 | update_avg(&p->se.avg_running, runtime); |
5266 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); | ||
5267 | 5319 | ||
5320 | if (p->state == TASK_RUNNING) { | ||
5268 | /* | 5321 | /* |
5269 | * In order to avoid avg_overlap growing stale when we are | 5322 | * In order to avoid avg_overlap growing stale when we are |
5270 | * indeed overlapping and hence not getting put to sleep, grow | 5323 | * indeed overlapping and hence not getting put to sleep, grow |
@@ -5274,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
5274 | * correlates to the amount of cache footprint a task can | 5327 | * correlates to the amount of cache footprint a task can |
5275 | * build up. | 5328 | * build up. |
5276 | */ | 5329 | */ |
5277 | update_avg(&prev->se.avg_overlap, runtime); | 5330 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); |
5331 | update_avg(&p->se.avg_overlap, runtime); | ||
5332 | } else { | ||
5333 | update_avg(&p->se.avg_running, 0); | ||
5278 | } | 5334 | } |
5279 | prev->sched_class->put_prev_task(rq, prev); | 5335 | p->sched_class->put_prev_task(rq, p); |
5280 | } | 5336 | } |
5281 | 5337 | ||
5282 | /* | 5338 | /* |
@@ -5325,7 +5381,7 @@ need_resched: | |||
5325 | preempt_disable(); | 5381 | preempt_disable(); |
5326 | cpu = smp_processor_id(); | 5382 | cpu = smp_processor_id(); |
5327 | rq = cpu_rq(cpu); | 5383 | rq = cpu_rq(cpu); |
5328 | rcu_qsctr_inc(cpu); | 5384 | rcu_sched_qs(cpu); |
5329 | prev = rq->curr; | 5385 | prev = rq->curr; |
5330 | switch_count = &prev->nivcsw; | 5386 | switch_count = &prev->nivcsw; |
5331 | 5387 | ||
@@ -5349,10 +5405,7 @@ need_resched_nonpreemptible: | |||
5349 | switch_count = &prev->nvcsw; | 5405 | switch_count = &prev->nvcsw; |
5350 | } | 5406 | } |
5351 | 5407 | ||
5352 | #ifdef CONFIG_SMP | 5408 | pre_schedule(rq, prev); |
5353 | if (prev->sched_class->pre_schedule) | ||
5354 | prev->sched_class->pre_schedule(rq, prev); | ||
5355 | #endif | ||
5356 | 5409 | ||
5357 | if (unlikely(!rq->nr_running)) | 5410 | if (unlikely(!rq->nr_running)) |
5358 | idle_balance(cpu, rq); | 5411 | idle_balance(cpu, rq); |
@@ -5378,6 +5431,8 @@ need_resched_nonpreemptible: | |||
5378 | } else | 5431 | } else |
5379 | spin_unlock_irq(&rq->lock); | 5432 | spin_unlock_irq(&rq->lock); |
5380 | 5433 | ||
5434 | post_schedule(rq); | ||
5435 | |||
5381 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5436 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
5382 | goto need_resched_nonpreemptible; | 5437 | goto need_resched_nonpreemptible; |
5383 | 5438 | ||
@@ -5509,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
5509 | 5564 | ||
5510 | #endif /* CONFIG_PREEMPT */ | 5565 | #endif /* CONFIG_PREEMPT */ |
5511 | 5566 | ||
5512 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 5567 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, |
5513 | void *key) | 5568 | void *key) |
5514 | { | 5569 | { |
5515 | return try_to_wake_up(curr->private, mode, sync); | 5570 | return try_to_wake_up(curr->private, mode, wake_flags); |
5516 | } | 5571 | } |
5517 | EXPORT_SYMBOL(default_wake_function); | 5572 | EXPORT_SYMBOL(default_wake_function); |
5518 | 5573 | ||
@@ -5526,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function); | |||
5526 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 5581 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
5527 | */ | 5582 | */ |
5528 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 5583 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
5529 | int nr_exclusive, int sync, void *key) | 5584 | int nr_exclusive, int wake_flags, void *key) |
5530 | { | 5585 | { |
5531 | wait_queue_t *curr, *next; | 5586 | wait_queue_t *curr, *next; |
5532 | 5587 | ||
5533 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | 5588 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
5534 | unsigned flags = curr->flags; | 5589 | unsigned flags = curr->flags; |
5535 | 5590 | ||
5536 | if (curr->func(curr, mode, sync, key) && | 5591 | if (curr->func(curr, mode, wake_flags, key) && |
5537 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | 5592 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
5538 | break; | 5593 | break; |
5539 | } | 5594 | } |
@@ -5594,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | |||
5594 | int nr_exclusive, void *key) | 5649 | int nr_exclusive, void *key) |
5595 | { | 5650 | { |
5596 | unsigned long flags; | 5651 | unsigned long flags; |
5597 | int sync = 1; | 5652 | int wake_flags = WF_SYNC; |
5598 | 5653 | ||
5599 | if (unlikely(!q)) | 5654 | if (unlikely(!q)) |
5600 | return; | 5655 | return; |
5601 | 5656 | ||
5602 | if (unlikely(!nr_exclusive)) | 5657 | if (unlikely(!nr_exclusive)) |
5603 | sync = 0; | 5658 | wake_flags = 0; |
5604 | 5659 | ||
5605 | spin_lock_irqsave(&q->lock, flags); | 5660 | spin_lock_irqsave(&q->lock, flags); |
5606 | __wake_up_common(q, mode, nr_exclusive, sync, key); | 5661 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); |
5607 | spin_unlock_irqrestore(&q->lock, flags); | 5662 | spin_unlock_irqrestore(&q->lock, flags); |
5608 | } | 5663 | } |
5609 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | 5664 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); |
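[Note on the wakeup hunks above] The bare `int sync` threaded through the wakeup path becomes a `wake_flags` bitmask; __wake_up_sync_key() now passes WF_SYNC (dropped again when nr_exclusive is zero) and every wake function receives the mask instead of a boolean. A self-contained sketch of the resulting callback convention; MY_WF_SYNC is a stand-in bit, the real WF_SYNC value lives in sched.h and is not shown in these hunks.

#include <stdio.h>

#define MY_WF_SYNC 0x01	/* stand-in for the kernel's WF_SYNC bit */

/* wake callbacks now get a flag word, not a boolean */
typedef int (*wake_fn)(void *private, unsigned int mode, int wake_flags);

static int my_wake_function(void *private, unsigned int mode, int wake_flags)
{
	/* a sync wakeup hints that the waker is about to sleep itself,
	 * so the wakee may be left on the waker's cpu */
	printf("waking %s, sync=%d\n", (const char *)private,
	       !!(wake_flags & MY_WF_SYNC));
	return 1;
}

static void wake_up_sync(wake_fn fn, void *private, int nr_exclusive)
{
	int wake_flags = MY_WF_SYNC;

	if (!nr_exclusive)		/* mirrors __wake_up_sync_key() */
		wake_flags = 0;
	fn(private, 0, wake_flags);
}

int main(void)
{
	wake_up_sync(my_wake_function, "task A", 1);	/* sync wakeup  */
	wake_up_sync(my_wake_function, "task B", 0);	/* plain wakeup */
	return 0;
}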
@@ -6123,17 +6178,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
6123 | unsigned long flags; | 6178 | unsigned long flags; |
6124 | const struct sched_class *prev_class = p->sched_class; | 6179 | const struct sched_class *prev_class = p->sched_class; |
6125 | struct rq *rq; | 6180 | struct rq *rq; |
6181 | int reset_on_fork; | ||
6126 | 6182 | ||
6127 | /* may grab non-irq protected spin_locks */ | 6183 | /* may grab non-irq protected spin_locks */ |
6128 | BUG_ON(in_interrupt()); | 6184 | BUG_ON(in_interrupt()); |
6129 | recheck: | 6185 | recheck: |
6130 | /* double check policy once rq lock held */ | 6186 | /* double check policy once rq lock held */ |
6131 | if (policy < 0) | 6187 | if (policy < 0) { |
6188 | reset_on_fork = p->sched_reset_on_fork; | ||
6132 | policy = oldpolicy = p->policy; | 6189 | policy = oldpolicy = p->policy; |
6133 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 6190 | } else { |
6134 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 6191 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
6135 | policy != SCHED_IDLE) | 6192 | policy &= ~SCHED_RESET_ON_FORK; |
6136 | return -EINVAL; | 6193 | |
6194 | if (policy != SCHED_FIFO && policy != SCHED_RR && | ||
6195 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
6196 | policy != SCHED_IDLE) | ||
6197 | return -EINVAL; | ||
6198 | } | ||
6199 | |||
6137 | /* | 6200 | /* |
6138 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 6201 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
6139 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 6202 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
@@ -6177,6 +6240,10 @@ recheck: | |||
6177 | /* can't change other user's priorities */ | 6240 | /* can't change other user's priorities */ |
6178 | if (!check_same_owner(p)) | 6241 | if (!check_same_owner(p)) |
6179 | return -EPERM; | 6242 | return -EPERM; |
6243 | |||
6244 | /* Normal users shall not reset the sched_reset_on_fork flag */ | ||
6245 | if (p->sched_reset_on_fork && !reset_on_fork) | ||
6246 | return -EPERM; | ||
6180 | } | 6247 | } |
6181 | 6248 | ||
6182 | if (user) { | 6249 | if (user) { |
@@ -6220,6 +6287,8 @@ recheck: | |||
6220 | if (running) | 6287 | if (running) |
6221 | p->sched_class->put_prev_task(rq, p); | 6288 | p->sched_class->put_prev_task(rq, p); |
6222 | 6289 | ||
6290 | p->sched_reset_on_fork = reset_on_fork; | ||
6291 | |||
6223 | oldprio = p->prio; | 6292 | oldprio = p->prio; |
6224 | __setscheduler(rq, p, policy, param->sched_priority); | 6293 | __setscheduler(rq, p, policy, param->sched_priority); |
6225 | 6294 | ||
@@ -6336,14 +6405,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
6336 | if (p) { | 6405 | if (p) { |
6337 | retval = security_task_getscheduler(p); | 6406 | retval = security_task_getscheduler(p); |
6338 | if (!retval) | 6407 | if (!retval) |
6339 | retval = p->policy; | 6408 | retval = p->policy |
6409 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | ||
6340 | } | 6410 | } |
6341 | read_unlock(&tasklist_lock); | 6411 | read_unlock(&tasklist_lock); |
6342 | return retval; | 6412 | return retval; |
6343 | } | 6413 | } |
6344 | 6414 | ||
6345 | /** | 6415 | /** |
6346 | * sys_sched_getscheduler - get the RT priority of a thread | 6416 | * sys_sched_getparam - get the RT priority of a thread |
6347 | * @pid: the pid in question. | 6417 | * @pid: the pid in question. |
6348 | * @param: structure containing the RT priority. | 6418 | * @param: structure containing the RT priority. |
6349 | */ | 6419 | */ |
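[Note on the __sched_setscheduler()/sched_getscheduler() hunks above] sched_setscheduler() now accepts SCHED_RESET_ON_FORK OR-ed into the policy word (stripping it before validating the policy, and refusing to let an unprivileged caller clear a flag that is already set), and sched_getscheduler() reports the flag back the same way. A userspace sketch of setting and reading it through the raw syscalls; the flag value of 0x40000000 is taken from the kernel headers of this era and is an assumption, not something shown in the hunk.

#define _GNU_SOURCE
#include <stdio.h>
#include <sched.h>
#include <unistd.h>
#include <sys/syscall.h>

/* assumed value from the kernel headers of this era; define it ourselves in
 * case the installed libc headers predate the flag */
#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK 0x40000000
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 1 };
	int ret;

	/* ask for SCHED_FIFO, but have children fall back to a normal policy
	 * (the RT policy itself still needs root / CAP_SYS_NICE) */
	ret = syscall(SYS_sched_setscheduler, 0,
		      SCHED_FIFO | SCHED_RESET_ON_FORK, &sp);
	if (ret) {
		perror("sched_setscheduler");
		return 1;
	}

	/* the flag comes back OR-ed into the reported policy */
	ret = syscall(SYS_sched_getscheduler, 0);
	printf("policy=%d reset_on_fork=%d\n",
	       ret & ~SCHED_RESET_ON_FORK, !!(ret & SCHED_RESET_ON_FORK));
	return 0;
}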
@@ -6571,19 +6641,9 @@ static inline int should_resched(void) | |||
6571 | 6641 | ||
6572 | static void __cond_resched(void) | 6642 | static void __cond_resched(void) |
6573 | { | 6643 | { |
6574 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6644 | add_preempt_count(PREEMPT_ACTIVE); |
6575 | __might_sleep(__FILE__, __LINE__); | 6645 | schedule(); |
6576 | #endif | 6646 | sub_preempt_count(PREEMPT_ACTIVE); |
6577 | /* | ||
6578 | * The BKS might be reacquired before we have dropped | ||
6579 | * PREEMPT_ACTIVE, which could trigger a second | ||
6580 | * cond_resched() call. | ||
6581 | */ | ||
6582 | do { | ||
6583 | add_preempt_count(PREEMPT_ACTIVE); | ||
6584 | schedule(); | ||
6585 | sub_preempt_count(PREEMPT_ACTIVE); | ||
6586 | } while (need_resched()); | ||
6587 | } | 6647 | } |
6588 | 6648 | ||
6589 | int __sched _cond_resched(void) | 6649 | int __sched _cond_resched(void) |
@@ -6597,18 +6657,20 @@ int __sched _cond_resched(void) | |||
6597 | EXPORT_SYMBOL(_cond_resched); | 6657 | EXPORT_SYMBOL(_cond_resched); |
6598 | 6658 | ||
6599 | /* | 6659 | /* |
6600 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 6660 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
6601 | * call schedule, and on return reacquire the lock. | 6661 | * call schedule, and on return reacquire the lock. |
6602 | * | 6662 | * |
6603 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 6663 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
6604 | * operations here to prevent schedule() from being called twice (once via | 6664 | * operations here to prevent schedule() from being called twice (once via |
6605 | * spin_unlock(), once by hand). | 6665 | * spin_unlock(), once by hand). |
6606 | */ | 6666 | */ |
6607 | int cond_resched_lock(spinlock_t *lock) | 6667 | int __cond_resched_lock(spinlock_t *lock) |
6608 | { | 6668 | { |
6609 | int resched = should_resched(); | 6669 | int resched = should_resched(); |
6610 | int ret = 0; | 6670 | int ret = 0; |
6611 | 6671 | ||
6672 | lockdep_assert_held(lock); | ||
6673 | |||
6612 | if (spin_needbreak(lock) || resched) { | 6674 | if (spin_needbreak(lock) || resched) { |
6613 | spin_unlock(lock); | 6675 | spin_unlock(lock); |
6614 | if (resched) | 6676 | if (resched) |
@@ -6620,9 +6682,9 @@ int cond_resched_lock(spinlock_t *lock) | |||
6620 | } | 6682 | } |
6621 | return ret; | 6683 | return ret; |
6622 | } | 6684 | } |
6623 | EXPORT_SYMBOL(cond_resched_lock); | 6685 | EXPORT_SYMBOL(__cond_resched_lock); |
6624 | 6686 | ||
6625 | int __sched cond_resched_softirq(void) | 6687 | int __sched __cond_resched_softirq(void) |
6626 | { | 6688 | { |
6627 | BUG_ON(!in_softirq()); | 6689 | BUG_ON(!in_softirq()); |
6628 | 6690 | ||
@@ -6634,7 +6696,7 @@ int __sched cond_resched_softirq(void) | |||
6634 | } | 6696 | } |
6635 | return 0; | 6697 | return 0; |
6636 | } | 6698 | } |
6637 | EXPORT_SYMBOL(cond_resched_softirq); | 6699 | EXPORT_SYMBOL(__cond_resched_softirq); |
6638 | 6700 | ||
6639 | /** | 6701 | /** |
6640 | * yield - yield the current processor to other threads. | 6702 | * yield - yield the current processor to other threads. |
@@ -6658,11 +6720,13 @@ EXPORT_SYMBOL(yield); | |||
6658 | */ | 6720 | */ |
6659 | void __sched io_schedule(void) | 6721 | void __sched io_schedule(void) |
6660 | { | 6722 | { |
6661 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6723 | struct rq *rq = raw_rq(); |
6662 | 6724 | ||
6663 | delayacct_blkio_start(); | 6725 | delayacct_blkio_start(); |
6664 | atomic_inc(&rq->nr_iowait); | 6726 | atomic_inc(&rq->nr_iowait); |
6727 | current->in_iowait = 1; | ||
6665 | schedule(); | 6728 | schedule(); |
6729 | current->in_iowait = 0; | ||
6666 | atomic_dec(&rq->nr_iowait); | 6730 | atomic_dec(&rq->nr_iowait); |
6667 | delayacct_blkio_end(); | 6731 | delayacct_blkio_end(); |
6668 | } | 6732 | } |
@@ -6670,12 +6734,14 @@ EXPORT_SYMBOL(io_schedule); | |||
6670 | 6734 | ||
6671 | long __sched io_schedule_timeout(long timeout) | 6735 | long __sched io_schedule_timeout(long timeout) |
6672 | { | 6736 | { |
6673 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6737 | struct rq *rq = raw_rq(); |
6674 | long ret; | 6738 | long ret; |
6675 | 6739 | ||
6676 | delayacct_blkio_start(); | 6740 | delayacct_blkio_start(); |
6677 | atomic_inc(&rq->nr_iowait); | 6741 | atomic_inc(&rq->nr_iowait); |
6742 | current->in_iowait = 1; | ||
6678 | ret = schedule_timeout(timeout); | 6743 | ret = schedule_timeout(timeout); |
6744 | current->in_iowait = 0; | ||
6679 | atomic_dec(&rq->nr_iowait); | 6745 | atomic_dec(&rq->nr_iowait); |
6680 | delayacct_blkio_end(); | 6746 | delayacct_blkio_end(); |
6681 | return ret; | 6747 | return ret; |
@@ -6992,8 +7058,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
6992 | 7058 | ||
6993 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 7059 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { |
6994 | /* Need help from migration thread: drop lock and wait. */ | 7060 | /* Need help from migration thread: drop lock and wait. */ |
7061 | struct task_struct *mt = rq->migration_thread; | ||
7062 | |||
7063 | get_task_struct(mt); | ||
6995 | task_rq_unlock(rq, &flags); | 7064 | task_rq_unlock(rq, &flags); |
6996 | wake_up_process(rq->migration_thread); | 7065 | wake_up_process(rq->migration_thread); |
7066 | put_task_struct(mt); | ||
6997 | wait_for_completion(&req.done); | 7067 | wait_for_completion(&req.done); |
6998 | tlb_migrate_finish(p->mm); | 7068 | tlb_migrate_finish(p->mm); |
6999 | return 0; | 7069 | return 0; |
@@ -7051,6 +7121,11 @@ fail: | |||
7051 | return ret; | 7121 | return ret; |
7052 | } | 7122 | } |
7053 | 7123 | ||
7124 | #define RCU_MIGRATION_IDLE 0 | ||
7125 | #define RCU_MIGRATION_NEED_QS 1 | ||
7126 | #define RCU_MIGRATION_GOT_QS 2 | ||
7127 | #define RCU_MIGRATION_MUST_SYNC 3 | ||
7128 | |||
7054 | /* | 7129 | /* |
7055 | * migration_thread - this is a highprio system thread that performs | 7130 | * migration_thread - this is a highprio system thread that performs |
7056 | * thread migration by bumping thread off CPU then 'pushing' onto | 7131 | * thread migration by bumping thread off CPU then 'pushing' onto |
@@ -7058,6 +7133,7 @@ fail: | |||
7058 | */ | 7133 | */ |
7059 | static int migration_thread(void *data) | 7134 | static int migration_thread(void *data) |
7060 | { | 7135 | { |
7136 | int badcpu; | ||
7061 | int cpu = (long)data; | 7137 | int cpu = (long)data; |
7062 | struct rq *rq; | 7138 | struct rq *rq; |
7063 | 7139 | ||
@@ -7092,8 +7168,17 @@ static int migration_thread(void *data) | |||
7092 | req = list_entry(head->next, struct migration_req, list); | 7168 | req = list_entry(head->next, struct migration_req, list); |
7093 | list_del_init(head->next); | 7169 | list_del_init(head->next); |
7094 | 7170 | ||
7095 | spin_unlock(&rq->lock); | 7171 | if (req->task != NULL) { |
7096 | __migrate_task(req->task, cpu, req->dest_cpu); | 7172 | spin_unlock(&rq->lock); |
7173 | __migrate_task(req->task, cpu, req->dest_cpu); | ||
7174 | } else if (likely(cpu == (badcpu = smp_processor_id()))) { | ||
7175 | req->dest_cpu = RCU_MIGRATION_GOT_QS; | ||
7176 | spin_unlock(&rq->lock); | ||
7177 | } else { | ||
7178 | req->dest_cpu = RCU_MIGRATION_MUST_SYNC; | ||
7179 | spin_unlock(&rq->lock); | ||
7180 | WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); | ||
7181 | } | ||
7097 | local_irq_enable(); | 7182 | local_irq_enable(); |
7098 | 7183 | ||
7099 | complete(&req->done); | 7184 | complete(&req->done); |
@@ -7625,7 +7710,7 @@ static int __init migration_init(void) | |||
7625 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 7710 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
7626 | register_cpu_notifier(&migration_notifier); | 7711 | register_cpu_notifier(&migration_notifier); |
7627 | 7712 | ||
7628 | return err; | 7713 | return 0; |
7629 | } | 7714 | } |
7630 | early_initcall(migration_init); | 7715 | early_initcall(migration_init); |
7631 | #endif | 7716 | #endif |
@@ -7672,7 +7757,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
7672 | break; | 7757 | break; |
7673 | } | 7758 | } |
7674 | 7759 | ||
7675 | if (!group->__cpu_power) { | 7760 | if (!group->cpu_power) { |
7676 | printk(KERN_CONT "\n"); | 7761 | printk(KERN_CONT "\n"); |
7677 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 7762 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
7678 | "set\n"); | 7763 | "set\n"); |
@@ -7696,9 +7781,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
7696 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 7781 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
7697 | 7782 | ||
7698 | printk(KERN_CONT " %s", str); | 7783 | printk(KERN_CONT " %s", str); |
7699 | if (group->__cpu_power != SCHED_LOAD_SCALE) { | 7784 | if (group->cpu_power != SCHED_LOAD_SCALE) { |
7700 | printk(KERN_CONT " (__cpu_power = %d)", | 7785 | printk(KERN_CONT " (cpu_power = %d)", |
7701 | group->__cpu_power); | 7786 | group->cpu_power); |
7702 | } | 7787 | } |
7703 | 7788 | ||
7704 | group = group->next; | 7789 | group = group->next; |
@@ -7763,9 +7848,7 @@ static int sd_degenerate(struct sched_domain *sd) | |||
7763 | } | 7848 | } |
7764 | 7849 | ||
7765 | /* Following flags don't use groups */ | 7850 | /* Following flags don't use groups */ |
7766 | if (sd->flags & (SD_WAKE_IDLE | | 7851 | if (sd->flags & (SD_WAKE_AFFINE)) |
7767 | SD_WAKE_AFFINE | | ||
7768 | SD_WAKE_BALANCE)) | ||
7769 | return 0; | 7852 | return 0; |
7770 | 7853 | ||
7771 | return 1; | 7854 | return 1; |
@@ -7782,10 +7865,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
7782 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | 7865 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) |
7783 | return 0; | 7866 | return 0; |
7784 | 7867 | ||
7785 | /* Does parent contain flags not in child? */ | ||
7786 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | ||
7787 | if (cflags & SD_WAKE_AFFINE) | ||
7788 | pflags &= ~SD_WAKE_BALANCE; | ||
7789 | /* Flags needing groups don't count if only 1 group in parent */ | 7868 | /* Flags needing groups don't count if only 1 group in parent */ |
7790 | if (parent->groups == parent->groups->next) { | 7869 | if (parent->groups == parent->groups->next) { |
7791 | pflags &= ~(SD_LOAD_BALANCE | | 7870 | pflags &= ~(SD_LOAD_BALANCE | |
@@ -7841,7 +7920,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
7841 | rq->rd = rd; | 7920 | rq->rd = rd; |
7842 | 7921 | ||
7843 | cpumask_set_cpu(rq->cpu, rd->span); | 7922 | cpumask_set_cpu(rq->cpu, rd->span); |
7844 | if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) | 7923 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
7845 | set_rq_online(rq); | 7924 | set_rq_online(rq); |
7846 | 7925 | ||
7847 | spin_unlock_irqrestore(&rq->lock, flags); | 7926 | spin_unlock_irqrestore(&rq->lock, flags); |
@@ -7983,7 +8062,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
7983 | continue; | 8062 | continue; |
7984 | 8063 | ||
7985 | cpumask_clear(sched_group_cpus(sg)); | 8064 | cpumask_clear(sched_group_cpus(sg)); |
7986 | sg->__cpu_power = 0; | 8065 | sg->cpu_power = 0; |
7987 | 8066 | ||
7988 | for_each_cpu(j, span) { | 8067 | for_each_cpu(j, span) { |
7989 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | 8068 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
@@ -8091,6 +8170,39 @@ struct static_sched_domain { | |||
8091 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 8170 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); |
8092 | }; | 8171 | }; |
8093 | 8172 | ||
8173 | struct s_data { | ||
8174 | #ifdef CONFIG_NUMA | ||
8175 | int sd_allnodes; | ||
8176 | cpumask_var_t domainspan; | ||
8177 | cpumask_var_t covered; | ||
8178 | cpumask_var_t notcovered; | ||
8179 | #endif | ||
8180 | cpumask_var_t nodemask; | ||
8181 | cpumask_var_t this_sibling_map; | ||
8182 | cpumask_var_t this_core_map; | ||
8183 | cpumask_var_t send_covered; | ||
8184 | cpumask_var_t tmpmask; | ||
8185 | struct sched_group **sched_group_nodes; | ||
8186 | struct root_domain *rd; | ||
8187 | }; | ||
8188 | |||
8189 | enum s_alloc { | ||
8190 | sa_sched_groups = 0, | ||
8191 | sa_rootdomain, | ||
8192 | sa_tmpmask, | ||
8193 | sa_send_covered, | ||
8194 | sa_this_core_map, | ||
8195 | sa_this_sibling_map, | ||
8196 | sa_nodemask, | ||
8197 | sa_sched_group_nodes, | ||
8198 | #ifdef CONFIG_NUMA | ||
8199 | sa_notcovered, | ||
8200 | sa_covered, | ||
8201 | sa_domainspan, | ||
8202 | #endif | ||
8203 | sa_none, | ||
8204 | }; | ||
8205 | |||
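[Note on struct s_data / enum s_alloc above] These carry all of __build_sched_domains()'s temporaries in one place so that a single __free_domain_allocs() (further down in this diff) can unwind however far allocation got: it switches on the reached state and falls through, releasing everything allocated so far in reverse order. A generic, self-contained sketch of that staged-allocation/rollback idiom; the resources here are plain malloc()s, not the scheduler structures.

#include <stdio.h>
#include <stdlib.h>

struct s_data { char *a, *b, *c; };

enum s_alloc { sa_all, sa_b, sa_a, sa_none };	/* most-complete state first */

static void free_allocs(struct s_data *d, enum s_alloc what)
{
	switch (what) {			/* falling through frees everything below */
	case sa_all:
		free(d->c);		/* fall through */
	case sa_b:
		free(d->b);		/* fall through */
	case sa_a:
		free(d->a);		/* fall through */
	case sa_none:
		break;
	}
}

static enum s_alloc alloc_all(struct s_data *d)
{
	if (!(d->a = malloc(16)))
		return sa_none;		/* nothing to undo */
	if (!(d->b = malloc(16)))
		return sa_a;		/* undo a only */
	if (!(d->c = malloc(16)))
		return sa_b;		/* undo b, then a */
	return sa_all;
}

int main(void)
{
	struct s_data d = { 0 };
	enum s_alloc state = alloc_all(&d);

	if (state != sa_all) {
		free_allocs(&d, state);	/* single rollback path */
		return 1;
	}
	puts("all allocations succeeded");
	free_allocs(&d, sa_all);
	return 0;
}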
8094 | /* | 8206 | /* |
8095 | * SMT sched-domains: | 8207 | * SMT sched-domains: |
8096 | */ | 8208 | */ |
@@ -8208,11 +8320,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
8208 | continue; | 8320 | continue; |
8209 | } | 8321 | } |
8210 | 8322 | ||
8211 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 8323 | sg->cpu_power += sd->groups->cpu_power; |
8212 | } | 8324 | } |
8213 | sg = sg->next; | 8325 | sg = sg->next; |
8214 | } while (sg != group_head); | 8326 | } while (sg != group_head); |
8215 | } | 8327 | } |
8328 | |||
8329 | static int build_numa_sched_groups(struct s_data *d, | ||
8330 | const struct cpumask *cpu_map, int num) | ||
8331 | { | ||
8332 | struct sched_domain *sd; | ||
8333 | struct sched_group *sg, *prev; | ||
8334 | int n, j; | ||
8335 | |||
8336 | cpumask_clear(d->covered); | ||
8337 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
8338 | if (cpumask_empty(d->nodemask)) { | ||
8339 | d->sched_group_nodes[num] = NULL; | ||
8340 | goto out; | ||
8341 | } | ||
8342 | |||
8343 | sched_domain_node_span(num, d->domainspan); | ||
8344 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
8345 | |||
8346 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8347 | GFP_KERNEL, num); | ||
8348 | if (!sg) { | ||
8349 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
8350 | num); | ||
8351 | return -ENOMEM; | ||
8352 | } | ||
8353 | d->sched_group_nodes[num] = sg; | ||
8354 | |||
8355 | for_each_cpu(j, d->nodemask) { | ||
8356 | sd = &per_cpu(node_domains, j).sd; | ||
8357 | sd->groups = sg; | ||
8358 | } | ||
8359 | |||
8360 | sg->cpu_power = 0; | ||
8361 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
8362 | sg->next = sg; | ||
8363 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
8364 | |||
8365 | prev = sg; | ||
8366 | for (j = 0; j < nr_node_ids; j++) { | ||
8367 | n = (num + j) % nr_node_ids; | ||
8368 | cpumask_complement(d->notcovered, d->covered); | ||
8369 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
8370 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
8371 | if (cpumask_empty(d->tmpmask)) | ||
8372 | break; | ||
8373 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
8374 | if (cpumask_empty(d->tmpmask)) | ||
8375 | continue; | ||
8376 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8377 | GFP_KERNEL, num); | ||
8378 | if (!sg) { | ||
8379 | printk(KERN_WARNING | ||
8380 | "Can not alloc domain group for node %d\n", j); | ||
8381 | return -ENOMEM; | ||
8382 | } | ||
8383 | sg->cpu_power = 0; | ||
8384 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
8385 | sg->next = prev->next; | ||
8386 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
8387 | prev->next = sg; | ||
8388 | prev = sg; | ||
8389 | } | ||
8390 | out: | ||
8391 | return 0; | ||
8392 | } | ||
8216 | #endif /* CONFIG_NUMA */ | 8393 | #endif /* CONFIG_NUMA */ |
8217 | 8394 | ||
8218 | #ifdef CONFIG_NUMA | 8395 | #ifdef CONFIG_NUMA |
@@ -8266,15 +8443,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
8266 | * there are asymmetries in the topology. If there are asymmetries, group | 8443 | * there are asymmetries in the topology. If there are asymmetries, group |
8267 | * having more cpu_power will pickup more load compared to the group having | 8444 | * having more cpu_power will pickup more load compared to the group having |
8268 | * less cpu_power. | 8445 | * less cpu_power. |
8269 | * | ||
8270 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | ||
8271 | * the maximum number of tasks a group can handle in the presence of other idle | ||
8272 | * or lightly loaded groups in the same sched domain. | ||
8273 | */ | 8446 | */ |
8274 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 8447 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
8275 | { | 8448 | { |
8276 | struct sched_domain *child; | 8449 | struct sched_domain *child; |
8277 | struct sched_group *group; | 8450 | struct sched_group *group; |
8451 | long power; | ||
8452 | int weight; | ||
8278 | 8453 | ||
8279 | WARN_ON(!sd || !sd->groups); | 8454 | WARN_ON(!sd || !sd->groups); |
8280 | 8455 | ||
@@ -8283,28 +8458,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
8283 | 8458 | ||
8284 | child = sd->child; | 8459 | child = sd->child; |
8285 | 8460 | ||
8286 | sd->groups->__cpu_power = 0; | 8461 | sd->groups->cpu_power = 0; |
8287 | 8462 | ||
8288 | /* | 8463 | if (!child) { |
8289 | * For perf policy, if the groups in child domain share resources | 8464 | power = SCHED_LOAD_SCALE; |
8290 | * (for example cores sharing some portions of the cache hierarchy | 8465 | weight = cpumask_weight(sched_domain_span(sd)); |
8291 | * or SMT), then set this domain groups cpu_power such that each group | 8466 | /* |
8292 | * can handle only one task, when there are other idle groups in the | 8467 | * SMT siblings share the power of a single core. |
8293 | * same sched domain. | 8468 | * Usually multiple threads get a better yield out of |
8294 | */ | 8469 | * that one core than a single thread would have, |
8295 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 8470 | * reflect that in sd->smt_gain. |
8296 | (child->flags & | 8471 | */ |
8297 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 8472 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
8298 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); | 8473 | power *= sd->smt_gain; |
8474 | power /= weight; | ||
8475 | power >>= SCHED_LOAD_SHIFT; | ||
8476 | } | ||
8477 | sd->groups->cpu_power += power; | ||
8299 | return; | 8478 | return; |
8300 | } | 8479 | } |
8301 | 8480 | ||
8302 | /* | 8481 | /* |
8303 | * add cpu_power of each child group to this groups cpu_power | 8482 | * Add cpu_power of each child group to this groups cpu_power. |
8304 | */ | 8483 | */ |
8305 | group = child->groups; | 8484 | group = child->groups; |
8306 | do { | 8485 | do { |
8307 | sg_inc_cpu_power(sd->groups, group->__cpu_power); | 8486 | sd->groups->cpu_power += group->cpu_power; |
8308 | group = group->next; | 8487 | group = group->next; |
8309 | } while (group != child->groups); | 8488 | } while (group != child->groups); |
8310 | } | 8489 | } |
@@ -8371,287 +8550,292 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
8371 | request = attr->relax_domain_level; | 8550 | request = attr->relax_domain_level; |
8372 | if (request < sd->level) { | 8551 | if (request < sd->level) { |
8373 | /* turn off idle balance on this domain */ | 8552 | /* turn off idle balance on this domain */ |
8374 | sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); | 8553 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
8375 | } else { | 8554 | } else { |
8376 | /* turn on idle balance on this domain */ | 8555 | /* turn on idle balance on this domain */ |
8377 | sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); | 8556 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
8557 | } | ||
8558 | } | ||
8559 | |||
8560 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
8561 | const struct cpumask *cpu_map) | ||
8562 | { | ||
8563 | switch (what) { | ||
8564 | case sa_sched_groups: | ||
8565 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
8566 | d->sched_group_nodes = NULL; | ||
8567 | case sa_rootdomain: | ||
8568 | free_rootdomain(d->rd); /* fall through */ | ||
8569 | case sa_tmpmask: | ||
8570 | free_cpumask_var(d->tmpmask); /* fall through */ | ||
8571 | case sa_send_covered: | ||
8572 | free_cpumask_var(d->send_covered); /* fall through */ | ||
8573 | case sa_this_core_map: | ||
8574 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
8575 | case sa_this_sibling_map: | ||
8576 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
8577 | case sa_nodemask: | ||
8578 | free_cpumask_var(d->nodemask); /* fall through */ | ||
8579 | case sa_sched_group_nodes: | ||
8580 | #ifdef CONFIG_NUMA | ||
8581 | kfree(d->sched_group_nodes); /* fall through */ | ||
8582 | case sa_notcovered: | ||
8583 | free_cpumask_var(d->notcovered); /* fall through */ | ||
8584 | case sa_covered: | ||
8585 | free_cpumask_var(d->covered); /* fall through */ | ||
8586 | case sa_domainspan: | ||
8587 | free_cpumask_var(d->domainspan); /* fall through */ | ||
8588 | #endif | ||
8589 | case sa_none: | ||
8590 | break; | ||
8378 | } | 8591 | } |
8379 | } | 8592 | } |
8380 | 8593 | ||
8381 | /* | 8594 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
8382 | * Build sched domains for a given set of cpus and attach the sched domains | 8595 | const struct cpumask *cpu_map) |
8383 | * to the individual cpus | ||
8384 | */ | ||
8385 | static int __build_sched_domains(const struct cpumask *cpu_map, | ||
8386 | struct sched_domain_attr *attr) | ||
8387 | { | 8596 | { |
8388 | int i, err = -ENOMEM; | ||
8389 | struct root_domain *rd; | ||
8390 | cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, | ||
8391 | tmpmask; | ||
8392 | #ifdef CONFIG_NUMA | ||
8393 | cpumask_var_t domainspan, covered, notcovered; | ||
8394 | struct sched_group **sched_group_nodes = NULL; | ||
8395 | int sd_allnodes = 0; | ||
8396 | |||
8397 | if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) | ||
8398 | goto out; | ||
8399 | if (!alloc_cpumask_var(&covered, GFP_KERNEL)) | ||
8400 | goto free_domainspan; | ||
8401 | if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) | ||
8402 | goto free_covered; | ||
8403 | #endif | ||
8404 | |||
8405 | if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) | ||
8406 | goto free_notcovered; | ||
8407 | if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) | ||
8408 | goto free_nodemask; | ||
8409 | if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) | ||
8410 | goto free_this_sibling_map; | ||
8411 | if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) | ||
8412 | goto free_this_core_map; | ||
8413 | if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
8414 | goto free_send_covered; | ||
8415 | |||
8416 | #ifdef CONFIG_NUMA | 8597 | #ifdef CONFIG_NUMA |
8417 | /* | 8598 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) |
8418 | * Allocate the per-node list of sched groups | 8599 | return sa_none; |
8419 | */ | 8600 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) |
8420 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), | 8601 | return sa_domainspan; |
8421 | GFP_KERNEL); | 8602 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) |
8422 | if (!sched_group_nodes) { | 8603 | return sa_covered; |
8604 | /* Allocate the per-node list of sched groups */ | ||
8605 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
8606 | sizeof(struct sched_group *), GFP_KERNEL); | ||
8607 | if (!d->sched_group_nodes) { | ||
8423 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 8608 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
8424 | goto free_tmpmask; | 8609 | return sa_notcovered; |
8425 | } | 8610 | } |
8426 | #endif | 8611 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; |
8427 | 8612 | #endif | |
8428 | rd = alloc_rootdomain(); | 8613 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) |
8429 | if (!rd) { | 8614 | return sa_sched_group_nodes; |
8615 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
8616 | return sa_nodemask; | ||
8617 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
8618 | return sa_this_sibling_map; | ||
8619 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
8620 | return sa_this_core_map; | ||
8621 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
8622 | return sa_send_covered; | ||
8623 | d->rd = alloc_rootdomain(); | ||
8624 | if (!d->rd) { | ||
8430 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 8625 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
8431 | goto free_sched_groups; | 8626 | return sa_tmpmask; |
8432 | } | 8627 | } |
8628 | return sa_rootdomain; | ||
8629 | } | ||
8433 | 8630 | ||
8631 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | ||
8632 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | ||
8633 | { | ||
8634 | struct sched_domain *sd = NULL; | ||
8434 | #ifdef CONFIG_NUMA | 8635 | #ifdef CONFIG_NUMA |
8435 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; | 8636 | struct sched_domain *parent; |
8436 | #endif | ||
8437 | |||
8438 | /* | ||
8439 | * Set up domains for cpus specified by the cpu_map. | ||
8440 | */ | ||
8441 | for_each_cpu(i, cpu_map) { | ||
8442 | struct sched_domain *sd = NULL, *p; | ||
8443 | |||
8444 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); | ||
8445 | |||
8446 | #ifdef CONFIG_NUMA | ||
8447 | if (cpumask_weight(cpu_map) > | ||
8448 | SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { | ||
8449 | sd = &per_cpu(allnodes_domains, i).sd; | ||
8450 | SD_INIT(sd, ALLNODES); | ||
8451 | set_domain_attribute(sd, attr); | ||
8452 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
8453 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
8454 | p = sd; | ||
8455 | sd_allnodes = 1; | ||
8456 | } else | ||
8457 | p = NULL; | ||
8458 | 8637 | ||
8459 | sd = &per_cpu(node_domains, i).sd; | 8638 | d->sd_allnodes = 0; |
8460 | SD_INIT(sd, NODE); | 8639 | if (cpumask_weight(cpu_map) > |
8640 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
8641 | sd = &per_cpu(allnodes_domains, i).sd; | ||
8642 | SD_INIT(sd, ALLNODES); | ||
8461 | set_domain_attribute(sd, attr); | 8643 | set_domain_attribute(sd, attr); |
8462 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | 8644 | cpumask_copy(sched_domain_span(sd), cpu_map); |
8463 | sd->parent = p; | 8645 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); |
8464 | if (p) | 8646 | d->sd_allnodes = 1; |
8465 | p->child = sd; | 8647 | } |
8466 | cpumask_and(sched_domain_span(sd), | 8648 | parent = sd; |
8467 | sched_domain_span(sd), cpu_map); | 8649 | |
8650 | sd = &per_cpu(node_domains, i).sd; | ||
8651 | SD_INIT(sd, NODE); | ||
8652 | set_domain_attribute(sd, attr); | ||
8653 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
8654 | sd->parent = parent; | ||
8655 | if (parent) | ||
8656 | parent->child = sd; | ||
8657 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
8468 | #endif | 8658 | #endif |
8659 | return sd; | ||
8660 | } | ||
8469 | 8661 | ||
8470 | p = sd; | 8662 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, |
8471 | sd = &per_cpu(phys_domains, i).sd; | 8663 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
8472 | SD_INIT(sd, CPU); | 8664 | struct sched_domain *parent, int i) |
8473 | set_domain_attribute(sd, attr); | 8665 | { |
8474 | cpumask_copy(sched_domain_span(sd), nodemask); | 8666 | struct sched_domain *sd; |
8475 | sd->parent = p; | 8667 | sd = &per_cpu(phys_domains, i).sd; |
8476 | if (p) | 8668 | SD_INIT(sd, CPU); |
8477 | p->child = sd; | 8669 | set_domain_attribute(sd, attr); |
8478 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); | 8670 | cpumask_copy(sched_domain_span(sd), d->nodemask); |
8671 | sd->parent = parent; | ||
8672 | if (parent) | ||
8673 | parent->child = sd; | ||
8674 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
8675 | return sd; | ||
8676 | } | ||
8479 | 8677 | ||
8678 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | ||
8679 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8680 | struct sched_domain *parent, int i) | ||
8681 | { | ||
8682 | struct sched_domain *sd = parent; | ||
8480 | #ifdef CONFIG_SCHED_MC | 8683 | #ifdef CONFIG_SCHED_MC |
8481 | p = sd; | 8684 | sd = &per_cpu(core_domains, i).sd; |
8482 | sd = &per_cpu(core_domains, i).sd; | 8685 | SD_INIT(sd, MC); |
8483 | SD_INIT(sd, MC); | 8686 | set_domain_attribute(sd, attr); |
8484 | set_domain_attribute(sd, attr); | 8687 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); |
8485 | cpumask_and(sched_domain_span(sd), cpu_map, | 8688 | sd->parent = parent; |
8486 | cpu_coregroup_mask(i)); | 8689 | parent->child = sd; |
8487 | sd->parent = p; | 8690 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); |
8488 | p->child = sd; | ||
8489 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); | ||
8490 | #endif | 8691 | #endif |
8692 | return sd; | ||
8693 | } | ||
8491 | 8694 | ||
8695 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
8696 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8697 | struct sched_domain *parent, int i) | ||
8698 | { | ||
8699 | struct sched_domain *sd = parent; | ||
8492 | #ifdef CONFIG_SCHED_SMT | 8700 | #ifdef CONFIG_SCHED_SMT |
8493 | p = sd; | 8701 | sd = &per_cpu(cpu_domains, i).sd; |
8494 | sd = &per_cpu(cpu_domains, i).sd; | 8702 | SD_INIT(sd, SIBLING); |
8495 | SD_INIT(sd, SIBLING); | 8703 | set_domain_attribute(sd, attr); |
8496 | set_domain_attribute(sd, attr); | 8704 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); |
8497 | cpumask_and(sched_domain_span(sd), | 8705 | sd->parent = parent; |
8498 | topology_thread_cpumask(i), cpu_map); | 8706 | parent->child = sd; |
8499 | sd->parent = p; | 8707 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); |
8500 | p->child = sd; | ||
8501 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); | ||
8502 | #endif | 8708 | #endif |
8503 | } | 8709 | return sd; |
8710 | } | ||
8504 | 8711 | ||
8712 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | ||
8713 | const struct cpumask *cpu_map, int cpu) | ||
8714 | { | ||
8715 | switch (l) { | ||
8505 | #ifdef CONFIG_SCHED_SMT | 8716 | #ifdef CONFIG_SCHED_SMT |
8506 | /* Set up CPU (sibling) groups */ | 8717 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ |
8507 | for_each_cpu(i, cpu_map) { | 8718 | cpumask_and(d->this_sibling_map, cpu_map, |
8508 | cpumask_and(this_sibling_map, | 8719 | topology_thread_cpumask(cpu)); |
8509 | topology_thread_cpumask(i), cpu_map); | 8720 | if (cpu == cpumask_first(d->this_sibling_map)) |
8510 | if (i != cpumask_first(this_sibling_map)) | 8721 | init_sched_build_groups(d->this_sibling_map, cpu_map, |
8511 | continue; | 8722 | &cpu_to_cpu_group, |
8512 | 8723 | d->send_covered, d->tmpmask); | |
8513 | init_sched_build_groups(this_sibling_map, cpu_map, | 8724 | break; |
8514 | &cpu_to_cpu_group, | ||
8515 | send_covered, tmpmask); | ||
8516 | } | ||
8517 | #endif | 8725 | #endif |
8518 | |||
8519 | #ifdef CONFIG_SCHED_MC | 8726 | #ifdef CONFIG_SCHED_MC |
8520 | /* Set up multi-core groups */ | 8727 | case SD_LV_MC: /* set up multi-core groups */ |
8521 | for_each_cpu(i, cpu_map) { | 8728 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); |
8522 | cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); | 8729 | if (cpu == cpumask_first(d->this_core_map)) |
8523 | if (i != cpumask_first(this_core_map)) | 8730 | init_sched_build_groups(d->this_core_map, cpu_map, |
8524 | continue; | 8731 | &cpu_to_core_group, |
8525 | 8732 | d->send_covered, d->tmpmask); | |
8526 | init_sched_build_groups(this_core_map, cpu_map, | 8733 | break; |
8527 | &cpu_to_core_group, | ||
8528 | send_covered, tmpmask); | ||
8529 | } | ||
8530 | #endif | 8734 | #endif |
8531 | 8735 | case SD_LV_CPU: /* set up physical groups */ | |
8532 | /* Set up physical groups */ | 8736 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
8533 | for (i = 0; i < nr_node_ids; i++) { | 8737 | if (!cpumask_empty(d->nodemask)) |
8534 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8738 | init_sched_build_groups(d->nodemask, cpu_map, |
8535 | if (cpumask_empty(nodemask)) | 8739 | &cpu_to_phys_group, |
8536 | continue; | 8740 | d->send_covered, d->tmpmask); |
8537 | 8741 | break; | |
8538 | init_sched_build_groups(nodemask, cpu_map, | ||
8539 | &cpu_to_phys_group, | ||
8540 | send_covered, tmpmask); | ||
8541 | } | ||
8542 | |||
8543 | #ifdef CONFIG_NUMA | 8742 | #ifdef CONFIG_NUMA |
8544 | /* Set up node groups */ | 8743 | case SD_LV_ALLNODES: |
8545 | if (sd_allnodes) { | 8744 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, |
8546 | init_sched_build_groups(cpu_map, cpu_map, | 8745 | d->send_covered, d->tmpmask); |
8547 | &cpu_to_allnodes_group, | 8746 | break; |
8548 | send_covered, tmpmask); | 8747 | #endif |
8748 | default: | ||
8749 | break; | ||
8549 | } | 8750 | } |
8751 | } | ||
8550 | 8752 | ||
8551 | for (i = 0; i < nr_node_ids; i++) { | 8753 | /* |
8552 | /* Set up node groups */ | 8754 | * Build sched domains for a given set of cpus and attach the sched domains |
8553 | struct sched_group *sg, *prev; | 8755 | * to the individual cpus |
8554 | int j; | 8756 | */ |
8555 | 8757 | static int __build_sched_domains(const struct cpumask *cpu_map, | |
8556 | cpumask_clear(covered); | 8758 | struct sched_domain_attr *attr) |
8557 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8759 | { |
8558 | if (cpumask_empty(nodemask)) { | 8760 | enum s_alloc alloc_state = sa_none; |
8559 | sched_group_nodes[i] = NULL; | 8761 | struct s_data d; |
8560 | continue; | 8762 | struct sched_domain *sd; |
8561 | } | 8763 | int i; |
8764 | #ifdef CONFIG_NUMA | ||
8765 | d.sd_allnodes = 0; | ||
8766 | #endif | ||
8562 | 8767 | ||
8563 | sched_domain_node_span(i, domainspan); | 8768 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
8564 | cpumask_and(domainspan, domainspan, cpu_map); | 8769 | if (alloc_state != sa_rootdomain) |
8770 | goto error; | ||
8771 | alloc_state = sa_sched_groups; | ||
8565 | 8772 | ||
8566 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 8773 | /* |
8567 | GFP_KERNEL, i); | 8774 | * Set up domains for cpus specified by the cpu_map. |
8568 | if (!sg) { | 8775 | */ |
8569 | printk(KERN_WARNING "Can not alloc domain group for " | 8776 | for_each_cpu(i, cpu_map) { |
8570 | "node %d\n", i); | 8777 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), |
8571 | goto error; | 8778 | cpu_map); |
8572 | } | ||
8573 | sched_group_nodes[i] = sg; | ||
8574 | for_each_cpu(j, nodemask) { | ||
8575 | struct sched_domain *sd; | ||
8576 | 8779 | ||
8577 | sd = &per_cpu(node_domains, j).sd; | 8780 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
8578 | sd->groups = sg; | 8781 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
8579 | } | 8782 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
8580 | sg->__cpu_power = 0; | 8783 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
8581 | cpumask_copy(sched_group_cpus(sg), nodemask); | 8784 | } |
8582 | sg->next = sg; | ||
8583 | cpumask_or(covered, covered, nodemask); | ||
8584 | prev = sg; | ||
8585 | 8785 | ||
8586 | for (j = 0; j < nr_node_ids; j++) { | 8786 | for_each_cpu(i, cpu_map) { |
8587 | int n = (i + j) % nr_node_ids; | 8787 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
8788 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
8789 | } | ||
8588 | 8790 | ||
8589 | cpumask_complement(notcovered, covered); | 8791 | /* Set up physical groups */ |
8590 | cpumask_and(tmpmask, notcovered, cpu_map); | 8792 | for (i = 0; i < nr_node_ids; i++) |
8591 | cpumask_and(tmpmask, tmpmask, domainspan); | 8793 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); |
8592 | if (cpumask_empty(tmpmask)) | ||
8593 | break; | ||
8594 | 8794 | ||
8595 | cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); | 8795 | #ifdef CONFIG_NUMA |
8596 | if (cpumask_empty(tmpmask)) | 8796 | /* Set up node groups */ |
8597 | continue; | 8797 | if (d.sd_allnodes) |
8798 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
8598 | 8799 | ||
8599 | sg = kmalloc_node(sizeof(struct sched_group) + | 8800 | for (i = 0; i < nr_node_ids; i++) |
8600 | cpumask_size(), | 8801 | if (build_numa_sched_groups(&d, cpu_map, i)) |
8601 | GFP_KERNEL, i); | 8802 | goto error; |
8602 | if (!sg) { | ||
8603 | printk(KERN_WARNING | ||
8604 | "Can not alloc domain group for node %d\n", j); | ||
8605 | goto error; | ||
8606 | } | ||
8607 | sg->__cpu_power = 0; | ||
8608 | cpumask_copy(sched_group_cpus(sg), tmpmask); | ||
8609 | sg->next = prev->next; | ||
8610 | cpumask_or(covered, covered, tmpmask); | ||
8611 | prev->next = sg; | ||
8612 | prev = sg; | ||
8613 | } | ||
8614 | } | ||
8615 | #endif | 8803 | #endif |
8616 | 8804 | ||
8617 | /* Calculate CPU power for physical packages and nodes */ | 8805 | /* Calculate CPU power for physical packages and nodes */ |
8618 | #ifdef CONFIG_SCHED_SMT | 8806 | #ifdef CONFIG_SCHED_SMT |
8619 | for_each_cpu(i, cpu_map) { | 8807 | for_each_cpu(i, cpu_map) { |
8620 | struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; | 8808 | sd = &per_cpu(cpu_domains, i).sd; |
8621 | |||
8622 | init_sched_groups_power(i, sd); | 8809 | init_sched_groups_power(i, sd); |
8623 | } | 8810 | } |
8624 | #endif | 8811 | #endif |
8625 | #ifdef CONFIG_SCHED_MC | 8812 | #ifdef CONFIG_SCHED_MC |
8626 | for_each_cpu(i, cpu_map) { | 8813 | for_each_cpu(i, cpu_map) { |
8627 | struct sched_domain *sd = &per_cpu(core_domains, i).sd; | 8814 | sd = &per_cpu(core_domains, i).sd; |
8628 | |||
8629 | init_sched_groups_power(i, sd); | 8815 | init_sched_groups_power(i, sd); |
8630 | } | 8816 | } |
8631 | #endif | 8817 | #endif |
8632 | 8818 | ||
8633 | for_each_cpu(i, cpu_map) { | 8819 | for_each_cpu(i, cpu_map) { |
8634 | struct sched_domain *sd = &per_cpu(phys_domains, i).sd; | 8820 | sd = &per_cpu(phys_domains, i).sd; |
8635 | |||
8636 | init_sched_groups_power(i, sd); | 8821 | init_sched_groups_power(i, sd); |
8637 | } | 8822 | } |
8638 | 8823 | ||
8639 | #ifdef CONFIG_NUMA | 8824 | #ifdef CONFIG_NUMA |
8640 | for (i = 0; i < nr_node_ids; i++) | 8825 | for (i = 0; i < nr_node_ids; i++) |
8641 | init_numa_sched_groups_power(sched_group_nodes[i]); | 8826 | init_numa_sched_groups_power(d.sched_group_nodes[i]); |
8642 | 8827 | ||
8643 | if (sd_allnodes) { | 8828 | if (d.sd_allnodes) { |
8644 | struct sched_group *sg; | 8829 | struct sched_group *sg; |
8645 | 8830 | ||
8646 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 8831 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, |
8647 | tmpmask); | 8832 | d.tmpmask); |
8648 | init_numa_sched_groups_power(sg); | 8833 | init_numa_sched_groups_power(sg); |
8649 | } | 8834 | } |
8650 | #endif | 8835 | #endif |
8651 | 8836 | ||
8652 | /* Attach the domains */ | 8837 | /* Attach the domains */ |
8653 | for_each_cpu(i, cpu_map) { | 8838 | for_each_cpu(i, cpu_map) { |
8654 | struct sched_domain *sd; | ||
8655 | #ifdef CONFIG_SCHED_SMT | 8839 | #ifdef CONFIG_SCHED_SMT |
8656 | sd = &per_cpu(cpu_domains, i).sd; | 8840 | sd = &per_cpu(cpu_domains, i).sd; |
8657 | #elif defined(CONFIG_SCHED_MC) | 8841 | #elif defined(CONFIG_SCHED_MC) |
@@ -8659,44 +8843,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
8659 | #else | 8843 | #else |
8660 | sd = &per_cpu(phys_domains, i).sd; | 8844 | sd = &per_cpu(phys_domains, i).sd; |
8661 | #endif | 8845 | #endif |
8662 | cpu_attach_domain(sd, rd, i); | 8846 | cpu_attach_domain(sd, d.rd, i); |
8663 | } | 8847 | } |
8664 | 8848 | ||
8665 | err = 0; | 8849 | d.sched_group_nodes = NULL; /* don't free this we still need it */ |
8666 | 8850 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | |
8667 | free_tmpmask: | 8851 | return 0; |
8668 | free_cpumask_var(tmpmask); | ||
8669 | free_send_covered: | ||
8670 | free_cpumask_var(send_covered); | ||
8671 | free_this_core_map: | ||
8672 | free_cpumask_var(this_core_map); | ||
8673 | free_this_sibling_map: | ||
8674 | free_cpumask_var(this_sibling_map); | ||
8675 | free_nodemask: | ||
8676 | free_cpumask_var(nodemask); | ||
8677 | free_notcovered: | ||
8678 | #ifdef CONFIG_NUMA | ||
8679 | free_cpumask_var(notcovered); | ||
8680 | free_covered: | ||
8681 | free_cpumask_var(covered); | ||
8682 | free_domainspan: | ||
8683 | free_cpumask_var(domainspan); | ||
8684 | out: | ||
8685 | #endif | ||
8686 | return err; | ||
8687 | |||
8688 | free_sched_groups: | ||
8689 | #ifdef CONFIG_NUMA | ||
8690 | kfree(sched_group_nodes); | ||
8691 | #endif | ||
8692 | goto free_tmpmask; | ||
8693 | 8852 | ||
8694 | #ifdef CONFIG_NUMA | ||
8695 | error: | 8853 | error: |
8696 | free_sched_groups(cpu_map, tmpmask); | 8854 | __free_domain_allocs(&d, alloc_state, cpu_map); |
8697 | free_rootdomain(rd); | 8855 | return -ENOMEM; |
8698 | goto free_tmpmask; | ||
8699 | #endif | ||
8700 | } | 8856 | } |
8701 | 8857 | ||
8702 | static int build_sched_domains(const struct cpumask *cpu_map) | 8858 | static int build_sched_domains(const struct cpumask *cpu_map) |
@@ -9304,11 +9460,11 @@ void __init sched_init(void) | |||
9304 | * system cpu resource, based on the weight assigned to root | 9460 | * system cpu resource, based on the weight assigned to root |
9305 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | 9461 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished |
9306 | * by letting tasks of init_task_group sit in a separate cfs_rq | 9462 | * by letting tasks of init_task_group sit in a separate cfs_rq |
9307 | * (init_cfs_rq) and having one entity represent this group of | 9463 | * (init_tg_cfs_rq) and having one entity represent this group of |
9308 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | 9464 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). |
9309 | */ | 9465 | */ |
9310 | init_tg_cfs_entry(&init_task_group, | 9466 | init_tg_cfs_entry(&init_task_group, |
9311 | &per_cpu(init_cfs_rq, i), | 9467 | &per_cpu(init_tg_cfs_rq, i), |
9312 | &per_cpu(init_sched_entity, i), i, 1, | 9468 | &per_cpu(init_sched_entity, i), i, 1, |
9313 | root_task_group.se[i]); | 9469 | root_task_group.se[i]); |
9314 | 9470 | ||
@@ -9334,6 +9490,7 @@ void __init sched_init(void) | |||
9334 | #ifdef CONFIG_SMP | 9490 | #ifdef CONFIG_SMP |
9335 | rq->sd = NULL; | 9491 | rq->sd = NULL; |
9336 | rq->rd = NULL; | 9492 | rq->rd = NULL; |
9493 | rq->post_schedule = 0; | ||
9337 | rq->active_balance = 0; | 9494 | rq->active_balance = 0; |
9338 | rq->next_balance = jiffies; | 9495 | rq->next_balance = jiffies; |
9339 | rq->push_cpu = 0; | 9496 | rq->push_cpu = 0; |
@@ -9398,13 +9555,20 @@ void __init sched_init(void) | |||
9398 | } | 9555 | } |
9399 | 9556 | ||
9400 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 9557 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
9401 | void __might_sleep(char *file, int line) | 9558 | static inline int preempt_count_equals(int preempt_offset) |
9559 | { | ||
9560 | int nested = preempt_count() & ~PREEMPT_ACTIVE; | ||
9561 | |||
9562 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | ||
9563 | } | ||
9564 | |||
9565 | void __might_sleep(char *file, int line, int preempt_offset) | ||
9402 | { | 9566 | { |
9403 | #ifdef in_atomic | 9567 | #ifdef in_atomic |
9404 | static unsigned long prev_jiffy; /* ratelimiting */ | 9568 | static unsigned long prev_jiffy; /* ratelimiting */ |
9405 | 9569 | ||
9406 | if ((!in_atomic() && !irqs_disabled()) || | 9570 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
9407 | system_state != SYSTEM_RUNNING || oops_in_progress) | 9571 | system_state != SYSTEM_RUNNING || oops_in_progress) |
9408 | return; | 9572 | return; |
9409 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 9573 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
9410 | return; | 9574 | return; |
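[Note on the __might_sleep() hunk above] The function now takes a preempt_offset, and preempt_count_equals() checks that, once PREEMPT_ACTIVE is masked off, the preemption count matches exactly the nesting the caller declared it would legitimately hold (for example the one spinlock a cond_resched_lock() caller is about to drop). A standalone sketch of that comparison; the real bit value of PREEMPT_ACTIVE and the PREEMPT_INATOMIC_BASE term from hardirq.h are simplified away here.

#include <stdio.h>

/* made-up layout: low bits count nesting, one high bit marks PREEMPT_ACTIVE */
#define PREEMPT_ACTIVE 0x10000000

static int preempt_count_equals(int preempt_count, int preempt_offset)
{
	int nested = preempt_count & ~PREEMPT_ACTIVE;

	/* sleeping is fine only if exactly the declared nesting is held */
	return nested == preempt_offset;
}

int main(void)
{
	/* cond_resched(): no locks held, offset 0 */
	printf("%d\n", preempt_count_equals(0, 0));		/* 1: ok   */
	/* cond_resched_lock(): one spinlock held, offset 1 */
	printf("%d\n", preempt_count_equals(1, 1));		/* 1: ok   */
	/* unexpected extra nesting: would trigger the might_sleep splat */
	printf("%d\n", preempt_count_equals(2, 1));		/* 0: warn */
	/* PREEMPT_ACTIVE alone is masked out before comparing */
	printf("%d\n", preempt_count_equals(PREEMPT_ACTIVE, 0));	/* 1: ok   */
	return 0;
}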
@@ -10581,3 +10745,113 @@ struct cgroup_subsys cpuacct_subsys = { | |||
10581 | .subsys_id = cpuacct_subsys_id, | 10745 | .subsys_id = cpuacct_subsys_id, |
10582 | }; | 10746 | }; |
10583 | #endif /* CONFIG_CGROUP_CPUACCT */ | 10747 | #endif /* CONFIG_CGROUP_CPUACCT */ |
10748 | |||
10749 | #ifndef CONFIG_SMP | ||
10750 | |||
10751 | int rcu_expedited_torture_stats(char *page) | ||
10752 | { | ||
10753 | return 0; | ||
10754 | } | ||
10755 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
10756 | |||
10757 | void synchronize_sched_expedited(void) | ||
10758 | { | ||
10759 | } | ||
10760 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
10761 | |||
10762 | #else /* #ifndef CONFIG_SMP */ | ||
10763 | |||
10764 | static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); | ||
10765 | static DEFINE_MUTEX(rcu_sched_expedited_mutex); | ||
10766 | |||
10767 | #define RCU_EXPEDITED_STATE_POST -2 | ||
10768 | #define RCU_EXPEDITED_STATE_IDLE -1 | ||
10769 | |||
10770 | static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
10771 | |||
10772 | int rcu_expedited_torture_stats(char *page) | ||
10773 | { | ||
10774 | int cnt = 0; | ||
10775 | int cpu; | ||
10776 | |||
10777 | cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); | ||
10778 | for_each_online_cpu(cpu) { | ||
10779 | cnt += sprintf(&page[cnt], " %d:%d", | ||
10780 | cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); | ||
10781 | } | ||
10782 | cnt += sprintf(&page[cnt], "\n"); | ||
10783 | return cnt; | ||
10784 | } | ||
10785 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
10786 | |||
10787 | static long synchronize_sched_expedited_count; | ||
10788 | |||
10789 | /* | ||
10790 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
10791 | * approach to force grace period to end quickly. This consumes | ||
10792 | * significant time on all CPUs, and is thus not recommended for | ||
10793 | * any sort of common-case code. | ||
10794 | * | ||
10795 | * Note that it is illegal to call this function while holding any | ||
10796 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
10797 | * observe this restriction will result in deadlock. | ||
10798 | */ | ||
10799 | void synchronize_sched_expedited(void) | ||
10800 | { | ||
10801 | int cpu; | ||
10802 | unsigned long flags; | ||
10803 | bool need_full_sync = 0; | ||
10804 | struct rq *rq; | ||
10805 | struct migration_req *req; | ||
10806 | long snap; | ||
10807 | int trycount = 0; | ||
10808 | |||
10809 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
10810 | snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; | ||
10811 | get_online_cpus(); | ||
10812 | while (!mutex_trylock(&rcu_sched_expedited_mutex)) { | ||
10813 | put_online_cpus(); | ||
10814 | if (trycount++ < 10) | ||
10815 | udelay(trycount * num_online_cpus()); | ||
10816 | else { | ||
10817 | synchronize_sched(); | ||
10818 | return; | ||
10819 | } | ||
10820 | if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { | ||
10821 | smp_mb(); /* ensure test happens before caller kfree */ | ||
10822 | return; | ||
10823 | } | ||
10824 | get_online_cpus(); | ||
10825 | } | ||
10826 | rcu_expedited_state = RCU_EXPEDITED_STATE_POST; | ||
10827 | for_each_online_cpu(cpu) { | ||
10828 | rq = cpu_rq(cpu); | ||
10829 | req = &per_cpu(rcu_migration_req, cpu); | ||
10830 | init_completion(&req->done); | ||
10831 | req->task = NULL; | ||
10832 | req->dest_cpu = RCU_MIGRATION_NEED_QS; | ||
10833 | spin_lock_irqsave(&rq->lock, flags); | ||
10834 | list_add(&req->list, &rq->migration_queue); | ||
10835 | spin_unlock_irqrestore(&rq->lock, flags); | ||
10836 | wake_up_process(rq->migration_thread); | ||
10837 | } | ||
10838 | for_each_online_cpu(cpu) { | ||
10839 | rcu_expedited_state = cpu; | ||
10840 | req = &per_cpu(rcu_migration_req, cpu); | ||
10841 | rq = cpu_rq(cpu); | ||
10842 | wait_for_completion(&req->done); | ||
10843 | spin_lock_irqsave(&rq->lock, flags); | ||
10844 | if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) | ||
10845 | need_full_sync = 1; | ||
10846 | req->dest_cpu = RCU_MIGRATION_IDLE; | ||
10847 | spin_unlock_irqrestore(&rq->lock, flags); | ||
10848 | } | ||
10849 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
10850 | mutex_unlock(&rcu_sched_expedited_mutex); | ||
10851 | put_online_cpus(); | ||
10852 | if (need_full_sync) | ||
10853 | synchronize_sched(); | ||
10854 | } | ||
10855 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
10856 | |||
10857 | #endif /* #else #ifndef CONFIG_SMP */ | ||
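[Note on synchronize_sched_expedited() above] The function forces a grace period quickly by queueing a task-less migration request on every online cpu (the req->task == NULL case handled by migration_thread() earlier in this diff) and waiting for each acknowledgment. The snapshot of synchronize_sched_expedited_count combined with mutex_trylock() appears intended to let a caller that loses the race either piggyback on a pass that fully completed after it arrived or fall back to synchronize_sched(). A userspace sketch of just that snapshot/trylock/fallback shape using pthreads; names and timings are invented and none of the RCU machinery is reproduced.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t expedite_lock = PTHREAD_MUTEX_INITIALIZER;
static volatile long expedite_count;	/* bumped once per completed pass */

static void do_expensive_pass(void)	/* stands in for the per-cpu handshake */
{
	usleep(1000);
	__sync_fetch_and_add(&expedite_count, 1);
}

static void *expedited(void *arg)
{
	long snap = expedite_count + 1;	/* like ACCESS_ONCE(count) + 1 */
	int tries = 0;

	(void)arg;
	while (pthread_mutex_trylock(&expedite_lock) != 0) {
		if (tries++ >= 10)
			return NULL;	/* the kernel falls back to synchronize_sched() */
		usleep(10 * tries);
		/* a pass that began after our snapshot has completed: reuse it */
		if (expedite_count - snap > 0)
			return NULL;
	}
	do_expensive_pass();
	pthread_mutex_unlock(&expedite_lock);
	return NULL;
}

int main(void)
{
	pthread_t t[4];

	for (int i = 0; i < 4; i++)
		pthread_create(&t[i], NULL, expedited, NULL);
	for (int i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	printf("expensive passes actually run: %ld\n", expedite_count);
	return 0;
}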