Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 1751 |
1 file changed, 1023 insertions(+), 728 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..e88689522e66 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@ | |||
39 | #include <linux/completion.h> | 39 | #include <linux/completion.h> |
40 | #include <linux/kernel_stat.h> | 40 | #include <linux/kernel_stat.h> |
41 | #include <linux/debug_locks.h> | 41 | #include <linux/debug_locks.h> |
42 | #include <linux/perf_counter.h> | 42 | #include <linux/perf_event.h> |
43 | #include <linux/security.h> | 43 | #include <linux/security.h> |
44 | #include <linux/notifier.h> | 44 | #include <linux/notifier.h> |
45 | #include <linux/profile.h> | 45 | #include <linux/profile.h> |
@@ -64,7 +64,6 @@ | |||
64 | #include <linux/tsacct_kern.h> | 64 | #include <linux/tsacct_kern.h> |
65 | #include <linux/kprobes.h> | 65 | #include <linux/kprobes.h> |
66 | #include <linux/delayacct.h> | 66 | #include <linux/delayacct.h> |
67 | #include <linux/reciprocal_div.h> | ||
68 | #include <linux/unistd.h> | 67 | #include <linux/unistd.h> |
69 | #include <linux/pagemap.h> | 68 | #include <linux/pagemap.h> |
70 | #include <linux/hrtimer.h> | 69 | #include <linux/hrtimer.h> |
@@ -120,30 +119,6 @@ | |||
120 | */ | 119 | */ |
121 | #define RUNTIME_INF ((u64)~0ULL) | 120 | #define RUNTIME_INF ((u64)~0ULL) |
122 | 121 | ||
123 | #ifdef CONFIG_SMP | ||
124 | |||
125 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
126 | |||
127 | /* | ||
128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | ||
129 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | ||
130 | */ | ||
131 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | ||
132 | { | ||
133 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * Each time a sched group cpu_power is changed, | ||
138 | * we must compute its reciprocal value | ||
139 | */ | ||
140 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | ||
141 | { | ||
142 | sg->__cpu_power += val; | ||
143 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | ||
144 | } | ||
145 | #endif | ||
146 | |||
147 | static inline int rt_policy(int policy) | 122 | static inline int rt_policy(int policy) |
148 | { | 123 | { |
149 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 124 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user) | |||
309 | 284 | ||
310 | /* | 285 | /* |
311 | * Root task group. | 286 | * Root task group. |
312 | * Every UID task group (including init_task_group aka UID-0) will | 287 | * Every UID task group (including init_task_group aka UID-0) will |
313 | * be a child to this group. | 288 | * be a child to this group. |
314 | */ | 289 | */ |
315 | struct task_group root_task_group; | 290 | struct task_group root_task_group; |
316 | 291 | ||
@@ -318,12 +293,12 @@ struct task_group root_task_group; | |||
318 | /* Default task group's sched entity on each cpu */ | 293 | /* Default task group's sched entity on each cpu */ |
319 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 294 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
320 | /* Default task group's cfs_rq on each cpu */ | 295 | /* Default task group's cfs_rq on each cpu */ |
321 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 296 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); |
322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 297 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
323 | 298 | ||
324 | #ifdef CONFIG_RT_GROUP_SCHED | 299 | #ifdef CONFIG_RT_GROUP_SCHED |
325 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 300 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
326 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 301 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); |
327 | #endif /* CONFIG_RT_GROUP_SCHED */ | 302 | #endif /* CONFIG_RT_GROUP_SCHED */ |
328 | #else /* !CONFIG_USER_SCHED */ | 303 | #else /* !CONFIG_USER_SCHED */ |
329 | #define root_task_group init_task_group | 304 | #define root_task_group init_task_group |
@@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
401 | 376 | ||
402 | #else | 377 | #else |
403 | 378 | ||
404 | #ifdef CONFIG_SMP | ||
405 | static int root_task_group_empty(void) | ||
406 | { | ||
407 | return 1; | ||
408 | } | ||
409 | #endif | ||
410 | |||
411 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 379 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
412 | static inline struct task_group *task_group(struct task_struct *p) | 380 | static inline struct task_group *task_group(struct task_struct *p) |
413 | { | 381 | { |
@@ -537,14 +505,6 @@ struct root_domain { | |||
537 | #ifdef CONFIG_SMP | 505 | #ifdef CONFIG_SMP |
538 | struct cpupri cpupri; | 506 | struct cpupri cpupri; |
539 | #endif | 507 | #endif |
540 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
541 | /* | ||
542 | * Preferred wake up cpu nominated by sched_mc balance that will be | ||
543 | * used when most cpus are idle in the system indicating overall very | ||
544 | * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) | ||
545 | */ | ||
546 | unsigned int sched_mc_preferred_wakeup_cpu; | ||
547 | #endif | ||
548 | }; | 508 | }; |
549 | 509 | ||
550 | /* | 510 | /* |
@@ -616,6 +576,7 @@ struct rq { | |||
616 | 576 | ||
617 | unsigned char idle_at_tick; | 577 | unsigned char idle_at_tick; |
618 | /* For active balancing */ | 578 | /* For active balancing */ |
579 | int post_schedule; | ||
619 | int active_balance; | 580 | int active_balance; |
620 | int push_cpu; | 581 | int push_cpu; |
621 | /* cpu of this runqueue: */ | 582 | /* cpu of this runqueue: */ |
@@ -626,6 +587,9 @@ struct rq { | |||
626 | 587 | ||
627 | struct task_struct *migration_thread; | 588 | struct task_struct *migration_thread; |
628 | struct list_head migration_queue; | 589 | struct list_head migration_queue; |
590 | |||
591 | u64 rt_avg; | ||
592 | u64 age_stamp; | ||
629 | #endif | 593 | #endif |
630 | 594 | ||
631 | /* calc_load related fields */ | 595 | /* calc_load related fields */ |
@@ -665,9 +629,10 @@ struct rq { | |||
665 | 629 | ||
666 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 630 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
667 | 631 | ||
668 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) | 632 | static inline |
633 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
669 | { | 634 | { |
670 | rq->curr->sched_class->check_preempt_curr(rq, p, sync); | 635 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); |
671 | } | 636 | } |
672 | 637 | ||
673 | static inline int cpu_of(struct rq *rq) | 638 | static inline int cpu_of(struct rq *rq) |
@@ -693,6 +658,7 @@ static inline int cpu_of(struct rq *rq) | |||
693 | #define this_rq() (&__get_cpu_var(runqueues)) | 658 | #define this_rq() (&__get_cpu_var(runqueues)) |
694 | #define task_rq(p) cpu_rq(task_cpu(p)) | 659 | #define task_rq(p) cpu_rq(task_cpu(p)) |
695 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 660 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
661 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
696 | 662 | ||
697 | inline void update_rq_clock(struct rq *rq) | 663 | inline void update_rq_clock(struct rq *rq) |
698 | { | 664 | { |
@@ -710,20 +676,15 @@ inline void update_rq_clock(struct rq *rq) | |||
710 | 676 | ||
711 | /** | 677 | /** |
712 | * runqueue_is_locked | 678 | * runqueue_is_locked |
679 | * @cpu: the processor in question. | ||
713 | * | 680 | * |
714 | * Returns true if the current cpu runqueue is locked. | 681 | * Returns true if the current cpu runqueue is locked. |
715 | * This interface allows printk to be called with the runqueue lock | 682 | * This interface allows printk to be called with the runqueue lock |
716 | * held and know whether or not it is OK to wake up the klogd. | 683 | * held and know whether or not it is OK to wake up the klogd. |
717 | */ | 684 | */ |
718 | int runqueue_is_locked(void) | 685 | int runqueue_is_locked(int cpu) |
719 | { | 686 | { |
720 | int cpu = get_cpu(); | 687 | return spin_is_locked(&cpu_rq(cpu)->lock); |
721 | struct rq *rq = cpu_rq(cpu); | ||
722 | int ret; | ||
723 | |||
724 | ret = spin_is_locked(&rq->lock); | ||
725 | put_cpu(); | ||
726 | return ret; | ||
727 | } | 688 | } |
728 | 689 | ||
729 | /* | 690 | /* |
@@ -820,7 +781,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp) | |||
820 | return single_open(filp, sched_feat_show, NULL); | 781 | return single_open(filp, sched_feat_show, NULL); |
821 | } | 782 | } |
822 | 783 | ||
823 | static struct file_operations sched_feat_fops = { | 784 | static const struct file_operations sched_feat_fops = { |
824 | .open = sched_feat_open, | 785 | .open = sched_feat_open, |
825 | .write = sched_feat_write, | 786 | .write = sched_feat_write, |
826 | .read = seq_read, | 787 | .read = seq_read, |
@@ -861,6 +822,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; | |||
861 | unsigned int sysctl_sched_shares_thresh = 4; | 822 | unsigned int sysctl_sched_shares_thresh = 4; |
862 | 823 | ||
863 | /* | 824 | /* |
825 | * period over which we average the RT time consumption, measured | ||
826 | * in ms. | ||
827 | * | ||
828 | * default: 1s | ||
829 | */ | ||
830 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | ||
831 | |||
832 | /* | ||
864 | * period over which we measure -rt task cpu usage in us. | 833 | * period over which we measure -rt task cpu usage in us. |
865 | * default: 1s | 834 | * default: 1s |
866 | */ | 835 | */ |
@@ -1278,12 +1247,37 @@ void wake_up_idle_cpu(int cpu) | |||
1278 | } | 1247 | } |
1279 | #endif /* CONFIG_NO_HZ */ | 1248 | #endif /* CONFIG_NO_HZ */ |
1280 | 1249 | ||
1250 | static u64 sched_avg_period(void) | ||
1251 | { | ||
1252 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
1253 | } | ||
1254 | |||
1255 | static void sched_avg_update(struct rq *rq) | ||
1256 | { | ||
1257 | s64 period = sched_avg_period(); | ||
1258 | |||
1259 | while ((s64)(rq->clock - rq->age_stamp) > period) { | ||
1260 | rq->age_stamp += period; | ||
1261 | rq->rt_avg /= 2; | ||
1262 | } | ||
1263 | } | ||
1264 | |||
1265 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1266 | { | ||
1267 | rq->rt_avg += rt_delta; | ||
1268 | sched_avg_update(rq); | ||
1269 | } | ||
1270 | |||
1281 | #else /* !CONFIG_SMP */ | 1271 | #else /* !CONFIG_SMP */ |
1282 | static void resched_task(struct task_struct *p) | 1272 | static void resched_task(struct task_struct *p) |
1283 | { | 1273 | { |
1284 | assert_spin_locked(&task_rq(p)->lock); | 1274 | assert_spin_locked(&task_rq(p)->lock); |
1285 | set_tsk_need_resched(p); | 1275 | set_tsk_need_resched(p); |
1286 | } | 1276 | } |
1277 | |||
1278 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1279 | { | ||
1280 | } | ||
1287 | #endif /* CONFIG_SMP */ | 1281 | #endif /* CONFIG_SMP */ |
1288 | 1282 | ||
1289 | #if BITS_PER_LONG == 32 | 1283 | #if BITS_PER_LONG == 32 |
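The hunk above introduces rq->rt_avg: sched_rt_avg_update() accumulates RT runtime into it and sched_avg_update() halves it once for every elapsed half-period (sysctl_sched_time_avg / 2, i.e. 500 ms with the 1 s default), so rt_avg behaves as a geometrically decaying sum of recent RT execution time. The stand-alone user-space sketch below only mirrors that arithmetic for illustration; the toy_ names and main() are invented here and are not part of the patch.

#include <stdint.h>
#include <stdio.h>

#define TOY_AVG_PERIOD_NS (500ULL * 1000 * 1000)	/* sysctl_sched_time_avg / 2 */

struct toy_rq {
	uint64_t clock;		/* current time, ns */
	uint64_t age_stamp;	/* start of the current averaging window */
	uint64_t rt_avg;	/* decaying sum of RT runtime, ns */
};

/* Halve rt_avg once per elapsed half-period, like sched_avg_update(). */
static void toy_sched_avg_update(struct toy_rq *rq)
{
	while ((int64_t)(rq->clock - rq->age_stamp) > (int64_t)TOY_AVG_PERIOD_NS) {
		rq->age_stamp += TOY_AVG_PERIOD_NS;
		rq->rt_avg /= 2;
	}
}

/* Account rt_delta nanoseconds of RT execution, like sched_rt_avg_update(). */
static void toy_sched_rt_avg_update(struct toy_rq *rq, uint64_t rt_delta)
{
	rq->rt_avg += rt_delta;
	toy_sched_avg_update(rq);
}

int main(void)
{
	struct toy_rq rq = { 0, 0, 0 };
	int tick;

	/* 100 ms of RT runtime every 250 ms of wall time: rt_avg converges
	 * instead of growing without bound. */
	for (tick = 1; tick <= 8; tick++) {
		rq.clock += 250ULL * 1000 * 1000;
		toy_sched_rt_avg_update(&rq, 100ULL * 1000 * 1000);
		printf("t=%4llu ms rt_avg=%3llu ms\n",
		       (unsigned long long)(rq.clock / 1000000),
		       (unsigned long long)(rq.rt_avg / 1000000));
	}
	return 0;
}

This rt_avg is what scale_rt_power(), later in the patch, turns into a capacity factor for load balancing.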
@@ -1494,8 +1488,65 @@ static int tg_nop(struct task_group *tg, void *data) | |||
1494 | #endif | 1488 | #endif |
1495 | 1489 | ||
1496 | #ifdef CONFIG_SMP | 1490 | #ifdef CONFIG_SMP |
1497 | static unsigned long source_load(int cpu, int type); | 1491 | /* Used instead of source_load when we know the type == 0 */ |
1498 | static unsigned long target_load(int cpu, int type); | 1492 | static unsigned long weighted_cpuload(const int cpu) |
1493 | { | ||
1494 | return cpu_rq(cpu)->load.weight; | ||
1495 | } | ||
1496 | |||
1497 | /* | ||
1498 | * Return a low guess at the load of a migration-source cpu weighted | ||
1499 | * according to the scheduling class and "nice" value. | ||
1500 | * | ||
1501 | * We want to under-estimate the load of migration sources, to | ||
1502 | * balance conservatively. | ||
1503 | */ | ||
1504 | static unsigned long source_load(int cpu, int type) | ||
1505 | { | ||
1506 | struct rq *rq = cpu_rq(cpu); | ||
1507 | unsigned long total = weighted_cpuload(cpu); | ||
1508 | |||
1509 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1510 | return total; | ||
1511 | |||
1512 | return min(rq->cpu_load[type-1], total); | ||
1513 | } | ||
1514 | |||
1515 | /* | ||
1516 | * Return a high guess at the load of a migration-target cpu weighted | ||
1517 | * according to the scheduling class and "nice" value. | ||
1518 | */ | ||
1519 | static unsigned long target_load(int cpu, int type) | ||
1520 | { | ||
1521 | struct rq *rq = cpu_rq(cpu); | ||
1522 | unsigned long total = weighted_cpuload(cpu); | ||
1523 | |||
1524 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1525 | return total; | ||
1526 | |||
1527 | return max(rq->cpu_load[type-1], total); | ||
1528 | } | ||
1529 | |||
1530 | static struct sched_group *group_of(int cpu) | ||
1531 | { | ||
1532 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | ||
1533 | |||
1534 | if (!sd) | ||
1535 | return NULL; | ||
1536 | |||
1537 | return sd->groups; | ||
1538 | } | ||
1539 | |||
1540 | static unsigned long power_of(int cpu) | ||
1541 | { | ||
1542 | struct sched_group *group = group_of(cpu); | ||
1543 | |||
1544 | if (!group) | ||
1545 | return SCHED_LOAD_SCALE; | ||
1546 | |||
1547 | return group->cpu_power; | ||
1548 | } | ||
1549 | |||
1499 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1550 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1500 | 1551 | ||
1501 | static unsigned long cpu_avg_load_per_task(int cpu) | 1552 | static unsigned long cpu_avg_load_per_task(int cpu) |
@@ -1513,28 +1564,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1513 | 1564 | ||
1514 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1565 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1515 | 1566 | ||
1567 | struct update_shares_data { | ||
1568 | unsigned long rq_weight[NR_CPUS]; | ||
1569 | }; | ||
1570 | |||
1571 | static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); | ||
1572 | |||
1516 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1573 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1517 | 1574 | ||
1518 | /* | 1575 | /* |
1519 | * Calculate and set the cpu's group shares. | 1576 | * Calculate and set the cpu's group shares. |
1520 | */ | 1577 | */ |
1521 | static void | 1578 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
1522 | update_group_shares_cpu(struct task_group *tg, int cpu, | 1579 | unsigned long sd_shares, |
1523 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1580 | unsigned long sd_rq_weight, |
1581 | struct update_shares_data *usd) | ||
1524 | { | 1582 | { |
1525 | unsigned long shares; | 1583 | unsigned long shares, rq_weight; |
1526 | unsigned long rq_weight; | 1584 | int boost = 0; |
1527 | |||
1528 | if (!tg->se[cpu]) | ||
1529 | return; | ||
1530 | 1585 | ||
1531 | rq_weight = tg->cfs_rq[cpu]->rq_weight; | 1586 | rq_weight = usd->rq_weight[cpu]; |
1587 | if (!rq_weight) { | ||
1588 | boost = 1; | ||
1589 | rq_weight = NICE_0_LOAD; | ||
1590 | } | ||
1532 | 1591 | ||
1533 | /* | 1592 | /* |
1534 | * \Sum shares * rq_weight | 1593 | * \Sum_j shares_j * rq_weight_i |
1535 | * shares = ----------------------- | 1594 | * shares_i = ----------------------------- |
1536 | * \Sum rq_weight | 1595 | * \Sum_j rq_weight_j |
1537 | * | ||
1538 | */ | 1596 | */ |
1539 | shares = (sd_shares * rq_weight) / sd_rq_weight; | 1597 | shares = (sd_shares * rq_weight) / sd_rq_weight; |
1540 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | 1598 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); |
@@ -1545,8 +1603,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1545 | unsigned long flags; | 1603 | unsigned long flags; |
1546 | 1604 | ||
1547 | spin_lock_irqsave(&rq->lock, flags); | 1605 | spin_lock_irqsave(&rq->lock, flags); |
1548 | tg->cfs_rq[cpu]->shares = shares; | 1606 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; |
1549 | 1607 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | |
1550 | __set_se_shares(tg->se[cpu], shares); | 1608 | __set_se_shares(tg->se[cpu], shares); |
1551 | spin_unlock_irqrestore(&rq->lock, flags); | 1609 | spin_unlock_irqrestore(&rq->lock, flags); |
1552 | } | 1610 | } |
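To make the formula in the comment above concrete with hypothetical numbers: if tg->shares is 1024 and the group's cfs_rqs on two CPUs in the domain carry rq_weights of 3072 and 1024 (so sd_rq_weight = 4096), update_group_shares_cpu() computes 1024 * 3072 / 4096 = 768 shares for the first CPU and 1024 * 1024 / 4096 = 256 for the second, each then clamped to [MIN_SHARES, MAX_SHARES]. In the new boost case, a CPU whose runqueue weight is zero pretends NICE_0_LOAD while computing its share, but stores zero in cfs_rq->shares and cfs_rq->rq_weight; the computed value is still pushed into the sched_entity via __set_se_shares().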
@@ -1559,22 +1617,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1559 | */ | 1617 | */ |
1560 | static int tg_shares_up(struct task_group *tg, void *data) | 1618 | static int tg_shares_up(struct task_group *tg, void *data) |
1561 | { | 1619 | { |
1562 | unsigned long weight, rq_weight = 0; | 1620 | unsigned long weight, rq_weight = 0, shares = 0; |
1563 | unsigned long shares = 0; | 1621 | struct update_shares_data *usd; |
1564 | struct sched_domain *sd = data; | 1622 | struct sched_domain *sd = data; |
1623 | unsigned long flags; | ||
1565 | int i; | 1624 | int i; |
1566 | 1625 | ||
1626 | if (!tg->se[0]) | ||
1627 | return 0; | ||
1628 | |||
1629 | local_irq_save(flags); | ||
1630 | usd = &__get_cpu_var(update_shares_data); | ||
1631 | |||
1567 | for_each_cpu(i, sched_domain_span(sd)) { | 1632 | for_each_cpu(i, sched_domain_span(sd)) { |
1633 | weight = tg->cfs_rq[i]->load.weight; | ||
1634 | usd->rq_weight[i] = weight; | ||
1635 | |||
1568 | /* | 1636 | /* |
1569 | * If there are currently no tasks on the cpu pretend there | 1637 | * If there are currently no tasks on the cpu pretend there |
1570 | * is one of average load so that when a new task gets to | 1638 | * is one of average load so that when a new task gets to |
1571 | * run here it will not get delayed by group starvation. | 1639 | * run here it will not get delayed by group starvation. |
1572 | */ | 1640 | */ |
1573 | weight = tg->cfs_rq[i]->load.weight; | ||
1574 | if (!weight) | 1641 | if (!weight) |
1575 | weight = NICE_0_LOAD; | 1642 | weight = NICE_0_LOAD; |
1576 | 1643 | ||
1577 | tg->cfs_rq[i]->rq_weight = weight; | ||
1578 | rq_weight += weight; | 1644 | rq_weight += weight; |
1579 | shares += tg->cfs_rq[i]->shares; | 1645 | shares += tg->cfs_rq[i]->shares; |
1580 | } | 1646 | } |
@@ -1586,7 +1652,9 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1586 | shares = tg->shares; | 1652 | shares = tg->shares; |
1587 | 1653 | ||
1588 | for_each_cpu(i, sched_domain_span(sd)) | 1654 | for_each_cpu(i, sched_domain_span(sd)) |
1589 | update_group_shares_cpu(tg, i, shares, rq_weight); | 1655 | update_group_shares_cpu(tg, i, shares, rq_weight, usd); |
1656 | |||
1657 | local_irq_restore(flags); | ||
1590 | 1658 | ||
1591 | return 0; | 1659 | return 0; |
1592 | } | 1660 | } |
@@ -1616,8 +1684,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1616 | 1684 | ||
1617 | static void update_shares(struct sched_domain *sd) | 1685 | static void update_shares(struct sched_domain *sd) |
1618 | { | 1686 | { |
1619 | u64 now = cpu_clock(raw_smp_processor_id()); | 1687 | s64 elapsed; |
1620 | s64 elapsed = now - sd->last_update; | 1688 | u64 now; |
1689 | |||
1690 | if (root_task_group_empty()) | ||
1691 | return; | ||
1692 | |||
1693 | now = cpu_clock(raw_smp_processor_id()); | ||
1694 | elapsed = now - sd->last_update; | ||
1621 | 1695 | ||
1622 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1696 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
1623 | sd->last_update = now; | 1697 | sd->last_update = now; |
@@ -1627,6 +1701,9 @@ static void update_shares(struct sched_domain *sd) | |||
1627 | 1701 | ||
1628 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | 1702 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
1629 | { | 1703 | { |
1704 | if (root_task_group_empty()) | ||
1705 | return; | ||
1706 | |||
1630 | spin_unlock(&rq->lock); | 1707 | spin_unlock(&rq->lock); |
1631 | update_shares(sd); | 1708 | update_shares(sd); |
1632 | spin_lock(&rq->lock); | 1709 | spin_lock(&rq->lock); |
@@ -1634,6 +1711,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1634 | 1711 | ||
1635 | static void update_h_load(long cpu) | 1712 | static void update_h_load(long cpu) |
1636 | { | 1713 | { |
1714 | if (root_task_group_empty()) | ||
1715 | return; | ||
1716 | |||
1637 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1717 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1638 | } | 1718 | } |
1639 | 1719 | ||
@@ -1651,6 +1731,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1651 | 1731 | ||
1652 | #ifdef CONFIG_PREEMPT | 1732 | #ifdef CONFIG_PREEMPT |
1653 | 1733 | ||
1734 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
1735 | |||
1654 | /* | 1736 | /* |
1655 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | 1737 | * fair double_lock_balance: Safely acquires both rq->locks in a fair |
1656 | * way at the expense of forcing extra atomic operations in all | 1738 | * way at the expense of forcing extra atomic operations in all |
@@ -1915,13 +1997,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1915 | } | 1997 | } |
1916 | 1998 | ||
1917 | #ifdef CONFIG_SMP | 1999 | #ifdef CONFIG_SMP |
1918 | |||
1919 | /* Used instead of source_load when we know the type == 0 */ | ||
1920 | static unsigned long weighted_cpuload(const int cpu) | ||
1921 | { | ||
1922 | return cpu_rq(cpu)->load.weight; | ||
1923 | } | ||
1924 | |||
1925 | /* | 2000 | /* |
1926 | * Is this task likely cache-hot: | 2001 | * Is this task likely cache-hot: |
1927 | */ | 2002 | */ |
@@ -1979,7 +2054,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1979 | if (task_hot(p, old_rq->clock, NULL)) | 2054 | if (task_hot(p, old_rq->clock, NULL)) |
1980 | schedstat_inc(p, se.nr_forced2_migrations); | 2055 | schedstat_inc(p, se.nr_forced2_migrations); |
1981 | #endif | 2056 | #endif |
1982 | perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, | 2057 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, |
1983 | 1, 1, NULL, 0); | 2058 | 1, 1, NULL, 0); |
1984 | } | 2059 | } |
1985 | p->se.vruntime -= old_cfsrq->min_vruntime - | 2060 | p->se.vruntime -= old_cfsrq->min_vruntime - |
@@ -2195,186 +2270,6 @@ void kick_process(struct task_struct *p) | |||
2195 | preempt_enable(); | 2270 | preempt_enable(); |
2196 | } | 2271 | } |
2197 | EXPORT_SYMBOL_GPL(kick_process); | 2272 | EXPORT_SYMBOL_GPL(kick_process); |
2198 | |||
2199 | /* | ||
2200 | * Return a low guess at the load of a migration-source cpu weighted | ||
2201 | * according to the scheduling class and "nice" value. | ||
2202 | * | ||
2203 | * We want to under-estimate the load of migration sources, to | ||
2204 | * balance conservatively. | ||
2205 | */ | ||
2206 | static unsigned long source_load(int cpu, int type) | ||
2207 | { | ||
2208 | struct rq *rq = cpu_rq(cpu); | ||
2209 | unsigned long total = weighted_cpuload(cpu); | ||
2210 | |||
2211 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2212 | return total; | ||
2213 | |||
2214 | return min(rq->cpu_load[type-1], total); | ||
2215 | } | ||
2216 | |||
2217 | /* | ||
2218 | * Return a high guess at the load of a migration-target cpu weighted | ||
2219 | * according to the scheduling class and "nice" value. | ||
2220 | */ | ||
2221 | static unsigned long target_load(int cpu, int type) | ||
2222 | { | ||
2223 | struct rq *rq = cpu_rq(cpu); | ||
2224 | unsigned long total = weighted_cpuload(cpu); | ||
2225 | |||
2226 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2227 | return total; | ||
2228 | |||
2229 | return max(rq->cpu_load[type-1], total); | ||
2230 | } | ||
2231 | |||
2232 | /* | ||
2233 | * find_idlest_group finds and returns the least busy CPU group within the | ||
2234 | * domain. | ||
2235 | */ | ||
2236 | static struct sched_group * | ||
2237 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||
2238 | { | ||
2239 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | ||
2240 | unsigned long min_load = ULONG_MAX, this_load = 0; | ||
2241 | int load_idx = sd->forkexec_idx; | ||
2242 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | ||
2243 | |||
2244 | do { | ||
2245 | unsigned long load, avg_load; | ||
2246 | int local_group; | ||
2247 | int i; | ||
2248 | |||
2249 | /* Skip over this group if it has no CPUs allowed */ | ||
2250 | if (!cpumask_intersects(sched_group_cpus(group), | ||
2251 | &p->cpus_allowed)) | ||
2252 | continue; | ||
2253 | |||
2254 | local_group = cpumask_test_cpu(this_cpu, | ||
2255 | sched_group_cpus(group)); | ||
2256 | |||
2257 | /* Tally up the load of all CPUs in the group */ | ||
2258 | avg_load = 0; | ||
2259 | |||
2260 | for_each_cpu(i, sched_group_cpus(group)) { | ||
2261 | /* Bias balancing toward cpus of our domain */ | ||
2262 | if (local_group) | ||
2263 | load = source_load(i, load_idx); | ||
2264 | else | ||
2265 | load = target_load(i, load_idx); | ||
2266 | |||
2267 | avg_load += load; | ||
2268 | } | ||
2269 | |||
2270 | /* Adjust by relative CPU power of the group */ | ||
2271 | avg_load = sg_div_cpu_power(group, | ||
2272 | avg_load * SCHED_LOAD_SCALE); | ||
2273 | |||
2274 | if (local_group) { | ||
2275 | this_load = avg_load; | ||
2276 | this = group; | ||
2277 | } else if (avg_load < min_load) { | ||
2278 | min_load = avg_load; | ||
2279 | idlest = group; | ||
2280 | } | ||
2281 | } while (group = group->next, group != sd->groups); | ||
2282 | |||
2283 | if (!idlest || 100*this_load < imbalance*min_load) | ||
2284 | return NULL; | ||
2285 | return idlest; | ||
2286 | } | ||
2287 | |||
2288 | /* | ||
2289 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | ||
2290 | */ | ||
2291 | static int | ||
2292 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | ||
2293 | { | ||
2294 | unsigned long load, min_load = ULONG_MAX; | ||
2295 | int idlest = -1; | ||
2296 | int i; | ||
2297 | |||
2298 | /* Traverse only the allowed CPUs */ | ||
2299 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | ||
2300 | load = weighted_cpuload(i); | ||
2301 | |||
2302 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
2303 | min_load = load; | ||
2304 | idlest = i; | ||
2305 | } | ||
2306 | } | ||
2307 | |||
2308 | return idlest; | ||
2309 | } | ||
2310 | |||
2311 | /* | ||
2312 | * sched_balance_self: balance the current task (running on cpu) in domains | ||
2313 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | ||
2314 | * SD_BALANCE_EXEC. | ||
2315 | * | ||
2316 | * Balance, ie. select the least loaded group. | ||
2317 | * | ||
2318 | * Returns the target CPU number, or the same CPU if no balancing is needed. | ||
2319 | * | ||
2320 | * preempt must be disabled. | ||
2321 | */ | ||
2322 | static int sched_balance_self(int cpu, int flag) | ||
2323 | { | ||
2324 | struct task_struct *t = current; | ||
2325 | struct sched_domain *tmp, *sd = NULL; | ||
2326 | |||
2327 | for_each_domain(cpu, tmp) { | ||
2328 | /* | ||
2329 | * If power savings logic is enabled for a domain, stop there. | ||
2330 | */ | ||
2331 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
2332 | break; | ||
2333 | if (tmp->flags & flag) | ||
2334 | sd = tmp; | ||
2335 | } | ||
2336 | |||
2337 | if (sd) | ||
2338 | update_shares(sd); | ||
2339 | |||
2340 | while (sd) { | ||
2341 | struct sched_group *group; | ||
2342 | int new_cpu, weight; | ||
2343 | |||
2344 | if (!(sd->flags & flag)) { | ||
2345 | sd = sd->child; | ||
2346 | continue; | ||
2347 | } | ||
2348 | |||
2349 | group = find_idlest_group(sd, t, cpu); | ||
2350 | if (!group) { | ||
2351 | sd = sd->child; | ||
2352 | continue; | ||
2353 | } | ||
2354 | |||
2355 | new_cpu = find_idlest_cpu(group, t, cpu); | ||
2356 | if (new_cpu == -1 || new_cpu == cpu) { | ||
2357 | /* Now try balancing at a lower domain level of cpu */ | ||
2358 | sd = sd->child; | ||
2359 | continue; | ||
2360 | } | ||
2361 | |||
2362 | /* Now try balancing at a lower domain level of new_cpu */ | ||
2363 | cpu = new_cpu; | ||
2364 | weight = cpumask_weight(sched_domain_span(sd)); | ||
2365 | sd = NULL; | ||
2366 | for_each_domain(cpu, tmp) { | ||
2367 | if (weight <= cpumask_weight(sched_domain_span(tmp))) | ||
2368 | break; | ||
2369 | if (tmp->flags & flag) | ||
2370 | sd = tmp; | ||
2371 | } | ||
2372 | /* while loop will break here if sd == NULL */ | ||
2373 | } | ||
2374 | |||
2375 | return cpu; | ||
2376 | } | ||
2377 | |||
2378 | #endif /* CONFIG_SMP */ | 2273 | #endif /* CONFIG_SMP */ |
2379 | 2274 | ||
2380 | /** | 2275 | /** |
@@ -2412,37 +2307,22 @@ void task_oncpu_function_call(struct task_struct *p, | |||
2412 | * | 2307 | * |
2413 | * returns failure only if the task is already active. | 2308 | * returns failure only if the task is already active. |
2414 | */ | 2309 | */ |
2415 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 2310 | static int try_to_wake_up(struct task_struct *p, unsigned int state, |
2311 | int wake_flags) | ||
2416 | { | 2312 | { |
2417 | int cpu, orig_cpu, this_cpu, success = 0; | 2313 | int cpu, orig_cpu, this_cpu, success = 0; |
2418 | unsigned long flags; | 2314 | unsigned long flags; |
2419 | long old_state; | 2315 | struct rq *rq, *orig_rq; |
2420 | struct rq *rq; | ||
2421 | 2316 | ||
2422 | if (!sched_feat(SYNC_WAKEUPS)) | 2317 | if (!sched_feat(SYNC_WAKEUPS)) |
2423 | sync = 0; | 2318 | wake_flags &= ~WF_SYNC; |
2424 | 2319 | ||
2425 | #ifdef CONFIG_SMP | 2320 | this_cpu = get_cpu(); |
2426 | if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { | ||
2427 | struct sched_domain *sd; | ||
2428 | |||
2429 | this_cpu = raw_smp_processor_id(); | ||
2430 | cpu = task_cpu(p); | ||
2431 | |||
2432 | for_each_domain(this_cpu, sd) { | ||
2433 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2434 | update_shares(sd); | ||
2435 | break; | ||
2436 | } | ||
2437 | } | ||
2438 | } | ||
2439 | #endif | ||
2440 | 2321 | ||
2441 | smp_wmb(); | 2322 | smp_wmb(); |
2442 | rq = task_rq_lock(p, &flags); | 2323 | rq = orig_rq = task_rq_lock(p, &flags); |
2443 | update_rq_clock(rq); | 2324 | update_rq_clock(rq); |
2444 | old_state = p->state; | 2325 | if (!(p->state & state)) |
2445 | if (!(old_state & state)) | ||
2446 | goto out; | 2326 | goto out; |
2447 | 2327 | ||
2448 | if (p->se.on_rq) | 2328 | if (p->se.on_rq) |
@@ -2450,27 +2330,33 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2450 | 2330 | ||
2451 | cpu = task_cpu(p); | 2331 | cpu = task_cpu(p); |
2452 | orig_cpu = cpu; | 2332 | orig_cpu = cpu; |
2453 | this_cpu = smp_processor_id(); | ||
2454 | 2333 | ||
2455 | #ifdef CONFIG_SMP | 2334 | #ifdef CONFIG_SMP |
2456 | if (unlikely(task_running(rq, p))) | 2335 | if (unlikely(task_running(rq, p))) |
2457 | goto out_activate; | 2336 | goto out_activate; |
2458 | 2337 | ||
2459 | cpu = p->sched_class->select_task_rq(p, sync); | 2338 | /* |
2460 | if (cpu != orig_cpu) { | 2339 | * In order to handle concurrent wakeups and release the rq->lock |
2340 | * we put the task in TASK_WAKING state. | ||
2341 | * | ||
2342 | * First fix up the nr_uninterruptible count: | ||
2343 | */ | ||
2344 | if (task_contributes_to_load(p)) | ||
2345 | rq->nr_uninterruptible--; | ||
2346 | p->state = TASK_WAKING; | ||
2347 | task_rq_unlock(rq, &flags); | ||
2348 | |||
2349 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | ||
2350 | if (cpu != orig_cpu) | ||
2461 | set_task_cpu(p, cpu); | 2351 | set_task_cpu(p, cpu); |
2462 | task_rq_unlock(rq, &flags); | ||
2463 | /* might preempt at this point */ | ||
2464 | rq = task_rq_lock(p, &flags); | ||
2465 | old_state = p->state; | ||
2466 | if (!(old_state & state)) | ||
2467 | goto out; | ||
2468 | if (p->se.on_rq) | ||
2469 | goto out_running; | ||
2470 | 2352 | ||
2471 | this_cpu = smp_processor_id(); | 2353 | rq = task_rq_lock(p, &flags); |
2472 | cpu = task_cpu(p); | 2354 | |
2473 | } | 2355 | if (rq != orig_rq) |
2356 | update_rq_clock(rq); | ||
2357 | |||
2358 | WARN_ON(p->state != TASK_WAKING); | ||
2359 | cpu = task_cpu(p); | ||
2474 | 2360 | ||
2475 | #ifdef CONFIG_SCHEDSTATS | 2361 | #ifdef CONFIG_SCHEDSTATS |
2476 | schedstat_inc(rq, ttwu_count); | 2362 | schedstat_inc(rq, ttwu_count); |
@@ -2490,7 +2376,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2490 | out_activate: | 2376 | out_activate: |
2491 | #endif /* CONFIG_SMP */ | 2377 | #endif /* CONFIG_SMP */ |
2492 | schedstat_inc(p, se.nr_wakeups); | 2378 | schedstat_inc(p, se.nr_wakeups); |
2493 | if (sync) | 2379 | if (wake_flags & WF_SYNC) |
2494 | schedstat_inc(p, se.nr_wakeups_sync); | 2380 | schedstat_inc(p, se.nr_wakeups_sync); |
2495 | if (orig_cpu != cpu) | 2381 | if (orig_cpu != cpu) |
2496 | schedstat_inc(p, se.nr_wakeups_migrate); | 2382 | schedstat_inc(p, se.nr_wakeups_migrate); |
@@ -2519,7 +2405,7 @@ out_activate: | |||
2519 | 2405 | ||
2520 | out_running: | 2406 | out_running: |
2521 | trace_sched_wakeup(rq, p, success); | 2407 | trace_sched_wakeup(rq, p, success); |
2522 | check_preempt_curr(rq, p, sync); | 2408 | check_preempt_curr(rq, p, wake_flags); |
2523 | 2409 | ||
2524 | p->state = TASK_RUNNING; | 2410 | p->state = TASK_RUNNING; |
2525 | #ifdef CONFIG_SMP | 2411 | #ifdef CONFIG_SMP |
@@ -2528,6 +2414,7 @@ out_running: | |||
2528 | #endif | 2414 | #endif |
2529 | out: | 2415 | out: |
2530 | task_rq_unlock(rq, &flags); | 2416 | task_rq_unlock(rq, &flags); |
2417 | put_cpu(); | ||
2531 | 2418 | ||
2532 | return success; | 2419 | return success; |
2533 | } | 2420 | } |
@@ -2570,6 +2457,7 @@ static void __sched_fork(struct task_struct *p) | |||
2570 | p->se.avg_overlap = 0; | 2457 | p->se.avg_overlap = 0; |
2571 | p->se.start_runtime = 0; | 2458 | p->se.start_runtime = 0; |
2572 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; | 2459 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; |
2460 | p->se.avg_running = 0; | ||
2573 | 2461 | ||
2574 | #ifdef CONFIG_SCHEDSTATS | 2462 | #ifdef CONFIG_SCHEDSTATS |
2575 | p->se.wait_start = 0; | 2463 | p->se.wait_start = 0; |
@@ -2631,18 +2519,41 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2631 | 2519 | ||
2632 | __sched_fork(p); | 2520 | __sched_fork(p); |
2633 | 2521 | ||
2634 | #ifdef CONFIG_SMP | 2522 | /* |
2635 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | 2523 | * Revert to default priority/policy on fork if requested. |
2636 | #endif | 2524 | */ |
2637 | set_task_cpu(p, cpu); | 2525 | if (unlikely(p->sched_reset_on_fork)) { |
2526 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { | ||
2527 | p->policy = SCHED_NORMAL; | ||
2528 | p->normal_prio = p->static_prio; | ||
2529 | } | ||
2530 | |||
2531 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2532 | p->static_prio = NICE_TO_PRIO(0); | ||
2533 | p->normal_prio = p->static_prio; | ||
2534 | set_load_weight(p); | ||
2535 | } | ||
2536 | |||
2537 | /* | ||
2538 | * We don't need the reset flag anymore after the fork. It has | ||
2539 | * fulfilled its duty: | ||
2540 | */ | ||
2541 | p->sched_reset_on_fork = 0; | ||
2542 | } | ||
2638 | 2543 | ||
2639 | /* | 2544 | /* |
2640 | * Make sure we do not leak PI boosting priority to the child: | 2545 | * Make sure we do not leak PI boosting priority to the child. |
2641 | */ | 2546 | */ |
2642 | p->prio = current->normal_prio; | 2547 | p->prio = current->normal_prio; |
2548 | |||
2643 | if (!rt_prio(p->prio)) | 2549 | if (!rt_prio(p->prio)) |
2644 | p->sched_class = &fair_sched_class; | 2550 | p->sched_class = &fair_sched_class; |
2645 | 2551 | ||
2552 | #ifdef CONFIG_SMP | ||
2553 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); | ||
2554 | #endif | ||
2555 | set_task_cpu(p, cpu); | ||
2556 | |||
2646 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2557 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2647 | if (likely(sched_info_on())) | 2558 | if (likely(sched_info_on())) |
2648 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2559 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
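The sched_reset_on_fork handling added above is driven from user space by the SCHED_RESET_ON_FORK flag to sched_setscheduler(). The program below is a minimal illustrative sketch, not part of the patch: the fallback #define mirrors the kernel's flag value for libc headers that predate it, and switching to SCHED_FIFO needs the appropriate privilege. The parent becomes SCHED_FIFO, yet the child forked afterwards reports the default policy because sched_fork() reverted it.

#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/wait.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK	0x40000000	/* assumed to match the kernel's value */
#endif

int main(void)
{
	struct sched_param param = { .sched_priority = 10 };

	/* Become SCHED_FIFO, but ask the kernel to reset children to SCHED_NORMAL. */
	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &param) != 0)
		perror("sched_setscheduler");

	if (fork() == 0) {
		/* 0 == SCHED_OTHER/SCHED_NORMAL if the reset took effect. */
		printf("child policy: %d\n", sched_getscheduler(0));
		_exit(0);
	}
	wait(NULL);
	return 0;
}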
@@ -2675,8 +2586,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2675 | BUG_ON(p->state != TASK_RUNNING); | 2586 | BUG_ON(p->state != TASK_RUNNING); |
2676 | update_rq_clock(rq); | 2587 | update_rq_clock(rq); |
2677 | 2588 | ||
2678 | p->prio = effective_prio(p); | ||
2679 | |||
2680 | if (!p->sched_class->task_new || !current->se.on_rq) { | 2589 | if (!p->sched_class->task_new || !current->se.on_rq) { |
2681 | activate_task(rq, p, 0); | 2590 | activate_task(rq, p, 0); |
2682 | } else { | 2591 | } else { |
@@ -2688,7 +2597,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2688 | inc_nr_running(rq); | 2597 | inc_nr_running(rq); |
2689 | } | 2598 | } |
2690 | trace_sched_wakeup_new(rq, p, 1); | 2599 | trace_sched_wakeup_new(rq, p, 1); |
2691 | check_preempt_curr(rq, p, 0); | 2600 | check_preempt_curr(rq, p, WF_FORK); |
2692 | #ifdef CONFIG_SMP | 2601 | #ifdef CONFIG_SMP |
2693 | if (p->sched_class->task_wake_up) | 2602 | if (p->sched_class->task_wake_up) |
2694 | p->sched_class->task_wake_up(rq, p); | 2603 | p->sched_class->task_wake_up(rq, p); |
@@ -2796,12 +2705,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2796 | { | 2705 | { |
2797 | struct mm_struct *mm = rq->prev_mm; | 2706 | struct mm_struct *mm = rq->prev_mm; |
2798 | long prev_state; | 2707 | long prev_state; |
2799 | #ifdef CONFIG_SMP | ||
2800 | int post_schedule = 0; | ||
2801 | |||
2802 | if (current->sched_class->needs_post_schedule) | ||
2803 | post_schedule = current->sched_class->needs_post_schedule(rq); | ||
2804 | #endif | ||
2805 | 2708 | ||
2806 | rq->prev_mm = NULL; | 2709 | rq->prev_mm = NULL; |
2807 | 2710 | ||
@@ -2818,12 +2721,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2818 | */ | 2721 | */ |
2819 | prev_state = prev->state; | 2722 | prev_state = prev->state; |
2820 | finish_arch_switch(prev); | 2723 | finish_arch_switch(prev); |
2821 | perf_counter_task_sched_in(current, cpu_of(rq)); | 2724 | perf_event_task_sched_in(current, cpu_of(rq)); |
2822 | finish_lock_switch(rq, prev); | 2725 | finish_lock_switch(rq, prev); |
2823 | #ifdef CONFIG_SMP | ||
2824 | if (post_schedule) | ||
2825 | current->sched_class->post_schedule(rq); | ||
2826 | #endif | ||
2827 | 2726 | ||
2828 | fire_sched_in_preempt_notifiers(current); | 2727 | fire_sched_in_preempt_notifiers(current); |
2829 | if (mm) | 2728 | if (mm) |
@@ -2838,6 +2737,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2838 | } | 2737 | } |
2839 | } | 2738 | } |
2840 | 2739 | ||
2740 | #ifdef CONFIG_SMP | ||
2741 | |||
2742 | /* assumes rq->lock is held */ | ||
2743 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | ||
2744 | { | ||
2745 | if (prev->sched_class->pre_schedule) | ||
2746 | prev->sched_class->pre_schedule(rq, prev); | ||
2747 | } | ||
2748 | |||
2749 | /* rq->lock is NOT held, but preemption is disabled */ | ||
2750 | static inline void post_schedule(struct rq *rq) | ||
2751 | { | ||
2752 | if (rq->post_schedule) { | ||
2753 | unsigned long flags; | ||
2754 | |||
2755 | spin_lock_irqsave(&rq->lock, flags); | ||
2756 | if (rq->curr->sched_class->post_schedule) | ||
2757 | rq->curr->sched_class->post_schedule(rq); | ||
2758 | spin_unlock_irqrestore(&rq->lock, flags); | ||
2759 | |||
2760 | rq->post_schedule = 0; | ||
2761 | } | ||
2762 | } | ||
2763 | |||
2764 | #else | ||
2765 | |||
2766 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | ||
2767 | { | ||
2768 | } | ||
2769 | |||
2770 | static inline void post_schedule(struct rq *rq) | ||
2771 | { | ||
2772 | } | ||
2773 | |||
2774 | #endif | ||
2775 | |||
2841 | /** | 2776 | /** |
2842 | * schedule_tail - first thing a freshly forked thread must call. | 2777 | * schedule_tail - first thing a freshly forked thread must call. |
2843 | * @prev: the thread we just switched away from. | 2778 | * @prev: the thread we just switched away from. |
@@ -2848,6 +2783,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
2848 | struct rq *rq = this_rq(); | 2783 | struct rq *rq = this_rq(); |
2849 | 2784 | ||
2850 | finish_task_switch(rq, prev); | 2785 | finish_task_switch(rq, prev); |
2786 | |||
2787 | /* | ||
2788 | * FIXME: do we need to worry about rq being invalidated by the | ||
2789 | * task_switch? | ||
2790 | */ | ||
2791 | post_schedule(rq); | ||
2792 | |||
2851 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 2793 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
2852 | /* In this case, finish_task_switch does not reenable preemption */ | 2794 | /* In this case, finish_task_switch does not reenable preemption */ |
2853 | preempt_enable(); | 2795 | preempt_enable(); |
@@ -2965,6 +2907,19 @@ unsigned long nr_iowait(void) | |||
2965 | return sum; | 2907 | return sum; |
2966 | } | 2908 | } |
2967 | 2909 | ||
2910 | unsigned long nr_iowait_cpu(void) | ||
2911 | { | ||
2912 | struct rq *this = this_rq(); | ||
2913 | return atomic_read(&this->nr_iowait); | ||
2914 | } | ||
2915 | |||
2916 | unsigned long this_cpu_load(void) | ||
2917 | { | ||
2918 | struct rq *this = this_rq(); | ||
2919 | return this->cpu_load[0]; | ||
2920 | } | ||
2921 | |||
2922 | |||
2968 | /* Variables and functions for calc_load */ | 2923 | /* Variables and functions for calc_load */ |
2969 | static atomic_long_t calc_load_tasks; | 2924 | static atomic_long_t calc_load_tasks; |
2970 | static unsigned long calc_load_update; | 2925 | static unsigned long calc_load_update; |
@@ -3164,7 +3119,7 @@ out: | |||
3164 | void sched_exec(void) | 3119 | void sched_exec(void) |
3165 | { | 3120 | { |
3166 | int new_cpu, this_cpu = get_cpu(); | 3121 | int new_cpu, this_cpu = get_cpu(); |
3167 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | 3122 | new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); |
3168 | put_cpu(); | 3123 | put_cpu(); |
3169 | if (new_cpu != this_cpu) | 3124 | if (new_cpu != this_cpu) |
3170 | sched_migrate_task(current, new_cpu); | 3125 | sched_migrate_task(current, new_cpu); |
@@ -3379,9 +3334,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3379 | { | 3334 | { |
3380 | const struct sched_class *class; | 3335 | const struct sched_class *class; |
3381 | 3336 | ||
3382 | for (class = sched_class_highest; class; class = class->next) | 3337 | for_each_class(class) { |
3383 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | 3338 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) |
3384 | return 1; | 3339 | return 1; |
3340 | } | ||
3385 | 3341 | ||
3386 | return 0; | 3342 | return 0; |
3387 | } | 3343 | } |
@@ -3544,7 +3500,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, | |||
3544 | * capacity but still has some space to pick up some load | 3500 | * capacity but still has some space to pick up some load |
3545 | * from other group and save more power | 3501 | * from other group and save more power |
3546 | */ | 3502 | */ |
3547 | if (sgs->sum_nr_running > sgs->group_capacity - 1) | 3503 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) |
3548 | return; | 3504 | return; |
3549 | 3505 | ||
3550 | if (sgs->sum_nr_running > sds->leader_nr_running || | 3506 | if (sgs->sum_nr_running > sds->leader_nr_running || |
@@ -3583,11 +3539,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3583 | *imbalance = sds->min_load_per_task; | 3539 | *imbalance = sds->min_load_per_task; |
3584 | sds->busiest = sds->group_min; | 3540 | sds->busiest = sds->group_min; |
3585 | 3541 | ||
3586 | if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { | ||
3587 | cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = | ||
3588 | group_first_cpu(sds->group_leader); | ||
3589 | } | ||
3590 | |||
3591 | return 1; | 3542 | return 1; |
3592 | 3543 | ||
3593 | } | 3544 | } |
@@ -3612,8 +3563,105 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3612 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 3563 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
3613 | 3564 | ||
3614 | 3565 | ||
3566 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3567 | { | ||
3568 | return SCHED_LOAD_SCALE; | ||
3569 | } | ||
3570 | |||
3571 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3572 | { | ||
3573 | return default_scale_freq_power(sd, cpu); | ||
3574 | } | ||
3575 | |||
3576 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3577 | { | ||
3578 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3579 | unsigned long smt_gain = sd->smt_gain; | ||
3580 | |||
3581 | smt_gain /= weight; | ||
3582 | |||
3583 | return smt_gain; | ||
3584 | } | ||
3585 | |||
3586 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3587 | { | ||
3588 | return default_scale_smt_power(sd, cpu); | ||
3589 | } | ||
3590 | |||
3591 | unsigned long scale_rt_power(int cpu) | ||
3592 | { | ||
3593 | struct rq *rq = cpu_rq(cpu); | ||
3594 | u64 total, available; | ||
3595 | |||
3596 | sched_avg_update(rq); | ||
3597 | |||
3598 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
3599 | available = total - rq->rt_avg; | ||
3600 | |||
3601 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
3602 | total = SCHED_LOAD_SCALE; | ||
3603 | |||
3604 | total >>= SCHED_LOAD_SHIFT; | ||
3605 | |||
3606 | return div_u64(available, total); | ||
3607 | } | ||
3608 | |||
3609 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
3610 | { | ||
3611 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3612 | unsigned long power = SCHED_LOAD_SCALE; | ||
3613 | struct sched_group *sdg = sd->groups; | ||
3614 | |||
3615 | if (sched_feat(ARCH_POWER)) | ||
3616 | power *= arch_scale_freq_power(sd, cpu); | ||
3617 | else | ||
3618 | power *= default_scale_freq_power(sd, cpu); | ||
3619 | |||
3620 | power >>= SCHED_LOAD_SHIFT; | ||
3621 | |||
3622 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
3623 | if (sched_feat(ARCH_POWER)) | ||
3624 | power *= arch_scale_smt_power(sd, cpu); | ||
3625 | else | ||
3626 | power *= default_scale_smt_power(sd, cpu); | ||
3627 | |||
3628 | power >>= SCHED_LOAD_SHIFT; | ||
3629 | } | ||
3630 | |||
3631 | power *= scale_rt_power(cpu); | ||
3632 | power >>= SCHED_LOAD_SHIFT; | ||
3633 | |||
3634 | if (!power) | ||
3635 | power = 1; | ||
3636 | |||
3637 | sdg->cpu_power = power; | ||
3638 | } | ||
3639 | |||
3640 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
3641 | { | ||
3642 | struct sched_domain *child = sd->child; | ||
3643 | struct sched_group *group, *sdg = sd->groups; | ||
3644 | unsigned long power; | ||
3645 | |||
3646 | if (!child) { | ||
3647 | update_cpu_power(sd, cpu); | ||
3648 | return; | ||
3649 | } | ||
3650 | |||
3651 | power = 0; | ||
3652 | |||
3653 | group = child->groups; | ||
3654 | do { | ||
3655 | power += group->cpu_power; | ||
3656 | group = group->next; | ||
3657 | } while (group != child->groups); | ||
3658 | |||
3659 | sdg->cpu_power = power; | ||
3660 | } | ||
3661 | |||
3615 | /** | 3662 | /** |
3616 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3663 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
3664 | * @sd: The sched_domain whose statistics are to be updated. | ||
3617 | * @group: sched_group whose statistics are to be updated. | 3665 | * @group: sched_group whose statistics are to be updated. |
3618 | * @this_cpu: Cpu for which load balance is currently performed. | 3666 | * @this_cpu: Cpu for which load balance is currently performed. |
3619 | * @idle: Idle status of this_cpu | 3667 | * @idle: Idle status of this_cpu |
@@ -3624,7 +3672,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3624 | * @balance: Should we balance. | 3672 | * @balance: Should we balance. |
3625 | * @sgs: variable to hold the statistics for this group. | 3673 | * @sgs: variable to hold the statistics for this group. |
3626 | */ | 3674 | */ |
3627 | static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | 3675 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
3676 | struct sched_group *group, int this_cpu, | ||
3628 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 3677 | enum cpu_idle_type idle, int load_idx, int *sd_idle, |
3629 | int local_group, const struct cpumask *cpus, | 3678 | int local_group, const struct cpumask *cpus, |
3630 | int *balance, struct sg_lb_stats *sgs) | 3679 | int *balance, struct sg_lb_stats *sgs) |
@@ -3635,8 +3684,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3635 | unsigned long sum_avg_load_per_task; | 3684 | unsigned long sum_avg_load_per_task; |
3636 | unsigned long avg_load_per_task; | 3685 | unsigned long avg_load_per_task; |
3637 | 3686 | ||
3638 | if (local_group) | 3687 | if (local_group) { |
3639 | balance_cpu = group_first_cpu(group); | 3688 | balance_cpu = group_first_cpu(group); |
3689 | if (balance_cpu == this_cpu) | ||
3690 | update_group_power(sd, this_cpu); | ||
3691 | } | ||
3640 | 3692 | ||
3641 | /* Tally up the load of all CPUs in the group */ | 3693 | /* Tally up the load of all CPUs in the group */ |
3642 | sum_avg_load_per_task = avg_load_per_task = 0; | 3694 | sum_avg_load_per_task = avg_load_per_task = 0; |
@@ -3685,8 +3737,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3685 | } | 3737 | } |
3686 | 3738 | ||
3687 | /* Adjust by relative CPU power of the group */ | 3739 | /* Adjust by relative CPU power of the group */ |
3688 | sgs->avg_load = sg_div_cpu_power(group, | 3740 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
3689 | sgs->group_load * SCHED_LOAD_SCALE); | ||
3690 | 3741 | ||
3691 | 3742 | ||
3692 | /* | 3743 | /* |
@@ -3698,14 +3749,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3698 | * normalized nr_running number somewhere that negates | 3749 | * normalized nr_running number somewhere that negates |
3699 | * the hierarchy? | 3750 | * the hierarchy? |
3700 | */ | 3751 | */ |
3701 | avg_load_per_task = sg_div_cpu_power(group, | 3752 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / |
3702 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | 3753 | group->cpu_power; |
3703 | 3754 | ||
3704 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 3755 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) |
3705 | sgs->group_imb = 1; | 3756 | sgs->group_imb = 1; |
3706 | 3757 | ||
3707 | sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3758 | sgs->group_capacity = |
3708 | 3759 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | |
3709 | } | 3760 | } |
3710 | 3761 | ||
3711 | /** | 3762 | /** |
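Reading update_cpu_power() above as plain arithmetic: cpu_power starts from SCHED_LOAD_SCALE (1024) and is multiplied in turn by the frequency factor, the SMT factor and scale_rt_power(), with a >> SCHED_LOAD_SHIFT after each step. As a purely illustrative example, assume the default frequency factor of 1024, a 2-thread SMT domain with smt_gain = 1178 (so default_scale_smt_power() yields 1178 / 2 = 589), and an rt_avg amounting to a quarter of the averaged window, for which scale_rt_power() returns about 3/4 * 1024 = 768. Then cpu_power = ((1024 * 1024 >> 10) * 589 >> 10) * 768 >> 10 ≈ 441, and DIV_ROUND_CLOSEST(441, 1024) gives a group_capacity of 0; the capacity check added to find_busiest_queue() further down keeps such a CPU from being exempted by the single-task shortcut.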
@@ -3723,9 +3774,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3723 | const struct cpumask *cpus, int *balance, | 3774 | const struct cpumask *cpus, int *balance, |
3724 | struct sd_lb_stats *sds) | 3775 | struct sd_lb_stats *sds) |
3725 | { | 3776 | { |
3777 | struct sched_domain *child = sd->child; | ||
3726 | struct sched_group *group = sd->groups; | 3778 | struct sched_group *group = sd->groups; |
3727 | struct sg_lb_stats sgs; | 3779 | struct sg_lb_stats sgs; |
3728 | int load_idx; | 3780 | int load_idx, prefer_sibling = 0; |
3781 | |||
3782 | if (child && child->flags & SD_PREFER_SIBLING) | ||
3783 | prefer_sibling = 1; | ||
3729 | 3784 | ||
3730 | init_sd_power_savings_stats(sd, sds, idle); | 3785 | init_sd_power_savings_stats(sd, sds, idle); |
3731 | load_idx = get_sd_load_idx(sd, idle); | 3786 | load_idx = get_sd_load_idx(sd, idle); |
@@ -3736,14 +3791,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3736 | local_group = cpumask_test_cpu(this_cpu, | 3791 | local_group = cpumask_test_cpu(this_cpu, |
3737 | sched_group_cpus(group)); | 3792 | sched_group_cpus(group)); |
3738 | memset(&sgs, 0, sizeof(sgs)); | 3793 | memset(&sgs, 0, sizeof(sgs)); |
3739 | update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, | 3794 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, |
3740 | local_group, cpus, balance, &sgs); | 3795 | local_group, cpus, balance, &sgs); |
3741 | 3796 | ||
3742 | if (local_group && balance && !(*balance)) | 3797 | if (local_group && balance && !(*balance)) |
3743 | return; | 3798 | return; |
3744 | 3799 | ||
3745 | sds->total_load += sgs.group_load; | 3800 | sds->total_load += sgs.group_load; |
3746 | sds->total_pwr += group->__cpu_power; | 3801 | sds->total_pwr += group->cpu_power; |
3802 | |||
3803 | /* | ||
3804 | * In case the child domain prefers tasks go to siblings | ||
3805 | * first, lower the group capacity to one so that we'll try | ||
3806 | * and move all the excess tasks away. | ||
3807 | */ | ||
3808 | if (prefer_sibling) | ||
3809 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
3747 | 3810 | ||
3748 | if (local_group) { | 3811 | if (local_group) { |
3749 | sds->this_load = sgs.avg_load; | 3812 | sds->this_load = sgs.avg_load; |
@@ -3763,7 +3826,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3763 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 3826 | update_sd_power_savings_stats(group, sds, local_group, &sgs); |
3764 | group = group->next; | 3827 | group = group->next; |
3765 | } while (group != sd->groups); | 3828 | } while (group != sd->groups); |
3766 | |||
3767 | } | 3829 | } |
3768 | 3830 | ||
3769 | /** | 3831 | /** |
@@ -3801,28 +3863,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
3801 | * moving them. | 3863 | * moving them. |
3802 | */ | 3864 | */ |
3803 | 3865 | ||
3804 | pwr_now += sds->busiest->__cpu_power * | 3866 | pwr_now += sds->busiest->cpu_power * |
3805 | min(sds->busiest_load_per_task, sds->max_load); | 3867 | min(sds->busiest_load_per_task, sds->max_load); |
3806 | pwr_now += sds->this->__cpu_power * | 3868 | pwr_now += sds->this->cpu_power * |
3807 | min(sds->this_load_per_task, sds->this_load); | 3869 | min(sds->this_load_per_task, sds->this_load); |
3808 | pwr_now /= SCHED_LOAD_SCALE; | 3870 | pwr_now /= SCHED_LOAD_SCALE; |
3809 | 3871 | ||
3810 | /* Amount of load we'd subtract */ | 3872 | /* Amount of load we'd subtract */ |
3811 | tmp = sg_div_cpu_power(sds->busiest, | 3873 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
3812 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3874 | sds->busiest->cpu_power; |
3813 | if (sds->max_load > tmp) | 3875 | if (sds->max_load > tmp) |
3814 | pwr_move += sds->busiest->__cpu_power * | 3876 | pwr_move += sds->busiest->cpu_power * |
3815 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 3877 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
3816 | 3878 | ||
3817 | /* Amount of load we'd add */ | 3879 | /* Amount of load we'd add */ |
3818 | if (sds->max_load * sds->busiest->__cpu_power < | 3880 | if (sds->max_load * sds->busiest->cpu_power < |
3819 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 3881 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) |
3820 | tmp = sg_div_cpu_power(sds->this, | 3882 | tmp = (sds->max_load * sds->busiest->cpu_power) / |
3821 | sds->max_load * sds->busiest->__cpu_power); | 3883 | sds->this->cpu_power; |
3822 | else | 3884 | else |
3823 | tmp = sg_div_cpu_power(sds->this, | 3885 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
3824 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3886 | sds->this->cpu_power; |
3825 | pwr_move += sds->this->__cpu_power * | 3887 | pwr_move += sds->this->cpu_power * |
3826 | min(sds->this_load_per_task, sds->this_load + tmp); | 3888 | min(sds->this_load_per_task, sds->this_load + tmp); |
3827 | pwr_move /= SCHED_LOAD_SCALE; | 3889 | pwr_move /= SCHED_LOAD_SCALE; |
3828 | 3890 | ||
@@ -3857,8 +3919,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3857 | sds->max_load - sds->busiest_load_per_task); | 3919 | sds->max_load - sds->busiest_load_per_task); |
3858 | 3920 | ||
3859 | /* How much load to actually move to equalise the imbalance */ | 3921 | /* How much load to actually move to equalise the imbalance */ |
3860 | *imbalance = min(max_pull * sds->busiest->__cpu_power, | 3922 | *imbalance = min(max_pull * sds->busiest->cpu_power, |
3861 | (sds->avg_load - sds->this_load) * sds->this->__cpu_power) | 3923 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) |
3862 | / SCHED_LOAD_SCALE; | 3924 | / SCHED_LOAD_SCALE; |
3863 | 3925 | ||
3864 | /* | 3926 | /* |
@@ -3988,15 +4050,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
3988 | int i; | 4050 | int i; |
3989 | 4051 | ||
3990 | for_each_cpu(i, sched_group_cpus(group)) { | 4052 | for_each_cpu(i, sched_group_cpus(group)) { |
4053 | unsigned long power = power_of(i); | ||
4054 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
3991 | unsigned long wl; | 4055 | unsigned long wl; |
3992 | 4056 | ||
3993 | if (!cpumask_test_cpu(i, cpus)) | 4057 | if (!cpumask_test_cpu(i, cpus)) |
3994 | continue; | 4058 | continue; |
3995 | 4059 | ||
3996 | rq = cpu_rq(i); | 4060 | rq = cpu_rq(i); |
3997 | wl = weighted_cpuload(i); | 4061 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; |
4062 | wl /= power; | ||
3998 | 4063 | ||
3999 | if (rq->nr_running == 1 && wl > imbalance) | 4064 | if (capacity && rq->nr_running == 1 && wl > imbalance) |
4000 | continue; | 4065 | continue; |
4001 | 4066 | ||
4002 | if (wl > max_load) { | 4067 | if (wl > max_load) { |
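find_busiest_queue() now compares runqueues in power-normalised units as well. For example, an SMT sibling whose power_of() is about 589 gets capacity = DIV_ROUND_CLOSEST(589, 1024) = 1, and a runqueue there with weighted_cpuload() = 400 reports wl = 400 * 1024 / 589, roughly 695, so a nominally lighter queue on a weak CPU can now look busier than a heavier queue on a full-power CPU. The added capacity check means the "single task already above the imbalance" skip no longer applies to CPUs whose power has dropped below half a nominal CPU (capacity rounds to 0 there), since their lone task may still be worth pulling. The 589 figure is only illustrative; see the SMT power example later in this section.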
@@ -5031,17 +5096,16 @@ void account_idle_time(cputime_t cputime) | |||
5031 | */ | 5096 | */ |
5032 | void account_process_tick(struct task_struct *p, int user_tick) | 5097 | void account_process_tick(struct task_struct *p, int user_tick) |
5033 | { | 5098 | { |
5034 | cputime_t one_jiffy = jiffies_to_cputime(1); | 5099 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
5035 | cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); | ||
5036 | struct rq *rq = this_rq(); | 5100 | struct rq *rq = this_rq(); |
5037 | 5101 | ||
5038 | if (user_tick) | 5102 | if (user_tick) |
5039 | account_user_time(p, one_jiffy, one_jiffy_scaled); | 5103 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
5040 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 5104 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
5041 | account_system_time(p, HARDIRQ_OFFSET, one_jiffy, | 5105 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, |
5042 | one_jiffy_scaled); | 5106 | one_jiffy_scaled); |
5043 | else | 5107 | else |
5044 | account_idle_time(one_jiffy); | 5108 | account_idle_time(cputime_one_jiffy); |
5045 | } | 5109 | } |
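account_process_tick() now uses cputime_one_jiffy instead of converting jiffies_to_cputime(1) on every timer tick. The definition is outside this hunk; presumably it is a precomputed constant, along the lines of:

    /* Assumption for illustration: the generic case can be a plain macro. */
    #define cputime_one_jiffy       jiffies_to_cputime(1)

with architectures whose conversion is not compile-time constant caching the value once at timer setup instead.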
5046 | 5110 | ||
5047 | /* | 5111 | /* |
@@ -5145,7 +5209,7 @@ void scheduler_tick(void) | |||
5145 | curr->sched_class->task_tick(rq, curr, 0); | 5209 | curr->sched_class->task_tick(rq, curr, 0); |
5146 | spin_unlock(&rq->lock); | 5210 | spin_unlock(&rq->lock); |
5147 | 5211 | ||
5148 | perf_counter_task_tick(curr, cpu); | 5212 | perf_event_task_tick(curr, cpu); |
5149 | 5213 | ||
5150 | #ifdef CONFIG_SMP | 5214 | #ifdef CONFIG_SMP |
5151 | rq->idle_at_tick = idle_cpu(cpu); | 5215 | rq->idle_at_tick = idle_cpu(cpu); |
@@ -5257,14 +5321,13 @@ static inline void schedule_debug(struct task_struct *prev) | |||
5257 | #endif | 5321 | #endif |
5258 | } | 5322 | } |
5259 | 5323 | ||
5260 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 5324 | static void put_prev_task(struct rq *rq, struct task_struct *p) |
5261 | { | 5325 | { |
5262 | if (prev->state == TASK_RUNNING) { | 5326 | u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; |
5263 | u64 runtime = prev->se.sum_exec_runtime; | ||
5264 | 5327 | ||
5265 | runtime -= prev->se.prev_sum_exec_runtime; | 5328 | update_avg(&p->se.avg_running, runtime); |
5266 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); | ||
5267 | 5329 | ||
5330 | if (p->state == TASK_RUNNING) { | ||
5268 | /* | 5331 | /* |
5269 | * In order to avoid avg_overlap growing stale when we are | 5332 | * In order to avoid avg_overlap growing stale when we are |
5270 | * indeed overlapping and hence not getting put to sleep, grow | 5333 | * indeed overlapping and hence not getting put to sleep, grow |
@@ -5274,9 +5337,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
5274 | * correlates to the amount of cache footprint a task can | 5337 | * correlates to the amount of cache footprint a task can |
5275 | * build up. | 5338 | * build up. |
5276 | */ | 5339 | */ |
5277 | update_avg(&prev->se.avg_overlap, runtime); | 5340 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); |
5341 | update_avg(&p->se.avg_overlap, runtime); | ||
5342 | } else { | ||
5343 | update_avg(&p->se.avg_running, 0); | ||
5278 | } | 5344 | } |
5279 | prev->sched_class->put_prev_task(rq, prev); | 5345 | p->sched_class->put_prev_task(rq, p); |
5280 | } | 5346 | } |
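put_prev_task() now maintains two averages: avg_running has the raw runtime folded in on every deschedule and is additionally decayed toward zero when the task is going to sleep, while avg_overlap still only accumulates the clamped runtime for tasks preempted while TASK_RUNNING. update_avg() itself is not shown in this hunk; elsewhere in sched.c it is, assuming this patch leaves it unchanged, a simple 1/8-weight running average:

    static void update_avg(u64 *avg, u64 sample)
    {
            s64 diff = sample - *avg;       /* signed: the sample may be below the average */
            *avg += diff >> 3;              /* move 1/8 of the way toward the new sample */
    }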
5281 | 5347 | ||
5282 | /* | 5348 | /* |
@@ -5325,7 +5391,7 @@ need_resched: | |||
5325 | preempt_disable(); | 5391 | preempt_disable(); |
5326 | cpu = smp_processor_id(); | 5392 | cpu = smp_processor_id(); |
5327 | rq = cpu_rq(cpu); | 5393 | rq = cpu_rq(cpu); |
5328 | rcu_qsctr_inc(cpu); | 5394 | rcu_sched_qs(cpu); |
5329 | prev = rq->curr; | 5395 | prev = rq->curr; |
5330 | switch_count = &prev->nivcsw; | 5396 | switch_count = &prev->nivcsw; |
5331 | 5397 | ||
@@ -5349,10 +5415,7 @@ need_resched_nonpreemptible: | |||
5349 | switch_count = &prev->nvcsw; | 5415 | switch_count = &prev->nvcsw; |
5350 | } | 5416 | } |
5351 | 5417 | ||
5352 | #ifdef CONFIG_SMP | 5418 | pre_schedule(rq, prev); |
5353 | if (prev->sched_class->pre_schedule) | ||
5354 | prev->sched_class->pre_schedule(rq, prev); | ||
5355 | #endif | ||
5356 | 5419 | ||
5357 | if (unlikely(!rq->nr_running)) | 5420 | if (unlikely(!rq->nr_running)) |
5358 | idle_balance(cpu, rq); | 5421 | idle_balance(cpu, rq); |
@@ -5362,7 +5425,7 @@ need_resched_nonpreemptible: | |||
5362 | 5425 | ||
5363 | if (likely(prev != next)) { | 5426 | if (likely(prev != next)) { |
5364 | sched_info_switch(prev, next); | 5427 | sched_info_switch(prev, next); |
5365 | perf_counter_task_sched_out(prev, next, cpu); | 5428 | perf_event_task_sched_out(prev, next, cpu); |
5366 | 5429 | ||
5367 | rq->nr_switches++; | 5430 | rq->nr_switches++; |
5368 | rq->curr = next; | 5431 | rq->curr = next; |
@@ -5378,6 +5441,8 @@ need_resched_nonpreemptible: | |||
5378 | } else | 5441 | } else |
5379 | spin_unlock_irq(&rq->lock); | 5442 | spin_unlock_irq(&rq->lock); |
5380 | 5443 | ||
5444 | post_schedule(rq); | ||
5445 | |||
5381 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5446 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
5382 | goto need_resched_nonpreemptible; | 5447 | goto need_resched_nonpreemptible; |
5383 | 5448 | ||
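The open-coded CONFIG_SMP hook call is replaced by a pre_schedule() helper, and a matching post_schedule() now runs after the runqueue lock has been dropped (note the new rq->post_schedule field initialised later in sched_init()). The helpers themselves are outside this section; a sketch of the shape they plausibly take, built only from the sched_class hooks already visible here (on !SMP builds both would compile to nothing):

    static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
    {
            if (prev->sched_class->pre_schedule)
                    prev->sched_class->pre_schedule(rq, prev);
    }

    /* Called without rq->lock held, so re-take it around the class hook. */
    static inline void post_schedule(struct rq *rq)
    {
            if (rq->post_schedule) {
                    unsigned long flags;

                    spin_lock_irqsave(&rq->lock, flags);
                    if (rq->curr->sched_class->post_schedule)
                            rq->curr->sched_class->post_schedule(rq);
                    spin_unlock_irqrestore(&rq->lock, flags);
                    rq->post_schedule = 0;
            }
    }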
@@ -5509,10 +5574,10 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
5509 | 5574 | ||
5510 | #endif /* CONFIG_PREEMPT */ | 5575 | #endif /* CONFIG_PREEMPT */ |
5511 | 5576 | ||
5512 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 5577 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, |
5513 | void *key) | 5578 | void *key) |
5514 | { | 5579 | { |
5515 | return try_to_wake_up(curr->private, mode, sync); | 5580 | return try_to_wake_up(curr->private, mode, wake_flags); |
5516 | } | 5581 | } |
5517 | EXPORT_SYMBOL(default_wake_function); | 5582 | EXPORT_SYMBOL(default_wake_function); |
5518 | 5583 | ||
@@ -5526,14 +5591,14 @@ EXPORT_SYMBOL(default_wake_function); | |||
5526 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 5591 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
5527 | */ | 5592 | */ |
5528 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 5593 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
5529 | int nr_exclusive, int sync, void *key) | 5594 | int nr_exclusive, int wake_flags, void *key) |
5530 | { | 5595 | { |
5531 | wait_queue_t *curr, *next; | 5596 | wait_queue_t *curr, *next; |
5532 | 5597 | ||
5533 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | 5598 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
5534 | unsigned flags = curr->flags; | 5599 | unsigned flags = curr->flags; |
5535 | 5600 | ||
5536 | if (curr->func(curr, mode, sync, key) && | 5601 | if (curr->func(curr, mode, wake_flags, key) && |
5537 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | 5602 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
5538 | break; | 5603 | break; |
5539 | } | 5604 | } |
@@ -5594,16 +5659,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | |||
5594 | int nr_exclusive, void *key) | 5659 | int nr_exclusive, void *key) |
5595 | { | 5660 | { |
5596 | unsigned long flags; | 5661 | unsigned long flags; |
5597 | int sync = 1; | 5662 | int wake_flags = WF_SYNC; |
5598 | 5663 | ||
5599 | if (unlikely(!q)) | 5664 | if (unlikely(!q)) |
5600 | return; | 5665 | return; |
5601 | 5666 | ||
5602 | if (unlikely(!nr_exclusive)) | 5667 | if (unlikely(!nr_exclusive)) |
5603 | sync = 0; | 5668 | wake_flags = 0; |
5604 | 5669 | ||
5605 | spin_lock_irqsave(&q->lock, flags); | 5670 | spin_lock_irqsave(&q->lock, flags); |
5606 | __wake_up_common(q, mode, nr_exclusive, sync, key); | 5671 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); |
5607 | spin_unlock_irqrestore(&q->lock, flags); | 5672 | spin_unlock_irqrestore(&q->lock, flags); |
5608 | } | 5673 | } |
5609 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | 5674 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); |
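Across the wakeup paths the old int sync parameter becomes a wake_flags bitmask: WF_SYNC carries the "the waker is about to sleep" hint that sync = 1 used to, and a zero mask means no hints. The flag definitions live outside this section; for illustration only, and with the exact values taken as an assumption, something along these lines:

    #define WF_SYNC         0x01    /* waker goes to sleep after this wakeup */
    #define WF_FORK         0x02    /* wakeup of a freshly forked child */

The bitmask leaves room for further hints without touching the wait-queue callback signature again.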
@@ -6123,17 +6188,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
6123 | unsigned long flags; | 6188 | unsigned long flags; |
6124 | const struct sched_class *prev_class = p->sched_class; | 6189 | const struct sched_class *prev_class = p->sched_class; |
6125 | struct rq *rq; | 6190 | struct rq *rq; |
6191 | int reset_on_fork; | ||
6126 | 6192 | ||
6127 | /* may grab non-irq protected spin_locks */ | 6193 | /* may grab non-irq protected spin_locks */ |
6128 | BUG_ON(in_interrupt()); | 6194 | BUG_ON(in_interrupt()); |
6129 | recheck: | 6195 | recheck: |
6130 | /* double check policy once rq lock held */ | 6196 | /* double check policy once rq lock held */ |
6131 | if (policy < 0) | 6197 | if (policy < 0) { |
6198 | reset_on_fork = p->sched_reset_on_fork; | ||
6132 | policy = oldpolicy = p->policy; | 6199 | policy = oldpolicy = p->policy; |
6133 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 6200 | } else { |
6134 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 6201 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
6135 | policy != SCHED_IDLE) | 6202 | policy &= ~SCHED_RESET_ON_FORK; |
6136 | return -EINVAL; | 6203 | |
6204 | if (policy != SCHED_FIFO && policy != SCHED_RR && | ||
6205 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
6206 | policy != SCHED_IDLE) | ||
6207 | return -EINVAL; | ||
6208 | } | ||
6209 | |||
6137 | /* | 6210 | /* |
6138 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 6211 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
6139 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 6212 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
@@ -6177,6 +6250,10 @@ recheck: | |||
6177 | /* can't change other user's priorities */ | 6250 | /* can't change other user's priorities */ |
6178 | if (!check_same_owner(p)) | 6251 | if (!check_same_owner(p)) |
6179 | return -EPERM; | 6252 | return -EPERM; |
6253 | |||
6254 | /* Normal users shall not reset the sched_reset_on_fork flag */ | ||
6255 | if (p->sched_reset_on_fork && !reset_on_fork) | ||
6256 | return -EPERM; | ||
6180 | } | 6257 | } |
6181 | 6258 | ||
6182 | if (user) { | 6259 | if (user) { |
@@ -6220,6 +6297,8 @@ recheck: | |||
6220 | if (running) | 6297 | if (running) |
6221 | p->sched_class->put_prev_task(rq, p); | 6298 | p->sched_class->put_prev_task(rq, p); |
6222 | 6299 | ||
6300 | p->sched_reset_on_fork = reset_on_fork; | ||
6301 | |||
6223 | oldprio = p->prio; | 6302 | oldprio = p->prio; |
6224 | __setscheduler(rq, p, policy, param->sched_priority); | 6303 | __setscheduler(rq, p, policy, param->sched_priority); |
6225 | 6304 | ||
@@ -6336,14 +6415,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
6336 | if (p) { | 6415 | if (p) { |
6337 | retval = security_task_getscheduler(p); | 6416 | retval = security_task_getscheduler(p); |
6338 | if (!retval) | 6417 | if (!retval) |
6339 | retval = p->policy; | 6418 | retval = p->policy |
6419 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | ||
6340 | } | 6420 | } |
6341 | read_unlock(&tasklist_lock); | 6421 | read_unlock(&tasklist_lock); |
6342 | return retval; | 6422 | return retval; |
6343 | } | 6423 | } |
6344 | 6424 | ||
6345 | /** | 6425 | /** |
6346 | * sys_sched_getscheduler - get the RT priority of a thread | 6426 | * sys_sched_getparam - get the RT priority of a thread |
6347 | * @pid: the pid in question. | 6427 | * @pid: the pid in question. |
6348 | * @param: structure containing the RT priority. | 6428 | * @param: structure containing the RT priority. |
6349 | */ | 6429 | */ |
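From user space the new behaviour is requested by OR-ing SCHED_RESET_ON_FORK into the policy argument, and sched_getscheduler() reports it back the same way, so callers must mask it off before comparing policies. A hedged example; older libc headers may not define the flag, whose value in this series is 0x40000000:

    #include <sched.h>
    #include <stdio.h>

    #ifndef SCHED_RESET_ON_FORK
    #define SCHED_RESET_ON_FORK     0x40000000
    #endif

    int main(void)
    {
            struct sched_param sp = { .sched_priority = 10 };
            int ret;

            /* SCHED_FIFO for this task, but children fall back to SCHED_NORMAL. */
            if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
                    perror("sched_setscheduler");

            ret = sched_getscheduler(0);
            printf("policy=%d reset_on_fork=%d\n",
                   ret & ~SCHED_RESET_ON_FORK, !!(ret & SCHED_RESET_ON_FORK));
            return 0;
    }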
@@ -6571,19 +6651,9 @@ static inline int should_resched(void) | |||
6571 | 6651 | ||
6572 | static void __cond_resched(void) | 6652 | static void __cond_resched(void) |
6573 | { | 6653 | { |
6574 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6654 | add_preempt_count(PREEMPT_ACTIVE); |
6575 | __might_sleep(__FILE__, __LINE__); | 6655 | schedule(); |
6576 | #endif | 6656 | sub_preempt_count(PREEMPT_ACTIVE); |
6577 | /* | ||
6578 | * The BKS might be reacquired before we have dropped | ||
6579 | * PREEMPT_ACTIVE, which could trigger a second | ||
6580 | * cond_resched() call. | ||
6581 | */ | ||
6582 | do { | ||
6583 | add_preempt_count(PREEMPT_ACTIVE); | ||
6584 | schedule(); | ||
6585 | sub_preempt_count(PREEMPT_ACTIVE); | ||
6586 | } while (need_resched()); | ||
6587 | } | 6657 | } |
6588 | 6658 | ||
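__cond_resched() now wraps a single schedule() call in PREEMPT_ACTIVE, which makes schedule() treat it like an involuntary preemption so a task that has already set a sleeping state is not dequeued by accident; the old loop guarding against BKL re-acquisition and the inline __might_sleep() are gone (the debug check moves to the callers, see preempt_count_equals() at the end of this section). A typical caller is unchanged; for illustration, with a hypothetical per-item helper:

    for (i = 0; i < nr_items; i++) {
            process_item(i);        /* hypothetical unit of work */
            cond_resched();         /* reschedule here if one is pending */
    }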
6589 | int __sched _cond_resched(void) | 6659 | int __sched _cond_resched(void) |
@@ -6597,18 +6667,20 @@ int __sched _cond_resched(void) | |||
6597 | EXPORT_SYMBOL(_cond_resched); | 6667 | EXPORT_SYMBOL(_cond_resched); |
6598 | 6668 | ||
6599 | /* | 6669 | /* |
6600 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 6670 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
6601 | * call schedule, and on return reacquire the lock. | 6671 | * call schedule, and on return reacquire the lock. |
6602 | * | 6672 | * |
6603 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 6673 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
6604 | * operations here to prevent schedule() from being called twice (once via | 6674 | * operations here to prevent schedule() from being called twice (once via |
6605 | * spin_unlock(), once by hand). | 6675 | * spin_unlock(), once by hand). |
6606 | */ | 6676 | */ |
6607 | int cond_resched_lock(spinlock_t *lock) | 6677 | int __cond_resched_lock(spinlock_t *lock) |
6608 | { | 6678 | { |
6609 | int resched = should_resched(); | 6679 | int resched = should_resched(); |
6610 | int ret = 0; | 6680 | int ret = 0; |
6611 | 6681 | ||
6682 | lockdep_assert_held(lock); | ||
6683 | |||
6612 | if (spin_needbreak(lock) || resched) { | 6684 | if (spin_needbreak(lock) || resched) { |
6613 | spin_unlock(lock); | 6685 | spin_unlock(lock); |
6614 | if (resched) | 6686 | if (resched) |
@@ -6620,9 +6692,9 @@ int cond_resched_lock(spinlock_t *lock) | |||
6620 | } | 6692 | } |
6621 | return ret; | 6693 | return ret; |
6622 | } | 6694 | } |
6623 | EXPORT_SYMBOL(cond_resched_lock); | 6695 | EXPORT_SYMBOL(__cond_resched_lock); |
6624 | 6696 | ||
6625 | int __sched cond_resched_softirq(void) | 6697 | int __sched __cond_resched_softirq(void) |
6626 | { | 6698 | { |
6627 | BUG_ON(!in_softirq()); | 6699 | BUG_ON(!in_softirq()); |
6628 | 6700 | ||
@@ -6634,7 +6706,7 @@ int __sched cond_resched_softirq(void) | |||
6634 | } | 6706 | } |
6635 | return 0; | 6707 | return 0; |
6636 | } | 6708 | } |
6637 | EXPORT_SYMBOL(cond_resched_softirq); | 6709 | EXPORT_SYMBOL(__cond_resched_softirq); |
6638 | 6710 | ||
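The lock and softirq variants are renamed with a leading double underscore, which suggests the public cond_resched_lock()/cond_resched_softirq() names become thin wrappers in a header, and __cond_resched_lock() now states its precondition explicitly with lockdep_assert_held(). The canonical caller pattern is unchanged; a sketch with invented names:

    spin_lock(&my_lock);
    while (!list_empty(&my_list)) {
            struct my_obj *obj = list_first_entry(&my_list, struct my_obj, node);

            list_del(&obj->node);
            handle(obj);                    /* hypothetical work done under the lock */
            cond_resched_lock(&my_lock);    /* may drop the lock, schedule, re-acquire */
    }
    spin_unlock(&my_lock);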
6639 | /** | 6711 | /** |
6640 | * yield - yield the current processor to other threads. | 6712 | * yield - yield the current processor to other threads. |
@@ -6652,17 +6724,16 @@ EXPORT_SYMBOL(yield); | |||
6652 | /* | 6724 | /* |
6653 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 6725 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
6654 | * that process accounting knows that this is a task in IO wait state. | 6726 | * that process accounting knows that this is a task in IO wait state. |
6655 | * | ||
6656 | * But don't do that if it is a deliberate, throttling IO wait (this task | ||
6657 | * has set its backing_dev_info: the queue against which it should throttle) | ||
6658 | */ | 6727 | */ |
6659 | void __sched io_schedule(void) | 6728 | void __sched io_schedule(void) |
6660 | { | 6729 | { |
6661 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6730 | struct rq *rq = raw_rq(); |
6662 | 6731 | ||
6663 | delayacct_blkio_start(); | 6732 | delayacct_blkio_start(); |
6664 | atomic_inc(&rq->nr_iowait); | 6733 | atomic_inc(&rq->nr_iowait); |
6734 | current->in_iowait = 1; | ||
6665 | schedule(); | 6735 | schedule(); |
6736 | current->in_iowait = 0; | ||
6666 | atomic_dec(&rq->nr_iowait); | 6737 | atomic_dec(&rq->nr_iowait); |
6667 | delayacct_blkio_end(); | 6738 | delayacct_blkio_end(); |
6668 | } | 6739 | } |
@@ -6670,12 +6741,14 @@ EXPORT_SYMBOL(io_schedule); | |||
6670 | 6741 | ||
6671 | long __sched io_schedule_timeout(long timeout) | 6742 | long __sched io_schedule_timeout(long timeout) |
6672 | { | 6743 | { |
6673 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6744 | struct rq *rq = raw_rq(); |
6674 | long ret; | 6745 | long ret; |
6675 | 6746 | ||
6676 | delayacct_blkio_start(); | 6747 | delayacct_blkio_start(); |
6677 | atomic_inc(&rq->nr_iowait); | 6748 | atomic_inc(&rq->nr_iowait); |
6749 | current->in_iowait = 1; | ||
6678 | ret = schedule_timeout(timeout); | 6750 | ret = schedule_timeout(timeout); |
6751 | current->in_iowait = 0; | ||
6679 | atomic_dec(&rq->nr_iowait); | 6752 | atomic_dec(&rq->nr_iowait); |
6680 | delayacct_blkio_end(); | 6753 | delayacct_blkio_end(); |
6681 | return ret; | 6754 | return ret; |
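Both io_schedule() variants now mark the task with current->in_iowait for the duration of the sleep, so the rest of the kernel can tell the task is blocked on I/O regardless of which runqueue it sleeps on, and they fetch the runqueue through raw_rq(). Callers are unaffected; a sleeping wait might look like this (waitq and done are hypothetical fields of the request):

    DEFINE_WAIT(wait);

    for (;;) {
            prepare_to_wait(&req->waitq, &wait, TASK_UNINTERRUPTIBLE);
            if (req->done)
                    break;
            io_schedule();          /* sleeps in D state, accounted as iowait */
    }
    finish_wait(&req->waitq, &wait);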
@@ -6759,23 +6832,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
6759 | if (retval) | 6832 | if (retval) |
6760 | goto out_unlock; | 6833 | goto out_unlock; |
6761 | 6834 | ||
6762 | /* | 6835 | time_slice = p->sched_class->get_rr_interval(p); |
6763 | * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER | ||
6764 | * tasks that are on an otherwise idle runqueue: | ||
6765 | */ | ||
6766 | time_slice = 0; | ||
6767 | if (p->policy == SCHED_RR) { | ||
6768 | time_slice = DEF_TIMESLICE; | ||
6769 | } else if (p->policy != SCHED_FIFO) { | ||
6770 | struct sched_entity *se = &p->se; | ||
6771 | unsigned long flags; | ||
6772 | struct rq *rq; | ||
6773 | 6836 | ||
6774 | rq = task_rq_lock(p, &flags); | ||
6775 | if (rq->cfs.load.weight) | ||
6776 | time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); | ||
6777 | task_rq_unlock(rq, &flags); | ||
6778 | } | ||
6779 | read_unlock(&tasklist_lock); | 6837 | read_unlock(&tasklist_lock); |
6780 | jiffies_to_timespec(time_slice, &t); | 6838 | jiffies_to_timespec(time_slice, &t); |
6781 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 6839 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
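sched_rr_get_interval() now delegates to the scheduling class through get_rr_interval(), so SCHED_RR tasks keep reporting their round-robin quantum while CFS tasks report the slice their class computes, instead of the ad-hoc logic removed above. The user-space side is unchanged:

    #include <sched.h>
    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
            struct timespec ts;

            if (sched_rr_get_interval(0, &ts) == 0)         /* 0 means the calling thread */
                    printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
            return 0;
    }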
@@ -6992,8 +7050,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
6992 | 7050 | ||
6993 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 7051 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { |
6994 | /* Need help from migration thread: drop lock and wait. */ | 7052 | /* Need help from migration thread: drop lock and wait. */ |
7053 | struct task_struct *mt = rq->migration_thread; | ||
7054 | |||
7055 | get_task_struct(mt); | ||
6995 | task_rq_unlock(rq, &flags); | 7056 | task_rq_unlock(rq, &flags); |
6996 | wake_up_process(rq->migration_thread); | 7057 | wake_up_process(rq->migration_thread); |
7058 | put_task_struct(mt); | ||
6997 | wait_for_completion(&req.done); | 7059 | wait_for_completion(&req.done); |
6998 | tlb_migrate_finish(p->mm); | 7060 | tlb_migrate_finish(p->mm); |
6999 | return 0; | 7061 | return 0; |
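Taking a reference on the migration thread before dropping the runqueue lock closes a small window (CPU hotplug tearing the thread down) in which the task_struct could otherwise be freed between the unlock and the wakeup. It is the generic pin-across-unlock idiom:

    struct task_struct *mt = rq->migration_thread;

    get_task_struct(mt);            /* task_struct stays valid even if the thread exits */
    task_rq_unlock(rq, &flags);
    wake_up_process(mt);
    put_task_struct(mt);

(The hunk above still passes rq->migration_thread to wake_up_process(); the reference is what keeps the underlying task_struct alive either way.)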
@@ -7051,6 +7113,11 @@ fail: | |||
7051 | return ret; | 7113 | return ret; |
7052 | } | 7114 | } |
7053 | 7115 | ||
7116 | #define RCU_MIGRATION_IDLE 0 | ||
7117 | #define RCU_MIGRATION_NEED_QS 1 | ||
7118 | #define RCU_MIGRATION_GOT_QS 2 | ||
7119 | #define RCU_MIGRATION_MUST_SYNC 3 | ||
7120 | |||
7054 | /* | 7121 | /* |
7055 | * migration_thread - this is a highprio system thread that performs | 7122 | * migration_thread - this is a highprio system thread that performs |
7056 | * thread migration by bumping thread off CPU then 'pushing' onto | 7123 | * thread migration by bumping thread off CPU then 'pushing' onto |
@@ -7058,6 +7125,7 @@ fail: | |||
7058 | */ | 7125 | */ |
7059 | static int migration_thread(void *data) | 7126 | static int migration_thread(void *data) |
7060 | { | 7127 | { |
7128 | int badcpu; | ||
7061 | int cpu = (long)data; | 7129 | int cpu = (long)data; |
7062 | struct rq *rq; | 7130 | struct rq *rq; |
7063 | 7131 | ||
@@ -7092,8 +7160,17 @@ static int migration_thread(void *data) | |||
7092 | req = list_entry(head->next, struct migration_req, list); | 7160 | req = list_entry(head->next, struct migration_req, list); |
7093 | list_del_init(head->next); | 7161 | list_del_init(head->next); |
7094 | 7162 | ||
7095 | spin_unlock(&rq->lock); | 7163 | if (req->task != NULL) { |
7096 | __migrate_task(req->task, cpu, req->dest_cpu); | 7164 | spin_unlock(&rq->lock); |
7165 | __migrate_task(req->task, cpu, req->dest_cpu); | ||
7166 | } else if (likely(cpu == (badcpu = smp_processor_id()))) { | ||
7167 | req->dest_cpu = RCU_MIGRATION_GOT_QS; | ||
7168 | spin_unlock(&rq->lock); | ||
7169 | } else { | ||
7170 | req->dest_cpu = RCU_MIGRATION_MUST_SYNC; | ||
7171 | spin_unlock(&rq->lock); | ||
7172 | WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); | ||
7173 | } | ||
7097 | local_irq_enable(); | 7174 | local_irq_enable(); |
7098 | 7175 | ||
7099 | complete(&req->done); | 7176 | complete(&req->done); |
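A migration request with req->task == NULL is not a task migration at all: it is a per-CPU "report a quiescent state" message, presumably queued by the expedited RCU grace-period code this series adds elsewhere, and the thread answers by writing one of the RCU_MIGRATION_* values into req->dest_cpu. A heavily hedged sketch of how a requester might interpret the reply:

    switch (req->dest_cpu) {
    case RCU_MIGRATION_GOT_QS:      /* handled on the expected CPU: quiescent state seen */
            break;
    case RCU_MIGRATION_MUST_SYNC:   /* ran on the wrong CPU (hotplug race) */
            synchronize_sched();    /* fall back to a full grace period */
            break;
    }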
@@ -7607,7 +7684,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
7607 | /* | 7684 | /* |
7608 | * Register at high priority so that task migration (migrate_all_tasks) | 7685 | * Register at high priority so that task migration (migrate_all_tasks) |
7609 | * happens before everything else. This has to be lower priority than | 7686 | * happens before everything else. This has to be lower priority than |
7610 | * the notifier in the perf_counter subsystem, though. | 7687 | * the notifier in the perf_event subsystem, though. |
7611 | */ | 7688 | */ |
7612 | static struct notifier_block __cpuinitdata migration_notifier = { | 7689 | static struct notifier_block __cpuinitdata migration_notifier = { |
7613 | .notifier_call = migration_call, | 7690 | .notifier_call = migration_call, |
@@ -7625,7 +7702,7 @@ static int __init migration_init(void) | |||
7625 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 7702 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
7626 | register_cpu_notifier(&migration_notifier); | 7703 | register_cpu_notifier(&migration_notifier); |
7627 | 7704 | ||
7628 | return err; | 7705 | return 0; |
7629 | } | 7706 | } |
7630 | early_initcall(migration_init); | 7707 | early_initcall(migration_init); |
7631 | #endif | 7708 | #endif |
@@ -7672,7 +7749,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
7672 | break; | 7749 | break; |
7673 | } | 7750 | } |
7674 | 7751 | ||
7675 | if (!group->__cpu_power) { | 7752 | if (!group->cpu_power) { |
7676 | printk(KERN_CONT "\n"); | 7753 | printk(KERN_CONT "\n"); |
7677 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 7754 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
7678 | "set\n"); | 7755 | "set\n"); |
@@ -7696,9 +7773,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
7696 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 7773 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
7697 | 7774 | ||
7698 | printk(KERN_CONT " %s", str); | 7775 | printk(KERN_CONT " %s", str); |
7699 | if (group->__cpu_power != SCHED_LOAD_SCALE) { | 7776 | if (group->cpu_power != SCHED_LOAD_SCALE) { |
7700 | printk(KERN_CONT " (__cpu_power = %d)", | 7777 | printk(KERN_CONT " (cpu_power = %d)", |
7701 | group->__cpu_power); | 7778 | group->cpu_power); |
7702 | } | 7779 | } |
7703 | 7780 | ||
7704 | group = group->next; | 7781 | group = group->next; |
@@ -7763,9 +7840,7 @@ static int sd_degenerate(struct sched_domain *sd) | |||
7763 | } | 7840 | } |
7764 | 7841 | ||
7765 | /* Following flags don't use groups */ | 7842 | /* Following flags don't use groups */ |
7766 | if (sd->flags & (SD_WAKE_IDLE | | 7843 | if (sd->flags & (SD_WAKE_AFFINE)) |
7767 | SD_WAKE_AFFINE | | ||
7768 | SD_WAKE_BALANCE)) | ||
7769 | return 0; | 7844 | return 0; |
7770 | 7845 | ||
7771 | return 1; | 7846 | return 1; |
@@ -7782,10 +7857,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
7782 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | 7857 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) |
7783 | return 0; | 7858 | return 0; |
7784 | 7859 | ||
7785 | /* Does parent contain flags not in child? */ | ||
7786 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | ||
7787 | if (cflags & SD_WAKE_AFFINE) | ||
7788 | pflags &= ~SD_WAKE_BALANCE; | ||
7789 | /* Flags needing groups don't count if only 1 group in parent */ | 7860 | /* Flags needing groups don't count if only 1 group in parent */ |
7790 | if (parent->groups == parent->groups->next) { | 7861 | if (parent->groups == parent->groups->next) { |
7791 | pflags &= ~(SD_LOAD_BALANCE | | 7862 | pflags &= ~(SD_LOAD_BALANCE | |
@@ -7841,7 +7912,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
7841 | rq->rd = rd; | 7912 | rq->rd = rd; |
7842 | 7913 | ||
7843 | cpumask_set_cpu(rq->cpu, rd->span); | 7914 | cpumask_set_cpu(rq->cpu, rd->span); |
7844 | if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) | 7915 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
7845 | set_rq_online(rq); | 7916 | set_rq_online(rq); |
7846 | 7917 | ||
7847 | spin_unlock_irqrestore(&rq->lock, flags); | 7918 | spin_unlock_irqrestore(&rq->lock, flags); |
@@ -7983,7 +8054,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
7983 | continue; | 8054 | continue; |
7984 | 8055 | ||
7985 | cpumask_clear(sched_group_cpus(sg)); | 8056 | cpumask_clear(sched_group_cpus(sg)); |
7986 | sg->__cpu_power = 0; | 8057 | sg->cpu_power = 0; |
7987 | 8058 | ||
7988 | for_each_cpu(j, span) { | 8059 | for_each_cpu(j, span) { |
7989 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | 8060 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
@@ -8091,6 +8162,39 @@ struct static_sched_domain { | |||
8091 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 8162 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); |
8092 | }; | 8163 | }; |
8093 | 8164 | ||
8165 | struct s_data { | ||
8166 | #ifdef CONFIG_NUMA | ||
8167 | int sd_allnodes; | ||
8168 | cpumask_var_t domainspan; | ||
8169 | cpumask_var_t covered; | ||
8170 | cpumask_var_t notcovered; | ||
8171 | #endif | ||
8172 | cpumask_var_t nodemask; | ||
8173 | cpumask_var_t this_sibling_map; | ||
8174 | cpumask_var_t this_core_map; | ||
8175 | cpumask_var_t send_covered; | ||
8176 | cpumask_var_t tmpmask; | ||
8177 | struct sched_group **sched_group_nodes; | ||
8178 | struct root_domain *rd; | ||
8179 | }; | ||
8180 | |||
8181 | enum s_alloc { | ||
8182 | sa_sched_groups = 0, | ||
8183 | sa_rootdomain, | ||
8184 | sa_tmpmask, | ||
8185 | sa_send_covered, | ||
8186 | sa_this_core_map, | ||
8187 | sa_this_sibling_map, | ||
8188 | sa_nodemask, | ||
8189 | sa_sched_group_nodes, | ||
8190 | #ifdef CONFIG_NUMA | ||
8191 | sa_notcovered, | ||
8192 | sa_covered, | ||
8193 | sa_domainspan, | ||
8194 | #endif | ||
8195 | sa_none, | ||
8196 | }; | ||
8197 | |||
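The enum s_alloc values double as progress markers: the allocation path returns the last stage it completed, and the matching free routine (further below) switches on that value and falls through from that stage down to sa_none, releasing exactly what was set up. A generic illustration of the pattern, with hypothetical alloc_a()/alloc_b() helpers rather than the scheduler's own:

    enum stage { st_all, st_a, st_none };   /* listed in reverse allocation order */

    struct ctx { void *a, *b; };

    static enum stage setup(struct ctx *c)
    {
            if (!(c->a = alloc_a()))
                    return st_none;         /* nothing allocated, nothing to undo */
            if (!(c->b = alloc_b()))
                    return st_a;            /* only 'a' needs freeing */
            return st_all;
    }

    static void teardown(struct ctx *c, enum stage reached)
    {
            switch (reached) {
            case st_all:
                    free_b(c->b);           /* fall through */
            case st_a:
                    free_a(c->a);           /* fall through */
            case st_none:
                    break;
            }
    }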
8094 | /* | 8198 | /* |
8095 | * SMT sched-domains: | 8199 | * SMT sched-domains: |
8096 | */ | 8200 | */ |
@@ -8208,11 +8312,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
8208 | continue; | 8312 | continue; |
8209 | } | 8313 | } |
8210 | 8314 | ||
8211 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 8315 | sg->cpu_power += sd->groups->cpu_power; |
8212 | } | 8316 | } |
8213 | sg = sg->next; | 8317 | sg = sg->next; |
8214 | } while (sg != group_head); | 8318 | } while (sg != group_head); |
8215 | } | 8319 | } |
8320 | |||
8321 | static int build_numa_sched_groups(struct s_data *d, | ||
8322 | const struct cpumask *cpu_map, int num) | ||
8323 | { | ||
8324 | struct sched_domain *sd; | ||
8325 | struct sched_group *sg, *prev; | ||
8326 | int n, j; | ||
8327 | |||
8328 | cpumask_clear(d->covered); | ||
8329 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
8330 | if (cpumask_empty(d->nodemask)) { | ||
8331 | d->sched_group_nodes[num] = NULL; | ||
8332 | goto out; | ||
8333 | } | ||
8334 | |||
8335 | sched_domain_node_span(num, d->domainspan); | ||
8336 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
8337 | |||
8338 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8339 | GFP_KERNEL, num); | ||
8340 | if (!sg) { | ||
8341 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
8342 | num); | ||
8343 | return -ENOMEM; | ||
8344 | } | ||
8345 | d->sched_group_nodes[num] = sg; | ||
8346 | |||
8347 | for_each_cpu(j, d->nodemask) { | ||
8348 | sd = &per_cpu(node_domains, j).sd; | ||
8349 | sd->groups = sg; | ||
8350 | } | ||
8351 | |||
8352 | sg->cpu_power = 0; | ||
8353 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
8354 | sg->next = sg; | ||
8355 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
8356 | |||
8357 | prev = sg; | ||
8358 | for (j = 0; j < nr_node_ids; j++) { | ||
8359 | n = (num + j) % nr_node_ids; | ||
8360 | cpumask_complement(d->notcovered, d->covered); | ||
8361 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
8362 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
8363 | if (cpumask_empty(d->tmpmask)) | ||
8364 | break; | ||
8365 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
8366 | if (cpumask_empty(d->tmpmask)) | ||
8367 | continue; | ||
8368 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8369 | GFP_KERNEL, num); | ||
8370 | if (!sg) { | ||
8371 | printk(KERN_WARNING | ||
8372 | "Can not alloc domain group for node %d\n", j); | ||
8373 | return -ENOMEM; | ||
8374 | } | ||
8375 | sg->cpu_power = 0; | ||
8376 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
8377 | sg->next = prev->next; | ||
8378 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
8379 | prev->next = sg; | ||
8380 | prev = sg; | ||
8381 | } | ||
8382 | out: | ||
8383 | return 0; | ||
8384 | } | ||
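build_numa_sched_groups() links the per-node groups into the usual circular singly linked list: the first group's next points at itself and each later group is spliced in after prev, so the ring always closes back on the head. That is why this code and init_numa_sched_groups_power() walk the groups with a do/while rather than a NULL test:

    struct sched_group *sg = group_head;

    do {
            /* visit sg, e.g. accumulate sg->cpu_power */
            sg = sg->next;
    } while (sg != group_head);     /* stop when the ring wraps around */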
8216 | #endif /* CONFIG_NUMA */ | 8385 | #endif /* CONFIG_NUMA */ |
8217 | 8386 | ||
8218 | #ifdef CONFIG_NUMA | 8387 | #ifdef CONFIG_NUMA |
@@ -8266,15 +8435,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
8266 | * there are asymmetries in the topology. If there are asymmetries, group | 8435 | * there are asymmetries in the topology. If there are asymmetries, group |
8267 | * having more cpu_power will pickup more load compared to the group having | 8436 | * having more cpu_power will pickup more load compared to the group having |
8268 | * less cpu_power. | 8437 | * less cpu_power. |
8269 | * | ||
8270 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | ||
8271 | * the maximum number of tasks a group can handle in the presence of other idle | ||
8272 | * or lightly loaded groups in the same sched domain. | ||
8273 | */ | 8438 | */ |
8274 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 8439 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
8275 | { | 8440 | { |
8276 | struct sched_domain *child; | 8441 | struct sched_domain *child; |
8277 | struct sched_group *group; | 8442 | struct sched_group *group; |
8443 | long power; | ||
8444 | int weight; | ||
8278 | 8445 | ||
8279 | WARN_ON(!sd || !sd->groups); | 8446 | WARN_ON(!sd || !sd->groups); |
8280 | 8447 | ||
@@ -8283,28 +8450,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
8283 | 8450 | ||
8284 | child = sd->child; | 8451 | child = sd->child; |
8285 | 8452 | ||
8286 | sd->groups->__cpu_power = 0; | 8453 | sd->groups->cpu_power = 0; |
8287 | 8454 | ||
8288 | /* | 8455 | if (!child) { |
8289 | * For perf policy, if the groups in child domain share resources | 8456 | power = SCHED_LOAD_SCALE; |
8290 | * (for example cores sharing some portions of the cache hierarchy | 8457 | weight = cpumask_weight(sched_domain_span(sd)); |
8291 | * or SMT), then set this domain groups cpu_power such that each group | 8458 | /* |
8292 | * can handle only one task, when there are other idle groups in the | 8459 | * SMT siblings share the power of a single core. |
8293 | * same sched domain. | 8460 | * Usually multiple threads get a better yield out of |
8294 | */ | 8461 | * that one core than a single thread would have, |
8295 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 8462 | * reflect that in sd->smt_gain. |
8296 | (child->flags & | 8463 | */ |
8297 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 8464 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
8298 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); | 8465 | power *= sd->smt_gain; |
8466 | power /= weight; | ||
8467 | power >>= SCHED_LOAD_SHIFT; | ||
8468 | } | ||
8469 | sd->groups->cpu_power += power; | ||
8299 | return; | 8470 | return; |
8300 | } | 8471 | } |
8301 | 8472 | ||
8302 | /* | 8473 | /* |
8303 | * add cpu_power of each child group to this groups cpu_power | 8474 | * Add cpu_power of each child group to this groups cpu_power. |
8304 | */ | 8475 | */ |
8305 | group = child->groups; | 8476 | group = child->groups; |
8306 | do { | 8477 | do { |
8307 | sg_inc_cpu_power(sd->groups, group->__cpu_power); | 8478 | sd->groups->cpu_power += group->cpu_power; |
8308 | group = group->next; | 8479 | group = group->next; |
8309 | } while (group != child->groups); | 8480 | } while (group != child->groups); |
8310 | } | 8481 | } |
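For a leaf domain the group power is no longer a flat SCHED_LOAD_SCALE: SMT siblings now share one core's worth, scaled by sd->smt_gain. A worked example with the values this series appears to use (SCHED_LOAD_SCALE = 1024, SCHED_LOAD_SHIFT = 10, a default smt_gain of 1178, two siblings):

    power = ((1024 * 1178) / 2) >> 10 = 589

so each sibling group advertises roughly 589, and the pair sums to about 1178 at the next level up, a little more than one full CPU's 1024, reflecting the modest extra throughput SMT provides.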
@@ -8371,287 +8542,292 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
8371 | request = attr->relax_domain_level; | 8542 | request = attr->relax_domain_level; |
8372 | if (request < sd->level) { | 8543 | if (request < sd->level) { |
8373 | /* turn off idle balance on this domain */ | 8544 | /* turn off idle balance on this domain */ |
8374 | sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); | 8545 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
8375 | } else { | 8546 | } else { |
8376 | /* turn on idle balance on this domain */ | 8547 | /* turn on idle balance on this domain */ |
8377 | sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); | 8548 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
8549 | } | ||
8550 | } | ||
8551 | |||
8552 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
8553 | const struct cpumask *cpu_map) | ||
8554 | { | ||
8555 | switch (what) { | ||
8556 | case sa_sched_groups: | ||
8557 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
8558 | d->sched_group_nodes = NULL; | ||
8559 | case sa_rootdomain: | ||
8560 | free_rootdomain(d->rd); /* fall through */ | ||
8561 | case sa_tmpmask: | ||
8562 | free_cpumask_var(d->tmpmask); /* fall through */ | ||
8563 | case sa_send_covered: | ||
8564 | free_cpumask_var(d->send_covered); /* fall through */ | ||
8565 | case sa_this_core_map: | ||
8566 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
8567 | case sa_this_sibling_map: | ||
8568 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
8569 | case sa_nodemask: | ||
8570 | free_cpumask_var(d->nodemask); /* fall through */ | ||
8571 | case sa_sched_group_nodes: | ||
8572 | #ifdef CONFIG_NUMA | ||
8573 | kfree(d->sched_group_nodes); /* fall through */ | ||
8574 | case sa_notcovered: | ||
8575 | free_cpumask_var(d->notcovered); /* fall through */ | ||
8576 | case sa_covered: | ||
8577 | free_cpumask_var(d->covered); /* fall through */ | ||
8578 | case sa_domainspan: | ||
8579 | free_cpumask_var(d->domainspan); /* fall through */ | ||
8580 | #endif | ||
8581 | case sa_none: | ||
8582 | break; | ||
8378 | } | 8583 | } |
8379 | } | 8584 | } |
8380 | 8585 | ||
8381 | /* | 8586 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
8382 | * Build sched domains for a given set of cpus and attach the sched domains | 8587 | const struct cpumask *cpu_map) |
8383 | * to the individual cpus | ||
8384 | */ | ||
8385 | static int __build_sched_domains(const struct cpumask *cpu_map, | ||
8386 | struct sched_domain_attr *attr) | ||
8387 | { | 8588 | { |
8388 | int i, err = -ENOMEM; | ||
8389 | struct root_domain *rd; | ||
8390 | cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, | ||
8391 | tmpmask; | ||
8392 | #ifdef CONFIG_NUMA | 8589 | #ifdef CONFIG_NUMA |
8393 | cpumask_var_t domainspan, covered, notcovered; | 8590 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) |
8394 | struct sched_group **sched_group_nodes = NULL; | 8591 | return sa_none; |
8395 | int sd_allnodes = 0; | 8592 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) |
8396 | 8593 | return sa_domainspan; | |
8397 | if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) | 8594 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) |
8398 | goto out; | 8595 | return sa_covered; |
8399 | if (!alloc_cpumask_var(&covered, GFP_KERNEL)) | 8596 | /* Allocate the per-node list of sched groups */ |
8400 | goto free_domainspan; | 8597 | d->sched_group_nodes = kcalloc(nr_node_ids, |
8401 | if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) | 8598 | sizeof(struct sched_group *), GFP_KERNEL); |
8402 | goto free_covered; | 8599 | if (!d->sched_group_nodes) { |
8403 | #endif | ||
8404 | |||
8405 | if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) | ||
8406 | goto free_notcovered; | ||
8407 | if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) | ||
8408 | goto free_nodemask; | ||
8409 | if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) | ||
8410 | goto free_this_sibling_map; | ||
8411 | if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) | ||
8412 | goto free_this_core_map; | ||
8413 | if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
8414 | goto free_send_covered; | ||
8415 | |||
8416 | #ifdef CONFIG_NUMA | ||
8417 | /* | ||
8418 | * Allocate the per-node list of sched groups | ||
8419 | */ | ||
8420 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), | ||
8421 | GFP_KERNEL); | ||
8422 | if (!sched_group_nodes) { | ||
8423 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 8600 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
8424 | goto free_tmpmask; | 8601 | return sa_notcovered; |
8425 | } | 8602 | } |
8426 | #endif | 8603 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; |
8427 | 8604 | #endif | |
8428 | rd = alloc_rootdomain(); | 8605 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) |
8429 | if (!rd) { | 8606 | return sa_sched_group_nodes; |
8607 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
8608 | return sa_nodemask; | ||
8609 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
8610 | return sa_this_sibling_map; | ||
8611 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
8612 | return sa_this_core_map; | ||
8613 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
8614 | return sa_send_covered; | ||
8615 | d->rd = alloc_rootdomain(); | ||
8616 | if (!d->rd) { | ||
8430 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 8617 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
8431 | goto free_sched_groups; | 8618 | return sa_tmpmask; |
8432 | } | 8619 | } |
8620 | return sa_rootdomain; | ||
8621 | } | ||
8433 | 8622 | ||
8623 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | ||
8624 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | ||
8625 | { | ||
8626 | struct sched_domain *sd = NULL; | ||
8434 | #ifdef CONFIG_NUMA | 8627 | #ifdef CONFIG_NUMA |
8435 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; | 8628 | struct sched_domain *parent; |
8436 | #endif | ||
8437 | |||
8438 | /* | ||
8439 | * Set up domains for cpus specified by the cpu_map. | ||
8440 | */ | ||
8441 | for_each_cpu(i, cpu_map) { | ||
8442 | struct sched_domain *sd = NULL, *p; | ||
8443 | |||
8444 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); | ||
8445 | |||
8446 | #ifdef CONFIG_NUMA | ||
8447 | if (cpumask_weight(cpu_map) > | ||
8448 | SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { | ||
8449 | sd = &per_cpu(allnodes_domains, i).sd; | ||
8450 | SD_INIT(sd, ALLNODES); | ||
8451 | set_domain_attribute(sd, attr); | ||
8452 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
8453 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
8454 | p = sd; | ||
8455 | sd_allnodes = 1; | ||
8456 | } else | ||
8457 | p = NULL; | ||
8458 | 8629 | ||
8459 | sd = &per_cpu(node_domains, i).sd; | 8630 | d->sd_allnodes = 0; |
8460 | SD_INIT(sd, NODE); | 8631 | if (cpumask_weight(cpu_map) > |
8632 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { | ||
8633 | sd = &per_cpu(allnodes_domains, i).sd; | ||
8634 | SD_INIT(sd, ALLNODES); | ||
8461 | set_domain_attribute(sd, attr); | 8635 | set_domain_attribute(sd, attr); |
8462 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | 8636 | cpumask_copy(sched_domain_span(sd), cpu_map); |
8463 | sd->parent = p; | 8637 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); |
8464 | if (p) | 8638 | d->sd_allnodes = 1; |
8465 | p->child = sd; | 8639 | } |
8466 | cpumask_and(sched_domain_span(sd), | 8640 | parent = sd; |
8467 | sched_domain_span(sd), cpu_map); | 8641 | |
8642 | sd = &per_cpu(node_domains, i).sd; | ||
8643 | SD_INIT(sd, NODE); | ||
8644 | set_domain_attribute(sd, attr); | ||
8645 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
8646 | sd->parent = parent; | ||
8647 | if (parent) | ||
8648 | parent->child = sd; | ||
8649 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
8468 | #endif | 8650 | #endif |
8651 | return sd; | ||
8652 | } | ||
8469 | 8653 | ||
8470 | p = sd; | 8654 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, |
8471 | sd = &per_cpu(phys_domains, i).sd; | 8655 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
8472 | SD_INIT(sd, CPU); | 8656 | struct sched_domain *parent, int i) |
8473 | set_domain_attribute(sd, attr); | 8657 | { |
8474 | cpumask_copy(sched_domain_span(sd), nodemask); | 8658 | struct sched_domain *sd; |
8475 | sd->parent = p; | 8659 | sd = &per_cpu(phys_domains, i).sd; |
8476 | if (p) | 8660 | SD_INIT(sd, CPU); |
8477 | p->child = sd; | 8661 | set_domain_attribute(sd, attr); |
8478 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); | 8662 | cpumask_copy(sched_domain_span(sd), d->nodemask); |
8663 | sd->parent = parent; | ||
8664 | if (parent) | ||
8665 | parent->child = sd; | ||
8666 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
8667 | return sd; | ||
8668 | } | ||
8479 | 8669 | ||
8670 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | ||
8671 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8672 | struct sched_domain *parent, int i) | ||
8673 | { | ||
8674 | struct sched_domain *sd = parent; | ||
8480 | #ifdef CONFIG_SCHED_MC | 8675 | #ifdef CONFIG_SCHED_MC |
8481 | p = sd; | 8676 | sd = &per_cpu(core_domains, i).sd; |
8482 | sd = &per_cpu(core_domains, i).sd; | 8677 | SD_INIT(sd, MC); |
8483 | SD_INIT(sd, MC); | 8678 | set_domain_attribute(sd, attr); |
8484 | set_domain_attribute(sd, attr); | 8679 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); |
8485 | cpumask_and(sched_domain_span(sd), cpu_map, | 8680 | sd->parent = parent; |
8486 | cpu_coregroup_mask(i)); | 8681 | parent->child = sd; |
8487 | sd->parent = p; | 8682 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); |
8488 | p->child = sd; | ||
8489 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); | ||
8490 | #endif | 8683 | #endif |
8684 | return sd; | ||
8685 | } | ||
8491 | 8686 | ||
8687 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
8688 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8689 | struct sched_domain *parent, int i) | ||
8690 | { | ||
8691 | struct sched_domain *sd = parent; | ||
8492 | #ifdef CONFIG_SCHED_SMT | 8692 | #ifdef CONFIG_SCHED_SMT |
8493 | p = sd; | 8693 | sd = &per_cpu(cpu_domains, i).sd; |
8494 | sd = &per_cpu(cpu_domains, i).sd; | 8694 | SD_INIT(sd, SIBLING); |
8495 | SD_INIT(sd, SIBLING); | 8695 | set_domain_attribute(sd, attr); |
8496 | set_domain_attribute(sd, attr); | 8696 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); |
8497 | cpumask_and(sched_domain_span(sd), | 8697 | sd->parent = parent; |
8498 | topology_thread_cpumask(i), cpu_map); | 8698 | parent->child = sd; |
8499 | sd->parent = p; | 8699 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); |
8500 | p->child = sd; | ||
8501 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); | ||
8502 | #endif | 8700 | #endif |
8503 | } | 8701 | return sd; |
8702 | } | ||
8504 | 8703 | ||
8704 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | ||
8705 | const struct cpumask *cpu_map, int cpu) | ||
8706 | { | ||
8707 | switch (l) { | ||
8505 | #ifdef CONFIG_SCHED_SMT | 8708 | #ifdef CONFIG_SCHED_SMT |
8506 | /* Set up CPU (sibling) groups */ | 8709 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ |
8507 | for_each_cpu(i, cpu_map) { | 8710 | cpumask_and(d->this_sibling_map, cpu_map, |
8508 | cpumask_and(this_sibling_map, | 8711 | topology_thread_cpumask(cpu)); |
8509 | topology_thread_cpumask(i), cpu_map); | 8712 | if (cpu == cpumask_first(d->this_sibling_map)) |
8510 | if (i != cpumask_first(this_sibling_map)) | 8713 | init_sched_build_groups(d->this_sibling_map, cpu_map, |
8511 | continue; | 8714 | &cpu_to_cpu_group, |
8512 | 8715 | d->send_covered, d->tmpmask); | |
8513 | init_sched_build_groups(this_sibling_map, cpu_map, | 8716 | break; |
8514 | &cpu_to_cpu_group, | ||
8515 | send_covered, tmpmask); | ||
8516 | } | ||
8517 | #endif | 8717 | #endif |
8518 | |||
8519 | #ifdef CONFIG_SCHED_MC | 8718 | #ifdef CONFIG_SCHED_MC |
8520 | /* Set up multi-core groups */ | 8719 | case SD_LV_MC: /* set up multi-core groups */ |
8521 | for_each_cpu(i, cpu_map) { | 8720 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); |
8522 | cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); | 8721 | if (cpu == cpumask_first(d->this_core_map)) |
8523 | if (i != cpumask_first(this_core_map)) | 8722 | init_sched_build_groups(d->this_core_map, cpu_map, |
8524 | continue; | 8723 | &cpu_to_core_group, |
8525 | 8724 | d->send_covered, d->tmpmask); | |
8526 | init_sched_build_groups(this_core_map, cpu_map, | 8725 | break; |
8527 | &cpu_to_core_group, | ||
8528 | send_covered, tmpmask); | ||
8529 | } | ||
8530 | #endif | 8726 | #endif |
8531 | 8727 | case SD_LV_CPU: /* set up physical groups */ | |
8532 | /* Set up physical groups */ | 8728 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
8533 | for (i = 0; i < nr_node_ids; i++) { | 8729 | if (!cpumask_empty(d->nodemask)) |
8534 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8730 | init_sched_build_groups(d->nodemask, cpu_map, |
8535 | if (cpumask_empty(nodemask)) | 8731 | &cpu_to_phys_group, |
8536 | continue; | 8732 | d->send_covered, d->tmpmask); |
8537 | 8733 | break; | |
8538 | init_sched_build_groups(nodemask, cpu_map, | ||
8539 | &cpu_to_phys_group, | ||
8540 | send_covered, tmpmask); | ||
8541 | } | ||
8542 | |||
8543 | #ifdef CONFIG_NUMA | 8734 | #ifdef CONFIG_NUMA |
8544 | /* Set up node groups */ | 8735 | case SD_LV_ALLNODES: |
8545 | if (sd_allnodes) { | 8736 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, |
8546 | init_sched_build_groups(cpu_map, cpu_map, | 8737 | d->send_covered, d->tmpmask); |
8547 | &cpu_to_allnodes_group, | 8738 | break; |
8548 | send_covered, tmpmask); | 8739 | #endif |
8740 | default: | ||
8741 | break; | ||
8549 | } | 8742 | } |
8743 | } | ||
8550 | 8744 | ||
8551 | for (i = 0; i < nr_node_ids; i++) { | 8745 | /* |
8552 | /* Set up node groups */ | 8746 | * Build sched domains for a given set of cpus and attach the sched domains |
8553 | struct sched_group *sg, *prev; | 8747 | * to the individual cpus |
8554 | int j; | 8748 | */ |
8555 | 8749 | static int __build_sched_domains(const struct cpumask *cpu_map, | |
8556 | cpumask_clear(covered); | 8750 | struct sched_domain_attr *attr) |
8557 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8751 | { |
8558 | if (cpumask_empty(nodemask)) { | 8752 | enum s_alloc alloc_state = sa_none; |
8559 | sched_group_nodes[i] = NULL; | 8753 | struct s_data d; |
8560 | continue; | 8754 | struct sched_domain *sd; |
8561 | } | 8755 | int i; |
8756 | #ifdef CONFIG_NUMA | ||
8757 | d.sd_allnodes = 0; | ||
8758 | #endif | ||
8562 | 8759 | ||
8563 | sched_domain_node_span(i, domainspan); | 8760 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
8564 | cpumask_and(domainspan, domainspan, cpu_map); | 8761 | if (alloc_state != sa_rootdomain) |
8762 | goto error; | ||
8763 | alloc_state = sa_sched_groups; | ||
8565 | 8764 | ||
8566 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 8765 | /* |
8567 | GFP_KERNEL, i); | 8766 | * Set up domains for cpus specified by the cpu_map. |
8568 | if (!sg) { | 8767 | */ |
8569 | printk(KERN_WARNING "Can not alloc domain group for " | 8768 | for_each_cpu(i, cpu_map) { |
8570 | "node %d\n", i); | 8769 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), |
8571 | goto error; | 8770 | cpu_map); |
8572 | } | ||
8573 | sched_group_nodes[i] = sg; | ||
8574 | for_each_cpu(j, nodemask) { | ||
8575 | struct sched_domain *sd; | ||
8576 | 8771 | ||
8577 | sd = &per_cpu(node_domains, j).sd; | 8772 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
8578 | sd->groups = sg; | 8773 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
8579 | } | 8774 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
8580 | sg->__cpu_power = 0; | 8775 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
8581 | cpumask_copy(sched_group_cpus(sg), nodemask); | 8776 | } |
8582 | sg->next = sg; | ||
8583 | cpumask_or(covered, covered, nodemask); | ||
8584 | prev = sg; | ||
8585 | 8777 | ||
8586 | for (j = 0; j < nr_node_ids; j++) { | 8778 | for_each_cpu(i, cpu_map) { |
8587 | int n = (i + j) % nr_node_ids; | 8779 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
8780 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
8781 | } | ||
8588 | 8782 | ||
8589 | cpumask_complement(notcovered, covered); | 8783 | /* Set up physical groups */ |
8590 | cpumask_and(tmpmask, notcovered, cpu_map); | 8784 | for (i = 0; i < nr_node_ids; i++) |
8591 | cpumask_and(tmpmask, tmpmask, domainspan); | 8785 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); |
8592 | if (cpumask_empty(tmpmask)) | ||
8593 | break; | ||
8594 | 8786 | ||
8595 | cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); | 8787 | #ifdef CONFIG_NUMA |
8596 | if (cpumask_empty(tmpmask)) | 8788 | /* Set up node groups */ |
8597 | continue; | 8789 | if (d.sd_allnodes) |
8790 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
8598 | 8791 | ||
8599 | sg = kmalloc_node(sizeof(struct sched_group) + | 8792 | for (i = 0; i < nr_node_ids; i++) |
8600 | cpumask_size(), | 8793 | if (build_numa_sched_groups(&d, cpu_map, i)) |
8601 | GFP_KERNEL, i); | 8794 | goto error; |
8602 | if (!sg) { | ||
8603 | printk(KERN_WARNING | ||
8604 | "Can not alloc domain group for node %d\n", j); | ||
8605 | goto error; | ||
8606 | } | ||
8607 | sg->__cpu_power = 0; | ||
8608 | cpumask_copy(sched_group_cpus(sg), tmpmask); | ||
8609 | sg->next = prev->next; | ||
8610 | cpumask_or(covered, covered, tmpmask); | ||
8611 | prev->next = sg; | ||
8612 | prev = sg; | ||
8613 | } | ||
8614 | } | ||
8615 | #endif | 8795 | #endif |
8616 | 8796 | ||
8617 | /* Calculate CPU power for physical packages and nodes */ | 8797 | /* Calculate CPU power for physical packages and nodes */ |
8618 | #ifdef CONFIG_SCHED_SMT | 8798 | #ifdef CONFIG_SCHED_SMT |
8619 | for_each_cpu(i, cpu_map) { | 8799 | for_each_cpu(i, cpu_map) { |
8620 | struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; | 8800 | sd = &per_cpu(cpu_domains, i).sd; |
8621 | |||
8622 | init_sched_groups_power(i, sd); | 8801 | init_sched_groups_power(i, sd); |
8623 | } | 8802 | } |
8624 | #endif | 8803 | #endif |
8625 | #ifdef CONFIG_SCHED_MC | 8804 | #ifdef CONFIG_SCHED_MC |
8626 | for_each_cpu(i, cpu_map) { | 8805 | for_each_cpu(i, cpu_map) { |
8627 | struct sched_domain *sd = &per_cpu(core_domains, i).sd; | 8806 | sd = &per_cpu(core_domains, i).sd; |
8628 | |||
8629 | init_sched_groups_power(i, sd); | 8807 | init_sched_groups_power(i, sd); |
8630 | } | 8808 | } |
8631 | #endif | 8809 | #endif |
8632 | 8810 | ||
8633 | for_each_cpu(i, cpu_map) { | 8811 | for_each_cpu(i, cpu_map) { |
8634 | struct sched_domain *sd = &per_cpu(phys_domains, i).sd; | 8812 | sd = &per_cpu(phys_domains, i).sd; |
8635 | |||
8636 | init_sched_groups_power(i, sd); | 8813 | init_sched_groups_power(i, sd); |
8637 | } | 8814 | } |
8638 | 8815 | ||
8639 | #ifdef CONFIG_NUMA | 8816 | #ifdef CONFIG_NUMA |
8640 | for (i = 0; i < nr_node_ids; i++) | 8817 | for (i = 0; i < nr_node_ids; i++) |
8641 | init_numa_sched_groups_power(sched_group_nodes[i]); | 8818 | init_numa_sched_groups_power(d.sched_group_nodes[i]); |
8642 | 8819 | ||
8643 | if (sd_allnodes) { | 8820 | if (d.sd_allnodes) { |
8644 | struct sched_group *sg; | 8821 | struct sched_group *sg; |
8645 | 8822 | ||
8646 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 8823 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, |
8647 | tmpmask); | 8824 | d.tmpmask); |
8648 | init_numa_sched_groups_power(sg); | 8825 | init_numa_sched_groups_power(sg); |
8649 | } | 8826 | } |
8650 | #endif | 8827 | #endif |
8651 | 8828 | ||
8652 | /* Attach the domains */ | 8829 | /* Attach the domains */ |
8653 | for_each_cpu(i, cpu_map) { | 8830 | for_each_cpu(i, cpu_map) { |
8654 | struct sched_domain *sd; | ||
8655 | #ifdef CONFIG_SCHED_SMT | 8831 | #ifdef CONFIG_SCHED_SMT |
8656 | sd = &per_cpu(cpu_domains, i).sd; | 8832 | sd = &per_cpu(cpu_domains, i).sd; |
8657 | #elif defined(CONFIG_SCHED_MC) | 8833 | #elif defined(CONFIG_SCHED_MC) |
@@ -8659,44 +8835,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
8659 | #else | 8835 | #else |
8660 | sd = &per_cpu(phys_domains, i).sd; | 8836 | sd = &per_cpu(phys_domains, i).sd; |
8661 | #endif | 8837 | #endif |
8662 | cpu_attach_domain(sd, rd, i); | 8838 | cpu_attach_domain(sd, d.rd, i); |
8663 | } | 8839 | } |
8664 | 8840 | ||
8665 | err = 0; | 8841 | d.sched_group_nodes = NULL; /* don't free this we still need it */ |
8666 | 8842 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | |
8667 | free_tmpmask: | 8843 | return 0; |
8668 | free_cpumask_var(tmpmask); | ||
8669 | free_send_covered: | ||
8670 | free_cpumask_var(send_covered); | ||
8671 | free_this_core_map: | ||
8672 | free_cpumask_var(this_core_map); | ||
8673 | free_this_sibling_map: | ||
8674 | free_cpumask_var(this_sibling_map); | ||
8675 | free_nodemask: | ||
8676 | free_cpumask_var(nodemask); | ||
8677 | free_notcovered: | ||
8678 | #ifdef CONFIG_NUMA | ||
8679 | free_cpumask_var(notcovered); | ||
8680 | free_covered: | ||
8681 | free_cpumask_var(covered); | ||
8682 | free_domainspan: | ||
8683 | free_cpumask_var(domainspan); | ||
8684 | out: | ||
8685 | #endif | ||
8686 | return err; | ||
8687 | |||
8688 | free_sched_groups: | ||
8689 | #ifdef CONFIG_NUMA | ||
8690 | kfree(sched_group_nodes); | ||
8691 | #endif | ||
8692 | goto free_tmpmask; | ||
8693 | 8844 | ||
8694 | #ifdef CONFIG_NUMA | ||
8695 | error: | 8845 | error: |
8696 | free_sched_groups(cpu_map, tmpmask); | 8846 | __free_domain_allocs(&d, alloc_state, cpu_map); |
8697 | free_rootdomain(rd); | 8847 | return -ENOMEM; |
8698 | goto free_tmpmask; | ||
8699 | #endif | ||
8700 | } | 8848 | } |
8701 | 8849 | ||
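The success and error paths above now funnel through a single __free_domain_allocs() call driven by an allocation-state value, replacing the old ladder of goto cleanup labels. The fragment below is only a minimal sketch of that staged-teardown pattern, not the patch's actual definitions: enum s_alloc_sketch, struct my_domain_bufs and free_allocs_sketch() are illustrative names; free_cpumask_var() and free_rootdomain() are the existing helpers this file already uses.

enum s_alloc_sketch {                    /* how far allocation got */
        sa_none,
        sa_mask_a,
        sa_mask_b,
        sa_rootdomain,
};

struct my_domain_bufs {                  /* hypothetical bundle of build-time buffers */
        cpumask_var_t mask_a, mask_b;
        struct root_domain *rd;
};

static void free_allocs_sketch(struct my_domain_bufs *d,
                               enum s_alloc_sketch state)
{
        switch (state) {                 /* deliberate fall-through: free only
                                          * what was actually allocated */
        case sa_rootdomain:
                free_rootdomain(d->rd);
        case sa_mask_b:
                free_cpumask_var(d->mask_b);
        case sa_mask_a:
                free_cpumask_var(d->mask_a);
        case sa_none:
                break;
        }
}

The caller records the last state it reached; on error it passes that state, and on success it passes the state just below the allocations it wants to keep, which is exactly how the patch preserves sched_group_nodes above.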
8702 | static int build_sched_domains(const struct cpumask *cpu_map) | 8850 | static int build_sched_domains(const struct cpumask *cpu_map) |
@@ -9015,6 +9163,7 @@ void __init sched_init_smp(void) | |||
9015 | cpumask_var_t non_isolated_cpus; | 9163 | cpumask_var_t non_isolated_cpus; |
9016 | 9164 | ||
9017 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 9165 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
9166 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | ||
9018 | 9167 | ||
9019 | #if defined(CONFIG_NUMA) | 9168 | #if defined(CONFIG_NUMA) |
9020 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | 9169 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), |
@@ -9046,7 +9195,6 @@ void __init sched_init_smp(void) | |||
9046 | sched_init_granularity(); | 9195 | sched_init_granularity(); |
9047 | free_cpumask_var(non_isolated_cpus); | 9196 | free_cpumask_var(non_isolated_cpus); |
9048 | 9197 | ||
9049 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | ||
9050 | init_sched_rt_class(); | 9198 | init_sched_rt_class(); |
9051 | } | 9199 | } |
9052 | #else | 9200 | #else |
@@ -9304,11 +9452,11 @@ void __init sched_init(void) | |||
9304 | * system cpu resource, based on the weight assigned to root | 9452 | * system cpu resource, based on the weight assigned to root |
9305 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | 9453 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished |
9306 | * by letting tasks of init_task_group sit in a separate cfs_rq | 9454 | * by letting tasks of init_task_group sit in a separate cfs_rq |
9307 | * (init_cfs_rq) and having one entity represent this group of | 9455 | * (init_tg_cfs_rq) and having one entity represent this group of |
9308 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | 9456 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). |
9309 | */ | 9457 | */ |
9310 | init_tg_cfs_entry(&init_task_group, | 9458 | init_tg_cfs_entry(&init_task_group, |
9311 | &per_cpu(init_cfs_rq, i), | 9459 | &per_cpu(init_tg_cfs_rq, i), |
9312 | &per_cpu(init_sched_entity, i), i, 1, | 9460 | &per_cpu(init_sched_entity, i), i, 1, |
9313 | root_task_group.se[i]); | 9461 | root_task_group.se[i]); |
9314 | 9462 | ||
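As the comment above describes, a task group competes for CPU time through one scheduling entity per CPU enqueued on the parent runqueue, while the group's own tasks queue on a group-local cfs_rq. The layout below is only an illustrative sketch of that relationship (fixed-size arrays and field names simplified, not the kernel's actual struct task_group):

struct task_group_sketch {
        struct sched_entity *se[NR_CPUS];   /* group's entity in the parent cfs_rq */
        struct cfs_rq *cfs_rq[NR_CPUS];     /* per-CPU queue holding the group's tasks */
        unsigned long shares;               /* weight of this group vs. its siblings */
};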
@@ -9334,6 +9482,7 @@ void __init sched_init(void) | |||
9334 | #ifdef CONFIG_SMP | 9482 | #ifdef CONFIG_SMP |
9335 | rq->sd = NULL; | 9483 | rq->sd = NULL; |
9336 | rq->rd = NULL; | 9484 | rq->rd = NULL; |
9485 | rq->post_schedule = 0; | ||
9337 | rq->active_balance = 0; | 9486 | rq->active_balance = 0; |
9338 | rq->next_balance = jiffies; | 9487 | rq->next_balance = jiffies; |
9339 | rq->push_cpu = 0; | 9488 | rq->push_cpu = 0; |
@@ -9392,19 +9541,26 @@ void __init sched_init(void) | |||
9392 | alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 9541 | alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
9393 | #endif /* SMP */ | 9542 | #endif /* SMP */ |
9394 | 9543 | ||
9395 | perf_counter_init(); | 9544 | perf_event_init(); |
9396 | 9545 | ||
9397 | scheduler_running = 1; | 9546 | scheduler_running = 1; |
9398 | } | 9547 | } |
9399 | 9548 | ||
9400 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 9549 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
9401 | void __might_sleep(char *file, int line) | 9550 | static inline int preempt_count_equals(int preempt_offset) |
9551 | { | ||
9552 | int nested = preempt_count() & ~PREEMPT_ACTIVE; | ||
9553 | |||
9554 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | ||
9555 | } | ||
9556 | |||
9557 | void __might_sleep(char *file, int line, int preempt_offset) | ||
9402 | { | 9558 | { |
9403 | #ifdef in_atomic | 9559 | #ifdef in_atomic |
9404 | static unsigned long prev_jiffy; /* ratelimiting */ | 9560 | static unsigned long prev_jiffy; /* ratelimiting */ |
9405 | 9561 | ||
9406 | if ((!in_atomic() && !irqs_disabled()) || | 9562 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
9407 | system_state != SYSTEM_RUNNING || oops_in_progress) | 9563 | system_state != SYSTEM_RUNNING || oops_in_progress) |
9408 | return; | 9564 | return; |
9409 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 9565 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
9410 | return; | 9566 | return; |
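preempt_count_equals() above changes the sleep-in-atomic check from "no preemption disabled at all" to "preemption disabled by exactly the expected amount", so a caller that legitimately runs with a known non-zero preempt count can still use the debug check. The wrapper below is purely hypothetical (not part of this patch) and assumes a CONFIG_PREEMPT kernel, where spin_lock() raises the preempt count by one:

/*
 * Hypothetical wrapper (illustration only): a function entered with
 * exactly one spinlock held, which drops that lock before it actually
 * sleeps, can pass an offset of 1 so the check still catches any
 * unexpected atomic context or disabled interrupts.
 */
#define might_sleep_and_drop_lock() \
        __might_sleep(__FILE__, __LINE__, 1)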
@@ -10157,7 +10313,7 @@ static int sched_rt_global_constraints(void) | |||
10157 | #endif /* CONFIG_RT_GROUP_SCHED */ | 10313 | #endif /* CONFIG_RT_GROUP_SCHED */ |
10158 | 10314 | ||
10159 | int sched_rt_handler(struct ctl_table *table, int write, | 10315 | int sched_rt_handler(struct ctl_table *table, int write, |
10160 | struct file *filp, void __user *buffer, size_t *lenp, | 10316 | void __user *buffer, size_t *lenp, |
10161 | loff_t *ppos) | 10317 | loff_t *ppos) |
10162 | { | 10318 | { |
10163 | int ret; | 10319 | int ret; |
@@ -10168,7 +10324,7 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
10168 | old_period = sysctl_sched_rt_period; | 10324 | old_period = sysctl_sched_rt_period; |
10169 | old_runtime = sysctl_sched_rt_runtime; | 10325 | old_runtime = sysctl_sched_rt_runtime; |
10170 | 10326 | ||
10171 | ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); | 10327 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
10172 | 10328 | ||
10173 | if (!ret && write) { | 10329 | if (!ret && write) { |
10174 | ret = sched_rt_global_constraints(); | 10330 | ret = sched_rt_global_constraints(); |
@@ -10222,8 +10378,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
10222 | } | 10378 | } |
10223 | 10379 | ||
10224 | static int | 10380 | static int |
10225 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 10381 | cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
10226 | struct task_struct *tsk) | ||
10227 | { | 10382 | { |
10228 | #ifdef CONFIG_RT_GROUP_SCHED | 10383 | #ifdef CONFIG_RT_GROUP_SCHED |
10229 | if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) | 10384 | if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) |
@@ -10233,15 +10388,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
10233 | if (tsk->sched_class != &fair_sched_class) | 10388 | if (tsk->sched_class != &fair_sched_class) |
10234 | return -EINVAL; | 10389 | return -EINVAL; |
10235 | #endif | 10390 | #endif |
10391 | return 0; | ||
10392 | } | ||
10236 | 10393 | ||
10394 | static int | ||
10395 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
10396 | struct task_struct *tsk, bool threadgroup) | ||
10397 | { | ||
10398 | int retval = cpu_cgroup_can_attach_task(cgrp, tsk); | ||
10399 | if (retval) | ||
10400 | return retval; | ||
10401 | if (threadgroup) { | ||
10402 | struct task_struct *c; | ||
10403 | rcu_read_lock(); | ||
10404 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
10405 | retval = cpu_cgroup_can_attach_task(cgrp, c); | ||
10406 | if (retval) { | ||
10407 | rcu_read_unlock(); | ||
10408 | return retval; | ||
10409 | } | ||
10410 | } | ||
10411 | rcu_read_unlock(); | ||
10412 | } | ||
10237 | return 0; | 10413 | return 0; |
10238 | } | 10414 | } |
10239 | 10415 | ||
10240 | static void | 10416 | static void |
10241 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 10417 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
10242 | struct cgroup *old_cont, struct task_struct *tsk) | 10418 | struct cgroup *old_cont, struct task_struct *tsk, |
10419 | bool threadgroup) | ||
10243 | { | 10420 | { |
10244 | sched_move_task(tsk); | 10421 | sched_move_task(tsk); |
10422 | if (threadgroup) { | ||
10423 | struct task_struct *c; | ||
10424 | rcu_read_lock(); | ||
10425 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
10426 | sched_move_task(c); | ||
10427 | } | ||
10428 | rcu_read_unlock(); | ||
10429 | } | ||
10245 | } | 10430 | } |
10246 | 10431 | ||
10247 | #ifdef CONFIG_FAIR_GROUP_SCHED | 10432 | #ifdef CONFIG_FAIR_GROUP_SCHED |
@@ -10581,3 +10766,113 @@ struct cgroup_subsys cpuacct_subsys = { | |||
10581 | .subsys_id = cpuacct_subsys_id, | 10766 | .subsys_id = cpuacct_subsys_id, |
10582 | }; | 10767 | }; |
10583 | #endif /* CONFIG_CGROUP_CPUACCT */ | 10768 | #endif /* CONFIG_CGROUP_CPUACCT */ |
10769 | |||
10770 | #ifndef CONFIG_SMP | ||
10771 | |||
10772 | int rcu_expedited_torture_stats(char *page) | ||
10773 | { | ||
10774 | return 0; | ||
10775 | } | ||
10776 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
10777 | |||
10778 | void synchronize_sched_expedited(void) | ||
10779 | { | ||
10780 | } | ||
10781 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
10782 | |||
10783 | #else /* #ifndef CONFIG_SMP */ | ||
10784 | |||
10785 | static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); | ||
10786 | static DEFINE_MUTEX(rcu_sched_expedited_mutex); | ||
10787 | |||
10788 | #define RCU_EXPEDITED_STATE_POST -2 | ||
10789 | #define RCU_EXPEDITED_STATE_IDLE -1 | ||
10790 | |||
10791 | static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
10792 | |||
10793 | int rcu_expedited_torture_stats(char *page) | ||
10794 | { | ||
10795 | int cnt = 0; | ||
10796 | int cpu; | ||
10797 | |||
10798 | cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); | ||
10799 | for_each_online_cpu(cpu) { | ||
10800 | cnt += sprintf(&page[cnt], " %d:%d", | ||
10801 | cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); | ||
10802 | } | ||
10803 | cnt += sprintf(&page[cnt], "\n"); | ||
10804 | return cnt; | ||
10805 | } | ||
10806 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
10807 | |||
10808 | static long synchronize_sched_expedited_count; | ||
10809 | |||
10810 | /* | ||
10811 | * Wait for an rcu-sched grace period to elapse, but use a "big hammer" | ||
10812 | * approach to force the grace period to end quickly. This consumes | ||
10813 | * significant time on all CPUs, and is thus not recommended for | ||
10814 | * any sort of common-case code. | ||
10815 | * | ||
10816 | * Note that it is illegal to call this function while holding any | ||
10817 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
10818 | * observe this restriction will result in deadlock. | ||
10819 | */ | ||
10820 | void synchronize_sched_expedited(void) | ||
10821 | { | ||
10822 | int cpu; | ||
10823 | unsigned long flags; | ||
10824 | bool need_full_sync = 0; | ||
10825 | struct rq *rq; | ||
10826 | struct migration_req *req; | ||
10827 | long snap; | ||
10828 | int trycount = 0; | ||
10829 | |||
10830 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
10831 | snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; | ||
10832 | get_online_cpus(); | ||
10833 | while (!mutex_trylock(&rcu_sched_expedited_mutex)) { | ||
10834 | put_online_cpus(); | ||
10835 | if (trycount++ < 10) | ||
10836 | udelay(trycount * num_online_cpus()); | ||
10837 | else { | ||
10838 | synchronize_sched(); | ||
10839 | return; | ||
10840 | } | ||
10841 | if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { | ||
10842 | smp_mb(); /* ensure test happens before caller kfree */ | ||
10843 | return; | ||
10844 | } | ||
10845 | get_online_cpus(); | ||
10846 | } | ||
10847 | rcu_expedited_state = RCU_EXPEDITED_STATE_POST; | ||
10848 | for_each_online_cpu(cpu) { | ||
10849 | rq = cpu_rq(cpu); | ||
10850 | req = &per_cpu(rcu_migration_req, cpu); | ||
10851 | init_completion(&req->done); | ||
10852 | req->task = NULL; | ||
10853 | req->dest_cpu = RCU_MIGRATION_NEED_QS; | ||
10854 | spin_lock_irqsave(&rq->lock, flags); | ||
10855 | list_add(&req->list, &rq->migration_queue); | ||
10856 | spin_unlock_irqrestore(&rq->lock, flags); | ||
10857 | wake_up_process(rq->migration_thread); | ||
10858 | } | ||
10859 | for_each_online_cpu(cpu) { | ||
10860 | rcu_expedited_state = cpu; | ||
10861 | req = &per_cpu(rcu_migration_req, cpu); | ||
10862 | rq = cpu_rq(cpu); | ||
10863 | wait_for_completion(&req->done); | ||
10864 | spin_lock_irqsave(&rq->lock, flags); | ||
10865 | if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) | ||
10866 | need_full_sync = 1; | ||
10867 | req->dest_cpu = RCU_MIGRATION_IDLE; | ||
10868 | spin_unlock_irqrestore(&rq->lock, flags); | ||
10869 | } | ||
10870 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
10871 | mutex_unlock(&rcu_sched_expedited_mutex); | ||
10872 | put_online_cpus(); | ||
10873 | if (need_full_sync) | ||
10874 | synchronize_sched(); | ||
10875 | } | ||
10876 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
10877 | |||
10878 | #endif /* #else #ifndef CONFIG_SMP */ | ||
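As the comment in the function notes, synchronize_sched_expedited() trades CPU time on every online CPU for a much shorter grace-period wait, and must not be called while holding any lock also taken by a CPU-hotplug notifier. The fragment below is a hedged usage sketch, not taken from this patch: struct my_data, my_data_lock, global_ptr and my_update() are made-up names illustrating a typical RCU-sched update path that wants the grace period over quickly.

#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_data {
        int value;
};

static struct my_data *global_ptr;
static DEFINE_SPINLOCK(my_data_lock);

/*
 * Publish a new version and free the old one once all pre-existing
 * rcu_read_lock_sched() / preempt-disabled readers have finished.
 */
void my_update(struct my_data *new)
{
        struct my_data *old;

        spin_lock(&my_data_lock);
        old = global_ptr;
        rcu_assign_pointer(global_ptr, new);
        spin_unlock(&my_data_lock);

        /*
         * Expensive on all CPUs; reserve for rare, latency-critical
         * updates, and never call it under a lock that a CPU-hotplug
         * notifier might also take.
         */
        synchronize_sched_expedited();
        kfree(old);
}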