Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 1738 |
1 file changed, 1016 insertions(+), 722 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..1535f3884b88 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@ | |||
39 | #include <linux/completion.h> | 39 | #include <linux/completion.h> |
40 | #include <linux/kernel_stat.h> | 40 | #include <linux/kernel_stat.h> |
41 | #include <linux/debug_locks.h> | 41 | #include <linux/debug_locks.h> |
42 | #include <linux/perf_counter.h> | 42 | #include <linux/perf_event.h> |
43 | #include <linux/security.h> | 43 | #include <linux/security.h> |
44 | #include <linux/notifier.h> | 44 | #include <linux/notifier.h> |
45 | #include <linux/profile.h> | 45 | #include <linux/profile.h> |
@@ -64,7 +64,6 @@ | |||
64 | #include <linux/tsacct_kern.h> | 64 | #include <linux/tsacct_kern.h> |
65 | #include <linux/kprobes.h> | 65 | #include <linux/kprobes.h> |
66 | #include <linux/delayacct.h> | 66 | #include <linux/delayacct.h> |
67 | #include <linux/reciprocal_div.h> | ||
68 | #include <linux/unistd.h> | 67 | #include <linux/unistd.h> |
69 | #include <linux/pagemap.h> | 68 | #include <linux/pagemap.h> |
70 | #include <linux/hrtimer.h> | 69 | #include <linux/hrtimer.h> |
@@ -120,30 +119,6 @@ | |||
120 | */ | 119 | */ |
121 | #define RUNTIME_INF ((u64)~0ULL) | 120 | #define RUNTIME_INF ((u64)~0ULL) |
122 | 121 | ||
123 | #ifdef CONFIG_SMP | ||
124 | |||
125 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
126 | |||
127 | /* | ||
128 | * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) | ||
129 | * Since cpu_power is a 'constant', we can use a reciprocal divide. | ||
130 | */ | ||
131 | static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) | ||
132 | { | ||
133 | return reciprocal_divide(load, sg->reciprocal_cpu_power); | ||
134 | } | ||
135 | |||
136 | /* | ||
137 | * Each time a sched group cpu_power is changed, | ||
138 | * we must compute its reciprocal value | ||
139 | */ | ||
140 | static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) | ||
141 | { | ||
142 | sg->__cpu_power += val; | ||
143 | sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); | ||
144 | } | ||
145 | #endif | ||
146 | |||
147 | static inline int rt_policy(int policy) | 122 | static inline int rt_policy(int policy) |
148 | { | 123 | { |
149 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) | 124 | if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) |
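The helpers deleted above existed only to avoid a runtime division: sg->reciprocal_cpu_power cached reciprocal_value(__cpu_power) so load / cpu_power could be done as a multiply-and-shift, which only pays off while cpu_power is effectively constant. Later hunks turn cpu_power into a regularly recomputed quantity (frequency, SMT and RT-pressure scaling), so callers now divide directly. A minimal userspace sketch of the trick being dropped, assuming the 2.6.31-era lib/reciprocal_div definitions; illustrative only:

/* Sketch of the reciprocal-divide trick the removed helpers relied on. */
#include <stdint.h>
#include <stdio.h>

static uint32_t reciprocal_value(uint32_t k)
{
	return (uint32_t)(((1ULL << 32) + k - 1) / k);	/* ceil(2^32 / k) */
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);	/* a / k without a div */
}

int main(void)
{
	uint32_t cpu_power = 1024, r = reciprocal_value(cpu_power);
	uint32_t load = 3000;

	/* Old path: cached reciprocal.  New path: plain load / cpu_power. */
	printf("reciprocal: %u  plain: %u\n",
	       reciprocal_divide(load, r), load / cpu_power);
	return 0;
}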
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user) | |||
309 | 284 | ||
310 | /* | 285 | /* |
311 | * Root task group. | 286 | * Root task group. |
312 | * Every UID task group (including init_task_group aka UID-0) will | 287 | * Every UID task group (including init_task_group aka UID-0) will |
313 | * be a child to this group. | 288 | * be a child to this group. |
314 | */ | 289 | */ |
315 | struct task_group root_task_group; | 290 | struct task_group root_task_group; |
316 | 291 | ||
@@ -318,12 +293,12 @@ struct task_group root_task_group; | |||
318 | /* Default task group's sched entity on each cpu */ | 293 | /* Default task group's sched entity on each cpu */ |
319 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | 294 | static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); |
320 | /* Default task group's cfs_rq on each cpu */ | 295 | /* Default task group's cfs_rq on each cpu */ |
321 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 296 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq); |
322 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 297 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
323 | 298 | ||
324 | #ifdef CONFIG_RT_GROUP_SCHED | 299 | #ifdef CONFIG_RT_GROUP_SCHED |
325 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | 300 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); |
326 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | 301 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); |
327 | #endif /* CONFIG_RT_GROUP_SCHED */ | 302 | #endif /* CONFIG_RT_GROUP_SCHED */ |
328 | #else /* !CONFIG_USER_SCHED */ | 303 | #else /* !CONFIG_USER_SCHED */ |
329 | #define root_task_group init_task_group | 304 | #define root_task_group init_task_group |
@@ -401,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) | |||
401 | 376 | ||
402 | #else | 377 | #else |
403 | 378 | ||
404 | #ifdef CONFIG_SMP | ||
405 | static int root_task_group_empty(void) | ||
406 | { | ||
407 | return 1; | ||
408 | } | ||
409 | #endif | ||
410 | |||
411 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } | 379 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
412 | static inline struct task_group *task_group(struct task_struct *p) | 380 | static inline struct task_group *task_group(struct task_struct *p) |
413 | { | 381 | { |
@@ -537,14 +505,6 @@ struct root_domain { | |||
537 | #ifdef CONFIG_SMP | 505 | #ifdef CONFIG_SMP |
538 | struct cpupri cpupri; | 506 | struct cpupri cpupri; |
539 | #endif | 507 | #endif |
540 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
541 | /* | ||
542 | * Preferred wake up cpu nominated by sched_mc balance that will be | ||
543 | * used when most cpus are idle in the system indicating overall very | ||
544 | * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) | ||
545 | */ | ||
546 | unsigned int sched_mc_preferred_wakeup_cpu; | ||
547 | #endif | ||
548 | }; | 508 | }; |
549 | 509 | ||
550 | /* | 510 | /* |
@@ -616,6 +576,7 @@ struct rq { | |||
616 | 576 | ||
617 | unsigned char idle_at_tick; | 577 | unsigned char idle_at_tick; |
618 | /* For active balancing */ | 578 | /* For active balancing */ |
579 | int post_schedule; | ||
619 | int active_balance; | 580 | int active_balance; |
620 | int push_cpu; | 581 | int push_cpu; |
621 | /* cpu of this runqueue: */ | 582 | /* cpu of this runqueue: */ |
@@ -626,6 +587,9 @@ struct rq { | |||
626 | 587 | ||
627 | struct task_struct *migration_thread; | 588 | struct task_struct *migration_thread; |
628 | struct list_head migration_queue; | 589 | struct list_head migration_queue; |
590 | |||
591 | u64 rt_avg; | ||
592 | u64 age_stamp; | ||
629 | #endif | 593 | #endif |
630 | 594 | ||
631 | /* calc_load related fields */ | 595 | /* calc_load related fields */ |
@@ -665,9 +629,10 @@ struct rq { | |||
665 | 629 | ||
666 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 630 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
667 | 631 | ||
668 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) | 632 | static inline |
633 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
669 | { | 634 | { |
670 | rq->curr->sched_class->check_preempt_curr(rq, p, sync); | 635 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); |
671 | } | 636 | } |
672 | 637 | ||
673 | static inline int cpu_of(struct rq *rq) | 638 | static inline int cpu_of(struct rq *rq) |
@@ -693,6 +658,7 @@ static inline int cpu_of(struct rq *rq) | |||
693 | #define this_rq() (&__get_cpu_var(runqueues)) | 658 | #define this_rq() (&__get_cpu_var(runqueues)) |
694 | #define task_rq(p) cpu_rq(task_cpu(p)) | 659 | #define task_rq(p) cpu_rq(task_cpu(p)) |
695 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 660 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
661 | #define raw_rq() (&__raw_get_cpu_var(runqueues)) | ||
696 | 662 | ||
697 | inline void update_rq_clock(struct rq *rq) | 663 | inline void update_rq_clock(struct rq *rq) |
698 | { | 664 | { |
@@ -715,15 +681,9 @@ inline void update_rq_clock(struct rq *rq) | |||
715 | * This interface allows printk to be called with the runqueue lock | 681 | * This interface allows printk to be called with the runqueue lock |
716 | * held and know whether or not it is OK to wake up the klogd. | 682 | * held and know whether or not it is OK to wake up the klogd. |
717 | */ | 683 | */ |
718 | int runqueue_is_locked(void) | 684 | int runqueue_is_locked(int cpu) |
719 | { | 685 | { |
720 | int cpu = get_cpu(); | 686 | return spin_is_locked(&cpu_rq(cpu)->lock); |
721 | struct rq *rq = cpu_rq(cpu); | ||
722 | int ret; | ||
723 | |||
724 | ret = spin_is_locked(&rq->lock); | ||
725 | put_cpu(); | ||
726 | return ret; | ||
727 | } | 687 | } |
728 | 688 | ||
729 | /* | 689 | /* |
@@ -820,7 +780,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp) | |||
820 | return single_open(filp, sched_feat_show, NULL); | 780 | return single_open(filp, sched_feat_show, NULL); |
821 | } | 781 | } |
822 | 782 | ||
823 | static struct file_operations sched_feat_fops = { | 783 | static const struct file_operations sched_feat_fops = { |
824 | .open = sched_feat_open, | 784 | .open = sched_feat_open, |
825 | .write = sched_feat_write, | 785 | .write = sched_feat_write, |
826 | .read = seq_read, | 786 | .read = seq_read, |
@@ -861,6 +821,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; | |||
861 | unsigned int sysctl_sched_shares_thresh = 4; | 821 | unsigned int sysctl_sched_shares_thresh = 4; |
862 | 822 | ||
863 | /* | 823 | /* |
824 | * period over which we average the RT time consumption, measured | ||
825 | * in ms. | ||
826 | * | ||
827 | * default: 1s | ||
828 | */ | ||
829 | const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; | ||
830 | |||
831 | /* | ||
864 | * period over which we measure -rt task cpu usage in us. | 832 | * period over which we measure -rt task cpu usage in us. |
865 | * default: 1s | 833 | * default: 1s |
866 | */ | 834 | */ |
@@ -1278,12 +1246,37 @@ void wake_up_idle_cpu(int cpu) | |||
1278 | } | 1246 | } |
1279 | #endif /* CONFIG_NO_HZ */ | 1247 | #endif /* CONFIG_NO_HZ */ |
1280 | 1248 | ||
1249 | static u64 sched_avg_period(void) | ||
1250 | { | ||
1251 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | ||
1252 | } | ||
1253 | |||
1254 | static void sched_avg_update(struct rq *rq) | ||
1255 | { | ||
1256 | s64 period = sched_avg_period(); | ||
1257 | |||
1258 | while ((s64)(rq->clock - rq->age_stamp) > period) { | ||
1259 | rq->age_stamp += period; | ||
1260 | rq->rt_avg /= 2; | ||
1261 | } | ||
1262 | } | ||
1263 | |||
1264 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1265 | { | ||
1266 | rq->rt_avg += rt_delta; | ||
1267 | sched_avg_update(rq); | ||
1268 | } | ||
1269 | |||
1281 | #else /* !CONFIG_SMP */ | 1270 | #else /* !CONFIG_SMP */ |
1282 | static void resched_task(struct task_struct *p) | 1271 | static void resched_task(struct task_struct *p) |
1283 | { | 1272 | { |
1284 | assert_spin_locked(&task_rq(p)->lock); | 1273 | assert_spin_locked(&task_rq(p)->lock); |
1285 | set_tsk_need_resched(p); | 1274 | set_tsk_need_resched(p); |
1286 | } | 1275 | } |
1276 | |||
1277 | static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) | ||
1278 | { | ||
1279 | } | ||
1287 | #endif /* CONFIG_SMP */ | 1280 | #endif /* CONFIG_SMP */ |
1288 | 1281 | ||
1289 | #if BITS_PER_LONG == 32 | 1282 | #if BITS_PER_LONG == 32 |
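The new rq->rt_avg / rq->age_stamp pair keeps a decaying measure of how much time RT tasks have consumed: sched_rt_avg_update() accumulates RT runtime and sched_avg_update() halves the sum once per half sysctl_sched_time_avg period, so old RT activity fades out geometrically. A minimal standalone model of that decay; the structure mirrors the patch, and NSEC_PER_MSEC/MSEC_PER_SEC are assumed to be the usual values:

/* Model of the rt_avg decay added above; illustrative, not kernel code. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC	1000000ULL
#define MSEC_PER_SEC	1000ULL

static const uint64_t sysctl_sched_time_avg = MSEC_PER_SEC;	/* 1s, as in the patch */
static uint64_t rt_avg, age_stamp;				/* per-rq in the kernel */

static uint64_t sched_avg_period(void)
{
	return sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
}

static void sched_rt_avg_update(uint64_t clock, uint64_t rt_delta)
{
	rt_avg += rt_delta;
	while ((int64_t)(clock - age_stamp) > (int64_t)sched_avg_period()) {
		age_stamp += sched_avg_period();
		rt_avg /= 2;				/* halve once per half-period */
	}
}

int main(void)
{
	uint64_t clock = 0;

	for (int tick = 1; tick <= 4; tick++) {
		clock += 500 * NSEC_PER_MSEC;			 /* 0.5s of wall time ... */
		sched_rt_avg_update(clock, 100 * NSEC_PER_MSEC); /* ... 100ms of it spent in RT */
		printf("t=%dms rt_avg=%llums\n", tick * 500,
		       (unsigned long long)(rt_avg / NSEC_PER_MSEC));
	}
	return 0;
}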
@@ -1494,8 +1487,65 @@ static int tg_nop(struct task_group *tg, void *data) | |||
1494 | #endif | 1487 | #endif |
1495 | 1488 | ||
1496 | #ifdef CONFIG_SMP | 1489 | #ifdef CONFIG_SMP |
1497 | static unsigned long source_load(int cpu, int type); | 1490 | /* Used instead of source_load when we know the type == 0 */ |
1498 | static unsigned long target_load(int cpu, int type); | 1491 | static unsigned long weighted_cpuload(const int cpu) |
1492 | { | ||
1493 | return cpu_rq(cpu)->load.weight; | ||
1494 | } | ||
1495 | |||
1496 | /* | ||
1497 | * Return a low guess at the load of a migration-source cpu weighted | ||
1498 | * according to the scheduling class and "nice" value. | ||
1499 | * | ||
1500 | * We want to under-estimate the load of migration sources, to | ||
1501 | * balance conservatively. | ||
1502 | */ | ||
1503 | static unsigned long source_load(int cpu, int type) | ||
1504 | { | ||
1505 | struct rq *rq = cpu_rq(cpu); | ||
1506 | unsigned long total = weighted_cpuload(cpu); | ||
1507 | |||
1508 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1509 | return total; | ||
1510 | |||
1511 | return min(rq->cpu_load[type-1], total); | ||
1512 | } | ||
1513 | |||
1514 | /* | ||
1515 | * Return a high guess at the load of a migration-target cpu weighted | ||
1516 | * according to the scheduling class and "nice" value. | ||
1517 | */ | ||
1518 | static unsigned long target_load(int cpu, int type) | ||
1519 | { | ||
1520 | struct rq *rq = cpu_rq(cpu); | ||
1521 | unsigned long total = weighted_cpuload(cpu); | ||
1522 | |||
1523 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
1524 | return total; | ||
1525 | |||
1526 | return max(rq->cpu_load[type-1], total); | ||
1527 | } | ||
1528 | |||
1529 | static struct sched_group *group_of(int cpu) | ||
1530 | { | ||
1531 | struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); | ||
1532 | |||
1533 | if (!sd) | ||
1534 | return NULL; | ||
1535 | |||
1536 | return sd->groups; | ||
1537 | } | ||
1538 | |||
1539 | static unsigned long power_of(int cpu) | ||
1540 | { | ||
1541 | struct sched_group *group = group_of(cpu); | ||
1542 | |||
1543 | if (!group) | ||
1544 | return SCHED_LOAD_SCALE; | ||
1545 | |||
1546 | return group->cpu_power; | ||
1547 | } | ||
1548 | |||
1499 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | 1549 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); |
1500 | 1550 | ||
1501 | static unsigned long cpu_avg_load_per_task(int cpu) | 1551 | static unsigned long cpu_avg_load_per_task(int cpu) |
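weighted_cpuload(), source_load() and target_load() move up the file unchanged, and two new helpers appear: group_of() fetches the cpu's top sched_domain under rcu_dereference() and returns its groups pointer, and power_of() reads that group's cpu_power, falling back to SCHED_LOAD_SCALE when no domain is attached. The load helpers deliberately bias their answers, taking the min of the cpu_load[] history and the instantaneous weight for a migration source and the max for a target, so balancing stays conservative. A toy illustration of that biasing, not kernel code:

/* Toy model of the source_load()/target_load() biasing; illustrative only. */
#include <stdio.h>

static unsigned long source_load(unsigned long hist, unsigned long now)
{
	return now < hist ? now : hist;		/* low guess: min(history, now) */
}

static unsigned long target_load(unsigned long hist, unsigned long now)
{
	return now > hist ? now : hist;		/* high guess: max(history, now) */
}

int main(void)
{
	unsigned long cpu_load_hist = 2048, weight_now = 1024;

	/* A migration source looks lighter, a target looks heavier, so the
	 * balancer only moves load when the gap is unambiguous. */
	printf("source=%lu target=%lu\n",
	       source_load(cpu_load_hist, weight_now),
	       target_load(cpu_load_hist, weight_now));
	return 0;
}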
@@ -1513,28 +1563,35 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1513 | 1563 | ||
1514 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1564 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1515 | 1565 | ||
1566 | struct update_shares_data { | ||
1567 | unsigned long rq_weight[NR_CPUS]; | ||
1568 | }; | ||
1569 | |||
1570 | static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); | ||
1571 | |||
1516 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1572 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1517 | 1573 | ||
1518 | /* | 1574 | /* |
1519 | * Calculate and set the cpu's group shares. | 1575 | * Calculate and set the cpu's group shares. |
1520 | */ | 1576 | */ |
1521 | static void | 1577 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
1522 | update_group_shares_cpu(struct task_group *tg, int cpu, | 1578 | unsigned long sd_shares, |
1523 | unsigned long sd_shares, unsigned long sd_rq_weight) | 1579 | unsigned long sd_rq_weight, |
1580 | struct update_shares_data *usd) | ||
1524 | { | 1581 | { |
1525 | unsigned long shares; | 1582 | unsigned long shares, rq_weight; |
1526 | unsigned long rq_weight; | 1583 | int boost = 0; |
1527 | |||
1528 | if (!tg->se[cpu]) | ||
1529 | return; | ||
1530 | 1584 | ||
1531 | rq_weight = tg->cfs_rq[cpu]->rq_weight; | 1585 | rq_weight = usd->rq_weight[cpu]; |
1586 | if (!rq_weight) { | ||
1587 | boost = 1; | ||
1588 | rq_weight = NICE_0_LOAD; | ||
1589 | } | ||
1532 | 1590 | ||
1533 | /* | 1591 | /* |
1534 | * \Sum shares * rq_weight | 1592 | * \Sum_j shares_j * rq_weight_i |
1535 | * shares = ----------------------- | 1593 | * shares_i = ----------------------------- |
1536 | * \Sum rq_weight | 1594 | * \Sum_j rq_weight_j |
1537 | * | ||
1538 | */ | 1595 | */ |
1539 | shares = (sd_shares * rq_weight) / sd_rq_weight; | 1596 | shares = (sd_shares * rq_weight) / sd_rq_weight; |
1540 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | 1597 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); |
@@ -1545,8 +1602,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1545 | unsigned long flags; | 1602 | unsigned long flags; |
1546 | 1603 | ||
1547 | spin_lock_irqsave(&rq->lock, flags); | 1604 | spin_lock_irqsave(&rq->lock, flags); |
1548 | tg->cfs_rq[cpu]->shares = shares; | 1605 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; |
1549 | 1606 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | |
1550 | __set_se_shares(tg->se[cpu], shares); | 1607 | __set_se_shares(tg->se[cpu], shares); |
1551 | spin_unlock_irqrestore(&rq->lock, flags); | 1608 | spin_unlock_irqrestore(&rq->lock, flags); |
1552 | } | 1609 | } |
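update_group_shares_cpu() now reads the per-cpu runnable weight from the update_shares_data snapshot instead of cfs_rq->rq_weight, substitutes NICE_0_LOAD when a cpu is idle (the boost case, later written back as zero weight and shares), and splits the group's shares proportionally: shares_i = sd_shares * rq_weight_i / \Sum_j rq_weight_j, clamped to [MIN_SHARES, MAX_SHARES]. A worked standalone model of that split; the clamp bounds used here are assumptions:

/* Worked model of the shares_i = sd_shares * w_i / sum(w_j) split; illustrative. */
#include <stdio.h>

#define NICE_0_LOAD	1024UL		/* assumed kernel value */
#define MIN_SHARES	2UL		/* assumed clamp bounds */
#define MAX_SHARES	(1UL << 18)

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
	unsigned long tg_shares = 1024;			/* group weight to split */
	unsigned long rq_weight[3] = { 2048, 1024, 0 };	/* per-cpu runnable weight */
	unsigned long sum = 0, w;
	int i;

	for (i = 0; i < 3; i++)				/* idle cpus count as NICE_0_LOAD */
		sum += rq_weight[i] ? rq_weight[i] : NICE_0_LOAD;

	for (i = 0; i < 3; i++) {
		w = rq_weight[i] ? rq_weight[i] : NICE_0_LOAD;
		printf("cpu%d shares=%lu%s\n", i,
		       clamp_ul(tg_shares * w / sum, MIN_SHARES, MAX_SHARES),
		       rq_weight[i] ? "" : " (boost: written back as 0)");
	}
	return 0;
}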
@@ -1559,22 +1616,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1559 | */ | 1616 | */ |
1560 | static int tg_shares_up(struct task_group *tg, void *data) | 1617 | static int tg_shares_up(struct task_group *tg, void *data) |
1561 | { | 1618 | { |
1562 | unsigned long weight, rq_weight = 0; | 1619 | unsigned long weight, rq_weight = 0, shares = 0; |
1563 | unsigned long shares = 0; | 1620 | struct update_shares_data *usd; |
1564 | struct sched_domain *sd = data; | 1621 | struct sched_domain *sd = data; |
1622 | unsigned long flags; | ||
1565 | int i; | 1623 | int i; |
1566 | 1624 | ||
1625 | if (!tg->se[0]) | ||
1626 | return 0; | ||
1627 | |||
1628 | local_irq_save(flags); | ||
1629 | usd = &__get_cpu_var(update_shares_data); | ||
1630 | |||
1567 | for_each_cpu(i, sched_domain_span(sd)) { | 1631 | for_each_cpu(i, sched_domain_span(sd)) { |
1632 | weight = tg->cfs_rq[i]->load.weight; | ||
1633 | usd->rq_weight[i] = weight; | ||
1634 | |||
1568 | /* | 1635 | /* |
1569 | * If there are currently no tasks on the cpu pretend there | 1636 | * If there are currently no tasks on the cpu pretend there |
1570 | * is one of average load so that when a new task gets to | 1637 | * is one of average load so that when a new task gets to |
1571 | * run here it will not get delayed by group starvation. | 1638 | * run here it will not get delayed by group starvation. |
1572 | */ | 1639 | */ |
1573 | weight = tg->cfs_rq[i]->load.weight; | ||
1574 | if (!weight) | 1640 | if (!weight) |
1575 | weight = NICE_0_LOAD; | 1641 | weight = NICE_0_LOAD; |
1576 | 1642 | ||
1577 | tg->cfs_rq[i]->rq_weight = weight; | ||
1578 | rq_weight += weight; | 1643 | rq_weight += weight; |
1579 | shares += tg->cfs_rq[i]->shares; | 1644 | shares += tg->cfs_rq[i]->shares; |
1580 | } | 1645 | } |
@@ -1586,7 +1651,9 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1586 | shares = tg->shares; | 1651 | shares = tg->shares; |
1587 | 1652 | ||
1588 | for_each_cpu(i, sched_domain_span(sd)) | 1653 | for_each_cpu(i, sched_domain_span(sd)) |
1589 | update_group_shares_cpu(tg, i, shares, rq_weight); | 1654 | update_group_shares_cpu(tg, i, shares, rq_weight, usd); |
1655 | |||
1656 | local_irq_restore(flags); | ||
1590 | 1657 | ||
1591 | return 0; | 1658 | return 0; |
1592 | } | 1659 | } |
@@ -1616,8 +1683,14 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1616 | 1683 | ||
1617 | static void update_shares(struct sched_domain *sd) | 1684 | static void update_shares(struct sched_domain *sd) |
1618 | { | 1685 | { |
1619 | u64 now = cpu_clock(raw_smp_processor_id()); | 1686 | s64 elapsed; |
1620 | s64 elapsed = now - sd->last_update; | 1687 | u64 now; |
1688 | |||
1689 | if (root_task_group_empty()) | ||
1690 | return; | ||
1691 | |||
1692 | now = cpu_clock(raw_smp_processor_id()); | ||
1693 | elapsed = now - sd->last_update; | ||
1621 | 1694 | ||
1622 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1695 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { |
1623 | sd->last_update = now; | 1696 | sd->last_update = now; |
@@ -1627,6 +1700,9 @@ static void update_shares(struct sched_domain *sd) | |||
1627 | 1700 | ||
1628 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | 1701 | static void update_shares_locked(struct rq *rq, struct sched_domain *sd) |
1629 | { | 1702 | { |
1703 | if (root_task_group_empty()) | ||
1704 | return; | ||
1705 | |||
1630 | spin_unlock(&rq->lock); | 1706 | spin_unlock(&rq->lock); |
1631 | update_shares(sd); | 1707 | update_shares(sd); |
1632 | spin_lock(&rq->lock); | 1708 | spin_lock(&rq->lock); |
@@ -1634,6 +1710,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1634 | 1710 | ||
1635 | static void update_h_load(long cpu) | 1711 | static void update_h_load(long cpu) |
1636 | { | 1712 | { |
1713 | if (root_task_group_empty()) | ||
1714 | return; | ||
1715 | |||
1637 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1716 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1638 | } | 1717 | } |
1639 | 1718 | ||
@@ -1651,6 +1730,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd) | |||
1651 | 1730 | ||
1652 | #ifdef CONFIG_PREEMPT | 1731 | #ifdef CONFIG_PREEMPT |
1653 | 1732 | ||
1733 | static void double_rq_lock(struct rq *rq1, struct rq *rq2); | ||
1734 | |||
1654 | /* | 1735 | /* |
1655 | * fair double_lock_balance: Safely acquires both rq->locks in a fair | 1736 | * fair double_lock_balance: Safely acquires both rq->locks in a fair |
1656 | * way at the expense of forcing extra atomic operations in all | 1737 | * way at the expense of forcing extra atomic operations in all |
@@ -1915,13 +1996,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1915 | } | 1996 | } |
1916 | 1997 | ||
1917 | #ifdef CONFIG_SMP | 1998 | #ifdef CONFIG_SMP |
1918 | |||
1919 | /* Used instead of source_load when we know the type == 0 */ | ||
1920 | static unsigned long weighted_cpuload(const int cpu) | ||
1921 | { | ||
1922 | return cpu_rq(cpu)->load.weight; | ||
1923 | } | ||
1924 | |||
1925 | /* | 1999 | /* |
1926 | * Is this task likely cache-hot: | 2000 | * Is this task likely cache-hot: |
1927 | */ | 2001 | */ |
@@ -1979,7 +2053,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1979 | if (task_hot(p, old_rq->clock, NULL)) | 2053 | if (task_hot(p, old_rq->clock, NULL)) |
1980 | schedstat_inc(p, se.nr_forced2_migrations); | 2054 | schedstat_inc(p, se.nr_forced2_migrations); |
1981 | #endif | 2055 | #endif |
1982 | perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, | 2056 | perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, |
1983 | 1, 1, NULL, 0); | 2057 | 1, 1, NULL, 0); |
1984 | } | 2058 | } |
1985 | p->se.vruntime -= old_cfsrq->min_vruntime - | 2059 | p->se.vruntime -= old_cfsrq->min_vruntime - |
@@ -2195,186 +2269,6 @@ void kick_process(struct task_struct *p) | |||
2195 | preempt_enable(); | 2269 | preempt_enable(); |
2196 | } | 2270 | } |
2197 | EXPORT_SYMBOL_GPL(kick_process); | 2271 | EXPORT_SYMBOL_GPL(kick_process); |
2198 | |||
2199 | /* | ||
2200 | * Return a low guess at the load of a migration-source cpu weighted | ||
2201 | * according to the scheduling class and "nice" value. | ||
2202 | * | ||
2203 | * We want to under-estimate the load of migration sources, to | ||
2204 | * balance conservatively. | ||
2205 | */ | ||
2206 | static unsigned long source_load(int cpu, int type) | ||
2207 | { | ||
2208 | struct rq *rq = cpu_rq(cpu); | ||
2209 | unsigned long total = weighted_cpuload(cpu); | ||
2210 | |||
2211 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2212 | return total; | ||
2213 | |||
2214 | return min(rq->cpu_load[type-1], total); | ||
2215 | } | ||
2216 | |||
2217 | /* | ||
2218 | * Return a high guess at the load of a migration-target cpu weighted | ||
2219 | * according to the scheduling class and "nice" value. | ||
2220 | */ | ||
2221 | static unsigned long target_load(int cpu, int type) | ||
2222 | { | ||
2223 | struct rq *rq = cpu_rq(cpu); | ||
2224 | unsigned long total = weighted_cpuload(cpu); | ||
2225 | |||
2226 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
2227 | return total; | ||
2228 | |||
2229 | return max(rq->cpu_load[type-1], total); | ||
2230 | } | ||
2231 | |||
2232 | /* | ||
2233 | * find_idlest_group finds and returns the least busy CPU group within the | ||
2234 | * domain. | ||
2235 | */ | ||
2236 | static struct sched_group * | ||
2237 | find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) | ||
2238 | { | ||
2239 | struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; | ||
2240 | unsigned long min_load = ULONG_MAX, this_load = 0; | ||
2241 | int load_idx = sd->forkexec_idx; | ||
2242 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | ||
2243 | |||
2244 | do { | ||
2245 | unsigned long load, avg_load; | ||
2246 | int local_group; | ||
2247 | int i; | ||
2248 | |||
2249 | /* Skip over this group if it has no CPUs allowed */ | ||
2250 | if (!cpumask_intersects(sched_group_cpus(group), | ||
2251 | &p->cpus_allowed)) | ||
2252 | continue; | ||
2253 | |||
2254 | local_group = cpumask_test_cpu(this_cpu, | ||
2255 | sched_group_cpus(group)); | ||
2256 | |||
2257 | /* Tally up the load of all CPUs in the group */ | ||
2258 | avg_load = 0; | ||
2259 | |||
2260 | for_each_cpu(i, sched_group_cpus(group)) { | ||
2261 | /* Bias balancing toward cpus of our domain */ | ||
2262 | if (local_group) | ||
2263 | load = source_load(i, load_idx); | ||
2264 | else | ||
2265 | load = target_load(i, load_idx); | ||
2266 | |||
2267 | avg_load += load; | ||
2268 | } | ||
2269 | |||
2270 | /* Adjust by relative CPU power of the group */ | ||
2271 | avg_load = sg_div_cpu_power(group, | ||
2272 | avg_load * SCHED_LOAD_SCALE); | ||
2273 | |||
2274 | if (local_group) { | ||
2275 | this_load = avg_load; | ||
2276 | this = group; | ||
2277 | } else if (avg_load < min_load) { | ||
2278 | min_load = avg_load; | ||
2279 | idlest = group; | ||
2280 | } | ||
2281 | } while (group = group->next, group != sd->groups); | ||
2282 | |||
2283 | if (!idlest || 100*this_load < imbalance*min_load) | ||
2284 | return NULL; | ||
2285 | return idlest; | ||
2286 | } | ||
2287 | |||
2288 | /* | ||
2289 | * find_idlest_cpu - find the idlest cpu among the cpus in group. | ||
2290 | */ | ||
2291 | static int | ||
2292 | find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | ||
2293 | { | ||
2294 | unsigned long load, min_load = ULONG_MAX; | ||
2295 | int idlest = -1; | ||
2296 | int i; | ||
2297 | |||
2298 | /* Traverse only the allowed CPUs */ | ||
2299 | for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { | ||
2300 | load = weighted_cpuload(i); | ||
2301 | |||
2302 | if (load < min_load || (load == min_load && i == this_cpu)) { | ||
2303 | min_load = load; | ||
2304 | idlest = i; | ||
2305 | } | ||
2306 | } | ||
2307 | |||
2308 | return idlest; | ||
2309 | } | ||
2310 | |||
2311 | /* | ||
2312 | * sched_balance_self: balance the current task (running on cpu) in domains | ||
2313 | * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and | ||
2314 | * SD_BALANCE_EXEC. | ||
2315 | * | ||
2316 | * Balance, ie. select the least loaded group. | ||
2317 | * | ||
2318 | * Returns the target CPU number, or the same CPU if no balancing is needed. | ||
2319 | * | ||
2320 | * preempt must be disabled. | ||
2321 | */ | ||
2322 | static int sched_balance_self(int cpu, int flag) | ||
2323 | { | ||
2324 | struct task_struct *t = current; | ||
2325 | struct sched_domain *tmp, *sd = NULL; | ||
2326 | |||
2327 | for_each_domain(cpu, tmp) { | ||
2328 | /* | ||
2329 | * If power savings logic is enabled for a domain, stop there. | ||
2330 | */ | ||
2331 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
2332 | break; | ||
2333 | if (tmp->flags & flag) | ||
2334 | sd = tmp; | ||
2335 | } | ||
2336 | |||
2337 | if (sd) | ||
2338 | update_shares(sd); | ||
2339 | |||
2340 | while (sd) { | ||
2341 | struct sched_group *group; | ||
2342 | int new_cpu, weight; | ||
2343 | |||
2344 | if (!(sd->flags & flag)) { | ||
2345 | sd = sd->child; | ||
2346 | continue; | ||
2347 | } | ||
2348 | |||
2349 | group = find_idlest_group(sd, t, cpu); | ||
2350 | if (!group) { | ||
2351 | sd = sd->child; | ||
2352 | continue; | ||
2353 | } | ||
2354 | |||
2355 | new_cpu = find_idlest_cpu(group, t, cpu); | ||
2356 | if (new_cpu == -1 || new_cpu == cpu) { | ||
2357 | /* Now try balancing at a lower domain level of cpu */ | ||
2358 | sd = sd->child; | ||
2359 | continue; | ||
2360 | } | ||
2361 | |||
2362 | /* Now try balancing at a lower domain level of new_cpu */ | ||
2363 | cpu = new_cpu; | ||
2364 | weight = cpumask_weight(sched_domain_span(sd)); | ||
2365 | sd = NULL; | ||
2366 | for_each_domain(cpu, tmp) { | ||
2367 | if (weight <= cpumask_weight(sched_domain_span(tmp))) | ||
2368 | break; | ||
2369 | if (tmp->flags & flag) | ||
2370 | sd = tmp; | ||
2371 | } | ||
2372 | /* while loop will break here if sd == NULL */ | ||
2373 | } | ||
2374 | |||
2375 | return cpu; | ||
2376 | } | ||
2377 | |||
2378 | #endif /* CONFIG_SMP */ | 2272 | #endif /* CONFIG_SMP */ |
2379 | 2273 | ||
2380 | /** | 2274 | /** |
@@ -2412,37 +2306,22 @@ void task_oncpu_function_call(struct task_struct *p, | |||
2412 | * | 2306 | * |
2413 | * returns failure only if the task is already active. | 2307 | * returns failure only if the task is already active. |
2414 | */ | 2308 | */ |
2415 | static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | 2309 | static int try_to_wake_up(struct task_struct *p, unsigned int state, |
2310 | int wake_flags) | ||
2416 | { | 2311 | { |
2417 | int cpu, orig_cpu, this_cpu, success = 0; | 2312 | int cpu, orig_cpu, this_cpu, success = 0; |
2418 | unsigned long flags; | 2313 | unsigned long flags; |
2419 | long old_state; | ||
2420 | struct rq *rq; | 2314 | struct rq *rq; |
2421 | 2315 | ||
2422 | if (!sched_feat(SYNC_WAKEUPS)) | 2316 | if (!sched_feat(SYNC_WAKEUPS)) |
2423 | sync = 0; | 2317 | wake_flags &= ~WF_SYNC; |
2424 | 2318 | ||
2425 | #ifdef CONFIG_SMP | 2319 | this_cpu = get_cpu(); |
2426 | if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { | ||
2427 | struct sched_domain *sd; | ||
2428 | |||
2429 | this_cpu = raw_smp_processor_id(); | ||
2430 | cpu = task_cpu(p); | ||
2431 | |||
2432 | for_each_domain(this_cpu, sd) { | ||
2433 | if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { | ||
2434 | update_shares(sd); | ||
2435 | break; | ||
2436 | } | ||
2437 | } | ||
2438 | } | ||
2439 | #endif | ||
2440 | 2320 | ||
2441 | smp_wmb(); | 2321 | smp_wmb(); |
2442 | rq = task_rq_lock(p, &flags); | 2322 | rq = task_rq_lock(p, &flags); |
2443 | update_rq_clock(rq); | 2323 | update_rq_clock(rq); |
2444 | old_state = p->state; | 2324 | if (!(p->state & state)) |
2445 | if (!(old_state & state)) | ||
2446 | goto out; | 2325 | goto out; |
2447 | 2326 | ||
2448 | if (p->se.on_rq) | 2327 | if (p->se.on_rq) |
@@ -2450,27 +2329,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2450 | 2329 | ||
2451 | cpu = task_cpu(p); | 2330 | cpu = task_cpu(p); |
2452 | orig_cpu = cpu; | 2331 | orig_cpu = cpu; |
2453 | this_cpu = smp_processor_id(); | ||
2454 | 2332 | ||
2455 | #ifdef CONFIG_SMP | 2333 | #ifdef CONFIG_SMP |
2456 | if (unlikely(task_running(rq, p))) | 2334 | if (unlikely(task_running(rq, p))) |
2457 | goto out_activate; | 2335 | goto out_activate; |
2458 | 2336 | ||
2459 | cpu = p->sched_class->select_task_rq(p, sync); | 2337 | /* |
2460 | if (cpu != orig_cpu) { | 2338 | * In order to handle concurrent wakeups and release the rq->lock |
2339 | * we put the task in TASK_WAKING state. | ||
2340 | * | ||
2341 | * First fix up the nr_uninterruptible count: | ||
2342 | */ | ||
2343 | if (task_contributes_to_load(p)) | ||
2344 | rq->nr_uninterruptible--; | ||
2345 | p->state = TASK_WAKING; | ||
2346 | task_rq_unlock(rq, &flags); | ||
2347 | |||
2348 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); | ||
2349 | if (cpu != orig_cpu) | ||
2461 | set_task_cpu(p, cpu); | 2350 | set_task_cpu(p, cpu); |
2462 | task_rq_unlock(rq, &flags); | ||
2463 | /* might preempt at this point */ | ||
2464 | rq = task_rq_lock(p, &flags); | ||
2465 | old_state = p->state; | ||
2466 | if (!(old_state & state)) | ||
2467 | goto out; | ||
2468 | if (p->se.on_rq) | ||
2469 | goto out_running; | ||
2470 | 2351 | ||
2471 | this_cpu = smp_processor_id(); | 2352 | rq = task_rq_lock(p, &flags); |
2472 | cpu = task_cpu(p); | 2353 | WARN_ON(p->state != TASK_WAKING); |
2473 | } | 2354 | cpu = task_cpu(p); |
2474 | 2355 | ||
2475 | #ifdef CONFIG_SCHEDSTATS | 2356 | #ifdef CONFIG_SCHEDSTATS |
2476 | schedstat_inc(rq, ttwu_count); | 2357 | schedstat_inc(rq, ttwu_count); |
@@ -2490,7 +2371,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
2490 | out_activate: | 2371 | out_activate: |
2491 | #endif /* CONFIG_SMP */ | 2372 | #endif /* CONFIG_SMP */ |
2492 | schedstat_inc(p, se.nr_wakeups); | 2373 | schedstat_inc(p, se.nr_wakeups); |
2493 | if (sync) | 2374 | if (wake_flags & WF_SYNC) |
2494 | schedstat_inc(p, se.nr_wakeups_sync); | 2375 | schedstat_inc(p, se.nr_wakeups_sync); |
2495 | if (orig_cpu != cpu) | 2376 | if (orig_cpu != cpu) |
2496 | schedstat_inc(p, se.nr_wakeups_migrate); | 2377 | schedstat_inc(p, se.nr_wakeups_migrate); |
@@ -2519,7 +2400,7 @@ out_activate: | |||
2519 | 2400 | ||
2520 | out_running: | 2401 | out_running: |
2521 | trace_sched_wakeup(rq, p, success); | 2402 | trace_sched_wakeup(rq, p, success); |
2522 | check_preempt_curr(rq, p, sync); | 2403 | check_preempt_curr(rq, p, wake_flags); |
2523 | 2404 | ||
2524 | p->state = TASK_RUNNING; | 2405 | p->state = TASK_RUNNING; |
2525 | #ifdef CONFIG_SMP | 2406 | #ifdef CONFIG_SMP |
@@ -2528,6 +2409,7 @@ out_running: | |||
2528 | #endif | 2409 | #endif |
2529 | out: | 2410 | out: |
2530 | task_rq_unlock(rq, &flags); | 2411 | task_rq_unlock(rq, &flags); |
2412 | put_cpu(); | ||
2531 | 2413 | ||
2532 | return success; | 2414 | return success; |
2533 | } | 2415 | } |
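try_to_wake_up() now takes a WF_* bitmask instead of the old int sync, and it parks the task in TASK_WAKING so rq->lock can be dropped around select_task_rq(); it fixes up nr_uninterruptible itself before releasing the lock and re-checks the state once the lock is re-taken. WF_SYNC carries the old synchronous-wakeup hint (cleared when the SYNC_WAKEUPS feature is off) and WF_FORK is passed by wake_up_new_task() to check_preempt_curr(). A small model of the flag handling; the numeric flag values are assumptions, only the names appear in this diff:

/* Sketch of the wake_flags bitmask replacing the old "int sync"; values assumed. */
#include <stdio.h>

#define WF_SYNC	0x01	/* waker intends to sleep right after the wakeup */
#define WF_FORK	0x02	/* wakeup of a freshly forked child */

static int sched_feat_sync_wakeups = 1;

static void wake(int wake_flags)
{
	if (!sched_feat_sync_wakeups)
		wake_flags &= ~WF_SYNC;		/* feature off: ignore the hint */

	printf("sync=%d fork=%d\n",
	       !!(wake_flags & WF_SYNC), !!(wake_flags & WF_FORK));
}

int main(void)
{
	wake(WF_SYNC);		/* e.g. a pipe writer waking its reader */
	wake(WF_FORK);		/* wake_up_new_task() preemption check */
	return 0;
}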
@@ -2570,6 +2452,7 @@ static void __sched_fork(struct task_struct *p) | |||
2570 | p->se.avg_overlap = 0; | 2452 | p->se.avg_overlap = 0; |
2571 | p->se.start_runtime = 0; | 2453 | p->se.start_runtime = 0; |
2572 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; | 2454 | p->se.avg_wakeup = sysctl_sched_wakeup_granularity; |
2455 | p->se.avg_running = 0; | ||
2573 | 2456 | ||
2574 | #ifdef CONFIG_SCHEDSTATS | 2457 | #ifdef CONFIG_SCHEDSTATS |
2575 | p->se.wait_start = 0; | 2458 | p->se.wait_start = 0; |
@@ -2631,18 +2514,41 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2631 | 2514 | ||
2632 | __sched_fork(p); | 2515 | __sched_fork(p); |
2633 | 2516 | ||
2634 | #ifdef CONFIG_SMP | ||
2635 | cpu = sched_balance_self(cpu, SD_BALANCE_FORK); | ||
2636 | #endif | ||
2637 | set_task_cpu(p, cpu); | ||
2638 | |||
2639 | /* | 2517 | /* |
2640 | * Make sure we do not leak PI boosting priority to the child: | 2518 | * Make sure we do not leak PI boosting priority to the child. |
2641 | */ | 2519 | */ |
2642 | p->prio = current->normal_prio; | 2520 | p->prio = current->normal_prio; |
2521 | |||
2522 | /* | ||
2523 | * Revert to default priority/policy on fork if requested. | ||
2524 | */ | ||
2525 | if (unlikely(p->sched_reset_on_fork)) { | ||
2526 | if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) | ||
2527 | p->policy = SCHED_NORMAL; | ||
2528 | |||
2529 | if (p->normal_prio < DEFAULT_PRIO) | ||
2530 | p->prio = DEFAULT_PRIO; | ||
2531 | |||
2532 | if (PRIO_TO_NICE(p->static_prio) < 0) { | ||
2533 | p->static_prio = NICE_TO_PRIO(0); | ||
2534 | set_load_weight(p); | ||
2535 | } | ||
2536 | |||
2537 | /* | ||
2538 | * We don't need the reset flag anymore after the fork. It has | ||
2539 | * fulfilled its duty: | ||
2540 | */ | ||
2541 | p->sched_reset_on_fork = 0; | ||
2542 | } | ||
2543 | |||
2643 | if (!rt_prio(p->prio)) | 2544 | if (!rt_prio(p->prio)) |
2644 | p->sched_class = &fair_sched_class; | 2545 | p->sched_class = &fair_sched_class; |
2645 | 2546 | ||
2547 | #ifdef CONFIG_SMP | ||
2548 | cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); | ||
2549 | #endif | ||
2550 | set_task_cpu(p, cpu); | ||
2551 | |||
2646 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) | 2552 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
2647 | if (likely(sched_info_on())) | 2553 | if (likely(sched_info_on())) |
2648 | memset(&p->sched_info, 0, sizeof(p->sched_info)); | 2554 | memset(&p->sched_info, 0, sizeof(p->sched_info)); |
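sched_fork() now honours sched_reset_on_fork: when a task sets the flag, its child drops any RT policy, any boosted priority and any negative nice before the scheduling class is chosen, and the flag is consumed in the process. CPU selection also moves from sched_balance_self() to the class's select_task_rq() with SD_BALANCE_FORK. A minimal standalone model of the reset step, assuming the usual priority constants (MAX_RT_PRIO 100, nice 0 = prio 120):

/* Model of the sched_reset_on_fork revert added above; constants assumed. */
#include <stdio.h>

#define MAX_RT_PRIO	100
#define DEFAULT_PRIO	(MAX_RT_PRIO + 20)		/* 120, nice 0 */
#define NICE_TO_PRIO(n)	(MAX_RT_PRIO + (n) + 20)
#define PRIO_TO_NICE(p)	((p) - MAX_RT_PRIO - 20)

struct task { int policy, prio, static_prio, reset_on_fork; };

enum { SCHED_NORMAL, SCHED_FIFO, SCHED_RR };

static void sched_fork_reset(struct task *p)
{
	if (!p->reset_on_fork)
		return;
	if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
		p->policy = SCHED_NORMAL;		/* no RT inheritance */
	if (p->prio < DEFAULT_PRIO)
		p->prio = DEFAULT_PRIO;			/* drop boosted priority */
	if (PRIO_TO_NICE(p->static_prio) < 0)
		p->static_prio = NICE_TO_PRIO(0);	/* drop negative nice */
	p->reset_on_fork = 0;				/* one-shot flag */
}

int main(void)
{
	struct task child = { SCHED_FIFO, 50, NICE_TO_PRIO(-10), 1 };

	sched_fork_reset(&child);
	printf("policy=%d prio=%d nice=%d\n",
	       child.policy, child.prio, PRIO_TO_NICE(child.static_prio));
	return 0;
}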
@@ -2688,7 +2594,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
2688 | inc_nr_running(rq); | 2594 | inc_nr_running(rq); |
2689 | } | 2595 | } |
2690 | trace_sched_wakeup_new(rq, p, 1); | 2596 | trace_sched_wakeup_new(rq, p, 1); |
2691 | check_preempt_curr(rq, p, 0); | 2597 | check_preempt_curr(rq, p, WF_FORK); |
2692 | #ifdef CONFIG_SMP | 2598 | #ifdef CONFIG_SMP |
2693 | if (p->sched_class->task_wake_up) | 2599 | if (p->sched_class->task_wake_up) |
2694 | p->sched_class->task_wake_up(rq, p); | 2600 | p->sched_class->task_wake_up(rq, p); |
@@ -2796,12 +2702,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2796 | { | 2702 | { |
2797 | struct mm_struct *mm = rq->prev_mm; | 2703 | struct mm_struct *mm = rq->prev_mm; |
2798 | long prev_state; | 2704 | long prev_state; |
2799 | #ifdef CONFIG_SMP | ||
2800 | int post_schedule = 0; | ||
2801 | |||
2802 | if (current->sched_class->needs_post_schedule) | ||
2803 | post_schedule = current->sched_class->needs_post_schedule(rq); | ||
2804 | #endif | ||
2805 | 2705 | ||
2806 | rq->prev_mm = NULL; | 2706 | rq->prev_mm = NULL; |
2807 | 2707 | ||
@@ -2818,12 +2718,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2818 | */ | 2718 | */ |
2819 | prev_state = prev->state; | 2719 | prev_state = prev->state; |
2820 | finish_arch_switch(prev); | 2720 | finish_arch_switch(prev); |
2821 | perf_counter_task_sched_in(current, cpu_of(rq)); | 2721 | perf_event_task_sched_in(current, cpu_of(rq)); |
2822 | finish_lock_switch(rq, prev); | 2722 | finish_lock_switch(rq, prev); |
2823 | #ifdef CONFIG_SMP | ||
2824 | if (post_schedule) | ||
2825 | current->sched_class->post_schedule(rq); | ||
2826 | #endif | ||
2827 | 2723 | ||
2828 | fire_sched_in_preempt_notifiers(current); | 2724 | fire_sched_in_preempt_notifiers(current); |
2829 | if (mm) | 2725 | if (mm) |
@@ -2838,6 +2734,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2838 | } | 2734 | } |
2839 | } | 2735 | } |
2840 | 2736 | ||
2737 | #ifdef CONFIG_SMP | ||
2738 | |||
2739 | /* assumes rq->lock is held */ | ||
2740 | static inline void pre_schedule(struct rq *rq, struct task_struct *prev) | ||
2741 | { | ||
2742 | if (prev->sched_class->pre_schedule) | ||
2743 | prev->sched_class->pre_schedule(rq, prev); | ||
2744 | } | ||
2745 | |||
2746 | /* rq->lock is NOT held, but preemption is disabled */ | ||
2747 | static inline void post_schedule(struct rq *rq) | ||
2748 | { | ||
2749 | if (rq->post_schedule) { | ||
2750 | unsigned long flags; | ||
2751 | |||
2752 | spin_lock_irqsave(&rq->lock, flags); | ||
2753 | if (rq->curr->sched_class->post_schedule) | ||
2754 | rq->curr->sched_class->post_schedule(rq); | ||
2755 | spin_unlock_irqrestore(&rq->lock, flags); | ||
2756 | |||
2757 | rq->post_schedule = 0; | ||
2758 | } | ||
2759 | } | ||
2760 | |||
2761 | #else | ||
2762 | |||
2763 | static inline void pre_schedule(struct rq *rq, struct task_struct *p) | ||
2764 | { | ||
2765 | } | ||
2766 | |||
2767 | static inline void post_schedule(struct rq *rq) | ||
2768 | { | ||
2769 | } | ||
2770 | |||
2771 | #endif | ||
2772 | |||
2841 | /** | 2773 | /** |
2842 | * schedule_tail - first thing a freshly forked thread must call. | 2774 | * schedule_tail - first thing a freshly forked thread must call. |
2843 | * @prev: the thread we just switched away from. | 2775 | * @prev: the thread we just switched away from. |
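The per-class needs_post_schedule() query, previously evaluated in finish_task_switch() while rq->lock was still held, is replaced by a plain rq->post_schedule flag: pre_schedule() still calls the class hook before the switch, and post_schedule() later re-takes the lock only if the flag was raised, runs the current class's post_schedule() (the flag is set by the scheduling class elsewhere, not shown in this hunk) and clears it. A minimal model of that latch, illustrative only:

/* Minimal model of the rq->post_schedule latch used above; illustrative. */
#include <stdio.h>

struct rq {
	int post_schedule;		/* raised by the sched class when it has deferred work */
	int lock;			/* stands in for rq->lock */
};

static void class_post_schedule(struct rq *rq)
{
	printf("deferred balancing work, lock=%d\n", rq->lock);
}

static void post_schedule(struct rq *rq)
{
	if (rq->post_schedule) {	/* cheap test, no lock taken when idle */
		rq->lock = 1;		/* spin_lock_irqsave(&rq->lock, flags) */
		class_post_schedule(rq);
		rq->lock = 0;		/* spin_unlock_irqrestore(...) */
		rq->post_schedule = 0;	/* one-shot */
	}
}

int main(void)
{
	struct rq rq = { .post_schedule = 1 };

	post_schedule(&rq);		/* does the work once */
	post_schedule(&rq);		/* now a no-op */
	return 0;
}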
@@ -2848,6 +2780,13 @@ asmlinkage void schedule_tail(struct task_struct *prev) | |||
2848 | struct rq *rq = this_rq(); | 2780 | struct rq *rq = this_rq(); |
2849 | 2781 | ||
2850 | finish_task_switch(rq, prev); | 2782 | finish_task_switch(rq, prev); |
2783 | |||
2784 | /* | ||
2785 | * FIXME: do we need to worry about rq being invalidated by the | ||
2786 | * task_switch? | ||
2787 | */ | ||
2788 | post_schedule(rq); | ||
2789 | |||
2851 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW | 2790 | #ifdef __ARCH_WANT_UNLOCKED_CTXSW |
2852 | /* In this case, finish_task_switch does not reenable preemption */ | 2791 | /* In this case, finish_task_switch does not reenable preemption */ |
2853 | preempt_enable(); | 2792 | preempt_enable(); |
@@ -2965,6 +2904,19 @@ unsigned long nr_iowait(void) | |||
2965 | return sum; | 2904 | return sum; |
2966 | } | 2905 | } |
2967 | 2906 | ||
2907 | unsigned long nr_iowait_cpu(void) | ||
2908 | { | ||
2909 | struct rq *this = this_rq(); | ||
2910 | return atomic_read(&this->nr_iowait); | ||
2911 | } | ||
2912 | |||
2913 | unsigned long this_cpu_load(void) | ||
2914 | { | ||
2915 | struct rq *this = this_rq(); | ||
2916 | return this->cpu_load[0]; | ||
2917 | } | ||
2918 | |||
2919 | |||
2968 | /* Variables and functions for calc_load */ | 2920 | /* Variables and functions for calc_load */ |
2969 | static atomic_long_t calc_load_tasks; | 2921 | static atomic_long_t calc_load_tasks; |
2970 | static unsigned long calc_load_update; | 2922 | static unsigned long calc_load_update; |
@@ -3164,7 +3116,7 @@ out: | |||
3164 | void sched_exec(void) | 3116 | void sched_exec(void) |
3165 | { | 3117 | { |
3166 | int new_cpu, this_cpu = get_cpu(); | 3118 | int new_cpu, this_cpu = get_cpu(); |
3167 | new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); | 3119 | new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0); |
3168 | put_cpu(); | 3120 | put_cpu(); |
3169 | if (new_cpu != this_cpu) | 3121 | if (new_cpu != this_cpu) |
3170 | sched_migrate_task(current, new_cpu); | 3122 | sched_migrate_task(current, new_cpu); |
@@ -3379,9 +3331,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
3379 | { | 3331 | { |
3380 | const struct sched_class *class; | 3332 | const struct sched_class *class; |
3381 | 3333 | ||
3382 | for (class = sched_class_highest; class; class = class->next) | 3334 | for_each_class(class) { |
3383 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) | 3335 | if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) |
3384 | return 1; | 3336 | return 1; |
3337 | } | ||
3385 | 3338 | ||
3386 | return 0; | 3339 | return 0; |
3387 | } | 3340 | } |
@@ -3544,7 +3497,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, | |||
3544 | * capacity but still has some space to pick up some load | 3497 | * capacity but still has some space to pick up some load |
3545 | * from other group and save more power | 3498 | * from other group and save more power |
3546 | */ | 3499 | */ |
3547 | if (sgs->sum_nr_running > sgs->group_capacity - 1) | 3500 | if (sgs->sum_nr_running + 1 > sgs->group_capacity) |
3548 | return; | 3501 | return; |
3549 | 3502 | ||
3550 | if (sgs->sum_nr_running > sds->leader_nr_running || | 3503 | if (sgs->sum_nr_running > sds->leader_nr_running || |
@@ -3583,11 +3536,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3583 | *imbalance = sds->min_load_per_task; | 3536 | *imbalance = sds->min_load_per_task; |
3584 | sds->busiest = sds->group_min; | 3537 | sds->busiest = sds->group_min; |
3585 | 3538 | ||
3586 | if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { | ||
3587 | cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = | ||
3588 | group_first_cpu(sds->group_leader); | ||
3589 | } | ||
3590 | |||
3591 | return 1; | 3539 | return 1; |
3592 | 3540 | ||
3593 | } | 3541 | } |
@@ -3612,6 +3560,102 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3612 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | 3560 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ |
3613 | 3561 | ||
3614 | 3562 | ||
3563 | unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3564 | { | ||
3565 | return SCHED_LOAD_SCALE; | ||
3566 | } | ||
3567 | |||
3568 | unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) | ||
3569 | { | ||
3570 | return default_scale_freq_power(sd, cpu); | ||
3571 | } | ||
3572 | |||
3573 | unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3574 | { | ||
3575 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3576 | unsigned long smt_gain = sd->smt_gain; | ||
3577 | |||
3578 | smt_gain /= weight; | ||
3579 | |||
3580 | return smt_gain; | ||
3581 | } | ||
3582 | |||
3583 | unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | ||
3584 | { | ||
3585 | return default_scale_smt_power(sd, cpu); | ||
3586 | } | ||
3587 | |||
3588 | unsigned long scale_rt_power(int cpu) | ||
3589 | { | ||
3590 | struct rq *rq = cpu_rq(cpu); | ||
3591 | u64 total, available; | ||
3592 | |||
3593 | sched_avg_update(rq); | ||
3594 | |||
3595 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | ||
3596 | available = total - rq->rt_avg; | ||
3597 | |||
3598 | if (unlikely((s64)total < SCHED_LOAD_SCALE)) | ||
3599 | total = SCHED_LOAD_SCALE; | ||
3600 | |||
3601 | total >>= SCHED_LOAD_SHIFT; | ||
3602 | |||
3603 | return div_u64(available, total); | ||
3604 | } | ||
3605 | |||
3606 | static void update_cpu_power(struct sched_domain *sd, int cpu) | ||
3607 | { | ||
3608 | unsigned long weight = cpumask_weight(sched_domain_span(sd)); | ||
3609 | unsigned long power = SCHED_LOAD_SCALE; | ||
3610 | struct sched_group *sdg = sd->groups; | ||
3611 | |||
3612 | if (sched_feat(ARCH_POWER)) | ||
3613 | power *= arch_scale_freq_power(sd, cpu); | ||
3614 | else | ||
3615 | power *= default_scale_freq_power(sd, cpu); | ||
3616 | |||
3617 | power >>= SCHED_LOAD_SHIFT; | ||
3618 | |||
3619 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { | ||
3620 | if (sched_feat(ARCH_POWER)) | ||
3621 | power *= arch_scale_smt_power(sd, cpu); | ||
3622 | else | ||
3623 | power *= default_scale_smt_power(sd, cpu); | ||
3624 | |||
3625 | power >>= SCHED_LOAD_SHIFT; | ||
3626 | } | ||
3627 | |||
3628 | power *= scale_rt_power(cpu); | ||
3629 | power >>= SCHED_LOAD_SHIFT; | ||
3630 | |||
3631 | if (!power) | ||
3632 | power = 1; | ||
3633 | |||
3634 | sdg->cpu_power = power; | ||
3635 | } | ||
3636 | |||
3637 | static void update_group_power(struct sched_domain *sd, int cpu) | ||
3638 | { | ||
3639 | struct sched_domain *child = sd->child; | ||
3640 | struct sched_group *group, *sdg = sd->groups; | ||
3641 | unsigned long power; | ||
3642 | |||
3643 | if (!child) { | ||
3644 | update_cpu_power(sd, cpu); | ||
3645 | return; | ||
3646 | } | ||
3647 | |||
3648 | power = 0; | ||
3649 | |||
3650 | group = child->groups; | ||
3651 | do { | ||
3652 | power += group->cpu_power; | ||
3653 | group = group->next; | ||
3654 | } while (group != child->groups); | ||
3655 | |||
3656 | sdg->cpu_power = power; | ||
3657 | } | ||
3658 | |||
3615 | /** | 3659 | /** |
3616 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3660 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
3617 | * @group: sched_group whose statistics are to be updated. | 3661 | * @group: sched_group whose statistics are to be updated. |
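cpu_power replaces __cpu_power as a plain, regularly recomputed field. update_cpu_power() starts from SCHED_LOAD_SCALE and applies three fixed-point factors, shifting right by SCHED_LOAD_SHIFT after each: an arch frequency/thermal hook (arch_scale_freq_power or its default), an SMT factor that spreads sd->smt_gain over the siblings when SD_SHARE_CPUPOWER, and scale_rt_power(), the fraction of the recent averaging window not consumed by RT tasks according to rq->rt_avg; update_group_power() then simply sums the children. A standalone fixed-point sketch of that chain (SCHED_LOAD_SCALE assumed to be 1024 and the smt_gain value assumed; illustrative only):

/* Fixed-point model of update_cpu_power(): freq * smt * rt fractions of 1024. */
#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;
	unsigned long freq_scale = SCHED_LOAD_SCALE;	/* arch hook: full speed */
	unsigned long smt_gain = 1178, smt_weight = 2;	/* two hyperthreads share a core */
	unsigned long rt_free = 820;			/* scale_rt_power(): ~80% left for CFS */

	power = power * freq_scale >> SCHED_LOAD_SHIFT;			/* 1024 */
	power = power * (smt_gain / smt_weight) >> SCHED_LOAD_SHIFT;	/* 589 per thread */
	power = power * rt_free >> SCHED_LOAD_SHIFT;			/* 471 after RT pressure */
	if (!power)
		power = 1;					/* never report zero power */

	printf("cpu_power = %lu (of %lu)\n", power, SCHED_LOAD_SCALE);
	return 0;
}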
@@ -3624,7 +3668,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, | |||
3624 | * @balance: Should we balance. | 3668 | * @balance: Should we balance. |
3625 | * @sgs: variable to hold the statistics for this group. | 3669 | * @sgs: variable to hold the statistics for this group. |
3626 | */ | 3670 | */ |
3627 | static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | 3671 | static inline void update_sg_lb_stats(struct sched_domain *sd, |
3672 | struct sched_group *group, int this_cpu, | ||
3628 | enum cpu_idle_type idle, int load_idx, int *sd_idle, | 3673 | enum cpu_idle_type idle, int load_idx, int *sd_idle, |
3629 | int local_group, const struct cpumask *cpus, | 3674 | int local_group, const struct cpumask *cpus, |
3630 | int *balance, struct sg_lb_stats *sgs) | 3675 | int *balance, struct sg_lb_stats *sgs) |
@@ -3635,8 +3680,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3635 | unsigned long sum_avg_load_per_task; | 3680 | unsigned long sum_avg_load_per_task; |
3636 | unsigned long avg_load_per_task; | 3681 | unsigned long avg_load_per_task; |
3637 | 3682 | ||
3638 | if (local_group) | 3683 | if (local_group) { |
3639 | balance_cpu = group_first_cpu(group); | 3684 | balance_cpu = group_first_cpu(group); |
3685 | if (balance_cpu == this_cpu) | ||
3686 | update_group_power(sd, this_cpu); | ||
3687 | } | ||
3640 | 3688 | ||
3641 | /* Tally up the load of all CPUs in the group */ | 3689 | /* Tally up the load of all CPUs in the group */ |
3642 | sum_avg_load_per_task = avg_load_per_task = 0; | 3690 | sum_avg_load_per_task = avg_load_per_task = 0; |
@@ -3685,8 +3733,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3685 | } | 3733 | } |
3686 | 3734 | ||
3687 | /* Adjust by relative CPU power of the group */ | 3735 | /* Adjust by relative CPU power of the group */ |
3688 | sgs->avg_load = sg_div_cpu_power(group, | 3736 | sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; |
3689 | sgs->group_load * SCHED_LOAD_SCALE); | ||
3690 | 3737 | ||
3691 | 3738 | ||
3692 | /* | 3739 | /* |
@@ -3698,14 +3745,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, | |||
3698 | * normalized nr_running number somewhere that negates | 3745 | * normalized nr_running number somewhere that negates |
3699 | * the hierarchy? | 3746 | * the hierarchy? |
3700 | */ | 3747 | */ |
3701 | avg_load_per_task = sg_div_cpu_power(group, | 3748 | avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / |
3702 | sum_avg_load_per_task * SCHED_LOAD_SCALE); | 3749 | group->cpu_power; |
3703 | 3750 | ||
3704 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) | 3751 | if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) |
3705 | sgs->group_imb = 1; | 3752 | sgs->group_imb = 1; |
3706 | 3753 | ||
3707 | sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; | 3754 | sgs->group_capacity = |
3708 | 3755 | DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); | |
3709 | } | 3756 | } |
3710 | 3757 | ||
3711 | /** | 3758 | /** |
@@ -3723,9 +3770,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3723 | const struct cpumask *cpus, int *balance, | 3770 | const struct cpumask *cpus, int *balance, |
3724 | struct sd_lb_stats *sds) | 3771 | struct sd_lb_stats *sds) |
3725 | { | 3772 | { |
3773 | struct sched_domain *child = sd->child; | ||
3726 | struct sched_group *group = sd->groups; | 3774 | struct sched_group *group = sd->groups; |
3727 | struct sg_lb_stats sgs; | 3775 | struct sg_lb_stats sgs; |
3728 | int load_idx; | 3776 | int load_idx, prefer_sibling = 0; |
3777 | |||
3778 | if (child && child->flags & SD_PREFER_SIBLING) | ||
3779 | prefer_sibling = 1; | ||
3729 | 3780 | ||
3730 | init_sd_power_savings_stats(sd, sds, idle); | 3781 | init_sd_power_savings_stats(sd, sds, idle); |
3731 | load_idx = get_sd_load_idx(sd, idle); | 3782 | load_idx = get_sd_load_idx(sd, idle); |
@@ -3736,14 +3787,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3736 | local_group = cpumask_test_cpu(this_cpu, | 3787 | local_group = cpumask_test_cpu(this_cpu, |
3737 | sched_group_cpus(group)); | 3788 | sched_group_cpus(group)); |
3738 | memset(&sgs, 0, sizeof(sgs)); | 3789 | memset(&sgs, 0, sizeof(sgs)); |
3739 | update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, | 3790 | update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, |
3740 | local_group, cpus, balance, &sgs); | 3791 | local_group, cpus, balance, &sgs); |
3741 | 3792 | ||
3742 | if (local_group && balance && !(*balance)) | 3793 | if (local_group && balance && !(*balance)) |
3743 | return; | 3794 | return; |
3744 | 3795 | ||
3745 | sds->total_load += sgs.group_load; | 3796 | sds->total_load += sgs.group_load; |
3746 | sds->total_pwr += group->__cpu_power; | 3797 | sds->total_pwr += group->cpu_power; |
3798 | |||
3799 | /* | ||
3800 | * In case the child domain prefers tasks go to siblings | ||
3801 | * first, lower the group capacity to one so that we'll try | ||
3802 | * and move all the excess tasks away. | ||
3803 | */ | ||
3804 | if (prefer_sibling) | ||
3805 | sgs.group_capacity = min(sgs.group_capacity, 1UL); | ||
3747 | 3806 | ||
3748 | if (local_group) { | 3807 | if (local_group) { |
3749 | sds->this_load = sgs.avg_load; | 3808 | sds->this_load = sgs.avg_load; |
@@ -3763,7 +3822,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, | |||
3763 | update_sd_power_savings_stats(group, sds, local_group, &sgs); | 3822 | update_sd_power_savings_stats(group, sds, local_group, &sgs); |
3764 | group = group->next; | 3823 | group = group->next; |
3765 | } while (group != sd->groups); | 3824 | } while (group != sd->groups); |
3766 | |||
3767 | } | 3825 | } |
3768 | 3826 | ||
3769 | /** | 3827 | /** |
@@ -3801,28 +3859,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, | |||
3801 | * moving them. | 3859 | * moving them. |
3802 | */ | 3860 | */ |
3803 | 3861 | ||
3804 | pwr_now += sds->busiest->__cpu_power * | 3862 | pwr_now += sds->busiest->cpu_power * |
3805 | min(sds->busiest_load_per_task, sds->max_load); | 3863 | min(sds->busiest_load_per_task, sds->max_load); |
3806 | pwr_now += sds->this->__cpu_power * | 3864 | pwr_now += sds->this->cpu_power * |
3807 | min(sds->this_load_per_task, sds->this_load); | 3865 | min(sds->this_load_per_task, sds->this_load); |
3808 | pwr_now /= SCHED_LOAD_SCALE; | 3866 | pwr_now /= SCHED_LOAD_SCALE; |
3809 | 3867 | ||
3810 | /* Amount of load we'd subtract */ | 3868 | /* Amount of load we'd subtract */ |
3811 | tmp = sg_div_cpu_power(sds->busiest, | 3869 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
3812 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3870 | sds->busiest->cpu_power; |
3813 | if (sds->max_load > tmp) | 3871 | if (sds->max_load > tmp) |
3814 | pwr_move += sds->busiest->__cpu_power * | 3872 | pwr_move += sds->busiest->cpu_power * |
3815 | min(sds->busiest_load_per_task, sds->max_load - tmp); | 3873 | min(sds->busiest_load_per_task, sds->max_load - tmp); |
3816 | 3874 | ||
3817 | /* Amount of load we'd add */ | 3875 | /* Amount of load we'd add */ |
3818 | if (sds->max_load * sds->busiest->__cpu_power < | 3876 | if (sds->max_load * sds->busiest->cpu_power < |
3819 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) | 3877 | sds->busiest_load_per_task * SCHED_LOAD_SCALE) |
3820 | tmp = sg_div_cpu_power(sds->this, | 3878 | tmp = (sds->max_load * sds->busiest->cpu_power) / |
3821 | sds->max_load * sds->busiest->__cpu_power); | 3879 | sds->this->cpu_power; |
3822 | else | 3880 | else |
3823 | tmp = sg_div_cpu_power(sds->this, | 3881 | tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / |
3824 | sds->busiest_load_per_task * SCHED_LOAD_SCALE); | 3882 | sds->this->cpu_power; |
3825 | pwr_move += sds->this->__cpu_power * | 3883 | pwr_move += sds->this->cpu_power * |
3826 | min(sds->this_load_per_task, sds->this_load + tmp); | 3884 | min(sds->this_load_per_task, sds->this_load + tmp); |
3827 | pwr_move /= SCHED_LOAD_SCALE; | 3885 | pwr_move /= SCHED_LOAD_SCALE; |
3828 | 3886 | ||
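With sg_div_cpu_power() gone, fix_small_imbalance() open-codes the unit conversion as load * SCHED_LOAD_SCALE / cpu_power, so a per-task load is expressed in the same power-normalized units as the group averages before pwr_now and pwr_move are compared. A small numeric sketch of that conversion (SCHED_LOAD_SCALE assumed to be 1024):

/* Unit-conversion sketch for the fix_small_imbalance() arithmetic above; illustrative. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL	/* assumed */

int main(void)
{
	unsigned long busiest_power = 589;	/* e.g. one SMT sibling */
	unsigned long this_power = 1024;	/* a full core */
	unsigned long task_load = 1024;		/* one nice-0 task */

	/* The same task "weighs" more on a weak cpu than on a strong one. */
	printf("on busiest: %lu, on this: %lu (SCHED_LOAD_SCALE units)\n",
	       task_load * SCHED_LOAD_SCALE / busiest_power,
	       task_load * SCHED_LOAD_SCALE / this_power);
	return 0;
}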
@@ -3857,8 +3915,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, | |||
3857 | sds->max_load - sds->busiest_load_per_task); | 3915 | sds->max_load - sds->busiest_load_per_task); |
3858 | 3916 | ||
3859 | /* How much load to actually move to equalise the imbalance */ | 3917 | /* How much load to actually move to equalise the imbalance */ |
3860 | *imbalance = min(max_pull * sds->busiest->__cpu_power, | 3918 | *imbalance = min(max_pull * sds->busiest->cpu_power, |
3861 | (sds->avg_load - sds->this_load) * sds->this->__cpu_power) | 3919 | (sds->avg_load - sds->this_load) * sds->this->cpu_power) |
3862 | / SCHED_LOAD_SCALE; | 3920 | / SCHED_LOAD_SCALE; |
3863 | 3921 | ||
3864 | /* | 3922 | /* |
@@ -3988,15 +4046,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, | |||
3988 | int i; | 4046 | int i; |
3989 | 4047 | ||
3990 | for_each_cpu(i, sched_group_cpus(group)) { | 4048 | for_each_cpu(i, sched_group_cpus(group)) { |
4049 | unsigned long power = power_of(i); | ||
4050 | unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); | ||
3991 | unsigned long wl; | 4051 | unsigned long wl; |
3992 | 4052 | ||
3993 | if (!cpumask_test_cpu(i, cpus)) | 4053 | if (!cpumask_test_cpu(i, cpus)) |
3994 | continue; | 4054 | continue; |
3995 | 4055 | ||
3996 | rq = cpu_rq(i); | 4056 | rq = cpu_rq(i); |
3997 | wl = weighted_cpuload(i); | 4057 | wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; |
4058 | wl /= power; | ||
3998 | 4059 | ||
3999 | if (rq->nr_running == 1 && wl > imbalance) | 4060 | if (capacity && rq->nr_running == 1 && wl > imbalance) |
4000 | continue; | 4061 | continue; |
4001 | 4062 | ||
4002 | if (wl > max_load) { | 4063 | if (wl > max_load) { |
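
find_busiest_queue() now normalizes each runqueue's weighted load by that CPU's power, so an under-powered CPU looks proportionally busier, and it only skips single-task runqueues when the CPU still has at least one full unit of capacity. A standalone model of that normalization; power_of() and weighted_cpuload() are stand-ins fed with made-up numbers:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

/* Round-to-nearest division, as DIV_ROUND_CLOSEST() does in the kernel. */
static unsigned long div_round_closest(unsigned long x, unsigned long d)
{
	return (x + d / 2) / d;
}

int main(void)
{
	/* Two CPUs with the same raw load, but the second has half the power
	 * (e.g. capacity eaten by RT tasks or hardware throttling). */
	unsigned long load[2]  = { 2048, 2048 };   /* weighted_cpuload() stand-in */
	unsigned long power[2] = { 1024, 512 };    /* power_of() stand-in         */

	for (int i = 0; i < 2; i++) {
		unsigned long capacity = div_round_closest(power[i], SCHED_LOAD_SCALE);
		unsigned long wl = load[i] * SCHED_LOAD_SCALE / power[i];

		printf("cpu%d: capacity=%lu normalized_load=%lu\n", i, capacity, wl);
	}
	return 0;	/* cpu0: capacity 1, load 2048; cpu1: capacity 1, load 4096 */
}
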
@@ -5031,17 +5092,16 @@ void account_idle_time(cputime_t cputime) | |||
5031 | */ | 5092 | */ |
5032 | void account_process_tick(struct task_struct *p, int user_tick) | 5093 | void account_process_tick(struct task_struct *p, int user_tick) |
5033 | { | 5094 | { |
5034 | cputime_t one_jiffy = jiffies_to_cputime(1); | 5095 | cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); |
5035 | cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); | ||
5036 | struct rq *rq = this_rq(); | 5096 | struct rq *rq = this_rq(); |
5037 | 5097 | ||
5038 | if (user_tick) | 5098 | if (user_tick) |
5039 | account_user_time(p, one_jiffy, one_jiffy_scaled); | 5099 | account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); |
5040 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 5100 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
5041 | account_system_time(p, HARDIRQ_OFFSET, one_jiffy, | 5101 | account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, |
5042 | one_jiffy_scaled); | 5102 | one_jiffy_scaled); |
5043 | else | 5103 | else |
5044 | account_idle_time(one_jiffy); | 5104 | account_idle_time(cputime_one_jiffy); |
5045 | } | 5105 | } |
5046 | 5106 | ||
5047 | /* | 5107 | /* |
@@ -5145,7 +5205,7 @@ void scheduler_tick(void) | |||
5145 | curr->sched_class->task_tick(rq, curr, 0); | 5205 | curr->sched_class->task_tick(rq, curr, 0); |
5146 | spin_unlock(&rq->lock); | 5206 | spin_unlock(&rq->lock); |
5147 | 5207 | ||
5148 | perf_counter_task_tick(curr, cpu); | 5208 | perf_event_task_tick(curr, cpu); |
5149 | 5209 | ||
5150 | #ifdef CONFIG_SMP | 5210 | #ifdef CONFIG_SMP |
5151 | rq->idle_at_tick = idle_cpu(cpu); | 5211 | rq->idle_at_tick = idle_cpu(cpu); |
@@ -5257,14 +5317,13 @@ static inline void schedule_debug(struct task_struct *prev) | |||
5257 | #endif | 5317 | #endif |
5258 | } | 5318 | } |
5259 | 5319 | ||
5260 | static void put_prev_task(struct rq *rq, struct task_struct *prev) | 5320 | static void put_prev_task(struct rq *rq, struct task_struct *p) |
5261 | { | 5321 | { |
5262 | if (prev->state == TASK_RUNNING) { | 5322 | u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; |
5263 | u64 runtime = prev->se.sum_exec_runtime; | ||
5264 | 5323 | ||
5265 | runtime -= prev->se.prev_sum_exec_runtime; | 5324 | update_avg(&p->se.avg_running, runtime); |
5266 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); | ||
5267 | 5325 | ||
5326 | if (p->state == TASK_RUNNING) { | ||
5268 | /* | 5327 | /* |
5269 | * In order to avoid avg_overlap growing stale when we are | 5328 | * In order to avoid avg_overlap growing stale when we are |
5270 | * indeed overlapping and hence not getting put to sleep, grow | 5329 | * indeed overlapping and hence not getting put to sleep, grow |
@@ -5274,9 +5333,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
5274 | * correlates to the amount of cache footprint a task can | 5333 | * correlates to the amount of cache footprint a task can |
5275 | * build up. | 5334 | * build up. |
5276 | */ | 5335 | */ |
5277 | update_avg(&prev->se.avg_overlap, runtime); | 5336 | runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); |
5337 | update_avg(&p->se.avg_overlap, runtime); | ||
5338 | } else { | ||
5339 | update_avg(&p->se.avg_running, 0); | ||
5278 | } | 5340 | } |
5279 | prev->sched_class->put_prev_task(rq, prev); | 5341 | p->sched_class->put_prev_task(rq, p); |
5280 | } | 5342 | } |
5281 | 5343 | ||
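
put_prev_task() now feeds every descheduling of a still-runnable task into a new se.avg_running average (and decays it toward zero when the task blocks), while the avg_overlap update keeps its clamp at twice the migration cost. update_avg() elsewhere in sched.c is, to the best of my reading, a simple moving average that steps one eighth of the way toward each sample; the standalone model below reproduces that shape with invented slice lengths:

#include <stdio.h>

typedef unsigned long long u64;
typedef long long s64;

/* Assumed to mirror update_avg() in sched.c: step 1/8 toward the sample.
 * Relies on arithmetic right shift for negative diffs, as the kernel does. */
static void update_avg(u64 *avg, u64 sample)
{
	s64 diff = (s64)(sample - *avg);

	*avg += diff >> 3;
}

int main(void)
{
	u64 avg_running = 0;
	/* Invented slice lengths, in nanoseconds, for three reschedules. */
	u64 samples[] = { 4000000, 4000000, 1000000 };

	for (int i = 0; i < 3; i++) {
		update_avg(&avg_running, samples[i]);
		printf("after %lluns slice: avg_running=%llu\n",
		       samples[i], avg_running);
	}
	/* A task that blocks gets update_avg(&p->se.avg_running, 0) instead. */
	update_avg(&avg_running, 0);
	printf("after blocking: avg_running=%llu\n", avg_running);
	return 0;
}
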
5282 | /* | 5344 | /* |
@@ -5325,7 +5387,7 @@ need_resched: | |||
5325 | preempt_disable(); | 5387 | preempt_disable(); |
5326 | cpu = smp_processor_id(); | 5388 | cpu = smp_processor_id(); |
5327 | rq = cpu_rq(cpu); | 5389 | rq = cpu_rq(cpu); |
5328 | rcu_qsctr_inc(cpu); | 5390 | rcu_sched_qs(cpu); |
5329 | prev = rq->curr; | 5391 | prev = rq->curr; |
5330 | switch_count = &prev->nivcsw; | 5392 | switch_count = &prev->nivcsw; |
5331 | 5393 | ||
@@ -5349,10 +5411,7 @@ need_resched_nonpreemptible: | |||
5349 | switch_count = &prev->nvcsw; | 5411 | switch_count = &prev->nvcsw; |
5350 | } | 5412 | } |
5351 | 5413 | ||
5352 | #ifdef CONFIG_SMP | 5414 | pre_schedule(rq, prev); |
5353 | if (prev->sched_class->pre_schedule) | ||
5354 | prev->sched_class->pre_schedule(rq, prev); | ||
5355 | #endif | ||
5356 | 5415 | ||
5357 | if (unlikely(!rq->nr_running)) | 5416 | if (unlikely(!rq->nr_running)) |
5358 | idle_balance(cpu, rq); | 5417 | idle_balance(cpu, rq); |
@@ -5362,7 +5421,7 @@ need_resched_nonpreemptible: | |||
5362 | 5421 | ||
5363 | if (likely(prev != next)) { | 5422 | if (likely(prev != next)) { |
5364 | sched_info_switch(prev, next); | 5423 | sched_info_switch(prev, next); |
5365 | perf_counter_task_sched_out(prev, next, cpu); | 5424 | perf_event_task_sched_out(prev, next, cpu); |
5366 | 5425 | ||
5367 | rq->nr_switches++; | 5426 | rq->nr_switches++; |
5368 | rq->curr = next; | 5427 | rq->curr = next; |
@@ -5378,6 +5437,8 @@ need_resched_nonpreemptible: | |||
5378 | } else | 5437 | } else |
5379 | spin_unlock_irq(&rq->lock); | 5438 | spin_unlock_irq(&rq->lock); |
5380 | 5439 | ||
5440 | post_schedule(rq); | ||
5441 | |||
5381 | if (unlikely(reacquire_kernel_lock(current) < 0)) | 5442 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
5382 | goto need_resched_nonpreemptible; | 5443 | goto need_resched_nonpreemptible; |
5383 | 5444 | ||
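
schedule() no longer open-codes the SMP-only pre_schedule class hook; this patch series introduces pre_schedule()/post_schedule() wrappers (defined earlier in sched.c, outside this excerpt) plus an rq->post_schedule flag, so balancing work such as RT push/pull can run after the context switch with the runqueue lock already dropped. A hedged user-space model of the pattern, not the kernel wrappers themselves:

#include <stdio.h>

/* Stand-in for struct sched_class: optional hooks around a context switch. */
struct sched_class_model {
	void (*pre_schedule)(void);	/* e.g. rt: pull tasks before pick_next */
	void (*post_schedule)(void);	/* e.g. rt: push tasks after the switch */
};

struct rq_model {
	int post_schedule;		/* set by the class while the lock is held */
	const struct sched_class_model *curr_class;
};

static void rt_pre(void)  { puts("pre_schedule: pull RT tasks"); }
static void rt_post(void) { puts("post_schedule: push RT tasks"); }

static const struct sched_class_model rt_class = { rt_pre, rt_post };

static void pre_schedule(struct rq_model *rq)
{
	if (rq->curr_class->pre_schedule)
		rq->curr_class->pre_schedule();
}

static void post_schedule(struct rq_model *rq)
{
	/* Only do the (expensive) work when someone flagged a need for it. */
	if (rq->post_schedule) {
		if (rq->curr_class->post_schedule)
			rq->curr_class->post_schedule();
		rq->post_schedule = 0;
	}
}

int main(void)
{
	struct rq_model rq = { .post_schedule = 1, .curr_class = &rt_class };

	pre_schedule(&rq);		/* in the kernel: with rq->lock held */
	puts("context_switch()");
	post_schedule(&rq);		/* in the kernel: after dropping rq->lock */
	return 0;
}
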
@@ -5509,10 +5570,10 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
5509 | 5570 | ||
5510 | #endif /* CONFIG_PREEMPT */ | 5571 | #endif /* CONFIG_PREEMPT */ |
5511 | 5572 | ||
5512 | int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | 5573 | int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, |
5513 | void *key) | 5574 | void *key) |
5514 | { | 5575 | { |
5515 | return try_to_wake_up(curr->private, mode, sync); | 5576 | return try_to_wake_up(curr->private, mode, wake_flags); |
5516 | } | 5577 | } |
5517 | EXPORT_SYMBOL(default_wake_function); | 5578 | EXPORT_SYMBOL(default_wake_function); |
5518 | 5579 | ||
@@ -5526,14 +5587,14 @@ EXPORT_SYMBOL(default_wake_function); | |||
5526 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 5587 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
5527 | */ | 5588 | */ |
5528 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 5589 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
5529 | int nr_exclusive, int sync, void *key) | 5590 | int nr_exclusive, int wake_flags, void *key) |
5530 | { | 5591 | { |
5531 | wait_queue_t *curr, *next; | 5592 | wait_queue_t *curr, *next; |
5532 | 5593 | ||
5533 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { | 5594 | list_for_each_entry_safe(curr, next, &q->task_list, task_list) { |
5534 | unsigned flags = curr->flags; | 5595 | unsigned flags = curr->flags; |
5535 | 5596 | ||
5536 | if (curr->func(curr, mode, sync, key) && | 5597 | if (curr->func(curr, mode, wake_flags, key) && |
5537 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | 5598 | (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) |
5538 | break; | 5599 | break; |
5539 | } | 5600 | } |
@@ -5594,16 +5655,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, | |||
5594 | int nr_exclusive, void *key) | 5655 | int nr_exclusive, void *key) |
5595 | { | 5656 | { |
5596 | unsigned long flags; | 5657 | unsigned long flags; |
5597 | int sync = 1; | 5658 | int wake_flags = WF_SYNC; |
5598 | 5659 | ||
5599 | if (unlikely(!q)) | 5660 | if (unlikely(!q)) |
5600 | return; | 5661 | return; |
5601 | 5662 | ||
5602 | if (unlikely(!nr_exclusive)) | 5663 | if (unlikely(!nr_exclusive)) |
5603 | sync = 0; | 5664 | wake_flags = 0; |
5604 | 5665 | ||
5605 | spin_lock_irqsave(&q->lock, flags); | 5666 | spin_lock_irqsave(&q->lock, flags); |
5606 | __wake_up_common(q, mode, nr_exclusive, sync, key); | 5667 | __wake_up_common(q, mode, nr_exclusive, wake_flags, key); |
5607 | spin_unlock_irqrestore(&q->lock, flags); | 5668 | spin_unlock_irqrestore(&q->lock, flags); |
5608 | } | 5669 | } |
5609 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); | 5670 | EXPORT_SYMBOL_GPL(__wake_up_sync_key); |
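
The wakeup paths stop passing a bare int sync and instead carry a wake_flags bitmask, with WF_SYNC marking synchronous wakeups; __wake_up_sync_key() simply translates its old behaviour into the new flag. A compact model of flag-based wakeup dispatch; the 0x01 value for WF_SYNC is illustrative here (the real definition lives in the matching sched.h change, not in this hunk):

#include <stdio.h>

#define WF_SYNC 0x01	/* waker intends to sleep right after the wakeup */

/* Stand-in for default_wake_function(): just report what it was asked. */
static int wake_function(const char *who, unsigned mode, int wake_flags)
{
	printf("wake %s, mode=%u, %ssync\n", who, mode,
	       (wake_flags & WF_SYNC) ? "" : "non-");
	return 1;
}

int main(void)
{
	int nr_exclusive = 1;
	/* Same decision __wake_up_sync_key() makes above. */
	int wake_flags = nr_exclusive ? WF_SYNC : 0;

	wake_function("waiter", 1, wake_flags);
	return 0;
}
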
@@ -6123,17 +6184,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, | |||
6123 | unsigned long flags; | 6184 | unsigned long flags; |
6124 | const struct sched_class *prev_class = p->sched_class; | 6185 | const struct sched_class *prev_class = p->sched_class; |
6125 | struct rq *rq; | 6186 | struct rq *rq; |
6187 | int reset_on_fork; | ||
6126 | 6188 | ||
6127 | /* may grab non-irq protected spin_locks */ | 6189 | /* may grab non-irq protected spin_locks */ |
6128 | BUG_ON(in_interrupt()); | 6190 | BUG_ON(in_interrupt()); |
6129 | recheck: | 6191 | recheck: |
6130 | /* double check policy once rq lock held */ | 6192 | /* double check policy once rq lock held */ |
6131 | if (policy < 0) | 6193 | if (policy < 0) { |
6194 | reset_on_fork = p->sched_reset_on_fork; | ||
6132 | policy = oldpolicy = p->policy; | 6195 | policy = oldpolicy = p->policy; |
6133 | else if (policy != SCHED_FIFO && policy != SCHED_RR && | 6196 | } else { |
6134 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | 6197 | reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); |
6135 | policy != SCHED_IDLE) | 6198 | policy &= ~SCHED_RESET_ON_FORK; |
6136 | return -EINVAL; | 6199 | |
6200 | if (policy != SCHED_FIFO && policy != SCHED_RR && | ||
6201 | policy != SCHED_NORMAL && policy != SCHED_BATCH && | ||
6202 | policy != SCHED_IDLE) | ||
6203 | return -EINVAL; | ||
6204 | } | ||
6205 | |||
6137 | /* | 6206 | /* |
6138 | * Valid priorities for SCHED_FIFO and SCHED_RR are | 6207 | * Valid priorities for SCHED_FIFO and SCHED_RR are |
6139 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, | 6208 | * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, |
@@ -6177,6 +6246,10 @@ recheck: | |||
6177 | /* can't change other user's priorities */ | 6246 | /* can't change other user's priorities */ |
6178 | if (!check_same_owner(p)) | 6247 | if (!check_same_owner(p)) |
6179 | return -EPERM; | 6248 | return -EPERM; |
6249 | |||
6250 | /* Normal users shall not reset the sched_reset_on_fork flag */ | ||
6251 | if (p->sched_reset_on_fork && !reset_on_fork) | ||
6252 | return -EPERM; | ||
6180 | } | 6253 | } |
6181 | 6254 | ||
6182 | if (user) { | 6255 | if (user) { |
@@ -6220,6 +6293,8 @@ recheck: | |||
6220 | if (running) | 6293 | if (running) |
6221 | p->sched_class->put_prev_task(rq, p); | 6294 | p->sched_class->put_prev_task(rq, p); |
6222 | 6295 | ||
6296 | p->sched_reset_on_fork = reset_on_fork; | ||
6297 | |||
6223 | oldprio = p->prio; | 6298 | oldprio = p->prio; |
6224 | __setscheduler(rq, p, policy, param->sched_priority); | 6299 | __setscheduler(rq, p, policy, param->sched_priority); |
6225 | 6300 | ||
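
__sched_setscheduler() now accepts the policy ORed with SCHED_RESET_ON_FORK, remembers it in p->sched_reset_on_fork, and refuses to let an unprivileged caller clear a flag that is already set; sched_getscheduler() (next hunk) reports the flag ORed back into the returned policy. From user space the flag rides along with the policy argument, as below; the 0x40000000 fallback define matches the kernel's value but is only needed where libc headers predate the flag:

#include <stdio.h>
#include <sched.h>
#include <string.h>
#include <errno.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK 0x40000000	/* kernel value; old libc headers lack it */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* Become SCHED_FIFO, but have children fall back to a normal policy. */
	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp) == -1) {
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return 1;
	}

	/* The flag comes back ORed into the policy. */
	int policy = sched_getscheduler(0);

	printf("policy=%d reset_on_fork=%d\n",
	       policy & ~SCHED_RESET_ON_FORK, !!(policy & SCHED_RESET_ON_FORK));
	return 0;
}
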
@@ -6336,14 +6411,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) | |||
6336 | if (p) { | 6411 | if (p) { |
6337 | retval = security_task_getscheduler(p); | 6412 | retval = security_task_getscheduler(p); |
6338 | if (!retval) | 6413 | if (!retval) |
6339 | retval = p->policy; | 6414 | retval = p->policy |
6415 | | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); | ||
6340 | } | 6416 | } |
6341 | read_unlock(&tasklist_lock); | 6417 | read_unlock(&tasklist_lock); |
6342 | return retval; | 6418 | return retval; |
6343 | } | 6419 | } |
6344 | 6420 | ||
6345 | /** | 6421 | /** |
6346 | * sys_sched_getscheduler - get the RT priority of a thread | 6422 | * sys_sched_getparam - get the RT priority of a thread |
6347 | * @pid: the pid in question. | 6423 | * @pid: the pid in question. |
6348 | * @param: structure containing the RT priority. | 6424 | * @param: structure containing the RT priority. |
6349 | */ | 6425 | */ |
@@ -6571,19 +6647,9 @@ static inline int should_resched(void) | |||
6571 | 6647 | ||
6572 | static void __cond_resched(void) | 6648 | static void __cond_resched(void) |
6573 | { | 6649 | { |
6574 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 6650 | add_preempt_count(PREEMPT_ACTIVE); |
6575 | __might_sleep(__FILE__, __LINE__); | 6651 | schedule(); |
6576 | #endif | 6652 | sub_preempt_count(PREEMPT_ACTIVE); |
6577 | /* | ||
6578 | * The BKS might be reacquired before we have dropped | ||
6579 | * PREEMPT_ACTIVE, which could trigger a second | ||
6580 | * cond_resched() call. | ||
6581 | */ | ||
6582 | do { | ||
6583 | add_preempt_count(PREEMPT_ACTIVE); | ||
6584 | schedule(); | ||
6585 | sub_preempt_count(PREEMPT_ACTIVE); | ||
6586 | } while (need_resched()); | ||
6587 | } | 6653 | } |
6588 | 6654 | ||
6589 | int __sched _cond_resched(void) | 6655 | int __sched _cond_resched(void) |
@@ -6597,18 +6663,20 @@ int __sched _cond_resched(void) | |||
6597 | EXPORT_SYMBOL(_cond_resched); | 6663 | EXPORT_SYMBOL(_cond_resched); |
6598 | 6664 | ||
6599 | /* | 6665 | /* |
6600 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 6666 | * __cond_resched_lock() - if a reschedule is pending, drop the given lock, |
6601 | * call schedule, and on return reacquire the lock. | 6667 | * call schedule, and on return reacquire the lock. |
6602 | * | 6668 | * |
6603 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 6669 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
6604 | * operations here to prevent schedule() from being called twice (once via | 6670 | * operations here to prevent schedule() from being called twice (once via |
6605 | * spin_unlock(), once by hand). | 6671 | * spin_unlock(), once by hand). |
6606 | */ | 6672 | */ |
6607 | int cond_resched_lock(spinlock_t *lock) | 6673 | int __cond_resched_lock(spinlock_t *lock) |
6608 | { | 6674 | { |
6609 | int resched = should_resched(); | 6675 | int resched = should_resched(); |
6610 | int ret = 0; | 6676 | int ret = 0; |
6611 | 6677 | ||
6678 | lockdep_assert_held(lock); | ||
6679 | |||
6612 | if (spin_needbreak(lock) || resched) { | 6680 | if (spin_needbreak(lock) || resched) { |
6613 | spin_unlock(lock); | 6681 | spin_unlock(lock); |
6614 | if (resched) | 6682 | if (resched) |
@@ -6620,9 +6688,9 @@ int cond_resched_lock(spinlock_t *lock) | |||
6620 | } | 6688 | } |
6621 | return ret; | 6689 | return ret; |
6622 | } | 6690 | } |
6623 | EXPORT_SYMBOL(cond_resched_lock); | 6691 | EXPORT_SYMBOL(__cond_resched_lock); |
6624 | 6692 | ||
6625 | int __sched cond_resched_softirq(void) | 6693 | int __sched __cond_resched_softirq(void) |
6626 | { | 6694 | { |
6627 | BUG_ON(!in_softirq()); | 6695 | BUG_ON(!in_softirq()); |
6628 | 6696 | ||
@@ -6634,7 +6702,7 @@ int __sched cond_resched_softirq(void) | |||
6634 | } | 6702 | } |
6635 | return 0; | 6703 | return 0; |
6636 | } | 6704 | } |
6637 | EXPORT_SYMBOL(cond_resched_softirq); | 6705 | EXPORT_SYMBOL(__cond_resched_softirq); |
6638 | 6706 | ||
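
The cond_resched family loses the per-call DEBUG_SPINLOCK_SLEEP check and the BKL-era retry loop; the out-of-line helpers get a leading double underscore, the lock variant gains lockdep_assert_held(), and the might-sleep annotation is presumably hoisted into wrapper macros in the matching sched.h change (not shown here), which pass an explicit preempt offset. A runnable model of that call-site/helper split; PREEMPT_LOCK_OFFSET and the macro shape are assumptions:

#include <stdio.h>

/* The macro records the call site for the debug check; the out-of-line
 * helper does the real work (dropping the lock and scheduling). */
static void __might_sleep(const char *file, int line, int preempt_offset)
{
	printf("might_sleep check at %s:%d (allowed preempt offset %d)\n",
	       file, line, preempt_offset);
}

static int __cond_resched_lock(void)
{
	/* kernel: lockdep_assert_held(lock); maybe unlock, schedule, relock */
	puts("__cond_resched_lock: drop lock, schedule, retake");
	return 1;
}

#define PREEMPT_LOCK_OFFSET 1	/* assumed: one spinlock held at the call site */

#define cond_resched_lock(lock) ({				  \
	__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);	  \
	(void)(lock);						  \
	__cond_resched_lock();					  \
})

int main(void)
{
	int dummy_lock = 0;

	return !cond_resched_lock(&dummy_lock);
}
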
6639 | /** | 6707 | /** |
6640 | * yield - yield the current processor to other threads. | 6708 | * yield - yield the current processor to other threads. |
@@ -6658,11 +6726,13 @@ EXPORT_SYMBOL(yield); | |||
6658 | */ | 6726 | */ |
6659 | void __sched io_schedule(void) | 6727 | void __sched io_schedule(void) |
6660 | { | 6728 | { |
6661 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6729 | struct rq *rq = raw_rq(); |
6662 | 6730 | ||
6663 | delayacct_blkio_start(); | 6731 | delayacct_blkio_start(); |
6664 | atomic_inc(&rq->nr_iowait); | 6732 | atomic_inc(&rq->nr_iowait); |
6733 | current->in_iowait = 1; | ||
6665 | schedule(); | 6734 | schedule(); |
6735 | current->in_iowait = 0; | ||
6666 | atomic_dec(&rq->nr_iowait); | 6736 | atomic_dec(&rq->nr_iowait); |
6667 | delayacct_blkio_end(); | 6737 | delayacct_blkio_end(); |
6668 | } | 6738 | } |
@@ -6670,12 +6740,14 @@ EXPORT_SYMBOL(io_schedule); | |||
6670 | 6740 | ||
6671 | long __sched io_schedule_timeout(long timeout) | 6741 | long __sched io_schedule_timeout(long timeout) |
6672 | { | 6742 | { |
6673 | struct rq *rq = &__raw_get_cpu_var(runqueues); | 6743 | struct rq *rq = raw_rq(); |
6674 | long ret; | 6744 | long ret; |
6675 | 6745 | ||
6676 | delayacct_blkio_start(); | 6746 | delayacct_blkio_start(); |
6677 | atomic_inc(&rq->nr_iowait); | 6747 | atomic_inc(&rq->nr_iowait); |
6748 | current->in_iowait = 1; | ||
6678 | ret = schedule_timeout(timeout); | 6749 | ret = schedule_timeout(timeout); |
6750 | current->in_iowait = 0; | ||
6679 | atomic_dec(&rq->nr_iowait); | 6751 | atomic_dec(&rq->nr_iowait); |
6680 | delayacct_blkio_end(); | 6752 | delayacct_blkio_end(); |
6681 | return ret; | 6753 | return ret; |
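
Both io_schedule() variants now bracket the sleep with current->in_iowait = 1/0 in addition to the per-runqueue nr_iowait counter, so iowait can be attributed to the sleeping task itself (presumably consumed by the iowait accounting added elsewhere in this series). The bracket pattern, modelled in miniature:

#include <stdio.h>

struct task_model { int in_iowait; };
struct rq_model   { int nr_iowait; };

static void io_schedule_model(struct task_model *tsk, struct rq_model *rq)
{
	rq->nr_iowait++;	/* per-runqueue count, as before            */
	tsk->in_iowait = 1;	/* new: mark the task as waiting on I/O     */
	puts("schedule(): task sleeps until the I/O completes");
	tsk->in_iowait = 0;
	rq->nr_iowait--;
}

int main(void)
{
	struct task_model t = { 0 };
	struct rq_model r = { 0 };

	io_schedule_model(&t, &r);
	printf("after wakeup: in_iowait=%d nr_iowait=%d\n", t.in_iowait, r.nr_iowait);
	return 0;
}
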
@@ -6759,23 +6831,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, | |||
6759 | if (retval) | 6831 | if (retval) |
6760 | goto out_unlock; | 6832 | goto out_unlock; |
6761 | 6833 | ||
6762 | /* | 6834 | time_slice = p->sched_class->get_rr_interval(p); |
6763 | * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER | ||
6764 | * tasks that are on an otherwise idle runqueue: | ||
6765 | */ | ||
6766 | time_slice = 0; | ||
6767 | if (p->policy == SCHED_RR) { | ||
6768 | time_slice = DEF_TIMESLICE; | ||
6769 | } else if (p->policy != SCHED_FIFO) { | ||
6770 | struct sched_entity *se = &p->se; | ||
6771 | unsigned long flags; | ||
6772 | struct rq *rq; | ||
6773 | 6835 | ||
6774 | rq = task_rq_lock(p, &flags); | ||
6775 | if (rq->cfs.load.weight) | ||
6776 | time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); | ||
6777 | task_rq_unlock(rq, &flags); | ||
6778 | } | ||
6779 | read_unlock(&tasklist_lock); | 6836 | read_unlock(&tasklist_lock); |
6780 | jiffies_to_timespec(time_slice, &t); | 6837 | jiffies_to_timespec(time_slice, &t); |
6781 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 6838 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
@@ -6992,8 +7049,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) | |||
6992 | 7049 | ||
6993 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { | 7050 | if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { |
6994 | /* Need help from migration thread: drop lock and wait. */ | 7051 | /* Need help from migration thread: drop lock and wait. */ |
7052 | struct task_struct *mt = rq->migration_thread; | ||
7053 | |||
7054 | get_task_struct(mt); | ||
6995 | task_rq_unlock(rq, &flags); | 7055 | task_rq_unlock(rq, &flags); |
6996 | wake_up_process(rq->migration_thread); | 7056 | wake_up_process(rq->migration_thread); |
7057 | put_task_struct(mt); | ||
6997 | wait_for_completion(&req.done); | 7058 | wait_for_completion(&req.done); |
6998 | tlb_migrate_finish(p->mm); | 7059 | tlb_migrate_finish(p->mm); |
6999 | return 0; | 7060 | return 0; |
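
set_cpus_allowed_ptr() has to drop the runqueue lock before waking the migration thread, and on a concurrently hot-unplugged CPU that kthread can disappear in the gap; taking a reference with get_task_struct() before unlocking keeps the task_struct valid across wake_up_process(). The grab-before-unlock pattern in a standalone refcount model:

#include <stdio.h>

/* Minimal model of the get_task_struct()/put_task_struct() refcount. */
struct task_model {
	int usage;
	const char *comm;
};

static void get_task(struct task_model *t) { t->usage++; }

static void put_task(struct task_model *t)
{
	if (--t->usage == 0)
		printf("%s: last reference gone, task_struct freed\n", t->comm);
}

int main(void)
{
	struct task_model mt = { .usage = 1, .comm = "migration/3" };

	get_task(&mt);			/* pin it while rq->lock is still held */
	puts("task_rq_unlock()");
	puts("wake_up_process(mt)");	/* safe even if the CPU just went away */
	put_task(&mt);			/* drop our pin                        */

	put_task(&mt);			/* e.g. the hotplug path drops the last one */
	return 0;
}
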
@@ -7051,6 +7112,11 @@ fail: | |||
7051 | return ret; | 7112 | return ret; |
7052 | } | 7113 | } |
7053 | 7114 | ||
7115 | #define RCU_MIGRATION_IDLE 0 | ||
7116 | #define RCU_MIGRATION_NEED_QS 1 | ||
7117 | #define RCU_MIGRATION_GOT_QS 2 | ||
7118 | #define RCU_MIGRATION_MUST_SYNC 3 | ||
7119 | |||
7054 | /* | 7120 | /* |
7055 | * migration_thread - this is a highprio system thread that performs | 7121 | * migration_thread - this is a highprio system thread that performs |
7056 | * thread migration by bumping thread off CPU then 'pushing' onto | 7122 | * thread migration by bumping thread off CPU then 'pushing' onto |
@@ -7058,6 +7124,7 @@ fail: | |||
7058 | */ | 7124 | */ |
7059 | static int migration_thread(void *data) | 7125 | static int migration_thread(void *data) |
7060 | { | 7126 | { |
7127 | int badcpu; | ||
7061 | int cpu = (long)data; | 7128 | int cpu = (long)data; |
7062 | struct rq *rq; | 7129 | struct rq *rq; |
7063 | 7130 | ||
@@ -7092,8 +7159,17 @@ static int migration_thread(void *data) | |||
7092 | req = list_entry(head->next, struct migration_req, list); | 7159 | req = list_entry(head->next, struct migration_req, list); |
7093 | list_del_init(head->next); | 7160 | list_del_init(head->next); |
7094 | 7161 | ||
7095 | spin_unlock(&rq->lock); | 7162 | if (req->task != NULL) { |
7096 | __migrate_task(req->task, cpu, req->dest_cpu); | 7163 | spin_unlock(&rq->lock); |
7164 | __migrate_task(req->task, cpu, req->dest_cpu); | ||
7165 | } else if (likely(cpu == (badcpu = smp_processor_id()))) { | ||
7166 | req->dest_cpu = RCU_MIGRATION_GOT_QS; | ||
7167 | spin_unlock(&rq->lock); | ||
7168 | } else { | ||
7169 | req->dest_cpu = RCU_MIGRATION_MUST_SYNC; | ||
7170 | spin_unlock(&rq->lock); | ||
7171 | WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); | ||
7172 | } | ||
7097 | local_irq_enable(); | 7173 | local_irq_enable(); |
7098 | 7174 | ||
7099 | complete(&req->done); | 7175 | complete(&req->done); |
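
migration_thread() learns to handle requests with req->task == NULL: those carry an RCU_MIGRATION_* state in req->dest_cpu rather than a destination CPU, and (as far as this series goes, they are queued by the expedited RCU grace-period code) exist only to force a trip through the per-CPU migration kthread. The thread answers GOT_QS when it genuinely ran on the expected CPU and MUST_SYNC otherwise. A small model of that reply protocol:

#include <stdio.h>

enum rcu_migration_state {
	RCU_MIGRATION_IDLE,
	RCU_MIGRATION_NEED_QS,
	RCU_MIGRATION_GOT_QS,
	RCU_MIGRATION_MUST_SYNC,
};

struct req_model {
	void *task;	/* NULL means "RCU poke", not a real migration */
	int dest_cpu;	/* doubles as the RCU_MIGRATION_* reply state  */
};

/* What migration_thread() does with one dequeued request on CPU 'cpu'. */
static void handle_req(struct req_model *req, int cpu, int running_cpu)
{
	if (req->task)
		puts("__migrate_task()");
	else if (cpu == running_cpu)
		req->dest_cpu = RCU_MIGRATION_GOT_QS;	 /* quiescent state observed */
	else
		req->dest_cpu = RCU_MIGRATION_MUST_SYNC; /* caller must fall back    */
}

int main(void)
{
	struct req_model poke = { .task = NULL, .dest_cpu = RCU_MIGRATION_NEED_QS };

	handle_req(&poke, 3, 3);
	printf("reply state: %d (2 == GOT_QS)\n", poke.dest_cpu);
	return 0;
}
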
@@ -7607,7 +7683,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
7607 | /* | 7683 | /* |
7608 | * Register at high priority so that task migration (migrate_all_tasks) | 7684 | * Register at high priority so that task migration (migrate_all_tasks) |
7609 | * happens before everything else. This has to be lower priority than | 7685 | * happens before everything else. This has to be lower priority than |
7610 | * the notifier in the perf_counter subsystem, though. | 7686 | * the notifier in the perf_event subsystem, though. |
7611 | */ | 7687 | */ |
7612 | static struct notifier_block __cpuinitdata migration_notifier = { | 7688 | static struct notifier_block __cpuinitdata migration_notifier = { |
7613 | .notifier_call = migration_call, | 7689 | .notifier_call = migration_call, |
@@ -7625,7 +7701,7 @@ static int __init migration_init(void) | |||
7625 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 7701 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
7626 | register_cpu_notifier(&migration_notifier); | 7702 | register_cpu_notifier(&migration_notifier); |
7627 | 7703 | ||
7628 | return err; | 7704 | return 0; |
7629 | } | 7705 | } |
7630 | early_initcall(migration_init); | 7706 | early_initcall(migration_init); |
7631 | #endif | 7707 | #endif |
@@ -7672,7 +7748,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
7672 | break; | 7748 | break; |
7673 | } | 7749 | } |
7674 | 7750 | ||
7675 | if (!group->__cpu_power) { | 7751 | if (!group->cpu_power) { |
7676 | printk(KERN_CONT "\n"); | 7752 | printk(KERN_CONT "\n"); |
7677 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 7753 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
7678 | "set\n"); | 7754 | "set\n"); |
@@ -7696,9 +7772,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
7696 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); | 7772 | cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); |
7697 | 7773 | ||
7698 | printk(KERN_CONT " %s", str); | 7774 | printk(KERN_CONT " %s", str); |
7699 | if (group->__cpu_power != SCHED_LOAD_SCALE) { | 7775 | if (group->cpu_power != SCHED_LOAD_SCALE) { |
7700 | printk(KERN_CONT " (__cpu_power = %d)", | 7776 | printk(KERN_CONT " (cpu_power = %d)", |
7701 | group->__cpu_power); | 7777 | group->cpu_power); |
7702 | } | 7778 | } |
7703 | 7779 | ||
7704 | group = group->next; | 7780 | group = group->next; |
@@ -7763,9 +7839,7 @@ static int sd_degenerate(struct sched_domain *sd) | |||
7763 | } | 7839 | } |
7764 | 7840 | ||
7765 | /* Following flags don't use groups */ | 7841 | /* Following flags don't use groups */ |
7766 | if (sd->flags & (SD_WAKE_IDLE | | 7842 | if (sd->flags & (SD_WAKE_AFFINE)) |
7767 | SD_WAKE_AFFINE | | ||
7768 | SD_WAKE_BALANCE)) | ||
7769 | return 0; | 7843 | return 0; |
7770 | 7844 | ||
7771 | return 1; | 7845 | return 1; |
@@ -7782,10 +7856,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
7782 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) | 7856 | if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) |
7783 | return 0; | 7857 | return 0; |
7784 | 7858 | ||
7785 | /* Does parent contain flags not in child? */ | ||
7786 | /* WAKE_BALANCE is a subset of WAKE_AFFINE */ | ||
7787 | if (cflags & SD_WAKE_AFFINE) | ||
7788 | pflags &= ~SD_WAKE_BALANCE; | ||
7789 | /* Flags needing groups don't count if only 1 group in parent */ | 7859 | /* Flags needing groups don't count if only 1 group in parent */ |
7790 | if (parent->groups == parent->groups->next) { | 7860 | if (parent->groups == parent->groups->next) { |
7791 | pflags &= ~(SD_LOAD_BALANCE | | 7861 | pflags &= ~(SD_LOAD_BALANCE | |
@@ -7841,7 +7911,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) | |||
7841 | rq->rd = rd; | 7911 | rq->rd = rd; |
7842 | 7912 | ||
7843 | cpumask_set_cpu(rq->cpu, rd->span); | 7913 | cpumask_set_cpu(rq->cpu, rd->span); |
7844 | if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) | 7914 | if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) |
7845 | set_rq_online(rq); | 7915 | set_rq_online(rq); |
7846 | 7916 | ||
7847 | spin_unlock_irqrestore(&rq->lock, flags); | 7917 | spin_unlock_irqrestore(&rq->lock, flags); |
@@ -7983,7 +8053,7 @@ init_sched_build_groups(const struct cpumask *span, | |||
7983 | continue; | 8053 | continue; |
7984 | 8054 | ||
7985 | cpumask_clear(sched_group_cpus(sg)); | 8055 | cpumask_clear(sched_group_cpus(sg)); |
7986 | sg->__cpu_power = 0; | 8056 | sg->cpu_power = 0; |
7987 | 8057 | ||
7988 | for_each_cpu(j, span) { | 8058 | for_each_cpu(j, span) { |
7989 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) | 8059 | if (group_fn(j, cpu_map, NULL, tmpmask) != group) |
@@ -8091,6 +8161,39 @@ struct static_sched_domain { | |||
8091 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); | 8161 | DECLARE_BITMAP(span, CONFIG_NR_CPUS); |
8092 | }; | 8162 | }; |
8093 | 8163 | ||
8164 | struct s_data { | ||
8165 | #ifdef CONFIG_NUMA | ||
8166 | int sd_allnodes; | ||
8167 | cpumask_var_t domainspan; | ||
8168 | cpumask_var_t covered; | ||
8169 | cpumask_var_t notcovered; | ||
8170 | #endif | ||
8171 | cpumask_var_t nodemask; | ||
8172 | cpumask_var_t this_sibling_map; | ||
8173 | cpumask_var_t this_core_map; | ||
8174 | cpumask_var_t send_covered; | ||
8175 | cpumask_var_t tmpmask; | ||
8176 | struct sched_group **sched_group_nodes; | ||
8177 | struct root_domain *rd; | ||
8178 | }; | ||
8179 | |||
8180 | enum s_alloc { | ||
8181 | sa_sched_groups = 0, | ||
8182 | sa_rootdomain, | ||
8183 | sa_tmpmask, | ||
8184 | sa_send_covered, | ||
8185 | sa_this_core_map, | ||
8186 | sa_this_sibling_map, | ||
8187 | sa_nodemask, | ||
8188 | sa_sched_group_nodes, | ||
8189 | #ifdef CONFIG_NUMA | ||
8190 | sa_notcovered, | ||
8191 | sa_covered, | ||
8192 | sa_domainspan, | ||
8193 | #endif | ||
8194 | sa_none, | ||
8195 | }; | ||
8196 | |||
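
struct s_data gathers every allocation __build_sched_domains() needs, and enum s_alloc records how far the allocation sequence got; __free_domain_allocs() further down switches on that value and deliberately falls through, freeing in exact reverse order and replacing the old ladder of gotos. The pattern in miniature, with stand-in resources:

#include <stdio.h>
#include <stdlib.h>

enum s_alloc { sa_all, sa_second, sa_first, sa_none };

struct s_data_model { void *first, *second, *third; };

/* Free everything allocated so far; intentional fall-through, newest first. */
static void free_allocs(struct s_data_model *d, enum s_alloc got)
{
	switch (got) {
	case sa_all:	free(d->third);		/* fall through */
	case sa_second:	free(d->second);	/* fall through */
	case sa_first:	free(d->first);		/* fall through */
	case sa_none:	break;
	}
}

/* Each return value names the last allocation that succeeded. */
static enum s_alloc do_allocs(struct s_data_model *d)
{
	if (!(d->first  = malloc(16))) return sa_none;
	if (!(d->second = malloc(16))) return sa_first;
	if (!(d->third  = malloc(16))) return sa_second;
	return sa_all;
}

int main(void)
{
	struct s_data_model d = { 0 };
	enum s_alloc state = do_allocs(&d);

	if (state != sa_all)
		puts("partial allocation, unwinding");
	free_allocs(&d, state);	/* one exit path, no goto ladder */
	return state == sa_all ? 0 : 1;
}
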
8094 | /* | 8197 | /* |
8095 | * SMT sched-domains: | 8198 | * SMT sched-domains: |
8096 | */ | 8199 | */ |
@@ -8208,11 +8311,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) | |||
8208 | continue; | 8311 | continue; |
8209 | } | 8312 | } |
8210 | 8313 | ||
8211 | sg_inc_cpu_power(sg, sd->groups->__cpu_power); | 8314 | sg->cpu_power += sd->groups->cpu_power; |
8212 | } | 8315 | } |
8213 | sg = sg->next; | 8316 | sg = sg->next; |
8214 | } while (sg != group_head); | 8317 | } while (sg != group_head); |
8215 | } | 8318 | } |
8319 | |||
8320 | static int build_numa_sched_groups(struct s_data *d, | ||
8321 | const struct cpumask *cpu_map, int num) | ||
8322 | { | ||
8323 | struct sched_domain *sd; | ||
8324 | struct sched_group *sg, *prev; | ||
8325 | int n, j; | ||
8326 | |||
8327 | cpumask_clear(d->covered); | ||
8328 | cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); | ||
8329 | if (cpumask_empty(d->nodemask)) { | ||
8330 | d->sched_group_nodes[num] = NULL; | ||
8331 | goto out; | ||
8332 | } | ||
8333 | |||
8334 | sched_domain_node_span(num, d->domainspan); | ||
8335 | cpumask_and(d->domainspan, d->domainspan, cpu_map); | ||
8336 | |||
8337 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8338 | GFP_KERNEL, num); | ||
8339 | if (!sg) { | ||
8340 | printk(KERN_WARNING "Can not alloc domain group for node %d\n", | ||
8341 | num); | ||
8342 | return -ENOMEM; | ||
8343 | } | ||
8344 | d->sched_group_nodes[num] = sg; | ||
8345 | |||
8346 | for_each_cpu(j, d->nodemask) { | ||
8347 | sd = &per_cpu(node_domains, j).sd; | ||
8348 | sd->groups = sg; | ||
8349 | } | ||
8350 | |||
8351 | sg->cpu_power = 0; | ||
8352 | cpumask_copy(sched_group_cpus(sg), d->nodemask); | ||
8353 | sg->next = sg; | ||
8354 | cpumask_or(d->covered, d->covered, d->nodemask); | ||
8355 | |||
8356 | prev = sg; | ||
8357 | for (j = 0; j < nr_node_ids; j++) { | ||
8358 | n = (num + j) % nr_node_ids; | ||
8359 | cpumask_complement(d->notcovered, d->covered); | ||
8360 | cpumask_and(d->tmpmask, d->notcovered, cpu_map); | ||
8361 | cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); | ||
8362 | if (cpumask_empty(d->tmpmask)) | ||
8363 | break; | ||
8364 | cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); | ||
8365 | if (cpumask_empty(d->tmpmask)) | ||
8366 | continue; | ||
8367 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | ||
8368 | GFP_KERNEL, num); | ||
8369 | if (!sg) { | ||
8370 | printk(KERN_WARNING | ||
8371 | "Can not alloc domain group for node %d\n", j); | ||
8372 | return -ENOMEM; | ||
8373 | } | ||
8374 | sg->cpu_power = 0; | ||
8375 | cpumask_copy(sched_group_cpus(sg), d->tmpmask); | ||
8376 | sg->next = prev->next; | ||
8377 | cpumask_or(d->covered, d->covered, d->tmpmask); | ||
8378 | prev->next = sg; | ||
8379 | prev = sg; | ||
8380 | } | ||
8381 | out: | ||
8382 | return 0; | ||
8383 | } | ||
8216 | #endif /* CONFIG_NUMA */ | 8384 | #endif /* CONFIG_NUMA */ |
8217 | 8385 | ||
8218 | #ifdef CONFIG_NUMA | 8386 | #ifdef CONFIG_NUMA |
@@ -8266,15 +8434,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, | |||
8266 | * there are asymmetries in the topology. If there are asymmetries, group | 8434 | * there are asymmetries in the topology. If there are asymmetries, group |
8267 | * having more cpu_power will pickup more load compared to the group having | 8435 | * having more cpu_power will pickup more load compared to the group having |
8268 | * less cpu_power. | 8436 | * less cpu_power. |
8269 | * | ||
8270 | * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents | ||
8271 | * the maximum number of tasks a group can handle in the presence of other idle | ||
8272 | * or lightly loaded groups in the same sched domain. | ||
8273 | */ | 8437 | */ |
8274 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) | 8438 | static void init_sched_groups_power(int cpu, struct sched_domain *sd) |
8275 | { | 8439 | { |
8276 | struct sched_domain *child; | 8440 | struct sched_domain *child; |
8277 | struct sched_group *group; | 8441 | struct sched_group *group; |
8442 | long power; | ||
8443 | int weight; | ||
8278 | 8444 | ||
8279 | WARN_ON(!sd || !sd->groups); | 8445 | WARN_ON(!sd || !sd->groups); |
8280 | 8446 | ||
@@ -8283,28 +8449,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
8283 | 8449 | ||
8284 | child = sd->child; | 8450 | child = sd->child; |
8285 | 8451 | ||
8286 | sd->groups->__cpu_power = 0; | 8452 | sd->groups->cpu_power = 0; |
8287 | 8453 | ||
8288 | /* | 8454 | if (!child) { |
8289 | * For perf policy, if the groups in child domain share resources | 8455 | power = SCHED_LOAD_SCALE; |
8290 | * (for example cores sharing some portions of the cache hierarchy | 8456 | weight = cpumask_weight(sched_domain_span(sd)); |
8291 | * or SMT), then set this domain groups cpu_power such that each group | 8457 | /* |
8292 | * can handle only one task, when there are other idle groups in the | 8458 | * SMT siblings share the power of a single core. |
8293 | * same sched domain. | 8459 | * Usually multiple threads get a better yield out of |
8294 | */ | 8460 | * that one core than a single thread would have, |
8295 | if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && | 8461 | * reflect that in sd->smt_gain. |
8296 | (child->flags & | 8462 | */ |
8297 | (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { | 8463 | if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { |
8298 | sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); | 8464 | power *= sd->smt_gain; |
8465 | power /= weight; | ||
8466 | power >>= SCHED_LOAD_SHIFT; | ||
8467 | } | ||
8468 | sd->groups->cpu_power += power; | ||
8299 | return; | 8469 | return; |
8300 | } | 8470 | } |
8301 | 8471 | ||
8302 | /* | 8472 | /* |
8303 | * add cpu_power of each child group to this groups cpu_power | 8473 | * Add cpu_power of each child group to this groups cpu_power. |
8304 | */ | 8474 | */ |
8305 | group = child->groups; | 8475 | group = child->groups; |
8306 | do { | 8476 | do { |
8307 | sg_inc_cpu_power(sd->groups, group->__cpu_power); | 8477 | sd->groups->cpu_power += group->cpu_power; |
8308 | group = group->next; | 8478 | group = group->next; |
8309 | } while (group != child->groups); | 8479 | } while (group != child->groups); |
8310 | } | 8480 | } |
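
For a bottom-level domain, init_sched_groups_power() no longer special-cases the powersavings policy: each group starts from SCHED_LOAD_SCALE, and an SMT domain spreads SCHED_LOAD_SCALE * smt_gain across its siblings so that two hardware threads together are worth slightly more than one full core. Worked with assumed constants (SCHED_LOAD_SHIFT = 10, and a default smt_gain of 1178, i.e. roughly a 15% SMT yield, taken on trust from the topology defaults rather than from this hunk):

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)	/* 1024 */

int main(void)
{
	unsigned long smt_gain = 1178;	/* assumed default: ~15% gain for SMT  */
	unsigned int  weight   = 2;	/* two hardware threads in the domain  */

	unsigned long power = SCHED_LOAD_SCALE;

	power *= smt_gain;		/* same steps as the hunk above */
	power /= weight;
	power >>= SCHED_LOAD_SHIFT;

	printf("per-thread cpu_power = %lu, core total = %lu\n",
	       power, power * weight);	/* 589 and 1178, vs. 1024 for one core */
	return 0;
}
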
@@ -8371,287 +8541,292 @@ static void set_domain_attribute(struct sched_domain *sd, | |||
8371 | request = attr->relax_domain_level; | 8541 | request = attr->relax_domain_level; |
8372 | if (request < sd->level) { | 8542 | if (request < sd->level) { |
8373 | /* turn off idle balance on this domain */ | 8543 | /* turn off idle balance on this domain */ |
8374 | sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); | 8544 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
8375 | } else { | 8545 | } else { |
8376 | /* turn on idle balance on this domain */ | 8546 | /* turn on idle balance on this domain */ |
8377 | sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); | 8547 | sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
8548 | } | ||
8549 | } | ||
8550 | |||
8551 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | ||
8552 | const struct cpumask *cpu_map) | ||
8553 | { | ||
8554 | switch (what) { | ||
8555 | case sa_sched_groups: | ||
8556 | free_sched_groups(cpu_map, d->tmpmask); /* fall through */ | ||
8557 | d->sched_group_nodes = NULL; | ||
8558 | case sa_rootdomain: | ||
8559 | free_rootdomain(d->rd); /* fall through */ | ||
8560 | case sa_tmpmask: | ||
8561 | free_cpumask_var(d->tmpmask); /* fall through */ | ||
8562 | case sa_send_covered: | ||
8563 | free_cpumask_var(d->send_covered); /* fall through */ | ||
8564 | case sa_this_core_map: | ||
8565 | free_cpumask_var(d->this_core_map); /* fall through */ | ||
8566 | case sa_this_sibling_map: | ||
8567 | free_cpumask_var(d->this_sibling_map); /* fall through */ | ||
8568 | case sa_nodemask: | ||
8569 | free_cpumask_var(d->nodemask); /* fall through */ | ||
8570 | case sa_sched_group_nodes: | ||
8571 | #ifdef CONFIG_NUMA | ||
8572 | kfree(d->sched_group_nodes); /* fall through */ | ||
8573 | case sa_notcovered: | ||
8574 | free_cpumask_var(d->notcovered); /* fall through */ | ||
8575 | case sa_covered: | ||
8576 | free_cpumask_var(d->covered); /* fall through */ | ||
8577 | case sa_domainspan: | ||
8578 | free_cpumask_var(d->domainspan); /* fall through */ | ||
8579 | #endif | ||
8580 | case sa_none: | ||
8581 | break; | ||
8378 | } | 8582 | } |
8379 | } | 8583 | } |
8380 | 8584 | ||
8381 | /* | 8585 | static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, |
8382 | * Build sched domains for a given set of cpus and attach the sched domains | 8586 | const struct cpumask *cpu_map) |
8383 | * to the individual cpus | ||
8384 | */ | ||
8385 | static int __build_sched_domains(const struct cpumask *cpu_map, | ||
8386 | struct sched_domain_attr *attr) | ||
8387 | { | 8587 | { |
8388 | int i, err = -ENOMEM; | ||
8389 | struct root_domain *rd; | ||
8390 | cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, | ||
8391 | tmpmask; | ||
8392 | #ifdef CONFIG_NUMA | ||
8393 | cpumask_var_t domainspan, covered, notcovered; | ||
8394 | struct sched_group **sched_group_nodes = NULL; | ||
8395 | int sd_allnodes = 0; | ||
8396 | |||
8397 | if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) | ||
8398 | goto out; | ||
8399 | if (!alloc_cpumask_var(&covered, GFP_KERNEL)) | ||
8400 | goto free_domainspan; | ||
8401 | if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) | ||
8402 | goto free_covered; | ||
8403 | #endif | ||
8404 | |||
8405 | if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) | ||
8406 | goto free_notcovered; | ||
8407 | if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) | ||
8408 | goto free_nodemask; | ||
8409 | if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) | ||
8410 | goto free_this_sibling_map; | ||
8411 | if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) | ||
8412 | goto free_this_core_map; | ||
8413 | if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
8414 | goto free_send_covered; | ||
8415 | |||
8416 | #ifdef CONFIG_NUMA | 8588 | #ifdef CONFIG_NUMA |
8417 | /* | 8589 | if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) |
8418 | * Allocate the per-node list of sched groups | 8590 | return sa_none; |
8419 | */ | 8591 | if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) |
8420 | sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), | 8592 | return sa_domainspan; |
8421 | GFP_KERNEL); | 8593 | if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) |
8422 | if (!sched_group_nodes) { | 8594 | return sa_covered; |
8595 | /* Allocate the per-node list of sched groups */ | ||
8596 | d->sched_group_nodes = kcalloc(nr_node_ids, | ||
8597 | sizeof(struct sched_group *), GFP_KERNEL); | ||
8598 | if (!d->sched_group_nodes) { | ||
8423 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 8599 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
8424 | goto free_tmpmask; | 8600 | return sa_notcovered; |
8425 | } | 8601 | } |
8426 | #endif | 8602 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; |
8427 | 8603 | #endif | |
8428 | rd = alloc_rootdomain(); | 8604 | if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) |
8429 | if (!rd) { | 8605 | return sa_sched_group_nodes; |
8606 | if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) | ||
8607 | return sa_nodemask; | ||
8608 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | ||
8609 | return sa_this_sibling_map; | ||
8610 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
8611 | return sa_this_core_map; | ||
8612 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | ||
8613 | return sa_send_covered; | ||
8614 | d->rd = alloc_rootdomain(); | ||
8615 | if (!d->rd) { | ||
8430 | printk(KERN_WARNING "Cannot alloc root domain\n"); | 8616 | printk(KERN_WARNING "Cannot alloc root domain\n"); |
8431 | goto free_sched_groups; | 8617 | return sa_tmpmask; |
8432 | } | 8618 | } |
8619 | return sa_rootdomain; | ||
8620 | } | ||
8433 | 8621 | ||
8622 | static struct sched_domain *__build_numa_sched_domains(struct s_data *d, | ||
8623 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) | ||
8624 | { | ||
8625 | struct sched_domain *sd = NULL; | ||
8434 | #ifdef CONFIG_NUMA | 8626 | #ifdef CONFIG_NUMA |
8435 | sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; | 8627 | struct sched_domain *parent; |
8436 | #endif | ||
8437 | 8628 | ||
8438 | /* | 8629 | d->sd_allnodes = 0; |
8439 | * Set up domains for cpus specified by the cpu_map. | 8630 | if (cpumask_weight(cpu_map) > |
8440 | */ | 8631 | SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { |
8441 | for_each_cpu(i, cpu_map) { | 8632 | sd = &per_cpu(allnodes_domains, i).sd; |
8442 | struct sched_domain *sd = NULL, *p; | 8633 | SD_INIT(sd, ALLNODES); |
8443 | |||
8444 | cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); | ||
8445 | |||
8446 | #ifdef CONFIG_NUMA | ||
8447 | if (cpumask_weight(cpu_map) > | ||
8448 | SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { | ||
8449 | sd = &per_cpu(allnodes_domains, i).sd; | ||
8450 | SD_INIT(sd, ALLNODES); | ||
8451 | set_domain_attribute(sd, attr); | ||
8452 | cpumask_copy(sched_domain_span(sd), cpu_map); | ||
8453 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); | ||
8454 | p = sd; | ||
8455 | sd_allnodes = 1; | ||
8456 | } else | ||
8457 | p = NULL; | ||
8458 | |||
8459 | sd = &per_cpu(node_domains, i).sd; | ||
8460 | SD_INIT(sd, NODE); | ||
8461 | set_domain_attribute(sd, attr); | 8634 | set_domain_attribute(sd, attr); |
8462 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | 8635 | cpumask_copy(sched_domain_span(sd), cpu_map); |
8463 | sd->parent = p; | 8636 | cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); |
8464 | if (p) | 8637 | d->sd_allnodes = 1; |
8465 | p->child = sd; | 8638 | } |
8466 | cpumask_and(sched_domain_span(sd), | 8639 | parent = sd; |
8467 | sched_domain_span(sd), cpu_map); | 8640 | |
8641 | sd = &per_cpu(node_domains, i).sd; | ||
8642 | SD_INIT(sd, NODE); | ||
8643 | set_domain_attribute(sd, attr); | ||
8644 | sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); | ||
8645 | sd->parent = parent; | ||
8646 | if (parent) | ||
8647 | parent->child = sd; | ||
8648 | cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); | ||
8468 | #endif | 8649 | #endif |
8650 | return sd; | ||
8651 | } | ||
8469 | 8652 | ||
8470 | p = sd; | 8653 | static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, |
8471 | sd = &per_cpu(phys_domains, i).sd; | 8654 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
8472 | SD_INIT(sd, CPU); | 8655 | struct sched_domain *parent, int i) |
8473 | set_domain_attribute(sd, attr); | 8656 | { |
8474 | cpumask_copy(sched_domain_span(sd), nodemask); | 8657 | struct sched_domain *sd; |
8475 | sd->parent = p; | 8658 | sd = &per_cpu(phys_domains, i).sd; |
8476 | if (p) | 8659 | SD_INIT(sd, CPU); |
8477 | p->child = sd; | 8660 | set_domain_attribute(sd, attr); |
8478 | cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); | 8661 | cpumask_copy(sched_domain_span(sd), d->nodemask); |
8662 | sd->parent = parent; | ||
8663 | if (parent) | ||
8664 | parent->child = sd; | ||
8665 | cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
8666 | return sd; | ||
8667 | } | ||
8479 | 8668 | ||
8669 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | ||
8670 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8671 | struct sched_domain *parent, int i) | ||
8672 | { | ||
8673 | struct sched_domain *sd = parent; | ||
8480 | #ifdef CONFIG_SCHED_MC | 8674 | #ifdef CONFIG_SCHED_MC |
8481 | p = sd; | 8675 | sd = &per_cpu(core_domains, i).sd; |
8482 | sd = &per_cpu(core_domains, i).sd; | 8676 | SD_INIT(sd, MC); |
8483 | SD_INIT(sd, MC); | 8677 | set_domain_attribute(sd, attr); |
8484 | set_domain_attribute(sd, attr); | 8678 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); |
8485 | cpumask_and(sched_domain_span(sd), cpu_map, | 8679 | sd->parent = parent; |
8486 | cpu_coregroup_mask(i)); | 8680 | parent->child = sd; |
8487 | sd->parent = p; | 8681 | cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); |
8488 | p->child = sd; | ||
8489 | cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); | ||
8490 | #endif | 8682 | #endif |
8683 | return sd; | ||
8684 | } | ||
8491 | 8685 | ||
8686 | static struct sched_domain *__build_smt_sched_domain(struct s_data *d, | ||
8687 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
8688 | struct sched_domain *parent, int i) | ||
8689 | { | ||
8690 | struct sched_domain *sd = parent; | ||
8492 | #ifdef CONFIG_SCHED_SMT | 8691 | #ifdef CONFIG_SCHED_SMT |
8493 | p = sd; | 8692 | sd = &per_cpu(cpu_domains, i).sd; |
8494 | sd = &per_cpu(cpu_domains, i).sd; | 8693 | SD_INIT(sd, SIBLING); |
8495 | SD_INIT(sd, SIBLING); | 8694 | set_domain_attribute(sd, attr); |
8496 | set_domain_attribute(sd, attr); | 8695 | cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); |
8497 | cpumask_and(sched_domain_span(sd), | 8696 | sd->parent = parent; |
8498 | topology_thread_cpumask(i), cpu_map); | 8697 | parent->child = sd; |
8499 | sd->parent = p; | 8698 | cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); |
8500 | p->child = sd; | ||
8501 | cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); | ||
8502 | #endif | 8699 | #endif |
8503 | } | 8700 | return sd; |
8701 | } | ||
8504 | 8702 | ||
8703 | static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | ||
8704 | const struct cpumask *cpu_map, int cpu) | ||
8705 | { | ||
8706 | switch (l) { | ||
8505 | #ifdef CONFIG_SCHED_SMT | 8707 | #ifdef CONFIG_SCHED_SMT |
8506 | /* Set up CPU (sibling) groups */ | 8708 | case SD_LV_SIBLING: /* set up CPU (sibling) groups */ |
8507 | for_each_cpu(i, cpu_map) { | 8709 | cpumask_and(d->this_sibling_map, cpu_map, |
8508 | cpumask_and(this_sibling_map, | 8710 | topology_thread_cpumask(cpu)); |
8509 | topology_thread_cpumask(i), cpu_map); | 8711 | if (cpu == cpumask_first(d->this_sibling_map)) |
8510 | if (i != cpumask_first(this_sibling_map)) | 8712 | init_sched_build_groups(d->this_sibling_map, cpu_map, |
8511 | continue; | 8713 | &cpu_to_cpu_group, |
8512 | 8714 | d->send_covered, d->tmpmask); | |
8513 | init_sched_build_groups(this_sibling_map, cpu_map, | 8715 | break; |
8514 | &cpu_to_cpu_group, | ||
8515 | send_covered, tmpmask); | ||
8516 | } | ||
8517 | #endif | 8716 | #endif |
8518 | |||
8519 | #ifdef CONFIG_SCHED_MC | 8717 | #ifdef CONFIG_SCHED_MC |
8520 | /* Set up multi-core groups */ | 8718 | case SD_LV_MC: /* set up multi-core groups */ |
8521 | for_each_cpu(i, cpu_map) { | 8719 | cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); |
8522 | cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); | 8720 | if (cpu == cpumask_first(d->this_core_map)) |
8523 | if (i != cpumask_first(this_core_map)) | 8721 | init_sched_build_groups(d->this_core_map, cpu_map, |
8524 | continue; | 8722 | &cpu_to_core_group, |
8525 | 8723 | d->send_covered, d->tmpmask); | |
8526 | init_sched_build_groups(this_core_map, cpu_map, | 8724 | break; |
8527 | &cpu_to_core_group, | ||
8528 | send_covered, tmpmask); | ||
8529 | } | ||
8530 | #endif | 8725 | #endif |
8531 | 8726 | case SD_LV_CPU: /* set up physical groups */ | |
8532 | /* Set up physical groups */ | 8727 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
8533 | for (i = 0; i < nr_node_ids; i++) { | 8728 | if (!cpumask_empty(d->nodemask)) |
8534 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8729 | init_sched_build_groups(d->nodemask, cpu_map, |
8535 | if (cpumask_empty(nodemask)) | 8730 | &cpu_to_phys_group, |
8536 | continue; | 8731 | d->send_covered, d->tmpmask); |
8537 | 8732 | break; | |
8538 | init_sched_build_groups(nodemask, cpu_map, | ||
8539 | &cpu_to_phys_group, | ||
8540 | send_covered, tmpmask); | ||
8541 | } | ||
8542 | |||
8543 | #ifdef CONFIG_NUMA | 8733 | #ifdef CONFIG_NUMA |
8544 | /* Set up node groups */ | 8734 | case SD_LV_ALLNODES: |
8545 | if (sd_allnodes) { | 8735 | init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, |
8546 | init_sched_build_groups(cpu_map, cpu_map, | 8736 | d->send_covered, d->tmpmask); |
8547 | &cpu_to_allnodes_group, | 8737 | break; |
8548 | send_covered, tmpmask); | 8738 | #endif |
8739 | default: | ||
8740 | break; | ||
8549 | } | 8741 | } |
8742 | } | ||
8550 | 8743 | ||
8551 | for (i = 0; i < nr_node_ids; i++) { | 8744 | /* |
8552 | /* Set up node groups */ | 8745 | * Build sched domains for a given set of cpus and attach the sched domains |
8553 | struct sched_group *sg, *prev; | 8746 | * to the individual cpus |
8554 | int j; | 8747 | */ |
8555 | 8748 | static int __build_sched_domains(const struct cpumask *cpu_map, | |
8556 | cpumask_clear(covered); | 8749 | struct sched_domain_attr *attr) |
8557 | cpumask_and(nodemask, cpumask_of_node(i), cpu_map); | 8750 | { |
8558 | if (cpumask_empty(nodemask)) { | 8751 | enum s_alloc alloc_state = sa_none; |
8559 | sched_group_nodes[i] = NULL; | 8752 | struct s_data d; |
8560 | continue; | 8753 | struct sched_domain *sd; |
8561 | } | 8754 | int i; |
8755 | #ifdef CONFIG_NUMA | ||
8756 | d.sd_allnodes = 0; | ||
8757 | #endif | ||
8562 | 8758 | ||
8563 | sched_domain_node_span(i, domainspan); | 8759 | alloc_state = __visit_domain_allocation_hell(&d, cpu_map); |
8564 | cpumask_and(domainspan, domainspan, cpu_map); | 8760 | if (alloc_state != sa_rootdomain) |
8761 | goto error; | ||
8762 | alloc_state = sa_sched_groups; | ||
8565 | 8763 | ||
8566 | sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), | 8764 | /* |
8567 | GFP_KERNEL, i); | 8765 | * Set up domains for cpus specified by the cpu_map. |
8568 | if (!sg) { | 8766 | */ |
8569 | printk(KERN_WARNING "Can not alloc domain group for " | 8767 | for_each_cpu(i, cpu_map) { |
8570 | "node %d\n", i); | 8768 | cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), |
8571 | goto error; | 8769 | cpu_map); |
8572 | } | ||
8573 | sched_group_nodes[i] = sg; | ||
8574 | for_each_cpu(j, nodemask) { | ||
8575 | struct sched_domain *sd; | ||
8576 | 8770 | ||
8577 | sd = &per_cpu(node_domains, j).sd; | 8771 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
8578 | sd->groups = sg; | 8772 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
8579 | } | 8773 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
8580 | sg->__cpu_power = 0; | 8774 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
8581 | cpumask_copy(sched_group_cpus(sg), nodemask); | 8775 | } |
8582 | sg->next = sg; | ||
8583 | cpumask_or(covered, covered, nodemask); | ||
8584 | prev = sg; | ||
8585 | 8776 | ||
8586 | for (j = 0; j < nr_node_ids; j++) { | 8777 | for_each_cpu(i, cpu_map) { |
8587 | int n = (i + j) % nr_node_ids; | 8778 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
8779 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | ||
8780 | } | ||
8588 | 8781 | ||
8589 | cpumask_complement(notcovered, covered); | 8782 | /* Set up physical groups */ |
8590 | cpumask_and(tmpmask, notcovered, cpu_map); | 8783 | for (i = 0; i < nr_node_ids; i++) |
8591 | cpumask_and(tmpmask, tmpmask, domainspan); | 8784 | build_sched_groups(&d, SD_LV_CPU, cpu_map, i); |
8592 | if (cpumask_empty(tmpmask)) | ||
8593 | break; | ||
8594 | 8785 | ||
8595 | cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); | 8786 | #ifdef CONFIG_NUMA |
8596 | if (cpumask_empty(tmpmask)) | 8787 | /* Set up node groups */ |
8597 | continue; | 8788 | if (d.sd_allnodes) |
8789 | build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); | ||
8598 | 8790 | ||
8599 | sg = kmalloc_node(sizeof(struct sched_group) + | 8791 | for (i = 0; i < nr_node_ids; i++) |
8600 | cpumask_size(), | 8792 | if (build_numa_sched_groups(&d, cpu_map, i)) |
8601 | GFP_KERNEL, i); | 8793 | goto error; |
8602 | if (!sg) { | ||
8603 | printk(KERN_WARNING | ||
8604 | "Can not alloc domain group for node %d\n", j); | ||
8605 | goto error; | ||
8606 | } | ||
8607 | sg->__cpu_power = 0; | ||
8608 | cpumask_copy(sched_group_cpus(sg), tmpmask); | ||
8609 | sg->next = prev->next; | ||
8610 | cpumask_or(covered, covered, tmpmask); | ||
8611 | prev->next = sg; | ||
8612 | prev = sg; | ||
8613 | } | ||
8614 | } | ||
8615 | #endif | 8794 | #endif |
8616 | 8795 | ||
8617 | /* Calculate CPU power for physical packages and nodes */ | 8796 | /* Calculate CPU power for physical packages and nodes */ |
8618 | #ifdef CONFIG_SCHED_SMT | 8797 | #ifdef CONFIG_SCHED_SMT |
8619 | for_each_cpu(i, cpu_map) { | 8798 | for_each_cpu(i, cpu_map) { |
8620 | struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; | 8799 | sd = &per_cpu(cpu_domains, i).sd; |
8621 | |||
8622 | init_sched_groups_power(i, sd); | 8800 | init_sched_groups_power(i, sd); |
8623 | } | 8801 | } |
8624 | #endif | 8802 | #endif |
8625 | #ifdef CONFIG_SCHED_MC | 8803 | #ifdef CONFIG_SCHED_MC |
8626 | for_each_cpu(i, cpu_map) { | 8804 | for_each_cpu(i, cpu_map) { |
8627 | struct sched_domain *sd = &per_cpu(core_domains, i).sd; | 8805 | sd = &per_cpu(core_domains, i).sd; |
8628 | |||
8629 | init_sched_groups_power(i, sd); | 8806 | init_sched_groups_power(i, sd); |
8630 | } | 8807 | } |
8631 | #endif | 8808 | #endif |
8632 | 8809 | ||
8633 | for_each_cpu(i, cpu_map) { | 8810 | for_each_cpu(i, cpu_map) { |
8634 | struct sched_domain *sd = &per_cpu(phys_domains, i).sd; | 8811 | sd = &per_cpu(phys_domains, i).sd; |
8635 | |||
8636 | init_sched_groups_power(i, sd); | 8812 | init_sched_groups_power(i, sd); |
8637 | } | 8813 | } |
8638 | 8814 | ||
8639 | #ifdef CONFIG_NUMA | 8815 | #ifdef CONFIG_NUMA |
8640 | for (i = 0; i < nr_node_ids; i++) | 8816 | for (i = 0; i < nr_node_ids; i++) |
8641 | init_numa_sched_groups_power(sched_group_nodes[i]); | 8817 | init_numa_sched_groups_power(d.sched_group_nodes[i]); |
8642 | 8818 | ||
8643 | if (sd_allnodes) { | 8819 | if (d.sd_allnodes) { |
8644 | struct sched_group *sg; | 8820 | struct sched_group *sg; |
8645 | 8821 | ||
8646 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, | 8822 | cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, |
8647 | tmpmask); | 8823 | d.tmpmask); |
8648 | init_numa_sched_groups_power(sg); | 8824 | init_numa_sched_groups_power(sg); |
8649 | } | 8825 | } |
8650 | #endif | 8826 | #endif |
8651 | 8827 | ||
8652 | /* Attach the domains */ | 8828 | /* Attach the domains */ |
8653 | for_each_cpu(i, cpu_map) { | 8829 | for_each_cpu(i, cpu_map) { |
8654 | struct sched_domain *sd; | ||
8655 | #ifdef CONFIG_SCHED_SMT | 8830 | #ifdef CONFIG_SCHED_SMT |
8656 | sd = &per_cpu(cpu_domains, i).sd; | 8831 | sd = &per_cpu(cpu_domains, i).sd; |
8657 | #elif defined(CONFIG_SCHED_MC) | 8832 | #elif defined(CONFIG_SCHED_MC) |
@@ -8659,44 +8834,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
8659 | #else | 8834 | #else |
8660 | sd = &per_cpu(phys_domains, i).sd; | 8835 | sd = &per_cpu(phys_domains, i).sd; |
8661 | #endif | 8836 | #endif |
8662 | cpu_attach_domain(sd, rd, i); | 8837 | cpu_attach_domain(sd, d.rd, i); |
8663 | } | 8838 | } |
8664 | 8839 | ||
8665 | err = 0; | 8840 | d.sched_group_nodes = NULL; /* don't free this we still need it */ |
8666 | 8841 | __free_domain_allocs(&d, sa_tmpmask, cpu_map); | |
8667 | free_tmpmask: | 8842 | return 0; |
8668 | free_cpumask_var(tmpmask); | ||
8669 | free_send_covered: | ||
8670 | free_cpumask_var(send_covered); | ||
8671 | free_this_core_map: | ||
8672 | free_cpumask_var(this_core_map); | ||
8673 | free_this_sibling_map: | ||
8674 | free_cpumask_var(this_sibling_map); | ||
8675 | free_nodemask: | ||
8676 | free_cpumask_var(nodemask); | ||
8677 | free_notcovered: | ||
8678 | #ifdef CONFIG_NUMA | ||
8679 | free_cpumask_var(notcovered); | ||
8680 | free_covered: | ||
8681 | free_cpumask_var(covered); | ||
8682 | free_domainspan: | ||
8683 | free_cpumask_var(domainspan); | ||
8684 | out: | ||
8685 | #endif | ||
8686 | return err; | ||
8687 | |||
8688 | free_sched_groups: | ||
8689 | #ifdef CONFIG_NUMA | ||
8690 | kfree(sched_group_nodes); | ||
8691 | #endif | ||
8692 | goto free_tmpmask; | ||
8693 | 8843 | ||
8694 | #ifdef CONFIG_NUMA | ||
8695 | error: | 8844 | error: |
8696 | free_sched_groups(cpu_map, tmpmask); | 8845 | __free_domain_allocs(&d, alloc_state, cpu_map); |
8697 | free_rootdomain(rd); | 8846 | return -ENOMEM; |
8698 | goto free_tmpmask; | ||
8699 | #endif | ||
8700 | } | 8847 | } |
8701 | 8848 | ||
8702 | static int build_sched_domains(const struct cpumask *cpu_map) | 8849 | static int build_sched_domains(const struct cpumask *cpu_map) |
@@ -9015,6 +9162,7 @@ void __init sched_init_smp(void) | |||
9015 | cpumask_var_t non_isolated_cpus; | 9162 | cpumask_var_t non_isolated_cpus; |
9016 | 9163 | ||
9017 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 9164 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
9165 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | ||
9018 | 9166 | ||
9019 | #if defined(CONFIG_NUMA) | 9167 | #if defined(CONFIG_NUMA) |
9020 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), | 9168 | sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), |
@@ -9046,7 +9194,6 @@ void __init sched_init_smp(void) | |||
9046 | sched_init_granularity(); | 9194 | sched_init_granularity(); |
9047 | free_cpumask_var(non_isolated_cpus); | 9195 | free_cpumask_var(non_isolated_cpus); |
9048 | 9196 | ||
9049 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | ||
9050 | init_sched_rt_class(); | 9197 | init_sched_rt_class(); |
9051 | } | 9198 | } |
9052 | #else | 9199 | #else |
@@ -9304,11 +9451,11 @@ void __init sched_init(void) | |||
9304 | * system cpu resource, based on the weight assigned to root | 9451 | * system cpu resource, based on the weight assigned to root |
9305 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished | 9452 | * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished |
9306 | * by letting tasks of init_task_group sit in a separate cfs_rq | 9453 | * by letting tasks of init_task_group sit in a separate cfs_rq |
9307 | * (init_cfs_rq) and having one entity represent this group of | 9454 | * (init_tg_cfs_rq) and having one entity represent this group of |
9308 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). | 9455 | * tasks in rq->cfs (i.e init_task_group->se[] != NULL). |
9309 | */ | 9456 | */ |
9310 | init_tg_cfs_entry(&init_task_group, | 9457 | init_tg_cfs_entry(&init_task_group, |
9311 | &per_cpu(init_cfs_rq, i), | 9458 | &per_cpu(init_tg_cfs_rq, i), |
9312 | &per_cpu(init_sched_entity, i), i, 1, | 9459 | &per_cpu(init_sched_entity, i), i, 1, |
9313 | root_task_group.se[i]); | 9460 | root_task_group.se[i]); |
9314 | 9461 | ||
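The comment above describes the arrangement that init_tg_cfs_entry() establishes on each cpu. As a rough, illustrative fragment of that wiring (field names follow the usual group-scheduling layout of this era and are assumptions, not copied from this hunk):

	tg->cfs_rq[cpu] = cfs_rq;		/* the group's own per-cpu runqueue */
	tg->se[cpu]     = se;			/* entity standing in for that runqueue */
	se->my_q        = cfs_rq;		/* the entity owns the group runqueue ... */
	se->cfs_rq      = &cpu_rq(cpu)->cfs;	/* ... and is itself queued in rq->cfs */

so the weight of one sched_entity in rq->cfs is what bounds the whole group's share of that cpu.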
@@ -9334,6 +9481,7 @@ void __init sched_init(void) | |||
9334 | #ifdef CONFIG_SMP | 9481 | #ifdef CONFIG_SMP |
9335 | rq->sd = NULL; | 9482 | rq->sd = NULL; |
9336 | rq->rd = NULL; | 9483 | rq->rd = NULL; |
9484 | rq->post_schedule = 0; | ||
9337 | rq->active_balance = 0; | 9485 | rq->active_balance = 0; |
9338 | rq->next_balance = jiffies; | 9486 | rq->next_balance = jiffies; |
9339 | rq->push_cpu = 0; | 9487 | rq->push_cpu = 0; |
@@ -9392,19 +9540,26 @@ void __init sched_init(void) | |||
9392 | alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 9540 | alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
9393 | #endif /* SMP */ | 9541 | #endif /* SMP */ |
9394 | 9542 | ||
9395 | perf_counter_init(); | 9543 | perf_event_init(); |
9396 | 9544 | ||
9397 | scheduler_running = 1; | 9545 | scheduler_running = 1; |
9398 | } | 9546 | } |
9399 | 9547 | ||
9400 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP | 9548 | #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP |
9401 | void __might_sleep(char *file, int line) | 9549 | static inline int preempt_count_equals(int preempt_offset) |
9550 | { | ||
9551 | int nested = preempt_count() & ~PREEMPT_ACTIVE; | ||
9552 | |||
9553 | return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); | ||
9554 | } | ||
9555 | |||
9556 | void __might_sleep(char *file, int line, int preempt_offset) | ||
9402 | { | 9557 | { |
9403 | #ifdef in_atomic | 9558 | #ifdef in_atomic |
9404 | static unsigned long prev_jiffy; /* ratelimiting */ | 9559 | static unsigned long prev_jiffy; /* ratelimiting */ |
9405 | 9560 | ||
9406 | if ((!in_atomic() && !irqs_disabled()) || | 9561 | if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || |
9407 | system_state != SYSTEM_RUNNING || oops_in_progress) | 9562 | system_state != SYSTEM_RUNNING || oops_in_progress) |
9408 | return; | 9563 | return; |
9409 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) | 9564 | if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) |
9410 | return; | 9565 | return; |
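__might_sleep() now takes a preempt_offset, and preempt_count_equals() treats exactly that preemption depth as "not atomic", so callers that deliberately hold a known number of preempt-disable levels can still use the debug check. A hedged sketch of how the offset is meant to be supplied, modelled on the kernel.h/sched.h wrappers of this era rather than taken from this hunk:

/* the plain check: no preemption depth is expected at the call site */
#define might_sleep() \
	do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0)

/* a caller that legitimately holds one spinlock announces that depth */
#define cond_resched_lock(lock) ({					\
	__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);		\
	__cond_resched_lock(lock);					\
})

With the second form, dropping and retaking the lock inside __cond_resched_lock() no longer trips the "sleeping while atomic" warning, because the single preemption count held for the lock matches the declared offset.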
@@ -10157,7 +10312,7 @@ static int sched_rt_global_constraints(void) | |||
10157 | #endif /* CONFIG_RT_GROUP_SCHED */ | 10312 | #endif /* CONFIG_RT_GROUP_SCHED */ |
10158 | 10313 | ||
10159 | int sched_rt_handler(struct ctl_table *table, int write, | 10314 | int sched_rt_handler(struct ctl_table *table, int write, |
10160 | struct file *filp, void __user *buffer, size_t *lenp, | 10315 | void __user *buffer, size_t *lenp, |
10161 | loff_t *ppos) | 10316 | loff_t *ppos) |
10162 | { | 10317 | { |
10163 | int ret; | 10318 | int ret; |
@@ -10168,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write, | |||
10168 | old_period = sysctl_sched_rt_period; | 10323 | old_period = sysctl_sched_rt_period; |
10169 | old_runtime = sysctl_sched_rt_runtime; | 10324 | old_runtime = sysctl_sched_rt_runtime; |
10170 | 10325 | ||
10171 | ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); | 10326 | ret = proc_dointvec(table, write, buffer, lenp, ppos); |
10172 | 10327 | ||
10173 | if (!ret && write) { | 10328 | if (!ret && write) { |
10174 | ret = sched_rt_global_constraints(); | 10329 | ret = sched_rt_global_constraints(); |
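proc_dointvec() and the handlers built on it drop the struct file * argument here. A handler that post-processes successful writes keeps the same shape as sched_rt_handler above; the sketch below is a hypothetical example of the new signature, with my_validate_update() standing in for whatever follow-up check a real user would perform:

static int my_int_handler(struct ctl_table *table, int write,
			  void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	/* parse/emit the integer exactly as before, just without filp */
	ret = proc_dointvec(table, write, buffer, lenp, ppos);

	/* react only when a write was actually accepted */
	if (!ret && write)
		ret = my_validate_update();

	return ret;
}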
@@ -10222,8 +10377,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
10222 | } | 10377 | } |
10223 | 10378 | ||
10224 | static int | 10379 | static int |
10225 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 10380 | cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) |
10226 | struct task_struct *tsk) | ||
10227 | { | 10381 | { |
10228 | #ifdef CONFIG_RT_GROUP_SCHED | 10382 | #ifdef CONFIG_RT_GROUP_SCHED |
10229 | if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) | 10383 | if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) |
@@ -10233,15 +10387,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
10233 | if (tsk->sched_class != &fair_sched_class) | 10387 | if (tsk->sched_class != &fair_sched_class) |
10234 | return -EINVAL; | 10388 | return -EINVAL; |
10235 | #endif | 10389 | #endif |
10390 | return 0; | ||
10391 | } | ||
10236 | 10392 | ||
10393 | static int | ||
10394 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | ||
10395 | struct task_struct *tsk, bool threadgroup) | ||
10396 | { | ||
10397 | int retval = cpu_cgroup_can_attach_task(cgrp, tsk); | ||
10398 | if (retval) | ||
10399 | return retval; | ||
10400 | if (threadgroup) { | ||
10401 | struct task_struct *c; | ||
10402 | rcu_read_lock(); | ||
10403 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
10404 | retval = cpu_cgroup_can_attach_task(cgrp, c); | ||
10405 | if (retval) { | ||
10406 | rcu_read_unlock(); | ||
10407 | return retval; | ||
10408 | } | ||
10409 | } | ||
10410 | rcu_read_unlock(); | ||
10411 | } | ||
10237 | return 0; | 10412 | return 0; |
10238 | } | 10413 | } |
10239 | 10414 | ||
10240 | static void | 10415 | static void |
10241 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | 10416 | cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
10242 | struct cgroup *old_cont, struct task_struct *tsk) | 10417 | struct cgroup *old_cont, struct task_struct *tsk, |
10418 | bool threadgroup) | ||
10243 | { | 10419 | { |
10244 | sched_move_task(tsk); | 10420 | sched_move_task(tsk); |
10421 | if (threadgroup) { | ||
10422 | struct task_struct *c; | ||
10423 | rcu_read_lock(); | ||
10424 | list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { | ||
10425 | sched_move_task(c); | ||
10426 | } | ||
10427 | rcu_read_unlock(); | ||
10428 | } | ||
10245 | } | 10429 | } |
10246 | 10430 | ||
10247 | #ifdef CONFIG_FAIR_GROUP_SCHED | 10431 | #ifdef CONFIG_FAIR_GROUP_SCHED |
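Both cgroup callbacks gain a threadgroup flag: when it is set, the per-task admission check and the sched_move_task() are repeated for every thread on tsk->thread_group, walked under rcu_read_lock(). From the caller's side the convention looks roughly like the sketch below; this is an illustration of how the hooks are intended to be driven, not the cgroup core's actual attach path:

	/* moving a single task (e.g. a tid written to "tasks") */
	ret = ss->can_attach(ss, cgrp, tsk, false);
	if (!ret)
		ss->attach(ss, cgrp, old_cgrp, tsk, false);

	/* moving a whole thread group: pass the leader and let the
	 * subsystem check/move every thread reachable from it */
	ret = ss->can_attach(ss, cgrp, leader, true);
	if (!ret)
		ss->attach(ss, cgrp, old_cgrp, leader, true);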
@@ -10581,3 +10765,113 @@ struct cgroup_subsys cpuacct_subsys = { | |||
10581 | .subsys_id = cpuacct_subsys_id, | 10765 | .subsys_id = cpuacct_subsys_id, |
10582 | }; | 10766 | }; |
10583 | #endif /* CONFIG_CGROUP_CPUACCT */ | 10767 | #endif /* CONFIG_CGROUP_CPUACCT */ |
10768 | |||
10769 | #ifndef CONFIG_SMP | ||
10770 | |||
10771 | int rcu_expedited_torture_stats(char *page) | ||
10772 | { | ||
10773 | return 0; | ||
10774 | } | ||
10775 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
10776 | |||
10777 | void synchronize_sched_expedited(void) | ||
10778 | { | ||
10779 | } | ||
10780 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
10781 | |||
10782 | #else /* #ifndef CONFIG_SMP */ | ||
10783 | |||
10784 | static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); | ||
10785 | static DEFINE_MUTEX(rcu_sched_expedited_mutex); | ||
10786 | |||
10787 | #define RCU_EXPEDITED_STATE_POST -2 | ||
10788 | #define RCU_EXPEDITED_STATE_IDLE -1 | ||
10789 | |||
10790 | static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
10791 | |||
10792 | int rcu_expedited_torture_stats(char *page) | ||
10793 | { | ||
10794 | int cnt = 0; | ||
10795 | int cpu; | ||
10796 | |||
10797 | cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); | ||
10798 | for_each_online_cpu(cpu) { | ||
10799 | cnt += sprintf(&page[cnt], " %d:%d", | ||
10800 | cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); | ||
10801 | } | ||
10802 | cnt += sprintf(&page[cnt], "\n"); | ||
10803 | return cnt; | ||
10804 | } | ||
10805 | EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats); | ||
10806 | |||
10807 | static long synchronize_sched_expedited_count; | ||
10808 | |||
10809 | /* | ||
10810 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
10811 | * approach to force grace period to end quickly. This consumes | ||
10812 | * significant time on all CPUs, and is thus not recommended for | ||
10813 | * any sort of common-case code. | ||
10814 | * | ||
10815 | * Note that it is illegal to call this function while holding any | ||
10816 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
10817 | * observe this restriction will result in deadlock. | ||
10818 | */ | ||
10819 | void synchronize_sched_expedited(void) | ||
10820 | { | ||
10821 | int cpu; | ||
10822 | unsigned long flags; | ||
10823 | bool need_full_sync = 0; | ||
10824 | struct rq *rq; | ||
10825 | struct migration_req *req; | ||
10826 | long snap; | ||
10827 | int trycount = 0; | ||
10828 | |||
10829 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
10830 | snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; | ||
10831 | get_online_cpus(); | ||
10832 | while (!mutex_trylock(&rcu_sched_expedited_mutex)) { | ||
10833 | put_online_cpus(); | ||
10834 | if (trycount++ < 10) | ||
10835 | udelay(trycount * num_online_cpus()); | ||
10836 | else { | ||
10837 | synchronize_sched(); | ||
10838 | return; | ||
10839 | } | ||
10840 | if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { | ||
10841 | smp_mb(); /* ensure test happens before caller kfree */ | ||
10842 | return; | ||
10843 | } | ||
10844 | get_online_cpus(); | ||
10845 | } | ||
10846 | rcu_expedited_state = RCU_EXPEDITED_STATE_POST; | ||
10847 | for_each_online_cpu(cpu) { | ||
10848 | rq = cpu_rq(cpu); | ||
10849 | req = &per_cpu(rcu_migration_req, cpu); | ||
10850 | init_completion(&req->done); | ||
10851 | req->task = NULL; | ||
10852 | req->dest_cpu = RCU_MIGRATION_NEED_QS; | ||
10853 | spin_lock_irqsave(&rq->lock, flags); | ||
10854 | list_add(&req->list, &rq->migration_queue); | ||
10855 | spin_unlock_irqrestore(&rq->lock, flags); | ||
10856 | wake_up_process(rq->migration_thread); | ||
10857 | } | ||
10858 | for_each_online_cpu(cpu) { | ||
10859 | rcu_expedited_state = cpu; | ||
10860 | req = &per_cpu(rcu_migration_req, cpu); | ||
10861 | rq = cpu_rq(cpu); | ||
10862 | wait_for_completion(&req->done); | ||
10863 | spin_lock_irqsave(&rq->lock, flags); | ||
10864 | if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) | ||
10865 | need_full_sync = 1; | ||
10866 | req->dest_cpu = RCU_MIGRATION_IDLE; | ||
10867 | spin_unlock_irqrestore(&rq->lock, flags); | ||
10868 | } | ||
10869 | rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; | ||
10870 | mutex_unlock(&rcu_sched_expedited_mutex); | ||
10871 | put_online_cpus(); | ||
10872 | if (need_full_sync) | ||
10873 | synchronize_sched(); | ||
10874 | } | ||
10875 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
10876 | |||
10877 | #endif /* #else #ifndef CONFIG_SMP */ | ||
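synchronize_sched_expedited() has the same semantics as synchronize_sched() — wait for all pre-existing preempt-disabled / rcu_read_lock_sched() readers to finish — just faster and far more expensive, and per the comment above it must not be called while holding any lock that a CPU-hotplug notifier acquires. A hedged usage sketch; my_lock, my_entry and the list node are hypothetical:

	/* unlink the element so new rcu-sched readers cannot find it */
	spin_lock(&my_lock);
	list_del_rcu(&my_entry->node);
	spin_unlock(&my_lock);

	/* expedited wait: every reader that could still see the entry
	 * has left its preempt-disabled section when this returns */
	synchronize_sched_expedited();

	kfree(my_entry);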