Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 2054
1 file changed, 1230 insertions(+), 824 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 1b59e265273b..e7f2cfa6a257 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,7 +39,7 @@
39#include <linux/completion.h> 39#include <linux/completion.h>
40#include <linux/kernel_stat.h> 40#include <linux/kernel_stat.h>
41#include <linux/debug_locks.h> 41#include <linux/debug_locks.h>
42#include <linux/perf_counter.h> 42#include <linux/perf_event.h>
43#include <linux/security.h> 43#include <linux/security.h>
44#include <linux/notifier.h> 44#include <linux/notifier.h>
45#include <linux/profile.h> 45#include <linux/profile.h>
@@ -64,7 +64,6 @@
64#include <linux/tsacct_kern.h> 64#include <linux/tsacct_kern.h>
65#include <linux/kprobes.h> 65#include <linux/kprobes.h>
66#include <linux/delayacct.h> 66#include <linux/delayacct.h>
67#include <linux/reciprocal_div.h>
68#include <linux/unistd.h> 67#include <linux/unistd.h>
69#include <linux/pagemap.h> 68#include <linux/pagemap.h>
70#include <linux/hrtimer.h> 69#include <linux/hrtimer.h>
@@ -120,30 +119,6 @@
120 */ 119 */
121#define RUNTIME_INF ((u64)~0ULL) 120#define RUNTIME_INF ((u64)~0ULL)
122 121
123#ifdef CONFIG_SMP
124
125static void double_rq_lock(struct rq *rq1, struct rq *rq2);
126
127/*
128 * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
129 * Since cpu_power is a 'constant', we can use a reciprocal divide.
130 */
131static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
132{
133 return reciprocal_divide(load, sg->reciprocal_cpu_power);
134}
135
136/*
137 * Each time a sched group cpu_power is changed,
138 * we must compute its reciprocal value
139 */
140static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
141{
142 sg->__cpu_power += val;
143 sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
144}
145#endif
146
147static inline int rt_policy(int policy) 122static inline int rt_policy(int policy)
148{ 123{
149 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) 124 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
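
The helpers removed above existed only because dividing by a sched group's (nearly constant) cpu_power used to go through reciprocal_divide(): one real division whenever the power changes, then a multiply-and-shift on every use. The rest of this diff switches sched_group to a plain cpu_power field and divides directly. A minimal userspace sketch of the trick, an illustrative re-statement rather than the kernel's exact helpers:

#include <stdint.h>
#include <stdio.h>

/*
 * Divide by an (almost) constant divisor k by multiplying with a
 * precomputed 2^32-scaled reciprocal instead of issuing a division
 * on every call.
 */
static uint32_t reciprocal_value(uint32_t k)
{
	uint64_t val = (1ULL << 32) + (k - 1);

	return (uint32_t)(val / k);		/* one real division, done once */
}

static uint32_t reciprocal_divide(uint32_t a, uint32_t r)
{
	return (uint32_t)(((uint64_t)a * r) >> 32);	/* ~ a / k */
}

int main(void)
{
	uint32_t cpu_power = 1024;		/* roughly SCHED_LOAD_SCALE */
	uint32_t recip = reciprocal_value(cpu_power);
	uint32_t load = 3072;

	printf("%u / %u = %u\n", load, cpu_power, reciprocal_divide(load, recip));
	return 0;
}
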
@@ -309,8 +284,8 @@ void set_tg_uid(struct user_struct *user)
309 284
310/* 285/*
311 * Root task group. 286 * Root task group.
312 * Every UID task group (including init_task_group aka UID-0) will 287 * Every UID task group (including init_task_group aka UID-0) will
313 * be a child to this group. 288 * be a child to this group.
314 */ 289 */
315struct task_group root_task_group; 290struct task_group root_task_group;
316 291
@@ -318,12 +293,12 @@ struct task_group root_task_group;
318/* Default task group's sched entity on each cpu */ 293/* Default task group's sched entity on each cpu */
319static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
320/* Default task group's cfs_rq on each cpu */ 295/* Default task group's cfs_rq on each cpu */
321static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
322#endif /* CONFIG_FAIR_GROUP_SCHED */ 297#endif /* CONFIG_FAIR_GROUP_SCHED */
323 298
324#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
325static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
326static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
327#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
328#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
329#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -334,6 +309,8 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
334 */ 309 */
335static DEFINE_SPINLOCK(task_group_lock); 310static DEFINE_SPINLOCK(task_group_lock);
336 311
312#ifdef CONFIG_FAIR_GROUP_SCHED
313
337#ifdef CONFIG_SMP 314#ifdef CONFIG_SMP
338static int root_task_group_empty(void) 315static int root_task_group_empty(void)
339{ 316{
@@ -341,7 +318,6 @@ static int root_task_group_empty(void)
341} 318}
342#endif 319#endif
343 320
344#ifdef CONFIG_FAIR_GROUP_SCHED
345#ifdef CONFIG_USER_SCHED 321#ifdef CONFIG_USER_SCHED
346# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
347#else /* !CONFIG_USER_SCHED */ 323#else /* !CONFIG_USER_SCHED */
@@ -401,13 +377,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
401 377
402#else 378#else
403 379
404#ifdef CONFIG_SMP
405static int root_task_group_empty(void)
406{
407 return 1;
408}
409#endif
410
411static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 380static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
412static inline struct task_group *task_group(struct task_struct *p) 381static inline struct task_group *task_group(struct task_struct *p)
413{ 382{
@@ -537,14 +506,6 @@ struct root_domain {
537#ifdef CONFIG_SMP 506#ifdef CONFIG_SMP
538 struct cpupri cpupri; 507 struct cpupri cpupri;
539#endif 508#endif
540#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
541 /*
542 * Preferred wake up cpu nominated by sched_mc balance that will be
543 * used when most cpus are idle in the system indicating overall very
544 * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
545 */
546 unsigned int sched_mc_preferred_wakeup_cpu;
547#endif
548}; 509};
549 510
550/* 511/*
@@ -574,14 +535,12 @@ struct rq {
574 #define CPU_LOAD_IDX_MAX 5 535 #define CPU_LOAD_IDX_MAX 5
575 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 536 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
576#ifdef CONFIG_NO_HZ 537#ifdef CONFIG_NO_HZ
577 unsigned long last_tick_seen;
578 unsigned char in_nohz_recently; 538 unsigned char in_nohz_recently;
579#endif 539#endif
580 /* capture load from *all* tasks on this cpu: */ 540 /* capture load from *all* tasks on this cpu: */
581 struct load_weight load; 541 struct load_weight load;
582 unsigned long nr_load_updates; 542 unsigned long nr_load_updates;
583 u64 nr_switches; 543 u64 nr_switches;
584 u64 nr_migrations_in;
585 544
586 struct cfs_rq cfs; 545 struct cfs_rq cfs;
587 struct rt_rq rt; 546 struct rt_rq rt;
@@ -616,6 +575,7 @@ struct rq {
616 575
617 unsigned char idle_at_tick; 576 unsigned char idle_at_tick;
618 /* For active balancing */ 577 /* For active balancing */
578 int post_schedule;
619 int active_balance; 579 int active_balance;
620 int push_cpu; 580 int push_cpu;
621 /* cpu of this runqueue: */ 581 /* cpu of this runqueue: */
@@ -626,6 +586,11 @@ struct rq {
626 586
627 struct task_struct *migration_thread; 587 struct task_struct *migration_thread;
628 struct list_head migration_queue; 588 struct list_head migration_queue;
589
590 u64 rt_avg;
591 u64 age_stamp;
592 u64 idle_stamp;
593 u64 avg_idle;
629#endif 594#endif
630 595
631 /* calc_load related fields */ 596 /* calc_load related fields */
@@ -665,9 +630,10 @@ struct rq {
665 630
666static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 631static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
667 632
668static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) 633static inline
634void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
669{ 635{
670 rq->curr->sched_class->check_preempt_curr(rq, p, sync); 636 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
671} 637}
672 638
673static inline int cpu_of(struct rq *rq) 639static inline int cpu_of(struct rq *rq)
@@ -693,6 +659,7 @@ static inline int cpu_of(struct rq *rq)
693#define this_rq() (&__get_cpu_var(runqueues)) 659#define this_rq() (&__get_cpu_var(runqueues))
694#define task_rq(p) cpu_rq(task_cpu(p)) 660#define task_rq(p) cpu_rq(task_cpu(p))
695#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 661#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
662#define raw_rq() (&__raw_get_cpu_var(runqueues))
696 663
697inline void update_rq_clock(struct rq *rq) 664inline void update_rq_clock(struct rq *rq)
698{ 665{
@@ -710,20 +677,15 @@ inline void update_rq_clock(struct rq *rq)
710 677
711/** 678/**
712 * runqueue_is_locked 679 * runqueue_is_locked
680 * @cpu: the processor in question.
713 * 681 *
714 * Returns true if the current cpu runqueue is locked. 682 * Returns true if the current cpu runqueue is locked.
715 * This interface allows printk to be called with the runqueue lock 683 * This interface allows printk to be called with the runqueue lock
716 * held and know whether or not it is OK to wake up the klogd. 684 * held and know whether or not it is OK to wake up the klogd.
717 */ 685 */
718int runqueue_is_locked(void) 686int runqueue_is_locked(int cpu)
719{ 687{
720 int cpu = get_cpu(); 688 return spin_is_locked(&cpu_rq(cpu)->lock);
721 struct rq *rq = cpu_rq(cpu);
722 int ret;
723
724 ret = spin_is_locked(&rq->lock);
725 put_cpu();
726 return ret;
727} 689}
728 690
729/* 691/*
@@ -810,7 +772,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
810 if (!sched_feat_names[i]) 772 if (!sched_feat_names[i])
811 return -EINVAL; 773 return -EINVAL;
812 774
813 filp->f_pos += cnt; 775 *ppos += cnt;
814 776
815 return cnt; 777 return cnt;
816} 778}
@@ -820,7 +782,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp)
820 return single_open(filp, sched_feat_show, NULL); 782 return single_open(filp, sched_feat_show, NULL);
821} 783}
822 784
823static struct file_operations sched_feat_fops = { 785static const struct file_operations sched_feat_fops = {
824 .open = sched_feat_open, 786 .open = sched_feat_open,
825 .write = sched_feat_write, 787 .write = sched_feat_write,
826 .read = seq_read, 788 .read = seq_read,
@@ -861,6 +823,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000;
861unsigned int sysctl_sched_shares_thresh = 4; 823unsigned int sysctl_sched_shares_thresh = 4;
862 824
863/* 825/*
826 * period over which we average the RT time consumption, measured
827 * in ms.
828 *
829 * default: 1s
830 */
831const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
832
833/*
864 * period over which we measure -rt task cpu usage in us. 834 * period over which we measure -rt task cpu usage in us.
865 * default: 1s 835 * default: 1s
866 */ 836 */
@@ -1278,12 +1248,37 @@ void wake_up_idle_cpu(int cpu)
1278} 1248}
1279#endif /* CONFIG_NO_HZ */ 1249#endif /* CONFIG_NO_HZ */
1280 1250
1251static u64 sched_avg_period(void)
1252{
1253 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1254}
1255
1256static void sched_avg_update(struct rq *rq)
1257{
1258 s64 period = sched_avg_period();
1259
1260 while ((s64)(rq->clock - rq->age_stamp) > period) {
1261 rq->age_stamp += period;
1262 rq->rt_avg /= 2;
1263 }
1264}
1265
1266static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1267{
1268 rq->rt_avg += rt_delta;
1269 sched_avg_update(rq);
1270}
1271
1281#else /* !CONFIG_SMP */ 1272#else /* !CONFIG_SMP */
1282static void resched_task(struct task_struct *p) 1273static void resched_task(struct task_struct *p)
1283{ 1274{
1284 assert_spin_locked(&task_rq(p)->lock); 1275 assert_spin_locked(&task_rq(p)->lock);
1285 set_tsk_need_resched(p); 1276 set_tsk_need_resched(p);
1286} 1277}
1278
1279static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1280{
1281}
1287#endif /* CONFIG_SMP */ 1282#endif /* CONFIG_SMP */
1288 1283
1289#if BITS_PER_LONG == 32 1284#if BITS_PER_LONG == 32
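
sched_rt_avg_update()/sched_avg_update(), added above, track how much of each runqueue's recent time went to real-time tasks: rt_delta accumulates into rq->rt_avg, and every sched_avg_period() (half of sysctl_sched_time_avg, so 500 ms by default) the accumulated value is halved, a geometric decay of old RT activity. scale_rt_power() later in this diff turns the result into the fraction of capacity left for CFS. A userspace sketch of the decay, assuming nanosecond clock values like rq->clock:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC	1000000ULL

/* half of the 1 s averaging window, i.e. 500 ms */
static const uint64_t period = 1000 * NSEC_PER_MSEC / 2;
static uint64_t rt_avg, age_stamp;

static void sched_avg_update(uint64_t clock)
{
	/* halve rt_avg once for every full period that has elapsed */
	while ((int64_t)(clock - age_stamp) > (int64_t)period) {
		age_stamp += period;
		rt_avg /= 2;
	}
}

static void sched_rt_avg_update(uint64_t clock, uint64_t rt_delta)
{
	rt_avg += rt_delta;
	sched_avg_update(clock);
}

int main(void)
{
	/* 100 ms of RT work at t = 0, observed again 2 s later */
	sched_rt_avg_update(0, 100 * NSEC_PER_MSEC);
	sched_avg_update(2000 * NSEC_PER_MSEC);	/* three halvings -> 12.5 ms */
	printf("rt_avg after 2s: %llu ns\n", (unsigned long long)rt_avg);
	return 0;
}
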
@@ -1494,8 +1489,65 @@ static int tg_nop(struct task_group *tg, void *data)
1494#endif 1489#endif
1495 1490
1496#ifdef CONFIG_SMP 1491#ifdef CONFIG_SMP
1497static unsigned long source_load(int cpu, int type); 1492/* Used instead of source_load when we know the type == 0 */
1498static unsigned long target_load(int cpu, int type); 1493static unsigned long weighted_cpuload(const int cpu)
1494{
1495 return cpu_rq(cpu)->load.weight;
1496}
1497
1498/*
1499 * Return a low guess at the load of a migration-source cpu weighted
1500 * according to the scheduling class and "nice" value.
1501 *
1502 * We want to under-estimate the load of migration sources, to
1503 * balance conservatively.
1504 */
1505static unsigned long source_load(int cpu, int type)
1506{
1507 struct rq *rq = cpu_rq(cpu);
1508 unsigned long total = weighted_cpuload(cpu);
1509
1510 if (type == 0 || !sched_feat(LB_BIAS))
1511 return total;
1512
1513 return min(rq->cpu_load[type-1], total);
1514}
1515
1516/*
1517 * Return a high guess at the load of a migration-target cpu weighted
1518 * according to the scheduling class and "nice" value.
1519 */
1520static unsigned long target_load(int cpu, int type)
1521{
1522 struct rq *rq = cpu_rq(cpu);
1523 unsigned long total = weighted_cpuload(cpu);
1524
1525 if (type == 0 || !sched_feat(LB_BIAS))
1526 return total;
1527
1528 return max(rq->cpu_load[type-1], total);
1529}
1530
1531static struct sched_group *group_of(int cpu)
1532{
1533 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
1534
1535 if (!sd)
1536 return NULL;
1537
1538 return sd->groups;
1539}
1540
1541static unsigned long power_of(int cpu)
1542{
1543 struct sched_group *group = group_of(cpu);
1544
1545 if (!group)
1546 return SCHED_LOAD_SCALE;
1547
1548 return group->cpu_power;
1549}
1550
1499static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1551static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1500 1552
1501static unsigned long cpu_avg_load_per_task(int cpu) 1553static unsigned long cpu_avg_load_per_task(int cpu)
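
source_load() and target_load(), moved up here from later in the file, bias balancing decisions in opposite directions: a prospective source CPU is judged by the min() of its instantaneous weighted load and its decayed cpu_load[] history, a prospective target by the max(), so the balancer under-estimates what it could pull and over-estimates where it would push, and therefore migrates conservatively. A small illustrative sketch, with made-up load weights:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

int main(void)
{
	unsigned long now = 3072;	/* weighted_cpuload(): three nice-0 tasks */
	unsigned long hist = 2048;	/* cpu_load[type-1]: decayed history */

	printf("as a source: %lu\n", min_ul(hist, now));	/* under-estimate */
	printf("as a target: %lu\n", max_ul(hist, now));	/* over-estimate */
	return 0;
}
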
@@ -1513,28 +1565,31 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1513 1565
1514#ifdef CONFIG_FAIR_GROUP_SCHED 1566#ifdef CONFIG_FAIR_GROUP_SCHED
1515 1567
1568static __read_mostly unsigned long *update_shares_data;
1569
1516static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1570static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1517 1571
1518/* 1572/*
1519 * Calculate and set the cpu's group shares. 1573 * Calculate and set the cpu's group shares.
1520 */ 1574 */
1521static void 1575static void update_group_shares_cpu(struct task_group *tg, int cpu,
1522update_group_shares_cpu(struct task_group *tg, int cpu, 1576 unsigned long sd_shares,
1523 unsigned long sd_shares, unsigned long sd_rq_weight) 1577 unsigned long sd_rq_weight,
1578 unsigned long *usd_rq_weight)
1524{ 1579{
1525 unsigned long shares; 1580 unsigned long shares, rq_weight;
1526 unsigned long rq_weight; 1581 int boost = 0;
1527 1582
1528 if (!tg->se[cpu]) 1583 rq_weight = usd_rq_weight[cpu];
1529 return; 1584 if (!rq_weight) {
1530 1585 boost = 1;
1531 rq_weight = tg->cfs_rq[cpu]->rq_weight; 1586 rq_weight = NICE_0_LOAD;
1587 }
1532 1588
1533 /* 1589 /*
1534 * \Sum shares * rq_weight 1590 * \Sum_j shares_j * rq_weight_i
1535 * shares = ----------------------- 1591 * shares_i = -----------------------------
1536 * \Sum rq_weight 1592 * \Sum_j rq_weight_j
1537 *
1538 */ 1593 */
1539 shares = (sd_shares * rq_weight) / sd_rq_weight; 1594 shares = (sd_shares * rq_weight) / sd_rq_weight;
1540 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); 1595 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
@@ -1545,8 +1600,8 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1545 unsigned long flags; 1600 unsigned long flags;
1546 1601
1547 spin_lock_irqsave(&rq->lock, flags); 1602 spin_lock_irqsave(&rq->lock, flags);
1548 tg->cfs_rq[cpu]->shares = shares; 1603 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1549 1604 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1550 __set_se_shares(tg->se[cpu], shares); 1605 __set_se_shares(tg->se[cpu], shares);
1551 spin_unlock_irqrestore(&rq->lock, flags); 1606 spin_unlock_irqrestore(&rq->lock, flags);
1552 } 1607 }
@@ -1559,22 +1614,30 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
1559 */ 1614 */
1560static int tg_shares_up(struct task_group *tg, void *data) 1615static int tg_shares_up(struct task_group *tg, void *data)
1561{ 1616{
1562 unsigned long weight, rq_weight = 0; 1617 unsigned long weight, rq_weight = 0, shares = 0;
1563 unsigned long shares = 0; 1618 unsigned long *usd_rq_weight;
1564 struct sched_domain *sd = data; 1619 struct sched_domain *sd = data;
1620 unsigned long flags;
1565 int i; 1621 int i;
1566 1622
1623 if (!tg->se[0])
1624 return 0;
1625
1626 local_irq_save(flags);
1627 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1628
1567 for_each_cpu(i, sched_domain_span(sd)) { 1629 for_each_cpu(i, sched_domain_span(sd)) {
1630 weight = tg->cfs_rq[i]->load.weight;
1631 usd_rq_weight[i] = weight;
1632
1568 /* 1633 /*
1569 * If there are currently no tasks on the cpu pretend there 1634 * If there are currently no tasks on the cpu pretend there
1570 * is one of average load so that when a new task gets to 1635 * is one of average load so that when a new task gets to
1571 * run here it will not get delayed by group starvation. 1636 * run here it will not get delayed by group starvation.
1572 */ 1637 */
1573 weight = tg->cfs_rq[i]->load.weight;
1574 if (!weight) 1638 if (!weight)
1575 weight = NICE_0_LOAD; 1639 weight = NICE_0_LOAD;
1576 1640
1577 tg->cfs_rq[i]->rq_weight = weight;
1578 rq_weight += weight; 1641 rq_weight += weight;
1579 shares += tg->cfs_rq[i]->shares; 1642 shares += tg->cfs_rq[i]->shares;
1580 } 1643 }
@@ -1586,7 +1649,9 @@ static int tg_shares_up(struct task_group *tg, void *data)
1586 shares = tg->shares; 1649 shares = tg->shares;
1587 1650
1588 for_each_cpu(i, sched_domain_span(sd)) 1651 for_each_cpu(i, sched_domain_span(sd))
1589 update_group_shares_cpu(tg, i, shares, rq_weight); 1652 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1653
1654 local_irq_restore(flags);
1590 1655
1591 return 0; 1656 return 0;
1592} 1657}
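
tg_shares_up()/update_group_shares_cpu() redistribute a task group's weight across CPUs as shares_i = tg->shares * rq_weight_i / sum_j rq_weight_j, clamped to [MIN_SHARES, MAX_SHARES]; a CPU with no runnable tasks from the group is pretended to carry one nice-0 task (NICE_0_LOAD) so a task waking there is not starved, and the per-CPU weights are now staged in the percpu update_shares_data buffer with interrupts disabled. A userspace sketch of the arithmetic, using the scheduler's usual 1024-based constants:

#include <stdio.h>

#define NICE_0_LOAD	1024UL
#define MIN_SHARES	2UL
#define MAX_SHARES	(1UL << 18)

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : v > hi ? hi : v;
}

int main(void)
{
	unsigned long tg_shares = 1024;			/* the group's total weight */
	unsigned long rq_weight[2] = { 3072, 0 };	/* cpu1 idle in this group */
	unsigned long sum = 0, i;

	for (i = 0; i < 2; i++)
		sum += rq_weight[i] ? rq_weight[i] : NICE_0_LOAD;

	for (i = 0; i < 2; i++) {
		unsigned long w = rq_weight[i] ? rq_weight[i] : NICE_0_LOAD;
		unsigned long shares = clamp_ul(tg_shares * w / sum,
						MIN_SHARES, MAX_SHARES);

		printf("cpu%lu: shares = %lu\n", i, shares);	/* 768, then 256 */
	}
	return 0;
}
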
@@ -1616,8 +1681,14 @@ static int tg_load_down(struct task_group *tg, void *data)
1616 1681
1617static void update_shares(struct sched_domain *sd) 1682static void update_shares(struct sched_domain *sd)
1618{ 1683{
1619 u64 now = cpu_clock(raw_smp_processor_id()); 1684 s64 elapsed;
1620 s64 elapsed = now - sd->last_update; 1685 u64 now;
1686
1687 if (root_task_group_empty())
1688 return;
1689
1690 now = cpu_clock(raw_smp_processor_id());
1691 elapsed = now - sd->last_update;
1621 1692
1622 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1693 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1623 sd->last_update = now; 1694 sd->last_update = now;
@@ -1627,6 +1698,9 @@ static void update_shares(struct sched_domain *sd)
1627 1698
1628static void update_shares_locked(struct rq *rq, struct sched_domain *sd) 1699static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1629{ 1700{
1701 if (root_task_group_empty())
1702 return;
1703
1630 spin_unlock(&rq->lock); 1704 spin_unlock(&rq->lock);
1631 update_shares(sd); 1705 update_shares(sd);
1632 spin_lock(&rq->lock); 1706 spin_lock(&rq->lock);
@@ -1634,6 +1708,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1634 1708
1635static void update_h_load(long cpu) 1709static void update_h_load(long cpu)
1636{ 1710{
1711 if (root_task_group_empty())
1712 return;
1713
1637 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1714 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1638} 1715}
1639 1716
@@ -1651,6 +1728,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1651 1728
1652#ifdef CONFIG_PREEMPT 1729#ifdef CONFIG_PREEMPT
1653 1730
1731static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1732
1654/* 1733/*
1655 * fair double_lock_balance: Safely acquires both rq->locks in a fair 1734 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1656 * way at the expense of forcing extra atomic operations in all 1735 * way at the expense of forcing extra atomic operations in all
@@ -1914,14 +1993,40 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1914 p->sched_class->prio_changed(rq, p, oldprio, running); 1993 p->sched_class->prio_changed(rq, p, oldprio, running);
1915} 1994}
1916 1995
1917#ifdef CONFIG_SMP 1996/**
1918 1997 * kthread_bind - bind a just-created kthread to a cpu.
1919/* Used instead of source_load when we know the type == 0 */ 1998 * @p: thread created by kthread_create().
1920static unsigned long weighted_cpuload(const int cpu) 1999 * @cpu: cpu (might not be online, must be possible) for @k to run on.
2000 *
2001 * Description: This function is equivalent to set_cpus_allowed(),
2002 * except that @cpu doesn't need to be online, and the thread must be
2003 * stopped (i.e., just returned from kthread_create()).
2004 *
2005 * Function lives here instead of kthread.c because it messes with
2006 * scheduler internals which require locking.
2007 */
2008void kthread_bind(struct task_struct *p, unsigned int cpu)
1921{ 2009{
1922 return cpu_rq(cpu)->load.weight; 2010 struct rq *rq = cpu_rq(cpu);
2011 unsigned long flags;
2012
2013 /* Must have done schedule() in kthread() before we set_task_cpu */
2014 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2015 WARN_ON(1);
2016 return;
2017 }
2018
2019 spin_lock_irqsave(&rq->lock, flags);
2020 update_rq_clock(rq);
2021 set_task_cpu(p, cpu);
2022 p->cpus_allowed = cpumask_of_cpu(cpu);
2023 p->rt.nr_cpus_allowed = 1;
2024 p->flags |= PF_THREAD_BOUND;
2025 spin_unlock_irqrestore(&rq->lock, flags);
1923} 2026}
2027EXPORT_SYMBOL(kthread_bind);
1924 2028
2029#ifdef CONFIG_SMP
1925/* 2030/*
1926 * Is this task likely cache-hot: 2031 * Is this task likely cache-hot:
1927 */ 2032 */
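
kthread_bind() lands in sched.c because pinning a not-yet-running kthread has to touch scheduler internals (set_task_cpu(), cpus_allowed, PF_THREAD_BOUND) under the runqueue lock. A hypothetical module-style usage sketch for a kernel of this era; the worker function and names are illustrative, not from the tree:

#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *worker_task;

static int worker(void *unused)
{
	/* idle loop; kthread_stop() wakes us and makes the check true */
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static int __init bound_worker_init(void)
{
	/* created stopped: still where kthread_create() left it */
	worker_task = kthread_create(worker, NULL, "bound-worker");
	if (IS_ERR(worker_task))
		return PTR_ERR(worker_task);

	kthread_bind(worker_task, 0);	/* CPU 0 must be a possible CPU */
	wake_up_process(worker_task);
	return 0;
}

static void __exit bound_worker_exit(void)
{
	kthread_stop(worker_task);
}

module_init(bound_worker_init);
module_exit(bound_worker_exit);
MODULE_LICENSE("GPL");
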
@@ -1933,7 +2038,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1933 /* 2038 /*
1934 * Buddy candidates are cache hot: 2039 * Buddy candidates are cache hot:
1935 */ 2040 */
1936 if (sched_feat(CACHE_HOT_BUDDY) && 2041 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
1937 (&p->se == cfs_rq_of(&p->se)->next || 2042 (&p->se == cfs_rq_of(&p->se)->next ||
1938 &p->se == cfs_rq_of(&p->se)->last)) 2043 &p->se == cfs_rq_of(&p->se)->last))
1939 return 1; 2044 return 1;
@@ -1974,12 +2079,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1974#endif 2079#endif
1975 if (old_cpu != new_cpu) { 2080 if (old_cpu != new_cpu) {
1976 p->se.nr_migrations++; 2081 p->se.nr_migrations++;
1977 new_rq->nr_migrations_in++;
1978#ifdef CONFIG_SCHEDSTATS 2082#ifdef CONFIG_SCHEDSTATS
1979 if (task_hot(p, old_rq->clock, NULL)) 2083 if (task_hot(p, old_rq->clock, NULL))
1980 schedstat_inc(p, se.nr_forced2_migrations); 2084 schedstat_inc(p, se.nr_forced2_migrations);
1981#endif 2085#endif
1982 perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2086 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
1983 1, 1, NULL, 0); 2087 1, 1, NULL, 0);
1984 } 2088 }
1985 p->se.vruntime -= old_cfsrq->min_vruntime - 2089 p->se.vruntime -= old_cfsrq->min_vruntime -
@@ -2011,6 +2115,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2011 * it is sufficient to simply update the task's cpu field. 2115 * it is sufficient to simply update the task's cpu field.
2012 */ 2116 */
2013 if (!p->se.on_rq && !task_running(rq, p)) { 2117 if (!p->se.on_rq && !task_running(rq, p)) {
2118 update_rq_clock(rq);
2014 set_task_cpu(p, dest_cpu); 2119 set_task_cpu(p, dest_cpu);
2015 return 0; 2120 return 0;
2016 } 2121 }
@@ -2195,186 +2300,6 @@ void kick_process(struct task_struct *p)
2195 preempt_enable(); 2300 preempt_enable();
2196} 2301}
2197EXPORT_SYMBOL_GPL(kick_process); 2302EXPORT_SYMBOL_GPL(kick_process);
2198
2199/*
2200 * Return a low guess at the load of a migration-source cpu weighted
2201 * according to the scheduling class and "nice" value.
2202 *
2203 * We want to under-estimate the load of migration sources, to
2204 * balance conservatively.
2205 */
2206static unsigned long source_load(int cpu, int type)
2207{
2208 struct rq *rq = cpu_rq(cpu);
2209 unsigned long total = weighted_cpuload(cpu);
2210
2211 if (type == 0 || !sched_feat(LB_BIAS))
2212 return total;
2213
2214 return min(rq->cpu_load[type-1], total);
2215}
2216
2217/*
2218 * Return a high guess at the load of a migration-target cpu weighted
2219 * according to the scheduling class and "nice" value.
2220 */
2221static unsigned long target_load(int cpu, int type)
2222{
2223 struct rq *rq = cpu_rq(cpu);
2224 unsigned long total = weighted_cpuload(cpu);
2225
2226 if (type == 0 || !sched_feat(LB_BIAS))
2227 return total;
2228
2229 return max(rq->cpu_load[type-1], total);
2230}
2231
2232/*
2233 * find_idlest_group finds and returns the least busy CPU group within the
2234 * domain.
2235 */
2236static struct sched_group *
2237find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2238{
2239 struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
2240 unsigned long min_load = ULONG_MAX, this_load = 0;
2241 int load_idx = sd->forkexec_idx;
2242 int imbalance = 100 + (sd->imbalance_pct-100)/2;
2243
2244 do {
2245 unsigned long load, avg_load;
2246 int local_group;
2247 int i;
2248
2249 /* Skip over this group if it has no CPUs allowed */
2250 if (!cpumask_intersects(sched_group_cpus(group),
2251 &p->cpus_allowed))
2252 continue;
2253
2254 local_group = cpumask_test_cpu(this_cpu,
2255 sched_group_cpus(group));
2256
2257 /* Tally up the load of all CPUs in the group */
2258 avg_load = 0;
2259
2260 for_each_cpu(i, sched_group_cpus(group)) {
2261 /* Bias balancing toward cpus of our domain */
2262 if (local_group)
2263 load = source_load(i, load_idx);
2264 else
2265 load = target_load(i, load_idx);
2266
2267 avg_load += load;
2268 }
2269
2270 /* Adjust by relative CPU power of the group */
2271 avg_load = sg_div_cpu_power(group,
2272 avg_load * SCHED_LOAD_SCALE);
2273
2274 if (local_group) {
2275 this_load = avg_load;
2276 this = group;
2277 } else if (avg_load < min_load) {
2278 min_load = avg_load;
2279 idlest = group;
2280 }
2281 } while (group = group->next, group != sd->groups);
2282
2283 if (!idlest || 100*this_load < imbalance*min_load)
2284 return NULL;
2285 return idlest;
2286}
2287
2288/*
2289 * find_idlest_cpu - find the idlest cpu among the cpus in group.
2290 */
2291static int
2292find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
2293{
2294 unsigned long load, min_load = ULONG_MAX;
2295 int idlest = -1;
2296 int i;
2297
2298 /* Traverse only the allowed CPUs */
2299 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
2300 load = weighted_cpuload(i);
2301
2302 if (load < min_load || (load == min_load && i == this_cpu)) {
2303 min_load = load;
2304 idlest = i;
2305 }
2306 }
2307
2308 return idlest;
2309}
2310
2311/*
2312 * sched_balance_self: balance the current task (running on cpu) in domains
2313 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
2314 * SD_BALANCE_EXEC.
2315 *
2316 * Balance, ie. select the least loaded group.
2317 *
2318 * Returns the target CPU number, or the same CPU if no balancing is needed.
2319 *
2320 * preempt must be disabled.
2321 */
2322static int sched_balance_self(int cpu, int flag)
2323{
2324 struct task_struct *t = current;
2325 struct sched_domain *tmp, *sd = NULL;
2326
2327 for_each_domain(cpu, tmp) {
2328 /*
2329 * If power savings logic is enabled for a domain, stop there.
2330 */
2331 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2332 break;
2333 if (tmp->flags & flag)
2334 sd = tmp;
2335 }
2336
2337 if (sd)
2338 update_shares(sd);
2339
2340 while (sd) {
2341 struct sched_group *group;
2342 int new_cpu, weight;
2343
2344 if (!(sd->flags & flag)) {
2345 sd = sd->child;
2346 continue;
2347 }
2348
2349 group = find_idlest_group(sd, t, cpu);
2350 if (!group) {
2351 sd = sd->child;
2352 continue;
2353 }
2354
2355 new_cpu = find_idlest_cpu(group, t, cpu);
2356 if (new_cpu == -1 || new_cpu == cpu) {
2357 /* Now try balancing at a lower domain level of cpu */
2358 sd = sd->child;
2359 continue;
2360 }
2361
2362 /* Now try balancing at a lower domain level of new_cpu */
2363 cpu = new_cpu;
2364 weight = cpumask_weight(sched_domain_span(sd));
2365 sd = NULL;
2366 for_each_domain(cpu, tmp) {
2367 if (weight <= cpumask_weight(sched_domain_span(tmp)))
2368 break;
2369 if (tmp->flags & flag)
2370 sd = tmp;
2371 }
2372 /* while loop will break here if sd == NULL */
2373 }
2374
2375 return cpu;
2376}
2377
2378#endif /* CONFIG_SMP */ 2303#endif /* CONFIG_SMP */
2379 2304
2380/** 2305/**
@@ -2412,37 +2337,22 @@ void task_oncpu_function_call(struct task_struct *p,
2412 * 2337 *
2413 * returns failure only if the task is already active. 2338 * returns failure only if the task is already active.
2414 */ 2339 */
2415static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) 2340static int try_to_wake_up(struct task_struct *p, unsigned int state,
2341 int wake_flags)
2416{ 2342{
2417 int cpu, orig_cpu, this_cpu, success = 0; 2343 int cpu, orig_cpu, this_cpu, success = 0;
2418 unsigned long flags; 2344 unsigned long flags;
2419 long old_state; 2345 struct rq *rq, *orig_rq;
2420 struct rq *rq;
2421 2346
2422 if (!sched_feat(SYNC_WAKEUPS)) 2347 if (!sched_feat(SYNC_WAKEUPS))
2423 sync = 0; 2348 wake_flags &= ~WF_SYNC;
2424
2425#ifdef CONFIG_SMP
2426 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2427 struct sched_domain *sd;
2428
2429 this_cpu = raw_smp_processor_id();
2430 cpu = task_cpu(p);
2431 2349
2432 for_each_domain(this_cpu, sd) { 2350 this_cpu = get_cpu();
2433 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2434 update_shares(sd);
2435 break;
2436 }
2437 }
2438 }
2439#endif
2440 2351
2441 smp_wmb(); 2352 smp_wmb();
2442 rq = task_rq_lock(p, &flags); 2353 rq = orig_rq = task_rq_lock(p, &flags);
2443 update_rq_clock(rq); 2354 update_rq_clock(rq);
2444 old_state = p->state; 2355 if (!(p->state & state))
2445 if (!(old_state & state))
2446 goto out; 2356 goto out;
2447 2357
2448 if (p->se.on_rq) 2358 if (p->se.on_rq)
@@ -2450,27 +2360,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2450 2360
2451 cpu = task_cpu(p); 2361 cpu = task_cpu(p);
2452 orig_cpu = cpu; 2362 orig_cpu = cpu;
2453 this_cpu = smp_processor_id();
2454 2363
2455#ifdef CONFIG_SMP 2364#ifdef CONFIG_SMP
2456 if (unlikely(task_running(rq, p))) 2365 if (unlikely(task_running(rq, p)))
2457 goto out_activate; 2366 goto out_activate;
2458 2367
2459 cpu = p->sched_class->select_task_rq(p, sync); 2368 /*
2369 * In order to handle concurrent wakeups and release the rq->lock
2370 * we put the task in TASK_WAKING state.
2371 *
2372 * First fix up the nr_uninterruptible count:
2373 */
2374 if (task_contributes_to_load(p))
2375 rq->nr_uninterruptible--;
2376 p->state = TASK_WAKING;
2377 task_rq_unlock(rq, &flags);
2378
2379 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2460 if (cpu != orig_cpu) { 2380 if (cpu != orig_cpu) {
2381 local_irq_save(flags);
2382 rq = cpu_rq(cpu);
2383 update_rq_clock(rq);
2461 set_task_cpu(p, cpu); 2384 set_task_cpu(p, cpu);
2462 task_rq_unlock(rq, &flags); 2385 local_irq_restore(flags);
2463 /* might preempt at this point */
2464 rq = task_rq_lock(p, &flags);
2465 old_state = p->state;
2466 if (!(old_state & state))
2467 goto out;
2468 if (p->se.on_rq)
2469 goto out_running;
2470
2471 this_cpu = smp_processor_id();
2472 cpu = task_cpu(p);
2473 } 2386 }
2387 rq = task_rq_lock(p, &flags);
2388
2389 WARN_ON(p->state != TASK_WAKING);
2390 cpu = task_cpu(p);
2474 2391
2475#ifdef CONFIG_SCHEDSTATS 2392#ifdef CONFIG_SCHEDSTATS
2476 schedstat_inc(rq, ttwu_count); 2393 schedstat_inc(rq, ttwu_count);
@@ -2490,7 +2407,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2490out_activate: 2407out_activate:
2491#endif /* CONFIG_SMP */ 2408#endif /* CONFIG_SMP */
2492 schedstat_inc(p, se.nr_wakeups); 2409 schedstat_inc(p, se.nr_wakeups);
2493 if (sync) 2410 if (wake_flags & WF_SYNC)
2494 schedstat_inc(p, se.nr_wakeups_sync); 2411 schedstat_inc(p, se.nr_wakeups_sync);
2495 if (orig_cpu != cpu) 2412 if (orig_cpu != cpu)
2496 schedstat_inc(p, se.nr_wakeups_migrate); 2413 schedstat_inc(p, se.nr_wakeups_migrate);
@@ -2519,15 +2436,27 @@ out_activate:
2519 2436
2520out_running: 2437out_running:
2521 trace_sched_wakeup(rq, p, success); 2438 trace_sched_wakeup(rq, p, success);
2522 check_preempt_curr(rq, p, sync); 2439 check_preempt_curr(rq, p, wake_flags);
2523 2440
2524 p->state = TASK_RUNNING; 2441 p->state = TASK_RUNNING;
2525#ifdef CONFIG_SMP 2442#ifdef CONFIG_SMP
2526 if (p->sched_class->task_wake_up) 2443 if (p->sched_class->task_wake_up)
2527 p->sched_class->task_wake_up(rq, p); 2444 p->sched_class->task_wake_up(rq, p);
2445
2446 if (unlikely(rq->idle_stamp)) {
2447 u64 delta = rq->clock - rq->idle_stamp;
2448 u64 max = 2*sysctl_sched_migration_cost;
2449
2450 if (delta > max)
2451 rq->avg_idle = max;
2452 else
2453 update_avg(&rq->avg_idle, delta);
2454 rq->idle_stamp = 0;
2455 }
2528#endif 2456#endif
2529out: 2457out:
2530 task_rq_unlock(rq, &flags); 2458 task_rq_unlock(rq, &flags);
2459 put_cpu();
2531 2460
2532 return success; 2461 return success;
2533} 2462}
@@ -2570,6 +2499,7 @@ static void __sched_fork(struct task_struct *p)
2570 p->se.avg_overlap = 0; 2499 p->se.avg_overlap = 0;
2571 p->se.start_runtime = 0; 2500 p->se.start_runtime = 0;
2572 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2501 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2502 p->se.avg_running = 0;
2573 2503
2574#ifdef CONFIG_SCHEDSTATS 2504#ifdef CONFIG_SCHEDSTATS
2575 p->se.wait_start = 0; 2505 p->se.wait_start = 0;
@@ -2628,21 +2558,48 @@ static void __sched_fork(struct task_struct *p)
2628void sched_fork(struct task_struct *p, int clone_flags) 2558void sched_fork(struct task_struct *p, int clone_flags)
2629{ 2559{
2630 int cpu = get_cpu(); 2560 int cpu = get_cpu();
2561 unsigned long flags;
2631 2562
2632 __sched_fork(p); 2563 __sched_fork(p);
2633 2564
2634#ifdef CONFIG_SMP 2565 /*
2635 cpu = sched_balance_self(cpu, SD_BALANCE_FORK); 2566 * Revert to default priority/policy on fork if requested.
2636#endif 2567 */
2637 set_task_cpu(p, cpu); 2568 if (unlikely(p->sched_reset_on_fork)) {
2569 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
2570 p->policy = SCHED_NORMAL;
2571 p->normal_prio = p->static_prio;
2572 }
2573
2574 if (PRIO_TO_NICE(p->static_prio) < 0) {
2575 p->static_prio = NICE_TO_PRIO(0);
2576 p->normal_prio = p->static_prio;
2577 set_load_weight(p);
2578 }
2579
2580 /*
2581 * We don't need the reset flag anymore after the fork. It has
2582 * fulfilled its duty:
2583 */
2584 p->sched_reset_on_fork = 0;
2585 }
2638 2586
2639 /* 2587 /*
2640 * Make sure we do not leak PI boosting priority to the child: 2588 * Make sure we do not leak PI boosting priority to the child.
2641 */ 2589 */
2642 p->prio = current->normal_prio; 2590 p->prio = current->normal_prio;
2591
2643 if (!rt_prio(p->prio)) 2592 if (!rt_prio(p->prio))
2644 p->sched_class = &fair_sched_class; 2593 p->sched_class = &fair_sched_class;
2645 2594
2595#ifdef CONFIG_SMP
2596 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0);
2597#endif
2598 local_irq_save(flags);
2599 update_rq_clock(cpu_rq(cpu));
2600 set_task_cpu(p, cpu);
2601 local_irq_restore(flags);
2602
2646#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2603#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2647 if (likely(sched_info_on())) 2604 if (likely(sched_info_on()))
2648 memset(&p->sched_info, 0, sizeof(p->sched_info)); 2605 memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -2675,8 +2632,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2675 BUG_ON(p->state != TASK_RUNNING); 2632 BUG_ON(p->state != TASK_RUNNING);
2676 update_rq_clock(rq); 2633 update_rq_clock(rq);
2677 2634
2678 p->prio = effective_prio(p);
2679
2680 if (!p->sched_class->task_new || !current->se.on_rq) { 2635 if (!p->sched_class->task_new || !current->se.on_rq) {
2681 activate_task(rq, p, 0); 2636 activate_task(rq, p, 0);
2682 } else { 2637 } else {
@@ -2688,7 +2643,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2688 inc_nr_running(rq); 2643 inc_nr_running(rq);
2689 } 2644 }
2690 trace_sched_wakeup_new(rq, p, 1); 2645 trace_sched_wakeup_new(rq, p, 1);
2691 check_preempt_curr(rq, p, 0); 2646 check_preempt_curr(rq, p, WF_FORK);
2692#ifdef CONFIG_SMP 2647#ifdef CONFIG_SMP
2693 if (p->sched_class->task_wake_up) 2648 if (p->sched_class->task_wake_up)
2694 p->sched_class->task_wake_up(rq, p); 2649 p->sched_class->task_wake_up(rq, p);
@@ -2796,12 +2751,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2796{ 2751{
2797 struct mm_struct *mm = rq->prev_mm; 2752 struct mm_struct *mm = rq->prev_mm;
2798 long prev_state; 2753 long prev_state;
2799#ifdef CONFIG_SMP
2800 int post_schedule = 0;
2801
2802 if (current->sched_class->needs_post_schedule)
2803 post_schedule = current->sched_class->needs_post_schedule(rq);
2804#endif
2805 2754
2806 rq->prev_mm = NULL; 2755 rq->prev_mm = NULL;
2807 2756
@@ -2818,12 +2767,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2818 */ 2767 */
2819 prev_state = prev->state; 2768 prev_state = prev->state;
2820 finish_arch_switch(prev); 2769 finish_arch_switch(prev);
2821 perf_counter_task_sched_in(current, cpu_of(rq)); 2770 perf_event_task_sched_in(current, cpu_of(rq));
2822 finish_lock_switch(rq, prev); 2771 finish_lock_switch(rq, prev);
2823#ifdef CONFIG_SMP
2824 if (post_schedule)
2825 current->sched_class->post_schedule(rq);
2826#endif
2827 2772
2828 fire_sched_in_preempt_notifiers(current); 2773 fire_sched_in_preempt_notifiers(current);
2829 if (mm) 2774 if (mm)
@@ -2838,6 +2783,42 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2838 } 2783 }
2839} 2784}
2840 2785
2786#ifdef CONFIG_SMP
2787
2788/* assumes rq->lock is held */
2789static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2790{
2791 if (prev->sched_class->pre_schedule)
2792 prev->sched_class->pre_schedule(rq, prev);
2793}
2794
2795/* rq->lock is NOT held, but preemption is disabled */
2796static inline void post_schedule(struct rq *rq)
2797{
2798 if (rq->post_schedule) {
2799 unsigned long flags;
2800
2801 spin_lock_irqsave(&rq->lock, flags);
2802 if (rq->curr->sched_class->post_schedule)
2803 rq->curr->sched_class->post_schedule(rq);
2804 spin_unlock_irqrestore(&rq->lock, flags);
2805
2806 rq->post_schedule = 0;
2807 }
2808}
2809
2810#else
2811
2812static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2813{
2814}
2815
2816static inline void post_schedule(struct rq *rq)
2817{
2818}
2819
2820#endif
2821
2841/** 2822/**
2842 * schedule_tail - first thing a freshly forked thread must call. 2823 * schedule_tail - first thing a freshly forked thread must call.
2843 * @prev: the thread we just switched away from. 2824 * @prev: the thread we just switched away from.
@@ -2848,6 +2829,13 @@ asmlinkage void schedule_tail(struct task_struct *prev)
2848 struct rq *rq = this_rq(); 2829 struct rq *rq = this_rq();
2849 2830
2850 finish_task_switch(rq, prev); 2831 finish_task_switch(rq, prev);
2832
2833 /*
2834 * FIXME: do we need to worry about rq being invalidated by the
2835 * task_switch?
2836 */
2837 post_schedule(rq);
2838
2851#ifdef __ARCH_WANT_UNLOCKED_CTXSW 2839#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2852 /* In this case, finish_task_switch does not reenable preemption */ 2840 /* In this case, finish_task_switch does not reenable preemption */
2853 preempt_enable(); 2841 preempt_enable();
@@ -2877,14 +2865,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2877 */ 2865 */
2878 arch_start_context_switch(prev); 2866 arch_start_context_switch(prev);
2879 2867
2880 if (unlikely(!mm)) { 2868 if (likely(!mm)) {
2881 next->active_mm = oldmm; 2869 next->active_mm = oldmm;
2882 atomic_inc(&oldmm->mm_count); 2870 atomic_inc(&oldmm->mm_count);
2883 enter_lazy_tlb(oldmm, next); 2871 enter_lazy_tlb(oldmm, next);
2884 } else 2872 } else
2885 switch_mm(oldmm, mm, next); 2873 switch_mm(oldmm, mm, next);
2886 2874
2887 if (unlikely(!prev->mm)) { 2875 if (likely(!prev->mm)) {
2888 prev->active_mm = NULL; 2876 prev->active_mm = NULL;
2889 rq->prev_mm = oldmm; 2877 rq->prev_mm = oldmm;
2890 } 2878 }
@@ -2965,6 +2953,19 @@ unsigned long nr_iowait(void)
2965 return sum; 2953 return sum;
2966} 2954}
2967 2955
2956unsigned long nr_iowait_cpu(void)
2957{
2958 struct rq *this = this_rq();
2959 return atomic_read(&this->nr_iowait);
2960}
2961
2962unsigned long this_cpu_load(void)
2963{
2964 struct rq *this = this_rq();
2965 return this->cpu_load[0];
2966}
2967
2968
2968/* Variables and functions for calc_load */ 2969/* Variables and functions for calc_load */
2969static atomic_long_t calc_load_tasks; 2970static atomic_long_t calc_load_tasks;
2970static unsigned long calc_load_update; 2971static unsigned long calc_load_update;
@@ -3034,15 +3035,6 @@ static void calc_load_account_active(struct rq *this_rq)
3034} 3035}
3035 3036
3036/* 3037/*
3037 * Externally visible per-cpu scheduler statistics:
3038 * cpu_nr_migrations(cpu) - number of migrations into that cpu
3039 */
3040u64 cpu_nr_migrations(int cpu)
3041{
3042 return cpu_rq(cpu)->nr_migrations_in;
3043}
3044
3045/*
3046 * Update rq->cpu_load[] statistics. This function is usually called every 3038 * Update rq->cpu_load[] statistics. This function is usually called every
3047 * scheduler tick (TICK_NSEC). 3039 * scheduler tick (TICK_NSEC).
3048 */ 3040 */
@@ -3164,7 +3156,7 @@ out:
3164void sched_exec(void) 3156void sched_exec(void)
3165{ 3157{
3166 int new_cpu, this_cpu = get_cpu(); 3158 int new_cpu, this_cpu = get_cpu();
3167 new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); 3159 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3168 put_cpu(); 3160 put_cpu();
3169 if (new_cpu != this_cpu) 3161 if (new_cpu != this_cpu)
3170 sched_migrate_task(current, new_cpu); 3162 sched_migrate_task(current, new_cpu);
@@ -3379,9 +3371,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3379{ 3371{
3380 const struct sched_class *class; 3372 const struct sched_class *class;
3381 3373
3382 for (class = sched_class_highest; class; class = class->next) 3374 for_each_class(class) {
3383 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) 3375 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3384 return 1; 3376 return 1;
3377 }
3385 3378
3386 return 0; 3379 return 0;
3387} 3380}
@@ -3544,7 +3537,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
3544 * capacity but still has some space to pick up some load 3537 * capacity but still has some space to pick up some load
3545 * from other group and save more power 3538 * from other group and save more power
3546 */ 3539 */
3547 if (sgs->sum_nr_running > sgs->group_capacity - 1) 3540 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3548 return; 3541 return;
3549 3542
3550 if (sgs->sum_nr_running > sds->leader_nr_running || 3543 if (sgs->sum_nr_running > sds->leader_nr_running ||
@@ -3583,11 +3576,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3583 *imbalance = sds->min_load_per_task; 3576 *imbalance = sds->min_load_per_task;
3584 sds->busiest = sds->group_min; 3577 sds->busiest = sds->group_min;
3585 3578
3586 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3587 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3588 group_first_cpu(sds->group_leader);
3589 }
3590
3591 return 1; 3579 return 1;
3592 3580
3593} 3581}
@@ -3612,8 +3600,105 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3612#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 3600#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3613 3601
3614 3602
3603unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3604{
3605 return SCHED_LOAD_SCALE;
3606}
3607
3608unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3609{
3610 return default_scale_freq_power(sd, cpu);
3611}
3612
3613unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3614{
3615 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3616 unsigned long smt_gain = sd->smt_gain;
3617
3618 smt_gain /= weight;
3619
3620 return smt_gain;
3621}
3622
3623unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3624{
3625 return default_scale_smt_power(sd, cpu);
3626}
3627
3628unsigned long scale_rt_power(int cpu)
3629{
3630 struct rq *rq = cpu_rq(cpu);
3631 u64 total, available;
3632
3633 sched_avg_update(rq);
3634
3635 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3636 available = total - rq->rt_avg;
3637
3638 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3639 total = SCHED_LOAD_SCALE;
3640
3641 total >>= SCHED_LOAD_SHIFT;
3642
3643 return div_u64(available, total);
3644}
3645
3646static void update_cpu_power(struct sched_domain *sd, int cpu)
3647{
3648 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3649 unsigned long power = SCHED_LOAD_SCALE;
3650 struct sched_group *sdg = sd->groups;
3651
3652 if (sched_feat(ARCH_POWER))
3653 power *= arch_scale_freq_power(sd, cpu);
3654 else
3655 power *= default_scale_freq_power(sd, cpu);
3656
3657 power >>= SCHED_LOAD_SHIFT;
3658
3659 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3660 if (sched_feat(ARCH_POWER))
3661 power *= arch_scale_smt_power(sd, cpu);
3662 else
3663 power *= default_scale_smt_power(sd, cpu);
3664
3665 power >>= SCHED_LOAD_SHIFT;
3666 }
3667
3668 power *= scale_rt_power(cpu);
3669 power >>= SCHED_LOAD_SHIFT;
3670
3671 if (!power)
3672 power = 1;
3673
3674 sdg->cpu_power = power;
3675}
3676
3677static void update_group_power(struct sched_domain *sd, int cpu)
3678{
3679 struct sched_domain *child = sd->child;
3680 struct sched_group *group, *sdg = sd->groups;
3681 unsigned long power;
3682
3683 if (!child) {
3684 update_cpu_power(sd, cpu);
3685 return;
3686 }
3687
3688 power = 0;
3689
3690 group = child->groups;
3691 do {
3692 power += group->cpu_power;
3693 group = group->next;
3694 } while (group != child->groups);
3695
3696 sdg->cpu_power = power;
3697}
3698
3615/** 3699/**
3616 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3700 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3701 * @sd: The sched_domain whose statistics are to be updated.
3617 * @group: sched_group whose statistics are to be updated. 3702 * @group: sched_group whose statistics are to be updated.
3618 * @this_cpu: Cpu for which load balance is currently performed. 3703 * @this_cpu: Cpu for which load balance is currently performed.
3619 * @idle: Idle status of this_cpu 3704 * @idle: Idle status of this_cpu
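
update_cpu_power(), added above, computes a CPU's capacity by starting from SCHED_LOAD_SCALE (1024) and scaling it down step by step: an architecture frequency factor, an SMT factor (smt_gain spread over the siblings) when CPUs share core resources, and scale_rt_power(), the fraction of the rt_avg averaging window not consumed by real-time tasks; each step multiplies and then shifts right by SCHED_LOAD_SHIFT. update_group_power() simply sums the children's cpu_power. A worked arithmetic sketch with assumed inputs (default frequency scaling, the roughly 15% SMT gain default, 25% of the window spent in RT work):

#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;
	unsigned long freq_scale = SCHED_LOAD_SCALE;	/* arch default */
	unsigned long smt_scale = 1178 / 2;		/* smt_gain / siblings */
	unsigned long rt_scale = 768;			/* 75% left for CFS */

	power = (power * freq_scale) >> SCHED_LOAD_SHIFT;	/* 1024 */
	power = (power * smt_scale)  >> SCHED_LOAD_SHIFT;	/* 589  */
	power = (power * rt_scale)   >> SCHED_LOAD_SHIFT;	/* 441  */
	if (!power)
		power = 1;

	printf("cpu_power = %lu of %lu\n", power, SCHED_LOAD_SCALE);
	return 0;
}
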
@@ -3624,7 +3709,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3624 * @balance: Should we balance. 3709 * @balance: Should we balance.
3625 * @sgs: variable to hold the statistics for this group. 3710 * @sgs: variable to hold the statistics for this group.
3626 */ 3711 */
3627static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, 3712static inline void update_sg_lb_stats(struct sched_domain *sd,
3713 struct sched_group *group, int this_cpu,
3628 enum cpu_idle_type idle, int load_idx, int *sd_idle, 3714 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3629 int local_group, const struct cpumask *cpus, 3715 int local_group, const struct cpumask *cpus,
3630 int *balance, struct sg_lb_stats *sgs) 3716 int *balance, struct sg_lb_stats *sgs)
@@ -3635,8 +3721,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3635 unsigned long sum_avg_load_per_task; 3721 unsigned long sum_avg_load_per_task;
3636 unsigned long avg_load_per_task; 3722 unsigned long avg_load_per_task;
3637 3723
3638 if (local_group) 3724 if (local_group) {
3639 balance_cpu = group_first_cpu(group); 3725 balance_cpu = group_first_cpu(group);
3726 if (balance_cpu == this_cpu)
3727 update_group_power(sd, this_cpu);
3728 }
3640 3729
3641 /* Tally up the load of all CPUs in the group */ 3730 /* Tally up the load of all CPUs in the group */
3642 sum_avg_load_per_task = avg_load_per_task = 0; 3731 sum_avg_load_per_task = avg_load_per_task = 0;
@@ -3685,8 +3774,7 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3685 } 3774 }
3686 3775
3687 /* Adjust by relative CPU power of the group */ 3776 /* Adjust by relative CPU power of the group */
3688 sgs->avg_load = sg_div_cpu_power(group, 3777 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3689 sgs->group_load * SCHED_LOAD_SCALE);
3690 3778
3691 3779
3692 /* 3780 /*
@@ -3698,14 +3786,14 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3698 * normalized nr_running number somewhere that negates 3786 * normalized nr_running number somewhere that negates
3699 * the hierarchy? 3787 * the hierarchy?
3700 */ 3788 */
3701 avg_load_per_task = sg_div_cpu_power(group, 3789 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3702 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3790 group->cpu_power;
3703 3791
3704 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3792 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3705 sgs->group_imb = 1; 3793 sgs->group_imb = 1;
3706 3794
3707 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3795 sgs->group_capacity =
3708 3796 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3709} 3797}
3710 3798
3711/** 3799/**
@@ -3723,9 +3811,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3723 const struct cpumask *cpus, int *balance, 3811 const struct cpumask *cpus, int *balance,
3724 struct sd_lb_stats *sds) 3812 struct sd_lb_stats *sds)
3725{ 3813{
3814 struct sched_domain *child = sd->child;
3726 struct sched_group *group = sd->groups; 3815 struct sched_group *group = sd->groups;
3727 struct sg_lb_stats sgs; 3816 struct sg_lb_stats sgs;
3728 int load_idx; 3817 int load_idx, prefer_sibling = 0;
3818
3819 if (child && child->flags & SD_PREFER_SIBLING)
3820 prefer_sibling = 1;
3729 3821
3730 init_sd_power_savings_stats(sd, sds, idle); 3822 init_sd_power_savings_stats(sd, sds, idle);
3731 load_idx = get_sd_load_idx(sd, idle); 3823 load_idx = get_sd_load_idx(sd, idle);
@@ -3736,14 +3828,22 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3736 local_group = cpumask_test_cpu(this_cpu, 3828 local_group = cpumask_test_cpu(this_cpu,
3737 sched_group_cpus(group)); 3829 sched_group_cpus(group));
3738 memset(&sgs, 0, sizeof(sgs)); 3830 memset(&sgs, 0, sizeof(sgs));
3739 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, 3831 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3740 local_group, cpus, balance, &sgs); 3832 local_group, cpus, balance, &sgs);
3741 3833
3742 if (local_group && balance && !(*balance)) 3834 if (local_group && balance && !(*balance))
3743 return; 3835 return;
3744 3836
3745 sds->total_load += sgs.group_load; 3837 sds->total_load += sgs.group_load;
3746 sds->total_pwr += group->__cpu_power; 3838 sds->total_pwr += group->cpu_power;
3839
3840 /*
3841 * In case the child domain prefers tasks go to siblings
3842 * first, lower the group capacity to one so that we'll try
3843 * and move all the excess tasks away.
3844 */
3845 if (prefer_sibling)
3846 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3747 3847
3748 if (local_group) { 3848 if (local_group) {
3749 sds->this_load = sgs.avg_load; 3849 sds->this_load = sgs.avg_load;
@@ -3763,7 +3863,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3763 update_sd_power_savings_stats(group, sds, local_group, &sgs); 3863 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3764 group = group->next; 3864 group = group->next;
3765 } while (group != sd->groups); 3865 } while (group != sd->groups);
3766
3767} 3866}
3768 3867
3769/** 3868/**
@@ -3801,28 +3900,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3801 * moving them. 3900 * moving them.
3802 */ 3901 */
3803 3902
3804 pwr_now += sds->busiest->__cpu_power * 3903 pwr_now += sds->busiest->cpu_power *
3805 min(sds->busiest_load_per_task, sds->max_load); 3904 min(sds->busiest_load_per_task, sds->max_load);
3806 pwr_now += sds->this->__cpu_power * 3905 pwr_now += sds->this->cpu_power *
3807 min(sds->this_load_per_task, sds->this_load); 3906 min(sds->this_load_per_task, sds->this_load);
3808 pwr_now /= SCHED_LOAD_SCALE; 3907 pwr_now /= SCHED_LOAD_SCALE;
3809 3908
3810 /* Amount of load we'd subtract */ 3909 /* Amount of load we'd subtract */
3811 tmp = sg_div_cpu_power(sds->busiest, 3910 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3812 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3911 sds->busiest->cpu_power;
3813 if (sds->max_load > tmp) 3912 if (sds->max_load > tmp)
3814 pwr_move += sds->busiest->__cpu_power * 3913 pwr_move += sds->busiest->cpu_power *
3815 min(sds->busiest_load_per_task, sds->max_load - tmp); 3914 min(sds->busiest_load_per_task, sds->max_load - tmp);
3816 3915
3817 /* Amount of load we'd add */ 3916 /* Amount of load we'd add */
3818 if (sds->max_load * sds->busiest->__cpu_power < 3917 if (sds->max_load * sds->busiest->cpu_power <
3819 sds->busiest_load_per_task * SCHED_LOAD_SCALE) 3918 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3820 tmp = sg_div_cpu_power(sds->this, 3919 tmp = (sds->max_load * sds->busiest->cpu_power) /
3821 sds->max_load * sds->busiest->__cpu_power); 3920 sds->this->cpu_power;
3822 else 3921 else
3823 tmp = sg_div_cpu_power(sds->this, 3922 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3824 sds->busiest_load_per_task * SCHED_LOAD_SCALE); 3923 sds->this->cpu_power;
3825 pwr_move += sds->this->__cpu_power * 3924 pwr_move += sds->this->cpu_power *
3826 min(sds->this_load_per_task, sds->this_load + tmp); 3925 min(sds->this_load_per_task, sds->this_load + tmp);
3827 pwr_move /= SCHED_LOAD_SCALE; 3926 pwr_move /= SCHED_LOAD_SCALE;
3828 3927
@@ -3857,8 +3956,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3857 sds->max_load - sds->busiest_load_per_task); 3956 sds->max_load - sds->busiest_load_per_task);
3858 3957
3859 /* How much load to actually move to equalise the imbalance */ 3958 /* How much load to actually move to equalise the imbalance */
3860 *imbalance = min(max_pull * sds->busiest->__cpu_power, 3959 *imbalance = min(max_pull * sds->busiest->cpu_power,
3861 (sds->avg_load - sds->this_load) * sds->this->__cpu_power) 3960 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3862 / SCHED_LOAD_SCALE; 3961 / SCHED_LOAD_SCALE;
3863 3962
3864 /* 3963 /*
@@ -3988,15 +4087,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3988 int i; 4087 int i;
3989 4088
3990 for_each_cpu(i, sched_group_cpus(group)) { 4089 for_each_cpu(i, sched_group_cpus(group)) {
4090 unsigned long power = power_of(i);
4091 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
3991 unsigned long wl; 4092 unsigned long wl;
3992 4093
3993 if (!cpumask_test_cpu(i, cpus)) 4094 if (!cpumask_test_cpu(i, cpus))
3994 continue; 4095 continue;
3995 4096
3996 rq = cpu_rq(i); 4097 rq = cpu_rq(i);
3997 wl = weighted_cpuload(i); 4098 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4099 wl /= power;
3998 4100
3999 if (rq->nr_running == 1 && wl > imbalance) 4101 if (capacity && rq->nr_running == 1 && wl > imbalance)
4000 continue; 4102 continue;
4001 4103
4002 if (wl > max_load) { 4104 if (wl > max_load) {
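
find_busiest_queue() now compares runqueues by weighted load normalized to each CPU's cpu_power (wl * SCHED_LOAD_SCALE / power), so a CPU whose capacity has been eroded by SMT sharing or RT pressure looks busier for the same task weight, and a single-task runqueue is only skipped while the CPU still has at least one whole unit of capacity. An illustrative calculation:

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

int main(void)
{
	unsigned long load = 2048;	/* two nice-0 tasks */
	unsigned long full = 1024, half = 512;

	printf("full-power cpu: wl = %lu\n", load * SCHED_LOAD_SCALE / full);
	printf("half-power cpu: wl = %lu\n", load * SCHED_LOAD_SCALE / half);
	return 0;
}
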
@@ -4032,7 +4134,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4032 unsigned long flags; 4134 unsigned long flags;
4033 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4135 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4034 4136
4035 cpumask_setall(cpus); 4137 cpumask_copy(cpus, cpu_online_mask);
4036 4138
4037 /* 4139 /*
4038 * When power savings policy is enabled for the parent domain, idle 4140 * When power savings policy is enabled for the parent domain, idle
@@ -4195,7 +4297,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4195 int all_pinned = 0; 4297 int all_pinned = 0;
4196 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4298 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4197 4299
4198 cpumask_setall(cpus); 4300 cpumask_copy(cpus, cpu_online_mask);
4199 4301
4200 /* 4302 /*
4201 * When power savings policy is enabled for the parent domain, idle 4303 * When power savings policy is enabled for the parent domain, idle
@@ -4335,6 +4437,11 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4335 int pulled_task = 0; 4437 int pulled_task = 0;
4336 unsigned long next_balance = jiffies + HZ; 4438 unsigned long next_balance = jiffies + HZ;
4337 4439
4440 this_rq->idle_stamp = this_rq->clock;
4441
4442 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4443 return;
4444
4338 for_each_domain(this_cpu, sd) { 4445 for_each_domain(this_cpu, sd) {
4339 unsigned long interval; 4446 unsigned long interval;
4340 4447
@@ -4349,8 +4456,10 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
4349 interval = msecs_to_jiffies(sd->balance_interval); 4456 interval = msecs_to_jiffies(sd->balance_interval);
4350 if (time_after(next_balance, sd->last_balance + interval)) 4457 if (time_after(next_balance, sd->last_balance + interval))
4351 next_balance = sd->last_balance + interval; 4458 next_balance = sd->last_balance + interval;
4352 if (pulled_task) 4459 if (pulled_task) {
4460 this_rq->idle_stamp = 0;
4353 break; 4461 break;
4462 }
4354 } 4463 }
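The new idle_stamp/avg_idle pair lets idle_balance() bail out early when this CPU's recent idle periods have been shorter than sysctl_sched_migration_cost: pulling a task would likely cost more than simply waiting for the next wakeup. A hedged sketch of just that gate, with stand-in types rather than the kernel's struct rq:

/* Minimal sketch of the early-out added above; not the kernel's types. */
struct fake_rq {
	unsigned long long clock;	/* ns */
	unsigned long long idle_stamp;	/* when we last went idle */
	unsigned long long avg_idle;	/* running average idle time, ns */
};

static int worth_newidle_balance(struct fake_rq *rq,
				 unsigned long long migration_cost_ns)
{
	rq->idle_stamp = rq->clock;
	/* average idle spell too short to amortize a migration: skip */
	return rq->avg_idle >= migration_cost_ns;
}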
4355 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 4464 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4356 /* 4465 /*
@@ -4952,8 +5061,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
4952 p->gtime = cputime_add(p->gtime, cputime); 5061 p->gtime = cputime_add(p->gtime, cputime);
4953 5062
4954 /* Add guest time to cpustat. */ 5063 /* Add guest time to cpustat. */
4955 cpustat->user = cputime64_add(cpustat->user, tmp); 5064 if (TASK_NICE(p) > 0) {
4956 cpustat->guest = cputime64_add(cpustat->guest, tmp); 5065 cpustat->nice = cputime64_add(cpustat->nice, tmp);
5066 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp);
5067 } else {
5068 cpustat->user = cputime64_add(cpustat->user, tmp);
5069 cpustat->guest = cputime64_add(cpustat->guest, tmp);
5070 }
4957} 5071}
4958 5072
4959/* 5073/*
@@ -5031,17 +5145,16 @@ void account_idle_time(cputime_t cputime)
5031 */ 5145 */
5032void account_process_tick(struct task_struct *p, int user_tick) 5146void account_process_tick(struct task_struct *p, int user_tick)
5033{ 5147{
5034 cputime_t one_jiffy = jiffies_to_cputime(1); 5148 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
5035 cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
5036 struct rq *rq = this_rq(); 5149 struct rq *rq = this_rq();
5037 5150
5038 if (user_tick) 5151 if (user_tick)
5039 account_user_time(p, one_jiffy, one_jiffy_scaled); 5152 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
5040 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 5153 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
5041 account_system_time(p, HARDIRQ_OFFSET, one_jiffy, 5154 account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
5042 one_jiffy_scaled); 5155 one_jiffy_scaled);
5043 else 5156 else
5044 account_idle_time(one_jiffy); 5157 account_idle_time(cputime_one_jiffy);
5045} 5158}
5046 5159
5047/* 5160/*
@@ -5069,60 +5182,86 @@ void account_idle_ticks(unsigned long ticks)
5069 * Use precise platform statistics if available: 5182 * Use precise platform statistics if available:
5070 */ 5183 */
5071#ifdef CONFIG_VIRT_CPU_ACCOUNTING 5184#ifdef CONFIG_VIRT_CPU_ACCOUNTING
5072cputime_t task_utime(struct task_struct *p) 5185void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5073{ 5186{
5074 return p->utime; 5187 *ut = p->utime;
5188 *st = p->stime;
5075} 5189}
5076 5190
5077cputime_t task_stime(struct task_struct *p) 5191void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5078{ 5192{
5079 return p->stime; 5193 struct task_cputime cputime;
5194
5195 thread_group_cputime(p, &cputime);
5196
5197 *ut = cputime.utime;
5198 *st = cputime.stime;
5080} 5199}
5081#else 5200#else
5082cputime_t task_utime(struct task_struct *p) 5201
5202#ifndef nsecs_to_cputime
5203# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
5204#endif
5205
5206void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5083{ 5207{
5084 clock_t utime = cputime_to_clock_t(p->utime), 5208 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime);
5085 total = utime + cputime_to_clock_t(p->stime);
5086 u64 temp;
5087 5209
5088 /* 5210 /*
5089 * Use CFS's precise accounting: 5211 * Use CFS's precise accounting:
5090 */ 5212 */
5091 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); 5213 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
5092 5214
5093 if (total) { 5215 if (total) {
5094 temp *= utime; 5216 u64 temp;
5217
5218 temp = (u64)(rtime * utime);
5095 do_div(temp, total); 5219 do_div(temp, total);
5096 } 5220 utime = (cputime_t)temp;
5097 utime = (clock_t)temp; 5221 } else
5222 utime = rtime;
5223
5224 /*
5225 * Compare with previous values, to keep monotonicity:
5226 */
5227 p->prev_utime = max(p->prev_utime, utime);
5228 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime));
5098 5229
5099 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); 5230 *ut = p->prev_utime;
5100 return p->prev_utime; 5231 *st = p->prev_stime;
5101} 5232}
5102 5233
5103cputime_t task_stime(struct task_struct *p) 5234/*
5235 * Must be called with siglock held.
5236 */
5237void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
5104{ 5238{
5105 clock_t stime; 5239 struct signal_struct *sig = p->signal;
5240 struct task_cputime cputime;
5241 cputime_t rtime, utime, total;
5106 5242
5107 /* 5243 thread_group_cputime(p, &cputime);
5108 * Use CFS's precise accounting. (we subtract utime from
5109 * the total, to make sure the total observed by userspace
5110 * grows monotonically - apps rely on that):
5111 */
5112 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
5113 cputime_to_clock_t(task_utime(p));
5114 5244
5115 if (stime >= 0) 5245 total = cputime_add(cputime.utime, cputime.stime);
5116 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); 5246 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
5117 5247
5118 return p->prev_stime; 5248 if (total) {
5119} 5249 u64 temp;
5120#endif
5121 5250
5122inline cputime_t task_gtime(struct task_struct *p) 5251 temp = (u64)(rtime * cputime.utime);
5123{ 5252 do_div(temp, total);
5124 return p->gtime; 5253 utime = (cputime_t)temp;
5254 } else
5255 utime = rtime;
5256
5257 sig->prev_utime = max(sig->prev_utime, utime);
5258 sig->prev_stime = max(sig->prev_stime,
5259 cputime_sub(rtime, sig->prev_utime));
5260
5261 *ut = sig->prev_utime;
5262 *st = sig->prev_stime;
5125} 5263}
5264#endif
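The non-VIRT_CPU_ACCOUNTING task_times() above distributes the scheduler's precise sum_exec_runtime across utime/stime in the tick-sampled ratio, then clamps against prev_utime/prev_stime so the values reported to userspace never move backwards. A standalone sketch of that scaling and the monotonicity clamp, using plain integers and assumed numbers:

#include <stdio.h>

/* Hedged sketch: cputime values are plain unsigned longs here. */
static void scale_times(unsigned long rtime, unsigned long utime_ticks,
			unsigned long stime_ticks,
			unsigned long *prev_ut, unsigned long *prev_st)
{
	unsigned long total = utime_ticks + stime_ticks;
	unsigned long ut = total ? (unsigned long)((unsigned long long)rtime *
						   utime_ticks / total)
				 : rtime;

	/* keep both components monotonic; stime is whatever remains of rtime */
	if (ut > *prev_ut)
		*prev_ut = ut;
	if (rtime - *prev_ut > *prev_st)
		*prev_st = rtime - *prev_ut;
}

int main(void)
{
	unsigned long ut = 0, st = 0;

	scale_times(1000, 3, 1, &ut, &st);	/* 750/250 split */
	scale_times(1100, 3, 1, &ut, &st);	/* grows to 825/275 */
	printf("ut=%lu st=%lu\n", ut, st);
	return 0;
}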
5126 5265
5127/* 5266/*
5128 * This function gets called by the timer code, with HZ frequency. 5267 * This function gets called by the timer code, with HZ frequency.
@@ -5145,7 +5284,7 @@ void scheduler_tick(void)
5145 curr->sched_class->task_tick(rq, curr, 0); 5284 curr->sched_class->task_tick(rq, curr, 0);
5146 spin_unlock(&rq->lock); 5285 spin_unlock(&rq->lock);
5147 5286
5148 perf_counter_task_tick(curr, cpu); 5287 perf_event_task_tick(curr, cpu);
5149 5288
5150#ifdef CONFIG_SMP 5289#ifdef CONFIG_SMP
5151 rq->idle_at_tick = idle_cpu(cpu); 5290 rq->idle_at_tick = idle_cpu(cpu);
@@ -5257,14 +5396,13 @@ static inline void schedule_debug(struct task_struct *prev)
5257#endif 5396#endif
5258} 5397}
5259 5398
5260static void put_prev_task(struct rq *rq, struct task_struct *prev) 5399static void put_prev_task(struct rq *rq, struct task_struct *p)
5261{ 5400{
5262 if (prev->state == TASK_RUNNING) { 5401 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime;
5263 u64 runtime = prev->se.sum_exec_runtime;
5264 5402
5265 runtime -= prev->se.prev_sum_exec_runtime; 5403 update_avg(&p->se.avg_running, runtime);
5266 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5267 5404
5405 if (p->state == TASK_RUNNING) {
5268 /* 5406 /*
5269 * In order to avoid avg_overlap growing stale when we are 5407 * In order to avoid avg_overlap growing stale when we are
5270 * indeed overlapping and hence not getting put to sleep, grow 5408 * indeed overlapping and hence not getting put to sleep, grow
@@ -5274,9 +5412,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
5274 * correlates to the amount of cache footprint a task can 5412 * correlates to the amount of cache footprint a task can
5275 * build up. 5413 * build up.
5276 */ 5414 */
5277 update_avg(&prev->se.avg_overlap, runtime); 5415 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5416 update_avg(&p->se.avg_overlap, runtime);
5417 } else {
5418 update_avg(&p->se.avg_running, 0);
5278 } 5419 }
5279 prev->sched_class->put_prev_task(rq, prev); 5420 p->sched_class->put_prev_task(rq, p);
5280} 5421}
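put_prev_task() now also maintains avg_running, and only feeds avg_overlap while the task is still runnable, clamping the sample to twice sysctl_sched_migration_cost so one long burst cannot dominate the average. update_avg() itself is believed to be a simple 1/8-weight integer moving average; treat the following as a hedged sketch of that idea, not a quote of the kernel helper:

static void update_avg_sketch(unsigned long long *avg,
			      unsigned long long sample)
{
	long long diff = (long long)sample - (long long)*avg;

	*avg += diff / 8;	/* move 1/8 of the way toward the sample */
}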
5281 5422
5282/* 5423/*
@@ -5325,7 +5466,7 @@ need_resched:
5325 preempt_disable(); 5466 preempt_disable();
5326 cpu = smp_processor_id(); 5467 cpu = smp_processor_id();
5327 rq = cpu_rq(cpu); 5468 rq = cpu_rq(cpu);
5328 rcu_qsctr_inc(cpu); 5469 rcu_sched_qs(cpu);
5329 prev = rq->curr; 5470 prev = rq->curr;
5330 switch_count = &prev->nivcsw; 5471 switch_count = &prev->nivcsw;
5331 5472
@@ -5349,10 +5490,7 @@ need_resched_nonpreemptible:
5349 switch_count = &prev->nvcsw; 5490 switch_count = &prev->nvcsw;
5350 } 5491 }
5351 5492
5352#ifdef CONFIG_SMP 5493 pre_schedule(rq, prev);
5353 if (prev->sched_class->pre_schedule)
5354 prev->sched_class->pre_schedule(rq, prev);
5355#endif
5356 5494
5357 if (unlikely(!rq->nr_running)) 5495 if (unlikely(!rq->nr_running))
5358 idle_balance(cpu, rq); 5496 idle_balance(cpu, rq);
@@ -5362,7 +5500,7 @@ need_resched_nonpreemptible:
5362 5500
5363 if (likely(prev != next)) { 5501 if (likely(prev != next)) {
5364 sched_info_switch(prev, next); 5502 sched_info_switch(prev, next);
5365 perf_counter_task_sched_out(prev, next, cpu); 5503 perf_event_task_sched_out(prev, next, cpu);
5366 5504
5367 rq->nr_switches++; 5505 rq->nr_switches++;
5368 rq->curr = next; 5506 rq->curr = next;
@@ -5378,6 +5516,8 @@ need_resched_nonpreemptible:
5378 } else 5516 } else
5379 spin_unlock_irq(&rq->lock); 5517 spin_unlock_irq(&rq->lock);
5380 5518
5519 post_schedule(rq);
5520
5381 if (unlikely(reacquire_kernel_lock(current) < 0)) 5521 if (unlikely(reacquire_kernel_lock(current) < 0))
5382 goto need_resched_nonpreemptible; 5522 goto need_resched_nonpreemptible;
5383 5523
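The open-coded, SMP-only call to the class's pre_schedule hook has been folded into a pre_schedule() helper, and a matching post_schedule() call now runs after the context switch. The post_schedule() side is assumed to dispatch to an optional per-class hook in much the same way (the real helper may do more, e.g. re-take the rq lock around it). A self-contained sketch of the wrapper pattern with stand-in types:

/* Hedged, self-contained sketch of optional per-class hooks called
 * around __schedule(); types are stand-ins, not the kernel's. */
struct sketch_rq;
struct sketch_task;

struct sketch_sched_class {
	void (*pre_schedule)(struct sketch_rq *rq, struct sketch_task *prev);
	void (*post_schedule)(struct sketch_rq *rq);
};

struct sketch_task {
	const struct sketch_sched_class *sched_class;
};

struct sketch_rq {
	struct sketch_task *curr;
};

static inline void pre_schedule_sketch(struct sketch_rq *rq,
				       struct sketch_task *prev)
{
	if (prev->sched_class->pre_schedule)
		prev->sched_class->pre_schedule(rq, prev);
}

static inline void post_schedule_sketch(struct sketch_rq *rq)
{
	if (rq->curr->sched_class->post_schedule)
		rq->curr->sched_class->post_schedule(rq);
}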
@@ -5387,7 +5527,7 @@ need_resched_nonpreemptible:
5387} 5527}
5388EXPORT_SYMBOL(schedule); 5528EXPORT_SYMBOL(schedule);
5389 5529
5390#ifdef CONFIG_SMP 5530#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
5391/* 5531/*
5392 * Look out! "owner" is an entirely speculative pointer 5532 * Look out! "owner" is an entirely speculative pointer
5393 * access and not reliable. 5533 * access and not reliable.
@@ -5509,10 +5649,10 @@ asmlinkage void __sched preempt_schedule_irq(void)
5509 5649
5510#endif /* CONFIG_PREEMPT */ 5650#endif /* CONFIG_PREEMPT */
5511 5651
5512int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 5652int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
5513 void *key) 5653 void *key)
5514{ 5654{
5515 return try_to_wake_up(curr->private, mode, sync); 5655 return try_to_wake_up(curr->private, mode, wake_flags);
5516} 5656}
5517EXPORT_SYMBOL(default_wake_function); 5657EXPORT_SYMBOL(default_wake_function);
5518 5658
@@ -5526,14 +5666,14 @@ EXPORT_SYMBOL(default_wake_function);
5526 * zero in this (rare) case, and we handle it by continuing to scan the queue. 5666 * zero in this (rare) case, and we handle it by continuing to scan the queue.
5527 */ 5667 */
5528static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 5668static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5529 int nr_exclusive, int sync, void *key) 5669 int nr_exclusive, int wake_flags, void *key)
5530{ 5670{
5531 wait_queue_t *curr, *next; 5671 wait_queue_t *curr, *next;
5532 5672
5533 list_for_each_entry_safe(curr, next, &q->task_list, task_list) { 5673 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
5534 unsigned flags = curr->flags; 5674 unsigned flags = curr->flags;
5535 5675
5536 if (curr->func(curr, mode, sync, key) && 5676 if (curr->func(curr, mode, wake_flags, key) &&
5537 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) 5677 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
5538 break; 5678 break;
5539 } 5679 }
@@ -5594,16 +5734,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
5594 int nr_exclusive, void *key) 5734 int nr_exclusive, void *key)
5595{ 5735{
5596 unsigned long flags; 5736 unsigned long flags;
5597 int sync = 1; 5737 int wake_flags = WF_SYNC;
5598 5738
5599 if (unlikely(!q)) 5739 if (unlikely(!q))
5600 return; 5740 return;
5601 5741
5602 if (unlikely(!nr_exclusive)) 5742 if (unlikely(!nr_exclusive))
5603 sync = 0; 5743 wake_flags = 0;
5604 5744
5605 spin_lock_irqsave(&q->lock, flags); 5745 spin_lock_irqsave(&q->lock, flags);
5606 __wake_up_common(q, mode, nr_exclusive, sync, key); 5746 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
5607 spin_unlock_irqrestore(&q->lock, flags); 5747 spin_unlock_irqrestore(&q->lock, flags);
5608} 5748}
5609EXPORT_SYMBOL_GPL(__wake_up_sync_key); 5749EXPORT_SYMBOL_GPL(__wake_up_sync_key);
@@ -6081,22 +6221,14 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
6081 BUG_ON(p->se.on_rq); 6221 BUG_ON(p->se.on_rq);
6082 6222
6083 p->policy = policy; 6223 p->policy = policy;
6084 switch (p->policy) {
6085 case SCHED_NORMAL:
6086 case SCHED_BATCH:
6087 case SCHED_IDLE:
6088 p->sched_class = &fair_sched_class;
6089 break;
6090 case SCHED_FIFO:
6091 case SCHED_RR:
6092 p->sched_class = &rt_sched_class;
6093 break;
6094 }
6095
6096 p->rt_priority = prio; 6224 p->rt_priority = prio;
6097 p->normal_prio = normal_prio(p); 6225 p->normal_prio = normal_prio(p);
6098 /* we are holding p->pi_lock already */ 6226 /* we are holding p->pi_lock already */
6099 p->prio = rt_mutex_getprio(p); 6227 p->prio = rt_mutex_getprio(p);
6228 if (rt_prio(p->prio))
6229 p->sched_class = &rt_sched_class;
6230 else
6231 p->sched_class = &fair_sched_class;
6100 set_load_weight(p); 6232 set_load_weight(p);
6101} 6233}
6102 6234
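__setscheduler() now picks the scheduling class from the effective priority rather than from a policy switch, so a SCHED_NORMAL task whose priority has been boosted by rt_mutex PI ends up in the rt class as expected. A self-contained illustration of that decision (MAX_RT_PRIO = 100 is assumed to match a stock build):

#include <stdio.h>

#define MAX_RT_PRIO 100		/* assumed stock value */

static int rt_prio(int prio)
{
	return prio < MAX_RT_PRIO;
}

int main(void)
{
	/* PI-boosted SCHED_NORMAL task: prio pulled into the RT range */
	int boosted_prio = 50, normal_prio = 120;

	printf("boosted -> %s class\n", rt_prio(boosted_prio) ? "rt" : "fair");
	printf("normal  -> %s class\n", rt_prio(normal_prio) ? "rt" : "fair");
	return 0;
}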
@@ -6123,17 +6255,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6123 unsigned long flags; 6255 unsigned long flags;
6124 const struct sched_class *prev_class = p->sched_class; 6256 const struct sched_class *prev_class = p->sched_class;
6125 struct rq *rq; 6257 struct rq *rq;
6258 int reset_on_fork;
6126 6259
6127 /* may grab non-irq protected spin_locks */ 6260 /* may grab non-irq protected spin_locks */
6128 BUG_ON(in_interrupt()); 6261 BUG_ON(in_interrupt());
6129recheck: 6262recheck:
6130 /* double check policy once rq lock held */ 6263 /* double check policy once rq lock held */
6131 if (policy < 0) 6264 if (policy < 0) {
6265 reset_on_fork = p->sched_reset_on_fork;
6132 policy = oldpolicy = p->policy; 6266 policy = oldpolicy = p->policy;
6133 else if (policy != SCHED_FIFO && policy != SCHED_RR && 6267 } else {
6134 policy != SCHED_NORMAL && policy != SCHED_BATCH && 6268 reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
6135 policy != SCHED_IDLE) 6269 policy &= ~SCHED_RESET_ON_FORK;
6136 return -EINVAL; 6270
6271 if (policy != SCHED_FIFO && policy != SCHED_RR &&
6272 policy != SCHED_NORMAL && policy != SCHED_BATCH &&
6273 policy != SCHED_IDLE)
6274 return -EINVAL;
6275 }
6276
6137 /* 6277 /*
6138 * Valid priorities for SCHED_FIFO and SCHED_RR are 6278 * Valid priorities for SCHED_FIFO and SCHED_RR are
6139 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, 6279 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
@@ -6177,6 +6317,10 @@ recheck:
6177 /* can't change other user's priorities */ 6317 /* can't change other user's priorities */
6178 if (!check_same_owner(p)) 6318 if (!check_same_owner(p))
6179 return -EPERM; 6319 return -EPERM;
6320
6321 /* Normal users shall not reset the sched_reset_on_fork flag */
6322 if (p->sched_reset_on_fork && !reset_on_fork)
6323 return -EPERM;
6180 } 6324 }
6181 6325
6182 if (user) { 6326 if (user) {
@@ -6220,6 +6364,8 @@ recheck:
6220 if (running) 6364 if (running)
6221 p->sched_class->put_prev_task(rq, p); 6365 p->sched_class->put_prev_task(rq, p);
6222 6366
6367 p->sched_reset_on_fork = reset_on_fork;
6368
6223 oldprio = p->prio; 6369 oldprio = p->prio;
6224 __setscheduler(rq, p, policy, param->sched_priority); 6370 __setscheduler(rq, p, policy, param->sched_priority);
6225 6371
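The new SCHED_RESET_ON_FORK policy bit lets a real-time task request that its children start life as ordinary SCHED_NORMAL tasks; per the permission check above, unprivileged callers may set the flag but not clear it again. A hedged userspace example (older libc headers may not define the constant, so the kernel ABI value 0x40000000 is supplied as a fallback assumption):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

#ifndef SCHED_RESET_ON_FORK
#define SCHED_RESET_ON_FORK 0x40000000	/* kernel ABI value, assumed */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	/* RT for this task only; children fork back to SCHED_NORMAL */
	if (sched_setscheduler(0, SCHED_FIFO | SCHED_RESET_ON_FORK, &sp))
		perror("sched_setscheduler");
	return 0;
}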
@@ -6336,14 +6482,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6336 if (p) { 6482 if (p) {
6337 retval = security_task_getscheduler(p); 6483 retval = security_task_getscheduler(p);
6338 if (!retval) 6484 if (!retval)
6339 retval = p->policy; 6485 retval = p->policy
6486 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6340 } 6487 }
6341 read_unlock(&tasklist_lock); 6488 read_unlock(&tasklist_lock);
6342 return retval; 6489 return retval;
6343} 6490}
6344 6491
6345/** 6492/**
6346 * sys_sched_getscheduler - get the RT priority of a thread 6493 * sys_sched_getparam - get the RT priority of a thread
6347 * @pid: the pid in question. 6494 * @pid: the pid in question.
6348 * @param: structure containing the RT priority. 6495 * @param: structure containing the RT priority.
6349 */ 6496 */
@@ -6571,19 +6718,9 @@ static inline int should_resched(void)
6571 6718
6572static void __cond_resched(void) 6719static void __cond_resched(void)
6573{ 6720{
6574#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6721 add_preempt_count(PREEMPT_ACTIVE);
6575 __might_sleep(__FILE__, __LINE__); 6722 schedule();
6576#endif 6723 sub_preempt_count(PREEMPT_ACTIVE);
6577 /*
6578 * The BKS might be reacquired before we have dropped
6579 * PREEMPT_ACTIVE, which could trigger a second
6580 * cond_resched() call.
6581 */
6582 do {
6583 add_preempt_count(PREEMPT_ACTIVE);
6584 schedule();
6585 sub_preempt_count(PREEMPT_ACTIVE);
6586 } while (need_resched());
6587} 6724}
6588 6725
6589int __sched _cond_resched(void) 6726int __sched _cond_resched(void)
@@ -6597,18 +6734,20 @@ int __sched _cond_resched(void)
6597EXPORT_SYMBOL(_cond_resched); 6734EXPORT_SYMBOL(_cond_resched);
6598 6735
6599/* 6736/*
6600 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 6737 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6601 * call schedule, and on return reacquire the lock. 6738 * call schedule, and on return reacquire the lock.
6602 * 6739 *
6603 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 6740 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
6604 * operations here to prevent schedule() from being called twice (once via 6741 * operations here to prevent schedule() from being called twice (once via
6605 * spin_unlock(), once by hand). 6742 * spin_unlock(), once by hand).
6606 */ 6743 */
6607int cond_resched_lock(spinlock_t *lock) 6744int __cond_resched_lock(spinlock_t *lock)
6608{ 6745{
6609 int resched = should_resched(); 6746 int resched = should_resched();
6610 int ret = 0; 6747 int ret = 0;
6611 6748
6749 lockdep_assert_held(lock);
6750
6612 if (spin_needbreak(lock) || resched) { 6751 if (spin_needbreak(lock) || resched) {
6613 spin_unlock(lock); 6752 spin_unlock(lock);
6614 if (resched) 6753 if (resched)
@@ -6620,9 +6759,9 @@ int cond_resched_lock(spinlock_t *lock)
6620 } 6759 }
6621 return ret; 6760 return ret;
6622} 6761}
6623EXPORT_SYMBOL(cond_resched_lock); 6762EXPORT_SYMBOL(__cond_resched_lock);
6624 6763
6625int __sched cond_resched_softirq(void) 6764int __sched __cond_resched_softirq(void)
6626{ 6765{
6627 BUG_ON(!in_softirq()); 6766 BUG_ON(!in_softirq());
6628 6767
@@ -6634,7 +6773,7 @@ int __sched cond_resched_softirq(void)
6634 } 6773 }
6635 return 0; 6774 return 0;
6636} 6775}
6637EXPORT_SYMBOL(cond_resched_softirq); 6776EXPORT_SYMBOL(__cond_resched_softirq);
6638 6777
6639/** 6778/**
6640 * yield - yield the current processor to other threads. 6779 * yield - yield the current processor to other threads.
@@ -6652,17 +6791,16 @@ EXPORT_SYMBOL(yield);
6652/* 6791/*
6653 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 6792 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6654 * that process accounting knows that this is a task in IO wait state. 6793 * that process accounting knows that this is a task in IO wait state.
6655 *
6656 * But don't do that if it is a deliberate, throttling IO wait (this task
6657 * has set its backing_dev_info: the queue against which it should throttle)
6658 */ 6794 */
6659void __sched io_schedule(void) 6795void __sched io_schedule(void)
6660{ 6796{
6661 struct rq *rq = &__raw_get_cpu_var(runqueues); 6797 struct rq *rq = raw_rq();
6662 6798
6663 delayacct_blkio_start(); 6799 delayacct_blkio_start();
6664 atomic_inc(&rq->nr_iowait); 6800 atomic_inc(&rq->nr_iowait);
6801 current->in_iowait = 1;
6665 schedule(); 6802 schedule();
6803 current->in_iowait = 0;
6666 atomic_dec(&rq->nr_iowait); 6804 atomic_dec(&rq->nr_iowait);
6667 delayacct_blkio_end(); 6805 delayacct_blkio_end();
6668} 6806}
@@ -6670,12 +6808,14 @@ EXPORT_SYMBOL(io_schedule);
6670 6808
6671long __sched io_schedule_timeout(long timeout) 6809long __sched io_schedule_timeout(long timeout)
6672{ 6810{
6673 struct rq *rq = &__raw_get_cpu_var(runqueues); 6811 struct rq *rq = raw_rq();
6674 long ret; 6812 long ret;
6675 6813
6676 delayacct_blkio_start(); 6814 delayacct_blkio_start();
6677 atomic_inc(&rq->nr_iowait); 6815 atomic_inc(&rq->nr_iowait);
6816 current->in_iowait = 1;
6678 ret = schedule_timeout(timeout); 6817 ret = schedule_timeout(timeout);
6818 current->in_iowait = 0;
6679 atomic_dec(&rq->nr_iowait); 6819 atomic_dec(&rq->nr_iowait);
6680 delayacct_blkio_end(); 6820 delayacct_blkio_end();
6681 return ret; 6821 return ret;
@@ -6759,23 +6899,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6759 if (retval) 6899 if (retval)
6760 goto out_unlock; 6900 goto out_unlock;
6761 6901
6762 /* 6902 time_slice = p->sched_class->get_rr_interval(p);
6763 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
6764 * tasks that are on an otherwise idle runqueue:
6765 */
6766 time_slice = 0;
6767 if (p->policy == SCHED_RR) {
6768 time_slice = DEF_TIMESLICE;
6769 } else if (p->policy != SCHED_FIFO) {
6770 struct sched_entity *se = &p->se;
6771 unsigned long flags;
6772 struct rq *rq;
6773 6903
6774 rq = task_rq_lock(p, &flags);
6775 if (rq->cfs.load.weight)
6776 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
6777 task_rq_unlock(rq, &flags);
6778 }
6779 read_unlock(&tasklist_lock); 6904 read_unlock(&tasklist_lock);
6780 jiffies_to_timespec(time_slice, &t); 6905 jiffies_to_timespec(time_slice, &t);
6781 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6906 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
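The reported timeslice now comes from the class's get_rr_interval() hook instead of the removed open-coded computation. From userspace the result is read exactly as before; a small example:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* 0 == calling thread; prints e.g. 0.100000000 for a SCHED_RR task */
	if (sched_rr_get_interval(0, &ts) == 0)
		printf("timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	else
		perror("sched_rr_get_interval");
	return 0;
}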
@@ -6848,7 +6973,7 @@ void show_state_filter(unsigned long state_filter)
6848 /* 6973 /*
6849 * Only show locks if all tasks are dumped: 6974 * Only show locks if all tasks are dumped:
6850 */ 6975 */
6851 if (state_filter == -1) 6976 if (!state_filter)
6852 debug_show_all_locks(); 6977 debug_show_all_locks();
6853} 6978}
6854 6979
@@ -6992,8 +7117,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
6992 7117
6993 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7118 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
6994 /* Need help from migration thread: drop lock and wait. */ 7119 /* Need help from migration thread: drop lock and wait. */
7120 struct task_struct *mt = rq->migration_thread;
7121
7122 get_task_struct(mt);
6995 task_rq_unlock(rq, &flags); 7123 task_rq_unlock(rq, &flags);
6996 wake_up_process(rq->migration_thread); 7124 wake_up_process(rq->migration_thread);
7125 put_task_struct(mt);
6997 wait_for_completion(&req.done); 7126 wait_for_completion(&req.done);
6998 tlb_migrate_finish(p->mm); 7127 tlb_migrate_finish(p->mm);
6999 return 0; 7128 return 0;
@@ -7051,6 +7180,11 @@ fail:
7051 return ret; 7180 return ret;
7052} 7181}
7053 7182
7183#define RCU_MIGRATION_IDLE 0
7184#define RCU_MIGRATION_NEED_QS 1
7185#define RCU_MIGRATION_GOT_QS 2
7186#define RCU_MIGRATION_MUST_SYNC 3
7187
7054/* 7188/*
7055 * migration_thread - this is a highprio system thread that performs 7189 * migration_thread - this is a highprio system thread that performs
7056 * thread migration by bumping thread off CPU then 'pushing' onto 7190 * thread migration by bumping thread off CPU then 'pushing' onto
@@ -7058,6 +7192,7 @@ fail:
7058 */ 7192 */
7059static int migration_thread(void *data) 7193static int migration_thread(void *data)
7060{ 7194{
7195 int badcpu;
7061 int cpu = (long)data; 7196 int cpu = (long)data;
7062 struct rq *rq; 7197 struct rq *rq;
7063 7198
@@ -7092,8 +7227,17 @@ static int migration_thread(void *data)
7092 req = list_entry(head->next, struct migration_req, list); 7227 req = list_entry(head->next, struct migration_req, list);
7093 list_del_init(head->next); 7228 list_del_init(head->next);
7094 7229
7095 spin_unlock(&rq->lock); 7230 if (req->task != NULL) {
7096 __migrate_task(req->task, cpu, req->dest_cpu); 7231 spin_unlock(&rq->lock);
7232 __migrate_task(req->task, cpu, req->dest_cpu);
7233 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7234 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7235 spin_unlock(&rq->lock);
7236 } else {
7237 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7238 spin_unlock(&rq->lock);
7239 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7240 }
7097 local_irq_enable(); 7241 local_irq_enable();
7098 7242
7099 complete(&req->done); 7243 complete(&req->done);
@@ -7300,17 +7444,16 @@ static struct ctl_table sd_ctl_dir[] = {
7300 .procname = "sched_domain", 7444 .procname = "sched_domain",
7301 .mode = 0555, 7445 .mode = 0555,
7302 }, 7446 },
7303 {0, }, 7447 {}
7304}; 7448};
7305 7449
7306static struct ctl_table sd_ctl_root[] = { 7450static struct ctl_table sd_ctl_root[] = {
7307 { 7451 {
7308 .ctl_name = CTL_KERN,
7309 .procname = "kernel", 7452 .procname = "kernel",
7310 .mode = 0555, 7453 .mode = 0555,
7311 .child = sd_ctl_dir, 7454 .child = sd_ctl_dir,
7312 }, 7455 },
7313 {0, }, 7456 {}
7314}; 7457};
7315 7458
7316static struct ctl_table *sd_alloc_ctl_entry(int n) 7459static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -7607,7 +7750,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7607/* 7750/*
7608 * Register at high priority so that task migration (migrate_all_tasks) 7751 * Register at high priority so that task migration (migrate_all_tasks)
7609 * happens before everything else. This has to be lower priority than 7752 * happens before everything else. This has to be lower priority than
7610 * the notifier in the perf_counter subsystem, though. 7753 * the notifier in the perf_event subsystem, though.
7611 */ 7754 */
7612static struct notifier_block __cpuinitdata migration_notifier = { 7755static struct notifier_block __cpuinitdata migration_notifier = {
7613 .notifier_call = migration_call, 7756 .notifier_call = migration_call,
@@ -7625,7 +7768,7 @@ static int __init migration_init(void)
7625 migration_call(&migration_notifier, CPU_ONLINE, cpu); 7768 migration_call(&migration_notifier, CPU_ONLINE, cpu);
7626 register_cpu_notifier(&migration_notifier); 7769 register_cpu_notifier(&migration_notifier);
7627 7770
7628 return err; 7771 return 0;
7629} 7772}
7630early_initcall(migration_init); 7773early_initcall(migration_init);
7631#endif 7774#endif
@@ -7634,6 +7777,16 @@ early_initcall(migration_init);
7634 7777
7635#ifdef CONFIG_SCHED_DEBUG 7778#ifdef CONFIG_SCHED_DEBUG
7636 7779
7780static __read_mostly int sched_domain_debug_enabled;
7781
7782static int __init sched_domain_debug_setup(char *str)
7783{
7784 sched_domain_debug_enabled = 1;
7785
7786 return 0;
7787}
7788early_param("sched_debug", sched_domain_debug_setup);
7789
7637static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 7790static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7638 struct cpumask *groupmask) 7791 struct cpumask *groupmask)
7639{ 7792{
@@ -7672,7 +7825,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7672 break; 7825 break;
7673 } 7826 }
7674 7827
7675 if (!group->__cpu_power) { 7828 if (!group->cpu_power) {
7676 printk(KERN_CONT "\n"); 7829 printk(KERN_CONT "\n");
7677 printk(KERN_ERR "ERROR: domain->cpu_power not " 7830 printk(KERN_ERR "ERROR: domain->cpu_power not "
7678 "set\n"); 7831 "set\n");
@@ -7696,9 +7849,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7696 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7849 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7697 7850
7698 printk(KERN_CONT " %s", str); 7851 printk(KERN_CONT " %s", str);
7699 if (group->__cpu_power != SCHED_LOAD_SCALE) { 7852 if (group->cpu_power != SCHED_LOAD_SCALE) {
7700 printk(KERN_CONT " (__cpu_power = %d)", 7853 printk(KERN_CONT " (cpu_power = %d)",
7701 group->__cpu_power); 7854 group->cpu_power);
7702 } 7855 }
7703 7856
7704 group = group->next; 7857 group = group->next;
@@ -7720,6 +7873,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
7720 cpumask_var_t groupmask; 7873 cpumask_var_t groupmask;
7721 int level = 0; 7874 int level = 0;
7722 7875
7876 if (!sched_domain_debug_enabled)
7877 return;
7878
7723 if (!sd) { 7879 if (!sd) {
7724 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); 7880 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
7725 return; 7881 return;
@@ -7763,9 +7919,7 @@ static int sd_degenerate(struct sched_domain *sd)
7763 } 7919 }
7764 7920
7765 /* Following flags don't use groups */ 7921 /* Following flags don't use groups */
7766 if (sd->flags & (SD_WAKE_IDLE | 7922 if (sd->flags & (SD_WAKE_AFFINE))
7767 SD_WAKE_AFFINE |
7768 SD_WAKE_BALANCE))
7769 return 0; 7923 return 0;
7770 7924
7771 return 1; 7925 return 1;
@@ -7782,10 +7936,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7782 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) 7936 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
7783 return 0; 7937 return 0;
7784 7938
7785 /* Does parent contain flags not in child? */
7786 /* WAKE_BALANCE is a subset of WAKE_AFFINE */
7787 if (cflags & SD_WAKE_AFFINE)
7788 pflags &= ~SD_WAKE_BALANCE;
7789 /* Flags needing groups don't count if only 1 group in parent */ 7939 /* Flags needing groups don't count if only 1 group in parent */
7790 if (parent->groups == parent->groups->next) { 7940 if (parent->groups == parent->groups->next) {
7791 pflags &= ~(SD_LOAD_BALANCE | 7941 pflags &= ~(SD_LOAD_BALANCE |
@@ -7805,6 +7955,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
7805 7955
7806static void free_rootdomain(struct root_domain *rd) 7956static void free_rootdomain(struct root_domain *rd)
7807{ 7957{
7958 synchronize_sched();
7959
7808 cpupri_cleanup(&rd->cpupri); 7960 cpupri_cleanup(&rd->cpupri);
7809 7961
7810 free_cpumask_var(rd->rto_mask); 7962 free_cpumask_var(rd->rto_mask);
@@ -7841,7 +7993,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7841 rq->rd = rd; 7993 rq->rd = rd;
7842 7994
7843 cpumask_set_cpu(rq->cpu, rd->span); 7995 cpumask_set_cpu(rq->cpu, rd->span);
7844 if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) 7996 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7845 set_rq_online(rq); 7997 set_rq_online(rq);
7846 7998
7847 spin_unlock_irqrestore(&rq->lock, flags); 7999 spin_unlock_irqrestore(&rq->lock, flags);
@@ -7945,6 +8097,7 @@ static cpumask_var_t cpu_isolated_map;
7945/* Setup the mask of cpus configured for isolated domains */ 8097/* Setup the mask of cpus configured for isolated domains */
7946static int __init isolated_cpu_setup(char *str) 8098static int __init isolated_cpu_setup(char *str)
7947{ 8099{
8100 alloc_bootmem_cpumask_var(&cpu_isolated_map);
7948 cpulist_parse(str, cpu_isolated_map); 8101 cpulist_parse(str, cpu_isolated_map);
7949 return 1; 8102 return 1;
7950} 8103}
@@ -7983,7 +8136,7 @@ init_sched_build_groups(const struct cpumask *span,
7983 continue; 8136 continue;
7984 8137
7985 cpumask_clear(sched_group_cpus(sg)); 8138 cpumask_clear(sched_group_cpus(sg));
7986 sg->__cpu_power = 0; 8139 sg->cpu_power = 0;
7987 8140
7988 for_each_cpu(j, span) { 8141 for_each_cpu(j, span) {
7989 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 8142 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
@@ -8091,6 +8244,39 @@ struct static_sched_domain {
8091 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 8244 DECLARE_BITMAP(span, CONFIG_NR_CPUS);
8092}; 8245};
8093 8246
8247struct s_data {
8248#ifdef CONFIG_NUMA
8249 int sd_allnodes;
8250 cpumask_var_t domainspan;
8251 cpumask_var_t covered;
8252 cpumask_var_t notcovered;
8253#endif
8254 cpumask_var_t nodemask;
8255 cpumask_var_t this_sibling_map;
8256 cpumask_var_t this_core_map;
8257 cpumask_var_t send_covered;
8258 cpumask_var_t tmpmask;
8259 struct sched_group **sched_group_nodes;
8260 struct root_domain *rd;
8261};
8262
8263enum s_alloc {
8264 sa_sched_groups = 0,
8265 sa_rootdomain,
8266 sa_tmpmask,
8267 sa_send_covered,
8268 sa_this_core_map,
8269 sa_this_sibling_map,
8270 sa_nodemask,
8271 sa_sched_group_nodes,
8272#ifdef CONFIG_NUMA
8273 sa_notcovered,
8274 sa_covered,
8275 sa_domainspan,
8276#endif
8277 sa_none,
8278};
8279
8094/* 8280/*
8095 * SMT sched-domains: 8281 * SMT sched-domains:
8096 */ 8282 */
@@ -8208,11 +8394,76 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
8208 continue; 8394 continue;
8209 } 8395 }
8210 8396
8211 sg_inc_cpu_power(sg, sd->groups->__cpu_power); 8397 sg->cpu_power += sd->groups->cpu_power;
8212 } 8398 }
8213 sg = sg->next; 8399 sg = sg->next;
8214 } while (sg != group_head); 8400 } while (sg != group_head);
8215} 8401}
8402
8403static int build_numa_sched_groups(struct s_data *d,
8404 const struct cpumask *cpu_map, int num)
8405{
8406 struct sched_domain *sd;
8407 struct sched_group *sg, *prev;
8408 int n, j;
8409
8410 cpumask_clear(d->covered);
8411 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
8412 if (cpumask_empty(d->nodemask)) {
8413 d->sched_group_nodes[num] = NULL;
8414 goto out;
8415 }
8416
8417 sched_domain_node_span(num, d->domainspan);
8418 cpumask_and(d->domainspan, d->domainspan, cpu_map);
8419
8420 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8421 GFP_KERNEL, num);
8422 if (!sg) {
8423 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
8424 num);
8425 return -ENOMEM;
8426 }
8427 d->sched_group_nodes[num] = sg;
8428
8429 for_each_cpu(j, d->nodemask) {
8430 sd = &per_cpu(node_domains, j).sd;
8431 sd->groups = sg;
8432 }
8433
8434 sg->cpu_power = 0;
8435 cpumask_copy(sched_group_cpus(sg), d->nodemask);
8436 sg->next = sg;
8437 cpumask_or(d->covered, d->covered, d->nodemask);
8438
8439 prev = sg;
8440 for (j = 0; j < nr_node_ids; j++) {
8441 n = (num + j) % nr_node_ids;
8442 cpumask_complement(d->notcovered, d->covered);
8443 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
8444 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
8445 if (cpumask_empty(d->tmpmask))
8446 break;
8447 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
8448 if (cpumask_empty(d->tmpmask))
8449 continue;
8450 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
8451 GFP_KERNEL, num);
8452 if (!sg) {
8453 printk(KERN_WARNING
8454 "Can not alloc domain group for node %d\n", j);
8455 return -ENOMEM;
8456 }
8457 sg->cpu_power = 0;
8458 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
8459 sg->next = prev->next;
8460 cpumask_or(d->covered, d->covered, d->tmpmask);
8461 prev->next = sg;
8462 prev = sg;
8463 }
8464out:
8465 return 0;
8466}
8216#endif /* CONFIG_NUMA */ 8467#endif /* CONFIG_NUMA */
8217 8468
8218#ifdef CONFIG_NUMA 8469#ifdef CONFIG_NUMA
@@ -8266,15 +8517,13 @@ static void free_sched_groups(const struct cpumask *cpu_map,
8266 * there are asymmetries in the topology. If there are asymmetries, group 8517 * there are asymmetries in the topology. If there are asymmetries, group
8267 * having more cpu_power will pick up more load compared to the group having 8518 * having more cpu_power will pick up more load compared to the group having
8268 * less cpu_power. 8519 * less cpu_power.
8269 *
8270 * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
8271 * the maximum number of tasks a group can handle in the presence of other idle
8272 * or lightly loaded groups in the same sched domain.
8273 */ 8520 */
8274static void init_sched_groups_power(int cpu, struct sched_domain *sd) 8521static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8275{ 8522{
8276 struct sched_domain *child; 8523 struct sched_domain *child;
8277 struct sched_group *group; 8524 struct sched_group *group;
8525 long power;
8526 int weight;
8278 8527
8279 WARN_ON(!sd || !sd->groups); 8528 WARN_ON(!sd || !sd->groups);
8280 8529
@@ -8283,28 +8532,32 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
8283 8532
8284 child = sd->child; 8533 child = sd->child;
8285 8534
8286 sd->groups->__cpu_power = 0; 8535 sd->groups->cpu_power = 0;
8287 8536
8288 /* 8537 if (!child) {
8289 * For perf policy, if the groups in child domain share resources 8538 power = SCHED_LOAD_SCALE;
8290 * (for example cores sharing some portions of the cache hierarchy 8539 weight = cpumask_weight(sched_domain_span(sd));
8291 * or SMT), then set this domain groups cpu_power such that each group 8540 /*
8292 * can handle only one task, when there are other idle groups in the 8541 * SMT siblings share the power of a single core.
8293 * same sched domain. 8542 * Usually multiple threads get a better yield out of
8294 */ 8543 * that one core than a single thread would have,
8295 if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && 8544 * reflect that in sd->smt_gain.
8296 (child->flags & 8545 */
8297 (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { 8546 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
8298 sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); 8547 power *= sd->smt_gain;
8548 power /= weight;
8549 power >>= SCHED_LOAD_SHIFT;
8550 }
8551 sd->groups->cpu_power += power;
8299 return; 8552 return;
8300 } 8553 }
8301 8554
8302 /* 8555 /*
8303 * add cpu_power of each child group to this groups cpu_power 8556 * Add cpu_power of each child group to this groups cpu_power.
8304 */ 8557 */
8305 group = child->groups; 8558 group = child->groups;
8306 do { 8559 do {
8307 sg_inc_cpu_power(sd->groups, group->__cpu_power); 8560 sd->groups->cpu_power += group->cpu_power;
8308 group = group->next; 8561 group = group->next;
8309 } while (group != child->groups); 8562 } while (group != child->groups);
8310} 8563}
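For the lowest (SMT) domain the group power is now derived directly from smt_gain. With SCHED_LOAD_SCALE = 1024 and the default smt_gain of 1178 (if memory serves, chosen as roughly 1.15 * 1024), a two-sibling core works out to about 589 per sibling, so a pair of hyperthreads together is rated somewhat above one nominal CPU rather than two. A standalone rendering of the same integer arithmetic, with the default value treated as an assumption:

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

int main(void)
{
	unsigned long power = SCHED_LOAD_SCALE;
	unsigned long smt_gain = 1178;	/* assumed default, ~1.15 * 1024 */
	unsigned long weight = 2;	/* two hardware threads per core */

	power *= smt_gain;
	power /= weight;
	power >>= SCHED_LOAD_SHIFT;

	printf("per-sibling cpu_power = %lu\n", power);	/* prints 589 */
	return 0;
}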
@@ -8371,287 +8624,292 @@ static void set_domain_attribute(struct sched_domain *sd,
8371 request = attr->relax_domain_level; 8624 request = attr->relax_domain_level;
8372 if (request < sd->level) { 8625 if (request < sd->level) {
8373 /* turn off idle balance on this domain */ 8626 /* turn off idle balance on this domain */
8374 sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); 8627 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8375 } else { 8628 } else {
8376 /* turn on idle balance on this domain */ 8629 /* turn on idle balance on this domain */
8377 sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); 8630 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
8631 }
8632}
8633
8634static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
8635 const struct cpumask *cpu_map)
8636{
8637 switch (what) {
8638 case sa_sched_groups:
8639 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
8640 d->sched_group_nodes = NULL;
8641 case sa_rootdomain:
8642 free_rootdomain(d->rd); /* fall through */
8643 case sa_tmpmask:
8644 free_cpumask_var(d->tmpmask); /* fall through */
8645 case sa_send_covered:
8646 free_cpumask_var(d->send_covered); /* fall through */
8647 case sa_this_core_map:
8648 free_cpumask_var(d->this_core_map); /* fall through */
8649 case sa_this_sibling_map:
8650 free_cpumask_var(d->this_sibling_map); /* fall through */
8651 case sa_nodemask:
8652 free_cpumask_var(d->nodemask); /* fall through */
8653 case sa_sched_group_nodes:
8654#ifdef CONFIG_NUMA
8655 kfree(d->sched_group_nodes); /* fall through */
8656 case sa_notcovered:
8657 free_cpumask_var(d->notcovered); /* fall through */
8658 case sa_covered:
8659 free_cpumask_var(d->covered); /* fall through */
8660 case sa_domainspan:
8661 free_cpumask_var(d->domainspan); /* fall through */
8662#endif
8663 case sa_none:
8664 break;
8378 } 8665 }
8379} 8666}
8380 8667
8381/* 8668static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
8382 * Build sched domains for a given set of cpus and attach the sched domains 8669 const struct cpumask *cpu_map)
8383 * to the individual cpus
8384 */
8385static int __build_sched_domains(const struct cpumask *cpu_map,
8386 struct sched_domain_attr *attr)
8387{ 8670{
8388 int i, err = -ENOMEM;
8389 struct root_domain *rd;
8390 cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
8391 tmpmask;
8392#ifdef CONFIG_NUMA
8393 cpumask_var_t domainspan, covered, notcovered;
8394 struct sched_group **sched_group_nodes = NULL;
8395 int sd_allnodes = 0;
8396
8397 if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
8398 goto out;
8399 if (!alloc_cpumask_var(&covered, GFP_KERNEL))
8400 goto free_domainspan;
8401 if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
8402 goto free_covered;
8403#endif
8404
8405 if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
8406 goto free_notcovered;
8407 if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
8408 goto free_nodemask;
8409 if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
8410 goto free_this_sibling_map;
8411 if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
8412 goto free_this_core_map;
8413 if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
8414 goto free_send_covered;
8415
8416#ifdef CONFIG_NUMA 8671#ifdef CONFIG_NUMA
8417 /* 8672 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
8418 * Allocate the per-node list of sched groups 8673 return sa_none;
8419 */ 8674 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
8420 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), 8675 return sa_domainspan;
8421 GFP_KERNEL); 8676 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
8422 if (!sched_group_nodes) { 8677 return sa_covered;
8678 /* Allocate the per-node list of sched groups */
8679 d->sched_group_nodes = kcalloc(nr_node_ids,
8680 sizeof(struct sched_group *), GFP_KERNEL);
8681 if (!d->sched_group_nodes) {
8423 printk(KERN_WARNING "Can not alloc sched group node list\n"); 8682 printk(KERN_WARNING "Can not alloc sched group node list\n");
8424 goto free_tmpmask; 8683 return sa_notcovered;
8425 } 8684 }
8426#endif 8685 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
8427 8686#endif
8428 rd = alloc_rootdomain(); 8687 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
8429 if (!rd) { 8688 return sa_sched_group_nodes;
8689 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
8690 return sa_nodemask;
8691 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
8692 return sa_this_sibling_map;
8693 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
8694 return sa_this_core_map;
8695 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
8696 return sa_send_covered;
8697 d->rd = alloc_rootdomain();
8698 if (!d->rd) {
8430 printk(KERN_WARNING "Cannot alloc root domain\n"); 8699 printk(KERN_WARNING "Cannot alloc root domain\n");
8431 goto free_sched_groups; 8700 return sa_tmpmask;
8432 } 8701 }
8702 return sa_rootdomain;
8703}
8433 8704
8705static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
8706 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
8707{
8708 struct sched_domain *sd = NULL;
8434#ifdef CONFIG_NUMA 8709#ifdef CONFIG_NUMA
8435 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; 8710 struct sched_domain *parent;
8436#endif
8437
8438 /*
8439 * Set up domains for cpus specified by the cpu_map.
8440 */
8441 for_each_cpu(i, cpu_map) {
8442 struct sched_domain *sd = NULL, *p;
8443
8444 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
8445
8446#ifdef CONFIG_NUMA
8447 if (cpumask_weight(cpu_map) >
8448 SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
8449 sd = &per_cpu(allnodes_domains, i).sd;
8450 SD_INIT(sd, ALLNODES);
8451 set_domain_attribute(sd, attr);
8452 cpumask_copy(sched_domain_span(sd), cpu_map);
8453 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
8454 p = sd;
8455 sd_allnodes = 1;
8456 } else
8457 p = NULL;
8458 8711
8459 sd = &per_cpu(node_domains, i).sd; 8712 d->sd_allnodes = 0;
8460 SD_INIT(sd, NODE); 8713 if (cpumask_weight(cpu_map) >
8714 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
8715 sd = &per_cpu(allnodes_domains, i).sd;
8716 SD_INIT(sd, ALLNODES);
8461 set_domain_attribute(sd, attr); 8717 set_domain_attribute(sd, attr);
8462 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); 8718 cpumask_copy(sched_domain_span(sd), cpu_map);
8463 sd->parent = p; 8719 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
8464 if (p) 8720 d->sd_allnodes = 1;
8465 p->child = sd; 8721 }
8466 cpumask_and(sched_domain_span(sd), 8722 parent = sd;
8467 sched_domain_span(sd), cpu_map); 8723
8724 sd = &per_cpu(node_domains, i).sd;
8725 SD_INIT(sd, NODE);
8726 set_domain_attribute(sd, attr);
8727 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
8728 sd->parent = parent;
8729 if (parent)
8730 parent->child = sd;
8731 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
8468#endif 8732#endif
8733 return sd;
8734}
8469 8735
8470 p = sd; 8736static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
8471 sd = &per_cpu(phys_domains, i).sd; 8737 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8472 SD_INIT(sd, CPU); 8738 struct sched_domain *parent, int i)
8473 set_domain_attribute(sd, attr); 8739{
8474 cpumask_copy(sched_domain_span(sd), nodemask); 8740 struct sched_domain *sd;
8475 sd->parent = p; 8741 sd = &per_cpu(phys_domains, i).sd;
8476 if (p) 8742 SD_INIT(sd, CPU);
8477 p->child = sd; 8743 set_domain_attribute(sd, attr);
8478 cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); 8744 cpumask_copy(sched_domain_span(sd), d->nodemask);
8745 sd->parent = parent;
8746 if (parent)
8747 parent->child = sd;
8748 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
8749 return sd;
8750}
8479 8751
8752static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
8753 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8754 struct sched_domain *parent, int i)
8755{
8756 struct sched_domain *sd = parent;
8480#ifdef CONFIG_SCHED_MC 8757#ifdef CONFIG_SCHED_MC
8481 p = sd; 8758 sd = &per_cpu(core_domains, i).sd;
8482 sd = &per_cpu(core_domains, i).sd; 8759 SD_INIT(sd, MC);
8483 SD_INIT(sd, MC); 8760 set_domain_attribute(sd, attr);
8484 set_domain_attribute(sd, attr); 8761 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
8485 cpumask_and(sched_domain_span(sd), cpu_map, 8762 sd->parent = parent;
8486 cpu_coregroup_mask(i)); 8763 parent->child = sd;
8487 sd->parent = p; 8764 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
8488 p->child = sd;
8489 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
8490#endif 8765#endif
8766 return sd;
8767}
8491 8768
8769static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
8770 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
8771 struct sched_domain *parent, int i)
8772{
8773 struct sched_domain *sd = parent;
8492#ifdef CONFIG_SCHED_SMT 8774#ifdef CONFIG_SCHED_SMT
8493 p = sd; 8775 sd = &per_cpu(cpu_domains, i).sd;
8494 sd = &per_cpu(cpu_domains, i).sd; 8776 SD_INIT(sd, SIBLING);
8495 SD_INIT(sd, SIBLING); 8777 set_domain_attribute(sd, attr);
8496 set_domain_attribute(sd, attr); 8778 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
8497 cpumask_and(sched_domain_span(sd), 8779 sd->parent = parent;
8498 topology_thread_cpumask(i), cpu_map); 8780 parent->child = sd;
8499 sd->parent = p; 8781 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
8500 p->child = sd;
8501 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
8502#endif 8782#endif
8503 } 8783 return sd;
8784}
8504 8785
8786static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
8787 const struct cpumask *cpu_map, int cpu)
8788{
8789 switch (l) {
8505#ifdef CONFIG_SCHED_SMT 8790#ifdef CONFIG_SCHED_SMT
8506 /* Set up CPU (sibling) groups */ 8791 case SD_LV_SIBLING: /* set up CPU (sibling) groups */
8507 for_each_cpu(i, cpu_map) { 8792 cpumask_and(d->this_sibling_map, cpu_map,
8508 cpumask_and(this_sibling_map, 8793 topology_thread_cpumask(cpu));
8509 topology_thread_cpumask(i), cpu_map); 8794 if (cpu == cpumask_first(d->this_sibling_map))
8510 if (i != cpumask_first(this_sibling_map)) 8795 init_sched_build_groups(d->this_sibling_map, cpu_map,
8511 continue; 8796 &cpu_to_cpu_group,
8512 8797 d->send_covered, d->tmpmask);
8513 init_sched_build_groups(this_sibling_map, cpu_map, 8798 break;
8514 &cpu_to_cpu_group,
8515 send_covered, tmpmask);
8516 }
8517#endif 8799#endif
8518
8519#ifdef CONFIG_SCHED_MC 8800#ifdef CONFIG_SCHED_MC
8520 /* Set up multi-core groups */ 8801 case SD_LV_MC: /* set up multi-core groups */
8521 for_each_cpu(i, cpu_map) { 8802 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
8522 cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); 8803 if (cpu == cpumask_first(d->this_core_map))
8523 if (i != cpumask_first(this_core_map)) 8804 init_sched_build_groups(d->this_core_map, cpu_map,
8524 continue; 8805 &cpu_to_core_group,
8525 8806 d->send_covered, d->tmpmask);
8526 init_sched_build_groups(this_core_map, cpu_map, 8807 break;
8527 &cpu_to_core_group,
8528 send_covered, tmpmask);
8529 }
8530#endif 8808#endif
8531 8809 case SD_LV_CPU: /* set up physical groups */
8532 /* Set up physical groups */ 8810 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
8533 for (i = 0; i < nr_node_ids; i++) { 8811 if (!cpumask_empty(d->nodemask))
8534 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8812 init_sched_build_groups(d->nodemask, cpu_map,
8535 if (cpumask_empty(nodemask)) 8813 &cpu_to_phys_group,
8536 continue; 8814 d->send_covered, d->tmpmask);
8537 8815 break;
8538 init_sched_build_groups(nodemask, cpu_map,
8539 &cpu_to_phys_group,
8540 send_covered, tmpmask);
8541 }
8542
8543#ifdef CONFIG_NUMA 8816#ifdef CONFIG_NUMA
8544 /* Set up node groups */ 8817 case SD_LV_ALLNODES:
8545 if (sd_allnodes) { 8818 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
8546 init_sched_build_groups(cpu_map, cpu_map, 8819 d->send_covered, d->tmpmask);
8547 &cpu_to_allnodes_group, 8820 break;
8548 send_covered, tmpmask); 8821#endif
8822 default:
8823 break;
8549 } 8824 }
8825}
8550 8826
8551 for (i = 0; i < nr_node_ids; i++) { 8827/*
8552 /* Set up node groups */ 8828 * Build sched domains for a given set of cpus and attach the sched domains
8553 struct sched_group *sg, *prev; 8829 * to the individual cpus
8554 int j; 8830 */
8555 8831static int __build_sched_domains(const struct cpumask *cpu_map,
8556 cpumask_clear(covered); 8832 struct sched_domain_attr *attr)
8557 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 8833{
8558 if (cpumask_empty(nodemask)) { 8834 enum s_alloc alloc_state = sa_none;
8559 sched_group_nodes[i] = NULL; 8835 struct s_data d;
8560 continue; 8836 struct sched_domain *sd;
8561 } 8837 int i;
8838#ifdef CONFIG_NUMA
8839 d.sd_allnodes = 0;
8840#endif
8562 8841
8563 sched_domain_node_span(i, domainspan); 8842 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
8564 cpumask_and(domainspan, domainspan, cpu_map); 8843 if (alloc_state != sa_rootdomain)
8844 goto error;
8845 alloc_state = sa_sched_groups;
8565 8846
8566 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), 8847 /*
8567 GFP_KERNEL, i); 8848 * Set up domains for cpus specified by the cpu_map.
8568 if (!sg) { 8849 */
8569 printk(KERN_WARNING "Can not alloc domain group for " 8850 for_each_cpu(i, cpu_map) {
8570 "node %d\n", i); 8851 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
8571 goto error; 8852 cpu_map);
8572 }
8573 sched_group_nodes[i] = sg;
8574 for_each_cpu(j, nodemask) {
8575 struct sched_domain *sd;
8576 8853
8577 sd = &per_cpu(node_domains, j).sd; 8854 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
8578 sd->groups = sg; 8855 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
8579 } 8856 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
8580 sg->__cpu_power = 0; 8857 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
8581 cpumask_copy(sched_group_cpus(sg), nodemask); 8858 }
8582 sg->next = sg;
8583 cpumask_or(covered, covered, nodemask);
8584 prev = sg;
8585 8859
8586 for (j = 0; j < nr_node_ids; j++) { 8860 for_each_cpu(i, cpu_map) {
8587 int n = (i + j) % nr_node_ids; 8861 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
8862 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
8863 }
8588 8864
8589 cpumask_complement(notcovered, covered); 8865 /* Set up physical groups */
8590 cpumask_and(tmpmask, notcovered, cpu_map); 8866 for (i = 0; i < nr_node_ids; i++)
8591 cpumask_and(tmpmask, tmpmask, domainspan); 8867 build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
8592 if (cpumask_empty(tmpmask))
8593 break;
8594 8868
8595 cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); 8869#ifdef CONFIG_NUMA
8596 if (cpumask_empty(tmpmask)) 8870 /* Set up node groups */
8597 continue; 8871 if (d.sd_allnodes)
8872 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
8598 8873
8599 sg = kmalloc_node(sizeof(struct sched_group) + 8874 for (i = 0; i < nr_node_ids; i++)
8600 cpumask_size(), 8875 if (build_numa_sched_groups(&d, cpu_map, i))
8601 GFP_KERNEL, i); 8876 goto error;
8602 if (!sg) {
8603 printk(KERN_WARNING
8604 "Can not alloc domain group for node %d\n", j);
8605 goto error;
8606 }
8607 sg->__cpu_power = 0;
8608 cpumask_copy(sched_group_cpus(sg), tmpmask);
8609 sg->next = prev->next;
8610 cpumask_or(covered, covered, tmpmask);
8611 prev->next = sg;
8612 prev = sg;
8613 }
8614 }
8615#endif 8877#endif
8616 8878
8617 /* Calculate CPU power for physical packages and nodes */ 8879 /* Calculate CPU power for physical packages and nodes */
8618#ifdef CONFIG_SCHED_SMT 8880#ifdef CONFIG_SCHED_SMT
8619 for_each_cpu(i, cpu_map) { 8881 for_each_cpu(i, cpu_map) {
8620 struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; 8882 sd = &per_cpu(cpu_domains, i).sd;
8621
8622 init_sched_groups_power(i, sd); 8883 init_sched_groups_power(i, sd);
8623 } 8884 }
8624#endif 8885#endif
8625#ifdef CONFIG_SCHED_MC 8886#ifdef CONFIG_SCHED_MC
8626 for_each_cpu(i, cpu_map) { 8887 for_each_cpu(i, cpu_map) {
8627 struct sched_domain *sd = &per_cpu(core_domains, i).sd; 8888 sd = &per_cpu(core_domains, i).sd;
8628
8629 init_sched_groups_power(i, sd); 8889 init_sched_groups_power(i, sd);
8630 } 8890 }
8631#endif 8891#endif
8632 8892
8633 for_each_cpu(i, cpu_map) { 8893 for_each_cpu(i, cpu_map) {
8634 struct sched_domain *sd = &per_cpu(phys_domains, i).sd; 8894 sd = &per_cpu(phys_domains, i).sd;
8635
8636 init_sched_groups_power(i, sd); 8895 init_sched_groups_power(i, sd);
8637 } 8896 }
8638 8897
8639#ifdef CONFIG_NUMA 8898#ifdef CONFIG_NUMA
8640 for (i = 0; i < nr_node_ids; i++) 8899 for (i = 0; i < nr_node_ids; i++)
8641 init_numa_sched_groups_power(sched_group_nodes[i]); 8900 init_numa_sched_groups_power(d.sched_group_nodes[i]);
8642 8901
8643 if (sd_allnodes) { 8902 if (d.sd_allnodes) {
8644 struct sched_group *sg; 8903 struct sched_group *sg;
8645 8904
8646 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 8905 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
8647 tmpmask); 8906 d.tmpmask);
8648 init_numa_sched_groups_power(sg); 8907 init_numa_sched_groups_power(sg);
8649 } 8908 }
8650#endif 8909#endif
8651 8910
8652 /* Attach the domains */ 8911 /* Attach the domains */
8653 for_each_cpu(i, cpu_map) { 8912 for_each_cpu(i, cpu_map) {
8654 struct sched_domain *sd;
8655#ifdef CONFIG_SCHED_SMT 8913#ifdef CONFIG_SCHED_SMT
8656 sd = &per_cpu(cpu_domains, i).sd; 8914 sd = &per_cpu(cpu_domains, i).sd;
8657#elif defined(CONFIG_SCHED_MC) 8915#elif defined(CONFIG_SCHED_MC)
@@ -8659,44 +8917,16 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
8659#else 8917#else
8660 sd = &per_cpu(phys_domains, i).sd; 8918 sd = &per_cpu(phys_domains, i).sd;
8661#endif 8919#endif
8662 cpu_attach_domain(sd, rd, i); 8920 cpu_attach_domain(sd, d.rd, i);
8663 } 8921 }
8664 8922
8665 err = 0; 8923 d.sched_group_nodes = NULL; /* don't free this we still need it */
8666 8924 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
8667free_tmpmask: 8925 return 0;
8668 free_cpumask_var(tmpmask);
8669free_send_covered:
8670 free_cpumask_var(send_covered);
8671free_this_core_map:
8672 free_cpumask_var(this_core_map);
8673free_this_sibling_map:
8674 free_cpumask_var(this_sibling_map);
8675free_nodemask:
8676 free_cpumask_var(nodemask);
8677free_notcovered:
8678#ifdef CONFIG_NUMA
8679 free_cpumask_var(notcovered);
8680free_covered:
8681 free_cpumask_var(covered);
8682free_domainspan:
8683 free_cpumask_var(domainspan);
8684out:
8685#endif
8686 return err;
8687
8688free_sched_groups:
8689#ifdef CONFIG_NUMA
8690 kfree(sched_group_nodes);
8691#endif
8692 goto free_tmpmask;
8693 8926
8694#ifdef CONFIG_NUMA
8695error: 8927error:
8696 free_sched_groups(cpu_map, tmpmask); 8928 __free_domain_allocs(&d, alloc_state, cpu_map);
8697 free_rootdomain(rd); 8929 return -ENOMEM;
8698 goto free_tmpmask;
8699#endif
8700} 8930}
8701 8931
8702static int build_sched_domains(const struct cpumask *cpu_map) 8932static int build_sched_domains(const struct cpumask *cpu_map)
@@ -8704,7 +8934,7 @@ static int build_sched_domains(const struct cpumask *cpu_map)
8704 return __build_sched_domains(cpu_map, NULL); 8934 return __build_sched_domains(cpu_map, NULL);
8705} 8935}
8706 8936
8707static struct cpumask *doms_cur; /* current sched domains */ 8937static cpumask_var_t *doms_cur; /* current sched domains */
8708static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 8938static int ndoms_cur; /* number of sched domains in 'doms_cur' */
8709static struct sched_domain_attr *dattr_cur; 8939static struct sched_domain_attr *dattr_cur;
8710 /* attributes of custom domains in 'doms_cur' */ 8940 /* attributes of custom domains in 'doms_cur' */
@@ -8726,6 +8956,31 @@ int __attribute__((weak)) arch_update_cpu_topology(void)
8726 return 0; 8956 return 0;
8727} 8957}
8728 8958
8959cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
8960{
8961 int i;
8962 cpumask_var_t *doms;
8963
8964 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
8965 if (!doms)
8966 return NULL;
8967 for (i = 0; i < ndoms; i++) {
8968 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
8969 free_sched_domains(doms, i);
8970 return NULL;
8971 }
8972 }
8973 return doms;
8974}
8975
8976void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
8977{
8978 unsigned int i;
8979 for (i = 0; i < ndoms; i++)
8980 free_cpumask_var(doms[i]);
8981 kfree(doms);
8982}
8983
8729/* 8984/*
8730 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 8985 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
8731 * For now this just excludes isolated cpus, but could be used to 8986 * For now this just excludes isolated cpus, but could be used to
@@ -8737,12 +8992,12 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
8737 8992
8738 arch_update_cpu_topology(); 8993 arch_update_cpu_topology();
8739 ndoms_cur = 1; 8994 ndoms_cur = 1;
8740 doms_cur = kmalloc(cpumask_size(), GFP_KERNEL); 8995 doms_cur = alloc_sched_domains(ndoms_cur);
8741 if (!doms_cur) 8996 if (!doms_cur)
8742 doms_cur = fallback_doms; 8997 doms_cur = &fallback_doms;
8743 cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map); 8998 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
8744 dattr_cur = NULL; 8999 dattr_cur = NULL;
8745 err = build_sched_domains(doms_cur); 9000 err = build_sched_domains(doms_cur[0]);
8746 register_sched_domain_sysctl(); 9001 register_sched_domain_sysctl();
8747 9002
8748 return err; 9003 return err;
@@ -8792,19 +9047,19 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8792 * doms_new[] to the current sched domain partitioning, doms_cur[]. 9047 * doms_new[] to the current sched domain partitioning, doms_cur[].
8793 * It destroys each deleted domain and builds each new domain. 9048 * It destroys each deleted domain and builds each new domain.
8794 * 9049 *
8795 * 'doms_new' is an array of cpumask's of length 'ndoms_new'. 9050 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
8796 * The masks don't intersect (don't overlap). We should set up one 9051 * The masks don't intersect (don't overlap). We should set up one
8797 * sched domain for each mask. CPUs not in any of the cpumasks will 9052 * sched domain for each mask. CPUs not in any of the cpumasks will
8798 * not be load balanced. If the same cpumask appears both in the 9053 * not be load balanced. If the same cpumask appears both in the
8799 * current 'doms_cur' domains and in the new 'doms_new', we can leave 9054 * current 'doms_cur' domains and in the new 'doms_new', we can leave
8800 * it as it is. 9055 * it as it is.
8801 * 9056 *
8802 * The passed in 'doms_new' should be kmalloc'd. This routine takes 9057 * The passed in 'doms_new' should be allocated using
8803 * ownership of it and will kfree it when done with it. If the caller 9058 * alloc_sched_domains. This routine takes ownership of it and will
8804 * failed the kmalloc call, then it can pass in doms_new == NULL && 9059 * free_sched_domains it when done with it. If the caller failed the
8805 * ndoms_new == 1, and partition_sched_domains() will fall back to 9060 * and partition_sched_domains() will fall back to the single partition
8806 * the single partition 'fallback_doms'; it also forces the domains 9061 * 'fallback_doms'; it also forces the domains to be rebuilt.
8807 * to be rebuilt. 9062 *
8808 * 9063 *
8809 * If doms_new == NULL it will be replaced with cpu_online_mask. 9064 * If doms_new == NULL it will be replaced with cpu_online_mask.
8810 * ndoms_new == 0 is a special case for destroying existing domains, 9065 * ndoms_new == 0 is a special case for destroying existing domains,
@@ -8812,8 +9067,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
8812 * 9067 *
8813 * Call with hotplug lock held 9068 * Call with hotplug lock held
8814 */ 9069 */
8815/* FIXME: Change to struct cpumask *doms_new[] */ 9070void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
8816void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8817 struct sched_domain_attr *dattr_new) 9071 struct sched_domain_attr *dattr_new)
8818{ 9072{
8819 int i, j, n; 9073 int i, j, n;
@@ -8832,40 +9086,40 @@ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
8832 /* Destroy deleted domains */ 9086 /* Destroy deleted domains */
8833 for (i = 0; i < ndoms_cur; i++) { 9087 for (i = 0; i < ndoms_cur; i++) {
8834 for (j = 0; j < n && !new_topology; j++) { 9088 for (j = 0; j < n && !new_topology; j++) {
8835 if (cpumask_equal(&doms_cur[i], &doms_new[j]) 9089 if (cpumask_equal(doms_cur[i], doms_new[j])
8836 && dattrs_equal(dattr_cur, i, dattr_new, j)) 9090 && dattrs_equal(dattr_cur, i, dattr_new, j))
8837 goto match1; 9091 goto match1;
8838 } 9092 }
8839 /* no match - a current sched domain not in new doms_new[] */ 9093 /* no match - a current sched domain not in new doms_new[] */
8840 detach_destroy_domains(doms_cur + i); 9094 detach_destroy_domains(doms_cur[i]);
8841match1: 9095match1:
8842 ; 9096 ;
8843 } 9097 }
8844 9098
8845 if (doms_new == NULL) { 9099 if (doms_new == NULL) {
8846 ndoms_cur = 0; 9100 ndoms_cur = 0;
8847 doms_new = fallback_doms; 9101 doms_new = &fallback_doms;
8848 cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map); 9102 cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map);
8849 WARN_ON_ONCE(dattr_new); 9103 WARN_ON_ONCE(dattr_new);
8850 } 9104 }
8851 9105
8852 /* Build new domains */ 9106 /* Build new domains */
8853 for (i = 0; i < ndoms_new; i++) { 9107 for (i = 0; i < ndoms_new; i++) {
8854 for (j = 0; j < ndoms_cur && !new_topology; j++) { 9108 for (j = 0; j < ndoms_cur && !new_topology; j++) {
8855 if (cpumask_equal(&doms_new[i], &doms_cur[j]) 9109 if (cpumask_equal(doms_new[i], doms_cur[j])
8856 && dattrs_equal(dattr_new, i, dattr_cur, j)) 9110 && dattrs_equal(dattr_new, i, dattr_cur, j))
8857 goto match2; 9111 goto match2;
8858 } 9112 }
8859 /* no match - add a new doms_new */ 9113 /* no match - add a new doms_new */
8860 __build_sched_domains(doms_new + i, 9114 __build_sched_domains(doms_new[i],
8861 dattr_new ? dattr_new + i : NULL); 9115 dattr_new ? dattr_new + i : NULL);
8862match2: 9116match2:
8863 ; 9117 ;
8864 } 9118 }
8865 9119
8866 /* Remember the new sched domains */ 9120 /* Remember the new sched domains */
8867 if (doms_cur != fallback_doms) 9121 if (doms_cur != &fallback_doms)
8868 kfree(doms_cur); 9122 free_sched_domains(doms_cur, ndoms_cur);
8869 kfree(dattr_cur); /* kfree(NULL) is safe */ 9123 kfree(dattr_cur); /* kfree(NULL) is safe */
8870 doms_cur = doms_new; 9124 doms_cur = doms_new;
8871 dattr_cur = dattr_new; 9125 dattr_cur = dattr_new;
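partition_sched_domains() is the entry point callers such as the cpuset code use to hand over a new partitioning. A hedged sketch of the calling convention described in the comment above; the function name is invented, and the single partition simply spans the online CPUs:

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/sched.h>

/* Illustrative only: rebuild the scheduler domains as one partition
 * spanning all online CPUs, with the hotplug lock held as required. */
static void example_rebuild_single_partition(void)
{
	cpumask_var_t *doms = alloc_sched_domains(1);

	get_online_cpus();
	if (doms)
		cpumask_copy(doms[0], cpu_online_mask);
	/*
	 * With doms == NULL and ndoms == 1 the call falls back to
	 * 'fallback_doms'; otherwise it takes ownership of 'doms' and
	 * will release it with free_sched_domains() when done.
	 */
	partition_sched_domains(1, doms, NULL);
	put_online_cpus();
}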
@@ -9015,6 +9269,7 @@ void __init sched_init_smp(void)
9015 cpumask_var_t non_isolated_cpus; 9269 cpumask_var_t non_isolated_cpus;
9016 9270
9017 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 9271 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
9272 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9018 9273
9019#if defined(CONFIG_NUMA) 9274#if defined(CONFIG_NUMA)
9020 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), 9275 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@ -9046,7 +9301,6 @@ void __init sched_init_smp(void)
9046 sched_init_granularity(); 9301 sched_init_granularity();
9047 free_cpumask_var(non_isolated_cpus); 9302 free_cpumask_var(non_isolated_cpus);
9048 9303
9049 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
9050 init_sched_rt_class(); 9304 init_sched_rt_class();
9051} 9305}
9052#else 9306#else
@@ -9187,10 +9441,6 @@ void __init sched_init(void)
9187#ifdef CONFIG_CPUMASK_OFFSTACK 9441#ifdef CONFIG_CPUMASK_OFFSTACK
9188 alloc_size += num_possible_cpus() * cpumask_size(); 9442 alloc_size += num_possible_cpus() * cpumask_size();
9189#endif 9443#endif
9190 /*
9191 * As sched_init() is called before page_alloc is setup,
9192 * we use alloc_bootmem().
9193 */
9194 if (alloc_size) { 9444 if (alloc_size) {
9195 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 9445 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
9196 9446
@@ -9259,6 +9509,10 @@ void __init sched_init(void)
9259#endif /* CONFIG_USER_SCHED */ 9509#endif /* CONFIG_USER_SCHED */
9260#endif /* CONFIG_GROUP_SCHED */ 9510#endif /* CONFIG_GROUP_SCHED */
9261 9511
9512#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9513 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
9514 __alignof__(unsigned long));
9515#endif
9262 for_each_possible_cpu(i) { 9516 for_each_possible_cpu(i) {
9263 struct rq *rq; 9517 struct rq *rq;
9264 9518
@@ -9304,11 +9558,11 @@ void __init sched_init(void)
9304 * system cpu resource, based on the weight assigned to root 9558 * system cpu resource, based on the weight assigned to root
9305 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished 9559 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9306 * by letting tasks of init_task_group sit in a separate cfs_rq 9560 * by letting tasks of init_task_group sit in a separate cfs_rq
9307 * (init_cfs_rq) and having one entity represent this group of 9561 * (init_tg_cfs_rq) and having one entity represent this group of
9308 * tasks in rq->cfs (i.e init_task_group->se[] != NULL). 9562 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9309 */ 9563 */
9310 init_tg_cfs_entry(&init_task_group, 9564 init_tg_cfs_entry(&init_task_group,
9311 &per_cpu(init_cfs_rq, i), 9565 &per_cpu(init_tg_cfs_rq, i),
9312 &per_cpu(init_sched_entity, i), i, 1, 9566 &per_cpu(init_sched_entity, i), i, 1,
9313 root_task_group.se[i]); 9567 root_task_group.se[i]);
9314 9568
@@ -9334,12 +9588,15 @@ void __init sched_init(void)
9334#ifdef CONFIG_SMP 9588#ifdef CONFIG_SMP
9335 rq->sd = NULL; 9589 rq->sd = NULL;
9336 rq->rd = NULL; 9590 rq->rd = NULL;
9591 rq->post_schedule = 0;
9337 rq->active_balance = 0; 9592 rq->active_balance = 0;
9338 rq->next_balance = jiffies; 9593 rq->next_balance = jiffies;
9339 rq->push_cpu = 0; 9594 rq->push_cpu = 0;
9340 rq->cpu = i; 9595 rq->cpu = i;
9341 rq->online = 0; 9596 rq->online = 0;
9342 rq->migration_thread = NULL; 9597 rq->migration_thread = NULL;
9598 rq->idle_stamp = 0;
9599 rq->avg_idle = 2*sysctl_sched_migration_cost;
9343 INIT_LIST_HEAD(&rq->migration_queue); 9600 INIT_LIST_HEAD(&rq->migration_queue);
9344 rq_attach_root(rq, &def_root_domain); 9601 rq_attach_root(rq, &def_root_domain);
9345#endif 9602#endif
@@ -9383,28 +9640,37 @@ void __init sched_init(void)
9383 current->sched_class = &fair_sched_class; 9640 current->sched_class = &fair_sched_class;
9384 9641
9385 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 9642 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
9386 alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 9643 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
9387#ifdef CONFIG_SMP 9644#ifdef CONFIG_SMP
9388#ifdef CONFIG_NO_HZ 9645#ifdef CONFIG_NO_HZ
9389 alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 9646 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
9390 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 9647 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
9391#endif 9648#endif
9392 alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 9649 /* May be allocated at isolcpus cmdline parse time */
9650 if (cpu_isolated_map == NULL)
9651 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
9393#endif /* SMP */ 9652#endif /* SMP */
9394 9653
9395 perf_counter_init(); 9654 perf_event_init();
9396 9655
9397 scheduler_running = 1; 9656 scheduler_running = 1;
9398} 9657}
9399 9658
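sched_init() above switches several masks from alloc_cpumask_var() to zalloc_cpumask_var() so they start out empty even when CONFIG_CPUMASK_OFFSTACK places them on the heap. A minimal sketch of that allocation pattern; the mask and function names are made up:

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/init.h>

static cpumask_var_t example_mask;	/* hypothetical module-level mask */

static int __init example_mask_init(void)
{
	/* zalloc_cpumask_var() allocates (when off-stack) and zeroes the
	 * mask, so no separate cpumask_clear() is needed afterwards. */
	if (!zalloc_cpumask_var(&example_mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_set_cpu(0, example_mask);
	return 0;
}

static void example_mask_exit(void)
{
	free_cpumask_var(example_mask);
}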
9400#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9659#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9401void __might_sleep(char *file, int line) 9660static inline int preempt_count_equals(int preempt_offset)
9661{
9662 int nested = preempt_count() & ~PREEMPT_ACTIVE;
9663
9664 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9665}
9666
9667void __might_sleep(char *file, int line, int preempt_offset)
9402{ 9668{
9403#ifdef in_atomic 9669#ifdef in_atomic
9404 static unsigned long prev_jiffy; /* ratelimiting */ 9670 static unsigned long prev_jiffy; /* ratelimiting */
9405 9671
9406 if ((!in_atomic() && !irqs_disabled()) || 9672 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
9407 system_state != SYSTEM_RUNNING || oops_in_progress) 9673 system_state != SYSTEM_RUNNING || oops_in_progress)
9408 return; 9674 return;
9409 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 9675 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
9410 return; 9676 return;
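The reworked check compares preempt_count() against a caller-supplied offset instead of a bare in_atomic() test, but the way code opts in is unchanged. A hedged example of the kind of path the annotation is meant to catch; the helper and lock names are invented:

#include <linux/kernel.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(example_lock);

/* Hypothetical helper that may block: annotating it with might_sleep()
 * lets __might_sleep() complain if it is ever reached from atomic
 * context, even on configurations where it would not actually sleep. */
static void example_update(void)
{
	might_sleep();

	mutex_lock(&example_lock);
	/* ... touch data protected by the mutex ... */
	mutex_unlock(&example_lock);
}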
@@ -10157,7 +10423,7 @@ static int sched_rt_global_constraints(void)
10157#endif /* CONFIG_RT_GROUP_SCHED */ 10423#endif /* CONFIG_RT_GROUP_SCHED */
10158 10424
10159int sched_rt_handler(struct ctl_table *table, int write, 10425int sched_rt_handler(struct ctl_table *table, int write,
10160 struct file *filp, void __user *buffer, size_t *lenp, 10426 void __user *buffer, size_t *lenp,
10161 loff_t *ppos) 10427 loff_t *ppos)
10162{ 10428{
10163 int ret; 10429 int ret;
@@ -10168,7 +10434,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
10168 old_period = sysctl_sched_rt_period; 10434 old_period = sysctl_sched_rt_period;
10169 old_runtime = sysctl_sched_rt_runtime; 10435 old_runtime = sysctl_sched_rt_runtime;
10170 10436
10171 ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); 10437 ret = proc_dointvec(table, write, buffer, lenp, ppos);
10172 10438
10173 if (!ret && write) { 10439 if (!ret && write) {
10174 ret = sched_rt_global_constraints(); 10440 ret = sched_rt_global_constraints();
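sched_rt_handler() above is adapted to the sysctl interface that drops the struct file argument from proc handlers. A sketch of a custom handler written against the same new signature, using the same read-then-validate pattern; the tunable and handler names are illustrative:

#include <linux/errno.h>
#include <linux/sysctl.h>

static int example_value;	/* hypothetical tunable */

/* Read or write the integer, then sanity-check the result on writes. */
static int example_handler(struct ctl_table *table, int write,
			   void __user *buffer, size_t *lenp, loff_t *ppos)
{
	int old = example_value;
	int ret = proc_dointvec(table, write, buffer, lenp, ppos);

	if (!ret && write && example_value < 0) {
		example_value = old;	/* reject negative values */
		ret = -EINVAL;
	}
	return ret;
}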
@@ -10222,8 +10488,7 @@ cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
10222} 10488}
10223 10489
10224static int 10490static int
10225cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10491cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
10226 struct task_struct *tsk)
10227{ 10492{
10228#ifdef CONFIG_RT_GROUP_SCHED 10493#ifdef CONFIG_RT_GROUP_SCHED
10229 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) 10494 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
@@ -10233,15 +10498,45 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10233 if (tsk->sched_class != &fair_sched_class) 10498 if (tsk->sched_class != &fair_sched_class)
10234 return -EINVAL; 10499 return -EINVAL;
10235#endif 10500#endif
10501 return 0;
10502}
10236 10503
10504static int
10505cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10506 struct task_struct *tsk, bool threadgroup)
10507{
10508 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
10509 if (retval)
10510 return retval;
10511 if (threadgroup) {
10512 struct task_struct *c;
10513 rcu_read_lock();
10514 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10515 retval = cpu_cgroup_can_attach_task(cgrp, c);
10516 if (retval) {
10517 rcu_read_unlock();
10518 return retval;
10519 }
10520 }
10521 rcu_read_unlock();
10522 }
10237 return 0; 10523 return 0;
10238} 10524}
10239 10525
10240static void 10526static void
10241cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 10527cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
10242 struct cgroup *old_cont, struct task_struct *tsk) 10528 struct cgroup *old_cont, struct task_struct *tsk,
10529 bool threadgroup)
10243{ 10530{
10244 sched_move_task(tsk); 10531 sched_move_task(tsk);
10532 if (threadgroup) {
10533 struct task_struct *c;
10534 rcu_read_lock();
10535 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
10536 sched_move_task(c);
10537 }
10538 rcu_read_unlock();
10539 }
10245} 10540}
10246 10541
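The can_attach/attach pair above gains a 'threadgroup' flag and walks tsk->thread_group under RCU so an entire thread group can be validated or moved in one pass. A stripped-down sketch of that iteration pattern; the per-thread predicate is a placeholder, not the scheduler's actual check:

#include <linux/errno.h>
#include <linux/rculist.h>
#include <linux/sched.h>

/* Hypothetical per-thread predicate: only allow SCHED_NORMAL tasks. */
static int example_thread_ok(struct task_struct *t)
{
	return t->policy == SCHED_NORMAL ? 0 : -EINVAL;
}

/* Validate the leader and, when 'threadgroup' is set, every sibling. */
static int example_can_attach(struct task_struct *tsk, bool threadgroup)
{
	struct task_struct *c;
	int ret = example_thread_ok(tsk);

	if (ret || !threadgroup)
		return ret;

	rcu_read_lock();
	list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
		ret = example_thread_ok(c);
		if (ret)
			break;
	}
	rcu_read_unlock();
	return ret;
}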
10247#ifdef CONFIG_FAIR_GROUP_SCHED 10542#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -10581,3 +10876,114 @@ struct cgroup_subsys cpuacct_subsys = {
10581 .subsys_id = cpuacct_subsys_id, 10876 .subsys_id = cpuacct_subsys_id,
10582}; 10877};
10583#endif /* CONFIG_CGROUP_CPUACCT */ 10878#endif /* CONFIG_CGROUP_CPUACCT */
10879
10880#ifndef CONFIG_SMP
10881
10882int rcu_expedited_torture_stats(char *page)
10883{
10884 return 0;
10885}
10886EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10887
10888void synchronize_sched_expedited(void)
10889{
10890}
10891EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10892
10893#else /* #ifndef CONFIG_SMP */
10894
10895static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
10896static DEFINE_MUTEX(rcu_sched_expedited_mutex);
10897
10898#define RCU_EXPEDITED_STATE_POST -2
10899#define RCU_EXPEDITED_STATE_IDLE -1
10900
10901static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10902
10903int rcu_expedited_torture_stats(char *page)
10904{
10905 int cnt = 0;
10906 int cpu;
10907
10908 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
10909 for_each_online_cpu(cpu) {
10910 cnt += sprintf(&page[cnt], " %d:%d",
10911 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
10912 }
10913 cnt += sprintf(&page[cnt], "\n");
10914 return cnt;
10915}
10916EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
10917
10918static long synchronize_sched_expedited_count;
10919
10920/*
10921 * Wait for an rcu-sched grace period to elapse, but use a "big hammer"
10922 * approach to force the grace period to end quickly. This consumes
10923 * significant time on all CPUs, and is thus not recommended for
10924 * any sort of common-case code.
10925 *
10926 * Note that it is illegal to call this function while holding any
10927 * lock that is acquired by a CPU-hotplug notifier. Failing to
10928 * observe this restriction will result in deadlock.
10929 */
10930void synchronize_sched_expedited(void)
10931{
10932 int cpu;
10933 unsigned long flags;
10934 bool need_full_sync = 0;
10935 struct rq *rq;
10936 struct migration_req *req;
10937 long snap;
10938 int trycount = 0;
10939
10940 smp_mb(); /* ensure prior mod happens before capturing snap. */
10941 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
10942 get_online_cpus();
10943 while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
10944 put_online_cpus();
10945 if (trycount++ < 10)
10946 udelay(trycount * num_online_cpus());
10947 else {
10948 synchronize_sched();
10949 return;
10950 }
10951 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
10952 smp_mb(); /* ensure test happens before caller kfree */
10953 return;
10954 }
10955 get_online_cpus();
10956 }
10957 rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
10958 for_each_online_cpu(cpu) {
10959 rq = cpu_rq(cpu);
10960 req = &per_cpu(rcu_migration_req, cpu);
10961 init_completion(&req->done);
10962 req->task = NULL;
10963 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10964 spin_lock_irqsave(&rq->lock, flags);
10965 list_add(&req->list, &rq->migration_queue);
10966 spin_unlock_irqrestore(&rq->lock, flags);
10967 wake_up_process(rq->migration_thread);
10968 }
10969 for_each_online_cpu(cpu) {
10970 rcu_expedited_state = cpu;
10971 req = &per_cpu(rcu_migration_req, cpu);
10972 rq = cpu_rq(cpu);
10973 wait_for_completion(&req->done);
10974 spin_lock_irqsave(&rq->lock, flags);
10975 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10976 need_full_sync = 1;
10977 req->dest_cpu = RCU_MIGRATION_IDLE;
10978 spin_unlock_irqrestore(&rq->lock, flags);
10979 }
10980 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10981 synchronize_sched_expedited_count++;
10982 mutex_unlock(&rcu_sched_expedited_mutex);
10983 put_online_cpus();
10984 if (need_full_sync)
10985 synchronize_sched();
10986}
10987EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
10988
10989#endif /* #else #ifndef CONFIG_SMP */
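synchronize_sched_expedited() gives updaters a faster, but far more expensive, way to wait out rcu-sched readers. A hedged example of the usual replace-then-free pattern built on it, assuming the caller serializes updaters itself; the data structure and function names are made up:

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_conf {
	int threshold;
};

static struct example_conf *example_conf;	/* read under rcu_read_lock_sched() */

/* Publish a new configuration, wait for all pre-existing rcu-sched
 * readers, then free the old one.  The expedited variant burns CPU
 * time on every online CPU in exchange for a much shorter wait, and
 * must respect the hotplug-notifier locking rule noted above. */
static int example_set_threshold(int value)
{
	struct example_conf *new, *old;

	new = kmalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->threshold = value;

	old = example_conf;
	rcu_assign_pointer(example_conf, new);
	synchronize_sched_expedited();
	kfree(old);
	return 0;
}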