path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	709
1 file changed, 180 insertions(+), 529 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 297d1a0eedb0..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
+#include <asm/mutex.h>
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
+#include "sched_autogroup.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+
+	atomic_t load_weight;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +272,18 @@ struct task_group {
 	struct task_group *parent;
 	struct list_head siblings;
 	struct list_head children;
-};
 
-#define root_task_group init_task_group
+#ifdef CONFIG_SCHED_AUTOGROUP
+	struct autogroup *autogroup;
+#endif
+};
 
-/* task_group_lock serializes add/remove of task groups and also changes to
- * a task group's cpu shares.
- */
+/* task_group_lock serializes the addition/removal of task groups */
 static DEFINE_SPINLOCK(task_group_lock);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-#ifdef CONFIG_SMP
-static int root_task_group_empty(void)
-{
-	return list_empty(&root_task_group.children);
-}
-#endif
-
-# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
+# define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD
 
 /*
  * A weight of 0 or 1 can cause arithmetics problems.
@@ -299,13 +296,13 @@ static int root_task_group_empty(void)
 #define MIN_SHARES	2
 #define MAX_SHARES	(1UL << 18)
 
-static int init_task_group_load = INIT_TASK_GROUP_LOAD;
+static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
 #endif
 
 /* Default task group.
  *	Every task in system belong to this group at bootup.
  */
-struct task_group init_task_group;
+struct task_group root_task_group;
 
 #endif	/* CONFIG_CGROUP_SCHED */
 
@@ -342,6 +339,7 @@ struct cfs_rq {
 	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
 	 * list is used during load balance.
 	 */
+	int on_list;
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
@@ -360,14 +358,17 @@ struct cfs_rq {
 	unsigned long h_load;
 
 	/*
-	 * this cpu's part of tg->shares
+	 * Maintaining per-cpu shares distribution for group scheduling
+	 *
+	 * load_stamp is the last time we updated the load average
+	 * load_last is the last time we updated the load average and saw load
+	 * load_unacc_exec_time is currently unaccounted execution time
 	 */
-	unsigned long shares;
+	u64 load_avg;
+	u64 load_period;
+	u64 load_stamp, load_last, load_unacc_exec_time;
 
-	/*
-	 * load.weight at the time we set shares
-	 */
-	unsigned long rq_weight;
+	unsigned long load_contribution;
 #endif
 #endif
 };
@@ -552,9 +553,6 @@ struct rq {
 	/* try_to_wake_up() stats */
 	unsigned int ttwu_count;
 	unsigned int ttwu_local;
-
-	/* BKL stats */
-	unsigned int bkl_count;
 #endif
 };
 
@@ -605,11 +603,17 @@ static inline int cpu_of(struct rq *rq)
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
+	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
+	if (p->flags & PF_EXITING)
+		return &root_task_group;
+
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
 			lockdep_is_held(&task_rq(p)->lock));
-	return container_of(css, struct task_group, css);
+	tg = container_of(css, struct task_group, css);
+
+	return autogroup_task_group(p, tg);
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -737,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 	buf[cnt] = 0;
 	cmp = strstrip(buf);
 
-	if (strncmp(buf, "NO_", 3) == 0) {
+	if (strncmp(cmp, "NO_", 3) == 0) {
 		neg = 1;
 		cmp += 3;
 	}
@@ -793,20 +797,6 @@ late_initcall(sched_init_debug);
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
-/*
  * period over which we average the RT time consumption, measured
  * in ms.
  *
@@ -1355,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 	lw->inv_weight = 0;
 }
 
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+	lw->weight = w;
+	lw->inv_weight = 0;
+}
+
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-				    unsigned long sd_shares,
-				    unsigned long sd_rq_weight,
-				    unsigned long *usd_rq_weight)
-{
-	unsigned long shares, rq_weight;
-	int boost = 0;
-
-	rq_weight = usd_rq_weight[cpu];
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *             \Sum_j shares_j * rq_weight_i
-	 * shares_i =  -----------------------------
-	 *                  \Sum_j rq_weight_j
-	 */
-	shares = (sd_shares * rq_weight) / sd_rq_weight;
-	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
-	if (abs(shares - tg->se[cpu]->load.weight) >
-			sysctl_sched_shares_thresh) {
-		struct rq *rq = cpu_rq(cpu);
-		unsigned long flags;
-
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		__set_se_shares(tg->se[cpu], shares);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-	}
-}
-
-/*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
- */
-static int tg_shares_up(struct task_group *tg, void *data)
-{
-	unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
-	unsigned long *usd_rq_weight;
-	struct sched_domain *sd = data;
-	unsigned long flags;
-	int i;
-
-	if (!tg->se[0])
-		return 0;
-
-	local_irq_save(flags);
-	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
-	for_each_cpu(i, sched_domain_span(sd)) {
-		weight = tg->cfs_rq[i]->load.weight;
-		usd_rq_weight[i] = weight;
-
-		rq_weight += weight;
-		/*
-		 * If there are currently no tasks on the cpu pretend there
-		 * is one of average load so that when a new task gets to
-		 * run here it will not get delayed by group starvation.
-		 */
-		if (!weight)
-			weight = NICE_0_LOAD;
-
-		sum_weight += weight;
-		shares += tg->cfs_rq[i]->shares;
-	}
-
-	if (!rq_weight)
-		rq_weight = sum_weight;
-
-	if ((!shares && rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
-
-	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
-
-	local_irq_restore(flags);
-
-	return 0;
-}
-
 /*
  * Compute the cpu's hierarchical load factor for each task group.
  * This needs to be done in a top-down fashion because the load of a child
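The hunk above removes the old per-cpu shares distribution: each cpu got shares_i = sd_shares * rq_weight_i / Sum_j rq_weight_j, clamped to [MIN_SHARES, MAX_SHARES], with NICE_0_LOAD substituted for an idle runqueue. As a minimal stand-alone sketch of that arithmetic only (an illustration with made-up numbers, not part of the patch; NICE_0_LOAD taken here as 1024):

#include <stdio.h>

#define MIN_SHARES	2UL
#define MAX_SHARES	(1UL << 18)
#define NICE_0_LOAD	1024UL	/* assumed value for the example */

/* Mirrors the math of the removed update_group_shares_cpu(). */
static unsigned long group_shares_cpu(unsigned long sd_shares,
				      unsigned long rq_weight,
				      unsigned long sd_rq_weight)
{
	unsigned long shares;

	if (!rq_weight)
		rq_weight = NICE_0_LOAD;	/* pretend one task of average load */

	shares = (sd_shares * rq_weight) / sd_rq_weight;
	if (shares < MIN_SHARES)
		shares = MIN_SHARES;
	if (shares > MAX_SHARES)
		shares = MAX_SHARES;
	return shares;
}

int main(void)
{
	/* A group with 1024 shares: a cpu carrying 3072 of the 4096 total
	 * runqueue weight gets 768 shares, the cpu with the remaining
	 * 1024 gets 256. */
	printf("%lu\n", group_shares_cpu(1024, 3072, 4096));	/* 768 */
	printf("%lu\n", group_shares_cpu(1024, 1024, 4096));	/* 256 */
	return 0;
}

The rest of the patch replaces this domain-wide recomputation with the per-cfs_rq load tracking fields added to struct cfs_rq earlier in the diff.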
@@ -1652,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data)
 		load = cpu_rq(cpu)->load.weight;
 	} else {
 		load = tg->parent->cfs_rq[cpu]->h_load;
-		load *= tg->cfs_rq[cpu]->shares;
+		load *= tg->se[cpu]->load.weight;
 		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
 	}
 
@@ -1661,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data)
 	return 0;
 }
 
-static void update_shares(struct sched_domain *sd)
-{
-	s64 elapsed;
-	u64 now;
-
-	if (root_task_group_empty())
-		return;
-
-	now = local_clock();
-	elapsed = now - sd->last_update;
-
-	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
-		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, sd);
-	}
-}
-
 static void update_h_load(long cpu)
 {
 	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
 }
 
-#else
-
-static inline void update_shares(struct sched_domain *sd)
-{
-}
-
 #endif
 
 #ifdef CONFIG_PREEMPT
@@ -1810,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 #endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
-	cfs_rq->shares = shares;
-#endif
-}
-#endif
-
 static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
@@ -2063,6 +1932,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_autogroup.c"
 #include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
@@ -2255,10 +2125,8 @@ static int migration_cpu_stop(void *data);
  * The task's runqueue lock must be held.
  * Returns true if you have to wait for migration thread.
 */
-static bool migrate_task(struct task_struct *p, int dest_cpu)
+static bool migrate_task(struct task_struct *p, struct rq *rq)
 {
-	struct rq *rq = task_rq(p);
-
 	/*
 	 * If the task is not on a runqueue (and not running), then
 	 * the next wake-up will properly place the task.
@@ -2438,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		return dest_cpu;
 
 	/* No more Mr. Nice Guy. */
-	if (unlikely(dest_cpu >= nr_cpu_ids)) {
-		dest_cpu = cpuset_cpus_allowed_fallback(p);
-		/*
-		 * Don't tell them about moving exiting tasks or
-		 * kernel threads (both mm NULL), since they never
-		 * leave kernel.
-		 */
-		if (p->mm && printk_ratelimit()) {
-			printk(KERN_INFO "process %d (%s) no "
-					"longer affine to cpu%d\n",
-					task_pid_nr(p), p->comm, cpu);
-		}
+	dest_cpu = cpuset_cpus_allowed_fallback(p);
+	/*
+	 * Don't tell them about moving exiting tasks or
+	 * kernel threads (both mm NULL), since they never
+	 * leave kernel.
+	 */
+	if (p->mm && printk_ratelimit()) {
+		printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+				task_pid_nr(p), p->comm, cpu);
 	}
 
 	return dest_cpu;
@@ -2640,7 +2505,7 @@ out:
  * try_to_wake_up_local - try to wake up a local task with rq lock held
  * @p: the thread to be awakened
  *
- * Put @p on the run-queue if it's not alredy there. The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
  * ensure that this_rq() is locked, @p is bound to this_rq() and not
  * the current task. this_rq() stays locked over invocation.
  */
@@ -2785,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+#ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
+#endif
 
 	put_cpu();
 }
@@ -3549,7 +3416,7 @@ void sched_exec(void)
 	 * select_task_rq() can race against ->cpus_allowed
 	 */
 	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-	    likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
+	    likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
 		struct migration_arg arg = { p, dest_cpu };
 
 		task_rq_unlock(rq, &flags);
@@ -4020,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
 	schedstat_inc(this_rq(), sched_count);
 #ifdef CONFIG_SCHEDSTATS
 	if (unlikely(prev->lock_depth >= 0)) {
-		schedstat_inc(this_rq(), bkl_count);
+		schedstat_inc(this_rq(), rq_sched_info.bkl_count);
 		schedstat_inc(prev, sched_info.bkl_count);
 	}
 #endif
@@ -4214,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
 		if (task_thread_info(rq->curr) != owner || need_resched())
 			return 0;
 
-		cpu_relax();
+		arch_mutex_cpu_relax();
 	}
 
 	return 1;
@@ -4526,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
 * This waits for either a completion of a specific task to be signaled or for a
 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
 */
-unsigned long __sched
+long __sched
 wait_for_completion_interruptible_timeout(struct completion *x,
 					  unsigned long timeout)
 {
@@ -4559,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
 * signaled or for a specified timeout to expire. It can be
 * interrupted by a kill signal. The timeout is in jiffies.
 */
-unsigned long __sched
+long __sched
 wait_for_completion_killable_timeout(struct completion *x,
 				     unsigned long timeout)
 {
@@ -4901,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p)
 }
 
 static int __sched_setscheduler(struct task_struct *p, int policy,
-				struct sched_param *param, bool user)
+				const struct sched_param *param, bool user)
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	unsigned long flags;
@@ -5004,7 +4871,8 @@ recheck:
 		 * assigned.
 		 */
 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
-				task_group(p)->rt_bandwidth.rt_runtime == 0) {
+				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+				!task_group_is_autogroup(task_group(p))) {
 			__task_rq_unlock(rq);
 			raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 			return -EPERM;
@@ -5056,7 +4924,7 @@ recheck:
 * NOTE that the task may be already dead.
 */
 int sched_setscheduler(struct task_struct *p, int policy,
-		       struct sched_param *param)
+		       const struct sched_param *param)
 {
 	return __sched_setscheduler(p, policy, param, true);
 }
@@ -5074,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 * but our caller might not have that capability.
 */
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
-			       struct sched_param *param)
+			       const struct sched_param *param)
 {
 	return __sched_setscheduler(p, policy, param, false);
 }
@@ -5590,7 +5458,7 @@ void sched_show_task(struct task_struct *p)
 	unsigned state;
 
 	state = p->state ? __ffs(p->state) + 1 : 0;
-	printk(KERN_INFO "%-13.13s %c", p->comm,
+	printk(KERN_INFO "%-15.15s %c", p->comm,
 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if BITS_PER_LONG == 32
 	if (state == TASK_RUNNING)
@@ -5754,7 +5622,6 @@ static void update_sysctl(void)
 	SET_SYSCTL(sched_min_granularity);
 	SET_SYSCTL(sched_latency);
 	SET_SYSCTL(sched_wakeup_granularity);
-	SET_SYSCTL(sched_shares_ratelimit);
 #undef SET_SYSCTL
 }
 
@@ -5830,7 +5697,7 @@ again:
 		goto out;
 
 	dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
-	if (migrate_task(p, dest_cpu)) {
+	if (migrate_task(p, rq)) {
 		struct migration_arg arg = { p, dest_cpu };
 		/* Need help from migration thread: drop lock and wait. */
 		task_rq_unlock(rq, &flags);
@@ -5912,29 +5779,20 @@ static int migration_cpu_stop(void *data)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
+
 /*
- * Figure out where task on dead CPU should go, use force if necessary.
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
 */
-void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
+void idle_task_exit(void)
 {
-	struct rq *rq = cpu_rq(dead_cpu);
-	int needs_cpu, uninitialized_var(dest_cpu);
-	unsigned long flags;
+	struct mm_struct *mm = current->active_mm;
 
-	local_irq_save(flags);
+	BUG_ON(cpu_online(smp_processor_id()));
 
-	raw_spin_lock(&rq->lock);
-	needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
-	if (needs_cpu)
-		dest_cpu = select_fallback_rq(dead_cpu, p);
-	raw_spin_unlock(&rq->lock);
-	/*
-	 * It can only fail if we race with set_cpus_allowed(),
-	 * in the racer should migrate the task anyway.
-	 */
-	if (needs_cpu)
-		__migrate_task(p, dead_cpu, dest_cpu);
-	local_irq_restore(flags);
+	if (mm != &init_mm)
+		switch_mm(mm, &init_mm, current);
+	mmdrop(mm);
 }
 
 /*
@@ -5947,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 static void migrate_nr_uninterruptible(struct rq *rq_src)
 {
 	struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
-	unsigned long flags;
 
-	local_irq_save(flags);
-	double_rq_lock(rq_src, rq_dest);
 	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
 	rq_src->nr_uninterruptible = 0;
-	double_rq_unlock(rq_src, rq_dest);
-	local_irq_restore(flags);
-}
-
-/* Run through task list and migrate tasks from the dead cpu. */
-static void migrate_live_tasks(int src_cpu)
-{
-	struct task_struct *p, *t;
-
-	read_lock(&tasklist_lock);
-
-	do_each_thread(t, p) {
-		if (p == current)
-			continue;
-
-		if (task_cpu(p) == src_cpu)
-			move_task_off_dead_cpu(src_cpu, p);
-	} while_each_thread(t, p);
-
-	read_unlock(&tasklist_lock);
 }
 
 /*
- * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible.
- * Used by CPU offline code.
+ * remove the tasks which were accounted by rq from calc_load_tasks.
 */
-void sched_idle_next(void)
+static void calc_global_load_remove(struct rq *rq)
 {
-	int this_cpu = smp_processor_id();
-	struct rq *rq = cpu_rq(this_cpu);
-	struct task_struct *p = rq->idle;
-	unsigned long flags;
-
-	/* cpu has to be offline */
-	BUG_ON(cpu_online(this_cpu));
-
-	/*
-	 * Strictly not necessary since rest of the CPUs are stopped by now
-	 * and interrupts disabled on the current cpu.
-	 */
-	raw_spin_lock_irqsave(&rq->lock, flags);
-
-	__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
-
-	activate_task(rq, p, 0);
-
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+	rq->calc_load_active = 0;
 }
 
 /*
- * Ensures that the idle task is using init_mm right before its cpu goes
- * offline.
+ * Migrate all tasks from the rq, sleeping tasks will be migrated by
+ * try_to_wake_up()->select_task_rq().
+ *
+ * Called with rq->lock held even though we'er in stop_machine() and
+ * there's no concurrency possible, we hold the required locks anyway
+ * because of lock validation efforts.
 */
-void idle_task_exit(void)
-{
-	struct mm_struct *mm = current->active_mm;
-
-	BUG_ON(cpu_online(smp_processor_id()));
-
-	if (mm != &init_mm)
-		switch_mm(mm, &init_mm, current);
-	mmdrop(mm);
-}
-
-/* called under rq->lock with disabled interrupts */
-static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
+static void migrate_tasks(unsigned int dead_cpu)
 {
 	struct rq *rq = cpu_rq(dead_cpu);
-
-	/* Must be exiting, otherwise would be on tasklist. */
-	BUG_ON(!p->exit_state);
-
-	/* Cannot have done final schedule yet: would have vanished. */
-	BUG_ON(p->state == TASK_DEAD);
-
-	get_task_struct(p);
+	struct task_struct *next, *stop = rq->stop;
+	int dest_cpu;
 
 	/*
-	 * Drop lock around migration; if someone else moves it,
-	 * that's OK. No task can be added to this CPU, so iteration is
-	 * fine.
+	 * Fudge the rq selection such that the below task selection loop
+	 * doesn't get stuck on the currently eligible stop task.
+	 *
+	 * We're currently inside stop_machine() and the rq is either stuck
+	 * in the stop_machine_cpu_stop() loop, or we're executing this code,
+	 * either way we should never end up calling schedule() until we're
+	 * done here.
	 */
-	raw_spin_unlock_irq(&rq->lock);
-	move_task_off_dead_cpu(dead_cpu, p);
-	raw_spin_lock_irq(&rq->lock);
-
-	put_task_struct(p);
-}
-
-/* release_task() removes task from tasklist, so we won't find dead tasks. */
-static void migrate_dead_tasks(unsigned int dead_cpu)
-{
-	struct rq *rq = cpu_rq(dead_cpu);
-	struct task_struct *next;
+	rq->stop = NULL;
 
 	for ( ; ; ) {
-		if (!rq->nr_running)
+		/*
+		 * There's this thread running, bail when that's the only
+		 * remaining thread.
+		 */
+		if (rq->nr_running == 1)
 			break;
+
 		next = pick_next_task(rq);
-		if (!next)
-			break;
+		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
-		migrate_dead(dead_cpu, next);
 
+		/* Find suitable destination for @next, with force if needed. */
+		dest_cpu = select_fallback_rq(dead_cpu, next);
+		raw_spin_unlock(&rq->lock);
+
+		__migrate_task(next, dead_cpu, dest_cpu);
+
+		raw_spin_lock(&rq->lock);
 	}
-}
 
-/*
- * remove the tasks which were accounted by rq from calc_load_tasks.
- */
-static void calc_global_load_remove(struct rq *rq)
-{
-	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
-	rq->calc_load_active = 0;
-}
+	rq->stop = stop;
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6278,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	unsigned long flags;
 	struct rq *rq = cpu_rq(cpu);
 
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 
 	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
 		rq->calc_load_update = calc_load_update;
 		break;
 
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
@@ -6298,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		migrate_live_tasks(cpu);
-		/* Idle task back to normal (off runqueue, low prio) */
-		raw_spin_lock_irq(&rq->lock);
-		deactivate_task(rq, rq->idle, 0);
-		__setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
-		rq->idle->sched_class = &idle_sched_class;
-		migrate_dead_tasks(cpu);
-		raw_spin_unlock_irq(&rq->lock);
-		migrate_nr_uninterruptible(rq);
-		BUG_ON(rq->nr_running != 0);
-		calc_global_load_remove(rq);
-		break;
-
 	case CPU_DYING:
-	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
 		raw_spin_lock_irqsave(&rq->lock, flags);
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 			set_rq_offline(rq);
 		}
+		migrate_tasks(cpu);
+		BUG_ON(rq->nr_running != 1); /* the migration thread */
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+		migrate_nr_uninterruptible(rq);
+		calc_global_load_remove(rq);
 		break;
 #endif
 	}
@@ -8052,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
-				struct sched_entity *se, int cpu, int add,
+				struct sched_entity *se, int cpu,
 				struct sched_entity *parent)
 {
 	struct rq *rq = cpu_rq(cpu);
 	tg->cfs_rq[cpu] = cfs_rq;
 	init_cfs_rq(cfs_rq, rq);
 	cfs_rq->tg = tg;
-	if (add)
-		list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
 
 	tg->se[cpu] = se;
-	/* se could be NULL for init_task_group */
+	/* se could be NULL for root_task_group */
 	if (!se)
 		return;
 
@@ -8073,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 		se->cfs_rq = parent->my_q;
 
 	se->my_q = cfs_rq;
-	se->load.weight = tg->shares;
-	se->load.inv_weight = 0;
+	update_load_set(&se->load, 0);
 	se->parent = parent;
 }
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
 static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-		struct sched_rt_entity *rt_se, int cpu, int add,
+		struct sched_rt_entity *rt_se, int cpu,
 		struct sched_rt_entity *parent)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 	init_rt_rq(rt_rq, rq);
 	rt_rq->tg = tg;
 	rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
-	if (add)
-		list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
 
 	tg->rt_se[cpu] = rt_se;
 	if (!rt_se)
@@ -8126,18 +7907,18 @@ void __init sched_init(void)
 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		init_task_group.se = (struct sched_entity **)ptr;
+		root_task_group.se = (struct sched_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 
-		init_task_group.cfs_rq = (struct cfs_rq **)ptr;
+		root_task_group.cfs_rq = (struct cfs_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 #ifdef CONFIG_RT_GROUP_SCHED
-		init_task_group.rt_se = (struct sched_rt_entity **)ptr;
+		root_task_group.rt_se = (struct sched_rt_entity **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 
-		init_task_group.rt_rq = (struct rt_rq **)ptr;
+		root_task_group.rt_rq = (struct rt_rq **)ptr;
 		ptr += nr_cpu_ids * sizeof(void **);
 
 #endif /* CONFIG_RT_GROUP_SCHED */
@@ -8157,20 +7938,16 @@ void __init sched_init(void)
 			global_rt_period(), global_rt_runtime());
 
 #ifdef CONFIG_RT_GROUP_SCHED
-	init_rt_bandwidth(&init_task_group.rt_bandwidth,
+	init_rt_bandwidth(&root_task_group.rt_bandwidth,
 			global_rt_period(), global_rt_runtime());
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_SCHED
-	list_add(&init_task_group.list, &task_groups);
-	INIT_LIST_HEAD(&init_task_group.children);
-
+	list_add(&root_task_group.list, &task_groups);
+	INIT_LIST_HEAD(&root_task_group.children);
+	autogroup_init(&init_task);
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-	update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-					    __alignof__(unsigned long));
-#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -8182,38 +7959,34 @@ void __init sched_init(void)
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		init_task_group.shares = init_task_group_load;
+		root_task_group.shares = root_task_group_load;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
 		/*
-		 * How much cpu bandwidth does init_task_group get?
+		 * How much cpu bandwidth does root_task_group get?
 		 *
 		 * In case of task-groups formed thr' the cgroup filesystem, it
 		 * gets 100% of the cpu resources in the system. This overall
 		 * system cpu resource is divided among the tasks of
-		 * init_task_group and its child task-groups in a fair manner,
+		 * root_task_group and its child task-groups in a fair manner,
 		 * based on each entity's (task or task-group's) weight
 		 * (se->load.weight).
 		 *
-		 * In other words, if init_task_group has 10 tasks of weight
+		 * In other words, if root_task_group has 10 tasks of weight
 		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
 		 * then A0's share of the cpu resource is:
 		 *
 		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
 		 *
-		 * We achieve this by letting init_task_group's tasks sit
-		 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
+		 * We achieve this by letting root_task_group's tasks sit
+		 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
 		 */
-		init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
-#endif
+		init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 		rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
 #ifdef CONFIG_RT_GROUP_SCHED
 		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
-#ifdef CONFIG_CGROUP_SCHED
-		init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
-#endif
+		init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
 #endif
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8293,8 +8066,6 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
-	perf_event_init();
-
 	scheduler_running = 1;
 }
 
@@ -8488,7 +8259,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		if (!se)
 			goto err_free_rq;
 
-		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
+		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 	}
 
 	return 1;
@@ -8499,15 +8270,21 @@ err:
 	return 0;
 }
 
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-	list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
-			&cpu_rq(cpu)->leaf_cfs_rq_list);
-}
-
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
-	list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	/*
+	 * Only empty task groups can be destroyed; so we can speculatively
+	 * check on_list without danger of it being re-added.
+	 */
+	if (!tg->cfs_rq[cpu]->on_list)
+		return;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 #else /* !CONFG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
@@ -8520,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 	return 1;
 }
 
-static inline void register_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-
 static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 {
 }
@@ -8578,7 +8351,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 		if (!rt_se)
 			goto err_free_rq;
 
-		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
+		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 	}
 
 	return 1;
@@ -8588,17 +8361,6 @@ err_free_rq:
 err:
 	return 0;
 }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-	list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
-			&cpu_rq(cpu)->leaf_rt_rq_list);
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-	list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
-}
 #else /* !CONFIG_RT_GROUP_SCHED */
 static inline void free_rt_sched_group(struct task_group *tg)
 {
@@ -8609,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
 	return 1;
 }
-
-static inline void register_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
-
-static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
-{
-}
 #endif /* CONFIG_RT_GROUP_SCHED */
 
 #ifdef CONFIG_CGROUP_SCHED
@@ -8624,6 +8378,7 @@ static void free_sched_group(struct task_group *tg)
 {
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
+	autogroup_free(tg);
 	kfree(tg);
 }
 
@@ -8632,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
 {
 	struct task_group *tg;
 	unsigned long flags;
-	int i;
 
 	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
 	if (!tg)
@@ -8645,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
 		goto err;
 
 	spin_lock_irqsave(&task_group_lock, flags);
-	for_each_possible_cpu(i) {
-		register_fair_sched_group(tg, i);
-		register_rt_sched_group(tg, i);
-	}
 	list_add_rcu(&tg->list, &task_groups);
 
 	WARN_ON(!parent);	/* root should already exist */
@@ -8678,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
 	unsigned long flags;
 	int i;
 
-	spin_lock_irqsave(&task_group_lock, flags);
-	for_each_possible_cpu(i) {
+	/* end participation in shares distribution */
+	for_each_possible_cpu(i)
 		unregister_fair_sched_group(tg, i);
-		unregister_rt_sched_group(tg, i);
-	}
+
+	spin_lock_irqsave(&task_group_lock, flags);
 	list_del_rcu(&tg->list);
 	list_del_rcu(&tg->siblings);
 	spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8729,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
 #endif /* CONFIG_CGROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-	struct cfs_rq *cfs_rq = se->cfs_rq;
-	int on_rq;
-
-	on_rq = se->on_rq;
-	if (on_rq)
-		dequeue_entity(cfs_rq, se, 0);
-
-	se->load.weight = shares;
-	se->load.inv_weight = 0;
-
-	if (on_rq)
-		enqueue_entity(cfs_rq, se, 0);
-}
-
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	__set_se_shares(se, shares);
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-
 static DEFINE_MUTEX(shares_mutex);
 
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8778,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	if (tg->shares == shares)
 		goto done;
 
-	spin_lock_irqsave(&task_group_lock, flags);
-	for_each_possible_cpu(i)
-		unregister_fair_sched_group(tg, i);
-	list_del_rcu(&tg->siblings);
-	spin_unlock_irqrestore(&task_group_lock, flags);
-
-	/* wait for any ongoing reference to this group to finish */
-	synchronize_sched();
-
-	/*
-	 * Now we are free to modify the group's share on each cpu
-	 * w/o tripping rebalance_share or load_balance_fair.
-	 */
 	tg->shares = shares;
 	for_each_possible_cpu(i) {
-		/*
-		 * force a rebalance
-		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
-		set_se_shares(tg->se[i], shares);
+		struct rq *rq = cpu_rq(i);
+		struct sched_entity *se;
+
+		se = tg->se[i];
+		/* Propagate contribution to hierarchy */
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		for_each_sched_entity(se)
+			update_cfs_shares(group_cfs_rq(se), 0);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
-	/*
-	 * Enable load balance activity on this group, by inserting it back on
-	 * each cpu's rq->leaf_cfs_rq_list.
-	 */
-	spin_lock_irqsave(&task_group_lock, flags);
-	for_each_possible_cpu(i)
-		register_fair_sched_group(tg, i);
-	list_add_rcu(&tg->siblings, &tg->parent->children);
-	spin_unlock_irqrestore(&task_group_lock, flags);
 done:
 	mutex_unlock(&shares_mutex);
 	return 0;
@@ -9107,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 	if (!cgrp->parent) {
 		/* This is early initialization for the top cgroup */
-		return &init_task_group.css;
+		return &root_task_group.css;
 	}
 
 	parent = cgroup_tg(cgrp->parent);
@@ -9178,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	}
 }
 
+static void
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+{
+	/*
+	 * cgroup_exit() is called in the copy_process() failure path.
+	 * Ignore this case since the task hasn't ran yet, this avoids
+	 * trying to poke a half freed task state from generic code.
+	 */
+	if (!(task->flags & PF_EXITING))
+		return;
+
+	sched_move_task(task);
+}
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
 				u64 shareval)
@@ -9250,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.destroy	= cpu_cgroup_destroy,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
+	.exit		= cpu_cgroup_exit,
 	.populate	= cpu_cgroup_populate,
 	.subsys_id	= cpu_cgroup_subsys_id,
 	.early_init	= 1,
@@ -9534,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = {
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
 
-#ifndef CONFIG_SMP
-
-void synchronize_sched_expedited(void)
-{
-	barrier();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#else /* #ifndef CONFIG_SMP */
-
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
-
-static int synchronize_sched_expedited_cpu_stop(void *data)
-{
-	/*
-	 * There must be a full memory barrier on each affected CPU
-	 * between the time that try_stop_cpus() is called and the
-	 * time that it returns.
-	 *
-	 * In the current initial implementation of cpu_stop, the
-	 * above condition is already met when the control reaches
-	 * this point and the following smp_mb() is not strictly
-	 * necessary. Do smp_mb() anyway for documentation and
-	 * robustness against future implementation changes.
-	 */
-	smp_mb(); /* See above comment block. */
-	return 0;
-}
-
-/*
- * Wait for an rcu-sched grace period to elapse, but use "big hammer"
- * approach to force grace period to end quickly. This consumes
- * significant time on all CPUs, and is thus not recommended for
- * any sort of common-case code.
- *
- * Note that it is illegal to call this function while holding any
- * lock that is acquired by a CPU-hotplug notifier. Failing to
- * observe this restriction will result in deadlock.
- */
-void synchronize_sched_expedited(void)
-{
-	int snap, trycount = 0;
-
-	smp_mb(); /* ensure prior mod happens before capturing snap. */
-	snap = atomic_read(&synchronize_sched_expedited_count) + 1;
-	get_online_cpus();
-	while (try_stop_cpus(cpu_online_mask,
-			     synchronize_sched_expedited_cpu_stop,
-			     NULL) == -EAGAIN) {
-		put_online_cpus();
-		if (trycount++ < 10)
-			udelay(trycount * num_online_cpus());
-		else {
-			synchronize_sched();
-			return;
-		}
-		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
-			smp_mb(); /* ensure test happens before caller kfree */
-			return;
-		}
-		get_online_cpus();
-	}
-	atomic_inc(&synchronize_sched_expedited_count);
-	smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
-	put_online_cpus();
-}
-EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
-
-#endif /* #else #ifndef CONFIG_SMP */
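The tg_load_down() hunk earlier in this diff (at old line 1652) keeps the top-down hierarchical load estimate but scales it by the group entity's weight instead of the removed cfs_rq->shares field: h_load(child) = h_load(parent) * se->load.weight / (parent runqueue weight + 1), with the root level seeded from cpu_rq(cpu)->load.weight. A small stand-alone illustration of that recurrence (example numbers are hypothetical, not taken from the patch):

#include <stdio.h>

/* One step of the tg_load_down() recurrence for a single cpu: scale the
 * parent's h_load by this group entity's weight relative to the parent
 * runqueue's total weight (+1 guards against a zero divisor). */
static unsigned long h_load_step(unsigned long parent_h_load,
				 unsigned long se_weight,
				 unsigned long parent_rq_weight)
{
	return parent_h_load * se_weight / (parent_rq_weight + 1);
}

int main(void)
{
	/* Hypothetical cpu: root rq weight 2048, a group entity weighted 1024
	 * on it, and a child group entity weighted 512 inside a group
	 * runqueue of weight 1024. */
	unsigned long root_h_load = 2048;
	unsigned long group_h_load = h_load_step(root_h_load, 1024, 2048);
	unsigned long child_h_load = h_load_step(group_h_load, 512, 1024);

	printf("group h_load = %lu, child h_load = %lu\n",
	       group_h_load, child_h_load);	/* 1023 and 511 */
	return 0;
}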