aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpu.c18
-rw-r--r--kernel/fork.c7
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/mutex.c2
-rw-r--r--kernel/printk.c8
-rw-r--r--kernel/sched.c569
-rw-r--r--kernel/sched_autogroup.c238
-rw-r--r--kernel/sched_autogroup.h32
-rw-r--r--kernel/sched_clock.c2
-rw-r--r--kernel/sched_debug.c91
-rw-r--r--kernel/sched_fair.c322
-rw-r--r--kernel/sched_features.h2
-rw-r--r--kernel/sched_rt.c24
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sys.c4
-rw-r--r--kernel/sysctl.c37
-rw-r--r--kernel/trace/trace_selftest.c2
-rw-r--r--kernel/watchdog.c2
19 files changed, 785 insertions, 585 deletions
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..cb7a1efa9c2b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
189} 189}
190 190
191struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 192 unsigned long mod;
194 void *hcpu; 193 void *hcpu;
195}; 194};
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 197static int __ref take_cpu_down(void *_param)
199{ 198{
200 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 200 int err;
203 201
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
208 206
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 207 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210 208
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 209 return 0;
217} 210}
218 211
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 216 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 217 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 218 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 219 .mod = mod,
228 .hcpu = hcpu, 220 .hcpu = hcpu,
229 }; 221 };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 245 }
254 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
255 247
256 /* Wait for it to sleep (leaving idle task). */ 248 /*
249 * The migration_call() CPU_DYING callback will have removed all
250 * runnable tasks from the cpu, there's only the idle task left now
251 * that the migration thread is done doing the stop_machine thing.
252 *
253 * Wait for the stop thread to go away.
254 */
257 while (!idle_cpu(cpu)) 255 while (!idle_cpu(cpu))
258 yield(); 256 cpu_relax();
259 257
260 /* This actually kills the CPU. */ 258 /* This actually kills the CPU. */
261 __cpu_die(cpu); 259 __cpu_die(cpu);
diff --git a/kernel/fork.c b/kernel/fork.c
index 5447dc7defa9..7d164e25b0f0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
174 174
175static inline void put_signal_struct(struct signal_struct *sig) 175static inline void put_signal_struct(struct signal_struct *sig)
176{ 176{
177 if (atomic_dec_and_test(&sig->sigcnt)) 177 if (atomic_dec_and_test(&sig->sigcnt)) {
178 sched_autogroup_exit(sig);
178 free_signal_struct(sig); 179 free_signal_struct(sig);
180 }
179} 181}
180 182
181void __put_task_struct(struct task_struct *tsk) 183void __put_task_struct(struct task_struct *tsk)
@@ -905,6 +907,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
905 posix_cpu_timers_init_group(sig); 907 posix_cpu_timers_init_group(sig);
906 908
907 tty_audit_fork(sig); 909 tty_audit_fork(sig);
910 sched_autogroup_fork(sig);
908 911
909 sig->oom_adj = current->signal->oom_adj; 912 sig->oom_adj = current->signal->oom_adj;
910 sig->oom_score_adj = current->signal->oom_score_adj; 913 sig->oom_score_adj = current->signal->oom_score_adj;
@@ -1315,7 +1318,7 @@ bad_fork_cleanup_mm:
1315 } 1318 }
1316bad_fork_cleanup_signal: 1319bad_fork_cleanup_signal:
1317 if (!(clone_flags & CLONE_THREAD)) 1320 if (!(clone_flags & CLONE_THREAD))
1318 free_signal_struct(p->signal); 1321 put_signal_struct(p->signal);
1319bad_fork_cleanup_sighand: 1322bad_fork_cleanup_sighand:
1320 __cleanup_sighand(p->sighand); 1323 __cleanup_sighand(p->sighand);
1321bad_fork_cleanup_fs: 1324bad_fork_cleanup_fs:
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..91a5fa25054e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ca61bbdd44b2..5355cfd44a3f 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
diff --git a/kernel/printk.c b/kernel/printk.c
index a23315dc4498..ab3ffc5b3b64 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
1074 1074
1075void printk_tick(void) 1075void printk_tick(void)
1076{ 1076{
1077 if (__get_cpu_var(printk_pending)) { 1077 if (__this_cpu_read(printk_pending)) {
1078 __get_cpu_var(printk_pending) = 0; 1078 __this_cpu_write(printk_pending, 0);
1079 wake_up_interruptible(&log_wait); 1079 wake_up_interruptible(&log_wait);
1080 } 1080 }
1081} 1081}
1082 1082
1083int printk_needs_cpu(int cpu) 1083int printk_needs_cpu(int cpu)
1084{ 1084{
1085 if (unlikely(cpu_is_offline(cpu))) 1085 if (cpu_is_offline(cpu))
1086 printk_tick(); 1086 printk_tick();
1087 return per_cpu(printk_pending, cpu); 1087 return __this_cpu_read(printk_pending);
1088} 1088}
1089 1089
1090void wake_up_klogd(void) 1090void wake_up_klogd(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index 260132961a99..04949089e760 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
275
276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
271}; 279};
272 280
273#define root_task_group init_task_group 281#define root_task_group init_task_group
274 282
275/* task_group_lock serializes add/remove of task groups and also changes to 283/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 284static DEFINE_SPINLOCK(task_group_lock);
279 285
280#ifdef CONFIG_FAIR_GROUP_SCHED 286#ifdef CONFIG_FAIR_GROUP_SCHED
281 287
282#ifdef CONFIG_SMP
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 289
291/* 290/*
@@ -342,6 +341,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 342 * list is used during load balance.
344 */ 343 */
344 int on_list;
345 struct list_head leaf_cfs_rq_list; 345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 346 struct task_group *tg; /* group that "owns" this runqueue */
347 347
@@ -360,14 +360,17 @@ struct cfs_rq {
360 unsigned long h_load; 360 unsigned long h_load;
361 361
362 /* 362 /*
363 * this cpu's part of tg->shares 363 * Maintaining per-cpu shares distribution for group scheduling
364 *
365 * load_stamp is the last time we updated the load average
366 * load_last is the last time we updated the load average and saw load
367 * load_unacc_exec_time is currently unaccounted execution time
364 */ 368 */
365 unsigned long shares; 369 u64 load_avg;
370 u64 load_period;
371 u64 load_stamp, load_last, load_unacc_exec_time;
366 372
367 /* 373 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 374#endif
372#endif 375#endif
373}; 376};
@@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
605 */ 608 */
606static inline struct task_group *task_group(struct task_struct *p) 609static inline struct task_group *task_group(struct task_struct *p)
607{ 610{
611 struct task_group *tg;
608 struct cgroup_subsys_state *css; 612 struct cgroup_subsys_state *css;
609 613
610 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 614 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
611 lockdep_is_held(&task_rq(p)->lock)); 615 lockdep_is_held(&task_rq(p)->lock));
612 return container_of(css, struct task_group, css); 616 tg = container_of(css, struct task_group, css);
617
618 return autogroup_task_group(p, tg);
613} 619}
614 620
615/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 621/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -793,20 +799,6 @@ late_initcall(sched_init_debug);
793const_debug unsigned int sysctl_sched_nr_migrate = 32; 799const_debug unsigned int sysctl_sched_nr_migrate = 32;
794 800
795/* 801/*
796 * ratelimit for updating the group shares.
797 * default: 0.25ms
798 */
799unsigned int sysctl_sched_shares_ratelimit = 250000;
800unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
801
802/*
803 * Inject some fuzzyness into changing the per-cpu group shares
804 * this avoids remote rq-locks at the expense of fairness.
805 * default: 4
806 */
807unsigned int sysctl_sched_shares_thresh = 4;
808
809/*
810 * period over which we average the RT time consumption, measured 802 * period over which we average the RT time consumption, measured
811 * in ms. 803 * in ms.
812 * 804 *
@@ -1355,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1355 lw->inv_weight = 0; 1347 lw->inv_weight = 0;
1356} 1348}
1357 1349
1350static inline void update_load_set(struct load_weight *lw, unsigned long w)
1351{
1352 lw->weight = w;
1353 lw->inv_weight = 0;
1354}
1355
1358/* 1356/*
1359 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1357 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1360 * of tasks with abnormal "nice" values across CPUs the contribution that 1358 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1543 1541
1544#ifdef CONFIG_FAIR_GROUP_SCHED 1542#ifdef CONFIG_FAIR_GROUP_SCHED
1545 1543
1546static __read_mostly unsigned long __percpu *update_shares_data;
1547
1548static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1549
1550/*
1551 * Calculate and set the cpu's group shares.
1552 */
1553static void update_group_shares_cpu(struct task_group *tg, int cpu,
1554 unsigned long sd_shares,
1555 unsigned long sd_rq_weight,
1556 unsigned long *usd_rq_weight)
1557{
1558 unsigned long shares, rq_weight;
1559 int boost = 0;
1560
1561 rq_weight = usd_rq_weight[cpu];
1562 if (!rq_weight) {
1563 boost = 1;
1564 rq_weight = NICE_0_LOAD;
1565 }
1566
1567 /*
1568 * \Sum_j shares_j * rq_weight_i
1569 * shares_i = -----------------------------
1570 * \Sum_j rq_weight_j
1571 */
1572 shares = (sd_shares * rq_weight) / sd_rq_weight;
1573 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1574
1575 if (abs(shares - tg->se[cpu]->load.weight) >
1576 sysctl_sched_shares_thresh) {
1577 struct rq *rq = cpu_rq(cpu);
1578 unsigned long flags;
1579
1580 raw_spin_lock_irqsave(&rq->lock, flags);
1581 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1582 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1583 __set_se_shares(tg->se[cpu], shares);
1584 raw_spin_unlock_irqrestore(&rq->lock, flags);
1585 }
1586}
1587
1588/*
1589 * Re-compute the task group their per cpu shares over the given domain.
1590 * This needs to be done in a bottom-up fashion because the rq weight of a
1591 * parent group depends on the shares of its child groups.
1592 */
1593static int tg_shares_up(struct task_group *tg, void *data)
1594{
1595 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1596 unsigned long *usd_rq_weight;
1597 struct sched_domain *sd = data;
1598 unsigned long flags;
1599 int i;
1600
1601 if (!tg->se[0])
1602 return 0;
1603
1604 local_irq_save(flags);
1605 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1606
1607 for_each_cpu(i, sched_domain_span(sd)) {
1608 weight = tg->cfs_rq[i]->load.weight;
1609 usd_rq_weight[i] = weight;
1610
1611 rq_weight += weight;
1612 /*
1613 * If there are currently no tasks on the cpu pretend there
1614 * is one of average load so that when a new task gets to
1615 * run here it will not get delayed by group starvation.
1616 */
1617 if (!weight)
1618 weight = NICE_0_LOAD;
1619
1620 sum_weight += weight;
1621 shares += tg->cfs_rq[i]->shares;
1622 }
1623
1624 if (!rq_weight)
1625 rq_weight = sum_weight;
1626
1627 if ((!shares && rq_weight) || shares > tg->shares)
1628 shares = tg->shares;
1629
1630 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1631 shares = tg->shares;
1632
1633 for_each_cpu(i, sched_domain_span(sd))
1634 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1635
1636 local_irq_restore(flags);
1637
1638 return 0;
1639}
1640
1641/* 1544/*
1642 * Compute the cpu's hierarchical load factor for each task group. 1545 * Compute the cpu's hierarchical load factor for each task group.
1643 * This needs to be done in a top-down fashion because the load of a child 1546 * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1652 load = cpu_rq(cpu)->load.weight; 1555 load = cpu_rq(cpu)->load.weight;
1653 } else { 1556 } else {
1654 load = tg->parent->cfs_rq[cpu]->h_load; 1557 load = tg->parent->cfs_rq[cpu]->h_load;
1655 load *= tg->cfs_rq[cpu]->shares; 1558 load *= tg->se[cpu]->load.weight;
1656 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1559 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1657 } 1560 }
1658 1561
@@ -1661,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1661 return 0; 1564 return 0;
1662} 1565}
1663 1566
1664static void update_shares(struct sched_domain *sd)
1665{
1666 s64 elapsed;
1667 u64 now;
1668
1669 if (root_task_group_empty())
1670 return;
1671
1672 now = local_clock();
1673 elapsed = now - sd->last_update;
1674
1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1676 sd->last_update = now;
1677 walk_tg_tree(tg_nop, tg_shares_up, sd);
1678 }
1679}
1680
1681static void update_h_load(long cpu) 1567static void update_h_load(long cpu)
1682{ 1568{
1683 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1569 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1684} 1570}
1685 1571
1686#else
1687
1688static inline void update_shares(struct sched_domain *sd)
1689{
1690}
1691
1692#endif 1572#endif
1693 1573
1694#ifdef CONFIG_PREEMPT 1574#ifdef CONFIG_PREEMPT
@@ -1810,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1810 1690
1811#endif 1691#endif
1812 1692
1813#ifdef CONFIG_FAIR_GROUP_SCHED
1814static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1815{
1816#ifdef CONFIG_SMP
1817 cfs_rq->shares = shares;
1818#endif
1819}
1820#endif
1821
1822static void calc_load_account_idle(struct rq *this_rq); 1693static void calc_load_account_idle(struct rq *this_rq);
1823static void update_sysctl(void); 1694static void update_sysctl(void);
1824static int get_update_sysctl_factor(void); 1695static int get_update_sysctl_factor(void);
@@ -2063,6 +1934,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2063#include "sched_idletask.c" 1934#include "sched_idletask.c"
2064#include "sched_fair.c" 1935#include "sched_fair.c"
2065#include "sched_rt.c" 1936#include "sched_rt.c"
1937#include "sched_autogroup.c"
2066#include "sched_stoptask.c" 1938#include "sched_stoptask.c"
2067#ifdef CONFIG_SCHED_DEBUG 1939#ifdef CONFIG_SCHED_DEBUG
2068# include "sched_debug.c" 1940# include "sched_debug.c"
@@ -2255,10 +2127,8 @@ static int migration_cpu_stop(void *data);
2255 * The task's runqueue lock must be held. 2127 * The task's runqueue lock must be held.
2256 * Returns true if you have to wait for migration thread. 2128 * Returns true if you have to wait for migration thread.
2257 */ 2129 */
2258static bool migrate_task(struct task_struct *p, int dest_cpu) 2130static bool migrate_task(struct task_struct *p, struct rq *rq)
2259{ 2131{
2260 struct rq *rq = task_rq(p);
2261
2262 /* 2132 /*
2263 * If the task is not on a runqueue (and not running), then 2133 * If the task is not on a runqueue (and not running), then
2264 * the next wake-up will properly place the task. 2134 * the next wake-up will properly place the task.
@@ -2438,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2438 return dest_cpu; 2308 return dest_cpu;
2439 2309
2440 /* No more Mr. Nice Guy. */ 2310 /* No more Mr. Nice Guy. */
2441 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2311 dest_cpu = cpuset_cpus_allowed_fallback(p);
2442 dest_cpu = cpuset_cpus_allowed_fallback(p); 2312 /*
2443 /* 2313 * Don't tell them about moving exiting tasks or
2444 * Don't tell them about moving exiting tasks or 2314 * kernel threads (both mm NULL), since they never
2445 * kernel threads (both mm NULL), since they never 2315 * leave kernel.
2446 * leave kernel. 2316 */
2447 */ 2317 if (p->mm && printk_ratelimit()) {
2448 if (p->mm && printk_ratelimit()) { 2318 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2449 printk(KERN_INFO "process %d (%s) no " 2319 task_pid_nr(p), p->comm, cpu);
2450 "longer affine to cpu%d\n",
2451 task_pid_nr(p), p->comm, cpu);
2452 }
2453 } 2320 }
2454 2321
2455 return dest_cpu; 2322 return dest_cpu;
@@ -2785,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2785 /* Want to start with kernel preemption disabled. */ 2652 /* Want to start with kernel preemption disabled. */
2786 task_thread_info(p)->preempt_count = 1; 2653 task_thread_info(p)->preempt_count = 1;
2787#endif 2654#endif
2655#ifdef CONFIG_SMP
2788 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2656 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2657#endif
2789 2658
2790 put_cpu(); 2659 put_cpu();
2791} 2660}
@@ -3549,7 +3418,7 @@ void sched_exec(void)
3549 * select_task_rq() can race against ->cpus_allowed 3418 * select_task_rq() can race against ->cpus_allowed
3550 */ 3419 */
3551 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3420 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3552 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3421 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3553 struct migration_arg arg = { p, dest_cpu }; 3422 struct migration_arg arg = { p, dest_cpu };
3554 3423
3555 task_rq_unlock(rq, &flags); 3424 task_rq_unlock(rq, &flags);
@@ -4214,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4214 if (task_thread_info(rq->curr) != owner || need_resched()) 4083 if (task_thread_info(rq->curr) != owner || need_resched())
4215 return 0; 4084 return 0;
4216 4085
4217 cpu_relax(); 4086 arch_mutex_cpu_relax();
4218 } 4087 }
4219 4088
4220 return 1; 4089 return 1;
@@ -4526,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4526 * This waits for either a completion of a specific task to be signaled or for a 4395 * This waits for either a completion of a specific task to be signaled or for a
4527 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4396 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4528 */ 4397 */
4529unsigned long __sched 4398long __sched
4530wait_for_completion_interruptible_timeout(struct completion *x, 4399wait_for_completion_interruptible_timeout(struct completion *x,
4531 unsigned long timeout) 4400 unsigned long timeout)
4532{ 4401{
@@ -4559,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4559 * signaled or for a specified timeout to expire. It can be 4428 * signaled or for a specified timeout to expire. It can be
4560 * interrupted by a kill signal. The timeout is in jiffies. 4429 * interrupted by a kill signal. The timeout is in jiffies.
4561 */ 4430 */
4562unsigned long __sched 4431long __sched
4563wait_for_completion_killable_timeout(struct completion *x, 4432wait_for_completion_killable_timeout(struct completion *x,
4564 unsigned long timeout) 4433 unsigned long timeout)
4565{ 4434{
@@ -4901,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
4901} 4770}
4902 4771
4903static int __sched_setscheduler(struct task_struct *p, int policy, 4772static int __sched_setscheduler(struct task_struct *p, int policy,
4904 struct sched_param *param, bool user) 4773 const struct sched_param *param, bool user)
4905{ 4774{
4906 int retval, oldprio, oldpolicy = -1, on_rq, running; 4775 int retval, oldprio, oldpolicy = -1, on_rq, running;
4907 unsigned long flags; 4776 unsigned long flags;
@@ -5056,7 +4925,7 @@ recheck:
5056 * NOTE that the task may be already dead. 4925 * NOTE that the task may be already dead.
5057 */ 4926 */
5058int sched_setscheduler(struct task_struct *p, int policy, 4927int sched_setscheduler(struct task_struct *p, int policy,
5059 struct sched_param *param) 4928 const struct sched_param *param)
5060{ 4929{
5061 return __sched_setscheduler(p, policy, param, true); 4930 return __sched_setscheduler(p, policy, param, true);
5062} 4931}
@@ -5074,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
5074 * but our caller might not have that capability. 4943 * but our caller might not have that capability.
5075 */ 4944 */
5076int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4945int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5077 struct sched_param *param) 4946 const struct sched_param *param)
5078{ 4947{
5079 return __sched_setscheduler(p, policy, param, false); 4948 return __sched_setscheduler(p, policy, param, false);
5080} 4949}
@@ -5590,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
5590 unsigned state; 5459 unsigned state;
5591 5460
5592 state = p->state ? __ffs(p->state) + 1 : 0; 5461 state = p->state ? __ffs(p->state) + 1 : 0;
5593 printk(KERN_INFO "%-13.13s %c", p->comm, 5462 printk(KERN_INFO "%-15.15s %c", p->comm,
5594 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5463 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5595#if BITS_PER_LONG == 32 5464#if BITS_PER_LONG == 32
5596 if (state == TASK_RUNNING) 5465 if (state == TASK_RUNNING)
@@ -5754,7 +5623,6 @@ static void update_sysctl(void)
5754 SET_SYSCTL(sched_min_granularity); 5623 SET_SYSCTL(sched_min_granularity);
5755 SET_SYSCTL(sched_latency); 5624 SET_SYSCTL(sched_latency);
5756 SET_SYSCTL(sched_wakeup_granularity); 5625 SET_SYSCTL(sched_wakeup_granularity);
5757 SET_SYSCTL(sched_shares_ratelimit);
5758#undef SET_SYSCTL 5626#undef SET_SYSCTL
5759} 5627}
5760 5628
@@ -5830,7 +5698,7 @@ again:
5830 goto out; 5698 goto out;
5831 5699
5832 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5700 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5833 if (migrate_task(p, dest_cpu)) { 5701 if (migrate_task(p, rq)) {
5834 struct migration_arg arg = { p, dest_cpu }; 5702 struct migration_arg arg = { p, dest_cpu };
5835 /* Need help from migration thread: drop lock and wait. */ 5703 /* Need help from migration thread: drop lock and wait. */
5836 task_rq_unlock(rq, &flags); 5704 task_rq_unlock(rq, &flags);
@@ -5912,29 +5780,20 @@ static int migration_cpu_stop(void *data)
5912} 5780}
5913 5781
5914#ifdef CONFIG_HOTPLUG_CPU 5782#ifdef CONFIG_HOTPLUG_CPU
5783
5915/* 5784/*
5916 * Figure out where task on dead CPU should go, use force if necessary. 5785 * Ensures that the idle task is using init_mm right before its cpu goes
5786 * offline.
5917 */ 5787 */
5918void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5788void idle_task_exit(void)
5919{ 5789{
5920 struct rq *rq = cpu_rq(dead_cpu); 5790 struct mm_struct *mm = current->active_mm;
5921 int needs_cpu, uninitialized_var(dest_cpu);
5922 unsigned long flags;
5923 5791
5924 local_irq_save(flags); 5792 BUG_ON(cpu_online(smp_processor_id()));
5925 5793
5926 raw_spin_lock(&rq->lock); 5794 if (mm != &init_mm)
5927 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5795 switch_mm(mm, &init_mm, current);
5928 if (needs_cpu) 5796 mmdrop(mm);
5929 dest_cpu = select_fallback_rq(dead_cpu, p);
5930 raw_spin_unlock(&rq->lock);
5931 /*
5932 * It can only fail if we race with set_cpus_allowed(),
5933 * in the racer should migrate the task anyway.
5934 */
5935 if (needs_cpu)
5936 __migrate_task(p, dead_cpu, dest_cpu);
5937 local_irq_restore(flags);
5938} 5797}
5939 5798
5940/* 5799/*
@@ -5947,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5947static void migrate_nr_uninterruptible(struct rq *rq_src) 5806static void migrate_nr_uninterruptible(struct rq *rq_src)
5948{ 5807{
5949 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5808 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5950 unsigned long flags;
5951 5809
5952 local_irq_save(flags);
5953 double_rq_lock(rq_src, rq_dest);
5954 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5810 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5955 rq_src->nr_uninterruptible = 0; 5811 rq_src->nr_uninterruptible = 0;
5956 double_rq_unlock(rq_src, rq_dest);
5957 local_irq_restore(flags);
5958}
5959
5960/* Run through task list and migrate tasks from the dead cpu. */
5961static void migrate_live_tasks(int src_cpu)
5962{
5963 struct task_struct *p, *t;
5964
5965 read_lock(&tasklist_lock);
5966
5967 do_each_thread(t, p) {
5968 if (p == current)
5969 continue;
5970
5971 if (task_cpu(p) == src_cpu)
5972 move_task_off_dead_cpu(src_cpu, p);
5973 } while_each_thread(t, p);
5974
5975 read_unlock(&tasklist_lock);
5976} 5812}
5977 5813
5978/* 5814/*
5979 * Schedules idle task to be the next runnable task on current CPU. 5815 * remove the tasks which were accounted by rq from calc_load_tasks.
5980 * It does so by boosting its priority to highest possible.
5981 * Used by CPU offline code.
5982 */ 5816 */
5983void sched_idle_next(void) 5817static void calc_global_load_remove(struct rq *rq)
5984{ 5818{
5985 int this_cpu = smp_processor_id(); 5819 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5986 struct rq *rq = cpu_rq(this_cpu); 5820 rq->calc_load_active = 0;
5987 struct task_struct *p = rq->idle;
5988 unsigned long flags;
5989
5990 /* cpu has to be offline */
5991 BUG_ON(cpu_online(this_cpu));
5992
5993 /*
5994 * Strictly not necessary since rest of the CPUs are stopped by now
5995 * and interrupts disabled on the current cpu.
5996 */
5997 raw_spin_lock_irqsave(&rq->lock, flags);
5998
5999 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
6000
6001 activate_task(rq, p, 0);
6002
6003 raw_spin_unlock_irqrestore(&rq->lock, flags);
6004} 5821}
6005 5822
6006/* 5823/*
6007 * Ensures that the idle task is using init_mm right before its cpu goes 5824 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6008 * offline. 5825 * try_to_wake_up()->select_task_rq().
5826 *
5827 * Called with rq->lock held even though we'er in stop_machine() and
5828 * there's no concurrency possible, we hold the required locks anyway
5829 * because of lock validation efforts.
6009 */ 5830 */
6010void idle_task_exit(void) 5831static void migrate_tasks(unsigned int dead_cpu)
6011{
6012 struct mm_struct *mm = current->active_mm;
6013
6014 BUG_ON(cpu_online(smp_processor_id()));
6015
6016 if (mm != &init_mm)
6017 switch_mm(mm, &init_mm, current);
6018 mmdrop(mm);
6019}
6020
6021/* called under rq->lock with disabled interrupts */
6022static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
6023{ 5832{
6024 struct rq *rq = cpu_rq(dead_cpu); 5833 struct rq *rq = cpu_rq(dead_cpu);
6025 5834 struct task_struct *next, *stop = rq->stop;
6026 /* Must be exiting, otherwise would be on tasklist. */ 5835 int dest_cpu;
6027 BUG_ON(!p->exit_state);
6028
6029 /* Cannot have done final schedule yet: would have vanished. */
6030 BUG_ON(p->state == TASK_DEAD);
6031
6032 get_task_struct(p);
6033 5836
6034 /* 5837 /*
6035 * Drop lock around migration; if someone else moves it, 5838 * Fudge the rq selection such that the below task selection loop
6036 * that's OK. No task can be added to this CPU, so iteration is 5839 * doesn't get stuck on the currently eligible stop task.
6037 * fine. 5840 *
5841 * We're currently inside stop_machine() and the rq is either stuck
5842 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5843 * either way we should never end up calling schedule() until we're
5844 * done here.
6038 */ 5845 */
6039 raw_spin_unlock_irq(&rq->lock); 5846 rq->stop = NULL;
6040 move_task_off_dead_cpu(dead_cpu, p);
6041 raw_spin_lock_irq(&rq->lock);
6042
6043 put_task_struct(p);
6044}
6045
6046/* release_task() removes task from tasklist, so we won't find dead tasks. */
6047static void migrate_dead_tasks(unsigned int dead_cpu)
6048{
6049 struct rq *rq = cpu_rq(dead_cpu);
6050 struct task_struct *next;
6051 5847
6052 for ( ; ; ) { 5848 for ( ; ; ) {
6053 if (!rq->nr_running) 5849 /*
5850 * There's this thread running, bail when that's the only
5851 * remaining thread.
5852 */
5853 if (rq->nr_running == 1)
6054 break; 5854 break;
5855
6055 next = pick_next_task(rq); 5856 next = pick_next_task(rq);
6056 if (!next) 5857 BUG_ON(!next);
6057 break;
6058 next->sched_class->put_prev_task(rq, next); 5858 next->sched_class->put_prev_task(rq, next);
6059 migrate_dead(dead_cpu, next);
6060 5859
5860 /* Find suitable destination for @next, with force if needed. */
5861 dest_cpu = select_fallback_rq(dead_cpu, next);
5862 raw_spin_unlock(&rq->lock);
5863
5864 __migrate_task(next, dead_cpu, dest_cpu);
5865
5866 raw_spin_lock(&rq->lock);
6061 } 5867 }
6062}
6063 5868
6064/* 5869 rq->stop = stop;
6065 * remove the tasks which were accounted by rq from calc_load_tasks.
6066 */
6067static void calc_global_load_remove(struct rq *rq)
6068{
6069 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
6070 rq->calc_load_active = 0;
6071} 5870}
5871
6072#endif /* CONFIG_HOTPLUG_CPU */ 5872#endif /* CONFIG_HOTPLUG_CPU */
6073 5873
6074#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6278,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6278 unsigned long flags; 6078 unsigned long flags;
6279 struct rq *rq = cpu_rq(cpu); 6079 struct rq *rq = cpu_rq(cpu);
6280 6080
6281 switch (action) { 6081 switch (action & ~CPU_TASKS_FROZEN) {
6282 6082
6283 case CPU_UP_PREPARE: 6083 case CPU_UP_PREPARE:
6284 case CPU_UP_PREPARE_FROZEN:
6285 rq->calc_load_update = calc_load_update; 6084 rq->calc_load_update = calc_load_update;
6286 break; 6085 break;
6287 6086
6288 case CPU_ONLINE: 6087 case CPU_ONLINE:
6289 case CPU_ONLINE_FROZEN:
6290 /* Update our root-domain */ 6088 /* Update our root-domain */
6291 raw_spin_lock_irqsave(&rq->lock, flags); 6089 raw_spin_lock_irqsave(&rq->lock, flags);
6292 if (rq->rd) { 6090 if (rq->rd) {
@@ -6298,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6298 break; 6096 break;
6299 6097
6300#ifdef CONFIG_HOTPLUG_CPU 6098#ifdef CONFIG_HOTPLUG_CPU
6301 case CPU_DEAD:
6302 case CPU_DEAD_FROZEN:
6303 migrate_live_tasks(cpu);
6304 /* Idle task back to normal (off runqueue, low prio) */
6305 raw_spin_lock_irq(&rq->lock);
6306 deactivate_task(rq, rq->idle, 0);
6307 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6308 rq->idle->sched_class = &idle_sched_class;
6309 migrate_dead_tasks(cpu);
6310 raw_spin_unlock_irq(&rq->lock);
6311 migrate_nr_uninterruptible(rq);
6312 BUG_ON(rq->nr_running != 0);
6313 calc_global_load_remove(rq);
6314 break;
6315
6316 case CPU_DYING: 6099 case CPU_DYING:
6317 case CPU_DYING_FROZEN:
6318 /* Update our root-domain */ 6100 /* Update our root-domain */
6319 raw_spin_lock_irqsave(&rq->lock, flags); 6101 raw_spin_lock_irqsave(&rq->lock, flags);
6320 if (rq->rd) { 6102 if (rq->rd) {
6321 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6103 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6322 set_rq_offline(rq); 6104 set_rq_offline(rq);
6323 } 6105 }
6106 migrate_tasks(cpu);
6107 BUG_ON(rq->nr_running != 1); /* the migration thread */
6324 raw_spin_unlock_irqrestore(&rq->lock, flags); 6108 raw_spin_unlock_irqrestore(&rq->lock, flags);
6109
6110 migrate_nr_uninterruptible(rq);
6111 calc_global_load_remove(rq);
6325 break; 6112 break;
6326#endif 6113#endif
6327 } 6114 }
@@ -8052,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8052 7839
8053#ifdef CONFIG_FAIR_GROUP_SCHED 7840#ifdef CONFIG_FAIR_GROUP_SCHED
8054static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7841static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8055 struct sched_entity *se, int cpu, int add, 7842 struct sched_entity *se, int cpu,
8056 struct sched_entity *parent) 7843 struct sched_entity *parent)
8057{ 7844{
8058 struct rq *rq = cpu_rq(cpu); 7845 struct rq *rq = cpu_rq(cpu);
8059 tg->cfs_rq[cpu] = cfs_rq; 7846 tg->cfs_rq[cpu] = cfs_rq;
8060 init_cfs_rq(cfs_rq, rq); 7847 init_cfs_rq(cfs_rq, rq);
8061 cfs_rq->tg = tg; 7848 cfs_rq->tg = tg;
8062 if (add)
8063 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
8064 7849
8065 tg->se[cpu] = se; 7850 tg->se[cpu] = se;
8066 /* se could be NULL for init_task_group */ 7851 /* se could be NULL for init_task_group */
@@ -8073,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8073 se->cfs_rq = parent->my_q; 7858 se->cfs_rq = parent->my_q;
8074 7859
8075 se->my_q = cfs_rq; 7860 se->my_q = cfs_rq;
8076 se->load.weight = tg->shares; 7861 update_load_set(&se->load, 0);
8077 se->load.inv_weight = 0;
8078 se->parent = parent; 7862 se->parent = parent;
8079} 7863}
8080#endif 7864#endif
8081 7865
8082#ifdef CONFIG_RT_GROUP_SCHED 7866#ifdef CONFIG_RT_GROUP_SCHED
8083static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7867static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8084 struct sched_rt_entity *rt_se, int cpu, int add, 7868 struct sched_rt_entity *rt_se, int cpu,
8085 struct sched_rt_entity *parent) 7869 struct sched_rt_entity *parent)
8086{ 7870{
8087 struct rq *rq = cpu_rq(cpu); 7871 struct rq *rq = cpu_rq(cpu);
@@ -8090,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8090 init_rt_rq(rt_rq, rq); 7874 init_rt_rq(rt_rq, rq);
8091 rt_rq->tg = tg; 7875 rt_rq->tg = tg;
8092 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7876 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8093 if (add)
8094 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
8095 7877
8096 tg->rt_se[cpu] = rt_se; 7878 tg->rt_se[cpu] = rt_se;
8097 if (!rt_se) 7879 if (!rt_se)
@@ -8164,13 +7946,9 @@ void __init sched_init(void)
8164#ifdef CONFIG_CGROUP_SCHED 7946#ifdef CONFIG_CGROUP_SCHED
8165 list_add(&init_task_group.list, &task_groups); 7947 list_add(&init_task_group.list, &task_groups);
8166 INIT_LIST_HEAD(&init_task_group.children); 7948 INIT_LIST_HEAD(&init_task_group.children);
8167 7949 autogroup_init(&init_task);
8168#endif /* CONFIG_CGROUP_SCHED */ 7950#endif /* CONFIG_CGROUP_SCHED */
8169 7951
8170#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
8171 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
8172 __alignof__(unsigned long));
8173#endif
8174 for_each_possible_cpu(i) { 7952 for_each_possible_cpu(i) {
8175 struct rq *rq; 7953 struct rq *rq;
8176 7954
@@ -8184,7 +7962,6 @@ void __init sched_init(void)
8184#ifdef CONFIG_FAIR_GROUP_SCHED 7962#ifdef CONFIG_FAIR_GROUP_SCHED
8185 init_task_group.shares = init_task_group_load; 7963 init_task_group.shares = init_task_group_load;
8186 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7964 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8187#ifdef CONFIG_CGROUP_SCHED
8188 /* 7965 /*
8189 * How much cpu bandwidth does init_task_group get? 7966 * How much cpu bandwidth does init_task_group get?
8190 * 7967 *
@@ -8204,16 +7981,13 @@ void __init sched_init(void)
8204 * We achieve this by letting init_task_group's tasks sit 7981 * We achieve this by letting init_task_group's tasks sit
8205 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7982 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8206 */ 7983 */
8207 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7984 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
8208#endif
8209#endif /* CONFIG_FAIR_GROUP_SCHED */ 7985#endif /* CONFIG_FAIR_GROUP_SCHED */
8210 7986
8211 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7987 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8212#ifdef CONFIG_RT_GROUP_SCHED 7988#ifdef CONFIG_RT_GROUP_SCHED
8213 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7989 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8214#ifdef CONFIG_CGROUP_SCHED 7990 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
8215 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8216#endif
8217#endif 7991#endif
8218 7992
8219 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7993 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8486,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8486 if (!se) 8260 if (!se)
8487 goto err_free_rq; 8261 goto err_free_rq;
8488 8262
8489 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8263 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8490 } 8264 }
8491 8265
8492 return 1; 8266 return 1;
@@ -8497,15 +8271,21 @@ err:
8497 return 0; 8271 return 0;
8498} 8272}
8499 8273
8500static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8501{
8502 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8503 &cpu_rq(cpu)->leaf_cfs_rq_list);
8504}
8505
8506static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8274static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8507{ 8275{
8508 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8276 struct rq *rq = cpu_rq(cpu);
8277 unsigned long flags;
8278
8279 /*
8280 * Only empty task groups can be destroyed; so we can speculatively
8281 * check on_list without danger of it being re-added.
8282 */
8283 if (!tg->cfs_rq[cpu]->on_list)
8284 return;
8285
8286 raw_spin_lock_irqsave(&rq->lock, flags);
8287 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8288 raw_spin_unlock_irqrestore(&rq->lock, flags);
8509} 8289}
8510#else /* !CONFIG_FAIR_GROUP_SCHED */ 8290#else /* !CONFIG_FAIR_GROUP_SCHED */
8511static inline void free_fair_sched_group(struct task_group *tg) 8291static inline void free_fair_sched_group(struct task_group *tg)
@@ -8518,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8518 return 1; 8298 return 1;
8519} 8299}
8520 8300
8521static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8522{
8523}
8524
8525static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8301static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8526{ 8302{
8527} 8303}
@@ -8576,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8576 if (!rt_se) 8352 if (!rt_se)
8577 goto err_free_rq; 8353 goto err_free_rq;
8578 8354
8579 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8355 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8580 } 8356 }
8581 8357
8582 return 1; 8358 return 1;
@@ -8586,17 +8362,6 @@ err_free_rq:
8586err: 8362err:
8587 return 0; 8363 return 0;
8588} 8364}
8589
8590static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8591{
8592 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8593 &cpu_rq(cpu)->leaf_rt_rq_list);
8594}
8595
8596static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8597{
8598 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8599}
8600#else /* !CONFIG_RT_GROUP_SCHED */ 8365#else /* !CONFIG_RT_GROUP_SCHED */
8601static inline void free_rt_sched_group(struct task_group *tg) 8366static inline void free_rt_sched_group(struct task_group *tg)
8602{ 8367{
@@ -8607,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8607{ 8372{
8608 return 1; 8373 return 1;
8609} 8374}
8610
8611static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8612{
8613}
8614
8615static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8616{
8617}
8618#endif /* CONFIG_RT_GROUP_SCHED */ 8375#endif /* CONFIG_RT_GROUP_SCHED */
8619 8376
8620#ifdef CONFIG_CGROUP_SCHED 8377#ifdef CONFIG_CGROUP_SCHED
@@ -8630,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8630{ 8387{
8631 struct task_group *tg; 8388 struct task_group *tg;
8632 unsigned long flags; 8389 unsigned long flags;
8633 int i;
8634 8390
8635 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8391 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8636 if (!tg) 8392 if (!tg)
@@ -8643,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8643 goto err; 8399 goto err;
8644 8400
8645 spin_lock_irqsave(&task_group_lock, flags); 8401 spin_lock_irqsave(&task_group_lock, flags);
8646 for_each_possible_cpu(i) {
8647 register_fair_sched_group(tg, i);
8648 register_rt_sched_group(tg, i);
8649 }
8650 list_add_rcu(&tg->list, &task_groups); 8402 list_add_rcu(&tg->list, &task_groups);
8651 8403
8652 WARN_ON(!parent); /* root should already exist */ 8404 WARN_ON(!parent); /* root should already exist */
@@ -8676,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
8676 unsigned long flags; 8428 unsigned long flags;
8677 int i; 8429 int i;
8678 8430
8679 spin_lock_irqsave(&task_group_lock, flags); 8431 /* end participation in shares distribution */
8680 for_each_possible_cpu(i) { 8432 for_each_possible_cpu(i)
8681 unregister_fair_sched_group(tg, i); 8433 unregister_fair_sched_group(tg, i);
8682 unregister_rt_sched_group(tg, i); 8434
8683 } 8435 spin_lock_irqsave(&task_group_lock, flags);
8684 list_del_rcu(&tg->list); 8436 list_del_rcu(&tg->list);
8685 list_del_rcu(&tg->siblings); 8437 list_del_rcu(&tg->siblings);
8686 spin_unlock_irqrestore(&task_group_lock, flags); 8438 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8727,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
8727#endif /* CONFIG_CGROUP_SCHED */ 8479#endif /* CONFIG_CGROUP_SCHED */
8728 8480
8729#ifdef CONFIG_FAIR_GROUP_SCHED 8481#ifdef CONFIG_FAIR_GROUP_SCHED
8730static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8731{
8732 struct cfs_rq *cfs_rq = se->cfs_rq;
8733 int on_rq;
8734
8735 on_rq = se->on_rq;
8736 if (on_rq)
8737 dequeue_entity(cfs_rq, se, 0);
8738
8739 se->load.weight = shares;
8740 se->load.inv_weight = 0;
8741
8742 if (on_rq)
8743 enqueue_entity(cfs_rq, se, 0);
8744}
8745
8746static void set_se_shares(struct sched_entity *se, unsigned long shares)
8747{
8748 struct cfs_rq *cfs_rq = se->cfs_rq;
8749 struct rq *rq = cfs_rq->rq;
8750 unsigned long flags;
8751
8752 raw_spin_lock_irqsave(&rq->lock, flags);
8753 __set_se_shares(se, shares);
8754 raw_spin_unlock_irqrestore(&rq->lock, flags);
8755}
8756
8757static DEFINE_MUTEX(shares_mutex); 8482static DEFINE_MUTEX(shares_mutex);
8758 8483
8759int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8484int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8776,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8776 if (tg->shares == shares) 8501 if (tg->shares == shares)
8777 goto done; 8502 goto done;
8778 8503
8779 spin_lock_irqsave(&task_group_lock, flags);
8780 for_each_possible_cpu(i)
8781 unregister_fair_sched_group(tg, i);
8782 list_del_rcu(&tg->siblings);
8783 spin_unlock_irqrestore(&task_group_lock, flags);
8784
8785 /* wait for any ongoing reference to this group to finish */
8786 synchronize_sched();
8787
8788 /*
8789 * Now we are free to modify the group's share on each cpu
8790 * w/o tripping rebalance_share or load_balance_fair.
8791 */
8792 tg->shares = shares; 8504 tg->shares = shares;
8793 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8794 /* 8506 struct rq *rq = cpu_rq(i);
8795 * force a rebalance 8507 struct sched_entity *se;
8796 */ 8508
8797 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8509 se = tg->se[i];
8798 set_se_shares(tg->se[i], shares); 8510 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8799 } 8515 }
8800 8516
8801 /*
8802 * Enable load balance activity on this group, by inserting it back on
8803 * each cpu's rq->leaf_cfs_rq_list.
8804 */
8805 spin_lock_irqsave(&task_group_lock, flags);
8806 for_each_possible_cpu(i)
8807 register_fair_sched_group(tg, i);
8808 list_add_rcu(&tg->siblings, &tg->parent->children);
8809 spin_unlock_irqrestore(&task_group_lock, flags);
8810done: 8517done:
8811 mutex_unlock(&shares_mutex); 8518 mutex_unlock(&shares_mutex);
8812 return 0; 8519 return 0;
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..c80fedcd476b
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,238 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &init_task_group;
15 init_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30 sched_destroy_group(ag->tg);
31}
32
33static inline void autogroup_kref_put(struct autogroup *ag)
34{
35 kref_put(&ag->kref, autogroup_destroy);
36}
37
38static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
39{
40 kref_get(&ag->kref);
41 return ag;
42}
43
44static inline struct autogroup *autogroup_task_get(struct task_struct *p)
45{
46 struct autogroup *ag;
47 unsigned long flags;
48
49 if (!lock_task_sighand(p, &flags))
50 return autogroup_kref_get(&autogroup_default);
51
52 ag = autogroup_kref_get(p->signal->autogroup);
53 unlock_task_sighand(p, &flags);
54
55 return ag;
56}
57
58static inline struct autogroup *autogroup_create(void)
59{
60 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
61 struct task_group *tg;
62
63 if (!ag)
64 goto out_fail;
65
66 tg = sched_create_group(&init_task_group);
67
68 if (IS_ERR(tg))
69 goto out_free;
70
71 kref_init(&ag->kref);
72 init_rwsem(&ag->lock);
73 ag->id = atomic_inc_return(&autogroup_seq_nr);
74 ag->tg = tg;
75 tg->autogroup = ag;
76
77 return ag;
78
79out_free:
80 kfree(ag);
81out_fail:
82 if (printk_ratelimit()) {
83 printk(KERN_WARNING "autogroup_create: %s failure.\n",
84 ag ? "sched_create_group()" : "kmalloc()");
85 }
86
87 return autogroup_kref_get(&autogroup_default);
88}
89
90static inline bool
91task_wants_autogroup(struct task_struct *p, struct task_group *tg)
92{
93 if (tg != &root_task_group)
94 return false;
95
96 if (p->sched_class != &fair_sched_class)
97 return false;
98
99 /*
100 * We can only assume the task group can't go away on us if
101 * autogroup_move_group() can see us on ->thread_group list.
102 */
103 if (p->flags & PF_EXITING)
104 return false;
105
106 return true;
107}
108
109static inline struct task_group *
110autogroup_task_group(struct task_struct *p, struct task_group *tg)
111{
112 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
113
114 if (enabled && task_wants_autogroup(p, tg))
115 return p->signal->autogroup->tg;
116
117 return tg;
118}
119
120static void
121autogroup_move_group(struct task_struct *p, struct autogroup *ag)
122{
123 struct autogroup *prev;
124 struct task_struct *t;
125 unsigned long flags;
126
127 BUG_ON(!lock_task_sighand(p, &flags));
128
129 prev = p->signal->autogroup;
130 if (prev == ag) {
131 unlock_task_sighand(p, &flags);
132 return;
133 }
134
135 p->signal->autogroup = autogroup_kref_get(ag);
136
137 t = p;
138 do {
139 sched_move_task(t);
140 } while_each_thread(p, t);
141
142 unlock_task_sighand(p, &flags);
143 autogroup_kref_put(prev);
144}
145
146/* Allocates GFP_KERNEL, cannot be called under any spinlock */
147void sched_autogroup_create_attach(struct task_struct *p)
148{
149 struct autogroup *ag = autogroup_create();
150
151 autogroup_move_group(p, ag);
152 /* drop extra reference added by autogroup_create() */
153 autogroup_kref_put(ag);
154}
155EXPORT_SYMBOL(sched_autogroup_create_attach);
156
157/* Cannot be called under siglock. Currently has no users */
158void sched_autogroup_detach(struct task_struct *p)
159{
160 autogroup_move_group(p, &autogroup_default);
161}
162EXPORT_SYMBOL(sched_autogroup_detach);
163
164void sched_autogroup_fork(struct signal_struct *sig)
165{
166 sig->autogroup = autogroup_task_get(current);
167}
168
169void sched_autogroup_exit(struct signal_struct *sig)
170{
171 autogroup_kref_put(sig->autogroup);
172}
173
174static int __init setup_autogroup(char *str)
175{
176 sysctl_sched_autogroup_enabled = 0;
177
178 return 1;
179}
180
181__setup("noautogroup", setup_autogroup);
182
183#ifdef CONFIG_PROC_FS
184
185int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
186{
187 static unsigned long next = INITIAL_JIFFIES;
188 struct autogroup *ag;
189 int err;
190
191 if (*nice < -20 || *nice > 19)
192 return -EINVAL;
193
194 err = security_task_setnice(current, *nice);
195 if (err)
196 return err;
197
198 if (*nice < 0 && !can_nice(current, *nice))
199 return -EPERM;
200
201 /* this is a heavy operation taking global locks.. */
202 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
203 return -EAGAIN;
204
205 next = HZ / 10 + jiffies;
206 ag = autogroup_task_get(p);
207
208 down_write(&ag->lock);
209 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
210 if (!err)
211 ag->nice = *nice;
212 up_write(&ag->lock);
213
214 autogroup_kref_put(ag);
215
216 return err;
217}
218
219void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
220{
221 struct autogroup *ag = autogroup_task_get(p);
222
223 down_read(&ag->lock);
224 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
225 up_read(&ag->lock);
226
227 autogroup_kref_put(ag);
228}
229#endif /* CONFIG_PROC_FS */
230
231#ifdef CONFIG_SCHED_DEBUG
232static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
233{
234 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
235}
236#endif /* CONFIG_SCHED_DEBUG */
237
238#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..5358e241cb20
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 struct kref kref;
5 struct task_group *tg;
6 struct rw_semaphore lock;
7 unsigned long id;
8 int nice;
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { }
18
19static inline struct task_group *
20autogroup_task_group(struct task_struct *p, struct task_group *tg)
21{
22 return tg;
23}
24
25#ifdef CONFIG_SCHED_DEBUG
26static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
27{
28 return 0;
29}
30#endif
31
32#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..1dfae3d014b5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED 56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 57static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 58{
60 struct sched_entity *se = tg->se[cpu]; 59 struct sched_entity *se = tg->se[cpu];
61 if (!se) 60 if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 110#endif
112 111
113#ifdef CONFIG_CGROUP_SCHED
114 {
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif
123 SEQ_printf(m, "\n"); 112 SEQ_printf(m, "\n");
124} 113}
125 114
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 136 read_unlock_irqrestore(&tasklist_lock, flags);
148} 137}
149 138
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 139void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 140{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 141 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 144 struct sched_entity *last;
169 unsigned long flags; 145 unsigned long flags;
170 146
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
172 char path[128];
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif
181 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
182 SPLIT_NS(cfs_rq->exec_clock)); 149 SPLIT_NS(cfs_rq->exec_clock));
183 150
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 169 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 170 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 171 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 172 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 173 cfs_rq->nr_spread_over);
174 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
175 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 176#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 178 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
179 SPLIT_NS(cfs_rq->load_avg));
180 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
181 SPLIT_NS(cfs_rq->load_period));
182 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
183 cfs_rq->load_contribution);
184 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
185 atomic_read(&cfs_rq->tg->load_weight));
213#endif 186#endif
187
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 188 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 189#endif
216} 190}
217 191
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 193{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
221 char path[128];
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif
230
231 195
232#define P(x) \ 196#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 207#undef P
244} 208}
245 209
210extern __read_mostly int sched_clock_running;
211
246static void print_cpu(struct seq_file *m, int cpu) 212static void print_cpu(struct seq_file *m, int cpu)
247{ 213{
248 struct rq *rq = cpu_rq(cpu); 214 struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
314 280
315static int sched_debug_show(struct seq_file *m, void *v) 281static int sched_debug_show(struct seq_file *m, void *v)
316{ 282{
317 u64 now = ktime_to_ns(ktime_get()); 283 u64 ktime, sched_clk, cpu_clk;
284 unsigned long flags;
318 int cpu; 285 int cpu;
319 286
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 287 local_irq_save(flags);
288 ktime = ktime_to_ns(ktime_get());
289 sched_clk = sched_clock();
290 cpu_clk = local_clock();
291 local_irq_restore(flags);
292
293 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 294 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 295 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 296 init_utsname()->version);
324 297
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 298#define P(x) \
299 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
300#define PN(x) \
301 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
302 PN(ktime);
303 PN(sched_clk);
304 PN(cpu_clk);
305 P(jiffies);
306#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
307 P(sched_clock_stable);
308#endif
309#undef PN
310#undef P
311
312 SEQ_printf(m, "\n");
313 SEQ_printf(m, "sysctl_sched\n");
326 314
327#define P(x) \ 315#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 316 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 317#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 318 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 319 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 320 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 321 PN(sysctl_sched_wakeup_granularity);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd7686676..c62ebae65cf0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate through all leaf cfs_rq's on a runqueue */ 183/* Iterate through all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,10 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567#endif
517} 568}
518 569
519static void update_curr(struct cfs_rq *cfs_rq) 570static void update_curr(struct cfs_rq *cfs_rq)
@@ -633,7 +684,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 684 list_add(&se->group_node, &cfs_rq->tasks);
634 } 685 }
635 cfs_rq->nr_running++; 686 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 687}
638 688
639static void 689static void
@@ -647,9 +697,140 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 697 list_del_init(&se->group_node);
648 } 698 }
649 cfs_rq->nr_running--; 699 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 700}
652 701
702#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
703static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
704 int global_update)
705{
706 struct task_group *tg = cfs_rq->tg;
707 long load_avg;
708
709 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
710 load_avg -= cfs_rq->load_contribution;
711
712 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
713 atomic_add(load_avg, &tg->load_weight);
714 cfs_rq->load_contribution += load_avg;
715 }
716}
717
718static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
719{
720 u64 period = sysctl_sched_shares_window;
721 u64 now, delta;
722 unsigned long load = cfs_rq->load.weight;
723
724 if (!cfs_rq)
725 return;
726
727 now = rq_of(cfs_rq)->clock;
728 delta = now - cfs_rq->load_stamp;
729
730 /* truncate load history at 4 idle periods */
731 if (cfs_rq->load_stamp > cfs_rq->load_last &&
732 now - cfs_rq->load_last > 4 * period) {
733 cfs_rq->load_period = 0;
734 cfs_rq->load_avg = 0;
735 }
736
737 cfs_rq->load_stamp = now;
738 cfs_rq->load_unacc_exec_time = 0;
739 cfs_rq->load_period += delta;
740 if (load) {
741 cfs_rq->load_last = now;
742 cfs_rq->load_avg += delta * load;
743 }
744
745 /* consider updating load contribution on each fold or truncate */
746 if (global_update || cfs_rq->load_period > period
747 || !cfs_rq->load_period)
748 update_cfs_rq_load_contribution(cfs_rq, global_update);
749
750 while (cfs_rq->load_period > period) {
751 /*
752 * Inline assembly required to prevent the compiler
753 * optimising this loop into a divmod call.
754 * See __iter_div_u64_rem() for another example of this.
755 */
756 asm("" : "+rm" (cfs_rq->load_period));
757 cfs_rq->load_period /= 2;
758 cfs_rq->load_avg /= 2;
759 }
760
761 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
762 list_del_leaf_cfs_rq(cfs_rq);
763}
764
765static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
766 unsigned long weight)
767{
768 if (se->on_rq) {
769 /* commit outstanding execution time */
770 if (cfs_rq->curr == se)
771 update_curr(cfs_rq);
772 account_entity_dequeue(cfs_rq, se);
773 }
774
775 update_load_set(&se->load, weight);
776
777 if (se->on_rq)
778 account_entity_enqueue(cfs_rq, se);
779}
780
781static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
782{
783 struct task_group *tg;
784 struct sched_entity *se;
785 long load_weight, load, shares;
786
787 if (!cfs_rq)
788 return;
789
790 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se)
793 return;
794
795 load = cfs_rq->load.weight + weight_delta;
796
797 load_weight = atomic_read(&tg->load_weight);
798 load_weight -= cfs_rq->load_contribution;
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809
810 reweight_entity(cfs_rq_of(se), se, shares);
811}
812
813static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
814{
815 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
816 update_cfs_load(cfs_rq, 0);
817 update_cfs_shares(cfs_rq, 0);
818 }
819}
820#else /* CONFIG_FAIR_GROUP_SCHED */
/*
 * Empty stand-ins used when the load/shares tracking above is compiled out,
 * so callers need no conditional compilation of their own.
 */
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) { }

static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta) { }

static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) { }
832#endif /* CONFIG_FAIR_GROUP_SCHED */
833
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 834static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 835{
655#ifdef CONFIG_SCHEDSTATS 836#ifdef CONFIG_SCHEDSTATS
@@ -771,6 +952,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 952 * Update run-time statistics of the 'current'.
772 */ 953 */
773 update_curr(cfs_rq); 954 update_curr(cfs_rq);
955 update_cfs_load(cfs_rq, 0);
956 update_cfs_shares(cfs_rq, se->load.weight);
774 account_entity_enqueue(cfs_rq, se); 957 account_entity_enqueue(cfs_rq, se);
775 958
776 if (flags & ENQUEUE_WAKEUP) { 959 if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +965,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 965 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 966 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 967 __enqueue_entity(cfs_rq, se);
968 se->on_rq = 1;
969
970 if (cfs_rq->nr_running == 1)
971 list_add_leaf_cfs_rq(cfs_rq);
785} 972}
786 973
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 974static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1012,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1012
826 if (se != cfs_rq->curr) 1013 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1014 __dequeue_entity(cfs_rq, se);
1015 se->on_rq = 0;
1016 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1017 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq); 1018 update_min_vruntime(cfs_rq);
1019 update_cfs_shares(cfs_rq, 0);
830 1020
831 /* 1021 /*
832 * Normalize the entity after updating the min_vruntime because the 1022 * Normalize the entity after updating the min_vruntime because the
@@ -955,6 +1145,11 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
955 */ 1145 */
956 update_curr(cfs_rq); 1146 update_curr(cfs_rq);
957 1147
1148 /*
1149 * Update share accounting for long-running entities.
1150 */
1151 update_entity_shares_tick(cfs_rq);
1152
958#ifdef CONFIG_SCHED_HRTICK 1153#ifdef CONFIG_SCHED_HRTICK
959 /* 1154 /*
960 * queued ticks are scheduled to match the slice, so don't bother 1155 * queued ticks are scheduled to match the slice, so don't bother
@@ -1055,6 +1250,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1250 flags = ENQUEUE_WAKEUP;
1056 } 1251 }
1057 1252
1253 for_each_sched_entity(se) {
1254 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1255
1256 update_cfs_load(cfs_rq, 0);
1257 update_cfs_shares(cfs_rq, 0);
1258 }
1259
1058 hrtick_update(rq); 1260 hrtick_update(rq);
1059} 1261}
1060 1262
@@ -1071,12 +1273,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1071 for_each_sched_entity(se) { 1273 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1274 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1275 dequeue_entity(cfs_rq, se, flags);
1276
1074 /* Don't dequeue parent if it has other entities besides us */ 1277 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1278 if (cfs_rq->load.weight)
1076 break; 1279 break;
1077 flags |= DEQUEUE_SLEEP; 1280 flags |= DEQUEUE_SLEEP;
1078 } 1281 }
1079 1282
1283 for_each_sched_entity(se) {
1284 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1285
1286 update_cfs_load(cfs_rq, 0);
1287 update_cfs_shares(cfs_rq, 0);
1288 }
1289
1080 hrtick_update(rq); 1290 hrtick_update(rq);
1081} 1291}
1082 1292
@@ -1143,51 +1353,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1353 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1354 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1355 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1356 */
1161static long effective_load(struct task_group *tg, int cpu, 1357static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1358{
1164 struct sched_entity *se = tg->se[cpu]; 1359 struct sched_entity *se = tg->se[cpu];
1165 1360
1166 if (!tg->parent) 1361 if (!tg->parent)
1167 return wl; 1362 return wl;
1168 1363
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1364 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1365 long S, rw, s, a, b;
1178 long more_w;
1179
1180 /*
1181 * Instead of using this increment, also add the difference
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1366
1188 S = se->my_q->tg->shares; 1367 S = se->my_q->tg->shares;
1189 s = se->my_q->shares; 1368 s = se->load.weight;
1190 rw = se->my_q->rq_weight; 1369 rw = se->my_q->load.weight;
1191 1370
1192 a = S*(rw + wl); 1371 a = S*(rw + wl);
1193 b = S*rw + s*wg; 1372 b = S*rw + s*wg;
@@ -1508,23 +1687,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1687 sd = tmp;
1509 } 1688 }
1510 1689
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1690 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1691 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1692 return select_idle_sibling(p, cpu);
@@ -1909,6 +2071,48 @@ out:
1909} 2071}
1910 2072
1911#ifdef CONFIG_FAIR_GROUP_SCHED 2073#ifdef CONFIG_FAIR_GROUP_SCHED
2074/*
2075 * update tg->load_weight by folding this cpu's load_avg
2076 */
2077static int update_shares_cpu(struct task_group *tg, int cpu)
2078{
2079 struct cfs_rq *cfs_rq;
2080 unsigned long flags;
2081 struct rq *rq;
2082
2083 if (!tg->se[cpu])
2084 return 0;
2085
2086 rq = cpu_rq(cpu);
2087 cfs_rq = tg->cfs_rq[cpu];
2088
2089 raw_spin_lock_irqsave(&rq->lock, flags);
2090
2091 update_rq_clock(rq);
2092 update_cfs_load(cfs_rq, 1);
2093
2094 /*
2095 * We need to update shares after updating tg->load_weight in
2096 * order to adjust the weight of groups with long running tasks.
2097 */
2098 update_cfs_shares(cfs_rq, 0);
2099
2100 raw_spin_unlock_irqrestore(&rq->lock, flags);
2101
2102 return 0;
2103}
2104
2105static void update_shares(int cpu)
2106{
2107 struct cfs_rq *cfs_rq;
2108 struct rq *rq = cpu_rq(cpu);
2109
2110 rcu_read_lock();
2111 for_each_leaf_cfs_rq(rq, cfs_rq)
2112 update_shares_cpu(cfs_rq->tg, cpu);
2113 rcu_read_unlock();
2114}
2115
1912static unsigned long 2116static unsigned long
1913load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2117load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1914 unsigned long max_load_move, 2118 unsigned long max_load_move,
@@ -1956,6 +2160,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1956 return max_load_move - rem_load_move; 2160 return max_load_move - rem_load_move;
1957} 2161}
1958#else 2162#else
/* No-op variant of update_shares() for this configuration. */
static inline void update_shares(int cpu) { }
2166
1959static unsigned long 2167static unsigned long
1960load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2168load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1961 unsigned long max_load_move, 2169 unsigned long max_load_move,
@@ -3032,7 +3240,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3032 schedstat_inc(sd, lb_count[idle]); 3240 schedstat_inc(sd, lb_count[idle]);
3033 3241
3034redo: 3242redo:
3035 update_shares(sd);
3036 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3243 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3037 cpus, balance); 3244 cpus, balance);
3038 3245
@@ -3174,8 +3381,6 @@ out_one_pinned:
3174 else 3381 else
3175 ld_moved = 0; 3382 ld_moved = 0;
3176out: 3383out:
3177 if (ld_moved)
3178 update_shares(sd);
3179 return ld_moved; 3384 return ld_moved;
3180} 3385}
3181 3386
@@ -3199,6 +3404,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3199 */ 3404 */
3200 raw_spin_unlock(&this_rq->lock); 3405 raw_spin_unlock(&this_rq->lock);
3201 3406
3407 update_shares(this_cpu);
3202 for_each_domain(this_cpu, sd) { 3408 for_each_domain(this_cpu, sd) {
3203 unsigned long interval; 3409 unsigned long interval;
3204 int balance = 1; 3410 int balance = 1;
@@ -3569,6 +3775,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3569 int update_next_balance = 0; 3775 int update_next_balance = 0;
3570 int need_serialize; 3776 int need_serialize;
3571 3777
3778 update_shares(cpu);
3779
3572 for_each_domain(cpu, sd) { 3780 for_each_domain(cpu, sd) {
3573 if (!(sd->flags & SD_LOAD_BALANCE)) 3781 if (!(sd->flags & SD_LOAD_BALANCE))
3574 continue; 3782 continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..c914ec747ca6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list,
189 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
190}
191
192static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
193{
194 list_del_rcu(&rt_rq->leaf_rt_rq_list);
195}
196
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 197#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 198 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 199
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 287 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 288}
278 289
/* No-op variants of the rt leaf-list helpers for this configuration. */
static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) { }

static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) { }
297
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 298#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 299 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 300
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 844 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 845 return;
827 846
847 if (!rt_rq->rt_nr_running)
848 list_add_leaf_rt_rq(rt_rq);
849
828 if (head) 850 if (head)
829 list_add(&rt_se->run_list, queue); 851 list_add(&rt_se->run_list, queue);
830 else 852 else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 866 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 867
846 dec_rt_tasks(rt_se, rt_rq); 868 dec_rt_tasks(rt_se, rt_rq);
869 if (!rt_rq->rt_nr_running)
870 list_del_leaf_rt_rq(rt_rq);
847} 871}
848 872
849/* 873/*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..d4d918a91881 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
853 cpumask_any(cpu_online_mask)); 853 cpumask_any(cpu_online_mask));
854 case CPU_DEAD: 854 case CPU_DEAD:
855 case CPU_DEAD_FROZEN: { 855 case CPU_DEAD_FROZEN: {
856 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 856 static struct sched_param param = {
857 .sched_priority = MAX_RT_PRIO-1
858 };
857 859
858 p = per_cpu(ksoftirqd, hotcpu); 860 p = per_cpu(ksoftirqd, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = NULL; 861 per_cpu(ksoftirqd, hotcpu) = NULL;
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..2745dcdb6c6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1080 err = session;
1081out: 1081out:
1082 write_unlock_irq(&tasklist_lock); 1082 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1083 if (err > 0) {
1084 proc_sid_connector(group_leader); 1084 proc_sid_connector(group_leader);
1085 sched_autogroup_create_attach(group_leader);
1086 }
1085 return err; 1087 return err;
1086} 1088}
1087 1089
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 46404414d8a7..ae5cbb1e3ced 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262static int min_sched_shares_ratelimit = 100000; /* 100 usec */
263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
264#endif 262#endif
265 263
266#ifdef CONFIG_COMPACTION 264#ifdef CONFIG_COMPACTION
@@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = {
305 .extra2 = &max_wakeup_granularity_ns, 303 .extra2 = &max_wakeup_granularity_ns,
306 }, 304 },
307 { 305 {
308 .procname = "sched_shares_ratelimit",
309 .data = &sysctl_sched_shares_ratelimit,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = sched_proc_update_handler,
313 .extra1 = &min_sched_shares_ratelimit,
314 .extra2 = &max_sched_shares_ratelimit,
315 },
316 {
317 .procname = "sched_tunable_scaling", 306 .procname = "sched_tunable_scaling",
318 .data = &sysctl_sched_tunable_scaling, 307 .data = &sysctl_sched_tunable_scaling,
319 .maxlen = sizeof(enum sched_tunable_scaling), 308 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = {
323 .extra2 = &max_sched_tunable_scaling, 312 .extra2 = &max_sched_tunable_scaling,
324 }, 313 },
325 { 314 {
326 .procname = "sched_shares_thresh",
327 .data = &sysctl_sched_shares_thresh,
328 .maxlen = sizeof(unsigned int),
329 .mode = 0644,
330 .proc_handler = proc_dointvec_minmax,
331 .extra1 = &zero,
332 },
333 {
334 .procname = "sched_migration_cost", 315 .procname = "sched_migration_cost",
335 .data = &sysctl_sched_migration_cost, 316 .data = &sysctl_sched_migration_cost,
336 .maxlen = sizeof(unsigned int), 317 .maxlen = sizeof(unsigned int),
@@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = {
352 .proc_handler = proc_dointvec, 333 .proc_handler = proc_dointvec,
353 }, 334 },
354 { 335 {
336 .procname = "sched_shares_window",
337 .data = &sysctl_sched_shares_window,
338 .maxlen = sizeof(unsigned int),
339 .mode = 0644,
340 .proc_handler = proc_dointvec,
341 },
342 {
355 .procname = "timer_migration", 343 .procname = "timer_migration",
356 .data = &sysctl_timer_migration, 344 .data = &sysctl_timer_migration,
357 .maxlen = sizeof(unsigned int), 345 .maxlen = sizeof(unsigned int),
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
382 .mode = 0644, 370 .mode = 0644,
383 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
384 }, 372 },
#ifdef CONFIG_SCHED_AUTOGROUP
	/*
	 * On/off switch for autogroup.  The minmax handler is required here:
	 * plain proc_dointvec() does not look at extra1/extra2, so the 0..1
	 * bounds below would otherwise never be enforced on writes.
	 */
	{
		.procname	= "sched_autogroup_enabled",
		.data		= &sysctl_sched_autogroup_enabled,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &zero,
		.extra2		= &one,
	},
#endif
385#ifdef CONFIG_PROVE_LOCKING 384#ifdef CONFIG_PROVE_LOCKING
386 { 385 {
387 .procname = "prove_locking", 386 .procname = "prove_locking",
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..562c56e048fd 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 558static int trace_wakeup_test_thread(void *data)
559{ 559{
560 /* Make this a RT thread, doesn't need to be too high */ 560 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 561 static struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 562 struct completion *x = data;
563 563
564 sched_setscheduler(current, SCHED_FIFO, &param); 564 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index aaa8dae08236..6e7b575ac33c 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -309,7 +309,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
309 */ 309 */
310static int watchdog(void *unused) 310static int watchdog(void *unused)
311{ 311{
312 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 312 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
313 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 313 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
314 314
315 sched_setscheduler(current, SCHED_FIFO, &param); 315 sched_setscheduler(current, SCHED_FIFO, &param);