path: root/kernel
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/cpu.c                  |  18
-rw-r--r-- kernel/fork.c                 |   5
-rw-r--r-- kernel/irq/manage.c           |   4
-rw-r--r-- kernel/kthread.c              |   2
-rw-r--r-- kernel/mutex.c                |   2
-rw-r--r-- kernel/printk.c               |   8
-rw-r--r-- kernel/sched.c                | 561
-rw-r--r-- kernel/sched_autogroup.c      | 229
-rw-r--r-- kernel/sched_autogroup.h      |  32
-rw-r--r-- kernel/sched_clock.c          |   2
-rw-r--r-- kernel/sched_debug.c          |  91
-rw-r--r-- kernel/sched_fair.c           | 305
-rw-r--r-- kernel/sched_features.h       |   2
-rw-r--r-- kernel/sched_rt.c             |  24
-rw-r--r-- kernel/softirq.c              |   4
-rw-r--r-- kernel/sys.c                  |   4
-rw-r--r-- kernel/sysctl.c               |  37
-rw-r--r-- kernel/trace/trace_selftest.c |   2
-rw-r--r-- kernel/watchdog.c             |   2
19 files changed, 756 insertions(+), 578 deletions(-)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f6e726f18491..cb7a1efa9c2b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -189,7 +189,6 @@ static inline void check_for_tasks(int cpu)
189} 189}
190 190
191struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
193 unsigned long mod; 192 unsigned long mod;
194 void *hcpu; 193 void *hcpu;
195}; 194};
@@ -198,7 +197,6 @@ struct take_cpu_down_param {
198static int __ref take_cpu_down(void *_param) 197static int __ref take_cpu_down(void *_param)
199{ 198{
200 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
202 int err; 200 int err;
203 201
204 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -208,11 +206,6 @@ static int __ref take_cpu_down(void *_param)
208 206
209 cpu_notify(CPU_DYING | param->mod, param->hcpu); 207 cpu_notify(CPU_DYING | param->mod, param->hcpu);
210 208
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
213 /* Force idle task to run as soon as we yield: it should
214 immediately notice cpu is offline and die quickly. */
215 sched_idle_next();
216 return 0; 209 return 0;
217} 210}
218 211
@@ -223,7 +216,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 void *hcpu = (void *)(long)cpu; 216 void *hcpu = (void *)(long)cpu;
224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 217 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
225 struct take_cpu_down_param tcd_param = { 218 struct take_cpu_down_param tcd_param = {
226 .caller = current,
227 .mod = mod, 219 .mod = mod,
228 .hcpu = hcpu, 220 .hcpu = hcpu,
229 }; 221 };
@@ -253,9 +245,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
253 } 245 }
254 BUG_ON(cpu_online(cpu)); 246 BUG_ON(cpu_online(cpu));
255 247
256 /* Wait for it to sleep (leaving idle task). */ 248 /*
249 * The migration_call() CPU_DYING callback will have removed all
250 * runnable tasks from the cpu, there's only the idle task left now
251 * that the migration thread is done doing the stop_machine thing.
252 *
253 * Wait for the stop thread to go away.
254 */
257 while (!idle_cpu(cpu)) 255 while (!idle_cpu(cpu))
258 yield(); 256 cpu_relax();
259 257
260 /* This actually kills the CPU. */ 258 /* This actually kills the CPU. */
261 __cpu_die(cpu); 259 __cpu_die(cpu);
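The hunks above drop the manual task eviction from take_cpu_down(): the CPU_DYING notifier (migration_call(), reworked later in this diff) now empties the dying runqueue, so _cpu_down() only spins with cpu_relax() until the idle task is the sole runner. A standalone sketch of that ordering, with stub names, not part of the patch:

/* Sketch only: stub names stand in for the kernel machinery. */
#include <stdio.h>
#include <stdbool.h>

static int nr_running = 3;                 /* pretend 3 tasks sit on the dying cpu */

static void cpu_dying_notifier(void)       /* stand-in for migration_call(CPU_DYING) */
{
	nr_running = 1;                    /* migrate everything except the idle task */
}

static bool idle_cpu(int cpu)              /* "is only idle left?" */
{
	(void)cpu;
	return nr_running == 1;
}

static void cpu_relax(void) { }            /* busy-wait hint; a no-op here */

int main(void)
{
	int cpu = 1;

	cpu_dying_notifier();              /* runs inside stop_machine() */

	while (!idle_cpu(cpu))             /* the wait loop _cpu_down() now uses */
		cpu_relax();

	printf("cpu%d: only the idle task remains, safe to __cpu_die()\n", cpu);
	return 0;
}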
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b159c5991b7..b6f2475f1e83 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -174,8 +174,10 @@ static inline void free_signal_struct(struct signal_struct *sig)
174 174
175static inline void put_signal_struct(struct signal_struct *sig) 175static inline void put_signal_struct(struct signal_struct *sig)
176{ 176{
177 if (atomic_dec_and_test(&sig->sigcnt)) 177 if (atomic_dec_and_test(&sig->sigcnt)) {
178 sched_autogroup_exit(sig);
178 free_signal_struct(sig); 179 free_signal_struct(sig);
180 }
179} 181}
180 182
181void __put_task_struct(struct task_struct *tsk) 183void __put_task_struct(struct task_struct *tsk)
@@ -904,6 +906,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
904 posix_cpu_timers_init_group(sig); 906 posix_cpu_timers_init_group(sig);
905 907
906 tty_audit_fork(sig); 908 tty_audit_fork(sig);
909 sched_autogroup_fork(sig);
907 910
908 sig->oom_adj = current->signal->oom_adj; 911 sig->oom_adj = current->signal->oom_adj;
909 sig->oom_score_adj = current->signal->oom_score_adj; 912 sig->oom_score_adj = current->signal->oom_score_adj;
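The put_signal_struct() hunk establishes an ordering: on the final reference drop, the autogroup reference is released before the signal struct is freed. A standalone userspace sketch of that final-put ordering, with illustrative names only:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct sig {
	atomic_int cnt;
	void *autogroup;                         /* stand-in for sig->autogroup */
};

static void autogroup_exit(struct sig *s)        /* stand-in for sched_autogroup_exit() */
{
	printf("dropping autogroup reference %p\n", s->autogroup);
}

static void put_sig(struct sig *s)
{
	if (atomic_fetch_sub(&s->cnt, 1) == 1) { /* last reference is gone */
		autogroup_exit(s);               /* must run before the free */
		free(s);
	}
}

int main(void)
{
	struct sig *s = malloc(sizeof(*s));

	atomic_init(&s->cnt, 2);
	s->autogroup = (void *)0x1;

	put_sig(s);   /* one reference still held, nothing happens */
	put_sig(s);   /* final put: autogroup_exit() then free() */
	return 0;
}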
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5f92acc5f952..91a5fa25054e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -577,7 +577,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
577 */ 577 */
578static int irq_thread(void *data) 578static int irq_thread(void *data)
579{ 579{
580 struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, }; 580 static struct sched_param param = {
581 .sched_priority = MAX_USER_RT_PRIO/2,
582 };
581 struct irqaction *action = data; 583 struct irqaction *action = data;
582 struct irq_desc *desc = irq_to_desc(action->irq); 584 struct irq_desc *desc = irq_to_desc(action->irq);
583 int wake, oneshot = desc->status & IRQ_ONESHOT; 585 int wake, oneshot = desc->status & IRQ_ONESHOT;
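Making the sched_param static keeps it off the irq thread's kernel stack; sharing one instance is safe because __sched_setscheduler() now takes a const pointer (see the prototype change later in this diff) and never writes through it. A standalone sketch of the pattern, with illustrative names:

#include <stdio.h>

struct sched_param_demo { int sched_priority; };

static int set_scheduler(const struct sched_param_demo *param)  /* const: never modified */
{
	printf("priority set to %d\n", param->sched_priority);
	return 0;
}

static int thread_fn(void *data)
{
	/* one read-only instance shared by every thread, not a per-call stack copy */
	static const struct sched_param_demo param = { .sched_priority = 50 };

	(void)data;
	return set_scheduler(&param);
}

int main(void)
{
	return thread_fn(NULL);
}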
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 2dc3786349d1..74cf6f5e7ade 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -148,7 +148,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
148 wait_for_completion(&create.done); 148 wait_for_completion(&create.done);
149 149
150 if (!IS_ERR(create.result)) { 150 if (!IS_ERR(create.result)) {
151 struct sched_param param = { .sched_priority = 0 }; 151 static struct sched_param param = { .sched_priority = 0 };
152 va_list args; 152 va_list args;
153 153
154 va_start(args, namefmt); 154 va_start(args, namefmt);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 200407c1502f..a5889fb28ecf 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -199,7 +199,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
199 * memory barriers as we'll eventually observe the right 199 * memory barriers as we'll eventually observe the right
200 * values at the cost of a few extra spins. 200 * values at the cost of a few extra spins.
201 */ 201 */
202 cpu_relax(); 202 arch_mutex_cpu_relax();
203 } 203 }
204#endif 204#endif
205 spin_lock_mutex(&lock->wait_lock, flags); 205 spin_lock_mutex(&lock->wait_lock, flags);
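arch_mutex_cpu_relax() lets an architecture substitute its own relax hint in the mutex spin loops while everyone else keeps cpu_relax(). A standalone sketch of the override-with-default pattern; the macro name mirrors the patch, the bodies are stubs:

#include <stdio.h>

static inline void cpu_relax(void) { }        /* generic busy-wait hint (stub) */

#ifndef arch_mutex_cpu_relax
# define arch_mutex_cpu_relax() cpu_relax()   /* default: identical to cpu_relax() */
#endif

int main(void)
{
	int spins = 0;

	while (spins++ < 3)                   /* stand-in for the owner-spin loop */
		arch_mutex_cpu_relax();

	printf("spun %d times\n", spins - 1);
	return 0;
}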
diff --git a/kernel/printk.c b/kernel/printk.c
index a23315dc4498..ab3ffc5b3b64 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1074,17 +1074,17 @@ static DEFINE_PER_CPU(int, printk_pending);
1074 1074
1075void printk_tick(void) 1075void printk_tick(void)
1076{ 1076{
1077 if (__get_cpu_var(printk_pending)) { 1077 if (__this_cpu_read(printk_pending)) {
1078 __get_cpu_var(printk_pending) = 0; 1078 __this_cpu_write(printk_pending, 0);
1079 wake_up_interruptible(&log_wait); 1079 wake_up_interruptible(&log_wait);
1080 } 1080 }
1081} 1081}
1082 1082
1083int printk_needs_cpu(int cpu) 1083int printk_needs_cpu(int cpu)
1084{ 1084{
1085 if (unlikely(cpu_is_offline(cpu))) 1085 if (cpu_is_offline(cpu))
1086 printk_tick(); 1086 printk_tick();
1087 return per_cpu(printk_pending, cpu); 1087 return __this_cpu_read(printk_pending);
1088} 1088}
1089 1089
1090void wake_up_klogd(void) 1090void wake_up_klogd(void)
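The printk hunk switches from taking the address of this CPU's variable (__get_cpu_var) to the __this_cpu_read()/__this_cpu_write() accessors, which operate on the local instance directly. A userspace emulation of that access pattern, not the kernel per-cpu API:

#include <stdio.h>

#define NR_CPUS 4

static int printk_pending[NR_CPUS];           /* one slot per cpu */
static int this_cpu = 1;                      /* pretend we run on cpu 1 */

static int this_cpu_read(int *pcp)            { return pcp[this_cpu]; }
static void this_cpu_write(int *pcp, int val) { pcp[this_cpu] = val; }

static void printk_tick(void)
{
	if (this_cpu_read(printk_pending)) {  /* mirrors __this_cpu_read() */
		this_cpu_write(printk_pending, 0);
		printf("waking klogd\n");
	}
}

int main(void)
{
	printk_pending[this_cpu] = 1;
	printk_tick();                        /* prints once */
	printk_tick();                        /* pending already cleared */
	return 0;
}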
diff --git a/kernel/sched.c b/kernel/sched.c
index dc91a4d09ac3..3925a1bbf5dd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
275
276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
271}; 279};
272 280
273#define root_task_group init_task_group 281#define root_task_group init_task_group
274 282
275/* task_group_lock serializes add/remove of task groups and also changes to 283/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 284static DEFINE_SPINLOCK(task_group_lock);
279 285
280#ifdef CONFIG_FAIR_GROUP_SCHED 286#ifdef CONFIG_FAIR_GROUP_SCHED
281 287
282#ifdef CONFIG_SMP
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 289
291/* 290/*
@@ -342,6 +341,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 342 * list is used during load balance.
344 */ 343 */
344 int on_list;
345 struct list_head leaf_cfs_rq_list; 345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 346 struct task_group *tg; /* group that "owns" this runqueue */
347 347
@@ -360,14 +360,17 @@ struct cfs_rq {
360 unsigned long h_load; 360 unsigned long h_load;
361 361
362 /* 362 /*
363 * this cpu's part of tg->shares 363 * Maintaining per-cpu shares distribution for group scheduling
364 *
365 * load_stamp is the last time we updated the load average
366 * load_last is the last time we updated the load average and saw load
367 * load_unacc_exec_time is currently unaccounted execution time
364 */ 368 */
365 unsigned long shares; 369 u64 load_avg;
370 u64 load_period;
371 u64 load_stamp, load_last, load_unacc_exec_time;
366 372
367 /* 373 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 374#endif
372#endif 375#endif
373}; 376};
@@ -605,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
605 */ 608 */
606static inline struct task_group *task_group(struct task_struct *p) 609static inline struct task_group *task_group(struct task_struct *p)
607{ 610{
611 struct task_group *tg;
608 struct cgroup_subsys_state *css; 612 struct cgroup_subsys_state *css;
609 613
610 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 614 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
611 lockdep_is_held(&task_rq(p)->lock)); 615 lockdep_is_held(&task_rq(p)->lock));
612 return container_of(css, struct task_group, css); 616 tg = container_of(css, struct task_group, css);
617
618 return autogroup_task_group(p, tg);
613} 619}
614 620
615/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 621/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -797,20 +803,6 @@ late_initcall(sched_init_debug);
797const_debug unsigned int sysctl_sched_nr_migrate = 32; 803const_debug unsigned int sysctl_sched_nr_migrate = 32;
798 804
799/* 805/*
800 * ratelimit for updating the group shares.
801 * default: 0.25ms
802 */
803unsigned int sysctl_sched_shares_ratelimit = 250000;
804unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
805
806/*
807 * Inject some fuzzyness into changing the per-cpu group shares
808 * this avoids remote rq-locks at the expense of fairness.
809 * default: 4
810 */
811unsigned int sysctl_sched_shares_thresh = 4;
812
813/*
814 * period over which we average the RT time consumption, measured 806 * period over which we average the RT time consumption, measured
815 * in ms. 807 * in ms.
816 * 808 *
@@ -1359,6 +1351,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1359 lw->inv_weight = 0; 1351 lw->inv_weight = 0;
1360} 1352}
1361 1353
1354static inline void update_load_set(struct load_weight *lw, unsigned long w)
1355{
1356 lw->weight = w;
1357 lw->inv_weight = 0;
1358}
1359
1362/* 1360/*
1363 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1361 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1364 * of tasks with abnormal "nice" values across CPUs the contribution that 1362 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1547,101 +1545,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1547 1545
1548#ifdef CONFIG_FAIR_GROUP_SCHED 1546#ifdef CONFIG_FAIR_GROUP_SCHED
1549 1547
1550static __read_mostly unsigned long __percpu *update_shares_data;
1551
1552static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1553
1554/*
1555 * Calculate and set the cpu's group shares.
1556 */
1557static void update_group_shares_cpu(struct task_group *tg, int cpu,
1558 unsigned long sd_shares,
1559 unsigned long sd_rq_weight,
1560 unsigned long *usd_rq_weight)
1561{
1562 unsigned long shares, rq_weight;
1563 int boost = 0;
1564
1565 rq_weight = usd_rq_weight[cpu];
1566 if (!rq_weight) {
1567 boost = 1;
1568 rq_weight = NICE_0_LOAD;
1569 }
1570
1571 /*
1572 * \Sum_j shares_j * rq_weight_i
1573 * shares_i = -----------------------------
1574 * \Sum_j rq_weight_j
1575 */
1576 shares = (sd_shares * rq_weight) / sd_rq_weight;
1577 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1578
1579 if (abs(shares - tg->se[cpu]->load.weight) >
1580 sysctl_sched_shares_thresh) {
1581 struct rq *rq = cpu_rq(cpu);
1582 unsigned long flags;
1583
1584 raw_spin_lock_irqsave(&rq->lock, flags);
1585 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1586 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1587 __set_se_shares(tg->se[cpu], shares);
1588 raw_spin_unlock_irqrestore(&rq->lock, flags);
1589 }
1590}
1591
1592/*
1593 * Re-compute the task group their per cpu shares over the given domain.
1594 * This needs to be done in a bottom-up fashion because the rq weight of a
1595 * parent group depends on the shares of its child groups.
1596 */
1597static int tg_shares_up(struct task_group *tg, void *data)
1598{
1599 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1600 unsigned long *usd_rq_weight;
1601 struct sched_domain *sd = data;
1602 unsigned long flags;
1603 int i;
1604
1605 if (!tg->se[0])
1606 return 0;
1607
1608 local_irq_save(flags);
1609 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1610
1611 for_each_cpu(i, sched_domain_span(sd)) {
1612 weight = tg->cfs_rq[i]->load.weight;
1613 usd_rq_weight[i] = weight;
1614
1615 rq_weight += weight;
1616 /*
1617 * If there are currently no tasks on the cpu pretend there
1618 * is one of average load so that when a new task gets to
1619 * run here it will not get delayed by group starvation.
1620 */
1621 if (!weight)
1622 weight = NICE_0_LOAD;
1623
1624 sum_weight += weight;
1625 shares += tg->cfs_rq[i]->shares;
1626 }
1627
1628 if (!rq_weight)
1629 rq_weight = sum_weight;
1630
1631 if ((!shares && rq_weight) || shares > tg->shares)
1632 shares = tg->shares;
1633
1634 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1635 shares = tg->shares;
1636
1637 for_each_cpu(i, sched_domain_span(sd))
1638 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1639
1640 local_irq_restore(flags);
1641
1642 return 0;
1643}
1644
1645/* 1548/*
1646 * Compute the cpu's hierarchical load factor for each task group. 1549 * Compute the cpu's hierarchical load factor for each task group.
1647 * This needs to be done in a top-down fashion because the load of a child 1550 * This needs to be done in a top-down fashion because the load of a child
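For reference, the deleted update_group_shares_cpu()/tg_shares_up() pair implemented the formula quoted in the removed comment: shares_i = \Sum_j shares_j * rq_weight_i / \Sum_j rq_weight_j, clamped to [MIN_SHARES, MAX_SHARES]. A standalone sketch of that arithmetic; the clamp bounds below are illustrative, not the kernel's constants:

#include <stdio.h>

#define DEMO_MIN_SHARES 2UL
#define DEMO_MAX_SHARES (1UL << 18)

static unsigned long group_shares_cpu(unsigned long sd_shares,
				      unsigned long rq_weight_cpu,
				      unsigned long sd_rq_weight)
{
	/* shares_i = \Sum_j shares_j * rq_weight_i / \Sum_j rq_weight_j */
	unsigned long shares = sd_shares * rq_weight_cpu / sd_rq_weight;

	if (shares < DEMO_MIN_SHARES)
		shares = DEMO_MIN_SHARES;
	if (shares > DEMO_MAX_SHARES)
		shares = DEMO_MAX_SHARES;
	return shares;
}

int main(void)
{
	/* group has 1024 shares; cpu0 carries 3072 of 4096 total runqueue weight */
	printf("cpu0 gets %lu shares\n", group_shares_cpu(1024, 3072, 4096));  /* 768 */
	printf("cpu1 gets %lu shares\n", group_shares_cpu(1024, 1024, 4096));  /* 256 */
	return 0;
}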
@@ -1656,7 +1559,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1656 load = cpu_rq(cpu)->load.weight; 1559 load = cpu_rq(cpu)->load.weight;
1657 } else { 1560 } else {
1658 load = tg->parent->cfs_rq[cpu]->h_load; 1561 load = tg->parent->cfs_rq[cpu]->h_load;
1659 load *= tg->cfs_rq[cpu]->shares; 1562 load *= tg->se[cpu]->load.weight;
1660 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1563 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1661 } 1564 }
1662 1565
@@ -1665,34 +1568,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1665 return 0; 1568 return 0;
1666} 1569}
1667 1570
1668static void update_shares(struct sched_domain *sd)
1669{
1670 s64 elapsed;
1671 u64 now;
1672
1673 if (root_task_group_empty())
1674 return;
1675
1676 now = local_clock();
1677 elapsed = now - sd->last_update;
1678
1679 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1680 sd->last_update = now;
1681 walk_tg_tree(tg_nop, tg_shares_up, sd);
1682 }
1683}
1684
1685static void update_h_load(long cpu) 1571static void update_h_load(long cpu)
1686{ 1572{
1687 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1573 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1688} 1574}
1689 1575
1690#else
1691
1692static inline void update_shares(struct sched_domain *sd)
1693{
1694}
1695
1696#endif 1576#endif
1697 1577
1698#ifdef CONFIG_PREEMPT 1578#ifdef CONFIG_PREEMPT
@@ -1814,15 +1694,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1814 1694
1815#endif 1695#endif
1816 1696
1817#ifdef CONFIG_FAIR_GROUP_SCHED
1818static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1819{
1820#ifdef CONFIG_SMP
1821 cfs_rq->shares = shares;
1822#endif
1823}
1824#endif
1825
1826static void calc_load_account_idle(struct rq *this_rq); 1697static void calc_load_account_idle(struct rq *this_rq);
1827static void update_sysctl(void); 1698static void update_sysctl(void);
1828static int get_update_sysctl_factor(void); 1699static int get_update_sysctl_factor(void);
@@ -2006,6 +1877,7 @@ static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
2006#include "sched_idletask.c" 1877#include "sched_idletask.c"
2007#include "sched_fair.c" 1878#include "sched_fair.c"
2008#include "sched_rt.c" 1879#include "sched_rt.c"
1880#include "sched_autogroup.c"
2009#include "sched_stoptask.c" 1881#include "sched_stoptask.c"
2010#ifdef CONFIG_SCHED_DEBUG 1882#ifdef CONFIG_SCHED_DEBUG
2011# include "sched_debug.c" 1883# include "sched_debug.c"
@@ -2198,10 +2070,8 @@ static int migration_cpu_stop(void *data);
2198 * The task's runqueue lock must be held. 2070 * The task's runqueue lock must be held.
2199 * Returns true if you have to wait for migration thread. 2071 * Returns true if you have to wait for migration thread.
2200 */ 2072 */
2201static bool migrate_task(struct task_struct *p, int dest_cpu) 2073static bool migrate_task(struct task_struct *p, struct rq *rq)
2202{ 2074{
2203 struct rq *rq = task_rq(p);
2204
2205 /* 2075 /*
2206 * If the task is not on a runqueue (and not running), then 2076 * If the task is not on a runqueue (and not running), then
2207 * the next wake-up will properly place the task. 2077 * the next wake-up will properly place the task.
@@ -2381,18 +2251,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2381 return dest_cpu; 2251 return dest_cpu;
2382 2252
2383 /* No more Mr. Nice Guy. */ 2253 /* No more Mr. Nice Guy. */
2384 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2254 dest_cpu = cpuset_cpus_allowed_fallback(p);
2385 dest_cpu = cpuset_cpus_allowed_fallback(p); 2255 /*
2386 /* 2256 * Don't tell them about moving exiting tasks or
2387 * Don't tell them about moving exiting tasks or 2257 * kernel threads (both mm NULL), since they never
2388 * kernel threads (both mm NULL), since they never 2258 * leave kernel.
2389 * leave kernel. 2259 */
2390 */ 2260 if (p->mm && printk_ratelimit()) {
2391 if (p->mm && printk_ratelimit()) { 2261 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2392 printk(KERN_INFO "process %d (%s) no " 2262 task_pid_nr(p), p->comm, cpu);
2393 "longer affine to cpu%d\n",
2394 task_pid_nr(p), p->comm, cpu);
2395 }
2396 } 2263 }
2397 2264
2398 return dest_cpu; 2265 return dest_cpu;
@@ -2728,7 +2595,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2728 /* Want to start with kernel preemption disabled. */ 2595 /* Want to start with kernel preemption disabled. */
2729 task_thread_info(p)->preempt_count = 1; 2596 task_thread_info(p)->preempt_count = 1;
2730#endif 2597#endif
2598#ifdef CONFIG_SMP
2731 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2599 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2600#endif
2732 2601
2733 put_cpu(); 2602 put_cpu();
2734} 2603}
@@ -3364,7 +3233,7 @@ void sched_exec(void)
3364 * select_task_rq() can race against ->cpus_allowed 3233 * select_task_rq() can race against ->cpus_allowed
3365 */ 3234 */
3366 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3235 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3367 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3236 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3368 struct migration_arg arg = { p, dest_cpu }; 3237 struct migration_arg arg = { p, dest_cpu };
3369 3238
3370 task_rq_unlock(rq, &flags); 3239 task_rq_unlock(rq, &flags);
@@ -4029,7 +3898,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4029 if (task_thread_info(rq->curr) != owner || need_resched()) 3898 if (task_thread_info(rq->curr) != owner || need_resched())
4030 return 0; 3899 return 0;
4031 3900
4032 cpu_relax(); 3901 arch_mutex_cpu_relax();
4033 } 3902 }
4034 3903
4035 return 1; 3904 return 1;
@@ -4716,7 +4585,7 @@ static bool check_same_owner(struct task_struct *p)
4716} 4585}
4717 4586
4718static int __sched_setscheduler(struct task_struct *p, int policy, 4587static int __sched_setscheduler(struct task_struct *p, int policy,
4719 struct sched_param *param, bool user) 4588 const struct sched_param *param, bool user)
4720{ 4589{
4721 int retval, oldprio, oldpolicy = -1, on_rq, running; 4590 int retval, oldprio, oldpolicy = -1, on_rq, running;
4722 unsigned long flags; 4591 unsigned long flags;
@@ -4871,7 +4740,7 @@ recheck:
4871 * NOTE that the task may be already dead. 4740 * NOTE that the task may be already dead.
4872 */ 4741 */
4873int sched_setscheduler(struct task_struct *p, int policy, 4742int sched_setscheduler(struct task_struct *p, int policy,
4874 struct sched_param *param) 4743 const struct sched_param *param)
4875{ 4744{
4876 return __sched_setscheduler(p, policy, param, true); 4745 return __sched_setscheduler(p, policy, param, true);
4877} 4746}
@@ -4889,7 +4758,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4889 * but our caller might not have that capability. 4758 * but our caller might not have that capability.
4890 */ 4759 */
4891int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4760int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4892 struct sched_param *param) 4761 const struct sched_param *param)
4893{ 4762{
4894 return __sched_setscheduler(p, policy, param, false); 4763 return __sched_setscheduler(p, policy, param, false);
4895} 4764}
@@ -5405,7 +5274,7 @@ void sched_show_task(struct task_struct *p)
5405 unsigned state; 5274 unsigned state;
5406 5275
5407 state = p->state ? __ffs(p->state) + 1 : 0; 5276 state = p->state ? __ffs(p->state) + 1 : 0;
5408 printk(KERN_INFO "%-13.13s %c", p->comm, 5277 printk(KERN_INFO "%-15.15s %c", p->comm,
5409 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5278 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5410#if BITS_PER_LONG == 32 5279#if BITS_PER_LONG == 32
5411 if (state == TASK_RUNNING) 5280 if (state == TASK_RUNNING)
@@ -5569,7 +5438,6 @@ static void update_sysctl(void)
5569 SET_SYSCTL(sched_min_granularity); 5438 SET_SYSCTL(sched_min_granularity);
5570 SET_SYSCTL(sched_latency); 5439 SET_SYSCTL(sched_latency);
5571 SET_SYSCTL(sched_wakeup_granularity); 5440 SET_SYSCTL(sched_wakeup_granularity);
5572 SET_SYSCTL(sched_shares_ratelimit);
5573#undef SET_SYSCTL 5441#undef SET_SYSCTL
5574} 5442}
5575 5443
@@ -5645,7 +5513,7 @@ again:
5645 goto out; 5513 goto out;
5646 5514
5647 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5515 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5648 if (migrate_task(p, dest_cpu)) { 5516 if (migrate_task(p, rq)) {
5649 struct migration_arg arg = { p, dest_cpu }; 5517 struct migration_arg arg = { p, dest_cpu };
5650 /* Need help from migration thread: drop lock and wait. */ 5518 /* Need help from migration thread: drop lock and wait. */
5651 task_rq_unlock(rq, &flags); 5519 task_rq_unlock(rq, &flags);
@@ -5727,29 +5595,20 @@ static int migration_cpu_stop(void *data)
5727} 5595}
5728 5596
5729#ifdef CONFIG_HOTPLUG_CPU 5597#ifdef CONFIG_HOTPLUG_CPU
5598
5730/* 5599/*
5731 * Figure out where task on dead CPU should go, use force if necessary. 5600 * Ensures that the idle task is using init_mm right before its cpu goes
5601 * offline.
5732 */ 5602 */
5733void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5603void idle_task_exit(void)
5734{ 5604{
5735 struct rq *rq = cpu_rq(dead_cpu); 5605 struct mm_struct *mm = current->active_mm;
5736 int needs_cpu, uninitialized_var(dest_cpu);
5737 unsigned long flags;
5738 5606
5739 local_irq_save(flags); 5607 BUG_ON(cpu_online(smp_processor_id()));
5740 5608
5741 raw_spin_lock(&rq->lock); 5609 if (mm != &init_mm)
5742 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5610 switch_mm(mm, &init_mm, current);
5743 if (needs_cpu) 5611 mmdrop(mm);
5744 dest_cpu = select_fallback_rq(dead_cpu, p);
5745 raw_spin_unlock(&rq->lock);
5746 /*
5747 * It can only fail if we race with set_cpus_allowed(),
5748 * in the racer should migrate the task anyway.
5749 */
5750 if (needs_cpu)
5751 __migrate_task(p, dead_cpu, dest_cpu);
5752 local_irq_restore(flags);
5753} 5612}
5754 5613
5755/* 5614/*
@@ -5762,128 +5621,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5762static void migrate_nr_uninterruptible(struct rq *rq_src) 5621static void migrate_nr_uninterruptible(struct rq *rq_src)
5763{ 5622{
5764 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5623 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5765 unsigned long flags;
5766 5624
5767 local_irq_save(flags);
5768 double_rq_lock(rq_src, rq_dest);
5769 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5625 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5770 rq_src->nr_uninterruptible = 0; 5626 rq_src->nr_uninterruptible = 0;
5771 double_rq_unlock(rq_src, rq_dest);
5772 local_irq_restore(flags);
5773}
5774
5775/* Run through task list and migrate tasks from the dead cpu. */
5776static void migrate_live_tasks(int src_cpu)
5777{
5778 struct task_struct *p, *t;
5779
5780 read_lock(&tasklist_lock);
5781
5782 do_each_thread(t, p) {
5783 if (p == current)
5784 continue;
5785
5786 if (task_cpu(p) == src_cpu)
5787 move_task_off_dead_cpu(src_cpu, p);
5788 } while_each_thread(t, p);
5789
5790 read_unlock(&tasklist_lock);
5791} 5627}
5792 5628
5793/* 5629/*
5794 * Schedules idle task to be the next runnable task on current CPU. 5630 * remove the tasks which were accounted by rq from calc_load_tasks.
5795 * It does so by boosting its priority to highest possible.
5796 * Used by CPU offline code.
5797 */ 5631 */
5798void sched_idle_next(void) 5632static void calc_global_load_remove(struct rq *rq)
5799{ 5633{
5800 int this_cpu = smp_processor_id(); 5634 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5801 struct rq *rq = cpu_rq(this_cpu); 5635 rq->calc_load_active = 0;
5802 struct task_struct *p = rq->idle;
5803 unsigned long flags;
5804
5805 /* cpu has to be offline */
5806 BUG_ON(cpu_online(this_cpu));
5807
5808 /*
5809 * Strictly not necessary since rest of the CPUs are stopped by now
5810 * and interrupts disabled on the current cpu.
5811 */
5812 raw_spin_lock_irqsave(&rq->lock, flags);
5813
5814 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5815
5816 activate_task(rq, p, 0);
5817
5818 raw_spin_unlock_irqrestore(&rq->lock, flags);
5819} 5636}
5820 5637
5821/* 5638/*
5822 * Ensures that the idle task is using init_mm right before its cpu goes 5639 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5823 * offline. 5640 * try_to_wake_up()->select_task_rq().
5641 *
5642 * Called with rq->lock held even though we'er in stop_machine() and
5643 * there's no concurrency possible, we hold the required locks anyway
5644 * because of lock validation efforts.
5824 */ 5645 */
5825void idle_task_exit(void) 5646static void migrate_tasks(unsigned int dead_cpu)
5826{
5827 struct mm_struct *mm = current->active_mm;
5828
5829 BUG_ON(cpu_online(smp_processor_id()));
5830
5831 if (mm != &init_mm)
5832 switch_mm(mm, &init_mm, current);
5833 mmdrop(mm);
5834}
5835
5836/* called under rq->lock with disabled interrupts */
5837static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5838{ 5647{
5839 struct rq *rq = cpu_rq(dead_cpu); 5648 struct rq *rq = cpu_rq(dead_cpu);
5840 5649 struct task_struct *next, *stop = rq->stop;
5841 /* Must be exiting, otherwise would be on tasklist. */ 5650 int dest_cpu;
5842 BUG_ON(!p->exit_state);
5843
5844 /* Cannot have done final schedule yet: would have vanished. */
5845 BUG_ON(p->state == TASK_DEAD);
5846
5847 get_task_struct(p);
5848 5651
5849 /* 5652 /*
5850 * Drop lock around migration; if someone else moves it, 5653 * Fudge the rq selection such that the below task selection loop
5851 * that's OK. No task can be added to this CPU, so iteration is 5654 * doesn't get stuck on the currently eligible stop task.
5852 * fine. 5655 *
5656 * We're currently inside stop_machine() and the rq is either stuck
5657 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5658 * either way we should never end up calling schedule() until we're
5659 * done here.
5853 */ 5660 */
5854 raw_spin_unlock_irq(&rq->lock); 5661 rq->stop = NULL;
5855 move_task_off_dead_cpu(dead_cpu, p);
5856 raw_spin_lock_irq(&rq->lock);
5857
5858 put_task_struct(p);
5859}
5860
5861/* release_task() removes task from tasklist, so we won't find dead tasks. */
5862static void migrate_dead_tasks(unsigned int dead_cpu)
5863{
5864 struct rq *rq = cpu_rq(dead_cpu);
5865 struct task_struct *next;
5866 5662
5867 for ( ; ; ) { 5663 for ( ; ; ) {
5868 if (!rq->nr_running) 5664 /*
5665 * There's this thread running, bail when that's the only
5666 * remaining thread.
5667 */
5668 if (rq->nr_running == 1)
5869 break; 5669 break;
5670
5870 next = pick_next_task(rq); 5671 next = pick_next_task(rq);
5871 if (!next) 5672 BUG_ON(!next);
5872 break;
5873 next->sched_class->put_prev_task(rq, next); 5673 next->sched_class->put_prev_task(rq, next);
5874 migrate_dead(dead_cpu, next);
5875 5674
5675 /* Find suitable destination for @next, with force if needed. */
5676 dest_cpu = select_fallback_rq(dead_cpu, next);
5677 raw_spin_unlock(&rq->lock);
5678
5679 __migrate_task(next, dead_cpu, dest_cpu);
5680
5681 raw_spin_lock(&rq->lock);
5876 } 5682 }
5877}
5878 5683
5879/* 5684 rq->stop = stop;
5880 * remove the tasks which were accounted by rq from calc_load_tasks.
5881 */
5882static void calc_global_load_remove(struct rq *rq)
5883{
5884 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5885 rq->calc_load_active = 0;
5886} 5685}
5686
5887#endif /* CONFIG_HOTPLUG_CPU */ 5687#endif /* CONFIG_HOTPLUG_CPU */
5888 5688
5889#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5689#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6093,15 +5893,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6093 unsigned long flags; 5893 unsigned long flags;
6094 struct rq *rq = cpu_rq(cpu); 5894 struct rq *rq = cpu_rq(cpu);
6095 5895
6096 switch (action) { 5896 switch (action & ~CPU_TASKS_FROZEN) {
6097 5897
6098 case CPU_UP_PREPARE: 5898 case CPU_UP_PREPARE:
6099 case CPU_UP_PREPARE_FROZEN:
6100 rq->calc_load_update = calc_load_update; 5899 rq->calc_load_update = calc_load_update;
6101 break; 5900 break;
6102 5901
6103 case CPU_ONLINE: 5902 case CPU_ONLINE:
6104 case CPU_ONLINE_FROZEN:
6105 /* Update our root-domain */ 5903 /* Update our root-domain */
6106 raw_spin_lock_irqsave(&rq->lock, flags); 5904 raw_spin_lock_irqsave(&rq->lock, flags);
6107 if (rq->rd) { 5905 if (rq->rd) {
@@ -6113,30 +5911,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6113 break; 5911 break;
6114 5912
6115#ifdef CONFIG_HOTPLUG_CPU 5913#ifdef CONFIG_HOTPLUG_CPU
6116 case CPU_DEAD:
6117 case CPU_DEAD_FROZEN:
6118 migrate_live_tasks(cpu);
6119 /* Idle task back to normal (off runqueue, low prio) */
6120 raw_spin_lock_irq(&rq->lock);
6121 deactivate_task(rq, rq->idle, 0);
6122 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6123 rq->idle->sched_class = &idle_sched_class;
6124 migrate_dead_tasks(cpu);
6125 raw_spin_unlock_irq(&rq->lock);
6126 migrate_nr_uninterruptible(rq);
6127 BUG_ON(rq->nr_running != 0);
6128 calc_global_load_remove(rq);
6129 break;
6130
6131 case CPU_DYING: 5914 case CPU_DYING:
6132 case CPU_DYING_FROZEN:
6133 /* Update our root-domain */ 5915 /* Update our root-domain */
6134 raw_spin_lock_irqsave(&rq->lock, flags); 5916 raw_spin_lock_irqsave(&rq->lock, flags);
6135 if (rq->rd) { 5917 if (rq->rd) {
6136 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5918 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6137 set_rq_offline(rq); 5919 set_rq_offline(rq);
6138 } 5920 }
5921 migrate_tasks(cpu);
5922 BUG_ON(rq->nr_running != 1); /* the migration thread */
6139 raw_spin_unlock_irqrestore(&rq->lock, flags); 5923 raw_spin_unlock_irqrestore(&rq->lock, flags);
5924
5925 migrate_nr_uninterruptible(rq);
5926 calc_global_load_remove(rq);
6140 break; 5927 break;
6141#endif 5928#endif
6142 } 5929 }
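Masking CPU_TASKS_FROZEN out of the action is what lets the *_FROZEN case labels disappear: a frozen notification is the base action with one extra flag bit set, so one case now handles both. A standalone sketch of that masking; the bit values are illustrative, not the kernel's definitions:

#include <stdio.h>

#define DEMO_CPU_ONLINE        0x0002
#define DEMO_CPU_TASKS_FROZEN  0x0010
#define DEMO_CPU_ONLINE_FROZEN (DEMO_CPU_ONLINE | DEMO_CPU_TASKS_FROZEN)

static const char *describe(unsigned long action)
{
	switch (action & ~DEMO_CPU_TASKS_FROZEN) {
	case DEMO_CPU_ONLINE:
		return "online";          /* hit for both ONLINE and ONLINE_FROZEN */
	default:
		return "other";
	}
}

int main(void)
{
	printf("%s\n", describe(DEMO_CPU_ONLINE));         /* online */
	printf("%s\n", describe(DEMO_CPU_ONLINE_FROZEN));  /* online */
	return 0;
}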
@@ -7867,15 +7654,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7867 7654
7868#ifdef CONFIG_FAIR_GROUP_SCHED 7655#ifdef CONFIG_FAIR_GROUP_SCHED
7869static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7656static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7870 struct sched_entity *se, int cpu, int add, 7657 struct sched_entity *se, int cpu,
7871 struct sched_entity *parent) 7658 struct sched_entity *parent)
7872{ 7659{
7873 struct rq *rq = cpu_rq(cpu); 7660 struct rq *rq = cpu_rq(cpu);
7874 tg->cfs_rq[cpu] = cfs_rq; 7661 tg->cfs_rq[cpu] = cfs_rq;
7875 init_cfs_rq(cfs_rq, rq); 7662 init_cfs_rq(cfs_rq, rq);
7876 cfs_rq->tg = tg; 7663 cfs_rq->tg = tg;
7877 if (add)
7878 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7879 7664
7880 tg->se[cpu] = se; 7665 tg->se[cpu] = se;
7881 /* se could be NULL for init_task_group */ 7666 /* se could be NULL for init_task_group */
@@ -7888,15 +7673,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7888 se->cfs_rq = parent->my_q; 7673 se->cfs_rq = parent->my_q;
7889 7674
7890 se->my_q = cfs_rq; 7675 se->my_q = cfs_rq;
7891 se->load.weight = tg->shares; 7676 update_load_set(&se->load, 0);
7892 se->load.inv_weight = 0;
7893 se->parent = parent; 7677 se->parent = parent;
7894} 7678}
7895#endif 7679#endif
7896 7680
7897#ifdef CONFIG_RT_GROUP_SCHED 7681#ifdef CONFIG_RT_GROUP_SCHED
7898static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7682static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7899 struct sched_rt_entity *rt_se, int cpu, int add, 7683 struct sched_rt_entity *rt_se, int cpu,
7900 struct sched_rt_entity *parent) 7684 struct sched_rt_entity *parent)
7901{ 7685{
7902 struct rq *rq = cpu_rq(cpu); 7686 struct rq *rq = cpu_rq(cpu);
@@ -7905,8 +7689,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7905 init_rt_rq(rt_rq, rq); 7689 init_rt_rq(rt_rq, rq);
7906 rt_rq->tg = tg; 7690 rt_rq->tg = tg;
7907 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7691 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7908 if (add)
7909 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7910 7692
7911 tg->rt_se[cpu] = rt_se; 7693 tg->rt_se[cpu] = rt_se;
7912 if (!rt_se) 7694 if (!rt_se)
@@ -7979,13 +7761,9 @@ void __init sched_init(void)
7979#ifdef CONFIG_CGROUP_SCHED 7761#ifdef CONFIG_CGROUP_SCHED
7980 list_add(&init_task_group.list, &task_groups); 7762 list_add(&init_task_group.list, &task_groups);
7981 INIT_LIST_HEAD(&init_task_group.children); 7763 INIT_LIST_HEAD(&init_task_group.children);
7982 7764 autogroup_init(&init_task);
7983#endif /* CONFIG_CGROUP_SCHED */ 7765#endif /* CONFIG_CGROUP_SCHED */
7984 7766
7985#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7986 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7987 __alignof__(unsigned long));
7988#endif
7989 for_each_possible_cpu(i) { 7767 for_each_possible_cpu(i) {
7990 struct rq *rq; 7768 struct rq *rq;
7991 7769
@@ -8019,7 +7797,7 @@ void __init sched_init(void)
8019 * We achieve this by letting init_task_group's tasks sit 7797 * We achieve this by letting init_task_group's tasks sit
8020 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7798 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8021 */ 7799 */
8022 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7800 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
8023#endif 7801#endif
8024#endif /* CONFIG_FAIR_GROUP_SCHED */ 7802#endif /* CONFIG_FAIR_GROUP_SCHED */
8025 7803
@@ -8027,7 +7805,7 @@ void __init sched_init(void)
8027#ifdef CONFIG_RT_GROUP_SCHED 7805#ifdef CONFIG_RT_GROUP_SCHED
8028 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7806 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8029#ifdef CONFIG_CGROUP_SCHED 7807#ifdef CONFIG_CGROUP_SCHED
8030 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7808 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
8031#endif 7809#endif
8032#endif 7810#endif
8033 7811
@@ -8303,7 +8081,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8303 if (!se) 8081 if (!se)
8304 goto err_free_rq; 8082 goto err_free_rq;
8305 8083
8306 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8084 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8307 } 8085 }
8308 8086
8309 return 1; 8087 return 1;
@@ -8314,15 +8092,21 @@ err:
8314 return 0; 8092 return 0;
8315} 8093}
8316 8094
8317static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8318{
8319 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8320 &cpu_rq(cpu)->leaf_cfs_rq_list);
8321}
8322
8323static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8095static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8324{ 8096{
8325 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8097 struct rq *rq = cpu_rq(cpu);
8098 unsigned long flags;
8099
8100 /*
8101 * Only empty task groups can be destroyed; so we can speculatively
8102 * check on_list without danger of it being re-added.
8103 */
8104 if (!tg->cfs_rq[cpu]->on_list)
8105 return;
8106
8107 raw_spin_lock_irqsave(&rq->lock, flags);
8108 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8109 raw_spin_unlock_irqrestore(&rq->lock, flags);
8326} 8110}
8327#else /* !CONFG_FAIR_GROUP_SCHED */ 8111#else /* !CONFG_FAIR_GROUP_SCHED */
8328static inline void free_fair_sched_group(struct task_group *tg) 8112static inline void free_fair_sched_group(struct task_group *tg)
@@ -8335,10 +8119,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8335 return 1; 8119 return 1;
8336} 8120}
8337 8121
8338static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8339{
8340}
8341
8342static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8122static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8343{ 8123{
8344} 8124}
@@ -8393,7 +8173,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8393 if (!rt_se) 8173 if (!rt_se)
8394 goto err_free_rq; 8174 goto err_free_rq;
8395 8175
8396 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8176 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8397 } 8177 }
8398 8178
8399 return 1; 8179 return 1;
@@ -8403,17 +8183,6 @@ err_free_rq:
8403err: 8183err:
8404 return 0; 8184 return 0;
8405} 8185}
8406
8407static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8408{
8409 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8410 &cpu_rq(cpu)->leaf_rt_rq_list);
8411}
8412
8413static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8414{
8415 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8416}
8417#else /* !CONFIG_RT_GROUP_SCHED */ 8186#else /* !CONFIG_RT_GROUP_SCHED */
8418static inline void free_rt_sched_group(struct task_group *tg) 8187static inline void free_rt_sched_group(struct task_group *tg)
8419{ 8188{
@@ -8424,14 +8193,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8424{ 8193{
8425 return 1; 8194 return 1;
8426} 8195}
8427
8428static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8429{
8430}
8431
8432static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8433{
8434}
8435#endif /* CONFIG_RT_GROUP_SCHED */ 8196#endif /* CONFIG_RT_GROUP_SCHED */
8436 8197
8437#ifdef CONFIG_CGROUP_SCHED 8198#ifdef CONFIG_CGROUP_SCHED
@@ -8447,7 +8208,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8447{ 8208{
8448 struct task_group *tg; 8209 struct task_group *tg;
8449 unsigned long flags; 8210 unsigned long flags;
8450 int i;
8451 8211
8452 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8212 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8453 if (!tg) 8213 if (!tg)
@@ -8460,10 +8220,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8460 goto err; 8220 goto err;
8461 8221
8462 spin_lock_irqsave(&task_group_lock, flags); 8222 spin_lock_irqsave(&task_group_lock, flags);
8463 for_each_possible_cpu(i) {
8464 register_fair_sched_group(tg, i);
8465 register_rt_sched_group(tg, i);
8466 }
8467 list_add_rcu(&tg->list, &task_groups); 8223 list_add_rcu(&tg->list, &task_groups);
8468 8224
8469 WARN_ON(!parent); /* root should already exist */ 8225 WARN_ON(!parent); /* root should already exist */
@@ -8493,11 +8249,11 @@ void sched_destroy_group(struct task_group *tg)
8493 unsigned long flags; 8249 unsigned long flags;
8494 int i; 8250 int i;
8495 8251
8496 spin_lock_irqsave(&task_group_lock, flags); 8252 /* end participation in shares distribution */
8497 for_each_possible_cpu(i) { 8253 for_each_possible_cpu(i)
8498 unregister_fair_sched_group(tg, i); 8254 unregister_fair_sched_group(tg, i);
8499 unregister_rt_sched_group(tg, i); 8255
8500 } 8256 spin_lock_irqsave(&task_group_lock, flags);
8501 list_del_rcu(&tg->list); 8257 list_del_rcu(&tg->list);
8502 list_del_rcu(&tg->siblings); 8258 list_del_rcu(&tg->siblings);
8503 spin_unlock_irqrestore(&task_group_lock, flags); 8259 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8544,33 +8300,6 @@ void sched_move_task(struct task_struct *tsk)
8544#endif /* CONFIG_CGROUP_SCHED */ 8300#endif /* CONFIG_CGROUP_SCHED */
8545 8301
8546#ifdef CONFIG_FAIR_GROUP_SCHED 8302#ifdef CONFIG_FAIR_GROUP_SCHED
8547static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8548{
8549 struct cfs_rq *cfs_rq = se->cfs_rq;
8550 int on_rq;
8551
8552 on_rq = se->on_rq;
8553 if (on_rq)
8554 dequeue_entity(cfs_rq, se, 0);
8555
8556 se->load.weight = shares;
8557 se->load.inv_weight = 0;
8558
8559 if (on_rq)
8560 enqueue_entity(cfs_rq, se, 0);
8561}
8562
8563static void set_se_shares(struct sched_entity *se, unsigned long shares)
8564{
8565 struct cfs_rq *cfs_rq = se->cfs_rq;
8566 struct rq *rq = cfs_rq->rq;
8567 unsigned long flags;
8568
8569 raw_spin_lock_irqsave(&rq->lock, flags);
8570 __set_se_shares(se, shares);
8571 raw_spin_unlock_irqrestore(&rq->lock, flags);
8572}
8573
8574static DEFINE_MUTEX(shares_mutex); 8303static DEFINE_MUTEX(shares_mutex);
8575 8304
8576int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8305int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8593,37 +8322,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8593 if (tg->shares == shares) 8322 if (tg->shares == shares)
8594 goto done; 8323 goto done;
8595 8324
8596 spin_lock_irqsave(&task_group_lock, flags);
8597 for_each_possible_cpu(i)
8598 unregister_fair_sched_group(tg, i);
8599 list_del_rcu(&tg->siblings);
8600 spin_unlock_irqrestore(&task_group_lock, flags);
8601
8602 /* wait for any ongoing reference to this group to finish */
8603 synchronize_sched();
8604
8605 /*
8606 * Now we are free to modify the group's share on each cpu
8607 * w/o tripping rebalance_share or load_balance_fair.
8608 */
8609 tg->shares = shares; 8325 tg->shares = shares;
8610 for_each_possible_cpu(i) { 8326 for_each_possible_cpu(i) {
8611 /* 8327 struct rq *rq = cpu_rq(i);
8612 * force a rebalance 8328 struct sched_entity *se;
8613 */ 8329
8614 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8330 se = tg->se[i];
8615 set_se_shares(tg->se[i], shares); 8331 /* Propagate contribution to hierarchy */
8332 raw_spin_lock_irqsave(&rq->lock, flags);
8333 for_each_sched_entity(se)
8334 update_cfs_shares(group_cfs_rq(se), 0);
8335 raw_spin_unlock_irqrestore(&rq->lock, flags);
8616 } 8336 }
8617 8337
8618 /*
8619 * Enable load balance activity on this group, by inserting it back on
8620 * each cpu's rq->leaf_cfs_rq_list.
8621 */
8622 spin_lock_irqsave(&task_group_lock, flags);
8623 for_each_possible_cpu(i)
8624 register_fair_sched_group(tg, i);
8625 list_add_rcu(&tg->siblings, &tg->parent->children);
8626 spin_unlock_irqrestore(&task_group_lock, flags);
8627done: 8338done:
8628 mutex_unlock(&shares_mutex); 8339 mutex_unlock(&shares_mutex);
8629 return 0; 8340 return 0;
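In the new sched_group_set_shares() there is no domain-wide recomputation: tg->shares is a single group-wide knob, and each cpu's group entity weight follows from that cpu's fraction of the group's tracked load via update_cfs_shares(). That helper lives in sched_fair.c, which is truncated in this excerpt, so the sketch below approximates the intent rather than the exact kernel arithmetic:

#include <stdio.h>

static unsigned long entity_weight(unsigned long tg_shares,
				   unsigned long cfs_rq_load,     /* this cpu's contribution */
				   unsigned long tg_load_weight)  /* sum over all cpus */
{
	if (!tg_load_weight)
		return tg_shares;          /* idle group: fall back to the full shares */
	return tg_shares * cfs_rq_load / tg_load_weight;
}

int main(void)
{
	/* a group with 2048 shares, 3/4 of whose load sits on cpu0 */
	printf("cpu0 entity weight: %lu\n", entity_weight(2048, 768, 1024));  /* 1536 */
	printf("cpu1 entity weight: %lu\n", entity_weight(2048, 256, 1024));  /* 512 */
	return 0;
}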
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
new file mode 100644
index 000000000000..57a7ac286a02
--- /dev/null
+++ b/kernel/sched_autogroup.c
@@ -0,0 +1,229 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3#include <linux/proc_fs.h>
4#include <linux/seq_file.h>
5#include <linux/kallsyms.h>
6#include <linux/utsname.h>
7
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr;
11
12static void autogroup_init(struct task_struct *init_task)
13{
14 autogroup_default.tg = &init_task_group;
15 init_task_group.autogroup = &autogroup_default;
16 kref_init(&autogroup_default.kref);
17 init_rwsem(&autogroup_default.lock);
18 init_task->signal->autogroup = &autogroup_default;
19}
20
21static inline void autogroup_free(struct task_group *tg)
22{
23 kfree(tg->autogroup);
24}
25
26static inline void autogroup_destroy(struct kref *kref)
27{
28 struct autogroup *ag = container_of(kref, struct autogroup, kref);
29
30 sched_destroy_group(ag->tg);
31}
32
33static inline void autogroup_kref_put(struct autogroup *ag)
34{
35 kref_put(&ag->kref, autogroup_destroy);
36}
37
38static inline struct autogroup *autogroup_kref_get(struct autogroup *ag)
39{
40 kref_get(&ag->kref);
41 return ag;
42}
43
44static inline struct autogroup *autogroup_create(void)
45{
46 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
47 struct task_group *tg;
48
49 if (!ag)
50 goto out_fail;
51
52 tg = sched_create_group(&init_task_group);
53
54 if (IS_ERR(tg))
55 goto out_free;
56
57 kref_init(&ag->kref);
58 init_rwsem(&ag->lock);
59 ag->id = atomic_inc_return(&autogroup_seq_nr);
60 ag->tg = tg;
61 tg->autogroup = ag;
62
63 return ag;
64
65out_free:
66 kfree(ag);
67out_fail:
68 if (printk_ratelimit()) {
69 printk(KERN_WARNING "autogroup_create: %s failure.\n",
70 ag ? "sched_create_group()" : "kmalloc()");
71 }
72
73 return autogroup_kref_get(&autogroup_default);
74}
75
76static inline bool
77task_wants_autogroup(struct task_struct *p, struct task_group *tg)
78{
79 if (tg != &root_task_group)
80 return false;
81
82 if (p->sched_class != &fair_sched_class)
83 return false;
84
85 /*
86 * We can only assume the task group can't go away on us if
87 * autogroup_move_group() can see us on ->thread_group list.
88 */
89 if (p->flags & PF_EXITING)
90 return false;
91
92 return true;
93}
94
95static inline struct task_group *
96autogroup_task_group(struct task_struct *p, struct task_group *tg)
97{
98 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
99
100 if (enabled && task_wants_autogroup(p, tg))
101 return p->signal->autogroup->tg;
102
103 return tg;
104}
105
106static void
107autogroup_move_group(struct task_struct *p, struct autogroup *ag)
108{
109 struct autogroup *prev;
110 struct task_struct *t;
111 unsigned long flags;
112
113 BUG_ON(!lock_task_sighand(p, &flags));
114
115 prev = p->signal->autogroup;
116 if (prev == ag) {
117 unlock_task_sighand(p, &flags);
118 return;
119 }
120
121 p->signal->autogroup = autogroup_kref_get(ag);
122
123 t = p;
124 do {
125 sched_move_task(t);
126 } while_each_thread(p, t);
127
128 unlock_task_sighand(p, &flags);
129 autogroup_kref_put(prev);
130}
131
132/* Allocates GFP_KERNEL, cannot be called under any spinlock */
133void sched_autogroup_create_attach(struct task_struct *p)
134{
135 struct autogroup *ag = autogroup_create();
136
137 autogroup_move_group(p, ag);
138 /* drop extra refrence added by autogroup_create() */
139 autogroup_kref_put(ag);
140}
141EXPORT_SYMBOL(sched_autogroup_create_attach);
142
143/* Cannot be called under siglock. Currently has no users */
144void sched_autogroup_detach(struct task_struct *p)
145{
146 autogroup_move_group(p, &autogroup_default);
147}
148EXPORT_SYMBOL(sched_autogroup_detach);
149
150void sched_autogroup_fork(struct signal_struct *sig)
151{
152 struct task_struct *p = current;
153
154 spin_lock_irq(&p->sighand->siglock);
155 sig->autogroup = autogroup_kref_get(p->signal->autogroup);
156 spin_unlock_irq(&p->sighand->siglock);
157}
158
159void sched_autogroup_exit(struct signal_struct *sig)
160{
161 autogroup_kref_put(sig->autogroup);
162}
163
164static int __init setup_autogroup(char *str)
165{
166 sysctl_sched_autogroup_enabled = 0;
167
168 return 1;
169}
170
171__setup("noautogroup", setup_autogroup);
172
173#ifdef CONFIG_PROC_FS
174
175/* Called with siglock held. */
176int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice)
177{
178 static unsigned long next = INITIAL_JIFFIES;
179 struct autogroup *ag;
180 int err;
181
182 if (*nice < -20 || *nice > 19)
183 return -EINVAL;
184
185 err = security_task_setnice(current, *nice);
186 if (err)
187 return err;
188
189 if (*nice < 0 && !can_nice(current, *nice))
190 return -EPERM;
191
192 /* this is a heavy operation taking global locks.. */
193 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
194 return -EAGAIN;
195
196 next = HZ / 10 + jiffies;
197 ag = autogroup_kref_get(p->signal->autogroup);
198
199 down_write(&ag->lock);
200 err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]);
201 if (!err)
202 ag->nice = *nice;
203 up_write(&ag->lock);
204
205 autogroup_kref_put(ag);
206
207 return err;
208}
209
210void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
211{
212 struct autogroup *ag = autogroup_kref_get(p->signal->autogroup);
213
214 down_read(&ag->lock);
215 seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
216 up_read(&ag->lock);
217
218 autogroup_kref_put(ag);
219}
220#endif /* CONFIG_PROC_FS */
221
222#ifdef CONFIG_SCHED_DEBUG
223static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
224{
225 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
226}
227#endif /* CONFIG_SCHED_DEBUG */
228
229#endif /* CONFIG_SCHED_AUTOGROUP */
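A userspace sketch of reading the interface that proc_sched_autogroup_show_task() backs. It assumes the /proc/<pid>/autogroup file wired up elsewhere in the series (the fs/proc side is outside this kernel/ diffstat); on a kernel with autogroup enabled it prints a line like "/autogroup-123 nice 0":

#include <stdio.h>

int main(void)
{
	char line[128];
	FILE *f = fopen("/proc/self/autogroup", "r");

	if (!f) {
		perror("open /proc/self/autogroup");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}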
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
new file mode 100644
index 000000000000..5358e241cb20
--- /dev/null
+++ b/kernel/sched_autogroup.h
@@ -0,0 +1,32 @@
1#ifdef CONFIG_SCHED_AUTOGROUP
2
3struct autogroup {
4 struct kref kref;
5 struct task_group *tg;
6 struct rw_semaphore lock;
7 unsigned long id;
8 int nice;
9};
10
11static inline struct task_group *
12autogroup_task_group(struct task_struct *p, struct task_group *tg);
13
14#else /* !CONFIG_SCHED_AUTOGROUP */
15
16static inline void autogroup_init(struct task_struct *init_task) { }
17static inline void autogroup_free(struct task_group *tg) { }
18
19static inline struct task_group *
20autogroup_task_group(struct task_struct *p, struct task_group *tg)
21{
22 return tg;
23}
24
25#ifdef CONFIG_SCHED_DEBUG
26static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
27{
28 return 0;
29}
30#endif
31
32#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 52f1a149bfb1..9d8af0b3fb64 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -79,7 +79,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
79} 79}
80EXPORT_SYMBOL_GPL(sched_clock); 80EXPORT_SYMBOL_GPL(sched_clock);
81 81
82static __read_mostly int sched_clock_running; 82__read_mostly int sched_clock_running;
83 83
84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 84#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
85__read_mostly int sched_clock_stable; 85__read_mostly int sched_clock_stable;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..1dfae3d014b5 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -54,8 +54,7 @@ static unsigned long nsec_low(unsigned long long nsec)
54#define SPLIT_NS(x) nsec_high(x), nsec_low(x) 54#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
55 55
56#ifdef CONFIG_FAIR_GROUP_SCHED 56#ifdef CONFIG_FAIR_GROUP_SCHED
57static void print_cfs_group_stats(struct seq_file *m, int cpu, 57static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
58 struct task_group *tg)
59{ 58{
60 struct sched_entity *se = tg->se[cpu]; 59 struct sched_entity *se = tg->se[cpu];
61 if (!se) 60 if (!se)
@@ -110,16 +109,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 109 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
111#endif 110#endif
112 111
113#ifdef CONFIG_CGROUP_SCHED
114 {
115 char path[64];
116
117 rcu_read_lock();
118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
120 SEQ_printf(m, " %s", path);
121 }
122#endif
123 SEQ_printf(m, "\n"); 112 SEQ_printf(m, "\n");
124} 113}
125 114
@@ -147,19 +136,6 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
147 read_unlock_irqrestore(&tasklist_lock, flags); 136 read_unlock_irqrestore(&tasklist_lock, flags);
148} 137}
149 138
150#if defined(CONFIG_CGROUP_SCHED) && \
151 (defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED))
152static void task_group_path(struct task_group *tg, char *buf, int buflen)
153{
154 /* may be NULL if the underlying cgroup isn't fully-created yet */
155 if (!tg->css.cgroup) {
156 buf[0] = '\0';
157 return;
158 }
159 cgroup_path(tg->css.cgroup, buf, buflen);
160}
161#endif
162
163void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 139void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
164{ 140{
165 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, 141 s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1,
@@ -168,16 +144,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
168 struct sched_entity *last; 144 struct sched_entity *last;
169 unsigned long flags; 145 unsigned long flags;
170 146
171#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
172 char path[128];
173 struct task_group *tg = cfs_rq->tg;
174
175 task_group_path(tg, path, sizeof(path));
176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#else
179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 147 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
180#endif
181 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 148 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
182 SPLIT_NS(cfs_rq->exec_clock)); 149 SPLIT_NS(cfs_rq->exec_clock));
183 150
@@ -202,32 +169,29 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 169 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 170 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 171 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 172 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 173 cfs_rq->nr_spread_over);
174 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
175 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 176#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 177#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 178 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
179 SPLIT_NS(cfs_rq->load_avg));
180 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
181 SPLIT_NS(cfs_rq->load_period));
182 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
183 cfs_rq->load_contribution);
184 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
185 atomic_read(&cfs_rq->tg->load_weight));
213#endif 186#endif
187
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 188 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 189#endif
216} 190}
217 191
218void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) 192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
219{ 193{
220#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
221 char path[128];
222 struct task_group *tg = rt_rq->tg;
223
224 task_group_path(tg, path, sizeof(path));
225
226 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
227#else
228 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); 194 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
229#endif
230
231 195
232#define P(x) \ 196#define P(x) \
233 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) 197 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -243,6 +207,8 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
243#undef P 207#undef P
244} 208}
245 209
210extern __read_mostly int sched_clock_running;
211
246static void print_cpu(struct seq_file *m, int cpu) 212static void print_cpu(struct seq_file *m, int cpu)
247{ 213{
248 struct rq *rq = cpu_rq(cpu); 214 struct rq *rq = cpu_rq(cpu);
@@ -314,21 +280,42 @@ static const char *sched_tunable_scaling_names[] = {
314 280
315static int sched_debug_show(struct seq_file *m, void *v) 281static int sched_debug_show(struct seq_file *m, void *v)
316{ 282{
317 u64 now = ktime_to_ns(ktime_get()); 283 u64 ktime, sched_clk, cpu_clk;
284 unsigned long flags;
318 int cpu; 285 int cpu;
319 286
320 SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", 287 local_irq_save(flags);
288 ktime = ktime_to_ns(ktime_get());
289 sched_clk = sched_clock();
290 cpu_clk = local_clock();
291 local_irq_restore(flags);
292
293 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n",
321 init_utsname()->release, 294 init_utsname()->release,
322 (int)strcspn(init_utsname()->version, " "), 295 (int)strcspn(init_utsname()->version, " "),
323 init_utsname()->version); 296 init_utsname()->version);
324 297
325 SEQ_printf(m, "now at %Lu.%06ld msecs\n", SPLIT_NS(now)); 298#define P(x) \
299 SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x))
300#define PN(x) \
301 SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
302 PN(ktime);
303 PN(sched_clk);
304 PN(cpu_clk);
305 P(jiffies);
306#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
307 P(sched_clock_stable);
308#endif
309#undef PN
310#undef P
311
312 SEQ_printf(m, "\n");
313 SEQ_printf(m, "sysctl_sched\n");
326 314
327#define P(x) \ 315#define P(x) \
328 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) 316 SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x))
329#define PN(x) \ 317#define PN(x) \
330 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 318 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
331 P(jiffies);
332 PN(sysctl_sched_latency); 319 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 320 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 321 PN(sysctl_sched_wakeup_granularity);
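The reworked sched_debug_show() header above samples ktime_get(), sched_clock() and local_clock() back to back with interrupts disabled and prints them through the P()/PN() macros in the usual msec.usec SPLIT_NS layout. A minimal userspace sketch of that formatting convention (helper names here are illustrative, non-negative values only; the real helpers live in kernel/sched_debug.c):

#include <stdio.h>

/* Userspace model of the SPLIT_NS() convention used by /proc/sched_debug:
 * a nanosecond value is printed as <milliseconds>.<6-digit remainder>. */
static unsigned long long nsec_high(unsigned long long nsec)
{
        return nsec / 1000000ULL;
}

static unsigned long nsec_low(unsigned long long nsec)
{
        return (unsigned long)(nsec % 1000000ULL);
}

#define SPLIT_NS(x)     nsec_high(x), nsec_low(x)
#define PN(x)           printf("%-40s: %llu.%06lu\n", #x, SPLIT_NS(x))

int main(void)
{
        unsigned long long ktime = 4123456789012ULL;    /* example clock sample, ns */
        unsigned long long sched_clk = 4123456791000ULL;

        PN(ktime);      /* prints "ktime : 4123456.789012" */
        PN(sched_clk);
        return 0;
}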
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 00ebd7686676..c88671718bc9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
89 89
90const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
91 91
92/*
93 * The exponential sliding window over which load is averaged for shares
94 * distribution.
95 * (default: 10msec)
96 */
97unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
98
92static const struct sched_class fair_sched_class; 99static const struct sched_class fair_sched_class;
93 100
94/************************************************************** 101/**************************************************************
@@ -143,6 +150,36 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
143 return cfs_rq->tg->cfs_rq[this_cpu]; 150 return cfs_rq->tg->cfs_rq[this_cpu];
144} 151}
145 152
153static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
154{
155 if (!cfs_rq->on_list) {
156 /*
157 * Ensure we either appear before our parent (if already
158 * enqueued) or force our parent to appear after us when it is
159 * enqueued. The fact that we always enqueue bottom-up
160 * reduces this to two cases.
161 */
162 if (cfs_rq->tg->parent &&
163 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
164 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
165 &rq_of(cfs_rq)->leaf_cfs_rq_list);
166 } else {
167 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
168 &rq_of(cfs_rq)->leaf_cfs_rq_list);
169 }
170
171 cfs_rq->on_list = 1;
172 }
173}
174
175static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
176{
177 if (cfs_rq->on_list) {
178 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
179 cfs_rq->on_list = 0;
180 }
181}
182
146/* Iterate through all leaf cfs_rq's on a runqueue */ 183/* Iterate through all leaf cfs_rq's on a runqueue */
147#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 184#define for_each_leaf_cfs_rq(rq, cfs_rq) \
148 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 185 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
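list_add_leaf_cfs_rq() above links a cfs_rq at the head of rq->leaf_cfs_rq_list when its parent is already on the list and at the tail otherwise, so that (per the comment) a group appears before its parent; list_del_leaf_cfs_rq() drops it once it is empty again. A toy userspace model of the same two-case insertion, using plain doubly linked list helpers (all names illustrative):

#include <stdio.h>
#include <stddef.h>

struct node {
        struct node *prev, *next;
};

struct group_rq {
        struct group_rq *parent;
        int on_list;
        struct node link;
};

static void list_init(struct node *head) { head->prev = head->next = head; }

static void list_add_front(struct node *n, struct node *head)
{
        n->next = head->next; n->prev = head;
        head->next->prev = n; head->next = n;
}

static void list_add_tail(struct node *n, struct node *head)
{
        n->prev = head->prev; n->next = head;
        head->prev->next = n; head->prev = n;
}

/* Mirror of the two cases in list_add_leaf_cfs_rq(): front of the per-rq
 * list when the parent is already linked, tail of the list otherwise. */
static void add_leaf(struct group_rq *grq, struct node *leaf_list)
{
        if (grq->on_list)
                return;
        if (grq->parent && grq->parent->on_list)
                list_add_front(&grq->link, leaf_list);  /* appear before the parent */
        else
                list_add_tail(&grq->link, leaf_list);   /* parent gets linked later */
        grq->on_list = 1;
}

int main(void)
{
        struct node leaf_list;
        struct group_rq root = { NULL, 0 }, child = { &root, 0 };

        list_init(&leaf_list);
        add_leaf(&child, &leaf_list);   /* bottom-up: child first ... */
        add_leaf(&root, &leaf_list);    /* ... then its parent */
        printf("child linked: %d, root linked: %d\n", child.on_list, root.on_list);
        return 0;
}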
@@ -246,6 +283,14 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
246 return &cpu_rq(this_cpu)->cfs; 283 return &cpu_rq(this_cpu)->cfs;
247} 284}
248 285
286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
287{
288}
289
290static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{
292}
293
249#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 294#define for_each_leaf_cfs_rq(rq, cfs_rq) \
250 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 295 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
251 296
@@ -417,7 +462,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 462 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 463 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 464 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 465#undef WRT_SYSCTL
422 466
423 return 0; 467 return 0;
@@ -495,6 +539,9 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
495 return calc_delta_fair(sched_slice(cfs_rq, se), se); 539 return calc_delta_fair(sched_slice(cfs_rq, se), se);
496} 540}
497 541
542static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
543static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
544
498/* 545/*
499 * Update the current task's runtime statistics. Skip current tasks that 546 * Update the current task's runtime statistics. Skip current tasks that
500 * are not in our scheduling class. 547 * are not in our scheduling class.
@@ -514,6 +561,14 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
514 561
515 curr->vruntime += delta_exec_weighted; 562 curr->vruntime += delta_exec_weighted;
516 update_min_vruntime(cfs_rq); 563 update_min_vruntime(cfs_rq);
564
565#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
566 cfs_rq->load_unacc_exec_time += delta_exec;
567 if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
568 update_cfs_load(cfs_rq, 0);
569 update_cfs_shares(cfs_rq, 0);
570 }
571#endif
517} 572}
518 573
519static void update_curr(struct cfs_rq *cfs_rq) 574static void update_curr(struct cfs_rq *cfs_rq)
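With the hunk above, every accounting pass also accumulates the cfs_rq's unaccounted runtime; only once more than sysctl_sched_shares_window (10 msec by default, see the earlier sched_fair.c hunk) has piled up are the load average and group shares refreshed. A compact userspace sketch of that throttling pattern, with made-up names standing in for the cfs_rq fields:

#include <stdio.h>

#define SHARES_WINDOW_NS        10000000ULL     /* 10 msec, the new sysctl default */

struct grq_stats {
        unsigned long long load_unacc_exec_time;
        unsigned long long refreshes;
};

static void refresh_load_and_shares(struct grq_stats *s)
{
        s->refreshes++;                 /* stands in for update_cfs_load()/update_cfs_shares() */
        s->load_unacc_exec_time = 0;    /* the real update_cfs_load() clears this too */
}

/* Shape of the hook added to __update_curr(): accumulate exec time and run
 * the (comparatively expensive) refresh at most once per window. */
static void account_exec(struct grq_stats *s, unsigned long long delta_exec)
{
        s->load_unacc_exec_time += delta_exec;
        if (s->load_unacc_exec_time > SHARES_WINDOW_NS)
                refresh_load_and_shares(s);
}

int main(void)
{
        struct grq_stats s = { 0, 0 };

        for (int i = 0; i < 100; i++)
                account_exec(&s, 1000000ULL);   /* 1 msec per tick */
        printf("refreshes over 100 msec of runtime: %llu\n", s.refreshes);
        return 0;
}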
@@ -633,7 +688,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 688 list_add(&se->group_node, &cfs_rq->tasks);
634 } 689 }
635 cfs_rq->nr_running++; 690 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 691}
638 692
639static void 693static void
@@ -647,9 +701,124 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 701 list_del_init(&se->group_node);
648 } 702 }
649 cfs_rq->nr_running--; 703 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 704}
652 705
706#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
707static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
708 int global_update)
709{
710 struct task_group *tg = cfs_rq->tg;
711 long load_avg;
712
713 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
714 load_avg -= cfs_rq->load_contribution;
715
716 if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) {
717 atomic_add(load_avg, &tg->load_weight);
718 cfs_rq->load_contribution += load_avg;
719 }
720}
721
722static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
723{
724 u64 period = sysctl_sched_shares_window;
725 u64 now, delta;
726 unsigned long load = cfs_rq->load.weight;
727
728 if (!cfs_rq)
729 return;
730
731 now = rq_of(cfs_rq)->clock;
732 delta = now - cfs_rq->load_stamp;
733
734 /* truncate load history at 4 idle periods */
735 if (cfs_rq->load_stamp > cfs_rq->load_last &&
736 now - cfs_rq->load_last > 4 * period) {
737 cfs_rq->load_period = 0;
738 cfs_rq->load_avg = 0;
739 }
740
741 cfs_rq->load_stamp = now;
742 cfs_rq->load_unacc_exec_time = 0;
743 cfs_rq->load_period += delta;
744 if (load) {
745 cfs_rq->load_last = now;
746 cfs_rq->load_avg += delta * load;
747 }
748
749 /* consider updating load contribution on each fold or truncate */
750 if (global_update || cfs_rq->load_period > period
751 || !cfs_rq->load_period)
752 update_cfs_rq_load_contribution(cfs_rq, global_update);
753
754 while (cfs_rq->load_period > period) {
755 /*
756 * Inline assembly required to prevent the compiler
757 * optimising this loop into a divmod call.
758 * See __iter_div_u64_rem() for another example of this.
759 */
760 asm("" : "+rm" (cfs_rq->load_period));
761 cfs_rq->load_period /= 2;
762 cfs_rq->load_avg /= 2;
763 }
764
765 if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg)
766 list_del_leaf_cfs_rq(cfs_rq);
767}
768
769static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
770 unsigned long weight)
771{
772 if (se->on_rq)
773 account_entity_dequeue(cfs_rq, se);
774
775 update_load_set(&se->load, weight);
776
777 if (se->on_rq)
778 account_entity_enqueue(cfs_rq, se);
779}
780
781static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
782{
783 struct task_group *tg;
784 struct sched_entity *se;
785 long load_weight, load, shares;
786
787 if (!cfs_rq)
788 return;
789
790 tg = cfs_rq->tg;
791 se = tg->se[cpu_of(rq_of(cfs_rq))];
792 if (!se)
793 return;
794
795 load = cfs_rq->load.weight + weight_delta;
796
797 load_weight = atomic_read(&tg->load_weight);
798 load_weight -= cfs_rq->load_contribution;
799 load_weight += load;
800
801 shares = (tg->shares * load);
802 if (load_weight)
803 shares /= load_weight;
804
805 if (shares < MIN_SHARES)
806 shares = MIN_SHARES;
807 if (shares > tg->shares)
808 shares = tg->shares;
809
810 reweight_entity(cfs_rq_of(se), se, shares);
811}
812#else /* CONFIG_FAIR_GROUP_SCHED */
813static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
814{
815}
816
817static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
818{
819}
820#endif /* CONFIG_FAIR_GROUP_SCHED */
821
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 822static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 823{
655#ifdef CONFIG_SCHEDSTATS 824#ifdef CONFIG_SCHEDSTATS
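The core of the new per-cpu shares machinery sits in the hunk above: update_cfs_load() folds elapsed time weighted by the queue load into load_avg, halves both load_period and load_avg once a full window has elapsed (a geometric decay with the shares window as half-life), and update_cfs_shares() then gives the group entity a weight of roughly tg->shares scaled by this cpu's portion of the group-wide load, clamped to [MIN_SHARES, tg->shares]. A userspace model of just the shares computation, assuming a MIN_SHARES floor of 2 and that load_contribution already matches the load previously folded into the global sum:

#include <stdio.h>

#define MIN_SHARES      2       /* assumed floor; the kernel uses its own MIN_SHARES */

/* Model of update_cfs_shares(): the per-cpu group entity gets
 *   tg_shares * local_load / total_group_load
 * clamped to [MIN_SHARES, tg_shares]. */
static long cpu_shares(long tg_shares, long local_load, long global_load_weight,
                       long local_contribution, long weight_delta)
{
        long load = local_load + weight_delta;
        long load_weight = global_load_weight - local_contribution + load;
        long shares = tg_shares * load;

        if (load_weight)
                shares /= load_weight;
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > tg_shares)
                shares = tg_shares;
        return shares;
}

int main(void)
{
        /* a group with shares 1024, 3/4 of whose load sits on this cpu */
        printf("shares on this cpu: %ld\n", cpu_shares(1024, 3072, 4096, 3072, 0));
        /* the same group seen from a cpu that holds none of its load */
        printf("shares on an idle cpu: %ld\n", cpu_shares(1024, 0, 4096, 0, 0));
        return 0;
}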
@@ -771,6 +940,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 940 * Update run-time statistics of the 'current'.
772 */ 941 */
773 update_curr(cfs_rq); 942 update_curr(cfs_rq);
943 update_cfs_load(cfs_rq, 0);
944 update_cfs_shares(cfs_rq, se->load.weight);
774 account_entity_enqueue(cfs_rq, se); 945 account_entity_enqueue(cfs_rq, se);
775 946
776 if (flags & ENQUEUE_WAKEUP) { 947 if (flags & ENQUEUE_WAKEUP) {
@@ -782,6 +953,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 953 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 954 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 955 __enqueue_entity(cfs_rq, se);
956 se->on_rq = 1;
957
958 if (cfs_rq->nr_running == 1)
959 list_add_leaf_cfs_rq(cfs_rq);
785} 960}
786 961
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 962static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +1000,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 1000
826 if (se != cfs_rq->curr) 1001 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 1002 __dequeue_entity(cfs_rq, se);
1003 se->on_rq = 0;
1004 update_cfs_load(cfs_rq, 0);
828 account_entity_dequeue(cfs_rq, se); 1005 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq); 1006 update_min_vruntime(cfs_rq);
1007 update_cfs_shares(cfs_rq, 0);
830 1008
831 /* 1009 /*
832 * Normalize the entity after updating the min_vruntime because the 1010 * Normalize the entity after updating the min_vruntime because the
@@ -1055,6 +1233,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1233 flags = ENQUEUE_WAKEUP;
1056 } 1234 }
1057 1235
1236 for_each_sched_entity(se) {
1237 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1238
1239 update_cfs_load(cfs_rq, 0);
1240 update_cfs_shares(cfs_rq, 0);
1241 }
1242
1058 hrtick_update(rq); 1243 hrtick_update(rq);
1059} 1244}
1060 1245
@@ -1071,12 +1256,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1071 for_each_sched_entity(se) { 1256 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1257 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1258 dequeue_entity(cfs_rq, se, flags);
1259
1074 /* Don't dequeue parent if it has other entities besides us */ 1260 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1261 if (cfs_rq->load.weight)
1076 break; 1262 break;
1077 flags |= DEQUEUE_SLEEP; 1263 flags |= DEQUEUE_SLEEP;
1078 } 1264 }
1079 1265
1266 for_each_sched_entity(se) {
1267 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1268
1269 update_cfs_load(cfs_rq, 0);
1270 update_cfs_shares(cfs_rq, 0);
1271 }
1272
1080 hrtick_update(rq); 1273 hrtick_update(rq);
1081} 1274}
1082 1275
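Both enqueue_task_fair() and dequeue_task_fair() gain a second bottom-up walk: after the entity chain itself has been (de)queued, every ancestor level refreshes its load average and shares so group weights track the change immediately. A rough userspace sketch of that shape, with illustrative stand-ins for cfs_rq and its helpers:

#include <stdio.h>

struct level {
        struct level *parent;
        const char *name;
        long nr_running;
};

static void refresh_level(struct level *l)
{
        /* stands in for update_cfs_load(cfs_rq, 0); update_cfs_shares(cfs_rq, 0); */
        printf("refresh %s (nr_running=%ld)\n", l->name, l->nr_running);
}

int main(void)
{
        struct level root  = { NULL,   "root",  5 };
        struct level group = { &root,  "group", 2 };
        struct level leaf  = { &group, "leaf",  1 };

        /* first loop: (de)queue entities bottom-up (elided); second loop: */
        for (struct level *l = &leaf; l; l = l->parent)
                refresh_level(l);
        return 0;
}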
@@ -1143,51 +1336,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1336 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1337 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1338 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1339 */
1161static long effective_load(struct task_group *tg, int cpu, 1340static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1341{
1164 struct sched_entity *se = tg->se[cpu]; 1342 struct sched_entity *se = tg->se[cpu];
1165 1343
1166 if (!tg->parent) 1344 if (!tg->parent)
1167 return wl; 1345 return wl;
1168 1346
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1347 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1348 long S, rw, s, a, b;
1178 long more_w;
1179
1180 /*
1181 * Instead of using this increment, also add the difference
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1349
1188 S = se->my_q->tg->shares; 1350 S = se->my_q->tg->shares;
1189 s = se->my_q->shares; 1351 s = se->load.weight;
1190 rw = se->my_q->rq_weight; 1352 rw = se->my_q->load.weight;
1191 1353
1192 a = S*(rw + wl); 1354 a = S*(rw + wl);
1193 b = S*rw + s*wg; 1355 b = S*rw + s*wg;
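effective_load() loses its ratelimit-compensation logic now that shares are kept continuously up to date; each level simply compares the group's slice of tg->shares with and without the extra weight wl on this cpu's queue (a = S*(rw + wl) versus b = S*rw + s*wg). A small numeric model of one such level, under the assumption that the rest of the loop, outside this hunk, reduces that comparison to a per-level delta of s*(a-b)/b:

#include <stdio.h>

/* One level of the effective_load() estimate: how much this cpu's slice of the
 * group weight S shifts when extra weight wl joins a queue currently carrying
 * rw, of which s belongs to our entity.  The s*(a-b)/b reduction is an
 * assumption about code not shown in the hunk above. */
static long level_delta(long S, long rw, long s, long wl, long wg)
{
        long a = S * (rw + wl);
        long b = S * rw + s * wg;
        long delta = s * (a - b);

        if (b)
                delta /= b;
        return delta;
}

int main(void)
{
        /* group shares 1024; queue carries 2048 of which 512 is ours;
         * a task of weight 1024 is about to be woken onto this queue */
        printf("per-level weight delta: %ld\n", level_delta(1024, 2048, 512, 1024, 1024));
        return 0;
}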
@@ -1508,23 +1670,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1670 sd = tmp;
1509 } 1671 }
1510 1672
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1673 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1674 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1675 return select_idle_sibling(p, cpu);
@@ -1909,6 +2054,48 @@ out:
1909} 2054}
1910 2055
1911#ifdef CONFIG_FAIR_GROUP_SCHED 2056#ifdef CONFIG_FAIR_GROUP_SCHED
2057/*
2058 * update tg->load_weight by folding this cpu's load_avg
2059 */
2060static int update_shares_cpu(struct task_group *tg, int cpu)
2061{
2062 struct cfs_rq *cfs_rq;
2063 unsigned long flags;
2064 struct rq *rq;
2065
2066 if (!tg->se[cpu])
2067 return 0;
2068
2069 rq = cpu_rq(cpu);
2070 cfs_rq = tg->cfs_rq[cpu];
2071
2072 raw_spin_lock_irqsave(&rq->lock, flags);
2073
2074 update_rq_clock(rq);
2075 update_cfs_load(cfs_rq, 1);
2076
2077 /*
2078 * We need to update shares after updating tg->load_weight in
2079 * order to adjust the weight of groups with long running tasks.
2080 */
2081 update_cfs_shares(cfs_rq, 0);
2082
2083 raw_spin_unlock_irqrestore(&rq->lock, flags);
2084
2085 return 0;
2086}
2087
2088static void update_shares(int cpu)
2089{
2090 struct cfs_rq *cfs_rq;
2091 struct rq *rq = cpu_rq(cpu);
2092
2093 rcu_read_lock();
2094 for_each_leaf_cfs_rq(rq, cfs_rq)
2095 update_shares_cpu(cfs_rq->tg, cpu);
2096 rcu_read_unlock();
2097}
2098
1912static unsigned long 2099static unsigned long
1913load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2100load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1914 unsigned long max_load_move, 2101 unsigned long max_load_move,
@@ -1956,6 +2143,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1956 return max_load_move - rem_load_move; 2143 return max_load_move - rem_load_move;
1957} 2144}
1958#else 2145#else
2146static inline void update_shares(int cpu)
2147{
2148}
2149
1959static unsigned long 2150static unsigned long
1960load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2151load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1961 unsigned long max_load_move, 2152 unsigned long max_load_move,
@@ -3032,7 +3223,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3032 schedstat_inc(sd, lb_count[idle]); 3223 schedstat_inc(sd, lb_count[idle]);
3033 3224
3034redo: 3225redo:
3035 update_shares(sd);
3036 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3226 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3037 cpus, balance); 3227 cpus, balance);
3038 3228
@@ -3174,8 +3364,6 @@ out_one_pinned:
3174 else 3364 else
3175 ld_moved = 0; 3365 ld_moved = 0;
3176out: 3366out:
3177 if (ld_moved)
3178 update_shares(sd);
3179 return ld_moved; 3367 return ld_moved;
3180} 3368}
3181 3369
@@ -3199,6 +3387,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3199 */ 3387 */
3200 raw_spin_unlock(&this_rq->lock); 3388 raw_spin_unlock(&this_rq->lock);
3201 3389
3390 update_shares(this_cpu);
3202 for_each_domain(this_cpu, sd) { 3391 for_each_domain(this_cpu, sd) {
3203 unsigned long interval; 3392 unsigned long interval;
3204 int balance = 1; 3393 int balance = 1;
@@ -3569,6 +3758,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3569 int update_next_balance = 0; 3758 int update_next_balance = 0;
3570 int need_serialize; 3759 int need_serialize;
3571 3760
3761 update_shares(cpu);
3762
3572 for_each_domain(cpu, sd) { 3763 for_each_domain(cpu, sd) {
3573 if (!(sd->flags & SD_LOAD_BALANCE)) 3764 if (!(sd->flags & SD_LOAD_BALANCE))
3574 continue; 3765 continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index bea7d79f7e9c..c914ec747ca6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,17 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); 183 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
184} 184}
185 185
186static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
187{
188 list_add_rcu(&rt_rq->leaf_rt_rq_list,
189 &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list);
190}
191
192static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
193{
194 list_del_rcu(&rt_rq->leaf_rt_rq_list);
195}
196
186#define for_each_leaf_rt_rq(rt_rq, rq) \ 197#define for_each_leaf_rt_rq(rt_rq, rq) \
187 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) 198 list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
188 199
@@ -276,6 +287,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
276 return ktime_to_ns(def_rt_bandwidth.rt_period); 287 return ktime_to_ns(def_rt_bandwidth.rt_period);
277} 288}
278 289
290static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
291{
292}
293
294static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq)
295{
296}
297
279#define for_each_leaf_rt_rq(rt_rq, rq) \ 298#define for_each_leaf_rt_rq(rt_rq, rq) \
280 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) 299 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
281 300
@@ -825,6 +844,9 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 844 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
826 return; 845 return;
827 846
847 if (!rt_rq->rt_nr_running)
848 list_add_leaf_rt_rq(rt_rq);
849
828 if (head) 850 if (head)
829 list_add(&rt_se->run_list, queue); 851 list_add(&rt_se->run_list, queue);
830 else 852 else
@@ -844,6 +866,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
844 __clear_bit(rt_se_prio(rt_se), array->bitmap); 866 __clear_bit(rt_se_prio(rt_se), array->bitmap);
845 867
846 dec_rt_tasks(rt_se, rt_rq); 868 dec_rt_tasks(rt_se, rt_rq);
869 if (!rt_rq->rt_nr_running)
870 list_del_leaf_rt_rq(rt_rq);
847} 871}
848 872
849/* 873/*
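The RT side gets the same leaf-list treatment as CFS: an rt_rq is linked onto the per-rq list when its first entity is enqueued (rt_nr_running was still 0) and unlinked again once dec_rt_tasks() drops the count back to 0. A toy userspace model of that "on the list exactly while non-empty" invariant, names illustrative only:

#include <stdio.h>

struct rt_queue {
        int nr_running;
        int on_leaf_list;
};

static void enqueue(struct rt_queue *q)
{
        if (!q->nr_running)
                q->on_leaf_list = 1;    /* list_add_leaf_rt_rq() */
        q->nr_running++;
}

static void dequeue(struct rt_queue *q)
{
        q->nr_running--;
        if (!q->nr_running)
                q->on_leaf_list = 0;    /* list_del_leaf_rt_rq() */
}

int main(void)
{
        struct rt_queue q = { 0, 0 };

        enqueue(&q); enqueue(&q);
        dequeue(&q);
        printf("1 task left  -> on list: %d\n", q.on_leaf_list);
        dequeue(&q);
        printf("0 tasks left -> on list: %d\n", q.on_leaf_list);
        return 0;
}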
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 18f4be0d5fe0..d4d918a91881 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -853,7 +853,9 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
853 cpumask_any(cpu_online_mask)); 853 cpumask_any(cpu_online_mask));
854 case CPU_DEAD: 854 case CPU_DEAD:
855 case CPU_DEAD_FROZEN: { 855 case CPU_DEAD_FROZEN: {
856 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 856 static struct sched_param param = {
857 .sched_priority = MAX_RT_PRIO-1
858 };
857 859
858 p = per_cpu(ksoftirqd, hotcpu); 860 p = per_cpu(ksoftirqd, hotcpu);
859 per_cpu(ksoftirqd, hotcpu) = NULL; 861 per_cpu(ksoftirqd, hotcpu) = NULL;
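Here, and in the trace_selftest.c and watchdog.c hunks further down, the on-stack struct sched_param initializers become static: sched_setscheduler() only reads the parameter block, so a single static copy is enough and nothing has to be rebuilt on the stack per call. A minimal userspace analogue using the POSIX API (the priority value is just an example):

#include <stdio.h>
#include <sched.h>

int main(void)
{
        /* sched_setscheduler() never modifies the param block, so a static,
         * effectively read-only instance suffices -- same idea as the
         * kernel-side change above. */
        static const struct sched_param param = { .sched_priority = 5 };

        if (sched_setscheduler(0, SCHED_FIFO, &param) != 0)
                perror("sched_setscheduler (needs root/CAP_SYS_NICE)");
        else
                printf("running as SCHED_FIFO prio %d\n", param.sched_priority);
        return 0;
}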
diff --git a/kernel/sys.c b/kernel/sys.c
index 7f5a0cd296a9..2745dcdb6c6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1080,8 +1080,10 @@ SYSCALL_DEFINE0(setsid)
1080 err = session; 1080 err = session;
1081out: 1081out:
1082 write_unlock_irq(&tasklist_lock); 1082 write_unlock_irq(&tasklist_lock);
1083 if (err > 0) 1083 if (err > 0) {
1084 proc_sid_connector(group_leader); 1084 proc_sid_connector(group_leader);
1085 sched_autogroup_create_attach(group_leader);
1086 }
1085 return err; 1087 return err;
1086} 1088}
1087 1089
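setsid() now also attaches the new session leader to a freshly created autogroup (see the sched_autogroup files in the diffstat), so sessions compete with each other as groups rather than as individual tasks. Nothing changes for userspace beyond the usual daemonizing pattern; a minimal sketch:

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
        /* Classic first step of daemonizing: fork, let the parent exit, and
         * have the child call setsid().  With CONFIG_SCHED_AUTOGROUP the
         * kernel now also moves the new session into its own autogroup here. */
        pid_t pid = fork();

        if (pid < 0) {
                perror("fork");
                return EXIT_FAILURE;
        }
        if (pid > 0)
                return EXIT_SUCCESS;    /* parent */

        if (setsid() < 0) {
                perror("setsid");
                return EXIT_FAILURE;
        }
        printf("new session (and autogroup) leader: pid %d\n", (int)getpid());
        return EXIT_SUCCESS;
}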
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5abfa1518554..121e4fff03d1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262static int min_sched_shares_ratelimit = 100000; /* 100 usec */
263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
264#endif 262#endif
265 263
266#ifdef CONFIG_COMPACTION 264#ifdef CONFIG_COMPACTION
@@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = {
305 .extra2 = &max_wakeup_granularity_ns, 303 .extra2 = &max_wakeup_granularity_ns,
306 }, 304 },
307 { 305 {
308 .procname = "sched_shares_ratelimit",
309 .data = &sysctl_sched_shares_ratelimit,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = sched_proc_update_handler,
313 .extra1 = &min_sched_shares_ratelimit,
314 .extra2 = &max_sched_shares_ratelimit,
315 },
316 {
317 .procname = "sched_tunable_scaling", 306 .procname = "sched_tunable_scaling",
318 .data = &sysctl_sched_tunable_scaling, 307 .data = &sysctl_sched_tunable_scaling,
319 .maxlen = sizeof(enum sched_tunable_scaling), 308 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = {
323 .extra2 = &max_sched_tunable_scaling, 312 .extra2 = &max_sched_tunable_scaling,
324 }, 313 },
325 { 314 {
326 .procname = "sched_shares_thresh",
327 .data = &sysctl_sched_shares_thresh,
328 .maxlen = sizeof(unsigned int),
329 .mode = 0644,
330 .proc_handler = proc_dointvec_minmax,
331 .extra1 = &zero,
332 },
333 {
334 .procname = "sched_migration_cost", 315 .procname = "sched_migration_cost",
335 .data = &sysctl_sched_migration_cost, 316 .data = &sysctl_sched_migration_cost,
336 .maxlen = sizeof(unsigned int), 317 .maxlen = sizeof(unsigned int),
@@ -352,6 +333,13 @@ static struct ctl_table kern_table[] = {
352 .proc_handler = proc_dointvec, 333 .proc_handler = proc_dointvec,
353 }, 334 },
354 { 335 {
336 .procname = "sched_shares_window",
337 .data = &sysctl_sched_shares_window,
338 .maxlen = sizeof(unsigned int),
339 .mode = 0644,
340 .proc_handler = proc_dointvec,
341 },
342 {
355 .procname = "timer_migration", 343 .procname = "timer_migration",
356 .data = &sysctl_timer_migration, 344 .data = &sysctl_timer_migration,
357 .maxlen = sizeof(unsigned int), 345 .maxlen = sizeof(unsigned int),
@@ -382,6 +370,17 @@ static struct ctl_table kern_table[] = {
382 .mode = 0644, 370 .mode = 0644,
383 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
384 }, 372 },
373#ifdef CONFIG_SCHED_AUTOGROUP
374 {
375 .procname = "sched_autogroup_enabled",
376 .data = &sysctl_sched_autogroup_enabled,
377 .maxlen = sizeof(unsigned int),
378 .mode = 0644,
379 .proc_handler = proc_dointvec,
380 .extra1 = &zero,
381 .extra2 = &one,
382 },
383#endif
385#ifdef CONFIG_PROVE_LOCKING 384#ifdef CONFIG_PROVE_LOCKING
386 { 385 {
387 .procname = "prove_locking", 386 .procname = "prove_locking",
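On the sysctl side, sched_shares_ratelimit and sched_shares_thresh disappear together with the old ratelimited shares code, while sched_shares_window (nanoseconds, 10 msec default) and, under CONFIG_SCHED_AUTOGROUP, the 0/1 toggle sched_autogroup_enabled are added to kern_table. Those procnames surface under /proc/sys/kernel/; a small userspace reader, with the paths inferred from the table entries above and assuming the running kernel carries them:

#include <stdio.h>

static long read_sysctl(const char *path)
{
        FILE *f = fopen(path, "r");
        long val = -1;

        if (!f)
                return -1;
        if (fscanf(f, "%ld", &val) != 1)
                val = -1;
        fclose(f);
        return val;
}

int main(void)
{
        printf("sched_shares_window     : %ld ns\n",
               read_sysctl("/proc/sys/kernel/sched_shares_window"));
        printf("sched_autogroup_enabled : %ld\n",
               read_sysctl("/proc/sys/kernel/sched_autogroup_enabled"));
        return 0;
}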
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 155a415b3209..562c56e048fd 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -558,7 +558,7 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
558static int trace_wakeup_test_thread(void *data) 558static int trace_wakeup_test_thread(void *data)
559{ 559{
560 /* Make this a RT thread, doesn't need to be too high */ 560 /* Make this a RT thread, doesn't need to be too high */
561 struct sched_param param = { .sched_priority = 5 }; 561 static struct sched_param param = { .sched_priority = 5 };
562 struct completion *x = data; 562 struct completion *x = data;
563 563
564 sched_setscheduler(current, SCHED_FIFO, &param); 564 sched_setscheduler(current, SCHED_FIFO, &param);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6e3c41a4024c..14b8120d5232 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -307,7 +307,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
307 */ 307 */
308static int watchdog(void *unused) 308static int watchdog(void *unused)
309{ 309{
310 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 310 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 311 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
312 312
313 sched_setscheduler(current, SCHED_FIFO, &param); 313 sched_setscheduler(current, SCHED_FIFO, &param);