Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/auto_group.c |   3
-rw-r--r--  kernel/sched/core.c       | 159
-rw-r--r--  kernel/sched/cpupri.c     |   2
-rw-r--r--  kernel/sched/cputime.c    | 314
-rw-r--r--  kernel/sched/debug.c      | 101
-rw-r--r--  kernel/sched/fair.c       |  29
-rw-r--r--  kernel/sched/rt.c         |  28
-rw-r--r--  kernel/sched/sched.h      |   2
-rw-r--r--  kernel/sched/stats.c      |  79
9 files changed, 552 insertions, 165 deletions
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
 	ag->tg->rt_se = NULL;
 	ag->tg->rt_rq = NULL;
 #endif
+	sched_offline_group(ag->tg);
 	sched_destroy_group(ag->tg);
 }
 
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
 	if (IS_ERR(tg))
 		goto out_free;
 
+	sched_online_group(tg, &root_task_group);
+
 	kref_init(&ag->kref);
 	init_rwsem(&ag->lock);
 	ag->id = atomic_inc_return(&autogroup_seq_nr);
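
The autogroup change above splits task-group setup into a separate allocation step and a publication step. A minimal kernel-context sketch of the intended call order follows; example_group_lifecycle() is a hypothetical caller and not part of this patch set, only the four sched_*_group() calls are from the diff.

/* Hypothetical caller; illustrates create -> online -> offline -> destroy. */
static int example_group_lifecycle(void)
{
	struct task_group *tg;

	tg = sched_create_group(&root_task_group);	/* allocate only */
	if (IS_ERR(tg))
		return PTR_ERR(tg);

	sched_online_group(tg, &root_task_group);	/* publish on the task_groups list */

	/* ... the group is now visible to the scheduler ... */

	sched_offline_group(tg);	/* unlink; stops new references */
	sched_destroy_group(tg);	/* free via RCU once readers are done */
	return 0;
}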
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d6fdcdcbb9b1..7f12624a393c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,7 +83,7 @@
 #endif
 
 #include "sched.h"
-#include "../workqueue_sched.h"
+#include "../workqueue_internal.h"
 #include "../smpboot.h"
 
 #define CREATE_TRACE_POINTS
@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
-	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+	int nid = cpu_to_node(cpu);
+	const struct cpumask *nodemask = NULL;
 	enum { cpuset, possible, fail } state = cpuset;
 	int dest_cpu;
 
-	/* Look for allowed, online CPU in same node. */
-	for_each_cpu(dest_cpu, nodemask) {
-		if (!cpu_online(dest_cpu))
-			continue;
-		if (!cpu_active(dest_cpu))
-			continue;
-		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-			return dest_cpu;
+	/*
+	 * If the node that the cpu is on has been offlined, cpu_to_node()
+	 * will return -1. There is no cpu on the node, and we should
+	 * select the cpu on the other node.
+	 */
+	if (nid != -1) {
+		nodemask = cpumask_of_node(nid);
+
+		/* Look for allowed, online CPU in same node. */
+		for_each_cpu(dest_cpu, nodemask) {
+			if (!cpu_online(dest_cpu))
+				continue;
+			if (!cpu_active(dest_cpu))
+				continue;
+			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+				return dest_cpu;
+		}
 	}
 
 	for (;;) {
@@ -1523,7 +1533,8 @@ out:
  */
 int wake_up_process(struct task_struct *p)
 {
-	return try_to_wake_up(p, TASK_ALL, 0);
+	WARN_ON(task_is_stopped_or_traced(p));
+	return try_to_wake_up(p, TASK_NORMAL, 0);
 }
 EXPORT_SYMBOL(wake_up_process);
 
@@ -1741,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 	struct preempt_notifier *notifier;
-	struct hlist_node *node;
 
-	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
 		notifier->ops->sched_in(notifier, raw_smp_processor_id());
 }
 
@@ -1752,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 			       struct task_struct *next)
 {
 	struct preempt_notifier *notifier;
-	struct hlist_node *node;
 
-	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
 		notifier->ops->sched_out(notifier, next);
 }
 
@@ -1968,11 +1977,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
 }
 
 /*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
  *
  * externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
  */
 unsigned long nr_running(void)
 {
@@ -1984,23 +1992,6 @@ unsigned long nr_running(void)
 	return sum;
 }
 
-unsigned long nr_uninterruptible(void)
-{
-	unsigned long i, sum = 0;
-
-	for_each_possible_cpu(i)
-		sum += cpu_rq(i)->nr_uninterruptible;
-
-	/*
-	 * Since we read the counters lockless, it might be slightly
-	 * inaccurate. Do not allow it to go below zero though:
-	 */
-	if (unlikely((long)sum < 0))
-		sum = 0;
-
-	return sum;
-}
-
 unsigned long long nr_context_switches(void)
 {
 	int i;
@@ -2785,7 +2776,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
 	dump_stack();
-	add_taint(TAINT_WARN);
+	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
 
 /*
@@ -4410,20 +4401,32 @@ EXPORT_SYMBOL(yield);
  * It's the caller's job to ensure that the target task struct
  * can't go away on us before we can do any checks.
  *
- * Returns true if we indeed boosted the target task.
+ * Returns:
+ *	true (>0) if we indeed boosted the target task.
+ *	false (0) if we failed to boost the target.
+ *	-ESRCH if there's no task to yield to.
  */
 bool __sched yield_to(struct task_struct *p, bool preempt)
 {
 	struct task_struct *curr = current;
 	struct rq *rq, *p_rq;
 	unsigned long flags;
-	bool yielded = 0;
+	int yielded = 0;
 
 	local_irq_save(flags);
 	rq = this_rq();
 
 again:
 	p_rq = task_rq(p);
+	/*
+	 * If we're the only runnable task on the rq and target rq also
+	 * has only one task, there's absolutely no point in yielding.
+	 */
+	if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+		yielded = -ESRCH;
+		goto out_irq;
+	}
+
 	double_rq_lock(rq, p_rq);
 	while (task_rq(p) != p_rq) {
 		double_rq_unlock(rq, p_rq);
@@ -4431,13 +4434,13 @@ again:
 	}
 
 	if (!curr->sched_class->yield_to_task)
-		goto out;
+		goto out_unlock;
 
 	if (curr->sched_class != p->sched_class)
-		goto out;
+		goto out_unlock;
 
 	if (task_running(p_rq, p) || p->state)
-		goto out;
+		goto out_unlock;
 
 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
 	if (yielded) {
@@ -4450,11 +4453,12 @@ again:
 			resched_task(p_rq->curr);
 	}
 
-out:
+out_unlock:
 	double_rq_unlock(rq, p_rq);
+out_irq:
 	local_irq_restore(flags);
 
-	if (yielded)
+	if (yielded > 0)
 		schedule();
 
 	return yielded;
@@ -4713,6 +4717,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 */
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
+	vtime_init_idle(idle);
 #if defined(CONFIG_SMP)
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
@@ -7206,7 +7211,6 @@ static void free_sched_group(struct task_group *tg)
 struct task_group *sched_create_group(struct task_group *parent)
 {
 	struct task_group *tg;
-	unsigned long flags;
 
 	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
 	if (!tg)
@@ -7218,6 +7222,17 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+	return tg;
+
+err:
+	free_sched_group(tg);
+	return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+	unsigned long flags;
+
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_add_rcu(&tg->list, &task_groups);
 
@@ -7227,12 +7242,6 @@ struct task_group *sched_create_group(struct task_group *parent)
 	INIT_LIST_HEAD(&tg->children);
 	list_add_rcu(&tg->siblings, &parent->children);
 	spin_unlock_irqrestore(&task_group_lock, flags);
-
-	return tg;
-
-err:
-	free_sched_group(tg);
-	return ERR_PTR(-ENOMEM);
 }
 
 /* rcu callback to free various structures associated with a task group */
@@ -7245,6 +7254,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
 /* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
+	/* wait for possible concurrent references to cfs_rqs complete */
+	call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
 	unsigned long flags;
 	int i;
 
@@ -7256,9 +7271,6 @@ void sched_destroy_group(struct task_group *tg)
 	list_del_rcu(&tg->list);
 	list_del_rcu(&tg->siblings);
 	spin_unlock_irqrestore(&task_group_lock, flags);
-
-	/* wait for possible concurrent references to cfs_rqs complete */
-	call_rcu(&tg->rcu, free_sched_group_rcu);
 }
 
 /* change task's runqueue when it moves between groups.
@@ -7554,6 +7566,25 @@ static int sched_rt_global_constraints(void)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+int sched_rr_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	/* make sure that internally we keep jiffies */
+	/* also, writing zero resets timeslice to default */
+	if (!ret && write) {
+		sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+	}
+	mutex_unlock(&mutex);
+	return ret;
+}
+
 int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -7610,6 +7641,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
 	return &tg->css;
 }
 
+static int cpu_cgroup_css_online(struct cgroup *cgrp)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct task_group *parent;
+
+	if (!cgrp->parent)
+		return 0;
+
+	parent = cgroup_tg(cgrp->parent);
+	sched_online_group(tg, parent);
+	return 0;
+}
+
 static void cpu_cgroup_css_free(struct cgroup *cgrp)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
@@ -7617,6 +7661,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
 	sched_destroy_group(tg);
 }
 
+static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	sched_offline_group(tg);
+}
+
 static int cpu_cgroup_can_attach(struct cgroup *cgrp,
 				 struct cgroup_taskset *tset)
 {
@@ -7972,6 +8023,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.name = "cpu",
 	.css_alloc = cpu_cgroup_css_alloc,
 	.css_free = cpu_cgroup_css_free,
+	.css_online = cpu_cgroup_css_online,
+	.css_offline = cpu_cgroup_css_offline,
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
 	.exit = cpu_cgroup_exit,
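
The new sched_rr_handler() above takes the SCHED_RR timeslice from userspace in milliseconds, stores it internally in jiffies, and treats a non-positive write as a request to restore the RR_TIMESLICE default. A small self-contained userspace sketch of that value handling follows; HZ is an assumed tick rate and example_msecs_to_jiffies() deliberately simplifies the kernel's rounding.

#include <stdio.h>

#define HZ 250				/* assumed tick rate for the example */
#define RR_TIMESLICE (100 * HZ / 1000)	/* default timeslice: 100 ms in jiffies */

/* Simplified conversion; the kernel's msecs_to_jiffies() also rounds up. */
static long example_msecs_to_jiffies(long ms)
{
	return ms * HZ / 1000;
}

/* Mirrors the decision sched_rr_handler() makes after proc_dointvec(). */
static long apply_rr_write(long written_ms)
{
	return written_ms <= 0 ? RR_TIMESLICE : example_msecs_to_jiffies(written_ms);
}

int main(void)
{
	printf("write 30 ms -> %ld jiffies\n", apply_rr_write(30));
	printf("write 0     -> %ld jiffies (default restored)\n", apply_rr_write(0));
	return 0;
}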
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789c53ee..1095e878a46f 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
  */
 
 #include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
 #include "cpupri.h"
 
 /* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 293b202fcf79..ed12cbb135f4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kernel_stat.h>
 #include <linux/static_key.h>
+#include <linux/context_tracking.h>
 #include "sched.h"
 
 
@@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 	task_group_account_field(p, index, (__force u64) cputime);
 
 	/* Account for user time used */
-	acct_update_integrals(p);
+	acct_account_cputime(p);
 }
 
 /*
@@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
 	task_group_account_field(p, index, (__force u64) cputime);
 
 	/* Account for system time used */
-	acct_update_integrals(p);
+	acct_account_cputime(p);
 }
 
 /*
@@ -295,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct signal_struct *sig = tsk->signal;
+	cputime_t utime, stime;
 	struct task_struct *t;
 
 	times->utime = sig->utime;
@@ -308,16 +310,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 
 	t = tsk;
 	do {
-		times->utime += t->utime;
-		times->stime += t->stime;
+		task_cputime(tsk, &utime, &stime);
+		times->utime += utime;
+		times->stime += stime;
 		times->sum_exec_runtime += task_sched_runtime(t);
 	} while_each_thread(tsk, t);
 out:
 	rcu_read_unlock();
 }
 
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 /*
  * Account a tick to a process and cpustat
@@ -382,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks)
 	irqtime_account_process_tick(current, 0, rq);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 						struct rq *rq) {}
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /*
  * Account a single tick of cpu time.
  * @p: the process that the cpu time gets accounted to
@@ -397,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();
 
+	if (vtime_accounting_enabled())
+		return;
+
 	if (sched_clock_irqtime) {
 		irqtime_account_process_tick(p, user_tick, rq);
 		return;
@@ -438,8 +443,7 @@ void account_idle_ticks(unsigned long ticks)
 
 	account_idle_time(jiffies_to_cputime(ticks));
 }
-
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 /*
  * Use precise platform statistics if available:
@@ -461,25 +465,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
 	*st = cputime.stime;
 }
 
-void vtime_account_system_irqsafe(struct task_struct *tsk)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	vtime_account_system(tsk);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
-
 #ifndef __ARCH_HAS_VTIME_TASK_SWITCH
 void vtime_task_switch(struct task_struct *prev)
 {
+	if (!vtime_accounting_enabled())
+		return;
+
 	if (is_idle_task(prev))
 		vtime_account_idle(prev);
 	else
 		vtime_account_system(prev);
 
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 	vtime_account_user(prev);
+#endif
 	arch_vtime_task_switch(prev);
 }
 #endif
@@ -493,27 +492,40 @@ void vtime_task_switch(struct task_struct *prev)
  * vtime_account().
  */
 #ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
 {
-	if (in_interrupt() || !is_idle_task(tsk))
-		vtime_account_system(tsk);
-	else
-		vtime_account_idle(tsk);
+	if (!vtime_accounting_enabled())
+		return;
+
+	if (!in_interrupt()) {
+		/*
+		 * If we interrupted user, context_tracking_in_user()
+		 * is 1 because the context tracking don't hook
+		 * on irq entry/exit. This way we know if
+		 * we need to flush user time on kernel entry.
+		 */
+		if (context_tracking_in_user()) {
+			vtime_account_user(tsk);
+			return;
+		}
+
+		if (is_idle_task(tsk)) {
+			vtime_account_idle(tsk);
+			return;
+		}
+	}
+	vtime_account_system(tsk);
 }
-EXPORT_SYMBOL_GPL(vtime_account);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 
-#else
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
-#endif
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
-static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
 {
 	u64 temp = (__force u64) rtime;
 
-	temp *= (__force u64) utime;
+	temp *= (__force u64) stime;
 
 	if (sizeof(cputime_t) == 4)
 		temp = div_u64(temp, (__force u32) total);
@@ -531,10 +543,10 @@ static void cputime_adjust(struct task_cputime *curr,
 			   struct cputime *prev,
 			   cputime_t *ut, cputime_t *st)
 {
-	cputime_t rtime, utime, total;
+	cputime_t rtime, stime, total;
 
-	utime = curr->utime;
-	total = utime + curr->stime;
+	stime = curr->stime;
+	total = stime + curr->utime;
 
 	/*
 	 * Tick based cputime accounting depend on random scheduling
@@ -549,17 +561,17 @@ static void cputime_adjust(struct task_cputime *curr,
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
 	if (total)
-		utime = scale_utime(utime, rtime, total);
+		stime = scale_stime(stime, rtime, total);
 	else
-		utime = rtime;
+		stime = rtime;
 
 	/*
 	 * If the tick based count grows faster than the scheduler one,
 	 * the result of the scaling may go backward.
 	 * Let's enforce monotonicity.
 	 */
-	prev->utime = max(prev->utime, utime);
-	prev->stime = max(prev->stime, rtime - prev->utime);
+	prev->stime = max(prev->stime, stime);
+	prev->utime = max(prev->utime, rtime - prev->stime);
 
 	*ut = prev->utime;
 	*st = prev->stime;
@@ -568,11 +580,10 @@ static void cputime_adjust(struct task_cputime *curr,
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	struct task_cputime cputime = {
-		.utime = p->utime,
-		.stime = p->stime,
 		.sum_exec_runtime = p->se.sum_exec_runtime,
 	};
 
+	task_cputime(p, &cputime.utime, &cputime.stime);
 	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
 
@@ -586,4 +597,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
 	thread_group_cputime(p, &cputime);
 	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 }
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static unsigned long long vtime_delta(struct task_struct *tsk)
+{
+	unsigned long long clock;
+
+	clock = local_clock();
+	if (clock < tsk->vtime_snap)
+		return 0;
+
+	return clock - tsk->vtime_snap;
+}
+
+static cputime_t get_vtime_delta(struct task_struct *tsk)
+{
+	unsigned long long delta = vtime_delta(tsk);
+
+	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+	tsk->vtime_snap += delta;
+
+	/* CHECKME: always safe to convert nsecs to cputime? */
+	return nsecs_to_cputime(delta);
+}
+
+static void __vtime_account_system(struct task_struct *tsk)
+{
+	cputime_t delta_cpu = get_vtime_delta(tsk);
+
+	account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+	if (!vtime_accounting_enabled())
+		return;
+
+	write_seqlock(&tsk->vtime_seqlock);
+	__vtime_account_system(tsk);
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_irq_exit(struct task_struct *tsk)
+{
+	if (!vtime_accounting_enabled())
+		return;
+
+	write_seqlock(&tsk->vtime_seqlock);
+	if (context_tracking_in_user())
+		tsk->vtime_snap_whence = VTIME_USER;
+	__vtime_account_system(tsk);
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_user(struct task_struct *tsk)
+{
+	cputime_t delta_cpu;
+
+	if (!vtime_accounting_enabled())
+		return;
+
+	delta_cpu = get_vtime_delta(tsk);
+
+	write_seqlock(&tsk->vtime_seqlock);
+	tsk->vtime_snap_whence = VTIME_SYS;
+	account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_user_enter(struct task_struct *tsk)
+{
+	if (!vtime_accounting_enabled())
+		return;
+
+	write_seqlock(&tsk->vtime_seqlock);
+	tsk->vtime_snap_whence = VTIME_USER;
+	__vtime_account_system(tsk);
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_enter(struct task_struct *tsk)
+{
+	write_seqlock(&tsk->vtime_seqlock);
+	__vtime_account_system(tsk);
+	current->flags |= PF_VCPU;
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_exit(struct task_struct *tsk)
+{
+	write_seqlock(&tsk->vtime_seqlock);
+	__vtime_account_system(tsk);
+	current->flags &= ~PF_VCPU;
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+	cputime_t delta_cpu = get_vtime_delta(tsk);
+
+	account_idle_time(delta_cpu);
+}
+
+bool vtime_accounting_enabled(void)
+{
+	return context_tracking_active();
+}
+
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+	write_seqlock(&prev->vtime_seqlock);
+	prev->vtime_snap_whence = VTIME_SLEEPING;
+	write_sequnlock(&prev->vtime_seqlock);
+
+	write_seqlock(&current->vtime_seqlock);
+	current->vtime_snap_whence = VTIME_SYS;
+	current->vtime_snap = sched_clock();
+	write_sequnlock(&current->vtime_seqlock);
+}
+
+void vtime_init_idle(struct task_struct *t)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&t->vtime_seqlock, flags);
+	t->vtime_snap_whence = VTIME_SYS;
+	t->vtime_snap = sched_clock();
+	write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+}
+
+cputime_t task_gtime(struct task_struct *t)
+{
+	unsigned int seq;
+	cputime_t gtime;
+
+	do {
+		seq = read_seqbegin(&t->vtime_seqlock);
+
+		gtime = t->gtime;
+		if (t->flags & PF_VCPU)
+			gtime += vtime_delta(t);
+
+	} while (read_seqretry(&t->vtime_seqlock, seq));
+
+	return gtime;
+}
+
+/*
+ * Fetch cputime raw values from fields of task_struct and
+ * add up the pending nohz execution time since the last
+ * cputime snapshot.
+ */
+static void
+fetch_task_cputime(struct task_struct *t,
+		   cputime_t *u_dst, cputime_t *s_dst,
+		   cputime_t *u_src, cputime_t *s_src,
+		   cputime_t *udelta, cputime_t *sdelta)
+{
+	unsigned int seq;
+	unsigned long long delta;
+
+	do {
+		*udelta = 0;
+		*sdelta = 0;
+
+		seq = read_seqbegin(&t->vtime_seqlock);
+
+		if (u_dst)
+			*u_dst = *u_src;
+		if (s_dst)
+			*s_dst = *s_src;
+
+		/* Task is sleeping, nothing to add */
+		if (t->vtime_snap_whence == VTIME_SLEEPING ||
+		    is_idle_task(t))
+			continue;
+
+		delta = vtime_delta(t);
+
+		/*
+		 * Task runs either in user or kernel space, add pending nohz time to
+		 * the right place.
+		 */
+		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
+			*udelta = delta;
+		} else {
+			if (t->vtime_snap_whence == VTIME_SYS)
+				*sdelta = delta;
+		}
+	} while (read_seqretry(&t->vtime_seqlock, seq));
+}
+
+
+void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+{
+	cputime_t udelta, sdelta;
+
+	fetch_task_cputime(t, utime, stime, &t->utime,
+			   &t->stime, &udelta, &sdelta);
+	if (utime)
+		*utime += udelta;
+	if (stime)
+		*stime += sdelta;
+}
+
+void task_cputime_scaled(struct task_struct *t,
+			 cputime_t *utimescaled, cputime_t *stimescaled)
+{
+	cputime_t udelta, sdelta;
+
+	fetch_task_cputime(t, utimescaled, stimescaled,
+			   &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
+	if (utimescaled)
+		*utimescaled += cputime_to_scaled(udelta);
+	if (stimescaled)
+		*stimescaled += cputime_to_scaled(sdelta);
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
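
The new task_gtime() and fetch_task_cputime() readers above snapshot the vtime state under a seqlock retry loop so they never observe a half-updated update from the writer side. A self-contained userspace sketch of that read-retry pattern follows; it is a simplified model using C11 atomics, not the kernel's seqlock implementation, and the struct and function names are illustrative.

/*
 * Writer bumps the sequence to odd, updates the data, bumps it back to
 * even; readers retry until they see a stable, even sequence value.
 */
#include <stdatomic.h>
#include <stdio.h>

struct vtime_sample {
	atomic_uint seq;            /* even = stable, odd = write in progress */
	unsigned long long utime;   /* pretend cputime values */
	unsigned long long stime;
};

static void writer_update(struct vtime_sample *s, unsigned long long u,
			  unsigned long long st)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acquire); /* now odd */
	s->utime = u;
	s->stime = st;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* even again */
}

static void reader_snapshot(struct vtime_sample *s,
			    unsigned long long *u, unsigned long long *st)
{
	unsigned int seq;

	do {
		seq = atomic_load_explicit(&s->seq, memory_order_acquire);
		*u = s->utime;
		*st = s->stime;
	} while ((seq & 1) ||
		 seq != atomic_load_explicit(&s->seq, memory_order_acquire));
}

int main(void)
{
	struct vtime_sample s = { .seq = 0 };
	unsigned long long u, st;

	writer_update(&s, 100, 40);
	reader_snapshot(&s, &u, &st);
	printf("utime=%llu stime=%llu\n", u, st);
	return 0;
}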
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2cd3c1b4e582..75024a673520 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
 	if (autogroup_path(tg, group_path, PATH_MAX))
 		return group_path;
 
-	/*
-	 * May be NULL if the underlying cgroup isn't fully-created yet
-	 */
-	if (!tg->css.cgroup) {
-		group_path[0] = '\0';
-		return group_path;
-	}
 	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
 	return group_path;
 }
@@ -222,8 +215,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->runnable_load_avg);
 	SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
 			cfs_rq->blocked_load_avg);
-	SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
-			atomic64_read(&cfs_rq->tg->load_avg));
+	SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg",
+			(unsigned long long)atomic64_read(&cfs_rq->tg->load_avg));
 	SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
 			cfs_rq->tg_load_contrib);
 	SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
@@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)
 	{
 		unsigned int freq = cpu_khz ? : 1;
 
-		SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+		SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
 			   cpu, freq / 1000, (freq % 1000));
 	}
 #else
-	SEQ_printf(m, "\ncpu#%d\n", cpu);
+	SEQ_printf(m, "cpu#%d\n", cpu);
 #endif
 
 #define P(x) \
@@ -330,6 +323,7 @@ do { \
 	print_rq(m, rq, cpu);
 	rcu_read_unlock();
 	spin_unlock_irqrestore(&sched_debug_lock, flags);
+	SEQ_printf(m, "\n");
 }
 
 static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {
 	"linear"
 };
 
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
 {
 	u64 ktime, sched_clk, cpu_clk;
 	unsigned long flags;
-	int cpu;
 
 	local_irq_save(flags);
 	ktime = ktime_to_ns(ktime_get());
@@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #undef PN
 #undef P
 
-	SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+	SEQ_printf(m, " .%-40s: %d (%s)\n",
+		"sysctl_sched_tunable_scaling",
 		sysctl_sched_tunable_scaling,
 		sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+	SEQ_printf(m, "\n");
+}
 
-	for_each_online_cpu(cpu)
-		print_cpu(m, cpu);
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+	int cpu = (unsigned long)(v - 2);
 
-	SEQ_printf(m, "\n");
+	if (cpu != -1)
+		print_cpu(m, cpu);
+	else
+		sched_debug_header(m);
 
 	return 0;
 }
 
 void sysrq_sched_debug_show(void)
 {
-	sched_debug_show(NULL, NULL);
+	int cpu;
+
+	sched_debug_header(NULL);
+	for_each_online_cpu(cpu)
+		print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+	unsigned long n = *offset;
+
+	if (n == 0)
+		return (void *) 1;
+
+	n--;
+
+	if (n > 0)
+		n = cpumask_next(n - 1, cpu_online_mask);
+	else
+		n = cpumask_first(cpu_online_mask);
+
+	*offset = n + 1;
+
+	if (n < nr_cpu_ids)
+		return (void *)(unsigned long)(n + 2);
+	return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+	(*offset)++;
+	return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+	.start = sched_debug_start,
+	.next = sched_debug_next,
+	.stop = sched_debug_stop,
+	.show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+	seq_release(inode, file);
+
+	return 0;
 }
 
 static int sched_debug_open(struct inode *inode, struct file *filp)
 {
-	return single_open(filp, sched_debug_show, NULL);
+	int ret = 0;
+
+	ret = seq_open(filp, &sched_debug_sops);
+
+	return ret;
 }
 
 static const struct file_operations sched_debug_fops = {
 	.open = sched_debug_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.release = sched_debug_release,
 };
 
 static int __init init_sched_debug_procfs(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eea8707234a..7a33e5986fc5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1680,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	}
 
 	/* ensure we never gain time by being placed backwards. */
-	vruntime = max_vruntime(se->vruntime, vruntime);
-
-	se->vruntime = vruntime;
+	se->vruntime = max_vruntime(se->vruntime, vruntime);
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -2663,7 +2661,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
-static void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq;
 
@@ -3254,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  */
 static int select_idle_sibling(struct task_struct *p, int target)
 {
-	int cpu = smp_processor_id();
-	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
 	struct sched_group *sg;
-	int i;
+	int i = task_cpu(p);
 
-	/*
-	 * If the task is going to be woken-up on this cpu and if it is
-	 * already idle, then it is the right target.
-	 */
-	if (target == cpu && idle_cpu(cpu))
-		return cpu;
+	if (idle_cpu(target))
+		return target;
 
 	/*
-	 * If the task is going to be woken-up on the cpu where it previously
-	 * ran and if it is currently idle, then it the right target.
+	 * If the prevous cpu is cache affine and idle, don't be stupid.
 	 */
-	if (target == prev_cpu && idle_cpu(prev_cpu))
-		return prev_cpu;
+	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+		return i;
 
 	/*
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
@@ -3286,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 				goto next;
 
 			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (!idle_cpu(i))
+				if (i == target || !idle_cpu(i))
 					goto next;
 			}
 
@@ -6101,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
 	 * idle runqueue:
 	 */
 	if (rq->cfs.load.weight)
-		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
 
 	return rr_interval;
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 418feb01344e..127a2c4cf4ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
 
 #include <linux/slab.h>
 
+int sched_rr_timeslice = RR_TIMESLICE;
+
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
 struct rt_bandwidth def_rt_bandwidth;
@@ -566,7 +568,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 static int do_balance_runtime(struct rt_rq *rt_rq)
 {
 	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
 	int i, weight, more = 0;
 	u64 rt_period;
 
@@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)
 		return;
 
 	delta_exec = rq->clock_task - curr->se.exec_start;
-	if (unlikely((s64)delta_exec < 0))
-		delta_exec = 0;
+	if (unlikely((s64)delta_exec <= 0))
+		return;
 
 	schedstat_set(curr->se.statistics.exec_max,
 		      max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
-	    (p->nr_cpus_allowed > 1))
+	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
 		return 1;
 	return 0;
 }
@@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (p->on_rq && !rq->rt.rt_nr_running)
-		pull_rt_task(rq);
+	if (!p->on_rq || rq->rt.rt_nr_running)
+		return;
+
+	if (pull_rt_task(rq))
+		resched_task(rq->curr);
 }
 
 void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 	if (soft != RLIM_INFINITY) {
 		unsigned long next;
 
-		p->rt.timeout++;
+		if (p->rt.watchdog_stamp != jiffies) {
+			p->rt.timeout++;
+			p->rt.watchdog_stamp = jiffies;
+		}
+
 		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
 		if (p->rt.timeout > next)
 			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
@@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	if (--p->rt.time_slice)
 		return;
 
-	p->rt.time_slice = RR_TIMESLICE;
+	p->rt.time_slice = sched_rr_timeslice;
 
 	/*
 	 * Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 	 * Time slice is 0 for SCHED_FIFO tasks
 	 */
 	if (task->policy == SCHED_RR)
-		return RR_TIMESLICE;
+		return sched_rr_timeslice;
 	else
 		return 0;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc886441436a..cc03cfdf469f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
 
 #include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e8872..e036eda1a9c9 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
 	if (mask_str == NULL)
 		return -ENOMEM;
 
-	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
-	seq_printf(seq, "timestamp %lu\n", jiffies);
-	for_each_online_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
+	if (v == (void *)1) {
+		seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+		seq_printf(seq, "timestamp %lu\n", jiffies);
+	} else {
+		struct rq *rq;
 #ifdef CONFIG_SMP
 		struct sched_domain *sd;
 		int dcount = 0;
 #endif
+		cpu = (unsigned long)(v - 2);
+		rq = cpu_rq(cpu);
 
 		/* runqueue-specific stats */
 		seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int schedstat_open(struct inode *inode, struct file *file)
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
 {
-	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
-	char *buf = kmalloc(size, GFP_KERNEL);
-	struct seq_file *m;
-	int res;
+	unsigned long n = *offset;
 
-	if (!buf)
-		return -ENOMEM;
-	res = single_open(file, show_schedstat, NULL);
-	if (!res) {
-		m = file->private_data;
-		m->buf = buf;
-		m->size = size;
-	} else
-		kfree(buf);
-	return res;
+	if (n == 0)
+		return (void *) 1;
+
+	n--;
+
+	if (n > 0)
+		n = cpumask_next(n - 1, cpu_online_mask);
+	else
+		n = cpumask_first(cpu_online_mask);
+
+	*offset = n + 1;
+
+	if (n < nr_cpu_ids)
+		return (void *)(unsigned long)(n + 2);
+	return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+	(*offset)++;
+	return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+	.start = schedstat_start,
+	.next = schedstat_next,
+	.stop = schedstat_stop,
+	.show = show_schedstat,
+};
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &schedstat_sops);
 }
 
+static int schedstat_release(struct inode *inode, struct file *file)
+{
+	return 0;
+};
+
 static const struct file_operations proc_schedstat_operations = {
 	.open = schedstat_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.release = schedstat_release,
 };
 
 static int __init proc_schedstat_init(void)
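
Both new iterators (sched_debug_sops and schedstat_sops) encode the seq_file position the same way: offset 0 yields the header token 1, and an online cpu c is returned as the token c + 2 while the offset is left at c + 1, so the next call resumes after c even when cpus are hotplugged out. A self-contained userspace model of that encoding follows; the online-cpu mask, NR_CPU_IDS and the helper names are assumptions made for the example.

#include <stdio.h>

#define NR_CPU_IDS 8
/* Assumed hotplug state for the example: cpu 2 is offline. */
static const int cpu_online[NR_CPU_IDS] = { 1, 1, 0, 1, 0, 0, 0, 0 };

static unsigned long cpumask_first_example(void)
{
	unsigned long c;

	for (c = 0; c < NR_CPU_IDS; c++)
		if (cpu_online[c])
			return c;
	return NR_CPU_IDS;
}

static unsigned long cpumask_next_example(unsigned long prev)
{
	unsigned long c;

	for (c = prev + 1; c < NR_CPU_IDS; c++)
		if (cpu_online[c])
			return c;
	return NR_CPU_IDS;
}

/* Same position encoding as schedstat_start()/sched_debug_start(). */
static unsigned long iter_start(unsigned long *offset)
{
	unsigned long n = *offset;

	if (n == 0)
		return 1;			/* header token */

	n--;
	if (n > 0)
		n = cpumask_next_example(n - 1);
	else
		n = cpumask_first_example();

	*offset = n + 1;
	if (n < NR_CPU_IDS)
		return n + 2;			/* cpu n encoded as n + 2 */
	return 0;				/* NULL: iteration done */
}

int main(void)
{
	unsigned long pos = 0, v;

	while ((v = iter_start(&pos)) != 0) {
		if (v == 1)
			puts("header");
		else
			printf("cpu %lu\n", v - 2);
		pos++;				/* what seq_file's ->next does */
	}
	return 0;				/* prints: header, cpu 0, cpu 1, cpu 3 */
}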