Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  111
1 file changed, 59 insertions, 52 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f230596bd0c..57c933ffbee1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
 #include <linux/kthread.h>
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
@@ -71,6 +72,7 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
+#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -226,9 +228,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 
         now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
         hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-        hrtimer_start(&rt_b->rt_period_timer,
-                      rt_b->rt_period_timer.expires,
-                      HRTIMER_MODE_ABS);
+        hrtimer_start_expires(&rt_b->rt_period_timer,
+                      HRTIMER_MODE_ABS);
     }
     spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -385,7 +386,6 @@ struct cfs_rq {
 
     u64 exec_clock;
     u64 min_vruntime;
-    u64 pair_start;
 
     struct rb_root tasks_timeline;
     struct rb_node *rb_leftmost;
@@ -397,7 +397,7 @@ struct cfs_rq {
      * 'curr' points to currently running entity on this cfs_rq.
      * It is set to NULL otherwise (i.e when none are currently running).
      */
-    struct sched_entity *curr, *next;
+    struct sched_entity *curr, *next, *last;
 
     unsigned long nr_spread_over;
 
@@ -818,6 +818,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 unsigned int sysctl_sched_shares_ratelimit = 250000;
 
 /*
+ * Inject some fuzzyness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
@@ -1063,7 +1070,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
     struct hrtimer *timer = &rq->hrtick_timer;
     ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 
-    timer->expires = time;
+    hrtimer_set_expires(timer, time);
 
     if (rq == this_rq()) {
         hrtimer_restart(timer);
@@ -1453,8 +1460,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
-            unsigned long sd_shares, unsigned long sd_rq_weight)
+update_group_shares_cpu(struct task_group *tg, int cpu,
+            unsigned long sd_shares, unsigned long sd_rq_weight)
 {
     int boost = 0;
     unsigned long shares;
@@ -1485,19 +1492,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
      *
      */
     shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+    shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
-    /*
-     * record the actual number of shares, not the boosted amount.
-     */
-    tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-    tg->cfs_rq[cpu]->rq_weight = rq_weight;
+    if (abs(shares - tg->se[cpu]->load.weight) >
+            sysctl_sched_shares_thresh) {
+        struct rq *rq = cpu_rq(cpu);
+        unsigned long flags;
 
-    if (shares < MIN_SHARES)
-        shares = MIN_SHARES;
-    else if (shares > MAX_SHARES)
-        shares = MAX_SHARES;
+        spin_lock_irqsave(&rq->lock, flags);
+        /*
+         * record the actual number of shares, not the boosted amount.
+         */
+        tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+        tg->cfs_rq[cpu]->rq_weight = rq_weight;
 
-    __set_se_shares(tg->se[cpu], shares);
+        __set_se_shares(tg->se[cpu], shares);
+        spin_unlock_irqrestore(&rq->lock, flags);
+    }
 }
 
 /*
@@ -1526,14 +1537,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
     if (!rq_weight)
         rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
 
-    for_each_cpu_mask(i, sd->span) {
-        struct rq *rq = cpu_rq(i);
-        unsigned long flags;
-
-        spin_lock_irqsave(&rq->lock, flags);
-        __update_group_shares_cpu(tg, i, shares, rq_weight);
-        spin_unlock_irqrestore(&rq->lock, flags);
-    }
+    for_each_cpu_mask(i, sd->span)
+        update_group_shares_cpu(tg, i, shares, rq_weight);
 
     return 0;
 }
@@ -1800,7 +1805,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
     /*
      * Buddy candidates are cache hot:
      */
-    if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+    if (sched_feat(CACHE_HOT_BUDDY) &&
+            (&p->se == cfs_rq_of(&p->se)->next ||
+             &p->se == cfs_rq_of(&p->se)->last))
         return 1;
 
     if (p->sched_class != &fair_sched_class)
@@ -1936,6 +1943,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
      * just go back and repeat.
      */
     rq = task_rq_lock(p, &flags);
+    trace_sched_wait_task(rq, p);
     running = task_running(rq, p);
     on_rq = p->se.on_rq;
     ncsw = 0;
@@ -2297,9 +2305,7 @@ out_activate:
     success = 1;
 
 out_running:
-    trace_mark(kernel_sched_wakeup,
-        "pid %d state %ld ## rq %p task %p rq->curr %p",
-        p->pid, p->state, rq, p, rq->curr);
+    trace_sched_wakeup(rq, p);
     check_preempt_curr(rq, p, sync);
 
     p->state = TASK_RUNNING;
@@ -2432,9 +2438,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
         p->sched_class->task_new(rq, p);
         inc_nr_running(rq);
     }
-    trace_mark(kernel_sched_wakeup_new,
-        "pid %d state %ld ## rq %p task %p rq->curr %p",
-        p->pid, p->state, rq, p, rq->curr);
+    trace_sched_wakeup_new(rq, p);
     check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
     if (p->sched_class->task_wake_up)
@@ -2607,11 +2611,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
     struct mm_struct *mm, *oldmm;
 
     prepare_task_switch(rq, prev, next);
-    trace_mark(kernel_sched_schedule,
-        "prev_pid %d next_pid %d prev_state %ld "
-        "## rq %p prev %p next %p",
-        prev->pid, next->pid, prev->state,
-        rq, prev, next);
+    trace_sched_switch(rq, prev, next);
     mm = next->mm;
     oldmm = prev->active_mm;
     /*
@@ -2851,6 +2851,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
         || unlikely(!cpu_active(dest_cpu)))
         goto out;
 
+    trace_sched_migrate_task(rq, p, dest_cpu);
     /* force the process onto the specified CPU */
     if (migrate_task(p, dest_cpu, &req)) {
         /* Need to wait for migration thread (might exit: take ref). */
@@ -3344,7 +3345,7 @@ small_imbalance:
     } else
         this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-    if (max_load - this_load + 2*busiest_load_per_task >=
+    if (max_load - this_load + busiest_load_per_task >=
             busiest_load_per_task * imbn) {
         *imbalance = busiest_load_per_task;
         return busiest;
@@ -4052,23 +4053,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
  */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
 {
     unsigned long flags;
-    u64 ns, delta_exec;
     struct rq *rq;
+    u64 ns = 0;
 
     rq = task_rq_lock(p, &flags);
-    ns = p->se.sum_exec_runtime;
+
     if (task_current(rq, p)) {
+        u64 delta_exec;
+
         update_rq_clock(rq);
         delta_exec = rq->clock - p->se.exec_start;
         if ((s64)delta_exec > 0)
-            ns += delta_exec;
+            ns = delta_exec;
     }
+
     task_rq_unlock(rq, &flags);
 
     return ns;
@@ -4085,6 +4089,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
     cputime64_t tmp;
 
     p->utime = cputime_add(p->utime, cputime);
+    account_group_user_time(p, cputime);
 
     /* Add user time to cpustat. */
     tmp = cputime_to_cputime64(cputime);
@@ -4109,6 +4114,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
     tmp = cputime_to_cputime64(cputime);
 
     p->utime = cputime_add(p->utime, cputime);
+    account_group_user_time(p, cputime);
     p->gtime = cputime_add(p->gtime, cputime);
 
     cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4144,6 +4150,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
     }
 
     p->stime = cputime_add(p->stime, cputime);
+    account_group_system_time(p, cputime);
 
     /* Add system time to cpustat. */
     tmp = cputime_to_cputime64(cputime);
@@ -4185,6 +4192,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
     if (p == rq->idle) {
         p->stime = cputime_add(p->stime, steal);
+        account_group_system_time(p, steal);
         if (atomic_read(&rq->nr_iowait) > 0)
             cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
         else
@@ -4441,12 +4449,8 @@ need_resched_nonpreemptible:
     if (sched_feat(HRTICK))
         hrtick_clear(rq);
 
-    /*
-     * Do the rq-clock update outside the rq lock:
-     */
-    local_irq_disable();
+    spin_lock_irq(&rq->lock);
     update_rq_clock(rq);
-    spin_lock(&rq->lock);
     clear_tsk_need_resched(prev);
 
     if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -6873,15 +6877,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
     struct sched_domain *tmp;
 
     /* Remove the sched domains which do not contribute to scheduling. */
-    for (tmp = sd; tmp; tmp = tmp->parent) {
+    for (tmp = sd; tmp; ) {
         struct sched_domain *parent = tmp->parent;
         if (!parent)
             break;
+
         if (sd_parent_degenerate(tmp, parent)) {
             tmp->parent = parent->parent;
             if (parent->parent)
                 parent->parent->child = tmp;
-        }
+        } else
+            tmp = tmp->parent;
     }
 
     if (sd && sd_degenerate(sd)) {
@@ -7670,6 +7676,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 error:
     free_sched_groups(cpu_map, tmpmask);
     SCHED_CPUMASK_FREE((void *)allmasks);
+    kfree(rd);
     return -ENOMEM;
 #endif
 }