Diffstat (limited to 'kernel/sched.c')
-rw-r--r--   kernel/sched.c   111
1 files changed, 59 insertions, 52 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 6f230596bd0c..57c933ffbee1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
 #include <linux/cpuset.h>
 #include <linux/percpu.h>
 #include <linux/kthread.h>
+#include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
@@ -71,6 +72,7 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
+#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
@@ -226,9 +228,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 
                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-               hrtimer_start(&rt_b->rt_period_timer,
-                             rt_b->rt_period_timer.expires,
-                             HRTIMER_MODE_ABS);
+               hrtimer_start_expires(&rt_b->rt_period_timer,
+                               HRTIMER_MODE_ABS);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -385,7 +386,6 @@ struct cfs_rq {
 
        u64 exec_clock;
        u64 min_vruntime;
-       u64 pair_start;
 
        struct rb_root tasks_timeline;
        struct rb_node *rb_leftmost;
@@ -397,7 +397,7 @@ struct cfs_rq {
         * 'curr' points to currently running entity on this cfs_rq.
         * It is set to NULL otherwise (i.e when none are currently running).
         */
-       struct sched_entity *curr, *next;
+       struct sched_entity *curr, *next, *last;
 
        unsigned long nr_spread_over;
 
@@ -818,6 +818,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
 unsigned int sysctl_sched_shares_ratelimit = 250000;
 
 /*
+ * Inject some fuzzyness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
+/*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
@@ -1063,7 +1070,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
        struct hrtimer *timer = &rq->hrtick_timer;
        ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
 
-       timer->expires = time;
+       hrtimer_set_expires(timer, time);
 
        if (rq == this_rq()) {
                hrtimer_restart(timer);
@@ -1453,8 +1460,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
-                         unsigned long sd_shares, unsigned long sd_rq_weight)
+update_group_shares_cpu(struct task_group *tg, int cpu,
+                       unsigned long sd_shares, unsigned long sd_rq_weight)
 {
        int boost = 0;
        unsigned long shares;
@@ -1485,19 +1492,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
         *
         */
        shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
-       /*
-        * record the actual number of shares, not the boosted amount.
-        */
-       tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-       tg->cfs_rq[cpu]->rq_weight = rq_weight;
+       if (abs(shares - tg->se[cpu]->load.weight) >
+                       sysctl_sched_shares_thresh) {
+               struct rq *rq = cpu_rq(cpu);
+               unsigned long flags;
 
-       if (shares < MIN_SHARES)
-               shares = MIN_SHARES;
-       else if (shares > MAX_SHARES)
-               shares = MAX_SHARES;
+               spin_lock_irqsave(&rq->lock, flags);
+               /*
+                * record the actual number of shares, not the boosted amount.
+                */
+               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+               tg->cfs_rq[cpu]->rq_weight = rq_weight;
 
-       __set_se_shares(tg->se[cpu], shares);
+               __set_se_shares(tg->se[cpu], shares);
+               spin_unlock_irqrestore(&rq->lock, flags);
+       }
 }
 
 /*
@@ -1526,14 +1537,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
        if (!rq_weight)
                rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
 
-       for_each_cpu_mask(i, sd->span) {
-               struct rq *rq = cpu_rq(i);
-               unsigned long flags;
-
-               spin_lock_irqsave(&rq->lock, flags);
-               __update_group_shares_cpu(tg, i, shares, rq_weight);
-               spin_unlock_irqrestore(&rq->lock, flags);
-       }
+       for_each_cpu_mask(i, sd->span)
+               update_group_shares_cpu(tg, i, shares, rq_weight);
 
        return 0;
 }
@@ -1800,7 +1805,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        /*
         * Buddy candidates are cache hot:
         */
-       if (sched_feat(CACHE_HOT_BUDDY) && (&p->se == cfs_rq_of(&p->se)->next))
+       if (sched_feat(CACHE_HOT_BUDDY) &&
+                       (&p->se == cfs_rq_of(&p->se)->next ||
+                        &p->se == cfs_rq_of(&p->se)->last))
                return 1;
 
        if (p->sched_class != &fair_sched_class)
@@ -1936,6 +1943,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                 * just go back and repeat.
                 */
                rq = task_rq_lock(p, &flags);
+               trace_sched_wait_task(rq, p);
                running = task_running(rq, p);
                on_rq = p->se.on_rq;
                ncsw = 0;
@@ -2297,9 +2305,7 @@ out_activate:
        success = 1;
 
 out_running:
-       trace_mark(kernel_sched_wakeup,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
+       trace_sched_wakeup(rq, p);
        check_preempt_curr(rq, p, sync);
 
        p->state = TASK_RUNNING;
@@ -2432,9 +2438,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                p->sched_class->task_new(rq, p);
                inc_nr_running(rq);
        }
-       trace_mark(kernel_sched_wakeup_new,
-               "pid %d state %ld ## rq %p task %p rq->curr %p",
-               p->pid, p->state, rq, p, rq->curr);
+       trace_sched_wakeup_new(rq, p);
        check_preempt_curr(rq, p, 0);
 #ifdef CONFIG_SMP
        if (p->sched_class->task_wake_up)
@@ -2607,11 +2611,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
        struct mm_struct *mm, *oldmm;
 
        prepare_task_switch(rq, prev, next);
-       trace_mark(kernel_sched_schedule,
-               "prev_pid %d next_pid %d prev_state %ld "
-               "## rq %p prev %p next %p",
-               prev->pid, next->pid, prev->state,
-               rq, prev, next);
+       trace_sched_switch(rq, prev, next);
        mm = next->mm;
        oldmm = prev->active_mm;
        /*
@@ -2851,6 +2851,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
            || unlikely(!cpu_active(dest_cpu)))
                goto out;
 
+       trace_sched_migrate_task(rq, p, dest_cpu);
        /* force the process onto the specified CPU */
        if (migrate_task(p, dest_cpu, &req)) {
                /* Need to wait for migration thread (might exit: take ref). */
@@ -3344,7 +3345,7 @@ small_imbalance:
        } else
                this_load_per_task = cpu_avg_load_per_task(this_cpu);
 
-       if (max_load - this_load + 2*busiest_load_per_task >=
+       if (max_load - this_load + busiest_load_per_task >=
                        busiest_load_per_task * imbn) {
                *imbalance = busiest_load_per_task;
                return busiest;
@@ -4052,23 +4053,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return p->sum_exec_runtime plus any more ns on the sched_clock
- * that have not yet been banked in case the task is currently running.
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
  */
-unsigned long long task_sched_runtime(struct task_struct *p)
+unsigned long long task_delta_exec(struct task_struct *p)
 {
        unsigned long flags;
-       u64 ns, delta_exec;
        struct rq *rq;
+       u64 ns = 0;
 
        rq = task_rq_lock(p, &flags);
-       ns = p->se.sum_exec_runtime;
+
        if (task_current(rq, p)) {
+               u64 delta_exec;
+
                update_rq_clock(rq);
                delta_exec = rq->clock - p->se.exec_start;
                if ((s64)delta_exec > 0)
-                       ns += delta_exec;
+                       ns = delta_exec;
        }
+
        task_rq_unlock(rq, &flags);
 
        return ns;
@@ -4085,6 +4089,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
        cputime64_t tmp;
 
        p->utime = cputime_add(p->utime, cputime);
+       account_group_user_time(p, cputime);
 
        /* Add user time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
@@ -4109,6 +4114,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
        tmp = cputime_to_cputime64(cputime);
 
        p->utime = cputime_add(p->utime, cputime);
+       account_group_user_time(p, cputime);
        p->gtime = cputime_add(p->gtime, cputime);
 
        cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4144,6 +4150,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
        }
 
        p->stime = cputime_add(p->stime, cputime);
+       account_group_system_time(p, cputime);
 
        /* Add system time to cpustat. */
        tmp = cputime_to_cputime64(cputime);
@@ -4185,6 +4192,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
 
        if (p == rq->idle) {
                p->stime = cputime_add(p->stime, steal);
+               account_group_system_time(p, steal);
                if (atomic_read(&rq->nr_iowait) > 0)
                        cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
                else
@@ -4441,12 +4449,8 @@ need_resched_nonpreemptible:
        if (sched_feat(HRTICK))
                hrtick_clear(rq);
 
-       /*
-        * Do the rq-clock update outside the rq lock:
-        */
-       local_irq_disable();
+       spin_lock_irq(&rq->lock);
        update_rq_clock(rq);
-       spin_lock(&rq->lock);
        clear_tsk_need_resched(prev);
 
        if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -6873,15 +6877,17 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
        struct sched_domain *tmp;
 
        /* Remove the sched domains which do not contribute to scheduling. */
-       for (tmp = sd; tmp; tmp = tmp->parent) {
+       for (tmp = sd; tmp; ) {
                struct sched_domain *parent = tmp->parent;
                if (!parent)
                        break;
+
                if (sd_parent_degenerate(tmp, parent)) {
                        tmp->parent = parent->parent;
                        if (parent->parent)
                                parent->parent->child = tmp;
-               }
+               } else
+                       tmp = tmp->parent;
        }
 
        if (sd && sd_degenerate(sd)) {
@@ -7670,6 +7676,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 error:
        free_sched_groups(cpu_map, tmpmask);
        SCHED_CPUMASK_FREE((void *)allmasks);
+       kfree(rd);
        return -ENOMEM;
 #endif
 }