author    Linus Torvalds <torvalds@linux-foundation.org>  2015-02-09 19:06:06 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2015-02-09 19:06:06 -0500
commit    5b9b28a63f2e47dac5ff3a2503bfe3ade8796aa0
tree      3d6e42aa380f53c45ed60779960b420d40169256 /kernel
parent    a4cbbf549a9be10b7583c44249efccd64839533d
parent    139b6fd26d85a65c4e0d2795b87b94f9505e5943
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main scheduler changes in this cycle were:

   - various sched/deadline fixes and enhancements

   - rescheduling latency fixes/cleanups

   - rework the rq->clock code to be more consistent and more robust

   - minor micro-optimizations

   - ->avg.decay_count fixes

   - add a stack overflow check to might_sleep()

   - idle-poll handler fix, possibly resulting in power savings

   - misc smaller updates and fixes"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/Documentation: Remove unneeded word
  sched/wait: Introduce wait_on_bit_timeout()
  sched: Pull resched loop to __schedule() callers
  sched/deadline: Remove cpu_active_mask from cpudl_find()
  sched: Fix hrtick_start() on UP
  sched/deadline: Avoid pointless __setscheduler()
  sched/deadline: Fix stale yield state
  sched/deadline: Fix hrtick for a non-leftmost task
  sched/deadline: Modify cpudl::free_cpus to reflect rd->online
  sched/idle: Add missing checks to the exit condition of cpu_idle_poll()
  sched: Fix missing preemption opportunity
  sched/rt: Reduce rq lock contention by eliminating locking of non-feasible target
  sched/debug: Print rq->clock_task
  sched/core: Rework rq->clock update skips
  sched/core: Validate rq_clock*() serialization
  sched/core: Remove check of p->sched_class
  sched/fair: Fix sched_entity::avg::decay_count initialization
  sched/debug: Fix potential call to __ffs(0) in sched_show_task()
  sched/debug: Check for stack overflow in ___might_sleep()
  sched/fair: Fix the dealing with decay_count in __synchronize_entity_decay()
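The headline rework in this pull replaces the old signed rq->skip_clock_update flag with a two-bit request/act scheme: rq_clock_skip_update() records a request (RQCF_REQ_SKIP), __schedule() promotes it to RQCF_ACT_SKIP on entry, and update_rq_clock() only honours the active bit. Below is a stand-alone sketch of that state machine, condensed from the kernel/sched/core.c and kernel/sched/sched.h hunks further down; apart from the RQCF_* bits, the field name and rq_clock_skip_update() itself, everything here (promote_skip(), clock_update_skipped(), finish_pick(), main()) is an invented illustration, not kernel code.

    /* Stand-alone model of the clock_skip_update scheme added below; the real
     * struct rq and its locking are stripped out. */
    #include <stdbool.h>
    #include <stdio.h>

    #define RQCF_REQ_SKIP 0x01  /* someone asked to skip the next clock update */
    #define RQCF_ACT_SKIP 0x02  /* the skip is in effect for this __schedule()  */

    struct rq { unsigned int clock_skip_update; };

    /* mirrors rq_clock_skip_update(rq, skip) from the kernel/sched/sched.h hunk */
    static void rq_clock_skip_update(struct rq *rq, bool skip)
    {
        if (skip)
            rq->clock_skip_update |= RQCF_REQ_SKIP;
        else
            rq->clock_skip_update &= ~RQCF_REQ_SKIP;
    }

    /* __schedule() entry: promote REQ to ACT, as in the core.c hunk below */
    static void promote_skip(struct rq *rq)
    {
        rq->clock_skip_update <<= 1;    /* REQ (0x01) becomes ACT (0x02) */
    }

    /* update_rq_clock() only returns early while the skip is *active* */
    static bool clock_update_skipped(struct rq *rq)
    {
        return rq->clock_skip_update & RQCF_ACT_SKIP;
    }

    /* __schedule() drops both bits once the next task has been picked */
    static void finish_pick(struct rq *rq)
    {
        rq->clock_skip_update = 0;
    }

    int main(void)
    {
        struct rq rq = { 0 };

        rq_clock_skip_update(&rq, true);   /* e.g. from check_preempt_curr() */
        printf("before __schedule(): skipped=%d\n", clock_update_skipped(&rq)); /* 0 */
        promote_skip(&rq);
        printf("inside __schedule(): skipped=%d\n", clock_update_skipped(&rq)); /* 1 */
        finish_pick(&rq);
        return 0;
    }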
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/locking/mutex.c     |   2
-rw-r--r--  kernel/sched/core.c        | 107
-rw-r--r--  kernel/sched/cpudeadline.c |  27
-rw-r--r--  kernel/sched/cpudeadline.h |   2
-rw-r--r--  kernel/sched/deadline.c    |  51
-rw-r--r--  kernel/sched/debug.c       |   1
-rw-r--r--  kernel/sched/fair.c        |   7
-rw-r--r--  kernel/sched/idle.c        |   3
-rw-r--r--  kernel/sched/rt.c          |  26
-rw-r--r--  kernel/sched/sched.h       |  22
10 files changed, 171 insertions, 77 deletions
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 57407062e209..94674e5919cb 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
  * The mutex must later on be released by the same task that
  * acquired it. Recursive locking is not allowed. The task
  * may not exit without first unlocking the mutex. Also, kernel
- * memory where the mutex resides mutex must not be freed with
+ * memory where the mutex resides must not be freed with
  * the mutex still locked. The mutex must first be initialized
  * (or statically defined) before it can be locked. memset()-ing
  * the mutex to 0 is not allowed.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1612578a5b7a..1f37fe7f77a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
 {
         s64 delta;
 
-        if (rq->skip_clock_update > 0)
+        lockdep_assert_held(&rq->lock);
+
+        if (rq->clock_skip_update & RQCF_ACT_SKIP)
                 return;
 
         delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -490,6 +492,11 @@ static __init void init_hrtick(void)
  */
 void hrtick_start(struct rq *rq, u64 delay)
 {
+        /*
+         * Don't schedule slices shorter than 10000ns, that just
+         * doesn't make sense. Rely on vruntime for fairness.
+         */
+        delay = max_t(u64, delay, 10000LL);
         __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
                         HRTIMER_MODE_REL_PINNED, 0);
 }
@@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
          * this case, we can save a useless back to back clock update.
          */
         if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
-                rq->skip_clock_update = 1;
+                rq_clock_skip_update(rq, true);
 }
 
 #ifdef CONFIG_SMP
@@ -1836,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         p->se.prev_sum_exec_runtime = 0;
         p->se.nr_migrations = 0;
         p->se.vruntime = 0;
+#ifdef CONFIG_SMP
+        p->se.avg.decay_count = 0;
+#endif
         INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
@@ -2755,6 +2765,10 @@ again:
  * - explicit schedule() call
  * - return from syscall or exception to user-space
  * - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
  */
 static void __sched __schedule(void)
 {
@@ -2763,7 +2777,6 @@ static void __sched __schedule(void)
         struct rq *rq;
         int cpu;
 
-need_resched:
         preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
@@ -2783,6 +2796,8 @@ need_resched:
         smp_mb__before_spinlock();
         raw_spin_lock_irq(&rq->lock);
 
+        rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
         switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                 if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2807,13 +2822,13 @@ need_resched:
                 switch_count = &prev->nvcsw;
         }
 
-        if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
+        if (task_on_rq_queued(prev))
                 update_rq_clock(rq);
 
         next = pick_next_task(rq, prev);
         clear_tsk_need_resched(prev);
         clear_preempt_need_resched();
-        rq->skip_clock_update = 0;
+        rq->clock_skip_update = 0;
 
         if (likely(prev != next)) {
                 rq->nr_switches++;
@@ -2828,8 +2843,6 @@ need_resched:
         post_schedule(rq);
 
         sched_preempt_enable_no_resched();
-        if (need_resched())
-                goto need_resched;
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -2849,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void)
         struct task_struct *tsk = current;
 
         sched_submit_work(tsk);
-        __schedule();
+        do {
+                __schedule();
+        } while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
 
@@ -2884,6 +2899,21 @@ void __sched schedule_preempt_disabled(void)
         preempt_disable();
 }
 
+static void preempt_schedule_common(void)
+{
+        do {
+                __preempt_count_add(PREEMPT_ACTIVE);
+                __schedule();
+                __preempt_count_sub(PREEMPT_ACTIVE);
+
+                /*
+                 * Check again in case we missed a preemption opportunity
+                 * between schedule and now.
+                 */
+                barrier();
+        } while (need_resched());
+}
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
@@ -2899,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
         if (likely(!preemptible()))
                 return;
 
-        do {
-                __preempt_count_add(PREEMPT_ACTIVE);
-                __schedule();
-                __preempt_count_sub(PREEMPT_ACTIVE);
-
-                /*
-                 * Check again in case we missed a preemption opportunity
-                 * between schedule and now.
-                 */
-                barrier();
-        } while (need_resched());
+        preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
@@ -3405,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p)
         return match;
 }
 
+static bool dl_param_changed(struct task_struct *p,
+                const struct sched_attr *attr)
+{
+        struct sched_dl_entity *dl_se = &p->dl;
+
+        if (dl_se->dl_runtime != attr->sched_runtime ||
+            dl_se->dl_deadline != attr->sched_deadline ||
+            dl_se->dl_period != attr->sched_period ||
+            dl_se->flags != attr->sched_flags)
+                return true;
+
+        return false;
+}
+
 static int __sched_setscheduler(struct task_struct *p,
                                 const struct sched_attr *attr,
                                 bool user)
@@ -3533,7 +3567,7 @@ recheck:
                         goto change;
                 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
                         goto change;
-                if (dl_policy(policy))
+                if (dl_policy(policy) && dl_param_changed(p, attr))
                         goto change;
 
                 p->sched_reset_on_fork = reset_on_fork;
@@ -4225,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield)
         return 0;
 }
 
-static void __cond_resched(void)
-{
-        __preempt_count_add(PREEMPT_ACTIVE);
-        __schedule();
-        __preempt_count_sub(PREEMPT_ACTIVE);
-}
-
 int __sched _cond_resched(void)
 {
         if (should_resched()) {
-                __cond_resched();
+                preempt_schedule_common();
                 return 1;
         }
         return 0;
@@ -4260,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock)
         if (spin_needbreak(lock) || resched) {
                 spin_unlock(lock);
                 if (resched)
-                        __cond_resched();
+                        preempt_schedule_common();
                 else
                         cpu_relax();
                 ret = 1;
@@ -4276,7 +4303,7 @@ int __sched __cond_resched_softirq(void)
 
         if (should_resched()) {
                 local_bh_enable();
-                __cond_resched();
+                preempt_schedule_common();
                 local_bh_disable();
                 return 1;
         }
@@ -4531,9 +4558,10 @@ void sched_show_task(struct task_struct *p)
 {
         unsigned long free = 0;
         int ppid;
-        unsigned state;
+        unsigned long state = p->state;
 
-        state = p->state ? __ffs(p->state) + 1 : 0;
+        if (state)
+                state = __ffs(state) + 1;
         printk(KERN_INFO "%-15.15s %c", p->comm,
                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if BITS_PER_LONG == 32
@@ -4766,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
-        if (p->sched_class && p->sched_class->set_cpus_allowed)
+        if (p->sched_class->set_cpus_allowed)
                 p->sched_class->set_cpus_allowed(p, new_mask);
 
         cpumask_copy(&p->cpus_allowed, new_mask);
@@ -7276,6 +7304,11 @@ void __init sched_init(void)
         enter_lazy_tlb(&init_mm, current);
 
         /*
+         * During early bootup we pretend to be a normal task:
+         */
+        current->sched_class = &fair_sched_class;
+
+        /*
          * Make us the idle thread. Technically, schedule() should not be
          * called from this thread, however somewhere below it might be,
          * but because we are the idle thread, we just pick up running again
@@ -7285,11 +7318,6 @@ void __init sched_init(void)
 
         calc_load_update = jiffies + LOAD_FREQ;
 
-        /*
-         * During early bootup we pretend to be a normal task:
-         */
-        current->sched_class = &fair_sched_class;
-
 #ifdef CONFIG_SMP
         zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
         /* May be allocated at isolcpus cmdline parse time */
@@ -7350,6 +7378,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
                         in_atomic(), irqs_disabled(),
                         current->pid, current->comm);
 
+        if (task_stack_end_corrupted(current))
+                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
         debug_show_held_locks(current);
         if (irqs_disabled())
                 print_irqtrace_events(current);
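A side note on the dl_param_changed() hunk above ('sched/deadline: Avoid pointless __setscheduler()'): __sched_setscheduler() now takes its early "nothing changed" exit when a SCHED_DEADLINE task is handed the same runtime/deadline/period/flags again, instead of a full dequeue/__setscheduler()/enqueue cycle. A minimal user-space sketch of the call that benefits follows; the hand-rolled struct sched_attr and raw __NR_sched_setattr syscall are the usual idiom (assuming your libc headers define __NR_sched_setattr), the numbers are arbitrary examples, and the call needs CAP_SYS_NICE/root to succeed.

    /* Sketch: set SCHED_DEADLINE parameters twice; after this merge the second,
     * identical call short-circuits inside __sched_setscheduler(). */
    #define _GNU_SOURCE
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #ifndef SCHED_DEADLINE
    #define SCHED_DEADLINE 6
    #endif

    struct sched_attr {             /* layout as in include/uapi/linux/sched.h */
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;        /* SCHED_NORMAL/SCHED_BATCH */
        uint32_t sched_priority;    /* SCHED_FIFO/SCHED_RR */
        uint64_t sched_runtime;     /* SCHED_DEADLINE, nanoseconds */
        uint64_t sched_deadline;
        uint64_t sched_period;
    };

    static long dl_setattr(pid_t pid, const struct sched_attr *attr)
    {
        return syscall(__NR_sched_setattr, pid, attr, 0);
    }

    int main(void)
    {
        struct sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size           = sizeof(attr);
        attr.sched_policy   = SCHED_DEADLINE;
        attr.sched_runtime  =  10 * 1000 * 1000;   /* 10 ms of runtime ... */
        attr.sched_deadline = 100 * 1000 * 1000;   /* ... due within 100 ms ... */
        attr.sched_period   = 100 * 1000 * 1000;   /* ... every 100 ms */

        if (dl_setattr(0, &attr))
            perror("sched_setattr (first)");

        /* Same parameters again: dl_param_changed() reports no change and the
         * kernel skips the dequeue/__setscheduler()/enqueue round trip. */
        if (dl_setattr(0, &attr))
            perror("sched_setattr (second)");

        return 0;
    }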
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3ce071b..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
         int best_cpu = -1;
         const struct sched_dl_entity *dl_se = &p->dl;
 
-        if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
+        if (later_mask &&
+            cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
                 best_cpu = cpumask_any(later_mask);
                 goto out;
         } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -186,6 +187,26 @@ out:
 }
 
 /*
+ * cpudl_set_freecpu - Set the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_set_freecpu(struct cpudl *cp, int cpu)
+{
+        cpumask_set_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_clear_freecpu - Clear the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
+{
+        cpumask_clear_cpu(cpu, cp->free_cpus);
+}
+
+/*
  * cpudl_init - initialize the cpudl structure
  * @cp: the cpudl max-heap context
  */
@@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
         if (!cp->elements)
                 return -ENOMEM;
 
-        if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+        if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
                 kfree(cp->elements);
                 return -ENOMEM;
         }
@@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
         for_each_possible_cpu(i)
                 cp->elements[i].idx = IDX_INVALID;
 
-        cpumask_setall(cp->free_cpus);
-
         return 0;
 }
 
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 020039bd1326..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
                struct cpumask *later_mask);
 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
 int cpudl_init(struct cpudl *cp);
+void cpudl_set_freecpu(struct cpudl *cp, int cpu);
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
 void cpudl_cleanup(struct cpudl *cp);
 #endif /* CONFIG_SMP */
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 726470d47f87..a027799ae130 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
                 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
                 dl_se->runtime = pi_se->dl_runtime;
         }
+
+        if (dl_se->dl_yielded)
+                dl_se->dl_yielded = 0;
+        if (dl_se->dl_throttled)
+                dl_se->dl_throttled = 0;
 }
 
 /*
@@ -536,23 +541,19 @@ again:
 
         sched_clock_tick();
         update_rq_clock(rq);
-        dl_se->dl_throttled = 0;
-        dl_se->dl_yielded = 0;
-        if (task_on_rq_queued(p)) {
-                enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
-                if (dl_task(rq->curr))
-                        check_preempt_curr_dl(rq, p, 0);
-                else
-                        resched_curr(rq);
+        enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+        if (dl_task(rq->curr))
+                check_preempt_curr_dl(rq, p, 0);
+        else
+                resched_curr(rq);
 #ifdef CONFIG_SMP
         /*
          * Queueing this task back might have overloaded rq,
          * check if we need to kick someone away.
          */
         if (has_pushable_dl_tasks(rq))
                 push_dl_task(rq);
 #endif
-        }
 unlock:
         raw_spin_unlock(&rq->lock);
 
@@ -613,10 +614,9 @@ static void update_curr_dl(struct rq *rq)
 
         dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
         if (dl_runtime_exceeded(rq, dl_se)) {
+                dl_se->dl_throttled = 1;
                 __dequeue_task_dl(rq, curr, 0);
-                if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
-                        dl_se->dl_throttled = 1;
-                else
+                if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
                         enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
                 if (!is_leftmost(curr, &rq->dl))
@@ -853,7 +853,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
          * its rq, the bandwidth timer callback (which clearly has not
          * run yet) will take care of this.
          */
-        if (p->dl.dl_throttled)
+        if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
                 return;
 
         enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -1073,7 +1073,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
 {
         update_curr_dl(rq);
 
-        if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
+        /*
+         * Even when we have runtime, update_curr_dl() might have resulted in us
+         * not being the leftmost task anymore. In that case NEED_RESCHED will
+         * be set and schedule() will start a new hrtick for the next task.
+         */
+        if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+            is_leftmost(p, &rq->dl))
                 start_hrtick_dl(rq, p);
 }
 
@@ -1166,9 +1172,6 @@ static int find_later_rq(struct task_struct *task)
          * We have to consider system topology and task affinity
          * first, then we can look for a suitable cpu.
          */
-        cpumask_copy(later_mask, task_rq(task)->rd->span);
-        cpumask_and(later_mask, later_mask, cpu_active_mask);
-        cpumask_and(later_mask, later_mask, &task->cpus_allowed);
         best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
                         task, later_mask);
         if (best_cpu == -1)
@@ -1563,6 +1566,7 @@ static void rq_online_dl(struct rq *rq)
         if (rq->dl.overloaded)
                 dl_set_overload(rq);
 
+        cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
         if (rq->dl.dl_nr_running > 0)
                 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
 }
@@ -1574,6 +1578,7 @@ static void rq_offline_dl(struct rq *rq)
                 dl_clear_overload(rq);
 
         cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+        cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
 }
 
 void init_sched_dl_class(void)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..8baaf858d25c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ do { \
         PN(next_balance);
         SEQ_printf(m, "  .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
         PN(clock);
+        PN(clock_task);
         P(cpu_load[0]);
         P(cpu_load[1]);
         P(cpu_load[2]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe331fc391f5..7ce18f3c097a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p)
 {
         u32 slice;
 
-        p->se.avg.decay_count = 0;
         slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
         p->se.avg.runnable_avg_sum = slice;
         p->se.avg.runnable_avg_period = slice;
@@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
         u64 decays = atomic64_read(&cfs_rq->decay_counter);
 
         decays -= se->avg.decay_count;
+        se->avg.decay_count = 0;
         if (!decays)
                 return 0;
 
         se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
-        se->avg.decay_count = 0;
 
         return decays;
 }
@@ -5157,7 +5156,7 @@ static void yield_task_fair(struct rq *rq)
                  * so we don't do microscopic update in schedule()
                  * and double the fastpath cost.
                  */
-                rq->skip_clock_update = 1;
+                rq_clock_skip_update(rq, true);
         }
 
         set_skip_buddy(se);
@@ -5949,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu)
          */
         age_stamp = ACCESS_ONCE(rq->age_stamp);
         avg = ACCESS_ONCE(rq->rt_avg);
+        delta = __rq_clock_broken(rq) - age_stamp;
 
-        delta = rq_clock(rq) - age_stamp;
         if (unlikely(delta < 0))
                 delta = 0;
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..aaf1c1d5cf5d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -47,7 +47,8 @@ static inline int cpu_idle_poll(void)
         rcu_idle_enter();
         trace_cpu_idle_rcuidle(0, smp_processor_id());
         local_irq_enable();
-        while (!tif_need_resched())
+        while (!tif_need_resched() &&
+                (cpu_idle_force_poll || tick_check_broadcast_expired()))
                 cpu_relax();
         trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
         rcu_idle_exit();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ee15f5a0d1c1..f4d4b077eba0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
                         enqueue = 1;
 
                         /*
-                         * Force a clock update if the CPU was idle,
-                         * lest wakeup -> unthrottle time accumulate.
+                         * When we're idle and a woken (rt) task is
+                         * throttled check_preempt_curr() will set
+                         * skip_update and the time between the wakeup
+                         * and this unthrottle will get accounted as
+                         * 'runtime'.
                          */
                         if (rt_rq->rt_nr_running && rq->curr == rq->idle)
-                                rq->skip_clock_update = -1;
+                                rq_clock_skip_update(rq, false);
                 }
                 if (rt_rq->rt_time || rt_rq->rt_nr_running)
                         idle = 0;
@@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
             curr->prio <= p->prio)) {
                 int target = find_lowest_rq(p);
 
-                if (target != -1)
+                /*
+                 * Don't bother moving it if the destination CPU is
+                 * not running a lower priority task.
+                 */
+                if (target != -1 &&
+                    p->prio < cpu_rq(target)->rt.highest_prio.curr)
                         cpu = target;
         }
         rcu_read_unlock();
@@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 
                 lowest_rq = cpu_rq(cpu);
 
+                if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+                        /*
+                         * Target rq has tasks of equal or higher priority,
+                         * retrying does not release any lock and is unlikely
+                         * to yield a different result.
+                         */
+                        lowest_rq = NULL;
+                        break;
+                }
+
                 /* if the prio of this runqueue changed, try again */
                 if (double_lock_balance(rq, lowest_rq)) {
                         /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c970e7..0870db23d79c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,8 +558,6 @@ struct rq {
 #ifdef CONFIG_NO_HZ_FULL
         unsigned long last_sched_tick;
 #endif
-        int skip_clock_update;
-
         /* capture load from *all* tasks on this cpu: */
         struct load_weight load;
         unsigned long nr_load_updates;
@@ -588,6 +586,7 @@ struct rq {
         unsigned long next_balance;
         struct mm_struct *prev_mm;
 
+        unsigned int clock_skip_update;
         u64 clock;
         u64 clock_task;
 
@@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 #define raw_rq()                raw_cpu_ptr(&runqueues)
 
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+        return ACCESS_ONCE(rq->clock);
+}
+
 static inline u64 rq_clock(struct rq *rq)
 {
+        lockdep_assert_held(&rq->lock);
         return rq->clock;
 }
 
 static inline u64 rq_clock_task(struct rq *rq)
 {
+        lockdep_assert_held(&rq->lock);
         return rq->clock_task;
 }
 
+#define RQCF_REQ_SKIP   0x01
+#define RQCF_ACT_SKIP   0x02
+
+static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+{
+        lockdep_assert_held(&rq->lock);
+        if (skip)
+                rq->clock_skip_update |= RQCF_REQ_SKIP;
+        else
+                rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+}
+
 #ifdef CONFIG_NUMA
 enum numa_topology_type {
         NUMA_DIRECT,
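The sched.h hunk above is also where the new serialization rules live: rq_clock() and rq_clock_task() now lockdep-assert that rq->lock is held, while the deliberately named __rq_clock_broken() offers a lockless, possibly stale snapshot; that is what the fair.c hunk switches scale_rt_capacity() to, since it runs without the rq lock. A stand-alone sketch of the distinction, with a plain assert standing in for lockdep and a volatile read standing in for ACCESS_ONCE (not kernel code):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct rq {
        bool locked;        /* stands in for raw_spin_lock_irq(&rq->lock) */
        uint64_t clock;
    };

    /* Serialized reader: only valid under rq->lock, now lockdep-asserted. */
    static uint64_t rq_clock(struct rq *rq)
    {
        assert(rq->locked); /* models lockdep_assert_held(&rq->lock) */
        return rq->clock;
    }

    /* Lockless reader: a single racy load, for paths such as
     * scale_rt_capacity() that can tolerate a stale value. */
    static uint64_t __rq_clock_broken(struct rq *rq)
    {
        return *(volatile uint64_t *)&rq->clock;
    }

    int main(void)
    {
        struct rq rq = { .locked = false, .clock = 42 };

        (void)__rq_clock_broken(&rq);   /* fine without the lock */
        rq.locked = true;
        (void)rq_clock(&rq);            /* must hold the lock */
        return 0;
    }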