author     Linus Torvalds <torvalds@linux-foundation.org>   2015-02-09 19:06:06 -0500
committer  Linus Torvalds <torvalds@linux-foundation.org>   2015-02-09 19:06:06 -0500
commit     5b9b28a63f2e47dac5ff3a2503bfe3ade8796aa0 (patch)
tree       3d6e42aa380f53c45ed60779960b420d40169256
parent     a4cbbf549a9be10b7583c44249efccd64839533d (diff)
parent     139b6fd26d85a65c4e0d2795b87b94f9505e5943 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main scheduler changes in this cycle were:
- various sched/deadline fixes and enhancements
- rescheduling latency fixes/cleanups
- rework the rq->clock code to be more consistent and more robust.
- minor micro-optimizations
- ->avg.decay_count fixes
- add a stack overflow check to might_sleep()
- idle-poll handler fix, possibly resulting in power savings
- misc smaller updates and fixes"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/Documentation: Remove unneeded word
sched/wait: Introduce wait_on_bit_timeout()
sched: Pull resched loop to __schedule() callers
sched/deadline: Remove cpu_active_mask from cpudl_find()
sched: Fix hrtick_start() on UP
sched/deadline: Avoid pointless __setscheduler()
sched/deadline: Fix stale yield state
sched/deadline: Fix hrtick for a non-leftmost task
sched/deadline: Modify cpudl::free_cpus to reflect rd->online
sched/idle: Add missing checks to the exit condition of cpu_idle_poll()
sched: Fix missing preemption opportunity
sched/rt: Reduce rq lock contention by eliminating locking of non-feasible target
sched/debug: Print rq->clock_task
sched/core: Rework rq->clock update skips
sched/core: Validate rq_clock*() serialization
sched/core: Remove check of p->sched_class
sched/fair: Fix sched_entity::avg::decay_count initialization
sched/debug: Fix potential call to __ffs(0) in sched_show_task()
sched/debug: Check for stack overflow in ___might_sleep()
sched/fair: Fix the dealing with decay_count in __synchronize_entity_decay()
-rw-r--r--  include/linux/wait.h        |  26
-rw-r--r--  kernel/locking/mutex.c      |   2
-rw-r--r--  kernel/sched/core.c         | 107
-rw-r--r--  kernel/sched/cpudeadline.c  |  27
-rw-r--r--  kernel/sched/cpudeadline.h  |   2
-rw-r--r--  kernel/sched/deadline.c     |  51
-rw-r--r--  kernel/sched/debug.c        |   1
-rw-r--r--  kernel/sched/fair.c         |   7
-rw-r--r--  kernel/sched/idle.c         |   3
-rw-r--r--  kernel/sched/rt.c           |  26
-rw-r--r--  kernel/sched/sched.h        |  22
11 files changed, 197 insertions, 77 deletions
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 37423e0e1379..537d58eea8a0 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -990,6 +990,32 @@ wait_on_bit_io(void *word, int bit, unsigned mode)
 }
 
 /**
+ * wait_on_bit_timeout - wait for a bit to be cleared or a timeout elapses
+ * @word: the word being waited on, a kernel virtual address
+ * @bit: the bit of the word being waited on
+ * @mode: the task state to sleep in
+ * @timeout: timeout, in jiffies
+ *
+ * Use the standard hashed waitqueue table to wait for a bit
+ * to be cleared. This is similar to wait_on_bit(), except also takes a
+ * timeout parameter.
+ *
+ * Returned value will be zero if the bit was cleared before the
+ * @timeout elapsed, or non-zero if the @timeout elapsed or process
+ * received a signal and the mode permitted wakeup on that signal.
+ */
+static inline int
+wait_on_bit_timeout(void *word, int bit, unsigned mode, unsigned long timeout)
+{
+        might_sleep();
+        if (!test_bit(bit, word))
+                return 0;
+        return out_of_line_wait_on_bit_timeout(word, bit,
+                                               bit_wait_timeout,
+                                               mode, timeout);
+}
+
+/**
  * wait_on_bit_action - wait for a bit to be cleared
  * @word: the word being waited on, a kernel virtual address
  * @bit: the bit of the word being waited on
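
The kernel-doc above fully specifies the new primitive; the sketch below shows how a caller would typically use it. It is an illustration only, not code from this series: struct mydrv, MYDRV_FLAG_BUSY and mydrv_wait_idle() are made-up names, while wait_on_bit_timeout(), TASK_UNINTERRUPTIBLE, msecs_to_jiffies() and wake_up_bit() are existing kernel interfaces.

#include <linux/errno.h>
#include <linux/jiffies.h>
#include <linux/sched.h>
#include <linux/wait.h>

/* Hypothetical driver state; the flags word must be an addressable unsigned long. */
struct mydrv {
        unsigned long flags;
};
#define MYDRV_FLAG_BUSY 0       /* hypothetical bit number */

/* Wait up to one second for the busy bit to clear. */
static int mydrv_wait_idle(struct mydrv *drv)
{
        if (wait_on_bit_timeout(&drv->flags, MYDRV_FLAG_BUSY,
                                TASK_UNINTERRUPTIBLE,
                                msecs_to_jiffies(1000)))
                return -ETIMEDOUT;      /* non-zero: timed out (or a signal, with an interruptible mode) */

        return 0;
}

As with the other wait_on_bit*() helpers, whoever clears the bit is expected to follow clear_bit() with wake_up_bit() on the same word; otherwise the waiter only returns once the timeout expires.
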
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 57407062e209..94674e5919cb 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -81,7 +81,7 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count);
  * The mutex must later on be released by the same task that
  * acquired it. Recursive locking is not allowed. The task
  * may not exit without first unlocking the mutex. Also, kernel
- * memory where the mutex resides mutex must not be freed with
+ * memory where the mutex resides must not be freed with
  * the mutex still locked. The mutex must first be initialized
  * (or statically defined) before it can be locked. memset()-ing
  * the mutex to 0 is not allowed.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1612578a5b7a..1f37fe7f77a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
 {
         s64 delta;
 
-        if (rq->skip_clock_update > 0)
+        lockdep_assert_held(&rq->lock);
+
+        if (rq->clock_skip_update & RQCF_ACT_SKIP)
                 return;
 
         delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -490,6 +492,11 @@ static __init void init_hrtick(void)
  */
 void hrtick_start(struct rq *rq, u64 delay)
 {
+        /*
+         * Don't schedule slices shorter than 10000ns, that just
+         * doesn't make sense. Rely on vruntime for fairness.
+         */
+        delay = max_t(u64, delay, 10000LL);
         __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
                         HRTIMER_MODE_REL_PINNED, 0);
 }
@@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
          * this case, we can save a useless back to back clock update.
          */
         if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
-                rq->skip_clock_update = 1;
+                rq_clock_skip_update(rq, true);
 }
 
 #ifdef CONFIG_SMP
@@ -1836,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         p->se.prev_sum_exec_runtime = 0;
         p->se.nr_migrations = 0;
         p->se.vruntime = 0;
+#ifdef CONFIG_SMP
+        p->se.avg.decay_count = 0;
+#endif
         INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
@@ -2755,6 +2765,10 @@ again:
  * - explicit schedule() call
  * - return from syscall or exception to user-space
  * - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
  */
 static void __sched __schedule(void)
 {
@@ -2763,7 +2777,6 @@ static void __sched __schedule(void)
         struct rq *rq;
         int cpu;
 
-need_resched:
         preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
@@ -2783,6 +2796,8 @@ need_resched:
         smp_mb__before_spinlock();
         raw_spin_lock_irq(&rq->lock);
 
+        rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
         switch_count = &prev->nivcsw;
         if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
                 if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2807,13 +2822,13 @@ need_resched:
                 switch_count = &prev->nvcsw;
         }
 
-        if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
+        if (task_on_rq_queued(prev))
                 update_rq_clock(rq);
 
         next = pick_next_task(rq, prev);
         clear_tsk_need_resched(prev);
         clear_preempt_need_resched();
-        rq->skip_clock_update = 0;
+        rq->clock_skip_update = 0;
 
         if (likely(prev != next)) {
                 rq->nr_switches++;
@@ -2828,8 +2843,6 @@ need_resched:
         post_schedule(rq);
 
         sched_preempt_enable_no_resched();
-        if (need_resched())
-                goto need_resched;
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -2849,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void)
         struct task_struct *tsk = current;
 
         sched_submit_work(tsk);
-        __schedule();
+        do {
+                __schedule();
+        } while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
 
@@ -2884,6 +2899,21 @@ void __sched schedule_preempt_disabled(void)
         preempt_disable();
 }
 
+static void preempt_schedule_common(void)
+{
+        do {
+                __preempt_count_add(PREEMPT_ACTIVE);
+                __schedule();
+                __preempt_count_sub(PREEMPT_ACTIVE);
+
+                /*
+                 * Check again in case we missed a preemption opportunity
+                 * between schedule and now.
+                 */
+                barrier();
+        } while (need_resched());
+}
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
@@ -2899,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
         if (likely(!preemptible()))
                 return;
 
-        do {
-                __preempt_count_add(PREEMPT_ACTIVE);
-                __schedule();
-                __preempt_count_sub(PREEMPT_ACTIVE);
-
-                /*
-                 * Check again in case we missed a preemption opportunity
-                 * between schedule and now.
-                 */
-                barrier();
-        } while (need_resched());
+        preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
@@ -3405,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p)
         return match;
 }
 
+static bool dl_param_changed(struct task_struct *p,
+                const struct sched_attr *attr)
+{
+        struct sched_dl_entity *dl_se = &p->dl;
+
+        if (dl_se->dl_runtime != attr->sched_runtime ||
+            dl_se->dl_deadline != attr->sched_deadline ||
+            dl_se->dl_period != attr->sched_period ||
+            dl_se->flags != attr->sched_flags)
+                return true;
+
+        return false;
+}
+
 static int __sched_setscheduler(struct task_struct *p,
                                 const struct sched_attr *attr,
                                 bool user)
@@ -3533,7 +3567,7 @@ recheck:
                         goto change;
                 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
                         goto change;
-                if (dl_policy(policy))
+                if (dl_policy(policy) && dl_param_changed(p, attr))
                         goto change;
 
                 p->sched_reset_on_fork = reset_on_fork;
@@ -4225,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield)
         return 0;
 }
 
-static void __cond_resched(void)
-{
-        __preempt_count_add(PREEMPT_ACTIVE);
-        __schedule();
-        __preempt_count_sub(PREEMPT_ACTIVE);
-}
-
 int __sched _cond_resched(void)
 {
         if (should_resched()) {
-                __cond_resched();
+                preempt_schedule_common();
                 return 1;
         }
         return 0;
@@ -4260,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock)
         if (spin_needbreak(lock) || resched) {
                 spin_unlock(lock);
                 if (resched)
-                        __cond_resched();
+                        preempt_schedule_common();
                 else
                         cpu_relax();
                 ret = 1;
@@ -4276,7 +4303,7 @@ int __sched __cond_resched_softirq(void)
 
         if (should_resched()) {
                 local_bh_enable();
-                __cond_resched();
+                preempt_schedule_common();
                 local_bh_disable();
                 return 1;
         }
@@ -4531,9 +4558,10 @@ void sched_show_task(struct task_struct *p)
 {
         unsigned long free = 0;
         int ppid;
-        unsigned state;
+        unsigned long state = p->state;
 
-        state = p->state ? __ffs(p->state) + 1 : 0;
+        if (state)
+                state = __ffs(state) + 1;
         printk(KERN_INFO "%-15.15s %c", p->comm,
                 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if BITS_PER_LONG == 32
@@ -4766,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
-        if (p->sched_class && p->sched_class->set_cpus_allowed)
+        if (p->sched_class->set_cpus_allowed)
                 p->sched_class->set_cpus_allowed(p, new_mask);
 
         cpumask_copy(&p->cpus_allowed, new_mask);
@@ -7276,6 +7304,11 @@ void __init sched_init(void)
         enter_lazy_tlb(&init_mm, current);
 
         /*
+         * During early bootup we pretend to be a normal task:
+         */
+        current->sched_class = &fair_sched_class;
+
+        /*
          * Make us the idle thread. Technically, schedule() should not be
          * called from this thread, however somewhere below it might be,
          * but because we are the idle thread, we just pick up running again
@@ -7285,11 +7318,6 @@ void __init sched_init(void)
 
         calc_load_update = jiffies + LOAD_FREQ;
 
-        /*
-         * During early bootup we pretend to be a normal task:
-         */
-        current->sched_class = &fair_sched_class;
-
 #ifdef CONFIG_SMP
         zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
         /* May be allocated at isolcpus cmdline parse time */
@@ -7350,6 +7378,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
                         in_atomic(), irqs_disabled(),
                         current->pid, current->comm);
 
+        if (task_stack_end_corrupted(current))
+                printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
         debug_show_held_locks(current);
         if (irqs_disabled())
                 print_irqtrace_events(current);
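
One easy-to-miss core.c hunk above is the sched_show_task() change ('Fix potential call to __ffs(0)'): the old code read p->state twice, once in the test and once inside __ffs(), so a concurrent transition to TASK_RUNNING (state 0) between the two reads could hand 0 to __ffs(), whose result is undefined for a zero argument. A worked illustration of the fixed pattern, assuming TASK_UNINTERRUPTIBLE == 0x2 as in kernels of this era:

        unsigned long state = p->state;         /* snapshot once; p->state may change under us */

        if (state)                              /* e.g. state == TASK_UNINTERRUPTIBLE == 0x2 */
                state = __ffs(state) + 1;       /* __ffs(0x2) == 1, so state == 2, the 'D' slot of stat_nam[] */
        /* state == 0 still means running ('R'); __ffs() is never reached with 0 */
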
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 539ca3ce071b..c6acb07466bb 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,7 +107,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
         int best_cpu = -1;
         const struct sched_dl_entity *dl_se = &p->dl;
 
-        if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
+        if (later_mask &&
+            cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
                 best_cpu = cpumask_any(later_mask);
                 goto out;
         } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
@@ -186,6 +187,26 @@ out:
 }
 
 /*
+ * cpudl_set_freecpu - Set the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_set_freecpu(struct cpudl *cp, int cpu)
+{
+        cpumask_set_cpu(cpu, cp->free_cpus);
+}
+
+/*
+ * cpudl_clear_freecpu - Clear the cpudl.free_cpus
+ * @cp: the cpudl max-heap context
+ * @cpu: rd attached cpu
+ */
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
+{
+        cpumask_clear_cpu(cpu, cp->free_cpus);
+}
+
+/*
  * cpudl_init - initialize the cpudl structure
  * @cp: the cpudl max-heap context
  */
@@ -203,7 +224,7 @@ int cpudl_init(struct cpudl *cp)
         if (!cp->elements)
                 return -ENOMEM;
 
-        if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
+        if (!zalloc_cpumask_var(&cp->free_cpus, GFP_KERNEL)) {
                 kfree(cp->elements);
                 return -ENOMEM;
         }
@@ -211,8 +232,6 @@ int cpudl_init(struct cpudl *cp)
         for_each_possible_cpu(i)
                 cp->elements[i].idx = IDX_INVALID;
 
-        cpumask_setall(cp->free_cpus);
-
         return 0;
 }
 
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 020039bd1326..1a0a6ef2fbe1 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -24,6 +24,8 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
                struct cpumask *later_mask);
 void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
 int cpudl_init(struct cpudl *cp);
+void cpudl_set_freecpu(struct cpudl *cp, int cpu);
+void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
 void cpudl_cleanup(struct cpudl *cp);
 #endif /* CONFIG_SMP */
 
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 726470d47f87..a027799ae130 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -350,6 +350,11 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
                 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
                 dl_se->runtime = pi_se->dl_runtime;
         }
+
+        if (dl_se->dl_yielded)
+                dl_se->dl_yielded = 0;
+        if (dl_se->dl_throttled)
+                dl_se->dl_throttled = 0;
 }
 
 /*
@@ -536,23 +541,19 @@ again:
 
         sched_clock_tick();
         update_rq_clock(rq);
-        dl_se->dl_throttled = 0;
-        dl_se->dl_yielded = 0;
-        if (task_on_rq_queued(p)) {
-                enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
-                if (dl_task(rq->curr))
-                        check_preempt_curr_dl(rq, p, 0);
-                else
-                        resched_curr(rq);
+        enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
+        if (dl_task(rq->curr))
+                check_preempt_curr_dl(rq, p, 0);
+        else
+                resched_curr(rq);
 #ifdef CONFIG_SMP
         /*
          * Queueing this task back might have overloaded rq,
          * check if we need to kick someone away.
          */
         if (has_pushable_dl_tasks(rq))
                 push_dl_task(rq);
 #endif
-        }
 unlock:
         raw_spin_unlock(&rq->lock);
 
@@ -613,10 +614,9 @@ static void update_curr_dl(struct rq *rq)
 
         dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
         if (dl_runtime_exceeded(rq, dl_se)) {
+                dl_se->dl_throttled = 1;
                 __dequeue_task_dl(rq, curr, 0);
-                if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
-                        dl_se->dl_throttled = 1;
-                else
+                if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
                         enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
 
                 if (!is_leftmost(curr, &rq->dl))
@@ -853,7 +853,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
          * its rq, the bandwidth timer callback (which clearly has not
          * run yet) will take care of this.
          */
-        if (p->dl.dl_throttled)
+        if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
                 return;
 
         enqueue_dl_entity(&p->dl, pi_se, flags);
@@ -1073,7 +1073,13 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
 {
         update_curr_dl(rq);
 
-        if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
+        /*
+         * Even when we have runtime, update_curr_dl() might have resulted in us
+         * not being the leftmost task anymore. In that case NEED_RESCHED will
+         * be set and schedule() will start a new hrtick for the next task.
+         */
+        if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
+            is_leftmost(p, &rq->dl))
                 start_hrtick_dl(rq, p);
 }
 
@@ -1166,9 +1172,6 @@ static int find_later_rq(struct task_struct *task)
          * We have to consider system topology and task affinity
          * first, then we can look for a suitable cpu.
          */
-        cpumask_copy(later_mask, task_rq(task)->rd->span);
-        cpumask_and(later_mask, later_mask, cpu_active_mask);
-        cpumask_and(later_mask, later_mask, &task->cpus_allowed);
         best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
                         task, later_mask);
         if (best_cpu == -1)
@@ -1563,6 +1566,7 @@ static void rq_online_dl(struct rq *rq)
         if (rq->dl.overloaded)
                 dl_set_overload(rq);
 
+        cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
         if (rq->dl.dl_nr_running > 0)
                 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
 }
@@ -1574,6 +1578,7 @@ static void rq_offline_dl(struct rq *rq)
                 dl_clear_overload(rq);
 
         cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
+        cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
 }
 
 void init_sched_dl_class(void)
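
The dl_param_changed() helper added in the core.c hunk earlier ties into these deadline.c changes: when a SCHED_DEADLINE task re-submits identical parameters, __sched_setscheduler() now takes the 'nothing changed' early exit instead of running a full __setscheduler(). The user-space sketch below shows such a call. It is illustrative only: struct sched_attr and the raw syscall wrapper are spelled out by hand because the glibc of this era does not wrap sched_setattr(), SYS_sched_setattr assumes headers generated from a >= 3.14 kernel, and the runtime/deadline/period values are arbitrary.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE  6
#endif

struct sched_attr {                     /* mirrors the kernel's uapi layout */
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;         /* runtime/deadline/period are in nanoseconds */
        uint64_t sched_deadline;
        uint64_t sched_period;
};

static int sched_setattr(pid_t pid, const struct sched_attr *attr,
                         unsigned int flags)
{
        return syscall(SYS_sched_setattr, pid, attr, flags);
}

int main(void)
{
        struct sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.sched_policy   = SCHED_DEADLINE;
        attr.sched_runtime  = 10ULL * 1000 * 1000;      /*  10 ms */
        attr.sched_deadline = 30ULL * 1000 * 1000;      /*  30 ms */
        attr.sched_period   = 100ULL * 1000 * 1000;     /* 100 ms */

        if (sched_setattr(0, &attr, 0))                 /* pid 0 == calling thread */
                perror("sched_setattr");

        /*
         * Repeating the exact same call (same policy, runtime, deadline,
         * period and flags) now hits the dl_param_changed() bailout rather
         * than re-running the whole parameter-change path.
         */
        return 0;
}
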
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 92cc52001e74..8baaf858d25c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -305,6 +305,7 @@ do { \
         PN(next_balance);
         SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
         PN(clock);
+        PN(clock_task);
         P(cpu_load[0]);
         P(cpu_load[1]);
         P(cpu_load[2]);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index fe331fc391f5..7ce18f3c097a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -676,7 +676,6 @@ void init_task_runnable_average(struct task_struct *p)
 {
         u32 slice;
 
-        p->se.avg.decay_count = 0;
         slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
         p->se.avg.runnable_avg_sum = slice;
         p->se.avg.runnable_avg_period = slice;
@@ -2574,11 +2573,11 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se)
         u64 decays = atomic64_read(&cfs_rq->decay_counter);
 
         decays -= se->avg.decay_count;
+        se->avg.decay_count = 0;
         if (!decays)
                 return 0;
 
         se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
-        se->avg.decay_count = 0;
 
         return decays;
 }
@@ -5157,7 +5156,7 @@ static void yield_task_fair(struct rq *rq)
                  * so we don't do microscopic update in schedule()
                  * and double the fastpath cost.
                  */
-                rq->skip_clock_update = 1;
+                rq_clock_skip_update(rq, true);
         }
 
         set_skip_buddy(se);
@@ -5949,8 +5948,8 @@ static unsigned long scale_rt_capacity(int cpu)
          */
         age_stamp = ACCESS_ONCE(rq->age_stamp);
         avg = ACCESS_ONCE(rq->rt_avg);
+        delta = __rq_clock_broken(rq) - age_stamp;
 
-        delta = rq_clock(rq) - age_stamp;
         if (unlikely(delta < 0))
                 delta = 0;
 
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c47fce75e666..aaf1c1d5cf5d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -47,7 +47,8 @@ static inline int cpu_idle_poll(void)
         rcu_idle_enter();
         trace_cpu_idle_rcuidle(0, smp_processor_id());
         local_irq_enable();
-        while (!tif_need_resched())
+        while (!tif_need_resched() &&
+                (cpu_idle_force_poll || tick_check_broadcast_expired()))
                 cpu_relax();
         trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
         rcu_idle_exit();
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index ee15f5a0d1c1..f4d4b077eba0 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -831,11 +831,14 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
                                 enqueue = 1;
 
                                 /*
-                                 * Force a clock update if the CPU was idle,
-                                 * lest wakeup -> unthrottle time accumulate.
+                                 * When we're idle and a woken (rt) task is
+                                 * throttled check_preempt_curr() will set
+                                 * skip_update and the time between the wakeup
+                                 * and this unthrottle will get accounted as
+                                 * 'runtime'.
                                  */
                                 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
-                                        rq->skip_clock_update = -1;
+                                        rq_clock_skip_update(rq, false);
                         }
                         if (rt_rq->rt_time || rt_rq->rt_nr_running)
                                 idle = 0;
@@ -1337,7 +1340,12 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
             curr->prio <= p->prio)) {
                 int target = find_lowest_rq(p);
 
-                if (target != -1)
+                /*
+                 * Don't bother moving it if the destination CPU is
+                 * not running a lower priority task.
+                 */
+                if (target != -1 &&
+                    p->prio < cpu_rq(target)->rt.highest_prio.curr)
                         cpu = target;
         }
         rcu_read_unlock();
@@ -1614,6 +1622,16 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 
                 lowest_rq = cpu_rq(cpu);
 
+                if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+                        /*
+                         * Target rq has tasks of equal or higher priority,
+                         * retrying does not release any lock and is unlikely
+                         * to yield a different result.
+                         */
+                        lowest_rq = NULL;
+                        break;
+                }
+
                 /* if the prio of this runqueue changed, try again */
                 if (double_lock_balance(rq, lowest_rq)) {
                         /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 9a2a45c970e7..0870db23d79c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -558,8 +558,6 @@ struct rq {
 #ifdef CONFIG_NO_HZ_FULL
         unsigned long last_sched_tick;
 #endif
-        int skip_clock_update;
-
         /* capture load from *all* tasks on this cpu: */
         struct load_weight load;
         unsigned long nr_load_updates;
@@ -588,6 +586,7 @@ struct rq {
         unsigned long next_balance;
         struct mm_struct *prev_mm;
 
+        unsigned int clock_skip_update;
         u64 clock;
         u64 clock_task;
 
@@ -687,16 +686,35 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 #define cpu_curr(cpu) (cpu_rq(cpu)->curr)
 #define raw_rq() raw_cpu_ptr(&runqueues)
 
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+        return ACCESS_ONCE(rq->clock);
+}
+
 static inline u64 rq_clock(struct rq *rq)
 {
+        lockdep_assert_held(&rq->lock);
         return rq->clock;
 }
 
 static inline u64 rq_clock_task(struct rq *rq)
 {
+        lockdep_assert_held(&rq->lock);
         return rq->clock_task;
 }
 
+#define RQCF_REQ_SKIP 0x01
+#define RQCF_ACT_SKIP 0x02
+
+static inline void rq_clock_skip_update(struct rq *rq, bool skip)
+{
+        lockdep_assert_held(&rq->lock);
+        if (skip)
+                rq->clock_skip_update |= RQCF_REQ_SKIP;
+        else
+                rq->clock_skip_update &= ~RQCF_REQ_SKIP;
+}
+
 #ifdef CONFIG_NUMA
 enum numa_topology_type {
         NUMA_DIRECT,
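
Read together with the core.c hunks, the two RQCF_* bits above form a small request/acknowledge protocol around rq->clock updates. The sketch below condenses that flow into one place purely for illustration; no such function exists in the tree, but each statement mirrors a hunk in this merge.

/* Illustration only: the life cycle of rq->clock_skip_update. */
static void clock_skip_update_lifecycle(struct rq *rq)
{
        /* 1. A requester (check_preempt_curr(), yield_task_fair()) asks for a skip. */
        rq_clock_skip_update(rq, true);         /* sets RQCF_REQ_SKIP (0x01) */

        /* 2. __schedule() promotes the request under rq->lock: 0x01 << 1 == RQCF_ACT_SKIP. */
        rq->clock_skip_update <<= 1;

        /*
         * 3. update_rq_clock() sees RQCF_ACT_SKIP and returns without touching
         *    rq->clock, so the useless back-to-back update is avoided.
         */
        update_rq_clock(rq);

        /*
         * 4. __schedule() resets the state before the context switch, so a skip
         *    never outlives one scheduling decision.  A requester can also
         *    withdraw a pending request with rq_clock_skip_update(rq, false),
         *    as the rt.c unthrottle path above does.
         */
        rq->clock_skip_update = 0;
}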