Diffstat (limited to 'kernel/sched/core.c')
 kernel/sched/core.c | 163
 1 file changed, 108 insertions(+), 55 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b5797b78add6..1f37fe7f77a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
 {
 	s64 delta;
 
-	if (rq->skip_clock_update > 0)
+	lockdep_assert_held(&rq->lock);
+
+	if (rq->clock_skip_update & RQCF_ACT_SKIP)
 		return;
 
 	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
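
For context: the old integer rq->skip_clock_update is replaced by a small request/act bitmask. A sketch of the flags and the rq_clock_skip_update() helper used later in this patch, roughly as they appear in kernel/sched/sched.h of the same series (exact bodies may differ):

	/* kernel/sched/sched.h -- sketch, not part of this file's diff */
	#define RQCF_REQ_SKIP	0x01	/* a clock update skip was requested */
	#define RQCF_ACT_SKIP	0x02	/* the skip is active for this __schedule() pass */

	static inline void rq_clock_skip_update(struct rq *rq, bool skip)
	{
		lockdep_assert_held(&rq->lock);
		if (skip)
			rq->clock_skip_update |= RQCF_REQ_SKIP;
		else
			rq->clock_skip_update &= ~RQCF_REQ_SKIP;
	}

A requested skip only takes effect once __schedule() promotes REQ to ACT via the "rq->clock_skip_update <<= 1" line further down in this patch, and it is cleared again before the next pass, so at most one back-to-back clock update is suppressed.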
@@ -490,6 +492,11 @@ static __init void init_hrtick(void)
  */
 void hrtick_start(struct rq *rq, u64 delay)
 {
+	/*
+	 * Don't schedule slices shorter than 10000ns, that just
+	 * doesn't make sense. Rely on vruntime for fairness.
+	 */
+	delay = max_t(u64, delay, 10000LL);
 	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
 				 HRTIMER_MODE_REL_PINNED, 0);
 }
@@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * this case, we can save a useless back to back clock update.
 	 */
 	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
-		rq->skip_clock_update = 1;
+		rq_clock_skip_update(rq, true);
 }
 
 #ifdef CONFIG_SMP
@@ -1082,7 +1089,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
-		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
 	}
 
 	__set_task_cpu(p, new_cpu);
@@ -1814,6 +1821,10 @@ void __dl_clear_params(struct task_struct *p)
 	dl_se->dl_period = 0;
 	dl_se->flags = 0;
 	dl_se->dl_bw = 0;
+
+	dl_se->dl_throttled = 0;
+	dl_se->dl_new = 1;
+	dl_se->dl_yielded = 0;
 }
 
 /*
@@ -1832,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.nr_migrations = 0;
 	p->se.vruntime = 0;
+#ifdef CONFIG_SMP
+	p->se.avg.decay_count = 0;
+#endif
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
@@ -1839,7 +1853,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif
 
 	RB_CLEAR_NODE(&p->dl.rb_node);
-	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	init_dl_task_timer(&p->dl);
 	__dl_clear_params(p);
 
 	INIT_LIST_HEAD(&p->rt.run_list);
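
The open-coded hrtimer_init() is replaced by init_dl_task_timer(), which also wires up the deadline timer's callback. Roughly, from kernel/sched/deadline.c (sketch, body approximate):

	/* kernel/sched/deadline.c -- sketch, not part of this file's diff */
	void init_dl_task_timer(struct sched_dl_entity *dl_se)
	{
		struct hrtimer *timer = &dl_se->dl_timer;

		hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		timer->function = dl_task_timer;	/* replenishment / unthrottle handler */
	}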
@@ -2049,6 +2063,9 @@ static inline int dl_bw_cpus(int i)
  * allocated bandwidth to reflect the new situation.
  *
  * This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
  */
 static int dl_overflow(struct task_struct *p, int policy,
 		       const struct sched_attr *attr)
@@ -2748,6 +2765,10 @@ again:
  *      - explicit schedule() call
  *      - return from syscall or exception to user-space
  *      - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
  */
 static void __sched __schedule(void)
 {
@@ -2756,7 +2777,6 @@ static void __sched __schedule(void)
 	struct rq *rq;
 	int cpu;
 
-need_resched:
 	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
@@ -2776,6 +2796,8 @@ need_resched:
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
 
+	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
 	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2800,13 +2822,13 @@ need_resched:
 		switch_count = &prev->nvcsw;
 	}
 
-	if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
+	if (task_on_rq_queued(prev))
 		update_rq_clock(rq);
 
 	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
-	rq->skip_clock_update = 0;
+	rq->clock_skip_update = 0;
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
@@ -2821,8 +2843,6 @@ need_resched:
 	post_schedule(rq);
 
 	sched_preempt_enable_no_resched();
-	if (need_resched())
-		goto need_resched;
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -2842,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void)
 	struct task_struct *tsk = current;
 
 	sched_submit_work(tsk);
-	__schedule();
+	do {
+		__schedule();
+	} while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
 
@@ -2877,6 +2899,21 @@ void __sched schedule_preempt_disabled(void)
 	preempt_disable();
 }
 
+static void preempt_schedule_common(void)
+{
+	do {
+		__preempt_count_add(PREEMPT_ACTIVE);
+		__schedule();
+		__preempt_count_sub(PREEMPT_ACTIVE);
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+		barrier();
+	} while (need_resched());
+}
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
@@ -2892,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 	if (likely(!preemptible()))
 		return;
 
-	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
-		__schedule();
-		__preempt_count_sub(PREEMPT_ACTIVE);
-
-		/*
-		 * Check again in case we missed a preemption opportunity
-		 * between schedule and now.
-		 */
-		barrier();
-	} while (need_resched());
+	preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
@@ -3251,15 +3278,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
 
-	init_dl_task_timer(dl_se);
 	dl_se->dl_runtime = attr->sched_runtime;
 	dl_se->dl_deadline = attr->sched_deadline;
 	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
 	dl_se->flags = attr->sched_flags;
 	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-	dl_se->dl_throttled = 0;
-	dl_se->dl_new = 1;
-	dl_se->dl_yielded = 0;
+
+	/*
+	 * Changing the parameters of a task is 'tricky' and we're not doing
+	 * the correct thing -- also see task_dead_dl() and switched_from_dl().
+	 *
+	 * What we SHOULD do is delay the bandwidth release until the 0-lag
+	 * point. This would include retaining the task_struct until that time
+	 * and change dl_overflow() to not immediately decrement the current
+	 * amount.
+	 *
+	 * Instead we retain the current runtime/deadline and let the new
+	 * parameters take effect after the current reservation period lapses.
+	 * This is safe (albeit pessimistic) because the 0-lag point is always
+	 * before the current scheduling deadline.
+	 *
+	 * We can still have temporary overloads because we do not delay the
+	 * change in bandwidth until that time; so admission control is
+	 * not on the safe side. It does however guarantee tasks will never
+	 * consume more than promised.
+	 */
 }
 
 /*
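
To make the "0-lag point" in the new comment concrete: for a constant-bandwidth reservation it is the instant at which the remaining runtime, consumed at the reserved bandwidth dl_runtime/dl_period, would reach zero. A hypothetical helper, for illustration only (no such function is added by this patch):

	/* Hypothetical, illustration only -- not part of this patch. */
	static u64 dl_zero_lag_time(const struct sched_dl_entity *dl_se)
	{
		/*
		 * Remaining runtime divided by the reserved bandwidth,
		 * i.e. runtime * dl_period / dl_runtime, counted back
		 * from the absolute deadline.
		 */
		u64 slack = div64_u64((u64)dl_se->runtime * dl_se->dl_period,
				      dl_se->dl_runtime);

		return dl_se->deadline - slack;
	}

Because the remaining runtime is never negative, this point never lies after the current deadline, which is what makes "let the old reservation lapse first" safe, if pessimistic.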
@@ -3382,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p)
 	return match;
 }
 
+static bool dl_param_changed(struct task_struct *p,
+		const struct sched_attr *attr)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	if (dl_se->dl_runtime != attr->sched_runtime ||
+	    dl_se->dl_deadline != attr->sched_deadline ||
+	    dl_se->dl_period != attr->sched_period ||
+	    dl_se->flags != attr->sched_flags)
+		return true;
+
+	return false;
+}
+
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
 				bool user)
@@ -3510,7 +3567,7 @@ recheck:
 			goto change;
 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
 			goto change;
-		if (dl_policy(policy))
+		if (dl_policy(policy) && dl_param_changed(p, attr))
 			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
@@ -4202,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield)
 	return 0;
 }
 
-static void __cond_resched(void)
-{
-	__preempt_count_add(PREEMPT_ACTIVE);
-	__schedule();
-	__preempt_count_sub(PREEMPT_ACTIVE);
-}
-
 int __sched _cond_resched(void)
 {
 	if (should_resched()) {
-		__cond_resched();
+		preempt_schedule_common();
 		return 1;
 	}
 	return 0;
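
All three cond_resched variants now funnel into preempt_schedule_common() instead of the removed __cond_resched(). The gate is should_resched(), which at this point in the tree is roughly the generic version below (asm-generic/preempt.h; architectures may provide their own):

	/* asm-generic/preempt.h -- sketch */
	static __always_inline bool should_resched(void)
	{
		/*
		 * Reschedule only when preemption is not otherwise held off
		 * and the current task has TIF_NEED_RESCHED set.
		 */
		return unlikely(!preempt_count() && tif_need_resched());
	}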
@@ -4237,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock)
 	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
-			__cond_resched();
+			preempt_schedule_common();
 		else
 			cpu_relax();
 		ret = 1;
@@ -4253,7 +4303,7 @@ int __sched __cond_resched_softirq(void)
 
 	if (should_resched()) {
 		local_bh_enable();
-		__cond_resched();
+		preempt_schedule_common();
 		local_bh_disable();
 		return 1;
 	}
@@ -4508,9 +4558,10 @@ void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
 	int ppid;
-	unsigned state;
+	unsigned long state = p->state;
 
-	state = p->state ? __ffs(p->state) + 1 : 0;
+	if (state)
+		state = __ffs(state) + 1;
 	printk(KERN_INFO "%-15.15s %c", p->comm,
 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if BITS_PER_LONG == 32
@@ -4642,6 +4693,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 	struct dl_bw *cur_dl_b;
 	unsigned long flags;
 
+	if (!cpumask_weight(cur))
+		return ret;
+
 	rcu_read_lock_sched();
 	cur_dl_b = dl_bw_of(cpumask_any(cur));
 	trial_cpus = cpumask_weight(trial);
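
The new early return matters because cpumask_any() on an empty mask yields an out-of-range CPU number, which dl_bw_of() would then use to look up per-CPU data. Simplified, from include/linux/cpumask.h:

	/* include/linux/cpumask.h -- sketch */
	#define cpumask_any(srcp)	cpumask_first(srcp)
	/*
	 * cpumask_first() returns >= nr_cpu_ids when no bit is set, so an
	 * empty 'cur' mask has to be rejected before calling dl_bw_of().
	 */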
@@ -4740,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
-	if (p->sched_class && p->sched_class->set_cpus_allowed)
+	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
 	cpumask_copy(&p->cpus_allowed, new_mask);
@@ -7113,9 +7167,6 @@ void __init sched_init(void)
 #ifdef CONFIG_RT_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	alloc_size += num_possible_cpus() * cpumask_size();
-#endif
 	if (alloc_size) {
 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
@@ -7135,13 +7186,13 @@ void __init sched_init(void)
 		ptr += nr_cpu_ids * sizeof(void **);
 
 #endif /* CONFIG_RT_GROUP_SCHED */
+	}
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	for_each_possible_cpu(i) {
-		per_cpu(load_balance_mask, i) = (void *)ptr;
-		ptr += cpumask_size();
-	}
-#endif /* CONFIG_CPUMASK_OFFSTACK */
+		per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
 	}
+#endif /* CONFIG_CPUMASK_OFFSTACK */
 
 	init_rt_bandwidth(&def_rt_bandwidth,
 			  global_rt_period(), global_rt_runtime());
@@ -7253,6 +7304,11 @@ void __init sched_init(void)
 	enter_lazy_tlb(&init_mm, current);
 
 	/*
+	 * During early bootup we pretend to be a normal task:
+	 */
+	current->sched_class = &fair_sched_class;
+
+	/*
 	 * Make us the idle thread. Technically, schedule() should not be
 	 * called from this thread, however somewhere below it might be,
 	 * but because we are the idle thread, we just pick up running again
@@ -7262,11 +7318,6 @@ void __init sched_init(void)
 
 	calc_load_update = jiffies + LOAD_FREQ;
 
-	/*
-	 * During early bootup we pretend to be a normal task:
-	 */
-	current->sched_class = &fair_sched_class;
-
 #ifdef CONFIG_SMP
 	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
 	/* May be allocated at isolcpus cmdline parse time */
@@ -7295,13 +7346,12 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 	 * since we will exit with TASK_RUNNING make sure we enter with it,
 	 * otherwise we will destroy state.
 	 */
-	if (WARN_ONCE(current->state != TASK_RUNNING,
+	WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
 			"do not call blocking ops when !TASK_RUNNING; "
 			"state=%lx set at [<%p>] %pS\n",
 			current->state,
 			(void *)current->task_state_change,
-			(void *)current->task_state_change))
-		__set_current_state(TASK_RUNNING);
+			(void *)current->task_state_change);
 
 	___might_sleep(file, line, preempt_offset);
 }
@@ -7328,6 +7378,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 			in_atomic(), irqs_disabled(),
 			current->pid, current->comm);
 
+	if (task_stack_end_corrupted(current))
+		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
 	debug_show_held_locks(current);
 	if (irqs_disabled())
 		print_irqtrace_events(current);
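
task_stack_end_corrupted() is the stack-end magic check, approximately (include/linux/sched.h):

	/* include/linux/sched.h -- sketch */
	#define task_stack_end_corrupted(task) \
			(*(end_of_stack(task)) != STACK_END_MAGIC)

Printing this from ___might_sleep() helps explain otherwise baffling "sleeping while atomic" reports caused by a thread having already overrun its stack.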