Diffstat (limited to 'kernel/sched/core.c')
 kernel/sched/core.c | 163
 1 file changed, 108 insertions(+), 55 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b5797b78add6..1f37fe7f77a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -119,7 +119,9 @@ void update_rq_clock(struct rq *rq)
 {
 	s64 delta;
 
-	if (rq->skip_clock_update > 0)
+	lockdep_assert_held(&rq->lock);
+
+	if (rq->clock_skip_update & RQCF_ACT_SKIP)
 		return;
 
 	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
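
For context: the old integer rq->skip_clock_update is replaced by a small request/act bitmask. A sketch of the flags and the rq_clock_skip_update() helper used later in this patch, roughly as they appear in kernel/sched/sched.h of the same series (exact bodies may differ):

	/* kernel/sched/sched.h -- sketch, not part of this file's diff */
	#define RQCF_REQ_SKIP	0x01	/* a clock update skip was requested */
	#define RQCF_ACT_SKIP	0x02	/* the skip is active for this __schedule() pass */

	static inline void rq_clock_skip_update(struct rq *rq, bool skip)
	{
		lockdep_assert_held(&rq->lock);
		if (skip)
			rq->clock_skip_update |= RQCF_REQ_SKIP;
		else
			rq->clock_skip_update &= ~RQCF_REQ_SKIP;
	}

A requested skip only takes effect once __schedule() promotes REQ to ACT via the "rq->clock_skip_update <<= 1" line further down in this patch, and it is cleared again before the next pass, so at most one back-to-back clock update is suppressed.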
@@ -490,6 +492,11 @@ static __init void init_hrtick(void)
  */
 void hrtick_start(struct rq *rq, u64 delay)
 {
+	/*
+	 * Don't schedule slices shorter than 10000ns, that just
+	 * doesn't make sense. Rely on vruntime for fairness.
+	 */
+	delay = max_t(u64, delay, 10000LL);
 	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
 				 HRTIMER_MODE_REL_PINNED, 0);
 }
@@ -1046,7 +1053,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * this case, we can save a useless back to back clock update.
 	 */
 	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
-		rq->skip_clock_update = 1;
+		rq_clock_skip_update(rq, true);
 }
 
 #ifdef CONFIG_SMP
@@ -1082,7 +1089,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
-		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
+		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
 	}
 
 	__set_task_cpu(p, new_cpu);
@@ -1814,6 +1821,10 @@ void __dl_clear_params(struct task_struct *p)
 	dl_se->dl_period = 0;
 	dl_se->flags = 0;
 	dl_se->dl_bw = 0;
+
+	dl_se->dl_throttled = 0;
+	dl_se->dl_new = 1;
+	dl_se->dl_yielded = 0;
 }
 
 /*
@@ -1832,6 +1843,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.nr_migrations = 0;
 	p->se.vruntime = 0;
+#ifdef CONFIG_SMP
+	p->se.avg.decay_count = 0;
+#endif
 	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
@@ -1839,7 +1853,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 #endif
 
 	RB_CLEAR_NODE(&p->dl.rb_node);
-	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	init_dl_task_timer(&p->dl);
 	__dl_clear_params(p);
 
 	INIT_LIST_HEAD(&p->rt.run_list);
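
The open-coded hrtimer_init() is replaced by init_dl_task_timer(), which also wires up the deadline timer's callback. Roughly, from kernel/sched/deadline.c (sketch, body approximate):

	/* kernel/sched/deadline.c -- sketch, not part of this file's diff */
	void init_dl_task_timer(struct sched_dl_entity *dl_se)
	{
		struct hrtimer *timer = &dl_se->dl_timer;

		hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		timer->function = dl_task_timer;	/* replenishment / unthrottle handler */
	}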
@@ -2049,6 +2063,9 @@ static inline int dl_bw_cpus(int i)
  * allocated bandwidth to reflect the new situation.
  *
  * This function is called while holding p's rq->lock.
+ *
+ * XXX we should delay bw change until the task's 0-lag point, see
+ * __setparam_dl().
  */
 static int dl_overflow(struct task_struct *p, int policy,
 		       const struct sched_attr *attr)
@@ -2748,6 +2765,10 @@ again:
  *      - explicit schedule() call
  *      - return from syscall or exception to user-space
  *      - return from interrupt-handler to user-space
+ *
+ * WARNING: all callers must re-check need_resched() afterward and reschedule
+ * accordingly in case an event triggered the need for rescheduling (such as
+ * an interrupt waking up a task) while preemption was disabled in __schedule().
  */
 static void __sched __schedule(void)
 {
@@ -2756,7 +2777,6 @@ static void __sched __schedule(void)
 	struct rq *rq;
 	int cpu;
 
-need_resched:
 	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
@@ -2776,6 +2796,8 @@ need_resched:
 	smp_mb__before_spinlock();
 	raw_spin_lock_irq(&rq->lock);
 
+	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
+
 	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
 		if (unlikely(signal_pending_state(prev->state, prev))) {
@@ -2800,13 +2822,13 @@ need_resched:
 		switch_count = &prev->nvcsw;
 	}
 
-	if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
+	if (task_on_rq_queued(prev))
 		update_rq_clock(rq);
 
 	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
-	rq->skip_clock_update = 0;
+	rq->clock_skip_update = 0;
 
 	if (likely(prev != next)) {
 		rq->nr_switches++;
@@ -2821,8 +2843,6 @@ need_resched:
 	post_schedule(rq);
 
 	sched_preempt_enable_no_resched();
-	if (need_resched())
-		goto need_resched;
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -2842,7 +2862,9 @@ asmlinkage __visible void __sched schedule(void)
 	struct task_struct *tsk = current;
 
 	sched_submit_work(tsk);
-	__schedule();
+	do {
+		__schedule();
+	} while (need_resched());
 }
 EXPORT_SYMBOL(schedule);
 
@@ -2877,6 +2899,21 @@ void __sched schedule_preempt_disabled(void)
 	preempt_disable();
 }
 
+static void preempt_schedule_common(void)
+{
+	do {
+		__preempt_count_add(PREEMPT_ACTIVE);
+		__schedule();
+		__preempt_count_sub(PREEMPT_ACTIVE);
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+		barrier();
+	} while (need_resched());
+}
+
 #ifdef CONFIG_PREEMPT
 /*
  * this is the entry point to schedule() from in-kernel preemption
@@ -2892,17 +2929,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 	if (likely(!preemptible()))
 		return;
 
-	do {
-		__preempt_count_add(PREEMPT_ACTIVE);
-		__schedule();
-		__preempt_count_sub(PREEMPT_ACTIVE);
-
-		/*
-		 * Check again in case we missed a preemption opportunity
-		 * between schedule and now.
-		 */
-		barrier();
-	} while (need_resched());
+	preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
 EXPORT_SYMBOL(preempt_schedule);
@@ -3251,15 +3278,31 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 {
 	struct sched_dl_entity *dl_se = &p->dl;
 
-	init_dl_task_timer(dl_se);
 	dl_se->dl_runtime = attr->sched_runtime;
 	dl_se->dl_deadline = attr->sched_deadline;
 	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
 	dl_se->flags = attr->sched_flags;
 	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
-	dl_se->dl_throttled = 0;
-	dl_se->dl_new = 1;
-	dl_se->dl_yielded = 0;
+
+	/*
+	 * Changing the parameters of a task is 'tricky' and we're not doing
+	 * the correct thing -- also see task_dead_dl() and switched_from_dl().
+	 *
+	 * What we SHOULD do is delay the bandwidth release until the 0-lag
+	 * point. This would include retaining the task_struct until that time
+	 * and change dl_overflow() to not immediately decrement the current
+	 * amount.
+	 *
+	 * Instead we retain the current runtime/deadline and let the new
+	 * parameters take effect after the current reservation period lapses.
+	 * This is safe (albeit pessimistic) because the 0-lag point is always
+	 * before the current scheduling deadline.
+	 *
+	 * We can still have temporary overloads because we do not delay the
+	 * change in bandwidth until that time; so admission control is
+	 * not on the safe side. It does however guarantee tasks will never
+	 * consume more than promised.
+	 */
 }
 
 /*
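
To make the "0-lag point" in the new comment concrete: for a constant-bandwidth reservation it is the instant at which the remaining runtime, consumed at the reserved bandwidth dl_runtime/dl_period, would reach zero. A hypothetical helper, for illustration only (no such function is added by this patch):

	/* Hypothetical, illustration only -- not part of this patch. */
	static u64 dl_zero_lag_time(const struct sched_dl_entity *dl_se)
	{
		/*
		 * Remaining runtime divided by the reserved bandwidth,
		 * i.e. runtime * dl_period / dl_runtime, counted back
		 * from the absolute deadline.
		 */
		u64 slack = div64_u64((u64)dl_se->runtime * dl_se->dl_period,
				      dl_se->dl_runtime);

		return dl_se->deadline - slack;
	}

Because the remaining runtime is never negative, this point never lies after the current deadline, which is what makes "let the old reservation lapse first" safe, if pessimistic.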
@@ -3382,6 +3425,20 @@ static bool check_same_owner(struct task_struct *p)
 	return match;
 }
 
+static bool dl_param_changed(struct task_struct *p,
+		const struct sched_attr *attr)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	if (dl_se->dl_runtime != attr->sched_runtime ||
+	    dl_se->dl_deadline != attr->sched_deadline ||
+	    dl_se->dl_period != attr->sched_period ||
+	    dl_se->flags != attr->sched_flags)
+		return true;
+
+	return false;
+}
+
 static int __sched_setscheduler(struct task_struct *p,
 				const struct sched_attr *attr,
 				bool user)
@@ -3510,7 +3567,7 @@ recheck:
 			goto change;
 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
 			goto change;
-		if (dl_policy(policy))
+		if (dl_policy(policy) && dl_param_changed(p, attr))
 			goto change;
 
 		p->sched_reset_on_fork = reset_on_fork;
@@ -4202,17 +4259,10 @@ SYSCALL_DEFINE0(sched_yield)
 	return 0;
 }
 
-static void __cond_resched(void)
-{
-	__preempt_count_add(PREEMPT_ACTIVE);
-	__schedule();
-	__preempt_count_sub(PREEMPT_ACTIVE);
-}
-
 int __sched _cond_resched(void)
 {
 	if (should_resched()) {
-		__cond_resched();
+		preempt_schedule_common();
 		return 1;
 	}
 	return 0;
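
All three cond_resched variants now funnel into preempt_schedule_common() instead of the removed __cond_resched(). The gate is should_resched(), which at this point in the tree is roughly the generic version below (asm-generic/preempt.h; architectures may provide their own):

	/* asm-generic/preempt.h -- sketch */
	static __always_inline bool should_resched(void)
	{
		/*
		 * Reschedule only when preemption is not otherwise held off
		 * and the current task has TIF_NEED_RESCHED set.
		 */
		return unlikely(!preempt_count() && tif_need_resched());
	}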
@@ -4237,7 +4287,7 @@ int __cond_resched_lock(spinlock_t *lock)
 	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
-			__cond_resched();
+			preempt_schedule_common();
 		else
 			cpu_relax();
 		ret = 1;
@@ -4253,7 +4303,7 @@ int __sched __cond_resched_softirq(void)
 
 	if (should_resched()) {
 		local_bh_enable();
-		__cond_resched();
+		preempt_schedule_common();
 		local_bh_disable();
 		return 1;
 	}
@@ -4508,9 +4558,10 @@ void sched_show_task(struct task_struct *p)
 {
 	unsigned long free = 0;
 	int ppid;
-	unsigned state;
+	unsigned long state = p->state;
 
-	state = p->state ? __ffs(p->state) + 1 : 0;
+	if (state)
+		state = __ffs(state) + 1;
 	printk(KERN_INFO "%-15.15s %c", p->comm,
 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
 #if BITS_PER_LONG == 32
@@ -4642,6 +4693,9 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
 	struct dl_bw *cur_dl_b;
 	unsigned long flags;
 
+	if (!cpumask_weight(cur))
+		return ret;
+
 	rcu_read_lock_sched();
 	cur_dl_b = dl_bw_of(cpumask_any(cur));
 	trial_cpus = cpumask_weight(trial);
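
The new early return matters because cpumask_any() on an empty mask yields an out-of-range CPU number, which dl_bw_of() would then use to look up per-CPU data. Simplified, from include/linux/cpumask.h:

	/* include/linux/cpumask.h -- sketch */
	#define cpumask_any(srcp)	cpumask_first(srcp)
	/*
	 * cpumask_first() returns >= nr_cpu_ids when no bit is set, so an
	 * empty 'cur' mask has to be rejected before calling dl_bw_of().
	 */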
@@ -4740,7 +4794,7 @@ static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
 
 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
-	if (p->sched_class && p->sched_class->set_cpus_allowed)
+	if (p->sched_class->set_cpus_allowed)
 		p->sched_class->set_cpus_allowed(p, new_mask);
 
 	cpumask_copy(&p->cpus_allowed, new_mask);
@@ -7113,9 +7167,6 @@ void __init sched_init(void)
 #ifdef CONFIG_RT_GROUP_SCHED
 	alloc_size += 2 * nr_cpu_ids * sizeof(void **);
 #endif
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	alloc_size += num_possible_cpus() * cpumask_size();
-#endif
 	if (alloc_size) {
 		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
@@ -7135,13 +7186,13 @@ void __init sched_init(void)
 		ptr += nr_cpu_ids * sizeof(void **);
 
 #endif /* CONFIG_RT_GROUP_SCHED */
+	}
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	for_each_possible_cpu(i) {
-		per_cpu(load_balance_mask, i) = (void *)ptr;
-		ptr += cpumask_size();
-	}
-#endif /* CONFIG_CPUMASK_OFFSTACK */
+		per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
+			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
 	}
+#endif /* CONFIG_CPUMASK_OFFSTACK */
 
 	init_rt_bandwidth(&def_rt_bandwidth,
 			  global_rt_period(), global_rt_runtime());
@@ -7253,6 +7304,11 @@ void __init sched_init(void)
 	enter_lazy_tlb(&init_mm, current);
 
 	/*
+	 * During early bootup we pretend to be a normal task:
+	 */
+	current->sched_class = &fair_sched_class;
+
+	/*
 	 * Make us the idle thread. Technically, schedule() should not be
 	 * called from this thread, however somewhere below it might be,
 	 * but because we are the idle thread, we just pick up running again
@@ -7262,11 +7318,6 @@ void __init sched_init(void)
 
 	calc_load_update = jiffies + LOAD_FREQ;
 
-	/*
-	 * During early bootup we pretend to be a normal task:
-	 */
-	current->sched_class = &fair_sched_class;
-
 #ifdef CONFIG_SMP
 	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
 	/* May be allocated at isolcpus cmdline parse time */
@@ -7295,13 +7346,12 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 	 * since we will exit with TASK_RUNNING make sure we enter with it,
 	 * otherwise we will destroy state.
 	 */
-	if (WARN_ONCE(current->state != TASK_RUNNING,
+	WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
 			"do not call blocking ops when !TASK_RUNNING; "
 			"state=%lx set at [<%p>] %pS\n",
 			current->state,
 			(void *)current->task_state_change,
-			(void *)current->task_state_change))
-		__set_current_state(TASK_RUNNING);
+			(void *)current->task_state_change);
 
 	___might_sleep(file, line, preempt_offset);
 }
@@ -7328,6 +7378,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 			in_atomic(), irqs_disabled(),
 			current->pid, current->comm);
 
+	if (task_stack_end_corrupted(current))
+		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
 	debug_show_held_locks(current);
 	if (irqs_disabled())
 		print_irqtrace_events(current);
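
task_stack_end_corrupted() is the stack-end magic check, approximately (include/linux/sched.h):

	/* include/linux/sched.h -- sketch */
	#define task_stack_end_corrupted(task) \
			(*(end_of_stack(task)) != STACK_END_MAGIC)

Printing this from ___might_sleep() helps explain otherwise baffling "sleeping while atomic" reports caused by a thread having already overrun its stack.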