-rw-r--r--  include/linux/kthread.h          |  1
-rw-r--r--  include/linux/sched.h            |  2
-rw-r--r--  kernel/kthread.c                 | 30
-rw-r--r--  kernel/sched/core.c              | 67
-rw-r--r--  kernel/sched/cpufreq_schedutil.c |  2
-rw-r--r--  kernel/sched/fair.c              | 45
-rw-r--r--  kernel/sched/rt.c                | 16
-rw-r--r--  kernel/sched/sched.h             | 11
8 files changed, 99 insertions, 75 deletions
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2803264c512f..c1961761311d 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -62,7 +62,6 @@ void *kthread_probe_data(struct task_struct *k);
 int kthread_park(struct task_struct *k);
 void kthread_unpark(struct task_struct *k);
 void kthread_parkme(void);
-void kthread_park_complete(struct task_struct *k);
 
 int kthreadd(void *unused);
 extern struct task_struct *kthreadd_task;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9256118bd40c..43731fe51c97 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -118,7 +118,7 @@ struct task_group;
  * the comment with set_special_state().
  */
 #define is_special_task_state(state)                                  \
-        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD))
+        ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD))
 
 #define __set_current_state(state_value)                              \
         do {                                                          \
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 481951bf091d..750cb8082694 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -177,9 +177,20 @@ void *kthread_probe_data(struct task_struct *task)
 static void __kthread_parkme(struct kthread *self)
 {
         for (;;) {
-                set_current_state(TASK_PARKED);
+                /*
+                 * TASK_PARKED is a special state; we must serialize against
+                 * possible pending wakeups to avoid store-store collisions on
+                 * task->state.
+                 *
+                 * Such a collision might possibly result in the task state
+                 * changing from TASK_PARKED and us failing the
+                 * wait_task_inactive() in kthread_park().
+                 */
+                set_special_state(TASK_PARKED);
                 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
                         break;
+
+                complete_all(&self->parked);
                 schedule();
         }
         __set_current_state(TASK_RUNNING);
@@ -191,11 +202,6 @@ void kthread_parkme(void)
 }
 EXPORT_SYMBOL_GPL(kthread_parkme);
 
-void kthread_park_complete(struct task_struct *k)
-{
-        complete_all(&to_kthread(k)->parked);
-}
-
 static int kthread(void *_create)
 {
         /* Copy data: it's on kthread's stack */
@@ -461,6 +467,9 @@ void kthread_unpark(struct task_struct *k)
 
         reinit_completion(&kthread->parked);
         clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+        /*
+         * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
+         */
         wake_up_state(k, TASK_PARKED);
 }
 EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -487,7 +496,16 @@ int kthread_park(struct task_struct *k)
         set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
         if (k != current) {
                 wake_up_process(k);
+                /*
+                 * Wait for __kthread_parkme() to complete(), this means we
+                 * _will_ have TASK_PARKED and are about to call schedule().
+                 */
                 wait_for_completion(&kthread->parked);
+                /*
+                 * Now wait for that schedule() to complete and the task to
+                 * get scheduled out.
+                 */
+                WARN_ON_ONCE(!wait_task_inactive(k, TASK_PARKED));
         }
 
         return 0;
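
For readers following the handshake the comments above describe, here is a minimal userspace sketch of the same park/unpark protocol, using POSIX threads. It is an illustration, not kernel code: all names are made up, a mutex/condvar pair stands in for the completion and wakeup machinery, and the final wait_task_inactive() step (waiting until the parked thread is actually off the CPU) has no direct userspace equivalent, so it is omitted.

/* Illustrative userspace analogue of kthread_park()/__kthread_parkme(). */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

struct parker {
        int should_park;                /* ~ KTHREAD_SHOULD_PARK */
        int parked;                     /* ~ the 'parked' completion */
        pthread_mutex_t lock;
        pthread_cond_t cond;
};

static struct parker p = {
        .lock = PTHREAD_MUTEX_INITIALIZER,
        .cond = PTHREAD_COND_INITIALIZER,
};

static void parkme(void)                /* ~ __kthread_parkme() */
{
        pthread_mutex_lock(&p.lock);
        while (p.should_park) {
                p.parked = 1;           /* ~ complete_all(&self->parked) */
                pthread_cond_broadcast(&p.cond);
                pthread_cond_wait(&p.cond, &p.lock);    /* ~ schedule() */
        }
        pthread_mutex_unlock(&p.lock);
}

static void park(void)                  /* ~ kthread_park() */
{
        pthread_mutex_lock(&p.lock);
        p.should_park = 1;
        pthread_cond_broadcast(&p.cond);        /* ~ wake_up_process() */
        while (!p.parked)                       /* ~ wait_for_completion() */
                pthread_cond_wait(&p.cond, &p.lock);
        pthread_mutex_unlock(&p.lock);
}

static void unpark(void)                /* ~ kthread_unpark() */
{
        pthread_mutex_lock(&p.lock);
        p.parked = 0;                           /* ~ reinit_completion() */
        p.should_park = 0;
        pthread_cond_broadcast(&p.cond);        /* ~ wake_up_state(TASK_PARKED) */
        pthread_mutex_unlock(&p.lock);
}

static void *worker(void *arg)
{
        for (int i = 0; i < 3; i++) {
                parkme();
                printf("worker: iteration %d\n", i);
                usleep(100 * 1000);
        }
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        usleep(50 * 1000);
        park();
        printf("parker: worker is parked\n");
        unpark();
        pthread_join(t, NULL);
        return 0;
}

The point mirrored here is the ordering the new comments spell out: the parker only proceeds once the parkee has both published its parked state and committed to blocking.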
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78d8facba456..fe365c9a08e9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7,7 +7,6 @@
  */
 #include "sched.h"
 
-#include <linux/kthread.h>
 #include <linux/nospec.h>
 
 #include <linux/kcov.h>
@@ -2724,28 +2723,20 @@ static struct rq *finish_task_switch(struct task_struct *prev)
                 membarrier_mm_sync_core_before_usermode(mm);
                 mmdrop(mm);
         }
-        if (unlikely(prev_state & (TASK_DEAD|TASK_PARKED))) {
-                switch (prev_state) {
-                case TASK_DEAD:
-                        if (prev->sched_class->task_dead)
-                                prev->sched_class->task_dead(prev);
-
-                        /*
-                         * Remove function-return probe instances associated with this
-                         * task and put them back on the free list.
-                         */
-                        kprobe_flush_task(prev);
-
-                        /* Task is done with its stack. */
-                        put_task_stack(prev);
-
-                        put_task_struct(prev);
-                        break;
-
-                case TASK_PARKED:
-                        kthread_park_complete(prev);
-                        break;
-                }
+        if (unlikely(prev_state == TASK_DEAD)) {
+                if (prev->sched_class->task_dead)
+                        prev->sched_class->task_dead(prev);
+
+                /*
+                 * Remove function-return probe instances associated with this
+                 * task and put them back on the free list.
+                 */
+                kprobe_flush_task(prev);
+
+                /* Task is done with its stack. */
+                put_task_stack(prev);
+
+                put_task_struct(prev);
         }
 
         tick_nohz_task_switch();
@@ -3113,7 +3104,9 @@ static void sched_tick_remote(struct work_struct *work)
         struct tick_work *twork = container_of(dwork, struct tick_work, work);
         int cpu = twork->cpu;
         struct rq *rq = cpu_rq(cpu);
+        struct task_struct *curr;
         struct rq_flags rf;
+        u64 delta;
 
         /*
          * Handle the tick only if it appears the remote CPU is running in full
@@ -3122,24 +3115,28 @@ static void sched_tick_remote(struct work_struct *work)
          * statistics and checks timeslices in a time-independent way, regardless
          * of when exactly it is running.
          */
-        if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
-                struct task_struct *curr;
-                u64 delta;
+        if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
+                goto out_requeue;
 
-                rq_lock_irq(rq, &rf);
-                update_rq_clock(rq);
-                curr = rq->curr;
-                delta = rq_clock_task(rq) - curr->se.exec_start;
+        rq_lock_irq(rq, &rf);
+        curr = rq->curr;
+        if (is_idle_task(curr))
+                goto out_unlock;
 
-                /*
-                 * Make sure the next tick runs within a reasonable
-                 * amount of time.
-                 */
-                WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
-                curr->sched_class->task_tick(rq, curr, 0);
-                rq_unlock_irq(rq, &rf);
-        }
+        update_rq_clock(rq);
+        delta = rq_clock_task(rq) - curr->se.exec_start;
+
+        /*
+         * Make sure the next tick runs within a reasonable
+         * amount of time.
+         */
+        WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
+        curr->sched_class->task_tick(rq, curr, 0);
+
+out_unlock:
+        rq_unlock_irq(rq, &rf);
 
+out_requeue:
         /*
          * Run the remote tick once per second (1Hz). This arbitrary
          * frequency is large enough to avoid overload but short enough
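
The sched_tick_remote() change above is largely a control-flow rework: the body moves out of one big nested if into straight-line code that bails out early via goto out_unlock / goto out_requeue, and a new is_idle_task() check is taken under the runqueue lock. Below is a standalone sketch of that shape only, with a pthread mutex in place of the runqueue lock and placeholder predicates; every name in it is illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static bool worth_running(void)  { return true; }   /* ~ !idle_cpu() && tick stopped */
static bool target_is_idle(void) { return false; }  /* ~ is_idle_task(curr) */

static void remote_tick_once(void)
{
        if (!worth_running())
                goto out_requeue;       /* nothing to do, just re-arm */

        pthread_mutex_lock(&lock);
        if (target_is_idle())
                goto out_unlock;        /* raced with the target going idle */

        printf("periodic work done under the lock\n");

out_unlock:
        pthread_mutex_unlock(&lock);

out_requeue:
        printf("re-arming the periodic work\n");
}

int main(void)
{
        remote_tick_once();
        return 0;
}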
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3cde46483f0a..c907fde01eaa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -192,7 +192,7 @@ static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
         struct rq *rq = cpu_rq(sg_cpu->cpu);
 
-        if (rq->rt.rt_nr_running)
+        if (rt_rq_is_runnable(&rq->rt))
                 return sg_cpu->max;
 
         /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1866e64792a7..2f0a0be4d344 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3982,18 +3982,10 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
         if (!sched_feat(UTIL_EST))
                 return;
 
-        /*
-         * Update root cfs_rq's estimated utilization
-         *
-         * If *p is the last task then the root cfs_rq's estimated utilization
-         * of a CPU is 0 by definition.
-         */
-        ue.enqueued = 0;
-        if (cfs_rq->nr_running) {
-                ue.enqueued = cfs_rq->avg.util_est.enqueued;
-                ue.enqueued -= min_t(unsigned int, ue.enqueued,
-                                     (_task_util_est(p) | UTIL_AVG_UNCHANGED));
-        }
+        /* Update root cfs_rq's estimated utilization */
+        ue.enqueued = cfs_rq->avg.util_est.enqueued;
+        ue.enqueued -= min_t(unsigned int, ue.enqueued,
+                             (_task_util_est(p) | UTIL_AVG_UNCHANGED));
         WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
 
         /*
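
The replacement above drops the "last task means zero" special case and simply subtracts the task's estimate from the runqueue total, with min_t() clamping so the unsigned value cannot wrap below zero. A tiny standalone illustration of that saturating subtract (the names are made up):

#include <stdio.h>

/* Underflow-safe decrement, the same shape as the min_t() clamp above. */
static unsigned int sub_clamped(unsigned int total, unsigned int part)
{
        return total - (part < total ? part : total);
}

int main(void)
{
        printf("%u\n", sub_clamped(300, 120));  /* 180 */
        printf("%u\n", sub_clamped(100, 120));  /* 0, not a huge wrapped value */
        return 0;
}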
@@ -4590,6 +4582,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
         now = sched_clock_cpu(smp_processor_id());
         cfs_b->runtime = cfs_b->quota;
         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+        cfs_b->expires_seq++;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4612,6 +4605,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         struct task_group *tg = cfs_rq->tg;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
         u64 amount = 0, min_amount, expires;
+        int expires_seq;
 
         /* note: this is a positive sum as runtime_remaining <= 0 */
         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4628,6 +4622,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                         cfs_b->idle = 0;
                 }
         }
+        expires_seq = cfs_b->expires_seq;
         expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
 
@@ -4637,8 +4632,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
          * spread between our sched_clock and the one on which runtime was
          * issued.
          */
-        if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+        if (cfs_rq->expires_seq != expires_seq) {
+                cfs_rq->expires_seq = expires_seq;
                 cfs_rq->runtime_expires = expires;
+        }
 
         return cfs_rq->runtime_remaining > 0;
 }
@@ -4664,12 +4661,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
          * has not truly expired.
          *
          * Fortunately we can check determine whether this the case by checking
-         * whether the global deadline has advanced. It is valid to compare
-         * cfs_b->runtime_expires without any locks since we only care about
-         * exact equality, so a partial write will still work.
+         * whether the global deadline(cfs_b->expires_seq) has advanced.
          */
-
-        if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+        if (cfs_rq->expires_seq == cfs_b->expires_seq) {
                 /* extend local deadline, drift is bounded above by 2 ticks */
                 cfs_rq->runtime_expires += TICK_NSEC;
         } else {
@@ -5202,13 +5196,18 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
+        u64 overrun;
+
         lockdep_assert_held(&cfs_b->lock);
 
-        if (!cfs_b->period_active) {
-                cfs_b->period_active = 1;
-                hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
-                hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
-        }
+        if (cfs_b->period_active)
+                return;
+
+        cfs_b->period_active = 1;
+        overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+        cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
+        cfs_b->expires_seq++;
+        hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
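
The fair.c changes above replace a timestamp comparison with a sequence count: the global bandwidth pool bumps expires_seq whenever a new period deadline is issued, and each local runqueue remembers the sequence under which its runtime was handed out, so "has the global deadline advanced?" no longer depends on comparing u64 clock values taken on different CPUs. A small standalone sketch of the idea follows; the struct and function names are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct global_pool {
        int      expires_seq;           /* bumped once per refill/period */
        uint64_t runtime_expires;       /* absolute deadline of the current period */
};

struct local_cache {
        int      expires_seq;           /* sequence this cache was issued under */
        uint64_t runtime_expires;
        int64_t  runtime_remaining;
};

/* hand out runtime and sync the local deadline by sequence, not by time */
static void assign_runtime(struct local_cache *lc, const struct global_pool *gp,
                           int64_t amount)
{
        lc->runtime_remaining += amount;
        if (lc->expires_seq != gp->expires_seq) {
                lc->expires_seq = gp->expires_seq;
                lc->runtime_expires = gp->runtime_expires;
        }
}

/* a locally observed expiry only counts if the global period also moved on */
static bool expiry_is_real(const struct local_cache *lc, const struct global_pool *gp)
{
        return lc->expires_seq != gp->expires_seq;
}

int main(void)
{
        struct global_pool gp = { .expires_seq = 1, .runtime_expires = 1000 };
        struct local_cache lc = { 0 };

        assign_runtime(&lc, &gp, 100);
        printf("same period, expiry is real? %s\n", expiry_is_real(&lc, &gp) ? "yes" : "no");

        gp.expires_seq++;               /* global refill started a new period */
        gp.runtime_expires += 1000;
        printf("after refill, expiry is real? %s\n", expiry_is_real(&lc, &gp) ? "yes" : "no");
        return 0;
}

The expire_cfs_rq_runtime() hunk runs the same test in the other direction: equal sequence numbers mean a locally observed expiry is just clock drift, so the local deadline is nudged forward by a tick instead of being treated as a real expiry.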
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 47556b0c9a95..572567078b60 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -508,8 +508,11 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 
         rt_se = rt_rq->tg->rt_se[cpu];
 
-        if (!rt_se)
+        if (!rt_se) {
                 dequeue_top_rt_rq(rt_rq);
+                /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
+                cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
+        }
         else if (on_rt_rq(rt_se))
                 dequeue_rt_entity(rt_se, 0);
 }
@@ -1001,8 +1004,6 @@ dequeue_top_rt_rq(struct rt_rq *rt_rq)
         sub_nr_running(rq, rt_rq->rt_nr_running);
         rt_rq->rt_queued = 0;
 
-        /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
-        cpufreq_update_util(rq, 0);
 }
 
 static void
@@ -1014,11 +1015,14 @@ enqueue_top_rt_rq(struct rt_rq *rt_rq)
 
         if (rt_rq->rt_queued)
                 return;
-        if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
+
+        if (rt_rq_throttled(rt_rq))
                 return;
 
-        add_nr_running(rq, rt_rq->rt_nr_running);
-        rt_rq->rt_queued = 1;
+        if (rt_rq->rt_nr_running) {
+                add_nr_running(rq, rt_rq->rt_nr_running);
+                rt_rq->rt_queued = 1;
+        }
 
         /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
         cpufreq_update_util(rq, 0);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 6601baf2361c..c7742dcc136c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -334,9 +334,10 @@ struct cfs_bandwidth {
         u64                     runtime;
         s64                     hierarchical_quota;
         u64                     runtime_expires;
+        int                     expires_seq;
 
-        int                     idle;
-        int                     period_active;
+        short                   idle;
+        short                   period_active;
         struct hrtimer          period_timer;
         struct hrtimer          slack_timer;
         struct list_head        throttled_cfs_rq;
@@ -551,6 +552,7 @@ struct cfs_rq {
 
 #ifdef CONFIG_CFS_BANDWIDTH
         int                     runtime_enabled;
+        int                     expires_seq;
         u64                     runtime_expires;
         s64                     runtime_remaining;
 
@@ -609,6 +611,11 @@ struct rt_rq {
 #endif
 };
 
+static inline bool rt_rq_is_runnable(struct rt_rq *rt_rq)
+{
+        return rt_rq->rt_queued && rt_rq->rt_nr_running;
+}
+
 /* Deadline class' related fields in a runqueue */
 struct dl_rq {
         /* runqueue is an rbtree, ordered by deadline */
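
The new rt_rq_is_runnable() helper ties together the cpufreq_schedutil.c and rt.c hunks above: schedutil now asks whether the RT runqueue is both queued at the top level and has tasks, rather than looking at rt_nr_running alone, so a throttled or dequeued RT runqueue no longer pins the frequency request at max. A standalone sketch of how the predicate is consumed; everything outside rt_rq_is_runnable() is illustrative, not the kernel's code.

#include <stdbool.h>
#include <stdio.h>

struct rt_rq_sketch {
        int rt_queued;                  /* enqueued at the root runqueue */
        unsigned int rt_nr_running;
};

static bool rt_rq_is_runnable(const struct rt_rq_sketch *rt_rq)
{
        return rt_rq->rt_queued && rt_rq->rt_nr_running;
}

static unsigned long aggregate_util(const struct rt_rq_sketch *rt,
                                    unsigned long max, unsigned long cfs_util)
{
        /* RT pins the request at max only while it is genuinely runnable */
        if (rt_rq_is_runnable(rt))
                return max;
        return cfs_util;
}

int main(void)
{
        struct rt_rq_sketch rt = { .rt_queued = 0, .rt_nr_running = 1 };

        /* throttled/dequeued RT no longer forces the max frequency */
        printf("util = %lu\n", aggregate_util(&rt, 1024, 300));

        rt.rt_queued = 1;
        printf("util = %lu\n", aggregate_util(&rt, 1024, 300));
        return 0;
}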