author	Linus Torvalds <torvalds@linux-foundation.org>	2018-01-30 14:55:56 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-01-30 14:55:56 -0500
commit	af8c5e2d6071c71d228788d1ebb0b9676829001a (patch)
tree	c898379e89ed05fdc5c6b7ebddbf4a8d50f11657
parent	a1c75e17e7d1306d35d51d3c330a13f42eba1d2d (diff)
parent	07881166a892fa4908ac4924660a7793f75d6544 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Implement frequency/CPU invariance and OPP selection for
     SCHED_DEADLINE (Juri Lelli)

   - Tweak the task migration logic for better multi-tasking
     workload scalability (Mel Gorman)

   - Misc cleanups, fixes and improvements"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/deadline: Make bandwidth enforcement scale-invariant
  sched/cpufreq: Move arch_scale_{freq,cpu}_capacity() outside of #ifdef CONFIG_SMP
  sched/cpufreq: Remove arch_scale_freq_capacity()'s 'sd' parameter
  sched/cpufreq: Always consider all CPUs when deciding next freq
  sched/cpufreq: Split utilization signals
  sched/cpufreq: Change the worker kthread to SCHED_DEADLINE
  sched/deadline: Move CPU frequency selection triggering points
  sched/cpufreq: Use the DEADLINE utilization signal
  sched/deadline: Implement "runtime overrun signal" support
  sched/fair: Only immediately migrate tasks due to interrupts if prev and target CPUs share cache
  sched/fair: Correct obsolete comment about cpufreq_update_util()
  sched/fair: Remove impossible condition from find_idlest_group_cpu()
  sched/cpufreq: Don't pass flags to sugov_set_iowait_boost()
  sched/cpufreq: Initialize sg_cpu->flags to 0
  sched/fair: Consider RT/IRQ pressure in capacity_spare_wake()
  sched/fair: Use 'unsigned long' for utilization, consistently
  sched/core: Rework and clarify prepare_lock_switch()
  sched/fair: Remove unused 'curr' parameter from wakeup_gran
  sched/headers: Constify object_is_on_stack()
-rw-r--r--	include/linux/arch_topology.h	2
-rw-r--r--	include/linux/sched.h	5
-rw-r--r--	include/linux/sched/cpufreq.h	2
-rw-r--r--	include/linux/sched/task_stack.h	2
-rw-r--r--	include/linux/sched/topology.h	12
-rw-r--r--	include/uapi/linux/sched.h	5
-rw-r--r--	kernel/sched/core.c	67
-rw-r--r--	kernel/sched/cpufreq_schedutil.c	93
-rw-r--r--	kernel/sched/deadline.c	143
-rw-r--r--	kernel/sched/fair.c	39
-rw-r--r--	kernel/sched/sched.h	112
-rw-r--r--	kernel/time/posix-cpu-timers.c	18
12 files changed, 339 insertions, 161 deletions
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index 304511267c82..2b709416de05 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -27,7 +27,7 @@ void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
 DECLARE_PER_CPU(unsigned long, freq_scale);
 
 static inline
-unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu)
+unsigned long topology_get_freq_scale(int cpu)
 {
 	return per_cpu(freq_scale, cpu);
 }
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68a504f6e474..166144c04ef6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -472,11 +472,15 @@ struct sched_dl_entity {
 	 * has not been executed yet. This flag is useful to avoid race
 	 * conditions between the inactive timer handler and the wakeup
 	 * code.
+	 *
+	 * @dl_overrun tells if the task asked to be informed about runtime
+	 * overruns.
 	 */
 	unsigned int			dl_throttled      : 1;
 	unsigned int			dl_boosted        : 1;
 	unsigned int			dl_yielded        : 1;
 	unsigned int			dl_non_contending : 1;
+	unsigned int			dl_overrun        : 1;
 
 	/*
 	 * Bandwidth enforcement timer. Each -deadline task has its
@@ -1427,6 +1431,7 @@ extern int idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
 extern int sched_setattr(struct task_struct *, const struct sched_attr *);
+extern int sched_setattr_nocheck(struct task_struct *, const struct sched_attr *);
 extern struct task_struct *idle_task(int cpu);
 
 /**
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index d1ad3d825561..0b55834efd46 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -12,8 +12,6 @@
 #define SCHED_CPUFREQ_DL	(1U << 1)
 #define SCHED_CPUFREQ_IOWAIT	(1U << 2)
 
-#define SCHED_CPUFREQ_RT_DL	(SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL)
-
 #ifdef CONFIG_CPU_FREQ
 struct update_util_data {
 	void (*func)(struct update_util_data *data, u64 time, unsigned int flags);
diff --git a/include/linux/sched/task_stack.h b/include/linux/sched/task_stack.h
index cb4828aaa34f..6a841929073f 100644
--- a/include/linux/sched/task_stack.h
+++ b/include/linux/sched/task_stack.h
@@ -78,7 +78,7 @@ static inline void put_task_stack(struct task_struct *tsk) {}
 #define task_stack_end_corrupted(task) \
 		(*(end_of_stack(task)) != STACK_END_MAGIC)
 
-static inline int object_is_on_stack(void *obj)
+static inline int object_is_on_stack(const void *obj)
 {
 	void *stack = task_stack_page(current);
 
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index cf257c2e728d..26347741ba50 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -7,6 +7,12 @@
 #include <linux/sched/idle.h>
 
 /*
+ * Increase resolution of cpu_capacity calculations
+ */
+#define SCHED_CAPACITY_SHIFT	SCHED_FIXEDPOINT_SHIFT
+#define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
+
+/*
  * sched-domains (multiprocessor balancing) declarations:
  */
 #ifdef CONFIG_SMP
@@ -27,12 +33,6 @@
 #define SD_OVERLAP		0x2000	/* sched_domains of this level overlap */
 #define SD_NUMA			0x4000	/* cross-node balancing */
 
-/*
- * Increase resolution of cpu_capacity calculations
- */
-#define SCHED_CAPACITY_SHIFT	SCHED_FIXEDPOINT_SHIFT
-#define SCHED_CAPACITY_SCALE	(1L << SCHED_CAPACITY_SHIFT)
-
 #ifdef CONFIG_SCHED_SMT
 static inline int cpu_smt_flags(void)
 {
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 30a9e51bbb1e..22627f80063e 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -49,5 +49,10 @@
  */
 #define SCHED_FLAG_RESET_ON_FORK	0x01
 #define SCHED_FLAG_RECLAIM		0x02
+#define SCHED_FLAG_DL_OVERRUN		0x04
+
+#define SCHED_FLAG_ALL	(SCHED_FLAG_RESET_ON_FORK	| \
+			 SCHED_FLAG_RECLAIM		| \
+			 SCHED_FLAG_DL_OVERRUN)
 
 #endif /* _UAPI_LINUX_SCHED_H */
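
The SCHED_FLAG_DL_OVERRUN flag added above is set by userspace through sched_setattr() and reported via SIGXCPU (see the kernel/time/posix-cpu-timers.c hunk further down). The sketch below is illustrative only and not part of this merge: struct sched_attr is declared locally because glibc does not wrap the syscall, and the runtime/deadline/period values are arbitrary example numbers.

/* Hypothetical userspace consumer of SCHED_FLAG_DL_OVERRUN (not in this patch). */
#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#define SCHED_DEADLINE		6
#define SCHED_FLAG_DL_OVERRUN	0x04

struct sched_attr {			/* glibc does not provide this */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* runtime/deadline/period in ns */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

static void on_overrun(int sig)
{
	/* The kernel raises SIGXCPU when the reservation runtime is overrun. */
	write(STDOUT_FILENO, "runtime overrun\n", 16);
}

int main(void)
{
	struct sched_attr attr;

	signal(SIGXCPU, on_overrun);

	memset(&attr, 0, sizeof(attr));
	attr.size	    = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_flags    = SCHED_FLAG_DL_OVERRUN;
	attr.sched_runtime  =  2 * 1000 * 1000;	/*  2 ms */
	attr.sched_deadline = 10 * 1000 * 1000;	/* 10 ms */
	attr.sched_period   = 10 * 1000 * 1000;	/* 10 ms */

	if (syscall(SYS_sched_setattr, 0, &attr, 0))
		perror("sched_setattr");

	for (;;)	/* burn CPU; overruns then trigger SIGXCPU */
		;
}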
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5a31a85bbd84..3da7a2444a91 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2046,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * If the owning (remote) CPU is still in the middle of schedule() with
 	 * this task as prev, wait until its done referencing the task.
 	 *
-	 * Pairs with the smp_store_release() in finish_lock_switch().
+	 * Pairs with the smp_store_release() in finish_task().
 	 *
 	 * This ensures that tasks getting woken will be fully ordered against
 	 * their previous state and preserve Program Order.
@@ -2572,6 +2572,50 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 
 #endif /* CONFIG_PREEMPT_NOTIFIERS */
 
+static inline void prepare_task(struct task_struct *next)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * Claim the task as running, we do this before switching to it
+	 * such that any running task will have this set.
+	 */
+	next->on_cpu = 1;
+#endif
+}
+
+static inline void finish_task(struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 *
+	 * In particular, the load of prev->state in finish_task_switch() must
+	 * happen before this.
+	 *
+	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+	 */
+	smp_store_release(&prev->on_cpu, 0);
+#endif
+}
+
+static inline void finish_lock_switch(struct rq *rq)
+{
+#ifdef CONFIG_DEBUG_SPINLOCK
+	/* this is a valid case when another task releases the spinlock */
+	rq->lock.owner = current;
+#endif
+	/*
+	 * If we are tracking spinlock dependencies then we have to
+	 * fix up the runqueue lock - which gets 'carried over' from
+	 * prev into current:
+	 */
+	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
+	raw_spin_unlock_irq(&rq->lock);
+}
+
 /**
  * prepare_task_switch - prepare to switch tasks
  * @rq: the runqueue preparing to switch
@@ -2592,7 +2636,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 	sched_info_switch(rq, prev, next);
 	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
-	prepare_lock_switch(rq, next);
+	prepare_task(next);
 	prepare_arch_switch(next);
 }
 
@@ -2647,7 +2691,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * the scheduled task must drop that reference.
 	 *
 	 * We must observe prev->state before clearing prev->on_cpu (in
-	 * finish_lock_switch), otherwise a concurrent wakeup can get prev
+	 * finish_task), otherwise a concurrent wakeup can get prev
 	 * running on another CPU and we could rave with its RUNNING -> DEAD
 	 * transition, resulting in a double drop.
 	 */
@@ -2664,7 +2708,8 @@ static struct rq *finish_task_switch(struct task_struct *prev)
 	 * to use.
 	 */
 	smp_mb__after_unlock_lock();
-	finish_lock_switch(rq, prev);
+	finish_task(prev);
+	finish_lock_switch(rq);
 	finish_arch_post_lock_switch();
 
 	fire_sched_in_preempt_notifiers(current);
@@ -4041,8 +4086,7 @@ recheck:
 			return -EINVAL;
 	}
 
-	if (attr->sched_flags &
-		~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
+	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
 		return -EINVAL;
 
 	/*
@@ -4109,6 +4153,9 @@ recheck:
 	}
 
 	if (user) {
+		if (attr->sched_flags & SCHED_FLAG_SUGOV)
+			return -EINVAL;
+
 		retval = security_task_setscheduler(p);
 		if (retval)
 			return retval;
@@ -4164,7 +4211,8 @@ change:
 	}
 #endif
 #ifdef CONFIG_SMP
-	if (dl_bandwidth_enabled() && dl_policy(policy)) {
+	if (dl_bandwidth_enabled() && dl_policy(policy) &&
+			!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
 		cpumask_t *span = rq->rd->span;
 
 		/*
@@ -4294,6 +4342,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
 }
 EXPORT_SYMBOL_GPL(sched_setattr);
 
+int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
+{
+	return __sched_setscheduler(p, attr, false, true);
+}
+
 /**
  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
  * @p: the task in question.
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index d6717a3331a1..dd062a1c8cf0 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -60,7 +60,8 @@ struct sugov_cpu {
 	u64 last_update;
 
 	/* The fields below are only needed when sharing a policy. */
-	unsigned long util;
+	unsigned long util_cfs;
+	unsigned long util_dl;
 	unsigned long max;
 	unsigned int flags;
 
@@ -176,21 +177,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 	return cpufreq_driver_resolve_freq(policy, freq);
 }
 
-static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
+static void sugov_get_util(struct sugov_cpu *sg_cpu)
 {
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long cfs_max;
+	struct rq *rq = cpu_rq(sg_cpu->cpu);
 
-	cfs_max = arch_scale_cpu_capacity(NULL, cpu);
+	sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
+	sg_cpu->util_cfs = cpu_util_cfs(rq);
+	sg_cpu->util_dl = cpu_util_dl(rq);
+}
 
-	*util = min(rq->cfs.avg.util_avg, cfs_max);
-	*max = cfs_max;
+static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
+{
+	/*
+	 * Ideally we would like to set util_dl as min/guaranteed freq and
+	 * util_cfs + util_dl as requested freq. However, cpufreq is not yet
+	 * ready for such an interface. So, we only do the latter for now.
+	 */
+	return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max);
 }
 
-static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
-				   unsigned int flags)
+static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time)
 {
-	if (flags & SCHED_CPUFREQ_IOWAIT) {
+	if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) {
 		if (sg_cpu->iowait_boost_pending)
 			return;
 
@@ -264,7 +272,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 	unsigned int next_f;
 	bool busy;
 
-	sugov_set_iowait_boost(sg_cpu, time, flags);
+	sugov_set_iowait_boost(sg_cpu, time);
 	sg_cpu->last_update = time;
 
 	if (!sugov_should_update_freq(sg_policy, time))
@@ -272,10 +280,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 
 	busy = sugov_cpu_is_busy(sg_cpu);
 
-	if (flags & SCHED_CPUFREQ_RT_DL) {
+	if (flags & SCHED_CPUFREQ_RT) {
 		next_f = policy->cpuinfo.max_freq;
 	} else {
-		sugov_get_util(&util, &max, sg_cpu->cpu);
+		sugov_get_util(sg_cpu);
+		max = sg_cpu->max;
+		util = sugov_aggregate_util(sg_cpu);
 		sugov_iowait_boost(sg_cpu, &util, &max);
 		next_f = get_next_freq(sg_policy, util, max);
 		/*
@@ -305,23 +315,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
 		s64 delta_ns;
 
 		/*
-		 * If the CPU utilization was last updated before the previous
-		 * frequency update and the time elapsed between the last update
-		 * of the CPU utilization and the last frequency update is long
-		 * enough, don't take the CPU into account as it probably is
-		 * idle now (and clear iowait_boost for it).
+		 * If the CFS CPU utilization was last updated before the
+		 * previous frequency update and the time elapsed between the
+		 * last update of the CPU utilization and the last frequency
+		 * update is long enough, reset iowait_boost and util_cfs, as
+		 * they are now probably stale. However, still consider the
+		 * CPU contribution if it has some DEADLINE utilization
+		 * (util_dl).
 		 */
 		delta_ns = time - j_sg_cpu->last_update;
 		if (delta_ns > TICK_NSEC) {
 			j_sg_cpu->iowait_boost = 0;
 			j_sg_cpu->iowait_boost_pending = false;
-			continue;
+			j_sg_cpu->util_cfs = 0;
+			if (j_sg_cpu->util_dl == 0)
+				continue;
 		}
-		if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
+		if (j_sg_cpu->flags & SCHED_CPUFREQ_RT)
 			return policy->cpuinfo.max_freq;
 
-		j_util = j_sg_cpu->util;
 		j_max = j_sg_cpu->max;
+		j_util = sugov_aggregate_util(j_sg_cpu);
 		if (j_util * max > j_max * util) {
 			util = j_util;
 			max = j_max;
@@ -338,22 +352,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
 {
 	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
 	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
-	unsigned long util, max;
 	unsigned int next_f;
 
-	sugov_get_util(&util, &max, sg_cpu->cpu);
-
 	raw_spin_lock(&sg_policy->update_lock);
 
-	sg_cpu->util = util;
-	sg_cpu->max = max;
+	sugov_get_util(sg_cpu);
 	sg_cpu->flags = flags;
 
-	sugov_set_iowait_boost(sg_cpu, time, flags);
+	sugov_set_iowait_boost(sg_cpu, time);
 	sg_cpu->last_update = time;
 
 	if (sugov_should_update_freq(sg_policy, time)) {
-		if (flags & SCHED_CPUFREQ_RT_DL)
+		if (flags & SCHED_CPUFREQ_RT)
 			next_f = sg_policy->policy->cpuinfo.max_freq;
 		else
 			next_f = sugov_next_freq_shared(sg_cpu, time);
@@ -383,9 +393,9 @@ static void sugov_irq_work(struct irq_work *irq_work)
 	sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
 
 	/*
-	 * For RT and deadline tasks, the schedutil governor shoots the
-	 * frequency to maximum. Special care must be taken to ensure that this
-	 * kthread doesn't result in the same behavior.
+	 * For RT tasks, the schedutil governor shoots the frequency to maximum.
+	 * Special care must be taken to ensure that this kthread doesn't result
+	 * in the same behavior.
 	 *
 	 * This is (mostly) guaranteed by the work_in_progress flag. The flag is
 	 * updated only at the end of the sugov_work() function and before that
@@ -470,7 +480,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy)
 static int sugov_kthread_create(struct sugov_policy *sg_policy)
 {
 	struct task_struct *thread;
-	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 };
+	struct sched_attr attr = {
+		.size = sizeof(struct sched_attr),
+		.sched_policy = SCHED_DEADLINE,
+		.sched_flags = SCHED_FLAG_SUGOV,
+		.sched_nice = 0,
+		.sched_priority = 0,
+		/*
+		 * Fake (unused) bandwidth; workaround to "fix"
+		 * priority inheritance.
+		 */
+		.sched_runtime = 1000000,
+		.sched_deadline = 10000000,
+		.sched_period = 10000000,
+	};
 	struct cpufreq_policy *policy = sg_policy->policy;
 	int ret;
 
@@ -488,10 +511,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
 		return PTR_ERR(thread);
 	}
 
-	ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, &param);
+	ret = sched_setattr_nocheck(thread, &attr);
 	if (ret) {
 		kthread_stop(thread);
-		pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
+		pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__);
 		return ret;
 	}
 
@@ -655,7 +678,7 @@ static int sugov_start(struct cpufreq_policy *policy)
 		memset(sg_cpu, 0, sizeof(*sg_cpu));
 		sg_cpu->cpu = cpu;
 		sg_cpu->sg_policy = sg_policy;
-		sg_cpu->flags = SCHED_CPUFREQ_RT;
+		sg_cpu->flags = 0;
 		sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
 	}
 
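
A standalone sketch of the arithmetic behind the split utilization signals above: schedutil now sums the CFS and DEADLINE contributions (capped at the CPU's capacity) and maps the result to a frequency with the 1.25x headroom factor that get_next_freq() already uses. The code is illustrative, not the kernel implementation; the constants and the explicit clamp at max_freq are assumptions for the example.

/* Toy model of sugov_aggregate_util() + get_next_freq()-style mapping. */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/* CFS + DL utilization, capped at the CPU capacity (as in the patch). */
static unsigned long aggregate_util(unsigned long util_cfs,
				    unsigned long util_dl,
				    unsigned long max)
{
	return min_ul(util_cfs + util_dl, max);
}

/* next_freq = 1.25 * max_freq * util / max, clamped for the example. */
static unsigned long next_freq(unsigned long max_freq, unsigned long util,
			       unsigned long max)
{
	unsigned long freq = max_freq + (max_freq >> 2);

	return min_ul(freq * util / max, max_freq);
}

int main(void)
{
	unsigned long max = SCHED_CAPACITY_SCALE;	/* biggest CPU at max freq */
	unsigned long util_cfs = 300, util_dl = 200;	/* example signal values */
	unsigned long util = aggregate_util(util_cfs, util_dl, max);

	printf("util=%lu -> %lu kHz (of 2000000)\n",
	       util, next_freq(2000000UL, util, max));
	return 0;
}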
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2473736c7616..9bb0e0c412ec 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i)
 #endif
 
 static inline
-void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 {
 	u64 old = dl_rq->running_bw;
 
@@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 	dl_rq->running_bw += dl_bw;
 	SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */
 	SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
+	/* kick cpufreq (see the comment in kernel/sched/sched.h). */
+	cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
 }
 
 static inline
-void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 {
 	u64 old = dl_rq->running_bw;
 
@@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq)
 	SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */
 	if (dl_rq->running_bw > old)
 		dl_rq->running_bw = 0;
+	/* kick cpufreq (see the comment in kernel/sched/sched.h). */
+	cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL);
 }
 
 static inline
-void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
 {
 	u64 old = dl_rq->this_bw;
 
@@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
 }
 
 static inline
-void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
+void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
 {
 	u64 old = dl_rq->this_bw;
 
@@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq)
 	SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw);
 }
 
+static inline
+void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+	if (!dl_entity_is_special(dl_se))
+		__add_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+	if (!dl_entity_is_special(dl_se))
+		__sub_rq_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+	if (!dl_entity_is_special(dl_se))
+		__add_running_bw(dl_se->dl_bw, dl_rq);
+}
+
+static inline
+void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+	if (!dl_entity_is_special(dl_se))
+		__sub_running_bw(dl_se->dl_bw, dl_rq);
+}
+
 void dl_change_utilization(struct task_struct *p, u64 new_bw)
 {
 	struct rq *rq;
 
+	BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV);
+
 	if (task_on_rq_queued(p))
 		return;
 
 	rq = task_rq(p);
 	if (p->dl.dl_non_contending) {
-		sub_running_bw(p->dl.dl_bw, &rq->dl);
+		sub_running_bw(&p->dl, &rq->dl);
 		p->dl.dl_non_contending = 0;
 		/*
 		 * If the timer handler is currently running and the
@@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw)
 		if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
 			put_task_struct(p);
 	}
-	sub_rq_bw(p->dl.dl_bw, &rq->dl);
-	add_rq_bw(new_bw, &rq->dl);
+	__sub_rq_bw(p->dl.dl_bw, &rq->dl);
+	__add_rq_bw(new_bw, &rq->dl);
 }
 
 /*
@@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p)
 	if (dl_se->dl_runtime == 0)
 		return;
 
+	if (dl_entity_is_special(dl_se))
+		return;
+
 	WARN_ON(hrtimer_active(&dl_se->inactive_timer));
 	WARN_ON(dl_se->dl_non_contending);
 
@@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p)
 	 */
 	if (zerolag_time < 0) {
 		if (dl_task(p))
-			sub_running_bw(dl_se->dl_bw, dl_rq);
+			sub_running_bw(dl_se, dl_rq);
 		if (!dl_task(p) || p->state == TASK_DEAD) {
 			struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 			if (p->state == TASK_DEAD)
-				sub_rq_bw(p->dl.dl_bw, &rq->dl);
+				sub_rq_bw(&p->dl, &rq->dl);
 			raw_spin_lock(&dl_b->lock);
 			__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
 			__dl_clear_params(p);
@@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
 		return;
 
 	if (flags & ENQUEUE_MIGRATED)
-		add_rq_bw(dl_se->dl_bw, dl_rq);
+		add_rq_bw(dl_se, dl_rq);
 
 	if (dl_se->dl_non_contending) {
 		dl_se->dl_non_contending = 0;
@@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
 		 * when the "inactive timer" fired).
 		 * So, add it back.
 		 */
-		add_running_bw(dl_se->dl_bw, dl_rq);
+		add_running_bw(dl_se, dl_rq);
 	}
 }
 
@@ -1114,7 +1151,8 @@ static void update_curr_dl(struct rq *rq)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_dl_entity *dl_se = &curr->dl;
-	u64 delta_exec;
+	u64 delta_exec, scaled_delta_exec;
+	int cpu = cpu_of(rq);
 
 	if (!dl_task(curr) || !on_dl_rq(dl_se))
 		return;
@@ -1134,9 +1172,6 @@ static void update_curr_dl(struct rq *rq)
 		return;
 	}
 
-	/* kick cpufreq (see the comment in kernel/sched/sched.h). */
-	cpufreq_update_util(rq, SCHED_CPUFREQ_DL);
-
 	schedstat_set(curr->se.statistics.exec_max,
 		      max(curr->se.statistics.exec_max, delta_exec));
 
@@ -1148,13 +1183,39 @@ static void update_curr_dl(struct rq *rq)
 
 	sched_rt_avg_update(rq, delta_exec);
 
-	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM))
-		delta_exec = grub_reclaim(delta_exec, rq, &curr->dl);
-	dl_se->runtime -= delta_exec;
+	if (dl_entity_is_special(dl_se))
+		return;
+
+	/*
+	 * For tasks that participate in GRUB, we implement GRUB-PA: the
+	 * spare reclaimed bandwidth is used to clock down frequency.
+	 *
+	 * For the others, we still need to scale reservation parameters
+	 * according to current frequency and CPU maximum capacity.
+	 */
+	if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) {
+		scaled_delta_exec = grub_reclaim(delta_exec,
+						 rq,
+						 &curr->dl);
+	} else {
+		unsigned long scale_freq = arch_scale_freq_capacity(cpu);
+		unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+		scaled_delta_exec = cap_scale(delta_exec, scale_freq);
+		scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
+	}
+
+	dl_se->runtime -= scaled_delta_exec;
 
 throttle:
 	if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
 		dl_se->dl_throttled = 1;
+
+		/* If requested, inform the user about runtime overruns. */
+		if (dl_runtime_exceeded(dl_se) &&
+		    (dl_se->flags & SCHED_FLAG_DL_OVERRUN))
+			dl_se->dl_overrun = 1;
+
 		__dequeue_task_dl(rq, curr, 0);
 		if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
 			enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
@@ -1204,8 +1265,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 		struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
 
 		if (p->state == TASK_DEAD && dl_se->dl_non_contending) {
-			sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
-			sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl));
+			sub_running_bw(&p->dl, dl_rq_of_se(&p->dl));
+			sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl));
 			dl_se->dl_non_contending = 0;
 		}
 
@@ -1222,7 +1283,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
 	sched_clock_tick();
 	update_rq_clock(rq);
 
-	sub_running_bw(dl_se->dl_bw, &rq->dl);
+	sub_running_bw(dl_se, &rq->dl);
 	dl_se->dl_non_contending = 0;
 unlock:
 	task_rq_unlock(rq, p, &rf);
@@ -1416,8 +1477,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 		dl_check_constrained_dl(&p->dl);
 
 	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) {
-		add_rq_bw(p->dl.dl_bw, &rq->dl);
-		add_running_bw(p->dl.dl_bw, &rq->dl);
+		add_rq_bw(&p->dl, &rq->dl);
+		add_running_bw(&p->dl, &rq->dl);
 	}
 
 	/*
@@ -1457,8 +1518,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
 	__dequeue_task_dl(rq, p, flags);
 
 	if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) {
-		sub_running_bw(p->dl.dl_bw, &rq->dl);
-		sub_rq_bw(p->dl.dl_bw, &rq->dl);
+		sub_running_bw(&p->dl, &rq->dl);
+		sub_rq_bw(&p->dl, &rq->dl);
 	}
 
 	/*
@@ -1564,7 +1625,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
 	 */
 	raw_spin_lock(&rq->lock);
 	if (p->dl.dl_non_contending) {
-		sub_running_bw(p->dl.dl_bw, &rq->dl);
+		sub_running_bw(&p->dl, &rq->dl);
 		p->dl.dl_non_contending = 0;
 		/*
 		 * If the timer handler is currently running and the
@@ -1576,7 +1637,7 @@ static void migrate_task_rq_dl(struct task_struct *p)
 		if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
 			put_task_struct(p);
 	}
-	sub_rq_bw(p->dl.dl_bw, &rq->dl);
+	sub_rq_bw(&p->dl, &rq->dl);
 	raw_spin_unlock(&rq->lock);
 }
 
@@ -2019,11 +2080,11 @@ retry:
 	}
 
 	deactivate_task(rq, next_task, 0);
-	sub_running_bw(next_task->dl.dl_bw, &rq->dl);
-	sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
+	sub_running_bw(&next_task->dl, &rq->dl);
+	sub_rq_bw(&next_task->dl, &rq->dl);
 	set_task_cpu(next_task, later_rq->cpu);
-	add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
-	add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
+	add_rq_bw(&next_task->dl, &later_rq->dl);
+	add_running_bw(&next_task->dl, &later_rq->dl);
 	activate_task(later_rq, next_task, 0);
 	ret = 1;
 
@@ -2111,11 +2172,11 @@ static void pull_dl_task(struct rq *this_rq)
 			resched = true;
 
 			deactivate_task(src_rq, p, 0);
-			sub_running_bw(p->dl.dl_bw, &src_rq->dl);
-			sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
+			sub_running_bw(&p->dl, &src_rq->dl);
+			sub_rq_bw(&p->dl, &src_rq->dl);
 			set_task_cpu(p, this_cpu);
-			add_rq_bw(p->dl.dl_bw, &this_rq->dl);
-			add_running_bw(p->dl.dl_bw, &this_rq->dl);
+			add_rq_bw(&p->dl, &this_rq->dl);
+			add_running_bw(&p->dl, &this_rq->dl);
 			activate_task(this_rq, p, 0);
 			dmin = p->dl.deadline;
 
@@ -2224,7 +2285,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 		task_non_contending(p);
 
 	if (!task_on_rq_queued(p))
-		sub_rq_bw(p->dl.dl_bw, &rq->dl);
+		sub_rq_bw(&p->dl, &rq->dl);
 
 	/*
 	 * We cannot use inactive_task_timer() to invoke sub_running_bw()
@@ -2256,7 +2317,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 
 	/* If p is not queued we will update its parameters at next wakeup. */
 	if (!task_on_rq_queued(p)) {
-		add_rq_bw(p->dl.dl_bw, &rq->dl);
+		add_rq_bw(&p->dl, &rq->dl);
 
 		return;
 	}
@@ -2435,6 +2496,9 @@ int sched_dl_overflow(struct task_struct *p, int policy,
 	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
 	int cpus, err = -1;
 
+	if (attr->sched_flags & SCHED_FLAG_SUGOV)
+		return 0;
+
 	/* !deadline task may carry old deadline bandwidth */
 	if (new_bw == p->dl.dl_bw && task_has_dl_policy(p))
 		return 0;
@@ -2521,6 +2585,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
  */
 bool __checkparam_dl(const struct sched_attr *attr)
 {
+	/* special dl tasks don't actually use any parameter */
+	if (attr->sched_flags & SCHED_FLAG_SUGOV)
+		return true;
+
 	/* deadline != 0 */
 	if (attr->sched_deadline == 0)
 		return false;
@@ -2566,6 +2634,7 @@ void __dl_clear_params(struct task_struct *p)
 	dl_se->dl_throttled = 0;
 	dl_se->dl_yielded = 0;
 	dl_se->dl_non_contending = 0;
+	dl_se->dl_overrun = 0;
 }
 
 bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
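
The non-reclaim branch in update_curr_dl() above is the core of the "Make bandwidth enforcement scale-invariant" change: the wall-clock delta_exec is scaled by both the current frequency and the CPU's capacity before it is charged to the reservation. A toy, userspace-compilable model of that arithmetic follows; the scale values are made up for illustration.

/* Toy model of the scale-invariant runtime accounting introduced above. */
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)
#define cap_scale(v, s)		((v) * (s) >> SCHED_CAPACITY_SHIFT)

int main(void)
{
	unsigned long long delta_exec = 1000000;	/* 1 ms of wall-clock runtime */
	unsigned long scale_freq = 512;			/* running at 50% of max frequency */
	unsigned long scale_cpu  = 768;			/* little CPU: 75% of the biggest core */
	unsigned long long scaled;

	/* same two cap_scale() steps as the !SCHED_FLAG_RECLAIM branch */
	scaled = cap_scale(delta_exec, scale_freq);
	scaled = cap_scale(scaled, scale_cpu);

	/* Only ~37.5% of the wall-clock time is charged, so a reservation
	 * sized for the big CPU at full speed is not throttled prematurely. */
	printf("charged %llu ns of %llu ns\n", scaled, delta_exec);
	return 0;
}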
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 26a71ebcd3c2..7b6535987500 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 	/*
 	 * There are a few boundary cases this might miss but it should
 	 * get called often enough that that should (hopefully) not be
-	 * a real problem -- added to that it only calls on the local
-	 * CPU, so if we enqueue remotely we'll miss an update, but
-	 * the next tick/schedule should update.
+	 * a real problem.
 	 *
 	 * It will not get called when we go idle, because the idle
 	 * thread is a different class (!fair), nor will the utilization
@@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
 	return c1 + c2 + c3;
 }
 
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
-
 /*
  * Accumulate the three separate parts of the sum; d1 the remainder
  * of the last (incomplete) period, d2 the span of full periods and d3
@@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
 	u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
 	u64 periods;
 
-	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_freq = arch_scale_freq_capacity(cpu);
 	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 
 	delta += sa->period_contrib;
@@ -5689,8 +5685,8 @@ static int wake_wide(struct task_struct *p)
  * soonest. For the purpose of speed we only consider the waking and previous
  * CPU.
  *
- * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
- *			  will be) idle.
+ * wake_affine_idle() - only considers 'now', it check if the waking CPU is
+ *			  cache-affine and is (or will be) idle.
  *
  * wake_affine_weight() - considers the weight to reflect the average
  *			  scheduling latency of the CPUs. This seems to work
@@ -5701,7 +5697,13 @@ static bool
 wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
 		 int this_cpu, int prev_cpu, int sync)
 {
-	if (idle_cpu(this_cpu))
+	/*
+	 * If this_cpu is idle, it implies the wakeup is from interrupt
+	 * context. Only allow the move if cache is shared. Otherwise an
+	 * interrupt intensive workload could force all tasks onto one
+	 * node depending on the IO topology or IRQ affinity settings.
+	 */
+	if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
 		return true;
 
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
@@ -5765,12 +5767,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return affine;
 }
 
-static inline int task_util(struct task_struct *p);
-static int cpu_util_wake(int cpu, struct task_struct *p);
+static inline unsigned long task_util(struct task_struct *p);
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p);
 
 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
 {
-	return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+	return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
 }
 
 /*
@@ -5950,7 +5952,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 			}
 		} else if (shallowest_idle_cpu == -1) {
 			load = weighted_cpuload(cpu_rq(i));
-			if (load < min_load || (load == min_load && i == this_cpu)) {
+			if (load < min_load) {
 				min_load = load;
 				least_loaded_cpu = i;
 			}
@@ -6247,7 +6249,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
  * capacity_orig) as it useful for predicting the capacity required after task
  * migrations (scheduler-driven DVFS).
  */
-static int cpu_util(int cpu)
+static unsigned long cpu_util(int cpu)
 {
 	unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
 	unsigned long capacity = capacity_orig_of(cpu);
@@ -6255,7 +6257,7 @@ static int cpu_util(int cpu)
 	return (util >= capacity) ? capacity : util;
 }
 
-static inline int task_util(struct task_struct *p)
+static inline unsigned long task_util(struct task_struct *p)
 {
 	return p->se.avg.util_avg;
 }
@@ -6264,7 +6266,7 @@ static inline int task_util(struct task_struct *p)
  * cpu_util_wake: Compute cpu utilization with any contributions from
  * the waking task p removed.
  */
-static int cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
 {
 	unsigned long util, capacity;
 
@@ -6449,8 +6451,7 @@ static void task_dead_fair(struct task_struct *p)
 }
 #endif /* CONFIG_SMP */
 
-static unsigned long
-wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
+static unsigned long wakeup_gran(struct sched_entity *se)
 {
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
@@ -6492,7 +6493,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 	if (vdiff <= 0)
 		return -1;
 
-	gran = wakeup_gran(curr, se);
+	gran = wakeup_gran(se);
 	if (vdiff > gran)
 		return 1;
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a212de..2e95505e23c6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p)
 	return dl_policy(p->policy);
 }
 
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+/*
+ * !! For sched_setattr_nocheck() (kernel) only !!
+ *
+ * This is actually gross. :(
+ *
+ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE
+ * tasks, but still be able to sleep. We need this on platforms that cannot
+ * atomically change clock frequency. Remove once fast switching will be
+ * available on such platforms.
+ *
+ * SUGOV stands for SchedUtil GOVernor.
+ */
+#define SCHED_FLAG_SUGOV	0x10000000
+
+static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
+{
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+	return unlikely(dl_se->flags & SCHED_FLAG_SUGOV);
+#else
+	return false;
+#endif
+}
+
 /*
  * Tells if entity @a should preempt entity @b.
  */
 static inline bool
 dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
 {
-	return dl_time_before(a->deadline, b->deadline);
+	return dl_entity_is_special(a) ||
+	       dl_time_before(a->deadline, b->deadline);
 }
 
 /*
@@ -1328,47 +1354,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
 # define finish_arch_post_lock_switch()	do { } while (0)
 #endif
 
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
-	/*
-	 * We can optimise this out completely for !SMP, because the
-	 * SMP rebalancing from interrupt is the only thing that cares
-	 * here.
-	 */
-	next->on_cpu = 1;
-#endif
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
-	/*
-	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
-	 * We must ensure this doesn't happen until the switch is completely
-	 * finished.
-	 *
-	 * In particular, the load of prev->state in finish_task_switch() must
-	 * happen before this.
-	 *
-	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
-	 */
-	smp_store_release(&prev->on_cpu, 0);
-#endif
-#ifdef CONFIG_DEBUG_SPINLOCK
-	/* this is a valid case when another task releases the spinlock */
-	rq->lock.owner = current;
-#endif
-	/*
-	 * If we are tracking spinlock dependencies then we have to
-	 * fix up the runqueue lock - which gets 'carried over' from
-	 * prev into current:
-	 */
-	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-
-	raw_spin_unlock_irq(&rq->lock);
-}
-
 /*
  * wake flags
  */
@@ -1687,17 +1672,17 @@ static inline int hrtick_enabled(struct rq *rq)
 
 #endif /* CONFIG_SCHED_HRTICK */
 
-#ifdef CONFIG_SMP
-extern void sched_avg_update(struct rq *rq);
-
 #ifndef arch_scale_freq_capacity
 static __always_inline
-unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
+unsigned long arch_scale_freq_capacity(int cpu)
 {
 	return SCHED_CAPACITY_SCALE;
 }
 #endif
 
+#ifdef CONFIG_SMP
+extern void sched_avg_update(struct rq *rq);
+
 #ifndef arch_scale_cpu_capacity
 static __always_inline
 unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -1711,10 +1696,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
-	rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
+	rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
 	sched_avg_update(rq);
 }
 #else
+#ifndef arch_scale_cpu_capacity
+static __always_inline
+unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
+{
+	return SCHED_CAPACITY_SCALE;
+}
+#endif
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
 #endif
@@ -2096,14 +2088,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
  * The way cpufreq is currently arranged requires it to evaluate the CPU
  * performance state (frequency/voltage) on a regular basis to prevent it from
  * being stuck in a completely inadequate performance level for too long.
- * That is not guaranteed to happen if the updates are only triggered from CFS,
- * though, because they may not be coming in if RT or deadline tasks are active
- * all the time (or there are RT and DL tasks only).
+ * That is not guaranteed to happen if the updates are only triggered from CFS
+ * and DL, though, because they may not be coming in if only RT tasks are
+ * active all the time (or there are RT tasks only).
  *
- * As a workaround for that issue, this function is called by the RT and DL
- * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * As a workaround for that issue, this function is called periodically by the
+ * RT sched class to trigger extra cpufreq updates to prevent it from stalling,
  * but that really is a band-aid. Going forward it should be replaced with
- * solutions targeted more specifically at RT and DL tasks.
+ * solutions targeted more specifically at RT tasks.
  */
 static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 {
@@ -2125,3 +2117,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #else /* arch_scale_freq_capacity */
 #define arch_scale_freq_invariant()	(false)
 #endif
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+
+static inline unsigned long cpu_util_dl(struct rq *rq)
+{
+	return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
+}
+
+static inline unsigned long cpu_util_cfs(struct rq *rq)
+{
+	return rq->cfs.avg.util_avg;
+}
+
+#endif
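
cpu_util_dl() above rescales the DEADLINE running bandwidth from its fixed-point representation to the 1024-based capacity scale used by schedutil. A worked example, assuming BW_SHIFT is 20 as defined elsewhere in kernel/sched/sched.h:

/* Worked example of the cpu_util_dl() conversion (BW_SHIFT assumed to be 20). */
#include <stdio.h>

#define BW_SHIFT		20
#define SCHED_CAPACITY_SCALE	1024ULL

static unsigned long long cpu_util_dl(unsigned long long running_bw)
{
	return (running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}

int main(void)
{
	/* A reservation of 25% of a CPU: runtime/period = 1/4. */
	unsigned long long running_bw = (1ULL << BW_SHIFT) / 4;

	/* 262144 * 1024 >> 20 = 256, i.e. 25% of SCHED_CAPACITY_SCALE. */
	printf("running_bw=%llu -> util=%llu\n", running_bw, cpu_util_dl(running_bw));
	return 0;
}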
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index ec9f5da6f163..2541bd89f20e 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -14,6 +14,7 @@
 #include <linux/tick.h>
 #include <linux/workqueue.h>
 #include <linux/compat.h>
+#include <linux/sched/deadline.h>
 
 #include "posix-timers.h"
 
@@ -791,6 +792,14 @@ check_timers_list(struct list_head *timers,
 	return 0;
 }
 
+static inline void check_dl_overrun(struct task_struct *tsk)
+{
+	if (tsk->dl.dl_overrun) {
+		tsk->dl.dl_overrun = 0;
+		__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+	}
+}
+
 /*
  * Check for any per-thread CPU timers that have fired and move them off
  * the tsk->cpu_timers[N] list onto the firing list. Here we update the
@@ -804,6 +813,9 @@ static void check_thread_timers(struct task_struct *tsk,
 	u64 expires;
 	unsigned long soft;
 
+	if (dl_task(tsk))
+		check_dl_overrun(tsk);
+
 	/*
 	 * If cputime_expires is zero, then there are no active
 	 * per thread CPU timers.
@@ -906,6 +918,9 @@ static void check_process_timers(struct task_struct *tsk,
 	struct task_cputime cputime;
 	unsigned long soft;
 
+	if (dl_task(tsk))
+		check_dl_overrun(tsk);
+
 	/*
 	 * If cputimer is not running, then there are no active
 	 * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU).
@@ -1111,6 +1126,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 		return 1;
 	}
 
+	if (dl_task(tsk) && tsk->dl.dl_overrun)
+		return 1;
+
 	return 0;
 }
 