author     Linus Torvalds <torvalds@linux-foundation.org>   2019-07-08 19:39:53 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2019-07-08 19:39:53 -0400
commit     dad1c12ed831a7a89cc01e5582cd0b81a4be7f19
tree       7a84799d3108bd9d3f1d4b530afd3ff9300db982
parent     090bc5a2a91499c1fd64b78d125daa6ca5531d38
parent     af24bde8df2029f067dc46aff0393c8f18ff6e2f
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Remove the unused per rq load array and all its infrastructure, by
Dietmar Eggemann.
- Add utilization clamping support by Patrick Bellasi. This is a
refinement of the energy aware scheduling framework with support for
boosting of interactive and capping of background workloads: to make
sure critical GUI threads get maximum frequency ASAP, and to make
sure background processing doesn't unnecessarily move the cpufreq
governor to higher frequencies and less energy efficient CPU modes.
- Add the bare minimum of tracepoints required for LISA EAS regression
testing, by Qais Yousef - which allows automated testing of various
power management features, including energy aware scheduling.
- Restructure the former tsk_nr_cpus_allowed() facility that the -rt
kernel used to modify the scheduler's CPU affinity logic such as
migrate_disable() - introduce the task->cpus_ptr value instead of
taking the address of &task->cpus_allowed directly - by Sebastian
Andrzej Siewior.
- Misc optimizations, fixes, cleanups and small enhancements - see the
Git log for details.
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
sched/uclamp: Add uclamp support to energy_compute()
sched/uclamp: Add uclamp_util_with()
sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks
sched/uclamp: Set default clamps for RT tasks
sched/uclamp: Reset uclamp values on RESET_ON_FORK
sched/uclamp: Extend sched_setattr() to support utilization clamping
sched/core: Allow sched_setattr() to use the current policy
sched/uclamp: Add system default clamps
sched/uclamp: Enforce last task's UCLAMP_MAX
sched/uclamp: Add bucket local max tracking
sched/uclamp: Add CPU's clamp buckets refcounting
sched/fair: Rename weighted_cpuload() to cpu_runnable_load()
sched/debug: Export the newly added tracepoints
sched/debug: Add sched_overutilized tracepoint
sched/debug: Add new tracepoint to track PELT at se level
sched/debug: Add new tracepoints to track PELT at rq level
sched/debug: Add a new sched_trace_*() helper functions
sched/autogroup: Make autogroup_path() always available
sched/wait: Deduplicate code with do-while
sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity()
...
49 files changed, 1216 insertions, 618 deletions
diff --git a/Documentation/scheduler/sched-pelt.c b/Documentation/scheduler/sched-pelt.c index e4219139386a..7238b355919c 100644 --- a/Documentation/scheduler/sched-pelt.c +++ b/Documentation/scheduler/sched-pelt.c | |||
@@ -20,7 +20,8 @@ void calc_runnable_avg_yN_inv(void) | |||
20 | int i; | 20 | int i; |
21 | unsigned int x; | 21 | unsigned int x; |
22 | 22 | ||
23 | printf("static const u32 runnable_avg_yN_inv[] = {"); | 23 | /* To silence -Wunused-but-set-variable warnings. */ |
24 | printf("static const u32 runnable_avg_yN_inv[] __maybe_unused = {"); | ||
24 | for (i = 0; i < HALFLIFE; i++) { | 25 | for (i = 0; i < HALFLIFE; i++) { |
25 | x = ((1UL<<32)-1)*pow(y, i); | 26 | x = ((1UL<<32)-1)*pow(y, i); |
26 | 27 | ||
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c index 60e375ce1ab2..d17cb1e6d679 100644 --- a/arch/arm/kernel/topology.c +++ b/arch/arm/kernel/topology.c | |||
@@ -169,7 +169,7 @@ static void update_cpu_capacity(unsigned int cpu) | |||
169 | topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity); | 169 | topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity); |
170 | 170 | ||
171 | pr_info("CPU%u: update cpu_capacity %lu\n", | 171 | pr_info("CPU%u: update cpu_capacity %lu\n", |
172 | cpu, topology_get_cpu_scale(NULL, cpu)); | 172 | cpu, topology_get_cpu_scale(cpu)); |
173 | } | 173 | } |
174 | 174 | ||
175 | #else | 175 | #else |
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 6a52d761854b..79190d877fa7 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c | |||
@@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset, | |||
1831 | ti->cpu = cpu; | 1831 | ti->cpu = cpu; |
1832 | p->stack = ti; | 1832 | p->stack = ti; |
1833 | p->state = TASK_UNINTERRUPTIBLE; | 1833 | p->state = TASK_UNINTERRUPTIBLE; |
1834 | cpumask_set_cpu(cpu, &p->cpus_allowed); | 1834 | cpumask_set_cpu(cpu, &p->cpus_mask); |
1835 | INIT_LIST_HEAD(&p->tasks); | 1835 | INIT_LIST_HEAD(&p->tasks); |
1836 | p->parent = p->real_parent = p->group_leader = p; | 1836 | p->parent = p->real_parent = p->group_leader = p; |
1837 | INIT_LIST_HEAD(&p->children); | 1837 | INIT_LIST_HEAD(&p->children); |
diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h index 0f813bb753c6..09cbe9042828 100644 --- a/arch/mips/include/asm/switch_to.h +++ b/arch/mips/include/asm/switch_to.h | |||
@@ -42,7 +42,7 @@ extern struct task_struct *ll_task; | |||
42 | * inline to try to keep the overhead down. If we have been forced to run on | 42 | * inline to try to keep the overhead down. If we have been forced to run on |
43 | * a "CPU" with an FPU because of a previous high level of FP computation, | 43 | * a "CPU" with an FPU because of a previous high level of FP computation, |
44 | * but did not actually use the FPU during the most recent time-slice (CU1 | 44 | * but did not actually use the FPU during the most recent time-slice (CU1 |
45 | * isn't set), we undo the restriction on cpus_allowed. | 45 | * isn't set), we undo the restriction on cpus_mask. |
46 | * | 46 | * |
47 | * We're not calling set_cpus_allowed() here, because we have no need to | 47 | * We're not calling set_cpus_allowed() here, because we have no need to |
48 | * force prompt migration - we're already switching the current CPU to a | 48 | * force prompt migration - we're already switching the current CPU to a |
@@ -57,7 +57,7 @@ do { \ | |||
57 | test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \ | 57 | test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \ |
58 | (!(KSTK_STATUS(prev) & ST0_CU1))) { \ | 58 | (!(KSTK_STATUS(prev) & ST0_CU1))) { \ |
59 | clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \ | 59 | clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \ |
60 | prev->cpus_allowed = prev->thread.user_cpus_allowed; \ | 60 | prev->cpus_mask = prev->thread.user_cpus_allowed; \ |
61 | } \ | 61 | } \ |
62 | next->thread.emulated_fp = 0; \ | 62 | next->thread.emulated_fp = 0; \ |
63 | } while(0) | 63 | } while(0) |
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c index a7c0f97e4b0d..1a08428eedcf 100644 --- a/arch/mips/kernel/mips-mt-fpaff.c +++ b/arch/mips/kernel/mips-mt-fpaff.c | |||
@@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len, | |||
177 | if (retval) | 177 | if (retval) |
178 | goto out_unlock; | 178 | goto out_unlock; |
179 | 179 | ||
180 | cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed); | 180 | cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr); |
181 | cpumask_and(&mask, &allowed, cpu_active_mask); | 181 | cpumask_and(&mask, &allowed, cpu_active_mask); |
182 | 182 | ||
183 | out_unlock: | 183 | out_unlock: |
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index c52766a5b85f..ac7159263da0 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c | |||
@@ -891,12 +891,12 @@ static void mt_ase_fp_affinity(void) | |||
891 | * restricted the allowed set to exclude any CPUs with FPUs, | 891 | * restricted the allowed set to exclude any CPUs with FPUs, |
892 | * we'll skip the procedure. | 892 | * we'll skip the procedure. |
893 | */ | 893 | */ |
894 | if (cpumask_intersects(¤t->cpus_allowed, &mt_fpu_cpumask)) { | 894 | if (cpumask_intersects(¤t->cpus_mask, &mt_fpu_cpumask)) { |
895 | cpumask_t tmask; | 895 | cpumask_t tmask; |
896 | 896 | ||
897 | current->thread.user_cpus_allowed | 897 | current->thread.user_cpus_allowed |
898 | = current->cpus_allowed; | 898 | = current->cpus_mask; |
899 | cpumask_and(&tmask, ¤t->cpus_allowed, | 899 | cpumask_and(&tmask, ¤t->cpus_mask, |
900 | &mt_fpu_cpumask); | 900 | &mt_fpu_cpumask); |
901 | set_cpus_allowed_ptr(current, &tmask); | 901 | set_cpus_allowed_ptr(current, &tmask); |
902 | set_thread_flag(TIF_FPUBOUND); | 902 | set_thread_flag(TIF_FPUBOUND); |
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index e56b553de27b..f18d5067cd0f 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c | |||
@@ -128,7 +128,7 @@ void __spu_update_sched_info(struct spu_context *ctx) | |||
128 | * runqueue. The context will be rescheduled on the proper node | 128 | * runqueue. The context will be rescheduled on the proper node |
129 | * if it is timesliced or preempted. | 129 | * if it is timesliced or preempted. |
130 | */ | 130 | */ |
131 | cpumask_copy(&ctx->cpus_allowed, ¤t->cpus_allowed); | 131 | cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr); |
132 | 132 | ||
133 | /* Save the current cpu id for spu interrupt routing. */ | 133 | /* Save the current cpu id for spu interrupt routing. */ |
134 | ctx->last_ran = raw_smp_processor_id(); | 134 | ctx->last_ran = raw_smp_processor_id(); |
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 604c0e3bcc83..f68baccc69f0 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c | |||
@@ -1503,7 +1503,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) | |||
1503 | * may be scheduled elsewhere and invalidate entries in the | 1503 | * may be scheduled elsewhere and invalidate entries in the |
1504 | * pseudo-locked region. | 1504 | * pseudo-locked region. |
1505 | */ | 1505 | */ |
1506 | if (!cpumask_subset(¤t->cpus_allowed, &plr->d->cpu_mask)) { | 1506 | if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) { |
1507 | mutex_unlock(&rdtgroup_mutex); | 1507 | mutex_unlock(&rdtgroup_mutex); |
1508 | return -EINVAL; | 1508 | return -EINVAL; |
1509 | } | 1509 | } |
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 1739d7e1952a..9b09e31ae82f 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c | |||
@@ -43,7 +43,7 @@ static ssize_t cpu_capacity_show(struct device *dev, | |||
43 | { | 43 | { |
44 | struct cpu *cpu = container_of(dev, struct cpu, dev); | 44 | struct cpu *cpu = container_of(dev, struct cpu, dev); |
45 | 45 | ||
46 | return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id)); | 46 | return sprintf(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id)); |
47 | } | 47 | } |
48 | 48 | ||
49 | static void update_topology_flags_workfn(struct work_struct *work); | 49 | static void update_topology_flags_workfn(struct work_struct *work); |
@@ -116,7 +116,7 @@ void topology_normalize_cpu_scale(void) | |||
116 | / capacity_scale; | 116 | / capacity_scale; |
117 | topology_set_cpu_scale(cpu, capacity); | 117 | topology_set_cpu_scale(cpu, capacity); |
118 | pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", | 118 | pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", |
119 | cpu, topology_get_cpu_scale(NULL, cpu)); | 119 | cpu, topology_get_cpu_scale(cpu)); |
120 | } | 120 | } |
121 | } | 121 | } |
122 | 122 | ||
@@ -185,7 +185,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, | |||
185 | cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); | 185 | cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); |
186 | 186 | ||
187 | for_each_cpu(cpu, policy->related_cpus) { | 187 | for_each_cpu(cpu, policy->related_cpus) { |
188 | raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) * | 188 | raw_capacity[cpu] = topology_get_cpu_scale(cpu) * |
189 | policy->cpuinfo.max_freq / 1000UL; | 189 | policy->cpuinfo.max_freq / 1000UL; |
190 | capacity_scale = max(raw_capacity[cpu], capacity_scale); | 190 | capacity_scale = max(raw_capacity[cpu], capacity_scale); |
191 | } | 191 | } |
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index 4fe662c3bbc1..c142b23bb401 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c | |||
@@ -1038,7 +1038,7 @@ int hfi1_get_proc_affinity(int node) | |||
1038 | struct hfi1_affinity_node *entry; | 1038 | struct hfi1_affinity_node *entry; |
1039 | cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; | 1039 | cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; |
1040 | const struct cpumask *node_mask, | 1040 | const struct cpumask *node_mask, |
1041 | *proc_mask = ¤t->cpus_allowed; | 1041 | *proc_mask = current->cpus_ptr; |
1042 | struct hfi1_affinity_node_list *affinity = &node_affinity; | 1042 | struct hfi1_affinity_node_list *affinity = &node_affinity; |
1043 | struct cpu_mask_set *set = &affinity->proc; | 1043 | struct cpu_mask_set *set = &affinity->proc; |
1044 | 1044 | ||
@@ -1046,7 +1046,7 @@ int hfi1_get_proc_affinity(int node) | |||
1046 | * check whether process/context affinity has already | 1046 | * check whether process/context affinity has already |
1047 | * been set | 1047 | * been set |
1048 | */ | 1048 | */ |
1049 | if (cpumask_weight(proc_mask) == 1) { | 1049 | if (current->nr_cpus_allowed == 1) { |
1050 | hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", | 1050 | hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", |
1051 | current->pid, current->comm, | 1051 | current->pid, current->comm, |
1052 | cpumask_pr_args(proc_mask)); | 1052 | cpumask_pr_args(proc_mask)); |
@@ -1057,7 +1057,7 @@ int hfi1_get_proc_affinity(int node) | |||
1057 | cpu = cpumask_first(proc_mask); | 1057 | cpu = cpumask_first(proc_mask); |
1058 | cpumask_set_cpu(cpu, &set->used); | 1058 | cpumask_set_cpu(cpu, &set->used); |
1059 | goto done; | 1059 | goto done; |
1060 | } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) { | 1060 | } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) { |
1061 | hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", | 1061 | hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", |
1062 | current->pid, current->comm, | 1062 | current->pid, current->comm, |
1063 | cpumask_pr_args(proc_mask)); | 1063 | cpumask_pr_args(proc_mask)); |
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 28b66bd70b74..2395fd4233a7 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c | |||
@@ -869,14 +869,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd, | |||
869 | { | 869 | { |
870 | struct sdma_rht_node *rht_node; | 870 | struct sdma_rht_node *rht_node; |
871 | struct sdma_engine *sde = NULL; | 871 | struct sdma_engine *sde = NULL; |
872 | const struct cpumask *current_mask = ¤t->cpus_allowed; | ||
873 | unsigned long cpu_id; | 872 | unsigned long cpu_id; |
874 | 873 | ||
875 | /* | 874 | /* |
876 | * To ensure that always the same sdma engine(s) will be | 875 | * To ensure that always the same sdma engine(s) will be |
877 | * selected make sure the process is pinned to this CPU only. | 876 | * selected make sure the process is pinned to this CPU only. |
878 | */ | 877 | */ |
879 | if (cpumask_weight(current_mask) != 1) | 878 | if (current->nr_cpus_allowed != 1) |
880 | goto out; | 879 | goto out; |
881 | 880 | ||
882 | cpu_id = smp_processor_id(); | 881 | cpu_id = smp_processor_id(); |
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 78fa634de98a..27b6e664e59d 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c | |||
@@ -1142,7 +1142,7 @@ static __poll_t qib_poll(struct file *fp, struct poll_table_struct *pt) | |||
1142 | static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) | 1142 | static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) |
1143 | { | 1143 | { |
1144 | struct qib_filedata *fd = fp->private_data; | 1144 | struct qib_filedata *fd = fp->private_data; |
1145 | const unsigned int weight = cpumask_weight(¤t->cpus_allowed); | 1145 | const unsigned int weight = current->nr_cpus_allowed; |
1146 | const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); | 1146 | const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); |
1147 | int local_cpu; | 1147 | int local_cpu; |
1148 | 1148 | ||
@@ -1623,9 +1623,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo) | |||
1623 | ret = find_free_ctxt(i_minor - 1, fp, uinfo); | 1623 | ret = find_free_ctxt(i_minor - 1, fp, uinfo); |
1624 | else { | 1624 | else { |
1625 | int unit; | 1625 | int unit; |
1626 | const unsigned int cpu = cpumask_first(¤t->cpus_allowed); | 1626 | const unsigned int cpu = cpumask_first(current->cpus_ptr); |
1627 | const unsigned int weight = | 1627 | const unsigned int weight = current->nr_cpus_allowed; |
1628 | cpumask_weight(¤t->cpus_allowed); | ||
1629 | 1628 | ||
1630 | if (weight == 1 && !test_bit(cpu, qib_cpulist)) | 1629 | if (weight == 1 && !test_bit(cpu, qib_cpulist)) |
1631 | if (!find_hca(cpu, &unit) && unit >= 0) | 1630 | if (!find_hca(cpu, &unit) && unit >= 0) |
diff --git a/fs/proc/array.c b/fs/proc/array.c index 55180501b915..46dcb6f0eccf 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -381,9 +381,9 @@ static inline void task_context_switch_counts(struct seq_file *m, | |||
381 | static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) | 381 | static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) |
382 | { | 382 | { |
383 | seq_printf(m, "Cpus_allowed:\t%*pb\n", | 383 | seq_printf(m, "Cpus_allowed:\t%*pb\n", |
384 | cpumask_pr_args(&task->cpus_allowed)); | 384 | cpumask_pr_args(task->cpus_ptr)); |
385 | seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", | 385 | seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", |
386 | cpumask_pr_args(&task->cpus_allowed)); | 386 | cpumask_pr_args(task->cpus_ptr)); |
387 | } | 387 | } |
388 | 388 | ||
389 | static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) | 389 | static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) |
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index d9bdc1a7f4e7..1cfe05ea1d89 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h | |||
@@ -18,7 +18,7 @@ DECLARE_PER_CPU(unsigned long, cpu_scale); | |||
18 | 18 | ||
19 | struct sched_domain; | 19 | struct sched_domain; |
20 | static inline | 20 | static inline |
21 | unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu) | 21 | unsigned long topology_get_cpu_scale(int cpu) |
22 | { | 22 | { |
23 | return per_cpu(cpu_scale, cpu); | 23 | return per_cpu(cpu_scale, cpu); |
24 | } | 24 | } |
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index aa027f7bcb3e..73f8c3cb9588 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h | |||
@@ -89,7 +89,7 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd, | |||
89 | * like schedutil. | 89 | * like schedutil. |
90 | */ | 90 | */ |
91 | cpu = cpumask_first(to_cpumask(pd->cpus)); | 91 | cpu = cpumask_first(to_cpumask(pd->cpus)); |
92 | scale_cpu = arch_scale_cpu_capacity(NULL, cpu); | 92 | scale_cpu = arch_scale_cpu_capacity(cpu); |
93 | cs = &pd->table[pd->nr_cap_states - 1]; | 93 | cs = &pd->table[pd->nr_cap_states - 1]; |
94 | freq = map_util_freq(max_util, cs->frequency, scale_cpu); | 94 | freq = map_util_freq(max_util, cs->frequency, scale_cpu); |
95 | 95 | ||
diff --git a/include/linux/log2.h b/include/linux/log2.h index 1aec01365ed4..83a4a3ca3e8a 100644 --- a/include/linux/log2.h +++ b/include/linux/log2.h | |||
@@ -220,4 +220,38 @@ int __order_base_2(unsigned long n) | |||
220 | ilog2((n) - 1) + 1) : \ | 220 | ilog2((n) - 1) + 1) : \ |
221 | __order_base_2(n) \ | 221 | __order_base_2(n) \ |
222 | ) | 222 | ) |
223 | |||
224 | static inline __attribute__((const)) | ||
225 | int __bits_per(unsigned long n) | ||
226 | { | ||
227 | if (n < 2) | ||
228 | return 1; | ||
229 | if (is_power_of_2(n)) | ||
230 | return order_base_2(n) + 1; | ||
231 | return order_base_2(n); | ||
232 | } | ||
233 | |||
234 | /** | ||
235 | * bits_per - calculate the number of bits required for the argument | ||
236 | * @n: parameter | ||
237 | * | ||
238 | * This is constant-capable and can be used for compile time | ||
239 | * initializations, e.g bitfields. | ||
240 | * | ||
241 | * The first few values calculated by this routine: | ||
242 | * bf(0) = 1 | ||
243 | * bf(1) = 1 | ||
244 | * bf(2) = 2 | ||
245 | * bf(3) = 2 | ||
246 | * bf(4) = 3 | ||
247 | * ... and so on. | ||
248 | */ | ||
249 | #define bits_per(n) \ | ||
250 | ( \ | ||
251 | __builtin_constant_p(n) ? ( \ | ||
252 | ((n) == 0 || (n) == 1) \ | ||
253 | ? 1 : ilog2(n) + 1 \ | ||
254 | ) : \ | ||
255 | __bits_per(n) \ | ||
256 | ) | ||
223 | #endif /* _LINUX_LOG2_H */ | 257 | #endif /* _LINUX_LOG2_H */ |
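The bits_per() macro added above is what the later include/linux/sched.h hunk uses to size the struct uclamp_se bitfields. Its semantics are easy to check with a small userspace sketch; this is an illustration only, not the kernel macro (which builds on ilog2() and is constant-capable):

```c
#include <stdio.h>

/*
 * Userspace illustration of the bits_per() semantics documented above:
 * the number of bits required to represent n itself, so
 * bits_per(0) = bits_per(1) = 1, bits_per(3) = 2, bits_per(4) = 3, ...
 */
static int bits_per(unsigned long n)
{
	int bits = 1;

	while (n >>= 1)
		bits++;
	return bits;
}

int main(void)
{
	/* The two values used by the uclamp_se bitfields in sched.h: */
	printf("bits_per(SCHED_CAPACITY_SCALE = 1024) = %d\n", bits_per(1024)); /* 11 */
	printf("bits_per(UCLAMP_BUCKETS = 5)          = %d\n", bits_per(5));    /*  3 */
	return 0;
}
```

With the default CONFIG_UCLAMP_BUCKETS_COUNT of 5, the four uclamp_se bitfields therefore need 11 + 3 + 1 + 1 = 16 bits and fit in a single 32-bit word.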
diff --git a/include/linux/sched.h b/include/linux/sched.h index 76adce49b5ad..459d95e4a574 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -35,6 +35,7 @@ struct audit_context; | |||
35 | struct backing_dev_info; | 35 | struct backing_dev_info; |
36 | struct bio_list; | 36 | struct bio_list; |
37 | struct blk_plug; | 37 | struct blk_plug; |
38 | struct capture_control; | ||
38 | struct cfs_rq; | 39 | struct cfs_rq; |
39 | struct fs_struct; | 40 | struct fs_struct; |
40 | struct futex_pi_state; | 41 | struct futex_pi_state; |
@@ -47,8 +48,9 @@ struct pid_namespace; | |||
47 | struct pipe_inode_info; | 48 | struct pipe_inode_info; |
48 | struct rcu_node; | 49 | struct rcu_node; |
49 | struct reclaim_state; | 50 | struct reclaim_state; |
50 | struct capture_control; | ||
51 | struct robust_list_head; | 51 | struct robust_list_head; |
52 | struct root_domain; | ||
53 | struct rq; | ||
52 | struct sched_attr; | 54 | struct sched_attr; |
53 | struct sched_param; | 55 | struct sched_param; |
54 | struct seq_file; | 56 | struct seq_file; |
@@ -281,6 +283,18 @@ struct vtime { | |||
281 | u64 gtime; | 283 | u64 gtime; |
282 | }; | 284 | }; |
283 | 285 | ||
286 | /* | ||
287 | * Utilization clamp constraints. | ||
288 | * @UCLAMP_MIN: Minimum utilization | ||
289 | * @UCLAMP_MAX: Maximum utilization | ||
290 | * @UCLAMP_CNT: Utilization clamp constraints count | ||
291 | */ | ||
292 | enum uclamp_id { | ||
293 | UCLAMP_MIN = 0, | ||
294 | UCLAMP_MAX, | ||
295 | UCLAMP_CNT | ||
296 | }; | ||
297 | |||
284 | struct sched_info { | 298 | struct sched_info { |
285 | #ifdef CONFIG_SCHED_INFO | 299 | #ifdef CONFIG_SCHED_INFO |
286 | /* Cumulative counters: */ | 300 | /* Cumulative counters: */ |
@@ -312,6 +326,10 @@ struct sched_info { | |||
312 | # define SCHED_FIXEDPOINT_SHIFT 10 | 326 | # define SCHED_FIXEDPOINT_SHIFT 10 |
313 | # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) | 327 | # define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) |
314 | 328 | ||
329 | /* Increase resolution of cpu_capacity calculations */ | ||
330 | # define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT | ||
331 | # define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) | ||
332 | |||
315 | struct load_weight { | 333 | struct load_weight { |
316 | unsigned long weight; | 334 | unsigned long weight; |
317 | u32 inv_weight; | 335 | u32 inv_weight; |
@@ -560,6 +578,41 @@ struct sched_dl_entity { | |||
560 | struct hrtimer inactive_timer; | 578 | struct hrtimer inactive_timer; |
561 | }; | 579 | }; |
562 | 580 | ||
581 | #ifdef CONFIG_UCLAMP_TASK | ||
582 | /* Number of utilization clamp buckets (shorter alias) */ | ||
583 | #define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT | ||
584 | |||
585 | /* | ||
586 | * Utilization clamp for a scheduling entity | ||
587 | * @value: clamp value "assigned" to a se | ||
588 | * @bucket_id: bucket index corresponding to the "assigned" value | ||
589 | * @active: the se is currently refcounted in a rq's bucket | ||
590 | * @user_defined: the requested clamp value comes from user-space | ||
591 | * | ||
592 | * The bucket_id is the index of the clamp bucket matching the clamp value | ||
593 | * which is pre-computed and stored to avoid expensive integer divisions from | ||
594 | * the fast path. | ||
595 | * | ||
596 | * The active bit is set whenever a task has got an "effective" value assigned, | ||
597 | * which can be different from the clamp value "requested" from user-space. | ||
598 | * This allows to know a task is refcounted in the rq's bucket corresponding | ||
599 | * to the "effective" bucket_id. | ||
600 | * | ||
601 | * The user_defined bit is set whenever a task has got a task-specific clamp | ||
602 | * value requested from userspace, i.e. the system defaults apply to this task | ||
603 | * just as a restriction. This allows to relax default clamps when a less | ||
604 | * restrictive task-specific value has been requested, thus allowing to | ||
605 | * implement a "nice" semantic. For example, a task running with a 20% | ||
606 | * default boost can still drop its own boosting to 0%. | ||
607 | */ | ||
608 | struct uclamp_se { | ||
609 | unsigned int value : bits_per(SCHED_CAPACITY_SCALE); | ||
610 | unsigned int bucket_id : bits_per(UCLAMP_BUCKETS); | ||
611 | unsigned int active : 1; | ||
612 | unsigned int user_defined : 1; | ||
613 | }; | ||
614 | #endif /* CONFIG_UCLAMP_TASK */ | ||
615 | |||
563 | union rcu_special { | 616 | union rcu_special { |
564 | struct { | 617 | struct { |
565 | u8 blocked; | 618 | u8 blocked; |
@@ -640,6 +693,13 @@ struct task_struct { | |||
640 | #endif | 693 | #endif |
641 | struct sched_dl_entity dl; | 694 | struct sched_dl_entity dl; |
642 | 695 | ||
696 | #ifdef CONFIG_UCLAMP_TASK | ||
697 | /* Clamp values requested for a scheduling entity */ | ||
698 | struct uclamp_se uclamp_req[UCLAMP_CNT]; | ||
699 | /* Effective clamp values used for a scheduling entity */ | ||
700 | struct uclamp_se uclamp[UCLAMP_CNT]; | ||
701 | #endif | ||
702 | |||
643 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 703 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
644 | /* List of struct preempt_notifier: */ | 704 | /* List of struct preempt_notifier: */ |
645 | struct hlist_head preempt_notifiers; | 705 | struct hlist_head preempt_notifiers; |
@@ -651,7 +711,8 @@ struct task_struct { | |||
651 | 711 | ||
652 | unsigned int policy; | 712 | unsigned int policy; |
653 | int nr_cpus_allowed; | 713 | int nr_cpus_allowed; |
654 | cpumask_t cpus_allowed; | 714 | const cpumask_t *cpus_ptr; |
715 | cpumask_t cpus_mask; | ||
655 | 716 | ||
656 | #ifdef CONFIG_PREEMPT_RCU | 717 | #ifdef CONFIG_PREEMPT_RCU |
657 | int rcu_read_lock_nesting; | 718 | int rcu_read_lock_nesting; |
@@ -1399,7 +1460,7 @@ extern struct pid *cad_pid; | |||
1399 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ | 1460 | #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ |
1400 | #define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ | 1461 | #define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ |
1401 | #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ | 1462 | #define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ |
1402 | #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ | 1463 | #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ |
1403 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ | 1464 | #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ |
1404 | #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ | 1465 | #define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ |
1405 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ | 1466 | #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ |
@@ -1915,4 +1976,16 @@ static inline void rseq_syscall(struct pt_regs *regs) | |||
1915 | 1976 | ||
1916 | #endif | 1977 | #endif |
1917 | 1978 | ||
1979 | const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq); | ||
1980 | char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len); | ||
1981 | int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq); | ||
1982 | |||
1983 | const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq); | ||
1984 | const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq); | ||
1985 | const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq); | ||
1986 | |||
1987 | int sched_trace_rq_cpu(struct rq *rq); | ||
1988 | |||
1989 | const struct cpumask *sched_trace_rd_span(struct root_domain *rd); | ||
1990 | |||
1918 | #endif | 1991 | #endif |
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index b36f4cf38111..1abe91ff6e4a 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h | |||
@@ -7,14 +7,6 @@ | |||
7 | */ | 7 | */ |
8 | 8 | ||
9 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) | 9 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) |
10 | extern void cpu_load_update_nohz_start(void); | ||
11 | extern void cpu_load_update_nohz_stop(void); | ||
12 | #else | ||
13 | static inline void cpu_load_update_nohz_start(void) { } | ||
14 | static inline void cpu_load_update_nohz_stop(void) { } | ||
15 | #endif | ||
16 | |||
17 | #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) | ||
18 | extern void nohz_balance_enter_idle(int cpu); | 10 | extern void nohz_balance_enter_idle(int cpu); |
19 | extern int get_nohz_timer_target(void); | 11 | extern int get_nohz_timer_target(void); |
20 | #else | 12 | #else |
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 99ce6d728df7..d4f6215ee03f 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h | |||
@@ -56,6 +56,11 @@ int sched_proc_update_handler(struct ctl_table *table, int write, | |||
56 | extern unsigned int sysctl_sched_rt_period; | 56 | extern unsigned int sysctl_sched_rt_period; |
57 | extern int sysctl_sched_rt_runtime; | 57 | extern int sysctl_sched_rt_runtime; |
58 | 58 | ||
59 | #ifdef CONFIG_UCLAMP_TASK | ||
60 | extern unsigned int sysctl_sched_uclamp_util_min; | ||
61 | extern unsigned int sysctl_sched_uclamp_util_max; | ||
62 | #endif | ||
63 | |||
59 | #ifdef CONFIG_CFS_BANDWIDTH | 64 | #ifdef CONFIG_CFS_BANDWIDTH |
60 | extern unsigned int sysctl_sched_cfs_bandwidth_slice; | 65 | extern unsigned int sysctl_sched_cfs_bandwidth_slice; |
61 | #endif | 66 | #endif |
@@ -75,6 +80,12 @@ extern int sched_rt_handler(struct ctl_table *table, int write, | |||
75 | void __user *buffer, size_t *lenp, | 80 | void __user *buffer, size_t *lenp, |
76 | loff_t *ppos); | 81 | loff_t *ppos); |
77 | 82 | ||
83 | #ifdef CONFIG_UCLAMP_TASK | ||
84 | extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, | ||
85 | void __user *buffer, size_t *lenp, | ||
86 | loff_t *ppos); | ||
87 | #endif | ||
88 | |||
78 | extern int sysctl_numa_balancing(struct ctl_table *table, int write, | 89 | extern int sysctl_numa_balancing(struct ctl_table *table, int write, |
79 | void __user *buffer, size_t *lenp, | 90 | void __user *buffer, size_t *lenp, |
80 | loff_t *ppos); | 91 | loff_t *ppos); |
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index cfc0a89a7159..7863bb62d2ab 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h | |||
@@ -7,12 +7,6 @@ | |||
7 | #include <linux/sched/idle.h> | 7 | #include <linux/sched/idle.h> |
8 | 8 | ||
9 | /* | 9 | /* |
10 | * Increase resolution of cpu_capacity calculations | ||
11 | */ | ||
12 | #define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT | ||
13 | #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) | ||
14 | |||
15 | /* | ||
16 | * sched-domains (multiprocessor balancing) declarations: | 10 | * sched-domains (multiprocessor balancing) declarations: |
17 | */ | 11 | */ |
18 | #ifdef CONFIG_SMP | 12 | #ifdef CONFIG_SMP |
@@ -84,11 +78,6 @@ struct sched_domain { | |||
84 | unsigned int busy_factor; /* less balancing by factor if busy */ | 78 | unsigned int busy_factor; /* less balancing by factor if busy */ |
85 | unsigned int imbalance_pct; /* No balance until over watermark */ | 79 | unsigned int imbalance_pct; /* No balance until over watermark */ |
86 | unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ | 80 | unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ |
87 | unsigned int busy_idx; | ||
88 | unsigned int idle_idx; | ||
89 | unsigned int newidle_idx; | ||
90 | unsigned int wake_idx; | ||
91 | unsigned int forkexec_idx; | ||
92 | 81 | ||
93 | int nohz_idle; /* NOHZ IDLE status */ | 82 | int nohz_idle; /* NOHZ IDLE status */ |
94 | int flags; /* See SD_* */ | 83 | int flags; /* See SD_* */ |
@@ -201,14 +190,6 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl); | |||
201 | # define SD_INIT_NAME(type) | 190 | # define SD_INIT_NAME(type) |
202 | #endif | 191 | #endif |
203 | 192 | ||
204 | #ifndef arch_scale_cpu_capacity | ||
205 | static __always_inline | ||
206 | unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) | ||
207 | { | ||
208 | return SCHED_CAPACITY_SCALE; | ||
209 | } | ||
210 | #endif | ||
211 | |||
212 | #else /* CONFIG_SMP */ | 193 | #else /* CONFIG_SMP */ |
213 | 194 | ||
214 | struct sched_domain_attr; | 195 | struct sched_domain_attr; |
@@ -224,16 +205,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) | |||
224 | return true; | 205 | return true; |
225 | } | 206 | } |
226 | 207 | ||
208 | #endif /* !CONFIG_SMP */ | ||
209 | |||
227 | #ifndef arch_scale_cpu_capacity | 210 | #ifndef arch_scale_cpu_capacity |
228 | static __always_inline | 211 | static __always_inline |
229 | unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) | 212 | unsigned long arch_scale_cpu_capacity(int cpu) |
230 | { | 213 | { |
231 | return SCHED_CAPACITY_SCALE; | 214 | return SCHED_CAPACITY_SCALE; |
232 | } | 215 | } |
233 | #endif | 216 | #endif |
234 | 217 | ||
235 | #endif /* !CONFIG_SMP */ | ||
236 | |||
237 | static inline int task_node(const struct task_struct *p) | 218 | static inline int task_node(const struct task_struct *p) |
238 | { | 219 | { |
239 | return cpu_to_node(task_cpu(p)); | 220 | return cpu_to_node(task_cpu(p)); |
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c8c7c7efb487..420e80e56e55 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h | |||
@@ -594,6 +594,37 @@ TRACE_EVENT(sched_wake_idle_without_ipi, | |||
594 | 594 | ||
595 | TP_printk("cpu=%d", __entry->cpu) | 595 | TP_printk("cpu=%d", __entry->cpu) |
596 | ); | 596 | ); |
597 | |||
598 | /* | ||
599 | * Following tracepoints are not exported in tracefs and provide hooking | ||
600 | * mechanisms only for testing and debugging purposes. | ||
601 | * | ||
602 | * Postfixed with _tp to make them easily identifiable in the code. | ||
603 | */ | ||
604 | DECLARE_TRACE(pelt_cfs_tp, | ||
605 | TP_PROTO(struct cfs_rq *cfs_rq), | ||
606 | TP_ARGS(cfs_rq)); | ||
607 | |||
608 | DECLARE_TRACE(pelt_rt_tp, | ||
609 | TP_PROTO(struct rq *rq), | ||
610 | TP_ARGS(rq)); | ||
611 | |||
612 | DECLARE_TRACE(pelt_dl_tp, | ||
613 | TP_PROTO(struct rq *rq), | ||
614 | TP_ARGS(rq)); | ||
615 | |||
616 | DECLARE_TRACE(pelt_irq_tp, | ||
617 | TP_PROTO(struct rq *rq), | ||
618 | TP_ARGS(rq)); | ||
619 | |||
620 | DECLARE_TRACE(pelt_se_tp, | ||
621 | TP_PROTO(struct sched_entity *se), | ||
622 | TP_ARGS(se)); | ||
623 | |||
624 | DECLARE_TRACE(sched_overutilized_tp, | ||
625 | TP_PROTO(struct root_domain *rd, bool overutilized), | ||
626 | TP_ARGS(rd, overutilized)); | ||
627 | |||
597 | #endif /* _TRACE_SCHED_H */ | 628 | #endif /* _TRACE_SCHED_H */ |
598 | 629 | ||
599 | /* This part must be outside protection */ | 630 | /* This part must be outside protection */ |
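These are bare DECLARE_TRACE() tracepoints with no trace event attached, so they are not visible in tracefs; an external module (such as the LISA test module the pull request refers to) hooks them through the register_trace_<name>() helpers generated by DECLARE_TRACE(), which is why kernel/sched/core.c exports the tracepoint symbols further below. A minimal, hypothetical module sketch, assuming a build against this kernel tree (module name and probe body are illustrative):

```c
// SPDX-License-Identifier: GPL-2.0
/* Illustrative out-of-tree module probing one of the new bare tracepoints. */
#include <linux/module.h>
#include <linux/cpumask.h>
#include <linux/sched.h>
#include <trace/events/sched.h>

/* Probe signature: the registration cookie first, then the TP_PROTO() args. */
static void probe_overutilized(void *data, struct root_domain *rd, bool overutilized)
{
	/* sched_trace_rd_span() is one of the helpers added in this merge. */
	trace_printk("overutilized=%d (%u CPUs in root domain)\n",
		     overutilized, cpumask_weight(sched_trace_rd_span(rd)));
}

static int __init sched_tp_sample_init(void)
{
	return register_trace_sched_overutilized_tp(probe_overutilized, NULL);
}

static void __exit sched_tp_sample_exit(void)
{
	unregister_trace_sched_overutilized_tp(probe_overutilized, NULL);
	tracepoint_synchronize_unregister();
}

module_init(sched_tp_sample_init);
module_exit(sched_tp_sample_exit);
MODULE_LICENSE("GPL");
```

MODULE_LICENSE("GPL") matters here because the tracepoints are exported with EXPORT_TRACEPOINT_SYMBOL_GPL().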
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index ed4ee170bee2..617bb59aa8ba 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h | |||
@@ -51,9 +51,21 @@ | |||
51 | #define SCHED_FLAG_RESET_ON_FORK 0x01 | 51 | #define SCHED_FLAG_RESET_ON_FORK 0x01 |
52 | #define SCHED_FLAG_RECLAIM 0x02 | 52 | #define SCHED_FLAG_RECLAIM 0x02 |
53 | #define SCHED_FLAG_DL_OVERRUN 0x04 | 53 | #define SCHED_FLAG_DL_OVERRUN 0x04 |
54 | #define SCHED_FLAG_KEEP_POLICY 0x08 | ||
55 | #define SCHED_FLAG_KEEP_PARAMS 0x10 | ||
56 | #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 | ||
57 | #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 | ||
58 | |||
59 | #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ | ||
60 | SCHED_FLAG_KEEP_PARAMS) | ||
61 | |||
62 | #define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \ | ||
63 | SCHED_FLAG_UTIL_CLAMP_MAX) | ||
54 | 64 | ||
55 | #define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ | 65 | #define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ |
56 | SCHED_FLAG_RECLAIM | \ | 66 | SCHED_FLAG_RECLAIM | \ |
57 | SCHED_FLAG_DL_OVERRUN) | 67 | SCHED_FLAG_DL_OVERRUN | \ |
68 | SCHED_FLAG_KEEP_ALL | \ | ||
69 | SCHED_FLAG_UTIL_CLAMP) | ||
58 | 70 | ||
59 | #endif /* _UAPI_LINUX_SCHED_H */ | 71 | #endif /* _UAPI_LINUX_SCHED_H */ |
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h index 10fbb8031930..c852153ddb0d 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h | |||
@@ -9,6 +9,7 @@ struct sched_param { | |||
9 | }; | 9 | }; |
10 | 10 | ||
11 | #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ | 11 | #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ |
12 | #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ | ||
12 | 13 | ||
13 | /* | 14 | /* |
14 | * Extended scheduling parameters data structure. | 15 | * Extended scheduling parameters data structure. |
@@ -21,8 +22,33 @@ struct sched_param { | |||
21 | * the tasks may be useful for a wide variety of application fields, e.g., | 22 | * the tasks may be useful for a wide variety of application fields, e.g., |
22 | * multimedia, streaming, automation and control, and many others. | 23 | * multimedia, streaming, automation and control, and many others. |
23 | * | 24 | * |
24 | * This variant (sched_attr) is meant at describing a so-called | 25 | * This variant (sched_attr) allows to define additional attributes to |
25 | * sporadic time-constrained task. In such model a task is specified by: | 26 | * improve the scheduler knowledge about task requirements. |
27 | * | ||
28 | * Scheduling Class Attributes | ||
29 | * =========================== | ||
30 | * | ||
31 | * A subset of sched_attr attributes specifies the | ||
32 | * scheduling policy and relative POSIX attributes: | ||
33 | * | ||
34 | * @size size of the structure, for fwd/bwd compat. | ||
35 | * | ||
36 | * @sched_policy task's scheduling policy | ||
37 | * @sched_nice task's nice value (SCHED_NORMAL/BATCH) | ||
38 | * @sched_priority task's static priority (SCHED_FIFO/RR) | ||
39 | * | ||
40 | * Certain more advanced scheduling features can be controlled by a | ||
41 | * predefined set of flags via the attribute: | ||
42 | * | ||
43 | * @sched_flags for customizing the scheduler behaviour | ||
44 | * | ||
45 | * Sporadic Time-Constrained Task Attributes | ||
46 | * ========================================= | ||
47 | * | ||
48 | * A subset of sched_attr attributes allows to describe a so-called | ||
49 | * sporadic time-constrained task. | ||
50 | * | ||
51 | * In such a model a task is specified by: | ||
26 | * - the activation period or minimum instance inter-arrival time; | 52 | * - the activation period or minimum instance inter-arrival time; |
27 | * - the maximum (or average, depending on the actual scheduling | 53 | * - the maximum (or average, depending on the actual scheduling |
28 | * discipline) computation time of all instances, a.k.a. runtime; | 54 | * discipline) computation time of all instances, a.k.a. runtime; |
@@ -34,14 +60,8 @@ struct sched_param { | |||
34 | * than the runtime and must be completed by time instant t equal to | 60 | * than the runtime and must be completed by time instant t equal to |
35 | * the instance activation time + the deadline. | 61 | * the instance activation time + the deadline. |
36 | * | 62 | * |
37 | * This is reflected by the actual fields of the sched_attr structure: | 63 | * This is reflected by the following fields of the sched_attr structure: |
38 | * | 64 | * |
39 | * @size size of the structure, for fwd/bwd compat. | ||
40 | * | ||
41 | * @sched_policy task's scheduling policy | ||
42 | * @sched_flags for customizing the scheduler behaviour | ||
43 | * @sched_nice task's nice value (SCHED_NORMAL/BATCH) | ||
44 | * @sched_priority task's static priority (SCHED_FIFO/RR) | ||
45 | * @sched_deadline representative of the task's deadline | 65 | * @sched_deadline representative of the task's deadline |
46 | * @sched_runtime representative of the task's runtime | 66 | * @sched_runtime representative of the task's runtime |
47 | * @sched_period representative of the task's period | 67 | * @sched_period representative of the task's period |
@@ -53,6 +73,29 @@ struct sched_param { | |||
53 | * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the | 73 | * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the |
54 | * only user of this new interface. More information about the algorithm | 74 | * only user of this new interface. More information about the algorithm |
55 | * available in the scheduling class file or in Documentation/. | 75 | * available in the scheduling class file or in Documentation/. |
76 | * | ||
77 | * Task Utilization Attributes | ||
78 | * =========================== | ||
79 | * | ||
80 | * A subset of sched_attr attributes allows to specify the utilization | ||
81 | * expected for a task. These attributes allow to inform the scheduler about | ||
82 | * the utilization boundaries within which it should schedule the task. These | ||
83 | * boundaries are valuable hints to support scheduler decisions on both task | ||
84 | * placement and frequency selection. | ||
85 | * | ||
86 | * @sched_util_min represents the minimum utilization | ||
87 | * @sched_util_max represents the maximum utilization | ||
88 | * | ||
89 | * Utilization is a value in the range [0..SCHED_CAPACITY_SCALE]. It | ||
90 | * represents the percentage of CPU time used by a task when running at the | ||
91 | * maximum frequency on the highest capacity CPU of the system. For example, a | ||
92 | * 20% utilization task is a task running for 2ms every 10ms at maximum | ||
93 | * frequency. | ||
94 | * | ||
95 | * A task with a min utilization value bigger than 0 is more likely scheduled | ||
96 | * on a CPU with a capacity big enough to fit the specified value. | ||
97 | * A task with a max utilization value smaller than 1024 is more likely | ||
98 | * scheduled on a CPU with no more capacity than the specified value. | ||
56 | */ | 99 | */ |
57 | struct sched_attr { | 100 | struct sched_attr { |
58 | __u32 size; | 101 | __u32 size; |
@@ -70,6 +113,11 @@ struct sched_attr { | |||
70 | __u64 sched_runtime; | 113 | __u64 sched_runtime; |
71 | __u64 sched_deadline; | 114 | __u64 sched_deadline; |
72 | __u64 sched_period; | 115 | __u64 sched_period; |
116 | |||
117 | /* Utilization hints */ | ||
118 | __u32 sched_util_min; | ||
119 | __u32 sched_util_max; | ||
120 | |||
73 | }; | 121 | }; |
74 | 122 | ||
75 | #endif /* _UAPI_LINUX_SCHED_TYPES_H */ | 123 | #endif /* _UAPI_LINUX_SCHED_TYPES_H */ |
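glibc provides no wrapper for sched_setattr(), so userspace reaches the new utilization hints through the raw syscall. Below is a minimal sketch of one plausible use of the new flags: request a roughly 20% minimum utilization for the calling thread while keeping its current policy and parameters. The structure and flag values simply mirror the UAPI definitions above; error handling is reduced to a perror():

```c
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Mirrors the UAPI struct sched_attr above (VER0 fields + utilization hints). */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* new in SCHED_ATTR_SIZE_VER1 */
	uint32_t sched_util_max;	/* new in SCHED_ATTR_SIZE_VER1 */
};

#define SCHED_FLAG_KEEP_ALL		0x18	/* KEEP_POLICY | KEEP_PARAMS */
#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),	/* 56 == SCHED_ATTR_SIZE_VER1 */
		.sched_flags	= SCHED_FLAG_KEEP_ALL | SCHED_FLAG_UTIL_CLAMP_MIN,
		.sched_util_min	= 205,		/* ~20% of SCHED_CAPACITY_SCALE (1024) */
	};

	if (syscall(SYS_sched_setattr, 0, &attr, 0))	/* pid 0 == calling thread */
		perror("sched_setattr");
	return 0;
}
```

The sizes add up as documented: the VER0 fields total 48 bytes and the two __u32 hints bring the structure to the 56 bytes of SCHED_ATTR_SIZE_VER1.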
diff --git a/init/Kconfig b/init/Kconfig index 0e2344389501..c88289c18d59 100644 --- a/init/Kconfig +++ b/init/Kconfig | |||
@@ -677,6 +677,59 @@ config HAVE_UNSTABLE_SCHED_CLOCK | |||
677 | config GENERIC_SCHED_CLOCK | 677 | config GENERIC_SCHED_CLOCK |
678 | bool | 678 | bool |
679 | 679 | ||
680 | menu "Scheduler features" | ||
681 | |||
682 | config UCLAMP_TASK | ||
683 | bool "Enable utilization clamping for RT/FAIR tasks" | ||
684 | depends on CPU_FREQ_GOV_SCHEDUTIL | ||
685 | help | ||
686 | This feature enables the scheduler to track the clamped utilization | ||
687 | of each CPU based on RUNNABLE tasks scheduled on that CPU. | ||
688 | |||
689 | With this option, the user can specify the min and max CPU | ||
690 | utilization allowed for RUNNABLE tasks. The max utilization defines | ||
691 | the maximum frequency a task should use while the min utilization | ||
692 | defines the minimum frequency it should use. | ||
693 | |||
694 | Both min and max utilization clamp values are hints to the scheduler, | ||
695 | aiming at improving its frequency selection policy, but they do not | ||
696 | enforce or grant any specific bandwidth for tasks. | ||
697 | |||
698 | If in doubt, say N. | ||
699 | |||
700 | config UCLAMP_BUCKETS_COUNT | ||
701 | int "Number of supported utilization clamp buckets" | ||
702 | range 5 20 | ||
703 | default 5 | ||
704 | depends on UCLAMP_TASK | ||
705 | help | ||
706 | Defines the number of clamp buckets to use. The range of each bucket | ||
707 | will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the | ||
708 | number of clamp buckets the finer their granularity and the higher | ||
709 | the precision of clamping aggregation and tracking at run-time. | ||
710 | |||
711 | For example, with the minimum configuration value we will have 5 | ||
712 | clamp buckets tracking 20% utilization each. A 25% boosted tasks will | ||
713 | be refcounted in the [20..39]% bucket and will set the bucket clamp | ||
714 | effective value to 25%. | ||
715 | If a second 30% boosted task should be co-scheduled on the same CPU, | ||
716 | that task will be refcounted in the same bucket of the first task and | ||
717 | it will boost the bucket clamp effective value to 30%. | ||
718 | The clamp effective value of a bucket is reset to its nominal value | ||
719 | (20% in the example above) when there are no more tasks refcounted in | ||
720 | that bucket. | ||
721 | |||
722 | An additional boost/capping margin can be added to some tasks. In the | ||
723 | example above the 25% task will be boosted to 30% until it exits the | ||
724 | CPU. If that should be considered not acceptable on certain systems, | ||
725 | it's always possible to reduce the margin by increasing the number of | ||
726 | clamp buckets to trade off used memory for run-time tracking | ||
727 | precision. | ||
728 | |||
729 | If in doubt, use the default value. | ||
730 | |||
731 | endmenu | ||
732 | |||
680 | # | 733 | # |
681 | # For architectures that want to enable the support for NUMA-affine scheduler | 734 | # For architectures that want to enable the support for NUMA-affine scheduler |
682 | # balancing logic: | 735 | # balancing logic: |
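The bucket arithmetic in the UCLAMP_BUCKETS_COUNT help text is easy to check by hand. The sketch below mirrors the UCLAMP_BUCKET_DELTA and uclamp_bucket_id() definitions from the kernel/sched/core.c hunk further down and reproduces the 25%/30% example with the default of 5 buckets (a standalone illustration, not kernel code):

```c
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define UCLAMP_BUCKETS		5	/* CONFIG_UCLAMP_BUCKETS_COUNT default */

/* Same rounding as the kernel's DIV_ROUND_CLOSEST() for positive operands. */
#define UCLAMP_BUCKET_DELTA \
	((SCHED_CAPACITY_SCALE + UCLAMP_BUCKETS / 2) / UCLAMP_BUCKETS)

static unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
	return clamp_value / UCLAMP_BUCKET_DELTA;
}

int main(void)
{
	/* The 25% and 30% boosts from the help text, as absolute clamp values. */
	unsigned int boost25 = 25 * SCHED_CAPACITY_SCALE / 100;	/* 256 */
	unsigned int boost30 = 30 * SCHED_CAPACITY_SCALE / 100;	/* 307 */

	printf("bucket delta     : %u\n", UCLAMP_BUCKET_DELTA);		/* 205 */
	printf("25%% boost bucket : %u\n", uclamp_bucket_id(boost25));	/* 1 */
	printf("30%% boost bucket : %u\n", uclamp_bucket_id(boost30));	/* 1 */
	return 0;
}
```

Both boosts land in bucket 1 (clamp values 205..409, roughly the [20..39]% range), which is why the help text describes the second task as being refcounted in the same bucket as the first.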
diff --git a/init/init_task.c b/init/init_task.c index afa6ad795355..7ab773b9b3cd 100644 --- a/init/init_task.c +++ b/init/init_task.c | |||
@@ -72,7 +72,8 @@ struct task_struct init_task | |||
72 | .static_prio = MAX_PRIO - 20, | 72 | .static_prio = MAX_PRIO - 20, |
73 | .normal_prio = MAX_PRIO - 20, | 73 | .normal_prio = MAX_PRIO - 20, |
74 | .policy = SCHED_NORMAL, | 74 | .policy = SCHED_NORMAL, |
75 | .cpus_allowed = CPU_MASK_ALL, | 75 | .cpus_ptr = &init_task.cpus_mask, |
76 | .cpus_mask = CPU_MASK_ALL, | ||
76 | .nr_cpus_allowed= NR_CPUS, | 77 | .nr_cpus_allowed= NR_CPUS, |
77 | .mm = NULL, | 78 | .mm = NULL, |
78 | .active_mm = &init_mm, | 79 | .active_mm = &init_mm, |
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 515525ff1cfd..a1590e244f5f 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c | |||
@@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task) | |||
2829 | if (task_css_is_root(task, cpuset_cgrp_id)) | 2829 | if (task_css_is_root(task, cpuset_cgrp_id)) |
2830 | return; | 2830 | return; |
2831 | 2831 | ||
2832 | set_cpus_allowed_ptr(task, ¤t->cpus_allowed); | 2832 | set_cpus_allowed_ptr(task, current->cpus_ptr); |
2833 | task->mems_allowed = current->mems_allowed; | 2833 | task->mems_allowed = current->mems_allowed; |
2834 | } | 2834 | } |
2835 | 2835 | ||
diff --git a/kernel/fork.c b/kernel/fork.c index d18e343d4aab..847dd147b068 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -898,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
898 | #ifdef CONFIG_STACKPROTECTOR | 898 | #ifdef CONFIG_STACKPROTECTOR |
899 | tsk->stack_canary = get_random_canary(); | 899 | tsk->stack_canary = get_random_canary(); |
900 | #endif | 900 | #endif |
901 | if (orig->cpus_ptr == &orig->cpus_mask) | ||
902 | tsk->cpus_ptr = &tsk->cpus_mask; | ||
901 | 903 | ||
902 | /* | 904 | /* |
903 | * One for us, one for whoever does the "release_task()" (usually | 905 | * One for us, one for whoever does the "release_task()" (usually |
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index 7d66ee68aaaf..0a9326f5f421 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c | |||
@@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states, | |||
223 | * All CPUs of a domain must have the same micro-architecture | 223 | * All CPUs of a domain must have the same micro-architecture |
224 | * since they all share the same table. | 224 | * since they all share the same table. |
225 | */ | 225 | */ |
226 | cap = arch_scale_cpu_capacity(NULL, cpu); | 226 | cap = arch_scale_cpu_capacity(cpu); |
227 | if (prev_cap && prev_cap != cap) { | 227 | if (prev_cap && prev_cap != cap) { |
228 | pr_err("CPUs of %*pbl must have the same capacity\n", | 228 | pr_err("CPUs of %*pbl must have the same capacity\n", |
229 | cpumask_pr_args(span)); | 229 | cpumask_pr_args(span)); |
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 2d4ff5353ded..2067080bb235 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c | |||
@@ -259,7 +259,6 @@ out: | |||
259 | } | 259 | } |
260 | #endif /* CONFIG_PROC_FS */ | 260 | #endif /* CONFIG_PROC_FS */ |
261 | 261 | ||
262 | #ifdef CONFIG_SCHED_DEBUG | ||
263 | int autogroup_path(struct task_group *tg, char *buf, int buflen) | 262 | int autogroup_path(struct task_group *tg, char *buf, int buflen) |
264 | { | 263 | { |
265 | if (!task_group_is_autogroup(tg)) | 264 | if (!task_group_is_autogroup(tg)) |
@@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) | |||
267 | 266 | ||
268 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 267 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
269 | } | 268 | } |
270 | #endif | ||
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 874c427742a9..fa43ce3962e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -23,6 +23,17 @@ | |||
23 | #define CREATE_TRACE_POINTS | 23 | #define CREATE_TRACE_POINTS |
24 | #include <trace/events/sched.h> | 24 | #include <trace/events/sched.h> |
25 | 25 | ||
26 | /* | ||
27 | * Export tracepoints that act as a bare tracehook (ie: have no trace event | ||
28 | * associated with them) to allow external modules to probe them. | ||
29 | */ | ||
30 | EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp); | ||
31 | EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp); | ||
32 | EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp); | ||
33 | EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); | ||
34 | EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp); | ||
35 | EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); | ||
36 | |||
26 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 37 | DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
27 | 38 | ||
28 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) | 39 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) |
@@ -761,6 +772,401 @@ static void set_load_weight(struct task_struct *p, bool update_load) | |||
761 | } | 772 | } |
762 | } | 773 | } |
763 | 774 | ||
775 | #ifdef CONFIG_UCLAMP_TASK | ||
776 | /* Max allowed minimum utilization */ | ||
777 | unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; | ||
778 | |||
779 | /* Max allowed maximum utilization */ | ||
780 | unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE; | ||
781 | |||
782 | /* All clamps are required to be less or equal than these values */ | ||
783 | static struct uclamp_se uclamp_default[UCLAMP_CNT]; | ||
784 | |||
785 | /* Integer rounded range for each bucket */ | ||
786 | #define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) | ||
787 | |||
788 | #define for_each_clamp_id(clamp_id) \ | ||
789 | for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++) | ||
790 | |||
791 | static inline unsigned int uclamp_bucket_id(unsigned int clamp_value) | ||
792 | { | ||
793 | return clamp_value / UCLAMP_BUCKET_DELTA; | ||
794 | } | ||
795 | |||
796 | static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) | ||
797 | { | ||
798 | return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); | ||
799 | } | ||
800 | |||
801 | static inline unsigned int uclamp_none(int clamp_id) | ||
802 | { | ||
803 | if (clamp_id == UCLAMP_MIN) | ||
804 | return 0; | ||
805 | return SCHED_CAPACITY_SCALE; | ||
806 | } | ||
807 | |||
808 | static inline void uclamp_se_set(struct uclamp_se *uc_se, | ||
809 | unsigned int value, bool user_defined) | ||
810 | { | ||
811 | uc_se->value = value; | ||
812 | uc_se->bucket_id = uclamp_bucket_id(value); | ||
813 | uc_se->user_defined = user_defined; | ||
814 | } | ||
815 | |||
816 | static inline unsigned int | ||
817 | uclamp_idle_value(struct rq *rq, unsigned int clamp_id, | ||
818 | unsigned int clamp_value) | ||
819 | { | ||
820 | /* | ||
821 | * Avoid blocked utilization pushing up the frequency when we go | ||
822 | * idle (which drops the max-clamp) by retaining the last known | ||
823 | * max-clamp. | ||
824 | */ | ||
825 | if (clamp_id == UCLAMP_MAX) { | ||
826 | rq->uclamp_flags |= UCLAMP_FLAG_IDLE; | ||
827 | return clamp_value; | ||
828 | } | ||
829 | |||
830 | return uclamp_none(UCLAMP_MIN); | ||
831 | } | ||
832 | |||
833 | static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, | ||
834 | unsigned int clamp_value) | ||
835 | { | ||
836 | /* Reset max-clamp retention only on idle exit */ | ||
837 | if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) | ||
838 | return; | ||
839 | |||
840 | WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); | ||
841 | } | ||
842 | |||
843 | static inline | ||
844 | unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, | ||
845 | unsigned int clamp_value) | ||
846 | { | ||
847 | struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; | ||
848 | int bucket_id = UCLAMP_BUCKETS - 1; | ||
849 | |||
850 | /* | ||
851 | * Since both min and max clamps are max aggregated, find the | ||
852 | * top most bucket with tasks in. | ||
853 | */ | ||
854 | for ( ; bucket_id >= 0; bucket_id--) { | ||
855 | if (!bucket[bucket_id].tasks) | ||
856 | continue; | ||
857 | return bucket[bucket_id].value; | ||
858 | } | ||
859 | |||
860 | /* No tasks -- default clamp values */ | ||
861 | return uclamp_idle_value(rq, clamp_id, clamp_value); | ||
862 | } | ||
863 | |||
864 | /* | ||
865 | * The effective clamp bucket index of a task depends on, by increasing | ||
866 | * priority: | ||
867 | * - the task specific clamp value, when explicitly requested from userspace | ||
868 | * - the system default clamp value, defined by the sysadmin | ||
869 | */ | ||
870 | static inline struct uclamp_se | ||
871 | uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) | ||
872 | { | ||
873 | struct uclamp_se uc_req = p->uclamp_req[clamp_id]; | ||
874 | struct uclamp_se uc_max = uclamp_default[clamp_id]; | ||
875 | |||
876 | /* System default restrictions always apply */ | ||
877 | if (unlikely(uc_req.value > uc_max.value)) | ||
878 | return uc_max; | ||
879 | |||
880 | return uc_req; | ||
881 | } | ||
882 | |||
883 | unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) | ||
884 | { | ||
885 | struct uclamp_se uc_eff; | ||
886 | |||
887 | /* Task currently refcounted: use back-annotated (effective) value */ | ||
888 | if (p->uclamp[clamp_id].active) | ||
889 | return p->uclamp[clamp_id].value; | ||
890 | |||
891 | uc_eff = uclamp_eff_get(p, clamp_id); | ||
892 | |||
893 | return uc_eff.value; | ||
894 | } | ||
895 | |||
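As a plain-C illustration of the effective-value resolution above: the task's request is honoured only as long as it does not exceed the system default, which always acts as an upper restriction. This is a sketch of the policy only; the real code returns the whole struct uclamp_se so the bucket_id travels with the value.

#include <stdio.h>

struct clamp { unsigned int value; };

/* System default restrictions always apply: the request is capped by it. */
static struct clamp clamp_effective(struct clamp req, struct clamp sys_max)
{
    return req.value > sys_max.value ? sys_max : req;
}

int main(void)
{
    struct clamp req = { 800 }, sys_max = { 512 };

    /* A boost request above the system default is clipped to 512. */
    printf("effective = %u\n", clamp_effective(req, sys_max).value);
    return 0;
}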
896 | /* | ||
897 | * When a task is enqueued on a rq, the clamp bucket currently defined by the | ||
898 | * task's uclamp::bucket_id is refcounted on that rq. This also immediately | ||
899 | * updates the rq's clamp value if required. | ||
900 | * | ||
901 | * Tasks can have a task-specific value requested from user-space; within | ||
902 | * each bucket we track the maximum value of the tasks refcounted in it. | ||
903 | * This "local max aggregation" allows tracking the exact "requested" value | ||
904 | * for each bucket when all its RUNNABLE tasks require the same clamp. | ||
905 | */ | ||
906 | static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, | ||
907 | unsigned int clamp_id) | ||
908 | { | ||
909 | struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; | ||
910 | struct uclamp_se *uc_se = &p->uclamp[clamp_id]; | ||
911 | struct uclamp_bucket *bucket; | ||
912 | |||
913 | lockdep_assert_held(&rq->lock); | ||
914 | |||
915 | /* Update task effective clamp */ | ||
916 | p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id); | ||
917 | |||
918 | bucket = &uc_rq->bucket[uc_se->bucket_id]; | ||
919 | bucket->tasks++; | ||
920 | uc_se->active = true; | ||
921 | |||
922 | uclamp_idle_reset(rq, clamp_id, uc_se->value); | ||
923 | |||
924 | /* | ||
925 | * Local max aggregation: rq buckets always track the max | ||
926 | * "requested" clamp value of its RUNNABLE tasks. | ||
927 | */ | ||
928 | if (bucket->tasks == 1 || uc_se->value > bucket->value) | ||
929 | bucket->value = uc_se->value; | ||
930 | |||
931 | if (uc_se->value > READ_ONCE(uc_rq->value)) | ||
932 | WRITE_ONCE(uc_rq->value, uc_se->value); | ||
933 | } | ||
934 | |||
935 | /* | ||
936 | * When a task is dequeued from a rq, the clamp bucket refcounted by the task | ||
937 | * is released. If this is the last task reference counting the rq's max | ||
938 | * active clamp value, then the rq's clamp value is updated. | ||
939 | * | ||
940 | * Both refcounted tasks and the rq's cached clamp values are expected to | ||
941 | * always be valid. If they are detected not to be, enforce the expected | ||
942 | * state and warn, as defensive programming. | ||
943 | */ | ||
944 | static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, | ||
945 | unsigned int clamp_id) | ||
946 | { | ||
947 | struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; | ||
948 | struct uclamp_se *uc_se = &p->uclamp[clamp_id]; | ||
949 | struct uclamp_bucket *bucket; | ||
950 | unsigned int bkt_clamp; | ||
951 | unsigned int rq_clamp; | ||
952 | |||
953 | lockdep_assert_held(&rq->lock); | ||
954 | |||
955 | bucket = &uc_rq->bucket[uc_se->bucket_id]; | ||
956 | SCHED_WARN_ON(!bucket->tasks); | ||
957 | if (likely(bucket->tasks)) | ||
958 | bucket->tasks--; | ||
959 | uc_se->active = false; | ||
960 | |||
961 | /* | ||
962 | * Keep "local max aggregation" simple and accept possibly | ||
963 | * overboosting some RUNNABLE tasks in the same bucket. | ||
964 | * The rq clamp bucket value is reset to its base value whenever | ||
965 | * there are no more RUNNABLE tasks refcounting it. | ||
966 | */ | ||
967 | if (likely(bucket->tasks)) | ||
968 | return; | ||
969 | |||
970 | rq_clamp = READ_ONCE(uc_rq->value); | ||
971 | /* | ||
972 | * Defensive programming: this should never happen. If it happens, | ||
973 | * e.g. due to future modification, warn and fixup the expected value. | ||
974 | */ | ||
975 | SCHED_WARN_ON(bucket->value > rq_clamp); | ||
976 | if (bucket->value >= rq_clamp) { | ||
977 | bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); | ||
978 | WRITE_ONCE(uc_rq->value, bkt_clamp); | ||
979 | } | ||
980 | } | ||
981 | |||
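The enqueue/dequeue pair above is easiest to see as a small refcounting state machine: enqueue bumps the bucket's task count and may raise both the bucket's local max and the rq-wide max, while dequeue only recomputes the rq-wide max when the last task leaves the bucket that was holding it. The sketch below is a user-space model of that state machine under assumed constants (5 buckets, ~1024 capacity scale). It deliberately ignores locking, and its no-task fallback simply drops to zero, whereas the kernel goes through uclamp_idle_value() so an emptied max clamp is retained until idle exit.

#include <stdio.h>

#define NUM_BUCKETS   5
#define BUCKET_DELTA  205   /* assumed: ~1024 / 5, rounded to closest */

struct bucket { unsigned int tasks; unsigned int value; };

struct rq_clamp {
    struct bucket bucket[NUM_BUCKETS];
    unsigned int value;         /* rq-wide max of the active buckets */
};

static unsigned int bucket_id(unsigned int v)
{
    unsigned int id = v / BUCKET_DELTA;

    return id < NUM_BUCKETS - 1 ? id : NUM_BUCKETS - 1;
}

/* Enqueue: refcount the task's bucket and track the local/rq-wide max. */
static void rq_inc(struct rq_clamp *rc, unsigned int task_clamp)
{
    struct bucket *b = &rc->bucket[bucket_id(task_clamp)];

    b->tasks++;
    if (b->tasks == 1 || task_clamp > b->value)
        b->value = task_clamp;
    if (task_clamp > rc->value)
        rc->value = task_clamp;
}

/* Dequeue: drop the refcount; if the max holder emptied, rescan top-down. */
static void rq_dec(struct rq_clamp *rc, unsigned int task_clamp)
{
    struct bucket *b = &rc->bucket[bucket_id(task_clamp)];
    int id;

    if (b->tasks)
        b->tasks--;
    if (b->tasks)
        return;

    if (b->value >= rc->value) {
        rc->value = 0;  /* no tasks left: fall back to "no boost" here */
        for (id = NUM_BUCKETS - 1; id >= 0; id--) {
            if (rc->bucket[id].tasks) {
                rc->value = rc->bucket[id].value;
                break;
            }
        }
    }
}

int main(void)
{
    static struct rq_clamp rc;

    rq_inc(&rc, 300);
    rq_inc(&rc, 900);
    printf("rq clamp after enqueues: %u\n", rc.value);  /* 900 */
    rq_dec(&rc, 900);
    printf("rq clamp after dequeue : %u\n", rc.value);  /* 300 */
    return 0;
}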
982 | static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) | ||
983 | { | ||
984 | unsigned int clamp_id; | ||
985 | |||
986 | if (unlikely(!p->sched_class->uclamp_enabled)) | ||
987 | return; | ||
988 | |||
989 | for_each_clamp_id(clamp_id) | ||
990 | uclamp_rq_inc_id(rq, p, clamp_id); | ||
991 | |||
992 | /* Reset clamp idle holding when there is one RUNNABLE task */ | ||
993 | if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) | ||
994 | rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE; | ||
995 | } | ||
996 | |||
997 | static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) | ||
998 | { | ||
999 | unsigned int clamp_id; | ||
1000 | |||
1001 | if (unlikely(!p->sched_class->uclamp_enabled)) | ||
1002 | return; | ||
1003 | |||
1004 | for_each_clamp_id(clamp_id) | ||
1005 | uclamp_rq_dec_id(rq, p, clamp_id); | ||
1006 | } | ||
1007 | |||
1008 | int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, | ||
1009 | void __user *buffer, size_t *lenp, | ||
1010 | loff_t *ppos) | ||
1011 | { | ||
1012 | int old_min, old_max; | ||
1013 | static DEFINE_MUTEX(mutex); | ||
1014 | int result; | ||
1015 | |||
1016 | mutex_lock(&mutex); | ||
1017 | old_min = sysctl_sched_uclamp_util_min; | ||
1018 | old_max = sysctl_sched_uclamp_util_max; | ||
1019 | |||
1020 | result = proc_dointvec(table, write, buffer, lenp, ppos); | ||
1021 | if (result) | ||
1022 | goto undo; | ||
1023 | if (!write) | ||
1024 | goto done; | ||
1025 | |||
1026 | if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max || | ||
1027 | sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) { | ||
1028 | result = -EINVAL; | ||
1029 | goto undo; | ||
1030 | } | ||
1031 | |||
1032 | if (old_min != sysctl_sched_uclamp_util_min) { | ||
1033 | uclamp_se_set(&uclamp_default[UCLAMP_MIN], | ||
1034 | sysctl_sched_uclamp_util_min, false); | ||
1035 | } | ||
1036 | if (old_max != sysctl_sched_uclamp_util_max) { | ||
1037 | uclamp_se_set(&uclamp_default[UCLAMP_MAX], | ||
1038 | sysctl_sched_uclamp_util_max, false); | ||
1039 | } | ||
1040 | |||
1041 | /* | ||
1042 | * Updating all the RUNNABLE tasks is expensive; keep it simple and just | ||
1043 | * do a lazy update at the next enqueue time. | ||
1044 | */ | ||
1045 | goto done; | ||
1046 | |||
1047 | undo: | ||
1048 | sysctl_sched_uclamp_util_min = old_min; | ||
1049 | sysctl_sched_uclamp_util_max = old_max; | ||
1050 | done: | ||
1051 | mutex_unlock(&mutex); | ||
1052 | |||
1053 | return result; | ||
1054 | } | ||
1055 | |||
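The handler above follows a common parse-validate-or-rollback pattern: snapshot the old values, let the generic parser overwrite the variables, then restore the snapshot whenever parsing or the cross-field validation (min <= max <= capacity) fails. A stripped-down user-space rendering of the same pattern, with an assumed capacity scale of 1024, might look like this:

#include <stdio.h>

#define CAPACITY_SCALE 1024

static unsigned int util_min = 0;
static unsigned int util_max = CAPACITY_SCALE;

/* Returns 0 on success; returns -1 and leaves the settings untouched on error. */
static int set_clamp_defaults(unsigned int new_min, unsigned int new_max)
{
    unsigned int old_min = util_min, old_max = util_max;

    /* In the kernel the parser writes the variables first ... */
    util_min = new_min;
    util_max = new_max;

    /* ... and validation failures roll the snapshot back. */
    if (util_min > util_max || util_max > CAPACITY_SCALE) {
        util_min = old_min;
        util_max = old_max;
        return -1;
    }
    return 0;
}

int main(void)
{
    printf("%d\n", set_clamp_defaults(512, 1024));  /* 0: accepted    */
    printf("%d\n", set_clamp_defaults(800, 600));   /* -1: min > max  */
    printf("min=%u max=%u\n", util_min, util_max);  /* still 512/1024 */
    return 0;
}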
1056 | static int uclamp_validate(struct task_struct *p, | ||
1057 | const struct sched_attr *attr) | ||
1058 | { | ||
1059 | unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; | ||
1060 | unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; | ||
1061 | |||
1062 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) | ||
1063 | lower_bound = attr->sched_util_min; | ||
1064 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) | ||
1065 | upper_bound = attr->sched_util_max; | ||
1066 | |||
1067 | if (lower_bound > upper_bound) | ||
1068 | return -EINVAL; | ||
1069 | if (upper_bound > SCHED_CAPACITY_SCALE) | ||
1070 | return -EINVAL; | ||
1071 | |||
1072 | return 0; | ||
1073 | } | ||
1074 | |||
1075 | static void __setscheduler_uclamp(struct task_struct *p, | ||
1076 | const struct sched_attr *attr) | ||
1077 | { | ||
1078 | unsigned int clamp_id; | ||
1079 | |||
1080 | /* | ||
1081 | * On scheduling class change, reset to default clamps for tasks | ||
1082 | * without a task-specific value. | ||
1083 | */ | ||
1084 | for_each_clamp_id(clamp_id) { | ||
1085 | struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; | ||
1086 | unsigned int clamp_value = uclamp_none(clamp_id); | ||
1087 | |||
1088 | /* Keep using defined clamps across class changes */ | ||
1089 | if (uc_se->user_defined) | ||
1090 | continue; | ||
1091 | |||
1092 | /* By default, RT tasks always get 100% boost */ | ||
1093 | if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) | ||
1094 | clamp_value = uclamp_none(UCLAMP_MAX); | ||
1095 | |||
1096 | uclamp_se_set(uc_se, clamp_value, false); | ||
1097 | } | ||
1098 | |||
1099 | if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) | ||
1100 | return; | ||
1101 | |||
1102 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { | ||
1103 | uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], | ||
1104 | attr->sched_util_min, true); | ||
1105 | } | ||
1106 | |||
1107 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { | ||
1108 | uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], | ||
1109 | attr->sched_util_max, true); | ||
1110 | } | ||
1111 | } | ||
1112 | |||
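The rule applied above (and again in uclamp_fork() below) is simply: tasks without a user-defined clamp fall back to the "no clamping" values, except that an RT task's minimum defaults to the full capacity so it keeps running at maximum frequency. A one-function sketch of that default selection, with assumed constants:

#include <stdio.h>

#define CAPACITY_SCALE 1024     /* assumed SCHED_CAPACITY_SCALE */

enum clamp_id { CLAMP_MIN, CLAMP_MAX };

/* Default requested clamp for a task that has no user-defined value. */
static unsigned int default_clamp(enum clamp_id id, int is_rt_task)
{
    if (id == CLAMP_MIN)
        return is_rt_task ? CAPACITY_SCALE : 0; /* RT: 100% boost */
    return CAPACITY_SCALE;                      /* max: no capping */
}

int main(void)
{
    printf("CFS min=%u  RT min=%u  max=%u\n",
           default_clamp(CLAMP_MIN, 0),
           default_clamp(CLAMP_MIN, 1),
           default_clamp(CLAMP_MAX, 0));
    return 0;
}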
1113 | static void uclamp_fork(struct task_struct *p) | ||
1114 | { | ||
1115 | unsigned int clamp_id; | ||
1116 | |||
1117 | for_each_clamp_id(clamp_id) | ||
1118 | p->uclamp[clamp_id].active = false; | ||
1119 | |||
1120 | if (likely(!p->sched_reset_on_fork)) | ||
1121 | return; | ||
1122 | |||
1123 | for_each_clamp_id(clamp_id) { | ||
1124 | unsigned int clamp_value = uclamp_none(clamp_id); | ||
1125 | |||
1126 | /* By default, RT tasks always get 100% boost */ | ||
1127 | if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) | ||
1128 | clamp_value = uclamp_none(UCLAMP_MAX); | ||
1129 | |||
1130 | uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false); | ||
1131 | } | ||
1132 | } | ||
1133 | |||
1134 | static void __init init_uclamp(void) | ||
1135 | { | ||
1136 | struct uclamp_se uc_max = {}; | ||
1137 | unsigned int clamp_id; | ||
1138 | int cpu; | ||
1139 | |||
1140 | for_each_possible_cpu(cpu) { | ||
1141 | memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); | ||
1142 | cpu_rq(cpu)->uclamp_flags = 0; | ||
1143 | } | ||
1144 | |||
1145 | for_each_clamp_id(clamp_id) { | ||
1146 | uclamp_se_set(&init_task.uclamp_req[clamp_id], | ||
1147 | uclamp_none(clamp_id), false); | ||
1148 | } | ||
1149 | |||
1150 | /* System defaults allow max clamp values for both indexes */ | ||
1151 | uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); | ||
1152 | for_each_clamp_id(clamp_id) | ||
1153 | uclamp_default[clamp_id] = uc_max; | ||
1154 | } | ||
1155 | |||
1156 | #else /* CONFIG_UCLAMP_TASK */ | ||
1157 | static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { } | ||
1158 | static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { } | ||
1159 | static inline int uclamp_validate(struct task_struct *p, | ||
1160 | const struct sched_attr *attr) | ||
1161 | { | ||
1162 | return -EOPNOTSUPP; | ||
1163 | } | ||
1164 | static void __setscheduler_uclamp(struct task_struct *p, | ||
1165 | const struct sched_attr *attr) { } | ||
1166 | static inline void uclamp_fork(struct task_struct *p) { } | ||
1167 | static inline void init_uclamp(void) { } | ||
1168 | #endif /* CONFIG_UCLAMP_TASK */ | ||
1169 | |||
764 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | 1170 | static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) |
765 | { | 1171 | { |
766 | if (!(flags & ENQUEUE_NOCLOCK)) | 1172 | if (!(flags & ENQUEUE_NOCLOCK)) |
@@ -771,6 +1177,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) | |||
771 | psi_enqueue(p, flags & ENQUEUE_WAKEUP); | 1177 | psi_enqueue(p, flags & ENQUEUE_WAKEUP); |
772 | } | 1178 | } |
773 | 1179 | ||
1180 | uclamp_rq_inc(rq, p); | ||
774 | p->sched_class->enqueue_task(rq, p, flags); | 1181 | p->sched_class->enqueue_task(rq, p, flags); |
775 | } | 1182 | } |
776 | 1183 | ||
@@ -784,6 +1191,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) | |||
784 | psi_dequeue(p, flags & DEQUEUE_SLEEP); | 1191 | psi_dequeue(p, flags & DEQUEUE_SLEEP); |
785 | } | 1192 | } |
786 | 1193 | ||
1194 | uclamp_rq_dec(rq, p); | ||
787 | p->sched_class->dequeue_task(rq, p, flags); | 1195 | p->sched_class->dequeue_task(rq, p, flags); |
788 | } | 1196 | } |
789 | 1197 | ||
@@ -930,7 +1338,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) | |||
930 | */ | 1338 | */ |
931 | static inline bool is_cpu_allowed(struct task_struct *p, int cpu) | 1339 | static inline bool is_cpu_allowed(struct task_struct *p, int cpu) |
932 | { | 1340 | { |
933 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | 1341 | if (!cpumask_test_cpu(cpu, p->cpus_ptr)) |
934 | return false; | 1342 | return false; |
935 | 1343 | ||
936 | if (is_per_cpu_kthread(p)) | 1344 | if (is_per_cpu_kthread(p)) |
@@ -1025,7 +1433,7 @@ static int migration_cpu_stop(void *data) | |||
1025 | local_irq_disable(); | 1433 | local_irq_disable(); |
1026 | /* | 1434 | /* |
1027 | * We need to explicitly wake pending tasks before running | 1435 | * We need to explicitly wake pending tasks before running |
1028 | * __migrate_task() such that we will not miss enforcing cpus_allowed | 1436 | * __migrate_task() such that we will not miss enforcing cpus_ptr |
1029 | * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. | 1437 | * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. |
1030 | */ | 1438 | */ |
1031 | sched_ttwu_pending(); | 1439 | sched_ttwu_pending(); |
@@ -1056,7 +1464,7 @@ static int migration_cpu_stop(void *data) | |||
1056 | */ | 1464 | */ |
1057 | void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) | 1465 | void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) |
1058 | { | 1466 | { |
1059 | cpumask_copy(&p->cpus_allowed, new_mask); | 1467 | cpumask_copy(&p->cpus_mask, new_mask); |
1060 | p->nr_cpus_allowed = cpumask_weight(new_mask); | 1468 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
1061 | } | 1469 | } |
1062 | 1470 | ||
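Several hunks in this file only rename the reader side from &p->cpus_allowed to p->cpus_ptr, while the single writer above keeps copying into p->cpus_mask. The point of the indirection is that readers dereference a pointer which normally refers to the task's own mask, so special-purpose code can later make it point at a different mask without touching every call site. A hedged sketch of that shape (the field names follow the diff, everything else is a toy stand-in):

#include <stdio.h>

/* Toy stand-ins for cpumask_t and the task_struct fields in this diff. */
typedef struct { unsigned long bits; } cpumask_t;

struct task {
    cpumask_t cpus_mask;        /* the mask the task owns                 */
    const cpumask_t *cpus_ptr;  /* what readers use; normally it is just  */
                                /* &task->cpus_mask                       */
};

static void task_init(struct task *p, unsigned long bits)
{
    p->cpus_mask.bits = bits;
    p->cpus_ptr = &p->cpus_mask;    /* default: point at the own mask */
}

/* The one writer updates the owned mask; readers see it via cpus_ptr. */
static void set_cpus_allowed(struct task *p, unsigned long bits)
{
    p->cpus_mask.bits = bits;
}

static int cpu_allowed(const struct task *p, int cpu)
{
    return !!(p->cpus_ptr->bits & (1UL << cpu));
}

int main(void)
{
    struct task p;

    task_init(&p, 0xf);         /* CPUs 0-3 */
    set_cpus_allowed(&p, 0x2);  /* restrict to CPU 1 */
    printf("cpu0 allowed: %d, cpu1 allowed: %d\n",
           cpu_allowed(&p, 0), cpu_allowed(&p, 1));
    return 0;
}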
@@ -1126,7 +1534,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1126 | goto out; | 1534 | goto out; |
1127 | } | 1535 | } |
1128 | 1536 | ||
1129 | if (cpumask_equal(&p->cpus_allowed, new_mask)) | 1537 | if (cpumask_equal(p->cpus_ptr, new_mask)) |
1130 | goto out; | 1538 | goto out; |
1131 | 1539 | ||
1132 | if (!cpumask_intersects(new_mask, cpu_valid_mask)) { | 1540 | if (!cpumask_intersects(new_mask, cpu_valid_mask)) { |
@@ -1286,10 +1694,10 @@ static int migrate_swap_stop(void *data) | |||
1286 | if (task_cpu(arg->src_task) != arg->src_cpu) | 1694 | if (task_cpu(arg->src_task) != arg->src_cpu) |
1287 | goto unlock; | 1695 | goto unlock; |
1288 | 1696 | ||
1289 | if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) | 1697 | if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) |
1290 | goto unlock; | 1698 | goto unlock; |
1291 | 1699 | ||
1292 | if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) | 1700 | if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) |
1293 | goto unlock; | 1701 | goto unlock; |
1294 | 1702 | ||
1295 | __migrate_swap_task(arg->src_task, arg->dst_cpu); | 1703 | __migrate_swap_task(arg->src_task, arg->dst_cpu); |
@@ -1331,10 +1739,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, | |||
1331 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) | 1739 | if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) |
1332 | goto out; | 1740 | goto out; |
1333 | 1741 | ||
1334 | if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) | 1742 | if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) |
1335 | goto out; | 1743 | goto out; |
1336 | 1744 | ||
1337 | if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) | 1745 | if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) |
1338 | goto out; | 1746 | goto out; |
1339 | 1747 | ||
1340 | trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); | 1748 | trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); |
@@ -1479,7 +1887,7 @@ void kick_process(struct task_struct *p) | |||
1479 | EXPORT_SYMBOL_GPL(kick_process); | 1887 | EXPORT_SYMBOL_GPL(kick_process); |
1480 | 1888 | ||
1481 | /* | 1889 | /* |
1482 | * ->cpus_allowed is protected by both rq->lock and p->pi_lock | 1890 | * ->cpus_ptr is protected by both rq->lock and p->pi_lock |
1483 | * | 1891 | * |
1484 | * A few notes on cpu_active vs cpu_online: | 1892 | * A few notes on cpu_active vs cpu_online: |
1485 | * | 1893 | * |
@@ -1519,14 +1927,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
1519 | for_each_cpu(dest_cpu, nodemask) { | 1927 | for_each_cpu(dest_cpu, nodemask) { |
1520 | if (!cpu_active(dest_cpu)) | 1928 | if (!cpu_active(dest_cpu)) |
1521 | continue; | 1929 | continue; |
1522 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) | 1930 | if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) |
1523 | return dest_cpu; | 1931 | return dest_cpu; |
1524 | } | 1932 | } |
1525 | } | 1933 | } |
1526 | 1934 | ||
1527 | for (;;) { | 1935 | for (;;) { |
1528 | /* Any allowed, online CPU? */ | 1936 | /* Any allowed, online CPU? */ |
1529 | for_each_cpu(dest_cpu, &p->cpus_allowed) { | 1937 | for_each_cpu(dest_cpu, p->cpus_ptr) { |
1530 | if (!is_cpu_allowed(p, dest_cpu)) | 1938 | if (!is_cpu_allowed(p, dest_cpu)) |
1531 | continue; | 1939 | continue; |
1532 | 1940 | ||
@@ -1570,7 +1978,7 @@ out: | |||
1570 | } | 1978 | } |
1571 | 1979 | ||
1572 | /* | 1980 | /* |
1573 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. | 1981 | * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. |
1574 | */ | 1982 | */ |
1575 | static inline | 1983 | static inline |
1576 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | 1984 | int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) |
@@ -1580,11 +1988,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) | |||
1580 | if (p->nr_cpus_allowed > 1) | 1988 | if (p->nr_cpus_allowed > 1) |
1581 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); | 1989 | cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); |
1582 | else | 1990 | else |
1583 | cpu = cpumask_any(&p->cpus_allowed); | 1991 | cpu = cpumask_any(p->cpus_ptr); |
1584 | 1992 | ||
1585 | /* | 1993 | /* |
1586 | * In order not to call set_task_cpu() on a blocking task we need | 1994 | * In order not to call set_task_cpu() on a blocking task we need |
1587 | * to rely on ttwu() to place the task on a valid ->cpus_allowed | 1995 | * to rely on ttwu() to place the task on a valid ->cpus_ptr |
1588 | * CPU. | 1996 | * CPU. |
1589 | * | 1997 | * |
1590 | * Since this is common to all placement strategies, this lives here. | 1998 | * Since this is common to all placement strategies, this lives here. |
@@ -1991,6 +2399,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
1991 | unsigned long flags; | 2399 | unsigned long flags; |
1992 | int cpu, success = 0; | 2400 | int cpu, success = 0; |
1993 | 2401 | ||
2402 | if (p == current) { | ||
2403 | /* | ||
2404 | * We're waking current; this means 'p->on_rq' and 'task_cpu(p) | ||
2405 | * == smp_processor_id()'. Together this means we can special | ||
2406 | * case the whole 'p->on_rq && ttwu_remote()' case below | ||
2407 | * without taking any locks. | ||
2408 | * | ||
2409 | * In particular: | ||
2410 | * - we rely on Program-Order guarantees for all the ordering, | ||
2411 | * - we're serialized against set_special_state() by virtue of | ||
2412 | * it disabling IRQs (this allows not taking ->pi_lock). | ||
2413 | */ | ||
2414 | if (!(p->state & state)) | ||
2415 | return false; | ||
2416 | |||
2417 | success = 1; | ||
2418 | cpu = task_cpu(p); | ||
2419 | trace_sched_waking(p); | ||
2420 | p->state = TASK_RUNNING; | ||
2421 | trace_sched_wakeup(p); | ||
2422 | goto out; | ||
2423 | } | ||
2424 | |||
1994 | /* | 2425 | /* |
1995 | * If we are going to wake up a thread waiting for CONDITION we | 2426 | * If we are going to wake up a thread waiting for CONDITION we |
1996 | * need to ensure that CONDITION=1 done by the caller can not be | 2427 | * need to ensure that CONDITION=1 done by the caller can not be |
@@ -2000,7 +2431,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2000 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 2431 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
2001 | smp_mb__after_spinlock(); | 2432 | smp_mb__after_spinlock(); |
2002 | if (!(p->state & state)) | 2433 | if (!(p->state & state)) |
2003 | goto out; | 2434 | goto unlock; |
2004 | 2435 | ||
2005 | trace_sched_waking(p); | 2436 | trace_sched_waking(p); |
2006 | 2437 | ||
@@ -2030,7 +2461,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2030 | */ | 2461 | */ |
2031 | smp_rmb(); | 2462 | smp_rmb(); |
2032 | if (p->on_rq && ttwu_remote(p, wake_flags)) | 2463 | if (p->on_rq && ttwu_remote(p, wake_flags)) |
2033 | goto stat; | 2464 | goto unlock; |
2034 | 2465 | ||
2035 | #ifdef CONFIG_SMP | 2466 | #ifdef CONFIG_SMP |
2036 | /* | 2467 | /* |
@@ -2090,10 +2521,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | |||
2090 | #endif /* CONFIG_SMP */ | 2521 | #endif /* CONFIG_SMP */ |
2091 | 2522 | ||
2092 | ttwu_queue(p, cpu, wake_flags); | 2523 | ttwu_queue(p, cpu, wake_flags); |
2093 | stat: | 2524 | unlock: |
2094 | ttwu_stat(p, cpu, wake_flags); | ||
2095 | out: | ||
2096 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 2525 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
2526 | out: | ||
2527 | if (success) | ||
2528 | ttwu_stat(p, cpu, wake_flags); | ||
2097 | 2529 | ||
2098 | return success; | 2530 | return success; |
2099 | } | 2531 | } |
@@ -2300,6 +2732,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) | |||
2300 | */ | 2732 | */ |
2301 | p->prio = current->normal_prio; | 2733 | p->prio = current->normal_prio; |
2302 | 2734 | ||
2735 | uclamp_fork(p); | ||
2736 | |||
2303 | /* | 2737 | /* |
2304 | * Revert to default priority/policy on fork if requested. | 2738 | * Revert to default priority/policy on fork if requested. |
2305 | */ | 2739 | */ |
@@ -2395,7 +2829,7 @@ void wake_up_new_task(struct task_struct *p) | |||
2395 | #ifdef CONFIG_SMP | 2829 | #ifdef CONFIG_SMP |
2396 | /* | 2830 | /* |
2397 | * Fork balancing, do it here and not earlier because: | 2831 | * Fork balancing, do it here and not earlier because: |
2398 | * - cpus_allowed can change in the fork path | 2832 | * - cpus_ptr can change in the fork path |
2399 | * - any previously selected CPU might disappear through hotplug | 2833 | * - any previously selected CPU might disappear through hotplug |
2400 | * | 2834 | * |
2401 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, | 2835 | * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, |
@@ -3033,7 +3467,6 @@ void scheduler_tick(void) | |||
3033 | 3467 | ||
3034 | update_rq_clock(rq); | 3468 | update_rq_clock(rq); |
3035 | curr->sched_class->task_tick(rq, curr, 0); | 3469 | curr->sched_class->task_tick(rq, curr, 0); |
3036 | cpu_load_update_active(rq); | ||
3037 | calc_global_load_tick(rq); | 3470 | calc_global_load_tick(rq); |
3038 | psi_task_tick(rq); | 3471 | psi_task_tick(rq); |
3039 | 3472 | ||
@@ -4071,6 +4504,13 @@ static void __setscheduler_params(struct task_struct *p, | |||
4071 | static void __setscheduler(struct rq *rq, struct task_struct *p, | 4504 | static void __setscheduler(struct rq *rq, struct task_struct *p, |
4072 | const struct sched_attr *attr, bool keep_boost) | 4505 | const struct sched_attr *attr, bool keep_boost) |
4073 | { | 4506 | { |
4507 | /* | ||
4508 | * If params can't change, scheduling class changes aren't allowed | ||
4509 | * either. | ||
4510 | */ | ||
4511 | if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) | ||
4512 | return; | ||
4513 | |||
4074 | __setscheduler_params(p, attr); | 4514 | __setscheduler_params(p, attr); |
4075 | 4515 | ||
4076 | /* | 4516 | /* |
@@ -4208,6 +4648,13 @@ recheck: | |||
4208 | return retval; | 4648 | return retval; |
4209 | } | 4649 | } |
4210 | 4650 | ||
4651 | /* Update task specific "requested" clamps */ | ||
4652 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) { | ||
4653 | retval = uclamp_validate(p, attr); | ||
4654 | if (retval) | ||
4655 | return retval; | ||
4656 | } | ||
4657 | |||
4211 | /* | 4658 | /* |
4212 | * Make sure no PI-waiters arrive (or leave) while we are | 4659 | * Make sure no PI-waiters arrive (or leave) while we are |
4213 | * changing the priority of the task: | 4660 | * changing the priority of the task: |
@@ -4237,6 +4684,8 @@ recheck: | |||
4237 | goto change; | 4684 | goto change; |
4238 | if (dl_policy(policy) && dl_param_changed(p, attr)) | 4685 | if (dl_policy(policy) && dl_param_changed(p, attr)) |
4239 | goto change; | 4686 | goto change; |
4687 | if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) | ||
4688 | goto change; | ||
4240 | 4689 | ||
4241 | p->sched_reset_on_fork = reset_on_fork; | 4690 | p->sched_reset_on_fork = reset_on_fork; |
4242 | task_rq_unlock(rq, p, &rf); | 4691 | task_rq_unlock(rq, p, &rf); |
@@ -4267,7 +4716,7 @@ change: | |||
4267 | * the entire root_domain to become SCHED_DEADLINE. We | 4716 | * the entire root_domain to become SCHED_DEADLINE. We |
4268 | * will also fail if there's no bandwidth available. | 4717 | * will also fail if there's no bandwidth available. |
4269 | */ | 4718 | */ |
4270 | if (!cpumask_subset(span, &p->cpus_allowed) || | 4719 | if (!cpumask_subset(span, p->cpus_ptr) || |
4271 | rq->rd->dl_bw.bw == 0) { | 4720 | rq->rd->dl_bw.bw == 0) { |
4272 | task_rq_unlock(rq, p, &rf); | 4721 | task_rq_unlock(rq, p, &rf); |
4273 | return -EPERM; | 4722 | return -EPERM; |
@@ -4317,7 +4766,9 @@ change: | |||
4317 | put_prev_task(rq, p); | 4766 | put_prev_task(rq, p); |
4318 | 4767 | ||
4319 | prev_class = p->sched_class; | 4768 | prev_class = p->sched_class; |
4769 | |||
4320 | __setscheduler(rq, p, attr, pi); | 4770 | __setscheduler(rq, p, attr, pi); |
4771 | __setscheduler_uclamp(p, attr); | ||
4321 | 4772 | ||
4322 | if (queued) { | 4773 | if (queued) { |
4323 | /* | 4774 | /* |
@@ -4493,6 +4944,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a | |||
4493 | if (ret) | 4944 | if (ret) |
4494 | return -EFAULT; | 4945 | return -EFAULT; |
4495 | 4946 | ||
4947 | if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) && | ||
4948 | size < SCHED_ATTR_SIZE_VER1) | ||
4949 | return -EINVAL; | ||
4950 | |||
4496 | /* | 4951 | /* |
4497 | * XXX: Do we want to be lenient like existing syscalls; or do we want | 4952 | * XXX: Do we want to be lenient like existing syscalls; or do we want |
4498 | * to be strict and return an error on out-of-bounds values? | 4953 | * to be strict and return an error on out-of-bounds values? |
@@ -4556,14 +5011,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, | |||
4556 | 5011 | ||
4557 | if ((int)attr.sched_policy < 0) | 5012 | if ((int)attr.sched_policy < 0) |
4558 | return -EINVAL; | 5013 | return -EINVAL; |
5014 | if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) | ||
5015 | attr.sched_policy = SETPARAM_POLICY; | ||
4559 | 5016 | ||
4560 | rcu_read_lock(); | 5017 | rcu_read_lock(); |
4561 | retval = -ESRCH; | 5018 | retval = -ESRCH; |
4562 | p = find_process_by_pid(pid); | 5019 | p = find_process_by_pid(pid); |
4563 | if (p != NULL) | 5020 | if (likely(p)) |
4564 | retval = sched_setattr(p, &attr); | 5021 | get_task_struct(p); |
4565 | rcu_read_unlock(); | 5022 | rcu_read_unlock(); |
4566 | 5023 | ||
5024 | if (likely(p)) { | ||
5025 | retval = sched_setattr(p, &attr); | ||
5026 | put_task_struct(p); | ||
5027 | } | ||
5028 | |||
4567 | return retval; | 5029 | return retval; |
4568 | } | 5030 | } |
4569 | 5031 | ||
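For completeness, here is how user space would exercise the new clamp fields through sched_setattr(). glibc does not wrap this syscall, so the struct and the flag values below are spelled out by hand; they follow the UAPI added by this series as far as it can be reconstructed here (SCHED_FLAG_KEEP_POLICY/KEEP_PARAMS, SCHED_FLAG_UTIL_CLAMP_MIN/MAX and the VER1 attr size), so double-check them against include/uapi/linux/sched.h before relying on this sketch. It also needs a kernel built with the uclamp support, otherwise the call returns EOPNOTSUPP.

/* Hedged example: request a ~20% utilization boost for the current task.
 * Struct layout and flag values are assumptions reconstructed from the
 * uclamp series; verify against include/uapi/linux/sched.h. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr {
    uint32_t size;
    uint32_t sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;
    uint32_t sched_priority;
    uint64_t sched_runtime;
    uint64_t sched_deadline;
    uint64_t sched_period;
    uint32_t sched_util_min;    /* new with the VER1 attr size */
    uint32_t sched_util_max;    /* new with the VER1 attr size */
};

#define SCHED_FLAG_KEEP_POLICY      0x08    /* assumed values */
#define SCHED_FLAG_KEEP_PARAMS      0x10
#define SCHED_FLAG_UTIL_CLAMP_MIN   0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX   0x40

int main(void)
{
    struct sched_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.size = sizeof(attr);
    /* Keep the current policy and params, only change the clamps. */
    attr.sched_flags = SCHED_FLAG_KEEP_POLICY | SCHED_FLAG_KEEP_PARAMS |
                       SCHED_FLAG_UTIL_CLAMP_MIN | SCHED_FLAG_UTIL_CLAMP_MAX;
    attr.sched_util_min = 205;      /* ~20% of SCHED_CAPACITY_SCALE */
    attr.sched_util_max = 1024;

    if (syscall(SYS_sched_setattr, 0, &attr, 0))    /* pid 0 == current */
        perror("sched_setattr");
    return 0;
}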
@@ -4714,6 +5176,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, | |||
4714 | else | 5176 | else |
4715 | attr.sched_nice = task_nice(p); | 5177 | attr.sched_nice = task_nice(p); |
4716 | 5178 | ||
5179 | #ifdef CONFIG_UCLAMP_TASK | ||
5180 | attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; | ||
5181 | attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; | ||
5182 | #endif | ||
5183 | |||
4717 | rcu_read_unlock(); | 5184 | rcu_read_unlock(); |
4718 | 5185 | ||
4719 | retval = sched_read_attr(uattr, &attr, size); | 5186 | retval = sched_read_attr(uattr, &attr, size); |
@@ -4866,7 +5333,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask) | |||
4866 | goto out_unlock; | 5333 | goto out_unlock; |
4867 | 5334 | ||
4868 | raw_spin_lock_irqsave(&p->pi_lock, flags); | 5335 | raw_spin_lock_irqsave(&p->pi_lock, flags); |
4869 | cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); | 5336 | cpumask_and(mask, &p->cpus_mask, cpu_active_mask); |
4870 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 5337 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
4871 | 5338 | ||
4872 | out_unlock: | 5339 | out_unlock: |
@@ -5123,7 +5590,7 @@ long __sched io_schedule_timeout(long timeout) | |||
5123 | } | 5590 | } |
5124 | EXPORT_SYMBOL(io_schedule_timeout); | 5591 | EXPORT_SYMBOL(io_schedule_timeout); |
5125 | 5592 | ||
5126 | void io_schedule(void) | 5593 | void __sched io_schedule(void) |
5127 | { | 5594 | { |
5128 | int token; | 5595 | int token; |
5129 | 5596 | ||
@@ -5443,7 +5910,7 @@ int task_can_attach(struct task_struct *p, | |||
5443 | * allowed nodes is unnecessary. Thus, cpusets are not | 5910 | * allowed nodes is unnecessary. Thus, cpusets are not |
5444 | * applicable for such threads. This prevents checking for | 5911 | * applicable for such threads. This prevents checking for |
5445 | * success of set_cpus_allowed_ptr() on all attached tasks | 5912 | * success of set_cpus_allowed_ptr() on all attached tasks |
5446 | * before cpus_allowed may be changed. | 5913 | * before cpus_mask may be changed. |
5447 | */ | 5914 | */ |
5448 | if (p->flags & PF_NO_SETAFFINITY) { | 5915 | if (p->flags & PF_NO_SETAFFINITY) { |
5449 | ret = -EINVAL; | 5916 | ret = -EINVAL; |
@@ -5470,7 +5937,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu) | |||
5470 | if (curr_cpu == target_cpu) | 5937 | if (curr_cpu == target_cpu) |
5471 | return 0; | 5938 | return 0; |
5472 | 5939 | ||
5473 | if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) | 5940 | if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) |
5474 | return -EINVAL; | 5941 | return -EINVAL; |
5475 | 5942 | ||
5476 | /* TODO: This is not properly updating schedstats */ | 5943 | /* TODO: This is not properly updating schedstats */ |
@@ -5608,7 +6075,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) | |||
5608 | put_prev_task(rq, next); | 6075 | put_prev_task(rq, next); |
5609 | 6076 | ||
5610 | /* | 6077 | /* |
5611 | * Rules for changing task_struct::cpus_allowed are holding | 6078 | * Rules for changing task_struct::cpus_mask are holding |
5612 | * both pi_lock and rq->lock, such that holding either | 6079 | * both pi_lock and rq->lock, such that holding either |
5613 | * stabilizes the mask. | 6080 | * stabilizes the mask. |
5614 | * | 6081 | * |
@@ -5902,8 +6369,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask); | |||
5902 | 6369 | ||
5903 | void __init sched_init(void) | 6370 | void __init sched_init(void) |
5904 | { | 6371 | { |
5905 | int i, j; | ||
5906 | unsigned long alloc_size = 0, ptr; | 6372 | unsigned long alloc_size = 0, ptr; |
6373 | int i; | ||
5907 | 6374 | ||
5908 | wait_bit_init(); | 6375 | wait_bit_init(); |
5909 | 6376 | ||
@@ -6005,10 +6472,6 @@ void __init sched_init(void) | |||
6005 | #ifdef CONFIG_RT_GROUP_SCHED | 6472 | #ifdef CONFIG_RT_GROUP_SCHED |
6006 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); | 6473 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
6007 | #endif | 6474 | #endif |
6008 | |||
6009 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | ||
6010 | rq->cpu_load[j] = 0; | ||
6011 | |||
6012 | #ifdef CONFIG_SMP | 6475 | #ifdef CONFIG_SMP |
6013 | rq->sd = NULL; | 6476 | rq->sd = NULL; |
6014 | rq->rd = NULL; | 6477 | rq->rd = NULL; |
@@ -6063,6 +6526,8 @@ void __init sched_init(void) | |||
6063 | 6526 | ||
6064 | psi_init(); | 6527 | psi_init(); |
6065 | 6528 | ||
6529 | init_uclamp(); | ||
6530 | |||
6066 | scheduler_running = 1; | 6531 | scheduler_running = 1; |
6067 | } | 6532 | } |
6068 | 6533 | ||
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index ec4e4a9aab5f..5cc4012572ec 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
@@ -120,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
120 | const struct sched_dl_entity *dl_se = &p->dl; | 120 | const struct sched_dl_entity *dl_se = &p->dl; |
121 | 121 | ||
122 | if (later_mask && | 122 | if (later_mask && |
123 | cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { | 123 | cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { |
124 | return 1; | 124 | return 1; |
125 | } else { | 125 | } else { |
126 | int best_cpu = cpudl_maximum(cp); | 126 | int best_cpu = cpudl_maximum(cp); |
127 | 127 | ||
128 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | 128 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); |
129 | 129 | ||
130 | if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && | 130 | if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && |
131 | dl_time_before(dl_se->deadline, cp->elements[0].dl)) { | 131 | dl_time_before(dl_se->deadline, cp->elements[0].dl)) { |
132 | if (later_mask) | 132 | if (later_mask) |
133 | cpumask_set_cpu(best_cpu, later_mask); | 133 | cpumask_set_cpu(best_cpu, later_mask); |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 962cf343f798..636ca6f88c8e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
@@ -196,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, | |||
196 | * based on the task model parameters and gives the minimal utilization | 196 | * based on the task model parameters and gives the minimal utilization |
197 | * required to meet deadlines. | 197 | * required to meet deadlines. |
198 | */ | 198 | */ |
199 | unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, | 199 | unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, |
200 | unsigned long max, enum schedutil_type type) | 200 | unsigned long max, enum schedutil_type type, |
201 | struct task_struct *p) | ||
201 | { | 202 | { |
202 | unsigned long dl_util, util, irq; | 203 | unsigned long dl_util, util, irq; |
203 | struct rq *rq = cpu_rq(cpu); | 204 | struct rq *rq = cpu_rq(cpu); |
204 | 205 | ||
205 | if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) | 206 | if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) && |
207 | type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { | ||
206 | return max; | 208 | return max; |
209 | } | ||
207 | 210 | ||
208 | /* | 211 | /* |
209 | * Early check to see if IRQ/steal time saturates the CPU, can be | 212 | * Early check to see if IRQ/steal time saturates the CPU, can be |
@@ -219,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, | |||
219 | * CFS tasks and we use the same metric to track the effective | 222 | * CFS tasks and we use the same metric to track the effective |
220 | * utilization (PELT windows are synchronized) we can directly add them | 223 | * utilization (PELT windows are synchronized) we can directly add them |
221 | * to obtain the CPU's actual utilization. | 224 | * to obtain the CPU's actual utilization. |
225 | * | ||
226 | * CFS and RT utilization can be boosted or capped, depending on | ||
227 | * utilization clamp constraints requested by currently RUNNABLE | ||
228 | * tasks. | ||
229 | * When there are no CFS RUNNABLE tasks, clamps are released and | ||
230 | * frequency will be gracefully reduced with the utilization decay. | ||
222 | */ | 231 | */ |
223 | util = util_cfs; | 232 | util = util_cfs + cpu_util_rt(rq); |
224 | util += cpu_util_rt(rq); | 233 | if (type == FREQUENCY_UTIL) |
234 | util = uclamp_util_with(rq, util, p); | ||
225 | 235 | ||
226 | dl_util = cpu_util_dl(rq); | 236 | dl_util = cpu_util_dl(rq); |
227 | 237 | ||
@@ -276,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) | |||
276 | { | 286 | { |
277 | struct rq *rq = cpu_rq(sg_cpu->cpu); | 287 | struct rq *rq = cpu_rq(sg_cpu->cpu); |
278 | unsigned long util = cpu_util_cfs(rq); | 288 | unsigned long util = cpu_util_cfs(rq); |
279 | unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); | 289 | unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); |
280 | 290 | ||
281 | sg_cpu->max = max; | 291 | sg_cpu->max = max; |
282 | sg_cpu->bw_dl = cpu_bw_dl(rq); | 292 | sg_cpu->bw_dl = cpu_bw_dl(rq); |
283 | 293 | ||
284 | return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); | 294 | return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); |
285 | } | 295 | } |
286 | 296 | ||
287 | /** | 297 | /** |
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 9c6480e6d62d..b7abca987d94 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
@@ -94,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
94 | if (skip) | 94 | if (skip) |
95 | continue; | 95 | continue; |
96 | 96 | ||
97 | if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) | 97 | if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) |
98 | continue; | 98 | continue; |
99 | 99 | ||
100 | if (lowest_mask) { | 100 | if (lowest_mask) { |
101 | cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); | 101 | cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); |
102 | 102 | ||
103 | /* | 103 | /* |
104 | * We have to ensure that we have at least one bit | 104 | * We have to ensure that we have at least one bit |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 43901fa3f269..8b5bb2ac16e2 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
538 | * If we cannot preempt any rq, fall back to pick any | 538 | * If we cannot preempt any rq, fall back to pick any |
539 | * online CPU: | 539 | * online CPU: |
540 | */ | 540 | */ |
541 | cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); | 541 | cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr); |
542 | if (cpu >= nr_cpu_ids) { | 542 | if (cpu >= nr_cpu_ids) { |
543 | /* | 543 | /* |
544 | * Failed to find any suitable CPU. | 544 | * Failed to find any suitable CPU. |
@@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq) | |||
1195 | &curr->dl); | 1195 | &curr->dl); |
1196 | } else { | 1196 | } else { |
1197 | unsigned long scale_freq = arch_scale_freq_capacity(cpu); | 1197 | unsigned long scale_freq = arch_scale_freq_capacity(cpu); |
1198 | unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); | 1198 | unsigned long scale_cpu = arch_scale_cpu_capacity(cpu); |
1199 | 1199 | ||
1200 | scaled_delta_exec = cap_scale(delta_exec, scale_freq); | 1200 | scaled_delta_exec = cap_scale(delta_exec, scale_freq); |
1201 | scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); | 1201 | scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); |
@@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq) | |||
1824 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) | 1824 | static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) |
1825 | { | 1825 | { |
1826 | if (!task_running(rq, p) && | 1826 | if (!task_running(rq, p) && |
1827 | cpumask_test_cpu(cpu, &p->cpus_allowed)) | 1827 | cpumask_test_cpu(cpu, p->cpus_ptr)) |
1828 | return 1; | 1828 | return 1; |
1829 | return 0; | 1829 | return 0; |
1830 | } | 1830 | } |
@@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) | |||
1974 | /* Retry if something changed. */ | 1974 | /* Retry if something changed. */ |
1975 | if (double_lock_balance(rq, later_rq)) { | 1975 | if (double_lock_balance(rq, later_rq)) { |
1976 | if (unlikely(task_rq(task) != rq || | 1976 | if (unlikely(task_rq(task) != rq || |
1977 | !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || | 1977 | !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || |
1978 | task_running(rq, task) || | 1978 | task_running(rq, task) || |
1979 | !dl_task(task) || | 1979 | !dl_task(task) || |
1980 | !task_on_rq_queued(task))) { | 1980 | !task_on_rq_queued(task))) { |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 14c6a8716ba1..f7e4579e746c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -233,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
233 | *tablep = NULL; | 233 | *tablep = NULL; |
234 | } | 234 | } |
235 | 235 | ||
236 | static int min_load_idx = 0; | ||
237 | static int max_load_idx = CPU_LOAD_IDX_MAX-1; | ||
238 | |||
239 | static void | 236 | static void |
240 | set_table_entry(struct ctl_table *entry, | 237 | set_table_entry(struct ctl_table *entry, |
241 | const char *procname, void *data, int maxlen, | 238 | const char *procname, void *data, int maxlen, |
242 | umode_t mode, proc_handler *proc_handler, | 239 | umode_t mode, proc_handler *proc_handler) |
243 | bool load_idx) | ||
244 | { | 240 | { |
245 | entry->procname = procname; | 241 | entry->procname = procname; |
246 | entry->data = data; | 242 | entry->data = data; |
247 | entry->maxlen = maxlen; | 243 | entry->maxlen = maxlen; |
248 | entry->mode = mode; | 244 | entry->mode = mode; |
249 | entry->proc_handler = proc_handler; | 245 | entry->proc_handler = proc_handler; |
250 | |||
251 | if (load_idx) { | ||
252 | entry->extra1 = &min_load_idx; | ||
253 | entry->extra2 = &max_load_idx; | ||
254 | } | ||
255 | } | 246 | } |
256 | 247 | ||
257 | static struct ctl_table * | 248 | static struct ctl_table * |
258 | sd_alloc_ctl_domain_table(struct sched_domain *sd) | 249 | sd_alloc_ctl_domain_table(struct sched_domain *sd) |
259 | { | 250 | { |
260 | struct ctl_table *table = sd_alloc_ctl_entry(14); | 251 | struct ctl_table *table = sd_alloc_ctl_entry(9); |
261 | 252 | ||
262 | if (table == NULL) | 253 | if (table == NULL) |
263 | return NULL; | 254 | return NULL; |
264 | 255 | ||
265 | set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); | 256 | set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); |
266 | set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); | 257 | set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); |
267 | set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); | 258 | set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); |
268 | set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); | 259 | set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); |
269 | set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); | 260 | set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); |
270 | set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); | 261 | set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); |
271 | set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); | 262 | set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); |
272 | set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); | 263 | set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); |
273 | set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); | 264 | /* &table[8] is terminator */ |
274 | set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); | ||
275 | set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); | ||
276 | set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
277 | set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); | ||
278 | /* &table[13] is terminator */ | ||
279 | 265 | ||
280 | return table; | 266 | return table; |
281 | } | 267 | } |
@@ -653,8 +639,6 @@ do { \ | |||
653 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) | 639 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) |
654 | 640 | ||
655 | P(nr_running); | 641 | P(nr_running); |
656 | SEQ_printf(m, " .%-30s: %lu\n", "load", | ||
657 | rq->load.weight); | ||
658 | P(nr_switches); | 642 | P(nr_switches); |
659 | P(nr_load_updates); | 643 | P(nr_load_updates); |
660 | P(nr_uninterruptible); | 644 | P(nr_uninterruptible); |
@@ -662,11 +646,6 @@ do { \ | |||
662 | SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); | 646 | SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); |
663 | PN(clock); | 647 | PN(clock); |
664 | PN(clock_task); | 648 | PN(clock_task); |
665 | P(cpu_load[0]); | ||
666 | P(cpu_load[1]); | ||
667 | P(cpu_load[2]); | ||
668 | P(cpu_load[3]); | ||
669 | P(cpu_load[4]); | ||
670 | #undef P | 649 | #undef P |
671 | #undef PN | 650 | #undef PN |
672 | 651 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8591529e1753..036be95a87e9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
275 | return grp->my_q; | 275 | return grp->my_q; |
276 | } | 276 | } |
277 | 277 | ||
278 | static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) | ||
279 | { | ||
280 | if (!path) | ||
281 | return; | ||
282 | |||
283 | if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) | ||
284 | autogroup_path(cfs_rq->tg, path, len); | ||
285 | else if (cfs_rq && cfs_rq->tg->css.cgroup) | ||
286 | cgroup_path(cfs_rq->tg->css.cgroup, path, len); | ||
287 | else | ||
288 | strlcpy(path, "(null)", len); | ||
289 | } | ||
290 | |||
278 | static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 291 | static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
279 | { | 292 | { |
280 | struct rq *rq = rq_of(cfs_rq); | 293 | struct rq *rq = rq_of(cfs_rq); |
@@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
449 | return NULL; | 462 | return NULL; |
450 | } | 463 | } |
451 | 464 | ||
465 | static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) | ||
466 | { | ||
467 | if (path) | ||
468 | strlcpy(path, "(null)", len); | ||
469 | } | ||
470 | |||
452 | static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 471 | static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
453 | { | 472 | { |
454 | return true; | 473 | return true; |
@@ -764,7 +783,7 @@ void post_init_entity_util_avg(struct task_struct *p) | |||
764 | struct sched_entity *se = &p->se; | 783 | struct sched_entity *se = &p->se; |
765 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 784 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
766 | struct sched_avg *sa = &se->avg; | 785 | struct sched_avg *sa = &se->avg; |
767 | long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); | 786 | long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq))); |
768 | long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; | 787 | long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; |
769 | 788 | ||
770 | if (cap > 0) { | 789 | if (cap > 0) { |
@@ -1466,9 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, | |||
1466 | group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; | 1485 | group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; |
1467 | } | 1486 | } |
1468 | 1487 | ||
1469 | static unsigned long weighted_cpuload(struct rq *rq); | 1488 | static unsigned long cpu_runnable_load(struct rq *rq); |
1470 | static unsigned long source_load(int cpu, int type); | ||
1471 | static unsigned long target_load(int cpu, int type); | ||
1472 | 1489 | ||
1473 | /* Cached statistics for all CPUs within a node */ | 1490 | /* Cached statistics for all CPUs within a node */ |
1474 | struct numa_stats { | 1491 | struct numa_stats { |
@@ -1489,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid) | |||
1489 | for_each_cpu(cpu, cpumask_of_node(nid)) { | 1506 | for_each_cpu(cpu, cpumask_of_node(nid)) { |
1490 | struct rq *rq = cpu_rq(cpu); | 1507 | struct rq *rq = cpu_rq(cpu); |
1491 | 1508 | ||
1492 | ns->load += weighted_cpuload(rq); | 1509 | ns->load += cpu_runnable_load(rq); |
1493 | ns->compute_capacity += capacity_of(cpu); | 1510 | ns->compute_capacity += capacity_of(cpu); |
1494 | } | 1511 | } |
1495 | 1512 | ||
@@ -1621,7 +1638,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1621 | * be incurred if the tasks were swapped. | 1638 | * be incurred if the tasks were swapped. |
1622 | */ | 1639 | */ |
1623 | /* Skip this swap candidate if cannot move to the source cpu */ | 1640 | /* Skip this swap candidate if cannot move to the source cpu */ |
1624 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) | 1641 | if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr)) |
1625 | goto unlock; | 1642 | goto unlock; |
1626 | 1643 | ||
1627 | /* | 1644 | /* |
@@ -1718,7 +1735,7 @@ static void task_numa_find_cpu(struct task_numa_env *env, | |||
1718 | 1735 | ||
1719 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { | 1736 | for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { |
1720 | /* Skip this CPU if the source task cannot migrate */ | 1737 | /* Skip this CPU if the source task cannot migrate */ |
1721 | if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) | 1738 | if (!cpumask_test_cpu(cpu, env->p->cpus_ptr)) |
1722 | continue; | 1739 | continue; |
1723 | 1740 | ||
1724 | env->dst_cpu = cpu; | 1741 | env->dst_cpu = cpu; |
@@ -2686,8 +2703,6 @@ static void | |||
2686 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 2703 | account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2687 | { | 2704 | { |
2688 | update_load_add(&cfs_rq->load, se->load.weight); | 2705 | update_load_add(&cfs_rq->load, se->load.weight); |
2689 | if (!parent_entity(se)) | ||
2690 | update_load_add(&rq_of(cfs_rq)->load, se->load.weight); | ||
2691 | #ifdef CONFIG_SMP | 2706 | #ifdef CONFIG_SMP |
2692 | if (entity_is_task(se)) { | 2707 | if (entity_is_task(se)) { |
2693 | struct rq *rq = rq_of(cfs_rq); | 2708 | struct rq *rq = rq_of(cfs_rq); |
@@ -2703,8 +2718,6 @@ static void | |||
2703 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | 2718 | account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) |
2704 | { | 2719 | { |
2705 | update_load_sub(&cfs_rq->load, se->load.weight); | 2720 | update_load_sub(&cfs_rq->load, se->load.weight); |
2706 | if (!parent_entity(se)) | ||
2707 | update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); | ||
2708 | #ifdef CONFIG_SMP | 2721 | #ifdef CONFIG_SMP |
2709 | if (entity_is_task(se)) { | 2722 | if (entity_is_task(se)) { |
2710 | account_numa_dequeue(rq_of(cfs_rq), task_of(se)); | 2723 | account_numa_dequeue(rq_of(cfs_rq), task_of(se)); |
@@ -3334,6 +3347,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) | |||
3334 | update_tg_cfs_util(cfs_rq, se, gcfs_rq); | 3347 | update_tg_cfs_util(cfs_rq, se, gcfs_rq); |
3335 | update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); | 3348 | update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); |
3336 | 3349 | ||
3350 | trace_pelt_cfs_tp(cfs_rq); | ||
3351 | trace_pelt_se_tp(se); | ||
3352 | |||
3337 | return 1; | 3353 | return 1; |
3338 | } | 3354 | } |
3339 | 3355 | ||
@@ -3486,6 +3502,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
3486 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); | 3502 | add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); |
3487 | 3503 | ||
3488 | cfs_rq_util_change(cfs_rq, flags); | 3504 | cfs_rq_util_change(cfs_rq, flags); |
3505 | |||
3506 | trace_pelt_cfs_tp(cfs_rq); | ||
3489 | } | 3507 | } |
3490 | 3508 | ||
3491 | /** | 3509 | /** |
@@ -3505,6 +3523,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s | |||
3505 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); | 3523 | add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); |
3506 | 3524 | ||
3507 | cfs_rq_util_change(cfs_rq, 0); | 3525 | cfs_rq_util_change(cfs_rq, 0); |
3526 | |||
3527 | trace_pelt_cfs_tp(cfs_rq); | ||
3508 | } | 3528 | } |
3509 | 3529 | ||
3510 | /* | 3530 | /* |
@@ -4100,7 +4120,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
4100 | * least twice that of our own weight (i.e. dont track it | 4120 | * least twice that of our own weight (i.e. dont track it |
4101 | * when there are only lesser-weight tasks around): | 4121 | * when there are only lesser-weight tasks around): |
4102 | */ | 4122 | */ |
4103 | if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { | 4123 | if (schedstat_enabled() && |
4124 | rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { | ||
4104 | schedstat_set(se->statistics.slice_max, | 4125 | schedstat_set(se->statistics.slice_max, |
4105 | max((u64)schedstat_val(se->statistics.slice_max), | 4126 | max((u64)schedstat_val(se->statistics.slice_max), |
4106 | se->sum_exec_runtime - se->prev_sum_exec_runtime)); | 4127 | se->sum_exec_runtime - se->prev_sum_exec_runtime)); |
@@ -4734,6 +4755,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) | |||
4734 | if (runtime_refresh_within(cfs_b, min_left)) | 4755 | if (runtime_refresh_within(cfs_b, min_left)) |
4735 | return; | 4756 | return; |
4736 | 4757 | ||
4758 | /* don't push forwards an existing deferred unthrottle */ | ||
4759 | if (cfs_b->slack_started) | ||
4760 | return; | ||
4761 | cfs_b->slack_started = true; | ||
4762 | |||
4737 | hrtimer_start(&cfs_b->slack_timer, | 4763 | hrtimer_start(&cfs_b->slack_timer, |
4738 | ns_to_ktime(cfs_bandwidth_slack_period), | 4764 | ns_to_ktime(cfs_bandwidth_slack_period), |
4739 | HRTIMER_MODE_REL); | 4765 | HRTIMER_MODE_REL); |
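The new slack_started flag above is the classic "don't re-arm a pending timer" guard: once a deferred unthrottle has been scheduled, further requests must not push its expiry further out, and the flag is cleared when the handler finally runs. A tiny model of that guard (no real timers, just an expiry timestamp):

#include <stdio.h>

struct slack {
    int started;            /* a deferred unthrottle is already pending */
    unsigned long expires;  /* stand-in for the hrtimer expiry          */
};

/* Arm the slack work 'delay' ticks from 'now', unless already pending. */
static void start_slack(struct slack *s, unsigned long now, unsigned long delay)
{
    if (s->started)
        return;             /* don't push forward an existing request */
    s->started = 1;
    s->expires = now + delay;
}

/* The handler clears the flag so the next request can arm again. */
static void slack_fired(struct slack *s)
{
    s->started = 0;
}

int main(void)
{
    struct slack s = { 0, 0 };

    start_slack(&s, 100, 5);
    start_slack(&s, 103, 5);    /* ignored: would have pushed expiry to 108 */
    printf("expires at %lu\n", s.expires);  /* 105 */
    slack_fired(&s);
    return 0;
}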
@@ -4787,6 +4813,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) | |||
4787 | 4813 | ||
4788 | /* confirm we're still not at a refresh boundary */ | 4814 | /* confirm we're still not at a refresh boundary */ |
4789 | raw_spin_lock_irqsave(&cfs_b->lock, flags); | 4815 | raw_spin_lock_irqsave(&cfs_b->lock, flags); |
4816 | cfs_b->slack_started = false; | ||
4790 | if (cfs_b->distribute_running) { | 4817 | if (cfs_b->distribute_running) { |
4791 | raw_spin_unlock_irqrestore(&cfs_b->lock, flags); | 4818 | raw_spin_unlock_irqrestore(&cfs_b->lock, flags); |
4792 | return; | 4819 | return; |
@@ -4950,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
4950 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | 4977 | hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); |
4951 | cfs_b->slack_timer.function = sched_cfs_slack_timer; | 4978 | cfs_b->slack_timer.function = sched_cfs_slack_timer; |
4952 | cfs_b->distribute_running = 0; | 4979 | cfs_b->distribute_running = 0; |
4980 | cfs_b->slack_started = false; | ||
4953 | } | 4981 | } |
4954 | 4982 | ||
4955 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 4983 | static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
@@ -5153,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu) | |||
5153 | 5181 | ||
5154 | static inline void update_overutilized_status(struct rq *rq) | 5182 | static inline void update_overutilized_status(struct rq *rq) |
5155 | { | 5183 | { |
5156 | if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) | 5184 | if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) { |
5157 | WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); | 5185 | WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); |
5186 | trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED); | ||
5187 | } | ||
5158 | } | 5188 | } |
5159 | #else | 5189 | #else |
5160 | static inline void update_overutilized_status(struct rq *rq) { } | 5190 | static inline void update_overutilized_status(struct rq *rq) { } |
@@ -5325,71 +5355,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); | |||
5325 | DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | 5355 | DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); |
5326 | 5356 | ||
5327 | #ifdef CONFIG_NO_HZ_COMMON | 5357 | #ifdef CONFIG_NO_HZ_COMMON |
5328 | /* | ||
5329 | * per rq 'load' arrray crap; XXX kill this. | ||
5330 | */ | ||
5331 | |||
5332 | /* | ||
5333 | * The exact cpuload calculated at every tick would be: | ||
5334 | * | ||
5335 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load | ||
5336 | * | ||
5337 | * If a CPU misses updates for n ticks (as it was idle) and update gets | ||
5338 | * called on the n+1-th tick when CPU may be busy, then we have: | ||
5339 | * | ||
5340 | * load_n = (1 - 1/2^i)^n * load_0 | ||
5341 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load | ||
5342 | * | ||
5343 | * decay_load_missed() below does efficient calculation of | ||
5344 | * | ||
5345 | * load' = (1 - 1/2^i)^n * load | ||
5346 | * | ||
5347 | * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. | ||
5348 | * This allows us to precompute the above in said factors, thereby allowing the | ||
5349 | * reduction of an arbitrary n in O(log_2 n) steps. (See also | ||
5350 | * fixed_power_int()) | ||
5351 | * | ||
5352 | * The calculation is approximated on a 128 point scale. | ||
5353 | */ | ||
5354 | #define DEGRADE_SHIFT 7 | ||
5355 | |||
5356 | static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
5357 | static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
5358 | { 0, 0, 0, 0, 0, 0, 0, 0 }, | ||
5359 | { 64, 32, 8, 0, 0, 0, 0, 0 }, | ||
5360 | { 96, 72, 40, 12, 1, 0, 0, 0 }, | ||
5361 | { 112, 98, 75, 43, 15, 1, 0, 0 }, | ||
5362 | { 120, 112, 98, 76, 45, 16, 2, 0 } | ||
5363 | }; | ||
5364 | |||
5365 | /* | ||
5366 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
5367 | * would be when CPU is idle and so we just decay the old load without | ||
5368 | * adding any new load. | ||
5369 | */ | ||
5370 | static unsigned long | ||
5371 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
5372 | { | ||
5373 | int j = 0; | ||
5374 | |||
5375 | if (!missed_updates) | ||
5376 | return load; | ||
5377 | |||
5378 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
5379 | return 0; | ||
5380 | |||
5381 | if (idx == 1) | ||
5382 | return load >> missed_updates; | ||
5383 | |||
5384 | while (missed_updates) { | ||
5385 | if (missed_updates % 2) | ||
5386 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
5387 | |||
5388 | missed_updates >>= 1; | ||
5389 | j++; | ||
5390 | } | ||
5391 | return load; | ||
5392 | } | ||
5393 | 5358 | ||
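The comment removed above spells out the decayed-load formula, load' = (1 - 1/2^i)^n * load, and the O(log2 n) power-of-2 decomposition that degrade_factor[] precomputed on a 128-point scale. As an illustration only — a standalone userspace sketch, not kernel code; decay_missed() and the sample inputs are invented for this example:

#include <stdio.h>
#include <math.h>

#define DEGRADE_SHIFT   7       /* 128-point scale, as in the removed table */

/* load' = (1 - 1/2^idx)^n * load, via power-of-2 factors like degrade_factor[] */
static unsigned long decay_missed(unsigned long load, unsigned long n, int idx)
{
        double y = 1.0 - 1.0 / (1 << idx);
        int j;

        for (j = 0; n; n >>= 1, j++) {
                if (n & 1) {
                        /* ~ degrade_factor[idx][j] = 128 * y^(2^j) */
                        unsigned long factor = (unsigned long)(pow(y, 1 << j) * (1 << DEGRADE_SHIFT));

                        load = (load * factor) >> DEGRADE_SHIFT;
                }
        }
        return load;
}

int main(void)
{
        /* a cpu_load[2] entry that missed 5 ticks: exact 1024 * 0.75^5 ~= 243, quantized 240 */
        printf("%lu\n", decay_missed(1024, 5, 2));
        return 0;
}

Walking the set bits of n is what keeps the cost at O(log2 n) multiplies instead of n of them, which is exactly the property the precomputed table exploited.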
5394 | static struct { | 5359 | static struct { |
5395 | cpumask_var_t idle_cpus_mask; | 5360 | cpumask_var_t idle_cpus_mask; |
@@ -5401,234 +5366,11 @@ static struct { | |||
5401 | 5366 | ||
5402 | #endif /* CONFIG_NO_HZ_COMMON */ | 5367 | #endif /* CONFIG_NO_HZ_COMMON */ |
5403 | 5368 | ||
5404 | /** | 5369 | static unsigned long cpu_runnable_load(struct rq *rq) |
5405 | * __cpu_load_update - update the rq->cpu_load[] statistics | ||
5406 | * @this_rq: The rq to update statistics for | ||
5407 | * @this_load: The current load | ||
5408 | * @pending_updates: The number of missed updates | ||
5409 | * | ||
5410 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
5411 | * scheduler tick (TICK_NSEC). | ||
5412 | * | ||
5413 | * This function computes a decaying average: | ||
5414 | * | ||
5415 | * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load | ||
5416 | * | ||
5417 | * Because of NOHZ it might not get called on every tick which gives need for | ||
5418 | * the @pending_updates argument. | ||
5419 | * | ||
5420 | * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 | ||
5421 | * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load | ||
5422 | * = A * (A * load[i]_n-2 + B) + B | ||
5423 | * = A * (A * (A * load[i]_n-3 + B) + B) + B | ||
5424 | * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B | ||
5425 | * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B | ||
5426 | * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B | ||
5427 | * = (1 - 1/2^i)^n * (load[i]_0 - load) + load | ||
5428 | * | ||
5429 | * In the above we've assumed load_n := load, which is true for NOHZ_FULL as | ||
5430 | * any change in load would have resulted in the tick being turned back on. | ||
5431 | * | ||
5432 | * For regular NOHZ, this reduces to: | ||
5433 | * | ||
5434 | * load[i]_n = (1 - 1/2^i)^n * load[i]_0 | ||
5435 | * | ||
5436 | * see decay_load_missed(). For NOHZ_FULL we get to subtract and add the extra | ||
5437 | * term. | ||
5438 | */ | ||
5439 | static void cpu_load_update(struct rq *this_rq, unsigned long this_load, | ||
5440 | unsigned long pending_updates) | ||
5441 | { | ||
5442 | unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0]; | ||
5443 | int i, scale; | ||
5444 | |||
5445 | this_rq->nr_load_updates++; | ||
5446 | |||
5447 | /* Update our load: */ | ||
5448 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
5449 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
5450 | unsigned long old_load, new_load; | ||
5451 | |||
5452 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
5453 | |||
5454 | old_load = this_rq->cpu_load[i]; | ||
5455 | #ifdef CONFIG_NO_HZ_COMMON | ||
5456 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
5457 | if (tickless_load) { | ||
5458 | old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); | ||
5459 | /* | ||
5460 | * old_load can never be a negative value because a | ||
5461 | * decayed tickless_load cannot be greater than the | ||
5462 | * original tickless_load. | ||
5463 | */ | ||
5464 | old_load += tickless_load; | ||
5465 | } | ||
5466 | #endif | ||
5467 | new_load = this_load; | ||
5468 | /* | ||
5469 | * Round up the averaging division if load is increasing. This | ||
5470 | * prevents us from getting stuck on 9 if the load is 10, for | ||
5471 | * example. | ||
5472 | */ | ||
5473 | if (new_load > old_load) | ||
5474 | new_load += scale - 1; | ||
5475 | |||
5476 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
5477 | } | ||
5478 | } | ||
5479 | |||
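The documentation block being removed above derives the closed form load[i]_n = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B with A = 1 - 1/2^i and B = (1/2^i) * load. The geometric-series step is easy to check numerically; the following is a userspace illustration under the stated assumption of a constant load term, not kernel code:

#include <stdio.h>
#include <math.h>

int main(void)
{
        double A = 1.0 - 1.0 / (1 << 2);        /* i = 2 */
        double load = 1000.0, cur = 200.0;      /* load[i]_0 and the constant tick load */
        int n;

        for (n = 0; n < 8; n++)
                load = A * load + (1.0 - A) * cur;

        /* closed form from the comment: (1 - 1/2^i)^n * (load_0 - load) + load */
        printf("iterated %.4f vs closed form %.4f\n",
               load, pow(A, 8) * (1000.0 - 200.0) + 200.0);
        return 0;
}

Both values come out identical (about 280.09 here), which is the identity the comment relies on to collapse n missed ticks into one update.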
5480 | /* Used instead of source_load when we know the type == 0 */ | ||
5481 | static unsigned long weighted_cpuload(struct rq *rq) | ||
5482 | { | 5370 | { |
5483 | return cfs_rq_runnable_load_avg(&rq->cfs); | 5371 | return cfs_rq_runnable_load_avg(&rq->cfs); |
5484 | } | 5372 | } |
5485 | 5373 | ||
5486 | #ifdef CONFIG_NO_HZ_COMMON | ||
5487 | /* | ||
5488 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
5489 | * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading | ||
5490 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
5491 | * | ||
5492 | * Therefore we need to avoid the delta approach from the regular tick when | ||
5493 | * possible since that would seriously skew the load calculation. This is why we | ||
5494 | * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on | ||
5495 | * jiffies deltas for updates happening while in nohz mode (idle ticks, idle | ||
5496 | * loop exit, nohz_idle_balance, nohz full exit...) | ||
5497 | * | ||
5498 | * This means we might still be one tick off for nohz periods. | ||
5499 | */ | ||
5500 | |||
5501 | static void cpu_load_update_nohz(struct rq *this_rq, | ||
5502 | unsigned long curr_jiffies, | ||
5503 | unsigned long load) | ||
5504 | { | ||
5505 | unsigned long pending_updates; | ||
5506 | |||
5507 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
5508 | if (pending_updates) { | ||
5509 | this_rq->last_load_update_tick = curr_jiffies; | ||
5510 | /* | ||
5511 | * In the regular NOHZ case, we were idle, this means load 0. | ||
5512 | * In the NOHZ_FULL case, we were non-idle, we should consider | ||
5513 | * its weighted load. | ||
5514 | */ | ||
5515 | cpu_load_update(this_rq, load, pending_updates); | ||
5516 | } | ||
5517 | } | ||
5518 | |||
5519 | /* | ||
5520 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
5521 | * idle balance. | ||
5522 | */ | ||
5523 | static void cpu_load_update_idle(struct rq *this_rq) | ||
5524 | { | ||
5525 | /* | ||
5526 | * bail if there's load or we're actually up-to-date. | ||
5527 | */ | ||
5528 | if (weighted_cpuload(this_rq)) | ||
5529 | return; | ||
5530 | |||
5531 | cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0); | ||
5532 | } | ||
5533 | |||
5534 | /* | ||
5535 | * Record CPU load on nohz entry so we know the tickless load to account | ||
5536 | * on nohz exit. cpu_load[0] happens then to be updated more frequently | ||
5537 | * than other cpu_load[idx] but it should be fine as cpu_load readers | ||
5538 | * shouldn't rely on synchronized cpu_load[*] updates. | ||
5539 | */ | ||
5540 | void cpu_load_update_nohz_start(void) | ||
5541 | { | ||
5542 | struct rq *this_rq = this_rq(); | ||
5543 | |||
5544 | /* | ||
5545 | * This is all lockless but should be fine. If weighted_cpuload changes | ||
5546 | * concurrently we'll exit nohz. And cpu_load write can race with | ||
5547 | * cpu_load_update_idle() but both updaters would be writing the same. | ||
5548 | */ | ||
5549 | this_rq->cpu_load[0] = weighted_cpuload(this_rq); | ||
5550 | } | ||
5551 | |||
5552 | /* | ||
5553 | * Account the tickless load in the end of a nohz frame. | ||
5554 | */ | ||
5555 | void cpu_load_update_nohz_stop(void) | ||
5556 | { | ||
5557 | unsigned long curr_jiffies = READ_ONCE(jiffies); | ||
5558 | struct rq *this_rq = this_rq(); | ||
5559 | unsigned long load; | ||
5560 | struct rq_flags rf; | ||
5561 | |||
5562 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
5563 | return; | ||
5564 | |||
5565 | load = weighted_cpuload(this_rq); | ||
5566 | rq_lock(this_rq, &rf); | ||
5567 | update_rq_clock(this_rq); | ||
5568 | cpu_load_update_nohz(this_rq, curr_jiffies, load); | ||
5569 | rq_unlock(this_rq, &rf); | ||
5570 | } | ||
5571 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
5572 | static inline void cpu_load_update_nohz(struct rq *this_rq, | ||
5573 | unsigned long curr_jiffies, | ||
5574 | unsigned long load) { } | ||
5575 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
5576 | |||
5577 | static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load) | ||
5578 | { | ||
5579 | #ifdef CONFIG_NO_HZ_COMMON | ||
5580 | /* See the mess around cpu_load_update_nohz(). */ | ||
5581 | this_rq->last_load_update_tick = READ_ONCE(jiffies); | ||
5582 | #endif | ||
5583 | cpu_load_update(this_rq, load, 1); | ||
5584 | } | ||
5585 | |||
5586 | /* | ||
5587 | * Called from scheduler_tick() | ||
5588 | */ | ||
5589 | void cpu_load_update_active(struct rq *this_rq) | ||
5590 | { | ||
5591 | unsigned long load = weighted_cpuload(this_rq); | ||
5592 | |||
5593 | if (tick_nohz_tick_stopped()) | ||
5594 | cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load); | ||
5595 | else | ||
5596 | cpu_load_update_periodic(this_rq, load); | ||
5597 | } | ||
5598 | |||
5599 | /* | ||
5600 | * Return a low guess at the load of a migration-source CPU weighted | ||
5601 | * according to the scheduling class and "nice" value. | ||
5602 | * | ||
5603 | * We want to under-estimate the load of migration sources, to | ||
5604 | * balance conservatively. | ||
5605 | */ | ||
5606 | static unsigned long source_load(int cpu, int type) | ||
5607 | { | ||
5608 | struct rq *rq = cpu_rq(cpu); | ||
5609 | unsigned long total = weighted_cpuload(rq); | ||
5610 | |||
5611 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
5612 | return total; | ||
5613 | |||
5614 | return min(rq->cpu_load[type-1], total); | ||
5615 | } | ||
5616 | |||
5617 | /* | ||
5618 | * Return a high guess at the load of a migration-target CPU weighted | ||
5619 | * according to the scheduling class and "nice" value. | ||
5620 | */ | ||
5621 | static unsigned long target_load(int cpu, int type) | ||
5622 | { | ||
5623 | struct rq *rq = cpu_rq(cpu); | ||
5624 | unsigned long total = weighted_cpuload(rq); | ||
5625 | |||
5626 | if (type == 0 || !sched_feat(LB_BIAS)) | ||
5627 | return total; | ||
5628 | |||
5629 | return max(rq->cpu_load[type-1], total); | ||
5630 | } | ||
5631 | |||
5632 | static unsigned long capacity_of(int cpu) | 5374 | static unsigned long capacity_of(int cpu) |
5633 | { | 5375 | { |
5634 | return cpu_rq(cpu)->cpu_capacity; | 5376 | return cpu_rq(cpu)->cpu_capacity; |
@@ -5638,7 +5380,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
5638 | { | 5380 | { |
5639 | struct rq *rq = cpu_rq(cpu); | 5381 | struct rq *rq = cpu_rq(cpu); |
5640 | unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); | 5382 | unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); |
5641 | unsigned long load_avg = weighted_cpuload(rq); | 5383 | unsigned long load_avg = cpu_runnable_load(rq); |
5642 | 5384 | ||
5643 | if (nr_running) | 5385 | if (nr_running) |
5644 | return load_avg / nr_running; | 5386 | return load_avg / nr_running; |
@@ -5736,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
5736 | s64 this_eff_load, prev_eff_load; | 5478 | s64 this_eff_load, prev_eff_load; |
5737 | unsigned long task_load; | 5479 | unsigned long task_load; |
5738 | 5480 | ||
5739 | this_eff_load = target_load(this_cpu, sd->wake_idx); | 5481 | this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); |
5740 | 5482 | ||
5741 | if (sync) { | 5483 | if (sync) { |
5742 | unsigned long current_load = task_h_load(current); | 5484 | unsigned long current_load = task_h_load(current); |
@@ -5754,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
5754 | this_eff_load *= 100; | 5496 | this_eff_load *= 100; |
5755 | this_eff_load *= capacity_of(prev_cpu); | 5497 | this_eff_load *= capacity_of(prev_cpu); |
5756 | 5498 | ||
5757 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); | 5499 | prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); |
5758 | prev_eff_load -= task_load; | 5500 | prev_eff_load -= task_load; |
5759 | if (sched_feat(WA_BIAS)) | 5501 | if (sched_feat(WA_BIAS)) |
5760 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; | 5502 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; |
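To make the scaling above concrete: this_eff_load is multiplied by a flat 100 while prev_eff_load gets 100 + (imbalance_pct - 100) / 2, so the previous CPU's load is deliberately inflated before the capacity-weighted comparison. A numeric sketch with an assumed imbalance_pct of 117 and made-up loads and capacities (standalone, not kernel code):

#include <stdio.h>

int main(void)
{
        long this_load = 330, prev_load = 320;          /* effective loads after accounting for the task */
        long cap_this = 1024, cap_prev = 1024;
        long imbalance_pct = 117;                       /* assumed domain setting */

        long this_eff = this_load * 100 * cap_prev;
        long prev_eff = prev_load * (100 + (imbalance_pct - 100) / 2) * cap_this;

        /* the waker's CPU is chosen only if it ends up looking less loaded */
        printf("pull to waker CPU: %s\n", this_eff < prev_eff ? "yes" : "no");
        return 0;
}

With these numbers the unbiased comparison (330 vs 320) would reject the affine wakeup, but the ~8% inflation of prev_eff_load tips it toward pulling the task to the waker's CPU, which is the point of WA_BIAS.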
@@ -5815,14 +5557,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5815 | unsigned long this_runnable_load = ULONG_MAX; | 5557 | unsigned long this_runnable_load = ULONG_MAX; |
5816 | unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; | 5558 | unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; |
5817 | unsigned long most_spare = 0, this_spare = 0; | 5559 | unsigned long most_spare = 0, this_spare = 0; |
5818 | int load_idx = sd->forkexec_idx; | ||
5819 | int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; | 5560 | int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; |
5820 | unsigned long imbalance = scale_load_down(NICE_0_LOAD) * | 5561 | unsigned long imbalance = scale_load_down(NICE_0_LOAD) * |
5821 | (sd->imbalance_pct-100) / 100; | 5562 | (sd->imbalance_pct-100) / 100; |
5822 | 5563 | ||
5823 | if (sd_flag & SD_BALANCE_WAKE) | ||
5824 | load_idx = sd->wake_idx; | ||
5825 | |||
5826 | do { | 5564 | do { |
5827 | unsigned long load, avg_load, runnable_load; | 5565 | unsigned long load, avg_load, runnable_load; |
5828 | unsigned long spare_cap, max_spare_cap; | 5566 | unsigned long spare_cap, max_spare_cap; |
@@ -5831,7 +5569,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5831 | 5569 | ||
5832 | /* Skip over this group if it has no CPUs allowed */ | 5570 | /* Skip over this group if it has no CPUs allowed */ |
5833 | if (!cpumask_intersects(sched_group_span(group), | 5571 | if (!cpumask_intersects(sched_group_span(group), |
5834 | &p->cpus_allowed)) | 5572 | p->cpus_ptr)) |
5835 | continue; | 5573 | continue; |
5836 | 5574 | ||
5837 | local_group = cpumask_test_cpu(this_cpu, | 5575 | local_group = cpumask_test_cpu(this_cpu, |
@@ -5846,12 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5846 | max_spare_cap = 0; | 5584 | max_spare_cap = 0; |
5847 | 5585 | ||
5848 | for_each_cpu(i, sched_group_span(group)) { | 5586 | for_each_cpu(i, sched_group_span(group)) { |
5849 | /* Bias balancing toward CPUs of our domain */ | 5587 | load = cpu_runnable_load(cpu_rq(i)); |
5850 | if (local_group) | ||
5851 | load = source_load(i, load_idx); | ||
5852 | else | ||
5853 | load = target_load(i, load_idx); | ||
5854 | |||
5855 | runnable_load += load; | 5588 | runnable_load += load; |
5856 | 5589 | ||
5857 | avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); | 5590 | avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); |
@@ -5963,7 +5696,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this | |||
5963 | return cpumask_first(sched_group_span(group)); | 5696 | return cpumask_first(sched_group_span(group)); |
5964 | 5697 | ||
5965 | /* Traverse only the allowed CPUs */ | 5698 | /* Traverse only the allowed CPUs */ |
5966 | for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { | 5699 | for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) { |
5967 | if (available_idle_cpu(i)) { | 5700 | if (available_idle_cpu(i)) { |
5968 | struct rq *rq = cpu_rq(i); | 5701 | struct rq *rq = cpu_rq(i); |
5969 | struct cpuidle_state *idle = idle_get_state(rq); | 5702 | struct cpuidle_state *idle = idle_get_state(rq); |
@@ -5987,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this | |||
5987 | shallowest_idle_cpu = i; | 5720 | shallowest_idle_cpu = i; |
5988 | } | 5721 | } |
5989 | } else if (shallowest_idle_cpu == -1) { | 5722 | } else if (shallowest_idle_cpu == -1) { |
5990 | load = weighted_cpuload(cpu_rq(i)); | 5723 | load = cpu_runnable_load(cpu_rq(i)); |
5991 | if (load < min_load) { | 5724 | if (load < min_load) { |
5992 | min_load = load; | 5725 | min_load = load; |
5993 | least_loaded_cpu = i; | 5726 | least_loaded_cpu = i; |
@@ -6003,7 +5736,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
6003 | { | 5736 | { |
6004 | int new_cpu = cpu; | 5737 | int new_cpu = cpu; |
6005 | 5738 | ||
6006 | if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) | 5739 | if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr)) |
6007 | return prev_cpu; | 5740 | return prev_cpu; |
6008 | 5741 | ||
6009 | /* | 5742 | /* |
@@ -6120,7 +5853,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int | |||
6120 | if (!test_idle_cores(target, false)) | 5853 | if (!test_idle_cores(target, false)) |
6121 | return -1; | 5854 | return -1; |
6122 | 5855 | ||
6123 | cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); | 5856 | cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); |
6124 | 5857 | ||
6125 | for_each_cpu_wrap(core, cpus, target) { | 5858 | for_each_cpu_wrap(core, cpus, target) { |
6126 | bool idle = true; | 5859 | bool idle = true; |
@@ -6154,7 +5887,7 @@ static int select_idle_smt(struct task_struct *p, int target) | |||
6154 | return -1; | 5887 | return -1; |
6155 | 5888 | ||
6156 | for_each_cpu(cpu, cpu_smt_mask(target)) { | 5889 | for_each_cpu(cpu, cpu_smt_mask(target)) { |
6157 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | 5890 | if (!cpumask_test_cpu(cpu, p->cpus_ptr)) |
6158 | continue; | 5891 | continue; |
6159 | if (available_idle_cpu(cpu)) | 5892 | if (available_idle_cpu(cpu)) |
6160 | return cpu; | 5893 | return cpu; |
@@ -6218,7 +5951,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t | |||
6218 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { | 5951 | for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { |
6219 | if (!--nr) | 5952 | if (!--nr) |
6220 | return -1; | 5953 | return -1; |
6221 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | 5954 | if (!cpumask_test_cpu(cpu, p->cpus_ptr)) |
6222 | continue; | 5955 | continue; |
6223 | if (available_idle_cpu(cpu)) | 5956 | if (available_idle_cpu(cpu)) |
6224 | break; | 5957 | break; |
@@ -6255,7 +5988,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6255 | recent_used_cpu != target && | 5988 | recent_used_cpu != target && |
6256 | cpus_share_cache(recent_used_cpu, target) && | 5989 | cpus_share_cache(recent_used_cpu, target) && |
6257 | available_idle_cpu(recent_used_cpu) && | 5990 | available_idle_cpu(recent_used_cpu) && |
6258 | cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { | 5991 | cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { |
6259 | /* | 5992 | /* |
6260 | * Replace recent_used_cpu with prev as it is a potential | 5993 | * Replace recent_used_cpu with prev as it is a potential |
6261 | * candidate for the next wake: | 5994 | * candidate for the next wake: |
@@ -6499,11 +6232,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) | |||
6499 | static long | 6232 | static long |
6500 | compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) | 6233 | compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) |
6501 | { | 6234 | { |
6502 | long util, max_util, sum_util, energy = 0; | 6235 | unsigned int max_util, util_cfs, cpu_util, cpu_cap; |
6236 | unsigned long sum_util, energy = 0; | ||
6237 | struct task_struct *tsk; | ||
6503 | int cpu; | 6238 | int cpu; |
6504 | 6239 | ||
6505 | for (; pd; pd = pd->next) { | 6240 | for (; pd; pd = pd->next) { |
6241 | struct cpumask *pd_mask = perf_domain_span(pd); | ||
6242 | |||
6243 | /* | ||
6244 | * The energy model mandates all the CPUs of a performance | ||
6245 | * domain have the same capacity. | ||
6246 | */ | ||
6247 | cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); | ||
6506 | max_util = sum_util = 0; | 6248 | max_util = sum_util = 0; |
6249 | |||
6507 | /* | 6250 | /* |
6508 | * The capacity state of CPUs of the current rd can be driven by | 6251 | * The capacity state of CPUs of the current rd can be driven by |
6509 | * CPUs of another rd if they belong to the same performance | 6252 | * CPUs of another rd if they belong to the same performance |
@@ -6514,11 +6257,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) | |||
6514 | * it will not appear in its pd list and will not be accounted | 6257 | * it will not appear in its pd list and will not be accounted |
6515 | * by compute_energy(). | 6258 | * by compute_energy(). |
6516 | */ | 6259 | */ |
6517 | for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { | 6260 | for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { |
6518 | util = cpu_util_next(cpu, p, dst_cpu); | 6261 | util_cfs = cpu_util_next(cpu, p, dst_cpu); |
6519 | util = schedutil_energy_util(cpu, util); | 6262 | |
6520 | max_util = max(util, max_util); | 6263 | /* |
6521 | sum_util += util; | 6264 | * Busy time computation: utilization clamping is not |
6265 | * required since the ratio (sum_util / cpu_capacity) | ||
6266 | * is already enough to scale the EM reported power | ||
6267 | * consumption at the (possibly clamped) cpu_capacity. | ||
6268 | */ | ||
6269 | sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, | ||
6270 | ENERGY_UTIL, NULL); | ||
6271 | |||
6272 | /* | ||
6273 | * Performance domain frequency: utilization clamping | ||
6274 | * must be considered since it affects the selection | ||
6275 | * of the performance domain frequency. | ||
6276 | * NOTE: when RT tasks are running, by default the | ||
6277 | * FREQUENCY_UTIL's utilization can be the maximum OPP. | ||
6278 | */ | ||
6279 | tsk = cpu == dst_cpu ? p : NULL; | ||
6280 | cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, | ||
6281 | FREQUENCY_UTIL, tsk); | ||
6282 | max_util = max(max_util, cpu_util); | ||
6522 | } | 6283 | } |
6523 | 6284 | ||
6524 | energy += em_pd_energy(pd->em_pd, max_util, sum_util); | 6285 | energy += em_pd_energy(pd->em_pd, max_util, sum_util); |
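The two aggregates built in the loop above play different roles: max_util (clamped, FREQUENCY_UTIL) selects the performance state the whole domain must run at, while sum_util (unclamped, ENERGY_UTIL) measures how busy the domain is at that state. A rough toy model of how em_pd_energy() might combine them — an assumption for illustration, not the real energy-model code; perf_state, pd_energy() and the numbers are invented:

#include <stdio.h>

struct perf_state {
        unsigned long cap;      /* capacity delivered at this OPP */
        unsigned long cost;     /* relative cost of running at this OPP */
};

static unsigned long pd_energy(const struct perf_state *ps, int nr,
                               unsigned long max_util, unsigned long sum_util,
                               unsigned long cpu_cap)
{
        int i;

        /* lowest OPP whose capacity covers the highest per-CPU request */
        for (i = 0; i < nr - 1; i++)
                if (ps[i].cap >= max_util)
                        break;

        /* scale that state's cost by how busy the domain actually is */
        return ps[i].cost * sum_util / cpu_cap;
}

int main(void)
{
        struct perf_state ps[] = { { 512, 100 }, { 768, 180 }, { 1024, 300 } };

        /* one CPU at util 600 plus three mostly idle ones: sum_util = 750 */
        printf("estimated energy: %lu\n", pd_energy(ps, 3, 600, 750, 1024));
        return 0;
}

This also shows why clamping matters only for max_util: a boosted task raises the chosen OPP (and hence the per-unit cost), while sum_util keeps reflecting the real busy time being scaled by it.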
@@ -6601,7 +6362,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) | |||
6601 | int max_spare_cap_cpu = -1; | 6362 | int max_spare_cap_cpu = -1; |
6602 | 6363 | ||
6603 | for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { | 6364 | for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { |
6604 | if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) | 6365 | if (!cpumask_test_cpu(cpu, p->cpus_ptr)) |
6605 | continue; | 6366 | continue; |
6606 | 6367 | ||
6607 | /* Skip CPUs that will be overutilized. */ | 6368 | /* Skip CPUs that will be overutilized. */ |
@@ -6690,7 +6451,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
6690 | } | 6451 | } |
6691 | 6452 | ||
6692 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && | 6453 | want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && |
6693 | cpumask_test_cpu(cpu, &p->cpus_allowed); | 6454 | cpumask_test_cpu(cpu, p->cpus_ptr); |
6694 | } | 6455 | } |
6695 | 6456 | ||
6696 | rcu_read_lock(); | 6457 | rcu_read_lock(); |
@@ -7446,14 +7207,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
7446 | /* | 7207 | /* |
7447 | * We do not migrate tasks that are: | 7208 | * We do not migrate tasks that are: |
7448 | * 1) throttled_lb_pair, or | 7209 | * 1) throttled_lb_pair, or |
7449 | * 2) cannot be migrated to this CPU due to cpus_allowed, or | 7210 | * 2) cannot be migrated to this CPU due to cpus_ptr, or |
7450 | * 3) running (obviously), or | 7211 | * 3) running (obviously), or |
7451 | * 4) are cache-hot on their current CPU. | 7212 | * 4) are cache-hot on their current CPU. |
7452 | */ | 7213 | */ |
7453 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) | 7214 | if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) |
7454 | return 0; | 7215 | return 0; |
7455 | 7216 | ||
7456 | if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { | 7217 | if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { |
7457 | int cpu; | 7218 | int cpu; |
7458 | 7219 | ||
7459 | schedstat_inc(p->se.statistics.nr_failed_migrations_affine); | 7220 | schedstat_inc(p->se.statistics.nr_failed_migrations_affine); |
@@ -7473,7 +7234,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
7473 | 7234 | ||
7474 | /* Prevent to re-select dst_cpu via env's CPUs: */ | 7235 | /* Prevent to re-select dst_cpu via env's CPUs: */ |
7475 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 7236 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
7476 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { | 7237 | if (cpumask_test_cpu(cpu, p->cpus_ptr)) { |
7477 | env->flags |= LBF_DST_PINNED; | 7238 | env->flags |= LBF_DST_PINNED; |
7478 | env->new_dst_cpu = cpu; | 7239 | env->new_dst_cpu = cpu; |
7479 | break; | 7240 | break; |
@@ -7559,7 +7320,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) | |||
7559 | static const unsigned int sched_nr_migrate_break = 32; | 7320 | static const unsigned int sched_nr_migrate_break = 32; |
7560 | 7321 | ||
7561 | /* | 7322 | /* |
7562 | * detach_tasks() -- tries to detach up to imbalance weighted load from | 7323 | * detach_tasks() -- tries to detach up to imbalance runnable load from |
7563 | * busiest_rq, as part of a balancing operation within domain "sd". | 7324 | * busiest_rq, as part of a balancing operation within domain "sd". |
7564 | * | 7325 | * |
7565 | * Returns number of detached tasks if successful and 0 otherwise. | 7326 | * Returns number of detached tasks if successful and 0 otherwise. |
@@ -7627,7 +7388,7 @@ static int detach_tasks(struct lb_env *env) | |||
7627 | 7388 | ||
7628 | /* | 7389 | /* |
7629 | * We only want to steal up to the prescribed amount of | 7390 | * We only want to steal up to the prescribed amount of |
7630 | * weighted load. | 7391 | * runnable load. |
7631 | */ | 7392 | */ |
7632 | if (env->imbalance <= 0) | 7393 | if (env->imbalance <= 0) |
7633 | break; | 7394 | break; |
@@ -7696,6 +7457,7 @@ static void attach_tasks(struct lb_env *env) | |||
7696 | rq_unlock(env->dst_rq, &rf); | 7457 | rq_unlock(env->dst_rq, &rf); |
7697 | } | 7458 | } |
7698 | 7459 | ||
7460 | #ifdef CONFIG_NO_HZ_COMMON | ||
7699 | static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) | 7461 | static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) |
7700 | { | 7462 | { |
7701 | if (cfs_rq->avg.load_avg) | 7463 | if (cfs_rq->avg.load_avg) |
@@ -7723,6 +7485,19 @@ static inline bool others_have_blocked(struct rq *rq) | |||
7723 | return false; | 7485 | return false; |
7724 | } | 7486 | } |
7725 | 7487 | ||
7488 | static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) | ||
7489 | { | ||
7490 | rq->last_blocked_load_update_tick = jiffies; | ||
7491 | |||
7492 | if (!has_blocked) | ||
7493 | rq->has_blocked_load = 0; | ||
7494 | } | ||
7495 | #else | ||
7496 | static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; } | ||
7497 | static inline bool others_have_blocked(struct rq *rq) { return false; } | ||
7498 | static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} | ||
7499 | #endif | ||
7500 | |||
7726 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7501 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7727 | 7502 | ||
7728 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) | 7503 | static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) |
@@ -7788,11 +7563,7 @@ static void update_blocked_averages(int cpu) | |||
7788 | if (others_have_blocked(rq)) | 7563 | if (others_have_blocked(rq)) |
7789 | done = false; | 7564 | done = false; |
7790 | 7565 | ||
7791 | #ifdef CONFIG_NO_HZ_COMMON | 7566 | update_blocked_load_status(rq, !done); |
7792 | rq->last_blocked_load_update_tick = jiffies; | ||
7793 | if (done) | ||
7794 | rq->has_blocked_load = 0; | ||
7795 | #endif | ||
7796 | rq_unlock_irqrestore(rq, &rf); | 7567 | rq_unlock_irqrestore(rq, &rf); |
7797 | } | 7568 | } |
7798 | 7569 | ||
@@ -7858,11 +7629,7 @@ static inline void update_blocked_averages(int cpu) | |||
7858 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); | 7629 | update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); |
7859 | update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); | 7630 | update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); |
7860 | update_irq_load_avg(rq, 0); | 7631 | update_irq_load_avg(rq, 0); |
7861 | #ifdef CONFIG_NO_HZ_COMMON | 7632 | update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq)); |
7862 | rq->last_blocked_load_update_tick = jiffies; | ||
7863 | if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq)) | ||
7864 | rq->has_blocked_load = 0; | ||
7865 | #endif | ||
7866 | rq_unlock_irqrestore(rq, &rf); | 7633 | rq_unlock_irqrestore(rq, &rf); |
7867 | } | 7634 | } |
7868 | 7635 | ||
@@ -7880,7 +7647,6 @@ static unsigned long task_h_load(struct task_struct *p) | |||
7880 | struct sg_lb_stats { | 7647 | struct sg_lb_stats { |
7881 | unsigned long avg_load; /* Avg load across the CPUs of the group */ | 7648 | unsigned long avg_load; /* Avg load across the CPUs of the group */ |
7882 | unsigned long group_load; /* Total load over the CPUs of the group */ | 7649 | unsigned long group_load; /* Total load over the CPUs of the group */ |
7883 | unsigned long sum_weighted_load; /* Weighted load of group's tasks */ | ||
7884 | unsigned long load_per_task; | 7650 | unsigned long load_per_task; |
7885 | unsigned long group_capacity; | 7651 | unsigned long group_capacity; |
7886 | unsigned long group_util; /* Total utilization of the group */ | 7652 | unsigned long group_util; /* Total utilization of the group */ |
@@ -7934,38 +7700,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds) | |||
7934 | }; | 7700 | }; |
7935 | } | 7701 | } |
7936 | 7702 | ||
7937 | /** | ||
7938 | * get_sd_load_idx - Obtain the load index for a given sched domain. | ||
7939 | * @sd: The sched_domain whose load_idx is to be obtained. | ||
7940 | * @idle: The idle status of the CPU for whose sd load_idx is obtained. | ||
7941 | * | ||
7942 | * Return: The load index. | ||
7943 | */ | ||
7944 | static inline int get_sd_load_idx(struct sched_domain *sd, | ||
7945 | enum cpu_idle_type idle) | ||
7946 | { | ||
7947 | int load_idx; | ||
7948 | |||
7949 | switch (idle) { | ||
7950 | case CPU_NOT_IDLE: | ||
7951 | load_idx = sd->busy_idx; | ||
7952 | break; | ||
7953 | |||
7954 | case CPU_NEWLY_IDLE: | ||
7955 | load_idx = sd->newidle_idx; | ||
7956 | break; | ||
7957 | default: | ||
7958 | load_idx = sd->idle_idx; | ||
7959 | break; | ||
7960 | } | ||
7961 | |||
7962 | return load_idx; | ||
7963 | } | ||
7964 | |||
7965 | static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) | 7703 | static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) |
7966 | { | 7704 | { |
7967 | struct rq *rq = cpu_rq(cpu); | 7705 | struct rq *rq = cpu_rq(cpu); |
7968 | unsigned long max = arch_scale_cpu_capacity(sd, cpu); | 7706 | unsigned long max = arch_scale_cpu_capacity(cpu); |
7969 | unsigned long used, free; | 7707 | unsigned long used, free; |
7970 | unsigned long irq; | 7708 | unsigned long irq; |
7971 | 7709 | ||
@@ -7990,7 +7728,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
7990 | unsigned long capacity = scale_rt_capacity(sd, cpu); | 7728 | unsigned long capacity = scale_rt_capacity(sd, cpu); |
7991 | struct sched_group *sdg = sd->groups; | 7729 | struct sched_group *sdg = sd->groups; |
7992 | 7730 | ||
7993 | cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); | 7731 | cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); |
7994 | 7732 | ||
7995 | if (!capacity) | 7733 | if (!capacity) |
7996 | capacity = 1; | 7734 | capacity = 1; |
@@ -8100,7 +7838,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) | |||
8100 | 7838 | ||
8101 | /* | 7839 | /* |
8102 | * Group imbalance indicates (and tries to solve) the problem where balancing | 7840 | * Group imbalance indicates (and tries to solve) the problem where balancing |
8103 | * groups is inadequate due to ->cpus_allowed constraints. | 7841 | * groups is inadequate due to ->cpus_ptr constraints. |
8104 | * | 7842 | * |
8105 | * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a | 7843 | * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a |
8106 | * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. | 7844 | * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. |
@@ -8250,9 +7988,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
8250 | struct sg_lb_stats *sgs, | 7988 | struct sg_lb_stats *sgs, |
8251 | int *sg_status) | 7989 | int *sg_status) |
8252 | { | 7990 | { |
8253 | int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group)); | ||
8254 | int load_idx = get_sd_load_idx(env->sd, env->idle); | ||
8255 | unsigned long load; | ||
8256 | int i, nr_running; | 7991 | int i, nr_running; |
8257 | 7992 | ||
8258 | memset(sgs, 0, sizeof(*sgs)); | 7993 | memset(sgs, 0, sizeof(*sgs)); |
@@ -8263,13 +7998,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
8263 | if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) | 7998 | if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) |
8264 | env->flags |= LBF_NOHZ_AGAIN; | 7999 | env->flags |= LBF_NOHZ_AGAIN; |
8265 | 8000 | ||
8266 | /* Bias balancing toward CPUs of our domain: */ | 8001 | sgs->group_load += cpu_runnable_load(rq); |
8267 | if (local_group) | ||
8268 | load = target_load(i, load_idx); | ||
8269 | else | ||
8270 | load = source_load(i, load_idx); | ||
8271 | |||
8272 | sgs->group_load += load; | ||
8273 | sgs->group_util += cpu_util(i); | 8002 | sgs->group_util += cpu_util(i); |
8274 | sgs->sum_nr_running += rq->cfs.h_nr_running; | 8003 | sgs->sum_nr_running += rq->cfs.h_nr_running; |
8275 | 8004 | ||
@@ -8284,7 +8013,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
8284 | sgs->nr_numa_running += rq->nr_numa_running; | 8013 | sgs->nr_numa_running += rq->nr_numa_running; |
8285 | sgs->nr_preferred_running += rq->nr_preferred_running; | 8014 | sgs->nr_preferred_running += rq->nr_preferred_running; |
8286 | #endif | 8015 | #endif |
8287 | sgs->sum_weighted_load += weighted_cpuload(rq); | ||
8288 | /* | 8016 | /* |
8289 | * No need to call idle_cpu() if nr_running is not 0 | 8017 | * No need to call idle_cpu() if nr_running is not 0 |
8290 | */ | 8018 | */ |
@@ -8303,7 +8031,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
8303 | sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; | 8031 | sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; |
8304 | 8032 | ||
8305 | if (sgs->sum_nr_running) | 8033 | if (sgs->sum_nr_running) |
8306 | sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; | 8034 | sgs->load_per_task = sgs->group_load / sgs->sum_nr_running; |
8307 | 8035 | ||
8308 | sgs->group_weight = group->group_weight; | 8036 | sgs->group_weight = group->group_weight; |
8309 | 8037 | ||
@@ -8517,8 +8245,12 @@ next_group: | |||
8517 | 8245 | ||
8518 | /* Update over-utilization (tipping point, U >= 0) indicator */ | 8246 | /* Update over-utilization (tipping point, U >= 0) indicator */ |
8519 | WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); | 8247 | WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); |
8248 | trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED); | ||
8520 | } else if (sg_status & SG_OVERUTILIZED) { | 8249 | } else if (sg_status & SG_OVERUTILIZED) { |
8521 | WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); | 8250 | struct root_domain *rd = env->dst_rq->rd; |
8251 | |||
8252 | WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED); | ||
8253 | trace_sched_overutilized_tp(rd, SG_OVERUTILIZED); | ||
8522 | } | 8254 | } |
8523 | } | 8255 | } |
8524 | 8256 | ||
@@ -8724,7 +8456,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
8724 | * find_busiest_group - Returns the busiest group within the sched_domain | 8456 | * find_busiest_group - Returns the busiest group within the sched_domain |
8725 | * if there is an imbalance. | 8457 | * if there is an imbalance. |
8726 | * | 8458 | * |
8727 | * Also calculates the amount of weighted load which should be moved | 8459 | * Also calculates the amount of runnable load which should be moved |
8728 | * to restore balance. | 8460 | * to restore balance. |
8729 | * | 8461 | * |
8730 | * @env: The load balancing environment. | 8462 | * @env: The load balancing environment. |
@@ -8769,7 +8501,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
8769 | /* | 8501 | /* |
8770 | * If the busiest group is imbalanced the below checks don't | 8502 | * If the busiest group is imbalanced the below checks don't |
8771 | * work because they assume all things are equal, which typically | 8503 | * work because they assume all things are equal, which typically |
8772 | * isn't true due to cpus_allowed constraints and the like. | 8504 | * isn't true due to cpus_ptr constraints and the like. |
8773 | */ | 8505 | */ |
8774 | if (busiest->group_type == group_imbalanced) | 8506 | if (busiest->group_type == group_imbalanced) |
8775 | goto force_balance; | 8507 | goto force_balance; |
@@ -8843,7 +8575,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
8843 | int i; | 8575 | int i; |
8844 | 8576 | ||
8845 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { | 8577 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { |
8846 | unsigned long capacity, wl; | 8578 | unsigned long capacity, load; |
8847 | enum fbq_type rt; | 8579 | enum fbq_type rt; |
8848 | 8580 | ||
8849 | rq = cpu_rq(i); | 8581 | rq = cpu_rq(i); |
@@ -8897,30 +8629,30 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
8897 | rq->nr_running == 1) | 8629 | rq->nr_running == 1) |
8898 | continue; | 8630 | continue; |
8899 | 8631 | ||
8900 | wl = weighted_cpuload(rq); | 8632 | load = cpu_runnable_load(rq); |
8901 | 8633 | ||
8902 | /* | 8634 | /* |
8903 | * When comparing with imbalance, use weighted_cpuload() | 8635 | * When comparing with imbalance, use cpu_runnable_load() |
8904 | * which is not scaled with the CPU capacity. | 8636 | * which is not scaled with the CPU capacity. |
8905 | */ | 8637 | */ |
8906 | 8638 | ||
8907 | if (rq->nr_running == 1 && wl > env->imbalance && | 8639 | if (rq->nr_running == 1 && load > env->imbalance && |
8908 | !check_cpu_capacity(rq, env->sd)) | 8640 | !check_cpu_capacity(rq, env->sd)) |
8909 | continue; | 8641 | continue; |
8910 | 8642 | ||
8911 | /* | 8643 | /* |
8912 | * For the load comparisons with the other CPUs, consider | 8644 | * For the load comparisons with the other CPUs, consider |
8913 | * the weighted_cpuload() scaled with the CPU capacity, so | 8645 | * the cpu_runnable_load() scaled with the CPU capacity, so |
8914 | * that the load can be moved away from the CPU that is | 8646 | * that the load can be moved away from the CPU that is |
8915 | * potentially running at a lower capacity. | 8647 | * potentially running at a lower capacity. |
8916 | * | 8648 | * |
8917 | * Thus we're looking for max(wl_i / capacity_i), crosswise | 8649 | * Thus we're looking for max(load_i / capacity_i), crosswise |
8918 | * multiplication to rid ourselves of the division works out | 8650 | * multiplication to rid ourselves of the division works out |
8919 | * to: wl_i * capacity_j > wl_j * capacity_i; where j is | 8651 | * to: load_i * capacity_j > load_j * capacity_i; where j is |
8920 | * our previous maximum. | 8652 | * our previous maximum. |
8921 | */ | 8653 | */ |
8922 | if (wl * busiest_capacity > busiest_load * capacity) { | 8654 | if (load * busiest_capacity > busiest_load * capacity) { |
8923 | busiest_load = wl; | 8655 | busiest_load = load; |
8924 | busiest_capacity = capacity; | 8656 | busiest_capacity = capacity; |
8925 | busiest = rq; | 8657 | busiest = rq; |
8926 | } | 8658 | } |
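The comment above replaces a per-CPU division with a cross multiplication: load_i / capacity_i > load_j / capacity_j is evaluated as load_i * capacity_j > load_j * capacity_i, so the busiest-queue pick never divides. A minimal sketch with made-up numbers (standalone, not kernel code):

#include <stdio.h>

int main(void)
{
        unsigned long load_i = 900, cap_i = 512;        /* small CPU, nearly full */
        unsigned long load_j = 1000, cap_j = 1024;      /* big CPU, about half busy */

        /* load_i / cap_i > load_j / cap_j  <=>  load_i * cap_j > load_j * cap_i */
        if (load_i * cap_j > load_j * cap_i)
                printf("CPU i is busier relative to its capacity\n");
        return 0;
}

Here 900/512 is about 1.76 versus 1000/1024 at about 0.98, and the integer comparison (921600 > 512000) preserves that ordering exactly without any division or rounding.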
@@ -9211,7 +8943,7 @@ more_balance: | |||
9211 | * if the curr task on busiest CPU can't be | 8943 | * if the curr task on busiest CPU can't be |
9212 | * moved to this_cpu: | 8944 | * moved to this_cpu: |
9213 | */ | 8945 | */ |
9214 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | 8946 | if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { |
9215 | raw_spin_unlock_irqrestore(&busiest->lock, | 8947 | raw_spin_unlock_irqrestore(&busiest->lock, |
9216 | flags); | 8948 | flags); |
9217 | env.flags |= LBF_ALL_PINNED; | 8949 | env.flags |= LBF_ALL_PINNED; |
@@ -9880,7 +9612,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, | |||
9880 | 9612 | ||
9881 | rq_lock_irqsave(rq, &rf); | 9613 | rq_lock_irqsave(rq, &rf); |
9882 | update_rq_clock(rq); | 9614 | update_rq_clock(rq); |
9883 | cpu_load_update_idle(rq); | ||
9884 | rq_unlock_irqrestore(rq, &rf); | 9615 | rq_unlock_irqrestore(rq, &rf); |
9885 | 9616 | ||
9886 | if (flags & NOHZ_BALANCE_KICK) | 9617 | if (flags & NOHZ_BALANCE_KICK) |
@@ -10691,6 +10422,10 @@ const struct sched_class fair_sched_class = { | |||
10691 | #ifdef CONFIG_FAIR_GROUP_SCHED | 10422 | #ifdef CONFIG_FAIR_GROUP_SCHED |
10692 | .task_change_group = task_change_group_fair, | 10423 | .task_change_group = task_change_group_fair, |
10693 | #endif | 10424 | #endif |
10425 | |||
10426 | #ifdef CONFIG_UCLAMP_TASK | ||
10427 | .uclamp_enabled = 1, | ||
10428 | #endif | ||
10694 | }; | 10429 | }; |
10695 | 10430 | ||
10696 | #ifdef CONFIG_SCHED_DEBUG | 10431 | #ifdef CONFIG_SCHED_DEBUG |
@@ -10738,3 +10473,83 @@ __init void init_sched_fair_class(void) | |||
10738 | #endif /* SMP */ | 10473 | #endif /* SMP */ |
10739 | 10474 | ||
10740 | } | 10475 | } |
10476 | |||
10477 | /* | ||
10478 | * Helper functions to facilitate extracting info from tracepoints. | ||
10479 | */ | ||
10480 | |||
10481 | const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq) | ||
10482 | { | ||
10483 | #ifdef CONFIG_SMP | ||
10484 | return cfs_rq ? &cfs_rq->avg : NULL; | ||
10485 | #else | ||
10486 | return NULL; | ||
10487 | #endif | ||
10488 | } | ||
10489 | EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg); | ||
10490 | |||
10491 | char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len) | ||
10492 | { | ||
10493 | if (!cfs_rq) { | ||
10494 | if (str) | ||
10495 | strlcpy(str, "(null)", len); | ||
10496 | else | ||
10497 | return NULL; | ||
10498 | } | ||
10499 | |||
10500 | cfs_rq_tg_path(cfs_rq, str, len); | ||
10501 | return str; | ||
10502 | } | ||
10503 | EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path); | ||
10504 | |||
10505 | int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq) | ||
10506 | { | ||
10507 | return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1; | ||
10508 | } | ||
10509 | EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu); | ||
10510 | |||
10511 | const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq) | ||
10512 | { | ||
10513 | #ifdef CONFIG_SMP | ||
10514 | return rq ? &rq->avg_rt : NULL; | ||
10515 | #else | ||
10516 | return NULL; | ||
10517 | #endif | ||
10518 | } | ||
10519 | EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt); | ||
10520 | |||
10521 | const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq) | ||
10522 | { | ||
10523 | #ifdef CONFIG_SMP | ||
10524 | return rq ? &rq->avg_dl : NULL; | ||
10525 | #else | ||
10526 | return NULL; | ||
10527 | #endif | ||
10528 | } | ||
10529 | EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl); | ||
10530 | |||
10531 | const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq) | ||
10532 | { | ||
10533 | #if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ) | ||
10534 | return rq ? &rq->avg_irq : NULL; | ||
10535 | #else | ||
10536 | return NULL; | ||
10537 | #endif | ||
10538 | } | ||
10539 | EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq); | ||
10540 | |||
10541 | int sched_trace_rq_cpu(struct rq *rq) | ||
10542 | { | ||
10543 | return rq ? cpu_of(rq) : -1; | ||
10544 | } | ||
10545 | EXPORT_SYMBOL_GPL(sched_trace_rq_cpu); | ||
10546 | |||
10547 | const struct cpumask *sched_trace_rd_span(struct root_domain *rd) | ||
10548 | { | ||
10549 | #ifdef CONFIG_SMP | ||
10550 | return rd ? rd->span : NULL; | ||
10551 | #else | ||
10552 | return NULL; | ||
10553 | #endif | ||
10554 | } | ||
10555 | EXPORT_SYMBOL_GPL(sched_trace_rd_span); | ||
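The sched_trace_*() accessors above exist so that out-of-tree tooling (such as the LISA regression testing mentioned in the merge) can attach probes to the new bare tracepoints without depending on scheduler-internal struct layouts. A hedged sketch of how such a module might look — this is assumed usage, not part of the patch, and the probe body and printed fields are illustrative:

#include <linux/module.h>
#include <linux/tracepoint.h>
#include <trace/events/sched.h>

/* Probe for the bare pelt_cfs tracepoint added by this series. */
static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
        const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);
        char path[64];

        if (!avg)
                return;

        trace_printk("cpu=%d path=%s util_avg=%lu\n",
                     sched_trace_cfs_rq_cpu(cfs_rq),
                     sched_trace_cfs_rq_path(cfs_rq, path, sizeof(path)),
                     avg->util_avg);
}

static int __init pelt_probe_init(void)
{
        return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
        unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
        tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");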
diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 858589b83377..2410db5e9a35 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h | |||
@@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true) | |||
39 | 39 | ||
40 | SCHED_FEAT(HRTICK, false) | 40 | SCHED_FEAT(HRTICK, false) |
41 | SCHED_FEAT(DOUBLE_TICK, false) | 41 | SCHED_FEAT(DOUBLE_TICK, false) |
42 | SCHED_FEAT(LB_BIAS, false) | ||
43 | 42 | ||
44 | /* | 43 | /* |
45 | * Decrement CPU capacity based on time not spent running tasks | 44 | * Decrement CPU capacity based on time not spent running tasks |
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index befce29bd882..a96db50d40e0 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c | |||
@@ -28,6 +28,8 @@ | |||
28 | #include "sched.h" | 28 | #include "sched.h" |
29 | #include "pelt.h" | 29 | #include "pelt.h" |
30 | 30 | ||
31 | #include <trace/events/sched.h> | ||
32 | |||
31 | /* | 33 | /* |
32 | * Approximate: | 34 | * Approximate: |
33 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) | 35 | * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) |
@@ -265,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se) | |||
265 | { | 267 | { |
266 | if (___update_load_sum(now, &se->avg, 0, 0, 0)) { | 268 | if (___update_load_sum(now, &se->avg, 0, 0, 0)) { |
267 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | 269 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); |
270 | trace_pelt_se_tp(se); | ||
268 | return 1; | 271 | return 1; |
269 | } | 272 | } |
270 | 273 | ||
@@ -278,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se | |||
278 | 281 | ||
279 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); | 282 | ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); |
280 | cfs_se_util_change(&se->avg); | 283 | cfs_se_util_change(&se->avg); |
284 | trace_pelt_se_tp(se); | ||
281 | return 1; | 285 | return 1; |
282 | } | 286 | } |
283 | 287 | ||
@@ -292,6 +296,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq) | |||
292 | cfs_rq->curr != NULL)) { | 296 | cfs_rq->curr != NULL)) { |
293 | 297 | ||
294 | ___update_load_avg(&cfs_rq->avg, 1, 1); | 298 | ___update_load_avg(&cfs_rq->avg, 1, 1); |
299 | trace_pelt_cfs_tp(cfs_rq); | ||
295 | return 1; | 300 | return 1; |
296 | } | 301 | } |
297 | 302 | ||
@@ -317,6 +322,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running) | |||
317 | running)) { | 322 | running)) { |
318 | 323 | ||
319 | ___update_load_avg(&rq->avg_rt, 1, 1); | 324 | ___update_load_avg(&rq->avg_rt, 1, 1); |
325 | trace_pelt_rt_tp(rq); | ||
320 | return 1; | 326 | return 1; |
321 | } | 327 | } |
322 | 328 | ||
@@ -340,6 +346,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) | |||
340 | running)) { | 346 | running)) { |
341 | 347 | ||
342 | ___update_load_avg(&rq->avg_dl, 1, 1); | 348 | ___update_load_avg(&rq->avg_dl, 1, 1); |
349 | trace_pelt_dl_tp(rq); | ||
343 | return 1; | 350 | return 1; |
344 | } | 351 | } |
345 | 352 | ||
@@ -366,7 +373,7 @@ int update_irq_load_avg(struct rq *rq, u64 running) | |||
366 | * reflect the real amount of computation | 373 | * reflect the real amount of computation |
367 | */ | 374 | */ |
368 | running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); | 375 | running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); |
369 | running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); | 376 | running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq))); |
370 | 377 | ||
371 | /* | 378 | /* |
372 | * We know the time that has been used by interrupt since last update | 379 | * We know the time that has been used by interrupt since last update |
@@ -388,8 +395,10 @@ int update_irq_load_avg(struct rq *rq, u64 running) | |||
388 | 1, | 395 | 1, |
389 | 1); | 396 | 1); |
390 | 397 | ||
391 | if (ret) | 398 | if (ret) { |
392 | ___update_load_avg(&rq->avg_irq, 1, 1); | 399 | ___update_load_avg(&rq->avg_irq, 1, 1); |
400 | trace_pelt_irq_tp(rq); | ||
401 | } | ||
393 | 402 | ||
394 | return ret; | 403 | return ret; |
395 | } | 404 | } |
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 7489d5f56960..afff644da065 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h | |||
@@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta) | |||
79 | * Scale the elapsed time to reflect the real amount of | 79 | * Scale the elapsed time to reflect the real amount of |
80 | * computation | 80 | * computation |
81 | */ | 81 | */ |
82 | delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); | 82 | delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq))); |
83 | delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); | 83 | delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); |
84 | 84 | ||
85 | rq->clock_pelt += delta; | 85 | rq->clock_pelt += delta; |
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 1e6b909dca36..a532558a5176 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
1614 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | 1614 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
1615 | { | 1615 | { |
1616 | if (!task_running(rq, p) && | 1616 | if (!task_running(rq, p) && |
1617 | cpumask_test_cpu(cpu, &p->cpus_allowed)) | 1617 | cpumask_test_cpu(cpu, p->cpus_ptr)) |
1618 | return 1; | 1618 | return 1; |
1619 | 1619 | ||
1620 | return 0; | 1620 | return 0; |
@@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
1751 | * Also make sure that it wasn't scheduled on its rq. | 1751 | * Also make sure that it wasn't scheduled on its rq. |
1752 | */ | 1752 | */ |
1753 | if (unlikely(task_rq(task) != rq || | 1753 | if (unlikely(task_rq(task) != rq || |
1754 | !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || | 1754 | !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || |
1755 | task_running(rq, task) || | 1755 | task_running(rq, task) || |
1756 | !rt_task(task) || | 1756 | !rt_task(task) || |
1757 | !task_on_rq_queued(task))) { | 1757 | !task_on_rq_queued(task))) { |
@@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = { | |||
2400 | .switched_to = switched_to_rt, | 2400 | .switched_to = switched_to_rt, |
2401 | 2401 | ||
2402 | .update_curr = update_curr_rt, | 2402 | .update_curr = update_curr_rt, |
2403 | |||
2404 | #ifdef CONFIG_UCLAMP_TASK | ||
2405 | .uclamp_enabled = 1, | ||
2406 | #endif | ||
2403 | }; | 2407 | }; |
2404 | 2408 | ||
2405 | #ifdef CONFIG_RT_GROUP_SCHED | 2409 | #ifdef CONFIG_RT_GROUP_SCHED |
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h index a26473674fb7..c529706bed11 100644 --- a/kernel/sched/sched-pelt.h +++ b/kernel/sched/sched-pelt.h | |||
@@ -1,7 +1,7 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ | 2 | /* Generated by Documentation/scheduler/sched-pelt; do not modify. */ |
3 | 3 | ||
4 | static const u32 runnable_avg_yN_inv[] = { | 4 | static const u32 runnable_avg_yN_inv[] __maybe_unused = { |
5 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, | 5 | 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, |
6 | 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, | 6 | 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, |
7 | 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, | 7 | 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b52ed1ada0be..802b1f3405f2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks; | |||
96 | extern void calc_global_load_tick(struct rq *this_rq); | 96 | extern void calc_global_load_tick(struct rq *this_rq); |
97 | extern long calc_load_fold_active(struct rq *this_rq, long adjust); | 97 | extern long calc_load_fold_active(struct rq *this_rq, long adjust); |
98 | 98 | ||
99 | #ifdef CONFIG_SMP | ||
100 | extern void cpu_load_update_active(struct rq *this_rq); | ||
101 | #else | ||
102 | static inline void cpu_load_update_active(struct rq *this_rq) { } | ||
103 | #endif | ||
104 | |||
105 | /* | 99 | /* |
106 | * Helpers for converting nanosecond timing to jiffy resolution | 100 | * Helpers for converting nanosecond timing to jiffy resolution |
107 | */ | 101 | */ |
@@ -344,8 +338,10 @@ struct cfs_bandwidth { | |||
344 | u64 runtime_expires; | 338 | u64 runtime_expires; |
345 | int expires_seq; | 339 | int expires_seq; |
346 | 340 | ||
347 | short idle; | 341 | u8 idle; |
348 | short period_active; | 342 | u8 period_active; |
343 | u8 distribute_running; | ||
344 | u8 slack_started; | ||
349 | struct hrtimer period_timer; | 345 | struct hrtimer period_timer; |
350 | struct hrtimer slack_timer; | 346 | struct hrtimer slack_timer; |
351 | struct list_head throttled_cfs_rq; | 347 | struct list_head throttled_cfs_rq; |
@@ -354,8 +350,6 @@ struct cfs_bandwidth { | |||
354 | int nr_periods; | 350 | int nr_periods; |
355 | int nr_throttled; | 351 | int nr_throttled; |
356 | u64 throttled_time; | 352 | u64 throttled_time; |
357 | |||
358 | bool distribute_running; | ||
359 | #endif | 353 | #endif |
360 | }; | 354 | }; |
361 | 355 | ||
@@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work); | |||
797 | #endif | 791 | #endif |
798 | #endif /* CONFIG_SMP */ | 792 | #endif /* CONFIG_SMP */ |
799 | 793 | ||
794 | #ifdef CONFIG_UCLAMP_TASK | ||
795 | /* | ||
796 | * struct uclamp_bucket - Utilization clamp bucket | ||
797 | * @value: utilization clamp value for tasks on this clamp bucket | ||
798 | * @tasks: number of RUNNABLE tasks on this clamp bucket | ||
799 | * | ||
800 | * Keep track of how many tasks are RUNNABLE for a given utilization | ||
801 | * clamp value. | ||
802 | */ | ||
803 | struct uclamp_bucket { | ||
804 | unsigned long value : bits_per(SCHED_CAPACITY_SCALE); | ||
805 | unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE); | ||
806 | }; | ||
807 | |||
808 | /* | ||
809 | * struct uclamp_rq - rq's utilization clamp | ||
810 | * @value: currently active clamp values for a rq | ||
811 | * @bucket: utilization clamp buckets affecting a rq | ||
812 | * | ||
813 | * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values. | ||
814 | * A clamp value is affecting a rq when there is at least one task RUNNABLE | ||
815 | * (or actually running) with that value. | ||
816 | * | ||
817 | * There are up to UCLAMP_CNT possible different clamp values, currently there | ||
818 | * are only two: minimum utilization and maximum utilization. | ||
819 | * | ||
820 | * All utilization clamping values are MAX aggregated, since: | ||
821 | * - for util_min: we want to run the CPU at least at the max of the minimum | ||
822 | * utilization required by its currently RUNNABLE tasks. | ||
823 | * - for util_max: we want to allow the CPU to run up to the max of the | ||
824 | * maximum utilization allowed by its currently RUNNABLE tasks. | ||
825 | * | ||
826 | * Since on each system we expect only a limited number of different | ||
827 | * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track | ||
828 | * the metrics required to compute all the per-rq utilization clamp values. | ||
829 | */ | ||
830 | struct uclamp_rq { | ||
831 | unsigned int value; | ||
832 | struct uclamp_bucket bucket[UCLAMP_BUCKETS]; | ||
833 | }; | ||
834 | #endif /* CONFIG_UCLAMP_TASK */ | ||
835 | |||
800 | /* | 836 | /* |
801 | * This is the main, per-CPU runqueue data structure. | 837 | * This is the main, per-CPU runqueue data structure. |
802 | * | 838 | * |
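
[Editor's note] The uclamp_bucket bit-fields above size the value field with bits_per(SCHED_CAPACITY_SCALE) and give the task counter whatever is left of an unsigned long. A minimal userspace sketch of that arithmetic follows, assuming SCHED_CAPACITY_SCALE is 1024 (the mainline default) and a 64-bit long, so the value gets 11 bits (enough for 0..1024) and the counter gets 53; note that bit-fields of type unsigned long are a compiler extension that GCC/Clang, and hence the kernel, accept.

/*
 * Standalone sketch of the uclamp_bucket packing above.  Assumptions:
 * SCHED_CAPACITY_SCALE == 1024 and BITS_PER_LONG == 64.
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024
#define BITS_PER_LONG        (8 * (int)sizeof(unsigned long))
#define UCLAMP_VALUE_BITS    11   /* bits_per(SCHED_CAPACITY_SCALE) */

struct uclamp_bucket {
	unsigned long value : UCLAMP_VALUE_BITS;
	unsigned long tasks : BITS_PER_LONG - UCLAMP_VALUE_BITS;
};

int main(void)
{
	struct uclamp_bucket b = { .value = SCHED_CAPACITY_SCALE, .tasks = 0 };

	printf("bucket size : %zu bytes\n", sizeof(b));   /* one long */
	printf("value bits  : %d\n", UCLAMP_VALUE_BITS);
	printf("tasks bits  : %d\n", BITS_PER_LONG - UCLAMP_VALUE_BITS);
	printf("stored value: %lu\n", (unsigned long)b.value);
	return 0;
}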
@@ -818,8 +854,6 @@ struct rq { | |||
818 | unsigned int nr_preferred_running; | 854 | unsigned int nr_preferred_running; |
819 | unsigned int numa_migrate_on; | 855 | unsigned int numa_migrate_on; |
820 | #endif | 856 | #endif |
821 | #define CPU_LOAD_IDX_MAX 5 | ||
822 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | ||
823 | #ifdef CONFIG_NO_HZ_COMMON | 857 | #ifdef CONFIG_NO_HZ_COMMON |
824 | #ifdef CONFIG_SMP | 858 | #ifdef CONFIG_SMP |
825 | unsigned long last_load_update_tick; | 859 | unsigned long last_load_update_tick; |
@@ -830,11 +864,16 @@ struct rq { | |||
830 | atomic_t nohz_flags; | 864 | atomic_t nohz_flags; |
831 | #endif /* CONFIG_NO_HZ_COMMON */ | 865 | #endif /* CONFIG_NO_HZ_COMMON */ |
832 | 866 | ||
833 | /* capture load from *all* tasks on this CPU: */ | ||
834 | struct load_weight load; | ||
835 | unsigned long nr_load_updates; | 867 | unsigned long nr_load_updates; |
836 | u64 nr_switches; | 868 | u64 nr_switches; |
837 | 869 | ||
870 | #ifdef CONFIG_UCLAMP_TASK | ||
871 | /* Utilization clamp values based on CPU's RUNNABLE tasks */ | ||
872 | struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned; | ||
873 | unsigned int uclamp_flags; | ||
874 | #define UCLAMP_FLAG_IDLE 0x01 | ||
875 | #endif | ||
876 | |||
838 | struct cfs_rq cfs; | 877 | struct cfs_rq cfs; |
839 | struct rt_rq rt; | 878 | struct rt_rq rt; |
840 | struct dl_rq dl; | 879 | struct dl_rq dl; |
@@ -1649,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40]; | |||
1649 | struct sched_class { | 1688 | struct sched_class { |
1650 | const struct sched_class *next; | 1689 | const struct sched_class *next; |
1651 | 1690 | ||
1691 | #ifdef CONFIG_UCLAMP_TASK | ||
1692 | int uclamp_enabled; | ||
1693 | #endif | ||
1694 | |||
1652 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); | 1695 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); |
1653 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); | 1696 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); |
1654 | void (*yield_task) (struct rq *rq); | 1697 | void (*yield_task) (struct rq *rq); |
@@ -2222,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) | |||
2222 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} | 2265 | static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} |
2223 | #endif /* CONFIG_CPU_FREQ */ | 2266 | #endif /* CONFIG_CPU_FREQ */ |
2224 | 2267 | ||
2268 | #ifdef CONFIG_UCLAMP_TASK | ||
2269 | unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); | ||
2270 | |||
2271 | static __always_inline | ||
2272 | unsigned int uclamp_util_with(struct rq *rq, unsigned int util, | ||
2273 | struct task_struct *p) | ||
2274 | { | ||
2275 | unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value); | ||
2276 | unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value); | ||
2277 | |||
2278 | if (p) { | ||
2279 | min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN)); | ||
2280 | max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX)); | ||
2281 | } | ||
2282 | |||
2283 | /* | ||
2284 | * Since CPU's {min,max}_util clamps are MAX aggregated considering | ||
2285 | * RUNNABLE tasks with _different_ clamps, we can end up with an | ||
2286 | * inversion. Fix it now when the clamps are applied. | ||
2287 | */ | ||
2288 | if (unlikely(min_util >= max_util)) | ||
2289 | return min_util; | ||
2290 | |||
2291 | return clamp(util, min_util, max_util); | ||
2292 | } | ||
2293 | |||
2294 | static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) | ||
2295 | { | ||
2296 | return uclamp_util_with(rq, util, NULL); | ||
2297 | } | ||
2298 | #else /* CONFIG_UCLAMP_TASK */ | ||
2299 | static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util, | ||
2300 | struct task_struct *p) | ||
2301 | { | ||
2302 | return util; | ||
2303 | } | ||
2304 | static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) | ||
2305 | { | ||
2306 | return util; | ||
2307 | } | ||
2308 | #endif /* CONFIG_UCLAMP_TASK */ | ||
2309 | |||
2225 | #ifdef arch_scale_freq_capacity | 2310 | #ifdef arch_scale_freq_capacity |
2226 | # ifndef arch_scale_freq_invariant | 2311 | # ifndef arch_scale_freq_invariant |
2227 | # define arch_scale_freq_invariant() true | 2312 | # define arch_scale_freq_invariant() true |
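
[Editor's note] The uclamp_util_with() hunk above applies the rq's MAX-aggregated min/max clamps to a utilization value and, because independently aggregated clamps can invert (min above max), lets the minimum clamp win in that case. A minimal userspace sketch of just that arithmetic, not the kernel helper itself:

/*
 * Standalone sketch of the clamping rule in uclamp_util_with():
 * if the aggregated clamps invert, the boost (min) request wins.
 */
#include <stdio.h>

static unsigned int clamp_util(unsigned int util,
			       unsigned int min_util, unsigned int max_util)
{
	if (min_util >= max_util)   /* inversion: min clamp wins */
		return min_util;
	if (util < min_util)
		return min_util;
	if (util > max_util)
		return max_util;
	return util;
}

int main(void)
{
	/* Normal case: util 300 clamped into [512, 800] -> 512. */
	printf("%u\n", clamp_util(300, 512, 800));
	/* Inverted case: one task wants min 900, another caps max at 600 -> 900. */
	printf("%u\n", clamp_util(700, 900, 600));
	return 0;
}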
@@ -2237,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu) | |||
2237 | } | 2322 | } |
2238 | #endif | 2323 | #endif |
2239 | 2324 | ||
2240 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL | ||
2241 | /** | 2325 | /** |
2242 | * enum schedutil_type - CPU utilization type | 2326 | * enum schedutil_type - CPU utilization type |
2243 | * @FREQUENCY_UTIL: Utilization used to select frequency | 2327 | * @FREQUENCY_UTIL: Utilization used to select frequency |
@@ -2253,15 +2337,11 @@ enum schedutil_type { | |||
2253 | ENERGY_UTIL, | 2337 | ENERGY_UTIL, |
2254 | }; | 2338 | }; |
2255 | 2339 | ||
2256 | unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, | 2340 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL |
2257 | unsigned long max, enum schedutil_type type); | ||
2258 | |||
2259 | static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) | ||
2260 | { | ||
2261 | unsigned long max = arch_scale_cpu_capacity(NULL, cpu); | ||
2262 | 2341 | ||
2263 | return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); | 2342 | unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, |
2264 | } | 2343 | unsigned long max, enum schedutil_type type, |
2344 | struct task_struct *p); | ||
2265 | 2345 | ||
2266 | static inline unsigned long cpu_bw_dl(struct rq *rq) | 2346 | static inline unsigned long cpu_bw_dl(struct rq *rq) |
2267 | { | 2347 | { |
@@ -2290,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq) | |||
2290 | return READ_ONCE(rq->avg_rt.util_avg); | 2370 | return READ_ONCE(rq->avg_rt.util_avg); |
2291 | } | 2371 | } |
2292 | #else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ | 2372 | #else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ |
2293 | static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) | 2373 | static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, |
2374 | unsigned long max, enum schedutil_type type, | ||
2375 | struct task_struct *p) | ||
2294 | { | 2376 | { |
2295 | return cfs; | 2377 | return 0; |
2296 | } | 2378 | } |
2297 | #endif | 2379 | #endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ |
2298 | 2380 | ||
2299 | #ifdef CONFIG_HAVE_SCHED_AVG_IRQ | 2381 | #ifdef CONFIG_HAVE_SCHED_AVG_IRQ |
2300 | static inline unsigned long cpu_util_irq(struct rq *rq) | 2382 | static inline unsigned long cpu_util_irq(struct rq *rq) |
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f53f89df837d..f751ce0b783e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c | |||
@@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl, | |||
1344 | .imbalance_pct = 125, | 1344 | .imbalance_pct = 125, |
1345 | 1345 | ||
1346 | .cache_nice_tries = 0, | 1346 | .cache_nice_tries = 0, |
1347 | .busy_idx = 0, | ||
1348 | .idle_idx = 0, | ||
1349 | .newidle_idx = 0, | ||
1350 | .wake_idx = 0, | ||
1351 | .forkexec_idx = 0, | ||
1352 | 1347 | ||
1353 | .flags = 1*SD_LOAD_BALANCE | 1348 | .flags = 1*SD_LOAD_BALANCE |
1354 | | 1*SD_BALANCE_NEWIDLE | 1349 | | 1*SD_BALANCE_NEWIDLE |
@@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl, | |||
1400 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { | 1395 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { |
1401 | sd->imbalance_pct = 117; | 1396 | sd->imbalance_pct = 117; |
1402 | sd->cache_nice_tries = 1; | 1397 | sd->cache_nice_tries = 1; |
1403 | sd->busy_idx = 2; | ||
1404 | 1398 | ||
1405 | #ifdef CONFIG_NUMA | 1399 | #ifdef CONFIG_NUMA |
1406 | } else if (sd->flags & SD_NUMA) { | 1400 | } else if (sd->flags & SD_NUMA) { |
1407 | sd->cache_nice_tries = 2; | 1401 | sd->cache_nice_tries = 2; |
1408 | sd->busy_idx = 3; | ||
1409 | sd->idle_idx = 2; | ||
1410 | 1402 | ||
1411 | sd->flags &= ~SD_PREFER_SIBLING; | 1403 | sd->flags &= ~SD_PREFER_SIBLING; |
1412 | sd->flags |= SD_SERIALIZE; | 1404 | sd->flags |= SD_SERIALIZE; |
@@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl, | |||
1419 | #endif | 1411 | #endif |
1420 | } else { | 1412 | } else { |
1421 | sd->cache_nice_tries = 1; | 1413 | sd->cache_nice_tries = 1; |
1422 | sd->busy_idx = 2; | ||
1423 | sd->idle_idx = 1; | ||
1424 | } | 1414 | } |
1425 | 1415 | ||
1426 | /* | 1416 | /* |
@@ -1884,10 +1874,10 @@ static struct sched_domain_topology_level | |||
1884 | unsigned long cap; | 1874 | unsigned long cap; |
1885 | 1875 | ||
1886 | /* Is there any asymmetry? */ | 1876 | /* Is there any asymmetry? */ |
1887 | cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); | 1877 | cap = arch_scale_cpu_capacity(cpumask_first(cpu_map)); |
1888 | 1878 | ||
1889 | for_each_cpu(i, cpu_map) { | 1879 | for_each_cpu(i, cpu_map) { |
1890 | if (arch_scale_cpu_capacity(NULL, i) != cap) { | 1880 | if (arch_scale_cpu_capacity(i) != cap) { |
1891 | asym = true; | 1881 | asym = true; |
1892 | break; | 1882 | break; |
1893 | } | 1883 | } |
@@ -1902,7 +1892,7 @@ static struct sched_domain_topology_level | |||
1902 | * to everyone. | 1892 | * to everyone. |
1903 | */ | 1893 | */ |
1904 | for_each_cpu(i, cpu_map) { | 1894 | for_each_cpu(i, cpu_map) { |
1905 | unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); | 1895 | unsigned long max_capacity = arch_scale_cpu_capacity(i); |
1906 | int tl_id = 0; | 1896 | int tl_id = 0; |
1907 | 1897 | ||
1908 | for_each_sd_topology(tl) { | 1898 | for_each_sd_topology(tl) { |
@@ -1912,7 +1902,7 @@ static struct sched_domain_topology_level | |||
1912 | for_each_cpu_and(j, tl->mask(i), cpu_map) { | 1902 | for_each_cpu_and(j, tl->mask(i), cpu_map) { |
1913 | unsigned long capacity; | 1903 | unsigned long capacity; |
1914 | 1904 | ||
1915 | capacity = arch_scale_cpu_capacity(NULL, j); | 1905 | capacity = arch_scale_cpu_capacity(j); |
1916 | 1906 | ||
1917 | if (capacity <= max_capacity) | 1907 | if (capacity <= max_capacity) |
1918 | continue; | 1908 | continue; |
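
[Editor's note] The topology.c hunks above drop the unused sched_domain argument from arch_scale_cpu_capacity(), so the asymmetry check simply compares per-CPU capacities through the single-argument accessor. A standalone sketch of that check, with a made-up capacity table standing in for the real arch hook:

/*
 * Standalone sketch of the asymmetric-capacity detection above.
 * The capacity[] values are invented (e.g. a two-little/two-big system);
 * arch_scale_cpu_capacity() is mocked and, as in the patch, takes only
 * the CPU number.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static const unsigned long capacity[NR_CPUS] = { 446, 446, 1024, 1024 };

static unsigned long arch_scale_cpu_capacity(int cpu)   /* mock */
{
	return capacity[cpu];
}

int main(void)
{
	unsigned long cap = arch_scale_cpu_capacity(0);
	bool asym = false;
	int i;

	for (i = 1; i < NR_CPUS; i++) {
		if (arch_scale_cpu_capacity(i) != cap) {
			asym = true;
			break;
		}
	}
	printf("asymmetric capacities: %s\n", asym ? "yes" : "no");
	return 0;
}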
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index fa0f9adfb752..c1e566a114ca 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -118,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int | |||
118 | bookmark.func = NULL; | 118 | bookmark.func = NULL; |
119 | INIT_LIST_HEAD(&bookmark.entry); | 119 | INIT_LIST_HEAD(&bookmark.entry); |
120 | 120 | ||
121 | spin_lock_irqsave(&wq_head->lock, flags); | 121 | do { |
122 | nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark); | ||
123 | spin_unlock_irqrestore(&wq_head->lock, flags); | ||
124 | |||
125 | while (bookmark.flags & WQ_FLAG_BOOKMARK) { | ||
126 | spin_lock_irqsave(&wq_head->lock, flags); | 122 | spin_lock_irqsave(&wq_head->lock, flags); |
127 | nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, | 123 | nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, |
128 | wake_flags, key, &bookmark); | 124 | wake_flags, key, &bookmark); |
129 | spin_unlock_irqrestore(&wq_head->lock, flags); | 125 | spin_unlock_irqrestore(&wq_head->lock, flags); |
130 | } | 126 | } while (bookmark.flags & WQ_FLAG_BOOKMARK); |
131 | } | 127 | } |
132 | 128 | ||
133 | /** | 129 | /** |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1beca96fb625..1c1ad1e14f21 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -452,6 +452,22 @@ static struct ctl_table kern_table[] = { | |||
452 | .mode = 0644, | 452 | .mode = 0644, |
453 | .proc_handler = sched_rr_handler, | 453 | .proc_handler = sched_rr_handler, |
454 | }, | 454 | }, |
455 | #ifdef CONFIG_UCLAMP_TASK | ||
456 | { | ||
457 | .procname = "sched_util_clamp_min", | ||
458 | .data = &sysctl_sched_uclamp_util_min, | ||
459 | .maxlen = sizeof(unsigned int), | ||
460 | .mode = 0644, | ||
461 | .proc_handler = sysctl_sched_uclamp_handler, | ||
462 | }, | ||
463 | { | ||
464 | .procname = "sched_util_clamp_max", | ||
465 | .data = &sysctl_sched_uclamp_util_max, | ||
466 | .maxlen = sizeof(unsigned int), | ||
467 | .mode = 0644, | ||
468 | .proc_handler = sysctl_sched_uclamp_handler, | ||
469 | }, | ||
470 | #endif | ||
455 | #ifdef CONFIG_SCHED_AUTOGROUP | 471 | #ifdef CONFIG_SCHED_AUTOGROUP |
456 | { | 472 | { |
457 | .procname = "sched_autogroup_enabled", | 473 | .procname = "sched_autogroup_enabled", |
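
[Editor's note] The two ctl_table entries above live in kern_table, so with CONFIG_UCLAMP_TASK=y they show up as /proc/sys/kernel/sched_util_clamp_min and /proc/sys/kernel/sched_util_clamp_max. A small userspace sketch that reads them back (writing requires sufficient privileges; the "512" example below is only an illustration of restricting how much boosting tasks may request):

/*
 * Standalone sketch: read the two sysctls registered above.
 * Assumes a kernel built with CONFIG_UCLAMP_TASK=y.
 */
#include <stdio.h>

static void show(const char *path)
{
	char buf[32];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("%s = %s", path, buf);
	fclose(f);
}

int main(void)
{
	show("/proc/sys/kernel/sched_util_clamp_min");
	show("/proc/sys/kernel/sched_util_clamp_max");
	/* Writing works the same way, e.g.
	 *   echo 512 > /proc/sys/kernel/sched_util_clamp_min
	 * to tighten the system-wide limit on utilization boosting. */
	return 0;
}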
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4ee1a3428ae..be9707f68024 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu) | |||
782 | */ | 782 | */ |
783 | if (!ts->tick_stopped) { | 783 | if (!ts->tick_stopped) { |
784 | calc_load_nohz_start(); | 784 | calc_load_nohz_start(); |
785 | cpu_load_update_nohz_start(); | ||
786 | quiet_vmstat(); | 785 | quiet_vmstat(); |
787 | 786 | ||
788 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); | 787 | ts->last_tick = hrtimer_get_expires(&ts->sched_timer); |
@@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) | |||
829 | { | 828 | { |
830 | /* Update jiffies first */ | 829 | /* Update jiffies first */ |
831 | tick_do_update_jiffies64(now); | 830 | tick_do_update_jiffies64(now); |
832 | cpu_load_update_nohz_stop(); | ||
833 | 831 | ||
834 | /* | 832 | /* |
835 | * Clear the timer idle flag, so we avoid IPIs on remote queueing and | 833 | * Clear the timer idle flag, so we avoid IPIs on remote queueing and |
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 1e6db9cbe4dc..fa95139445b2 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c | |||
@@ -277,7 +277,7 @@ static void move_to_next_cpu(void) | |||
277 | * of this thread, than stop migrating for the duration | 277 | * of this thread, than stop migrating for the duration |
278 | * of the current test. | 278 | * of the current test. |
279 | */ | 279 | */ |
280 | if (!cpumask_equal(current_mask, ¤t->cpus_allowed)) | 280 | if (!cpumask_equal(current_mask, current->cpus_ptr)) |
281 | goto disable; | 281 | goto disable; |
282 | 282 | ||
283 | get_online_cpus(); | 283 | get_online_cpus(); |
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 157d9e31f6c2..60ba93fc42ce 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c | |||
@@ -23,7 +23,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) | |||
23 | * Kernel threads bound to a single CPU can safely use | 23 | * Kernel threads bound to a single CPU can safely use |
24 | * smp_processor_id(): | 24 | * smp_processor_id(): |
25 | */ | 25 | */ |
26 | if (cpumask_equal(¤t->cpus_allowed, cpumask_of(this_cpu))) | 26 | if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu))) |
27 | goto out; | 27 | goto out; |
28 | 28 | ||
29 | /* | 29 | /* |
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index 1da597aa6141..1a72b7d95cdc 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c | |||
@@ -34,7 +34,7 @@ static void simple_thread_func(int cnt) | |||
34 | 34 | ||
35 | /* Silly tracepoints */ | 35 | /* Silly tracepoints */ |
36 | trace_foo_bar("hello", cnt, array, random_strings[len], | 36 | trace_foo_bar("hello", cnt, array, random_strings[len], |
37 | ¤t->cpus_allowed); | 37 | current->cpus_ptr); |
38 | 38 | ||
39 | trace_foo_with_template_simple("HELLO", cnt); | 39 | trace_foo_with_template_simple("HELLO", cnt); |
40 | 40 | ||
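
[Editor's note] The last three hunks are mechanical conversions from &current->cpus_allowed to current->cpus_ptr. The point of the indirection, as this series describes it, is that readers always dereference the pointer, which normally targets the task's own affinity mask but can be temporarily repointed (for example while migration is disabled on -rt) without touching every call site. A standalone sketch of that idea with stand-in names, not the kernel's task_struct:

/*
 * Standalone sketch of the cpus_ptr indirection.  The types and field
 * names are stand-ins: a toy one-word "cpumask" and a toy task struct.
 */
#include <stdio.h>

typedef unsigned long cpumask_t;         /* toy one-word cpumask */

struct task {
	cpumask_t cpus_mask;             /* the task's real affinity */
	const cpumask_t *cpus_ptr;       /* what readers dereference */
};

int main(void)
{
	struct task t = { .cpus_mask = 0xf };
	cpumask_t pinned = 0x1;          /* temporarily pinned to CPU 0 */

	t.cpus_ptr = &t.cpus_mask;       /* default: points at its own mask */
	printf("allowed: %#lx\n", *t.cpus_ptr);

	t.cpus_ptr = &pinned;            /* repoint instead of rewriting the mask */
	printf("allowed: %#lx\n", *t.cpus_ptr);

	t.cpus_ptr = &t.cpus_mask;       /* restore */
	printf("allowed: %#lx\n", *t.cpus_ptr);
	return 0;
}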