author     Linus Torvalds <torvalds@linux-foundation.org>  2019-07-08 19:39:53 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-07-08 19:39:53 -0400
commit     dad1c12ed831a7a89cc01e5582cd0b81a4be7f19 (patch)
tree       7a84799d3108bd9d3f1d4b530afd3ff9300db982
parent     090bc5a2a91499c1fd64b78d125daa6ca5531d38 (diff)
parent     af24bde8df2029f067dc46aff0393c8f18ff6e2f (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Remove the unused per rq load array and all its infrastructure, by
   Dietmar Eggemann.

 - Add utilization clamping support by Patrick Bellasi. This is a
   refinement of the energy aware scheduling framework with support for
   boosting of interactive and capping of background workloads: to make
   sure critical GUI threads get maximum frequency ASAP, and to make
   sure background processing doesn't unnecessarily move the cpufreq
   governor to higher frequencies and less energy efficient CPU modes.

 - Add the bare minimum of tracepoints required for LISA EAS regression
   testing, by Qais Yousef - which allows automated testing of various
   power management features, including energy aware scheduling.

 - Restructure the former tsk_nr_cpus_allowed() facility that the -rt
   kernel used to modify the scheduler's CPU affinity logic such as
   migrate_disable() - introduce the task->cpus_ptr value instead of
   taking the address of &task->cpus_allowed directly - by Sebastian
   Andrzej Siewior.

 - Misc optimizations, fixes, cleanups and small enhancements - see the
   Git log for details.

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  sched/uclamp: Add uclamp support to energy_compute()
  sched/uclamp: Add uclamp_util_with()
  sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks
  sched/uclamp: Set default clamps for RT tasks
  sched/uclamp: Reset uclamp values on RESET_ON_FORK
  sched/uclamp: Extend sched_setattr() to support utilization clamping
  sched/core: Allow sched_setattr() to use the current policy
  sched/uclamp: Add system default clamps
  sched/uclamp: Enforce last task's UCLAMP_MAX
  sched/uclamp: Add bucket local max tracking
  sched/uclamp: Add CPU's clamp buckets refcounting
  sched/fair: Rename weighted_cpuload() to cpu_runnable_load()
  sched/debug: Export the newly added tracepoints
  sched/debug: Add sched_overutilized tracepoint
  sched/debug: Add new tracepoint to track PELT at se level
  sched/debug: Add new tracepoints to track PELT at rq level
  sched/debug: Add a new sched_trace_*() helper functions
  sched/autogroup: Make autogroup_path() always available
  sched/wait: Deduplicate code with do-while
  sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity()
  ...
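As a point of reference for the sched_setattr() extension mentioned above, here is a minimal user-space sketch (not part of this merge) of how a task could request utilization clamps once the series is applied. The local struct mirrors the SCHED_ATTR_SIZE_VER1 layout added to include/uapi/linux/sched/types.h below; the clamp values 256 and 768 are arbitrary illustrations.

/*
 * Hypothetical user-space sketch: request a utilization clamp for the
 * calling task via the extended sched_setattr() interface. Struct layout
 * and flag values mirror the uapi changes in this series.
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr_ver1 {		/* local copy of the VER1 layout (56 bytes) */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* utilization hints added by this series */
	uint32_t sched_util_max;
};

#define SCHED_FLAG_KEEP_ALL		0x18	/* KEEP_POLICY | KEEP_PARAMS */
#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX	0x40

int main(void)
{
	struct sched_attr_ver1 attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);			/* SCHED_ATTR_SIZE_VER1 */
	attr.sched_flags = SCHED_FLAG_KEEP_ALL |	/* keep current policy/params */
			   SCHED_FLAG_UTIL_CLAMP_MIN |
			   SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min = 256;			/* boost to at least ~25% of 1024 */
	attr.sched_util_max = 768;			/* cap at ~75% of 1024 */

	if (syscall(SYS_sched_setattr, 0, &attr, 0))	/* pid 0 == current task */
		perror("sched_setattr");

	return 0;
}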
-rw-r--r--  Documentation/scheduler/sched-pelt.c | 3
-rw-r--r--  arch/arm/kernel/topology.c | 2
-rw-r--r--  arch/ia64/kernel/mca.c | 2
-rw-r--r--  arch/mips/include/asm/switch_to.h | 4
-rw-r--r--  arch/mips/kernel/mips-mt-fpaff.c | 2
-rw-r--r--  arch/mips/kernel/traps.c | 6
-rw-r--r--  arch/powerpc/platforms/cell/spufs/sched.c | 2
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2
-rw-r--r--  drivers/base/arch_topology.c | 6
-rw-r--r--  drivers/infiniband/hw/hfi1/affinity.c | 6
-rw-r--r--  drivers/infiniband/hw/hfi1/sdma.c | 3
-rw-r--r--  drivers/infiniband/hw/qib/qib_file_ops.c | 7
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  include/linux/arch_topology.h | 2
-rw-r--r--  include/linux/energy_model.h | 2
-rw-r--r--  include/linux/log2.h | 34
-rw-r--r--  include/linux/sched.h | 79
-rw-r--r--  include/linux/sched/nohz.h | 8
-rw-r--r--  include/linux/sched/sysctl.h | 11
-rw-r--r--  include/linux/sched/topology.h | 25
-rw-r--r--  include/trace/events/sched.h | 31
-rw-r--r--  include/uapi/linux/sched.h | 14
-rw-r--r--  include/uapi/linux/sched/types.h | 66
-rw-r--r--  init/Kconfig | 53
-rw-r--r--  init/init_task.c | 3
-rw-r--r--  kernel/cgroup/cpuset.c | 2
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/power/energy_model.c | 2
-rw-r--r--  kernel/sched/autogroup.c | 2
-rw-r--r--  kernel/sched/core.c | 533
-rw-r--r--  kernel/sched/cpudeadline.c | 4
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 24
-rw-r--r--  kernel/sched/cpupri.c | 4
-rw-r--r--  kernel/sched/deadline.c | 8
-rw-r--r--  kernel/sched/debug.c | 43
-rw-r--r--  kernel/sched/fair.c | 623
-rw-r--r--  kernel/sched/features.h | 1
-rw-r--r--  kernel/sched/pelt.c | 13
-rw-r--r--  kernel/sched/pelt.h | 2
-rw-r--r--  kernel/sched/rt.c | 8
-rw-r--r--  kernel/sched/sched-pelt.h | 2
-rw-r--r--  kernel/sched/sched.h | 134
-rw-r--r--  kernel/sched/topology.c | 18
-rw-r--r--  kernel/sched/wait.c | 8
-rw-r--r--  kernel/sysctl.c | 16
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/trace/trace_hwlat.c | 2
-rw-r--r--  lib/smp_processor_id.c | 2
-rw-r--r--  samples/trace_events/trace-events-sample.c | 2
49 files changed, 1216 insertions, 618 deletions
diff --git a/Documentation/scheduler/sched-pelt.c b/Documentation/scheduler/sched-pelt.c
index e4219139386a..7238b355919c 100644
--- a/Documentation/scheduler/sched-pelt.c
+++ b/Documentation/scheduler/sched-pelt.c
@@ -20,7 +20,8 @@ void calc_runnable_avg_yN_inv(void)
20 int i; 20 int i;
21 unsigned int x; 21 unsigned int x;
22 22
23 printf("static const u32 runnable_avg_yN_inv[] = {"); 23 /* To silence -Wunused-but-set-variable warnings. */
24 printf("static const u32 runnable_avg_yN_inv[] __maybe_unused = {");
24 for (i = 0; i < HALFLIFE; i++) { 25 for (i = 0; i < HALFLIFE; i++) {
25 x = ((1UL<<32)-1)*pow(y, i); 26 x = ((1UL<<32)-1)*pow(y, i);
26 27
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 60e375ce1ab2..d17cb1e6d679 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -169,7 +169,7 @@ static void update_cpu_capacity(unsigned int cpu)
169 topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity); 169 topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity);
170 170
171 pr_info("CPU%u: update cpu_capacity %lu\n", 171 pr_info("CPU%u: update cpu_capacity %lu\n",
172 cpu, topology_get_cpu_scale(NULL, cpu)); 172 cpu, topology_get_cpu_scale(cpu));
173} 173}
174 174
175#else 175#else
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 6a52d761854b..79190d877fa7 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
1831 ti->cpu = cpu; 1831 ti->cpu = cpu;
1832 p->stack = ti; 1832 p->stack = ti;
1833 p->state = TASK_UNINTERRUPTIBLE; 1833 p->state = TASK_UNINTERRUPTIBLE;
1834 cpumask_set_cpu(cpu, &p->cpus_allowed); 1834 cpumask_set_cpu(cpu, &p->cpus_mask);
1835 INIT_LIST_HEAD(&p->tasks); 1835 INIT_LIST_HEAD(&p->tasks);
1836 p->parent = p->real_parent = p->group_leader = p; 1836 p->parent = p->real_parent = p->group_leader = p;
1837 INIT_LIST_HEAD(&p->children); 1837 INIT_LIST_HEAD(&p->children);
diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h
index 0f813bb753c6..09cbe9042828 100644
--- a/arch/mips/include/asm/switch_to.h
+++ b/arch/mips/include/asm/switch_to.h
@@ -42,7 +42,7 @@ extern struct task_struct *ll_task;
42 * inline to try to keep the overhead down. If we have been forced to run on 42 * inline to try to keep the overhead down. If we have been forced to run on
43 * a "CPU" with an FPU because of a previous high level of FP computation, 43 * a "CPU" with an FPU because of a previous high level of FP computation,
44 * but did not actually use the FPU during the most recent time-slice (CU1 44 * but did not actually use the FPU during the most recent time-slice (CU1
45 * isn't set), we undo the restriction on cpus_allowed. 45 * isn't set), we undo the restriction on cpus_mask.
46 * 46 *
47 * We're not calling set_cpus_allowed() here, because we have no need to 47 * We're not calling set_cpus_allowed() here, because we have no need to
48 * force prompt migration - we're already switching the current CPU to a 48 * force prompt migration - we're already switching the current CPU to a
@@ -57,7 +57,7 @@ do { \
57 test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \ 57 test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \
58 (!(KSTK_STATUS(prev) & ST0_CU1))) { \ 58 (!(KSTK_STATUS(prev) & ST0_CU1))) { \
59 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \ 59 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \
60 prev->cpus_allowed = prev->thread.user_cpus_allowed; \ 60 prev->cpus_mask = prev->thread.user_cpus_allowed; \
61 } \ 61 } \
62 next->thread.emulated_fp = 0; \ 62 next->thread.emulated_fp = 0; \
63} while(0) 63} while(0)
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index a7c0f97e4b0d..1a08428eedcf 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
177 if (retval) 177 if (retval)
178 goto out_unlock; 178 goto out_unlock;
179 179
180 cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed); 180 cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
181 cpumask_and(&mask, &allowed, cpu_active_mask); 181 cpumask_and(&mask, &allowed, cpu_active_mask);
182 182
183out_unlock: 183out_unlock:
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index c52766a5b85f..ac7159263da0 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -891,12 +891,12 @@ static void mt_ase_fp_affinity(void)
891 * restricted the allowed set to exclude any CPUs with FPUs, 891 * restricted the allowed set to exclude any CPUs with FPUs,
892 * we'll skip the procedure. 892 * we'll skip the procedure.
893 */ 893 */
894 if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) { 894 if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
895 cpumask_t tmask; 895 cpumask_t tmask;
896 896
897 current->thread.user_cpus_allowed 897 current->thread.user_cpus_allowed
898 = current->cpus_allowed; 898 = current->cpus_mask;
899 cpumask_and(&tmask, &current->cpus_allowed, 899 cpumask_and(&tmask, &current->cpus_mask,
900 &mt_fpu_cpumask); 900 &mt_fpu_cpumask);
901 set_cpus_allowed_ptr(current, &tmask); 901 set_cpus_allowed_ptr(current, &tmask);
902 set_thread_flag(TIF_FPUBOUND); 902 set_thread_flag(TIF_FPUBOUND);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index e56b553de27b..f18d5067cd0f 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -128,7 +128,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
128 * runqueue. The context will be rescheduled on the proper node 128 * runqueue. The context will be rescheduled on the proper node
129 * if it is timesliced or preempted. 129 * if it is timesliced or preempted.
130 */ 130 */
131 cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed); 131 cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
132 132
133 /* Save the current cpu id for spu interrupt routing. */ 133 /* Save the current cpu id for spu interrupt routing. */
134 ctx->last_ran = raw_smp_processor_id(); 134 ctx->last_ran = raw_smp_processor_id();
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 604c0e3bcc83..f68baccc69f0 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -1503,7 +1503,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
1503 * may be scheduled elsewhere and invalidate entries in the 1503 * may be scheduled elsewhere and invalidate entries in the
1504 * pseudo-locked region. 1504 * pseudo-locked region.
1505 */ 1505 */
1506 if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) { 1506 if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
1507 mutex_unlock(&rdtgroup_mutex); 1507 mutex_unlock(&rdtgroup_mutex);
1508 return -EINVAL; 1508 return -EINVAL;
1509 } 1509 }
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 1739d7e1952a..9b09e31ae82f 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -43,7 +43,7 @@ static ssize_t cpu_capacity_show(struct device *dev,
43{ 43{
44 struct cpu *cpu = container_of(dev, struct cpu, dev); 44 struct cpu *cpu = container_of(dev, struct cpu, dev);
45 45
46 return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id)); 46 return sprintf(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id));
47} 47}
48 48
49static void update_topology_flags_workfn(struct work_struct *work); 49static void update_topology_flags_workfn(struct work_struct *work);
@@ -116,7 +116,7 @@ void topology_normalize_cpu_scale(void)
116 / capacity_scale; 116 / capacity_scale;
117 topology_set_cpu_scale(cpu, capacity); 117 topology_set_cpu_scale(cpu, capacity);
118 pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", 118 pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
119 cpu, topology_get_cpu_scale(NULL, cpu)); 119 cpu, topology_get_cpu_scale(cpu));
120 } 120 }
121} 121}
122 122
@@ -185,7 +185,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
185 cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); 185 cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
186 186
187 for_each_cpu(cpu, policy->related_cpus) { 187 for_each_cpu(cpu, policy->related_cpus) {
188 raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) * 188 raw_capacity[cpu] = topology_get_cpu_scale(cpu) *
189 policy->cpuinfo.max_freq / 1000UL; 189 policy->cpuinfo.max_freq / 1000UL;
190 capacity_scale = max(raw_capacity[cpu], capacity_scale); 190 capacity_scale = max(raw_capacity[cpu], capacity_scale);
191 } 191 }
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 4fe662c3bbc1..c142b23bb401 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -1038,7 +1038,7 @@ int hfi1_get_proc_affinity(int node)
1038 struct hfi1_affinity_node *entry; 1038 struct hfi1_affinity_node *entry;
1039 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; 1039 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
1040 const struct cpumask *node_mask, 1040 const struct cpumask *node_mask,
1041 *proc_mask = &current->cpus_allowed; 1041 *proc_mask = current->cpus_ptr;
1042 struct hfi1_affinity_node_list *affinity = &node_affinity; 1042 struct hfi1_affinity_node_list *affinity = &node_affinity;
1043 struct cpu_mask_set *set = &affinity->proc; 1043 struct cpu_mask_set *set = &affinity->proc;
1044 1044
@@ -1046,7 +1046,7 @@ int hfi1_get_proc_affinity(int node)
1046 * check whether process/context affinity has already 1046 * check whether process/context affinity has already
1047 * been set 1047 * been set
1048 */ 1048 */
1049 if (cpumask_weight(proc_mask) == 1) { 1049 if (current->nr_cpus_allowed == 1) {
1050 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", 1050 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
1051 current->pid, current->comm, 1051 current->pid, current->comm,
1052 cpumask_pr_args(proc_mask)); 1052 cpumask_pr_args(proc_mask));
@@ -1057,7 +1057,7 @@ int hfi1_get_proc_affinity(int node)
1057 cpu = cpumask_first(proc_mask); 1057 cpu = cpumask_first(proc_mask);
1058 cpumask_set_cpu(cpu, &set->used); 1058 cpumask_set_cpu(cpu, &set->used);
1059 goto done; 1059 goto done;
1060 } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) { 1060 } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
1061 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", 1061 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
1062 current->pid, current->comm, 1062 current->pid, current->comm,
1063 cpumask_pr_args(proc_mask)); 1063 cpumask_pr_args(proc_mask));
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 28b66bd70b74..2395fd4233a7 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -869,14 +869,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
869{ 869{
870 struct sdma_rht_node *rht_node; 870 struct sdma_rht_node *rht_node;
871 struct sdma_engine *sde = NULL; 871 struct sdma_engine *sde = NULL;
872 const struct cpumask *current_mask = &current->cpus_allowed;
873 unsigned long cpu_id; 872 unsigned long cpu_id;
874 873
875 /* 874 /*
876 * To ensure that always the same sdma engine(s) will be 875 * To ensure that always the same sdma engine(s) will be
877 * selected make sure the process is pinned to this CPU only. 876 * selected make sure the process is pinned to this CPU only.
878 */ 877 */
879 if (cpumask_weight(current_mask) != 1) 878 if (current->nr_cpus_allowed != 1)
880 goto out; 879 goto out;
881 880
882 cpu_id = smp_processor_id(); 881 cpu_id = smp_processor_id();
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 78fa634de98a..27b6e664e59d 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -1142,7 +1142,7 @@ static __poll_t qib_poll(struct file *fp, struct poll_table_struct *pt)
1142static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) 1142static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
1143{ 1143{
1144 struct qib_filedata *fd = fp->private_data; 1144 struct qib_filedata *fd = fp->private_data;
1145 const unsigned int weight = cpumask_weight(&current->cpus_allowed); 1145 const unsigned int weight = current->nr_cpus_allowed;
1146 const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); 1146 const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
1147 int local_cpu; 1147 int local_cpu;
1148 1148
@@ -1623,9 +1623,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
1623 ret = find_free_ctxt(i_minor - 1, fp, uinfo); 1623 ret = find_free_ctxt(i_minor - 1, fp, uinfo);
1624 else { 1624 else {
1625 int unit; 1625 int unit;
1626 const unsigned int cpu = cpumask_first(&current->cpus_allowed); 1626 const unsigned int cpu = cpumask_first(current->cpus_ptr);
1627 const unsigned int weight = 1627 const unsigned int weight = current->nr_cpus_allowed;
1628 cpumask_weight(&current->cpus_allowed);
1629 1628
1630 if (weight == 1 && !test_bit(cpu, qib_cpulist)) 1629 if (weight == 1 && !test_bit(cpu, qib_cpulist))
1631 if (!find_hca(cpu, &unit) && unit >= 0) 1630 if (!find_hca(cpu, &unit) && unit >= 0)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 55180501b915..46dcb6f0eccf 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -381,9 +381,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
381static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 381static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
382{ 382{
383 seq_printf(m, "Cpus_allowed:\t%*pb\n", 383 seq_printf(m, "Cpus_allowed:\t%*pb\n",
384 cpumask_pr_args(&task->cpus_allowed)); 384 cpumask_pr_args(task->cpus_ptr));
385 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", 385 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
386 cpumask_pr_args(&task->cpus_allowed)); 386 cpumask_pr_args(task->cpus_ptr));
387} 387}
388 388
389static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) 389static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index d9bdc1a7f4e7..1cfe05ea1d89 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -18,7 +18,7 @@ DECLARE_PER_CPU(unsigned long, cpu_scale);
18 18
19struct sched_domain; 19struct sched_domain;
20static inline 20static inline
21unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu) 21unsigned long topology_get_cpu_scale(int cpu)
22{ 22{
23 return per_cpu(cpu_scale, cpu); 23 return per_cpu(cpu_scale, cpu);
24} 24}
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index aa027f7bcb3e..73f8c3cb9588 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -89,7 +89,7 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
89 * like schedutil. 89 * like schedutil.
90 */ 90 */
91 cpu = cpumask_first(to_cpumask(pd->cpus)); 91 cpu = cpumask_first(to_cpumask(pd->cpus));
92 scale_cpu = arch_scale_cpu_capacity(NULL, cpu); 92 scale_cpu = arch_scale_cpu_capacity(cpu);
93 cs = &pd->table[pd->nr_cap_states - 1]; 93 cs = &pd->table[pd->nr_cap_states - 1];
94 freq = map_util_freq(max_util, cs->frequency, scale_cpu); 94 freq = map_util_freq(max_util, cs->frequency, scale_cpu);
95 95
diff --git a/include/linux/log2.h b/include/linux/log2.h
index 1aec01365ed4..83a4a3ca3e8a 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -220,4 +220,38 @@ int __order_base_2(unsigned long n)
220 ilog2((n) - 1) + 1) : \ 220 ilog2((n) - 1) + 1) : \
221 __order_base_2(n) \ 221 __order_base_2(n) \
222) 222)
223
224static inline __attribute__((const))
225int __bits_per(unsigned long n)
226{
227 if (n < 2)
228 return 1;
229 if (is_power_of_2(n))
230 return order_base_2(n) + 1;
231 return order_base_2(n);
232}
233
234/**
235 * bits_per - calculate the number of bits required for the argument
236 * @n: parameter
237 *
238 * This is constant-capable and can be used for compile time
239 * initializations, e.g bitfields.
240 *
241 * The first few values calculated by this routine:
242 * bf(0) = 1
243 * bf(1) = 1
244 * bf(2) = 2
245 * bf(3) = 2
246 * bf(4) = 3
247 * ... and so on.
248 */
249#define bits_per(n) \
250( \
251 __builtin_constant_p(n) ? ( \
252 ((n) == 0 || (n) == 1) \
253 ? 1 : ilog2(n) + 1 \
254 ) : \
255 __bits_per(n) \
256)
223#endif /* _LINUX_LOG2_H */ 257#endif /* _LINUX_LOG2_H */
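For a quick sanity check of the values documented in the new bits_per() comment above, here is an assumed-equivalent user-space sketch (not the kernel implementation itself); bits_per(SCHED_CAPACITY_SCALE) == 11 is what the uclamp_se bitfields in include/linux/sched.h below rely on.

/*
 * User-space sketch of the bits_per() semantics documented above: the
 * number of bits needed to represent n, with bits_per(0) == bits_per(1) == 1.
 * Assumed-equivalent form, not the kernel macro.
 */
#include <assert.h>

static unsigned int bits_per(unsigned long n)
{
	return n < 2 ? 1 : (unsigned int)(8 * sizeof(n)) - __builtin_clzl(n);
}

int main(void)
{
	assert(bits_per(0) == 1);
	assert(bits_per(1) == 1);
	assert(bits_per(2) == 2);
	assert(bits_per(3) == 2);
	assert(bits_per(4) == 3);
	assert(bits_per(1024) == 11);	/* bits_per(SCHED_CAPACITY_SCALE) */
	return 0;
}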
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 76adce49b5ad..459d95e4a574 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,6 +35,7 @@ struct audit_context;
35struct backing_dev_info; 35struct backing_dev_info;
36struct bio_list; 36struct bio_list;
37struct blk_plug; 37struct blk_plug;
38struct capture_control;
38struct cfs_rq; 39struct cfs_rq;
39struct fs_struct; 40struct fs_struct;
40struct futex_pi_state; 41struct futex_pi_state;
@@ -47,8 +48,9 @@ struct pid_namespace;
47struct pipe_inode_info; 48struct pipe_inode_info;
48struct rcu_node; 49struct rcu_node;
49struct reclaim_state; 50struct reclaim_state;
50struct capture_control;
51struct robust_list_head; 51struct robust_list_head;
52struct root_domain;
53struct rq;
52struct sched_attr; 54struct sched_attr;
53struct sched_param; 55struct sched_param;
54struct seq_file; 56struct seq_file;
@@ -281,6 +283,18 @@ struct vtime {
281 u64 gtime; 283 u64 gtime;
282}; 284};
283 285
286/*
287 * Utilization clamp constraints.
288 * @UCLAMP_MIN: Minimum utilization
289 * @UCLAMP_MAX: Maximum utilization
290 * @UCLAMP_CNT: Utilization clamp constraints count
291 */
292enum uclamp_id {
293 UCLAMP_MIN = 0,
294 UCLAMP_MAX,
295 UCLAMP_CNT
296};
297
284struct sched_info { 298struct sched_info {
285#ifdef CONFIG_SCHED_INFO 299#ifdef CONFIG_SCHED_INFO
286 /* Cumulative counters: */ 300 /* Cumulative counters: */
@@ -312,6 +326,10 @@ struct sched_info {
312# define SCHED_FIXEDPOINT_SHIFT 10 326# define SCHED_FIXEDPOINT_SHIFT 10
313# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) 327# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
314 328
329/* Increase resolution of cpu_capacity calculations */
330# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
331# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
332
315struct load_weight { 333struct load_weight {
316 unsigned long weight; 334 unsigned long weight;
317 u32 inv_weight; 335 u32 inv_weight;
@@ -560,6 +578,41 @@ struct sched_dl_entity {
560 struct hrtimer inactive_timer; 578 struct hrtimer inactive_timer;
561}; 579};
562 580
581#ifdef CONFIG_UCLAMP_TASK
582/* Number of utilization clamp buckets (shorter alias) */
583#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
584
585/*
586 * Utilization clamp for a scheduling entity
587 * @value: clamp value "assigned" to a se
588 * @bucket_id: bucket index corresponding to the "assigned" value
589 * @active: the se is currently refcounted in a rq's bucket
590 * @user_defined: the requested clamp value comes from user-space
591 *
592 * The bucket_id is the index of the clamp bucket matching the clamp value
593 * which is pre-computed and stored to avoid expensive integer divisions from
594 * the fast path.
595 *
596 * The active bit is set whenever a task has got an "effective" value assigned,
597 * which can be different from the clamp value "requested" from user-space.
598 * This allows to know a task is refcounted in the rq's bucket corresponding
599 * to the "effective" bucket_id.
600 *
601 * The user_defined bit is set whenever a task has got a task-specific clamp
602 * value requested from userspace, i.e. the system defaults apply to this task
603 * just as a restriction. This allows to relax default clamps when a less
604 * restrictive task-specific value has been requested, thus allowing to
605 * implement a "nice" semantic. For example, a task running with a 20%
606 * default boost can still drop its own boosting to 0%.
607 */
608struct uclamp_se {
609 unsigned int value : bits_per(SCHED_CAPACITY_SCALE);
610 unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
611 unsigned int active : 1;
612 unsigned int user_defined : 1;
613};
614#endif /* CONFIG_UCLAMP_TASK */
615
563union rcu_special { 616union rcu_special {
564 struct { 617 struct {
565 u8 blocked; 618 u8 blocked;
@@ -640,6 +693,13 @@ struct task_struct {
640#endif 693#endif
641 struct sched_dl_entity dl; 694 struct sched_dl_entity dl;
642 695
696#ifdef CONFIG_UCLAMP_TASK
697 /* Clamp values requested for a scheduling entity */
698 struct uclamp_se uclamp_req[UCLAMP_CNT];
699 /* Effective clamp values used for a scheduling entity */
700 struct uclamp_se uclamp[UCLAMP_CNT];
701#endif
702
643#ifdef CONFIG_PREEMPT_NOTIFIERS 703#ifdef CONFIG_PREEMPT_NOTIFIERS
644 /* List of struct preempt_notifier: */ 704 /* List of struct preempt_notifier: */
645 struct hlist_head preempt_notifiers; 705 struct hlist_head preempt_notifiers;
@@ -651,7 +711,8 @@ struct task_struct {
651 711
652 unsigned int policy; 712 unsigned int policy;
653 int nr_cpus_allowed; 713 int nr_cpus_allowed;
654 cpumask_t cpus_allowed; 714 const cpumask_t *cpus_ptr;
715 cpumask_t cpus_mask;
655 716
656#ifdef CONFIG_PREEMPT_RCU 717#ifdef CONFIG_PREEMPT_RCU
657 int rcu_read_lock_nesting; 718 int rcu_read_lock_nesting;
@@ -1399,7 +1460,7 @@ extern struct pid *cad_pid;
1399#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1460#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1400#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ 1461#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */
1401#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ 1462#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
1402#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ 1463#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
1403#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1464#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1404#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ 1465#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
1405#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ 1466#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
@@ -1915,4 +1976,16 @@ static inline void rseq_syscall(struct pt_regs *regs)
1915 1976
1916#endif 1977#endif
1917 1978
1979const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
1980char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
1981int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
1982
1983const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq);
1984const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
1985const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
1986
1987int sched_trace_rq_cpu(struct rq *rq);
1988
1989const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
1990
1918#endif 1991#endif
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index b36f4cf38111..1abe91ff6e4a 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -7,14 +7,6 @@
7 */ 7 */
8 8
9#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 9#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
10extern void cpu_load_update_nohz_start(void);
11extern void cpu_load_update_nohz_stop(void);
12#else
13static inline void cpu_load_update_nohz_start(void) { }
14static inline void cpu_load_update_nohz_stop(void) { }
15#endif
16
17#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
18extern void nohz_balance_enter_idle(int cpu); 10extern void nohz_balance_enter_idle(int cpu);
19extern int get_nohz_timer_target(void); 11extern int get_nohz_timer_target(void);
20#else 12#else
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 99ce6d728df7..d4f6215ee03f 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -56,6 +56,11 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
56extern unsigned int sysctl_sched_rt_period; 56extern unsigned int sysctl_sched_rt_period;
57extern int sysctl_sched_rt_runtime; 57extern int sysctl_sched_rt_runtime;
58 58
59#ifdef CONFIG_UCLAMP_TASK
60extern unsigned int sysctl_sched_uclamp_util_min;
61extern unsigned int sysctl_sched_uclamp_util_max;
62#endif
63
59#ifdef CONFIG_CFS_BANDWIDTH 64#ifdef CONFIG_CFS_BANDWIDTH
60extern unsigned int sysctl_sched_cfs_bandwidth_slice; 65extern unsigned int sysctl_sched_cfs_bandwidth_slice;
61#endif 66#endif
@@ -75,6 +80,12 @@ extern int sched_rt_handler(struct ctl_table *table, int write,
75 void __user *buffer, size_t *lenp, 80 void __user *buffer, size_t *lenp,
76 loff_t *ppos); 81 loff_t *ppos);
77 82
83#ifdef CONFIG_UCLAMP_TASK
84extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
85 void __user *buffer, size_t *lenp,
86 loff_t *ppos);
87#endif
88
78extern int sysctl_numa_balancing(struct ctl_table *table, int write, 89extern int sysctl_numa_balancing(struct ctl_table *table, int write,
79 void __user *buffer, size_t *lenp, 90 void __user *buffer, size_t *lenp,
80 loff_t *ppos); 91 loff_t *ppos);
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index cfc0a89a7159..7863bb62d2ab 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -7,12 +7,6 @@
7#include <linux/sched/idle.h> 7#include <linux/sched/idle.h>
8 8
9/* 9/*
10 * Increase resolution of cpu_capacity calculations
11 */
12#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
13#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
14
15/*
16 * sched-domains (multiprocessor balancing) declarations: 10 * sched-domains (multiprocessor balancing) declarations:
17 */ 11 */
18#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
@@ -84,11 +78,6 @@ struct sched_domain {
84 unsigned int busy_factor; /* less balancing by factor if busy */ 78 unsigned int busy_factor; /* less balancing by factor if busy */
85 unsigned int imbalance_pct; /* No balance until over watermark */ 79 unsigned int imbalance_pct; /* No balance until over watermark */
86 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ 80 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
87 unsigned int busy_idx;
88 unsigned int idle_idx;
89 unsigned int newidle_idx;
90 unsigned int wake_idx;
91 unsigned int forkexec_idx;
92 81
93 int nohz_idle; /* NOHZ IDLE status */ 82 int nohz_idle; /* NOHZ IDLE status */
94 int flags; /* See SD_* */ 83 int flags; /* See SD_* */
@@ -201,14 +190,6 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl);
201# define SD_INIT_NAME(type) 190# define SD_INIT_NAME(type)
202#endif 191#endif
203 192
204#ifndef arch_scale_cpu_capacity
205static __always_inline
206unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
207{
208 return SCHED_CAPACITY_SCALE;
209}
210#endif
211
212#else /* CONFIG_SMP */ 193#else /* CONFIG_SMP */
213 194
214struct sched_domain_attr; 195struct sched_domain_attr;
@@ -224,16 +205,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
224 return true; 205 return true;
225} 206}
226 207
208#endif /* !CONFIG_SMP */
209
227#ifndef arch_scale_cpu_capacity 210#ifndef arch_scale_cpu_capacity
228static __always_inline 211static __always_inline
229unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) 212unsigned long arch_scale_cpu_capacity(int cpu)
230{ 213{
231 return SCHED_CAPACITY_SCALE; 214 return SCHED_CAPACITY_SCALE;
232} 215}
233#endif 216#endif
234 217
235#endif /* !CONFIG_SMP */
236
237static inline int task_node(const struct task_struct *p) 218static inline int task_node(const struct task_struct *p)
238{ 219{
239 return cpu_to_node(task_cpu(p)); 220 return cpu_to_node(task_cpu(p));
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index c8c7c7efb487..420e80e56e55 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -594,6 +594,37 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
594 594
595 TP_printk("cpu=%d", __entry->cpu) 595 TP_printk("cpu=%d", __entry->cpu)
596); 596);
597
598/*
599 * Following tracepoints are not exported in tracefs and provide hooking
600 * mechanisms only for testing and debugging purposes.
601 *
602 * Postfixed with _tp to make them easily identifiable in the code.
603 */
604DECLARE_TRACE(pelt_cfs_tp,
605 TP_PROTO(struct cfs_rq *cfs_rq),
606 TP_ARGS(cfs_rq));
607
608DECLARE_TRACE(pelt_rt_tp,
609 TP_PROTO(struct rq *rq),
610 TP_ARGS(rq));
611
612DECLARE_TRACE(pelt_dl_tp,
613 TP_PROTO(struct rq *rq),
614 TP_ARGS(rq));
615
616DECLARE_TRACE(pelt_irq_tp,
617 TP_PROTO(struct rq *rq),
618 TP_ARGS(rq));
619
620DECLARE_TRACE(pelt_se_tp,
621 TP_PROTO(struct sched_entity *se),
622 TP_ARGS(se));
623
624DECLARE_TRACE(sched_overutilized_tp,
625 TP_PROTO(struct root_domain *rd, bool overutilized),
626 TP_ARGS(rd, overutilized));
627
597#endif /* _TRACE_SCHED_H */ 628#endif /* _TRACE_SCHED_H */
598 629
599/* This part must be outside protection */ 630/* This part must be outside protection */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index ed4ee170bee2..617bb59aa8ba 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -51,9 +51,21 @@
51#define SCHED_FLAG_RESET_ON_FORK 0x01 51#define SCHED_FLAG_RESET_ON_FORK 0x01
52#define SCHED_FLAG_RECLAIM 0x02 52#define SCHED_FLAG_RECLAIM 0x02
53#define SCHED_FLAG_DL_OVERRUN 0x04 53#define SCHED_FLAG_DL_OVERRUN 0x04
54#define SCHED_FLAG_KEEP_POLICY 0x08
55#define SCHED_FLAG_KEEP_PARAMS 0x10
56#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
57#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
58
59#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
60 SCHED_FLAG_KEEP_PARAMS)
61
62#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \
63 SCHED_FLAG_UTIL_CLAMP_MAX)
54 64
55#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ 65#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \
56 SCHED_FLAG_RECLAIM | \ 66 SCHED_FLAG_RECLAIM | \
57 SCHED_FLAG_DL_OVERRUN) 67 SCHED_FLAG_DL_OVERRUN | \
68 SCHED_FLAG_KEEP_ALL | \
69 SCHED_FLAG_UTIL_CLAMP)
58 70
59#endif /* _UAPI_LINUX_SCHED_H */ 71#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index 10fbb8031930..c852153ddb0d 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -9,6 +9,7 @@ struct sched_param {
9}; 9};
10 10
11#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ 11#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
12#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
12 13
13/* 14/*
14 * Extended scheduling parameters data structure. 15 * Extended scheduling parameters data structure.
@@ -21,8 +22,33 @@ struct sched_param {
21 * the tasks may be useful for a wide variety of application fields, e.g., 22 * the tasks may be useful for a wide variety of application fields, e.g.,
22 * multimedia, streaming, automation and control, and many others. 23 * multimedia, streaming, automation and control, and many others.
23 * 24 *
24 * This variant (sched_attr) is meant at describing a so-called 25 * This variant (sched_attr) allows to define additional attributes to
25 * sporadic time-constrained task. In such model a task is specified by: 26 * improve the scheduler knowledge about task requirements.
27 *
28 * Scheduling Class Attributes
29 * ===========================
30 *
31 * A subset of sched_attr attributes specifies the
32 * scheduling policy and relative POSIX attributes:
33 *
34 * @size size of the structure, for fwd/bwd compat.
35 *
36 * @sched_policy task's scheduling policy
37 * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
38 * @sched_priority task's static priority (SCHED_FIFO/RR)
39 *
40 * Certain more advanced scheduling features can be controlled by a
41 * predefined set of flags via the attribute:
42 *
43 * @sched_flags for customizing the scheduler behaviour
44 *
45 * Sporadic Time-Constrained Task Attributes
46 * =========================================
47 *
48 * A subset of sched_attr attributes allows to describe a so-called
49 * sporadic time-constrained task.
50 *
51 * In such a model a task is specified by:
26 * - the activation period or minimum instance inter-arrival time; 52 * - the activation period or minimum instance inter-arrival time;
27 * - the maximum (or average, depending on the actual scheduling 53 * - the maximum (or average, depending on the actual scheduling
28 * discipline) computation time of all instances, a.k.a. runtime; 54 * discipline) computation time of all instances, a.k.a. runtime;
@@ -34,14 +60,8 @@ struct sched_param {
34 * than the runtime and must be completed by time instant t equal to 60 * than the runtime and must be completed by time instant t equal to
35 * the instance activation time + the deadline. 61 * the instance activation time + the deadline.
36 * 62 *
37 * This is reflected by the actual fields of the sched_attr structure: 63 * This is reflected by the following fields of the sched_attr structure:
38 * 64 *
39 * @size size of the structure, for fwd/bwd compat.
40 *
41 * @sched_policy task's scheduling policy
42 * @sched_flags for customizing the scheduler behaviour
43 * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
44 * @sched_priority task's static priority (SCHED_FIFO/RR)
45 * @sched_deadline representative of the task's deadline 65 * @sched_deadline representative of the task's deadline
46 * @sched_runtime representative of the task's runtime 66 * @sched_runtime representative of the task's runtime
47 * @sched_period representative of the task's period 67 * @sched_period representative of the task's period
@@ -53,6 +73,29 @@ struct sched_param {
53 * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the 73 * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
54 * only user of this new interface. More information about the algorithm 74 * only user of this new interface. More information about the algorithm
55 * available in the scheduling class file or in Documentation/. 75 * available in the scheduling class file or in Documentation/.
76 *
77 * Task Utilization Attributes
78 * ===========================
79 *
80 * A subset of sched_attr attributes allows to specify the utilization
81 * expected for a task. These attributes allow to inform the scheduler about
82 * the utilization boundaries within which it should schedule the task. These
83 * boundaries are valuable hints to support scheduler decisions on both task
84 * placement and frequency selection.
85 *
86 * @sched_util_min represents the minimum utilization
87 * @sched_util_max represents the maximum utilization
88 *
89 * Utilization is a value in the range [0..SCHED_CAPACITY_SCALE]. It
90 * represents the percentage of CPU time used by a task when running at the
91 * maximum frequency on the highest capacity CPU of the system. For example, a
92 * 20% utilization task is a task running for 2ms every 10ms at maximum
93 * frequency.
94 *
95 * A task with a min utilization value bigger than 0 is more likely scheduled
96 * on a CPU with a capacity big enough to fit the specified value.
97 * A task with a max utilization value smaller than 1024 is more likely
98 * scheduled on a CPU with no more capacity than the specified value.
56 */ 99 */
57struct sched_attr { 100struct sched_attr {
58 __u32 size; 101 __u32 size;
@@ -70,6 +113,11 @@ struct sched_attr {
70 __u64 sched_runtime; 113 __u64 sched_runtime;
71 __u64 sched_deadline; 114 __u64 sched_deadline;
72 __u64 sched_period; 115 __u64 sched_period;
116
117 /* Utilization hints */
118 __u32 sched_util_min;
119 __u32 sched_util_max;
120
73}; 121};
74 122
75#endif /* _UAPI_LINUX_SCHED_TYPES_H */ 123#endif /* _UAPI_LINUX_SCHED_TYPES_H */
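To make the utilization scale documented above concrete, a small illustrative helper (not part of the patch) converting the duty-cycle example from the comment into a value on the [0..SCHED_CAPACITY_SCALE] range:

/*
 * Illustrative only: utilization on the [0..1024] scale for a task that is
 * busy for busy_us out of every period_us at maximum frequency on the
 * highest-capacity CPU, per the documentation above.
 */
static inline unsigned int util_from_duty_cycle(unsigned int busy_us,
						unsigned int period_us)
{
	return 1024 * busy_us / period_us;	/* e.g. 2000/10000 -> ~204 (~20%) */
}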
diff --git a/init/Kconfig b/init/Kconfig
index 0e2344389501..c88289c18d59 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -677,6 +677,59 @@ config HAVE_UNSTABLE_SCHED_CLOCK
677config GENERIC_SCHED_CLOCK 677config GENERIC_SCHED_CLOCK
678 bool 678 bool
679 679
680menu "Scheduler features"
681
682config UCLAMP_TASK
683 bool "Enable utilization clamping for RT/FAIR tasks"
684 depends on CPU_FREQ_GOV_SCHEDUTIL
685 help
686 This feature enables the scheduler to track the clamped utilization
687 of each CPU based on RUNNABLE tasks scheduled on that CPU.
688
689 With this option, the user can specify the min and max CPU
690 utilization allowed for RUNNABLE tasks. The max utilization defines
691 the maximum frequency a task should use while the min utilization
692 defines the minimum frequency it should use.
693
694 Both min and max utilization clamp values are hints to the scheduler,
695 aiming at improving its frequency selection policy, but they do not
696 enforce or grant any specific bandwidth for tasks.
697
698 If in doubt, say N.
699
700config UCLAMP_BUCKETS_COUNT
701 int "Number of supported utilization clamp buckets"
702 range 5 20
703 default 5
704 depends on UCLAMP_TASK
705 help
706 Defines the number of clamp buckets to use. The range of each bucket
707 will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the
708 number of clamp buckets the finer their granularity and the higher
709 the precision of clamping aggregation and tracking at run-time.
710
711 For example, with the minimum configuration value we will have 5
712 clamp buckets tracking 20% utilization each. A 25% boosted tasks will
713 be refcounted in the [20..39]% bucket and will set the bucket clamp
714 effective value to 25%.
715 If a second 30% boosted task should be co-scheduled on the same CPU,
716 that task will be refcounted in the same bucket of the first task and
717 it will boost the bucket clamp effective value to 30%.
718 The clamp effective value of a bucket is reset to its nominal value
719 (20% in the example above) when there are no more tasks refcounted in
720 that bucket.
721
722 An additional boost/capping margin can be added to some tasks. In the
723 example above the 25% task will be boosted to 30% until it exits the
724 CPU. If that should be considered not acceptable on certain systems,
725 it's always possible to reduce the margin by increasing the number of
726 clamp buckets to trade off used memory for run-time tracking
727 precision.
728
729 If in doubt, use the default value.
730
731endmenu
732
680# 733#
681# For architectures that want to enable the support for NUMA-affine scheduler 734# For architectures that want to enable the support for NUMA-affine scheduler
682# balancing logic: 735# balancing logic:
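The bucket arithmetic described in the UCLAMP_BUCKETS_COUNT help text can be checked with a small user-space sketch that mirrors uclamp_bucket_id() from kernel/sched/core.c further down; the 5-bucket configuration and the 256 (25%) boost value are taken from the example in the help text and are illustrative only.

/*
 * Sketch of the clamp-bucket arithmetic from the help text above, assuming
 * the default CONFIG_UCLAMP_BUCKETS_COUNT=5. Mirrors uclamp_bucket_id().
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define UCLAMP_BUCKETS		5
/* DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) == 205 */
#define UCLAMP_BUCKET_DELTA	((SCHED_CAPACITY_SCALE + UCLAMP_BUCKETS / 2) / UCLAMP_BUCKETS)

static unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
	return clamp_value / UCLAMP_BUCKET_DELTA;
}

int main(void)
{
	unsigned int boost = 256;	/* the 25% boosted task from the example */

	/* 256 / 205 == 1: the task is refcounted in the [20..39]% bucket. */
	printf("clamp %u -> bucket %u (bucket width %u)\n",
	       boost, uclamp_bucket_id(boost), UCLAMP_BUCKET_DELTA);
	return 0;
}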
diff --git a/init/init_task.c b/init/init_task.c
index afa6ad795355..7ab773b9b3cd 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -72,7 +72,8 @@ struct task_struct init_task
72 .static_prio = MAX_PRIO - 20, 72 .static_prio = MAX_PRIO - 20,
73 .normal_prio = MAX_PRIO - 20, 73 .normal_prio = MAX_PRIO - 20,
74 .policy = SCHED_NORMAL, 74 .policy = SCHED_NORMAL,
75 .cpus_allowed = CPU_MASK_ALL, 75 .cpus_ptr = &init_task.cpus_mask,
76 .cpus_mask = CPU_MASK_ALL,
76 .nr_cpus_allowed= NR_CPUS, 77 .nr_cpus_allowed= NR_CPUS,
77 .mm = NULL, 78 .mm = NULL,
78 .active_mm = &init_mm, 79 .active_mm = &init_mm,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 515525ff1cfd..a1590e244f5f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task)
2829 if (task_css_is_root(task, cpuset_cgrp_id)) 2829 if (task_css_is_root(task, cpuset_cgrp_id))
2830 return; 2830 return;
2831 2831
2832 set_cpus_allowed_ptr(task, &current->cpus_allowed); 2832 set_cpus_allowed_ptr(task, current->cpus_ptr);
2833 task->mems_allowed = current->mems_allowed; 2833 task->mems_allowed = current->mems_allowed;
2834} 2834}
2835 2835
diff --git a/kernel/fork.c b/kernel/fork.c
index d18e343d4aab..847dd147b068 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -898,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
898#ifdef CONFIG_STACKPROTECTOR 898#ifdef CONFIG_STACKPROTECTOR
899 tsk->stack_canary = get_random_canary(); 899 tsk->stack_canary = get_random_canary();
900#endif 900#endif
901 if (orig->cpus_ptr == &orig->cpus_mask)
902 tsk->cpus_ptr = &tsk->cpus_mask;
901 903
902 /* 904 /*
903 * One for us, one for whoever does the "release_task()" (usually 905 * One for us, one for whoever does the "release_task()" (usually
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 7d66ee68aaaf..0a9326f5f421 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
223 * All CPUs of a domain must have the same micro-architecture 223 * All CPUs of a domain must have the same micro-architecture
224 * since they all share the same table. 224 * since they all share the same table.
225 */ 225 */
226 cap = arch_scale_cpu_capacity(NULL, cpu); 226 cap = arch_scale_cpu_capacity(cpu);
227 if (prev_cap && prev_cap != cap) { 227 if (prev_cap && prev_cap != cap) {
228 pr_err("CPUs of %*pbl must have the same capacity\n", 228 pr_err("CPUs of %*pbl must have the same capacity\n",
229 cpumask_pr_args(span)); 229 cpumask_pr_args(span));
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2d4ff5353ded..2067080bb235 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -259,7 +259,6 @@ out:
259} 259}
260#endif /* CONFIG_PROC_FS */ 260#endif /* CONFIG_PROC_FS */
261 261
262#ifdef CONFIG_SCHED_DEBUG
263int autogroup_path(struct task_group *tg, char *buf, int buflen) 262int autogroup_path(struct task_group *tg, char *buf, int buflen)
264{ 263{
265 if (!task_group_is_autogroup(tg)) 264 if (!task_group_is_autogroup(tg))
@@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
267 266
268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 267 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
269} 268}
270#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 874c427742a9..fa43ce3962e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -23,6 +23,17 @@
23#define CREATE_TRACE_POINTS 23#define CREATE_TRACE_POINTS
24#include <trace/events/sched.h> 24#include <trace/events/sched.h>
25 25
26/*
27 * Export tracepoints that act as a bare tracehook (ie: have no trace event
28 * associated with them) to allow external modules to probe them.
29 */
30EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
31EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
32EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
33EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
34EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
35EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
36
26DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 37DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
27 38
28#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) 39#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
@@ -761,6 +772,401 @@ static void set_load_weight(struct task_struct *p, bool update_load)
761 } 772 }
762} 773}
763 774
775#ifdef CONFIG_UCLAMP_TASK
776/* Max allowed minimum utilization */
777unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
778
779/* Max allowed maximum utilization */
780unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
781
782/* All clamps are required to be less or equal than these values */
783static struct uclamp_se uclamp_default[UCLAMP_CNT];
784
785/* Integer rounded range for each bucket */
786#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
787
788#define for_each_clamp_id(clamp_id) \
789 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
790
791static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
792{
793 return clamp_value / UCLAMP_BUCKET_DELTA;
794}
795
796static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
797{
798 return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
799}
800
801static inline unsigned int uclamp_none(int clamp_id)
802{
803 if (clamp_id == UCLAMP_MIN)
804 return 0;
805 return SCHED_CAPACITY_SCALE;
806}
807
808static inline void uclamp_se_set(struct uclamp_se *uc_se,
809 unsigned int value, bool user_defined)
810{
811 uc_se->value = value;
812 uc_se->bucket_id = uclamp_bucket_id(value);
813 uc_se->user_defined = user_defined;
814}
815
816static inline unsigned int
817uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
818 unsigned int clamp_value)
819{
820 /*
821 * Avoid blocked utilization pushing up the frequency when we go
822 * idle (which drops the max-clamp) by retaining the last known
823 * max-clamp.
824 */
825 if (clamp_id == UCLAMP_MAX) {
826 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
827 return clamp_value;
828 }
829
830 return uclamp_none(UCLAMP_MIN);
831}
832
833static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
834 unsigned int clamp_value)
835{
836 /* Reset max-clamp retention only on idle exit */
837 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
838 return;
839
840 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
841}
842
843static inline
844unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
845 unsigned int clamp_value)
846{
847 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
848 int bucket_id = UCLAMP_BUCKETS - 1;
849
850 /*
851 * Since both min and max clamps are max aggregated, find the
852 * top most bucket with tasks in.
853 */
854 for ( ; bucket_id >= 0; bucket_id--) {
855 if (!bucket[bucket_id].tasks)
856 continue;
857 return bucket[bucket_id].value;
858 }
859
860 /* No tasks -- default clamp values */
861 return uclamp_idle_value(rq, clamp_id, clamp_value);
862}
863
864/*
865 * The effective clamp bucket index of a task depends on, by increasing
866 * priority:
867 * - the task specific clamp value, when explicitly requested from userspace
868 * - the system default clamp value, defined by the sysadmin
869 */
870static inline struct uclamp_se
871uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
872{
873 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
874 struct uclamp_se uc_max = uclamp_default[clamp_id];
875
876 /* System default restrictions always apply */
877 if (unlikely(uc_req.value > uc_max.value))
878 return uc_max;
879
880 return uc_req;
881}
882
883unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
884{
885 struct uclamp_se uc_eff;
886
887 /* Task currently refcounted: use back-annotated (effective) value */
888 if (p->uclamp[clamp_id].active)
889 return p->uclamp[clamp_id].value;
890
891 uc_eff = uclamp_eff_get(p, clamp_id);
892
893 return uc_eff.value;
894}
895
896/*
897 * When a task is enqueued on a rq, the clamp bucket currently defined by the
898 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
899 * updates the rq's clamp value if required.
900 *
901 * Tasks can have a task-specific value requested from user-space, track
902 * within each bucket the maximum value for tasks refcounted in it.
903 * This "local max aggregation" allows to track the exact "requested" value
904 * for each bucket when all its RUNNABLE tasks require the same clamp.
905 */
906static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
907 unsigned int clamp_id)
908{
909 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
910 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
911 struct uclamp_bucket *bucket;
912
913 lockdep_assert_held(&rq->lock);
914
915 /* Update task effective clamp */
916 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
917
918 bucket = &uc_rq->bucket[uc_se->bucket_id];
919 bucket->tasks++;
920 uc_se->active = true;
921
922 uclamp_idle_reset(rq, clamp_id, uc_se->value);
923
924 /*
925 * Local max aggregation: rq buckets always track the max
926 * "requested" clamp value of its RUNNABLE tasks.
927 */
928 if (bucket->tasks == 1 || uc_se->value > bucket->value)
929 bucket->value = uc_se->value;
930
931 if (uc_se->value > READ_ONCE(uc_rq->value))
932 WRITE_ONCE(uc_rq->value, uc_se->value);
933}
934
935/*
936 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
937 * is released. If this is the last task reference counting the rq's max
938 * active clamp value, then the rq's clamp value is updated.
939 *
940 * Both refcounted tasks and rq's cached clamp values are expected to be
941 * always valid. If it's detected they are not, as defensive programming,
942 * enforce the expected state and warn.
943 */
944static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
945 unsigned int clamp_id)
946{
947 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
948 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
949 struct uclamp_bucket *bucket;
950 unsigned int bkt_clamp;
951 unsigned int rq_clamp;
952
953 lockdep_assert_held(&rq->lock);
954
955 bucket = &uc_rq->bucket[uc_se->bucket_id];
956 SCHED_WARN_ON(!bucket->tasks);
957 if (likely(bucket->tasks))
958 bucket->tasks--;
959 uc_se->active = false;
960
961 /*
962 * Keep "local max aggregation" simple and accept to (possibly)
963 * overboost some RUNNABLE tasks in the same bucket.
964 * The rq clamp bucket value is reset to its base value whenever
965 * there are no more RUNNABLE tasks refcounting it.
966 */
967 if (likely(bucket->tasks))
968 return;
969
970 rq_clamp = READ_ONCE(uc_rq->value);
971 /*
972 * Defensive programming: this should never happen. If it happens,
973 * e.g. due to future modification, warn and fixup the expected value.
974 */
975 SCHED_WARN_ON(bucket->value > rq_clamp);
976 if (bucket->value >= rq_clamp) {
977 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
978 WRITE_ONCE(uc_rq->value, bkt_clamp);
979 }
980}
981
982static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
983{
984 unsigned int clamp_id;
985
986 if (unlikely(!p->sched_class->uclamp_enabled))
987 return;
988
989 for_each_clamp_id(clamp_id)
990 uclamp_rq_inc_id(rq, p, clamp_id);
991
992 /* Reset clamp idle holding when there is one RUNNABLE task */
993 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
994 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
995}
996
997static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
998{
999 unsigned int clamp_id;
1000
1001 if (unlikely(!p->sched_class->uclamp_enabled))
1002 return;
1003
1004 for_each_clamp_id(clamp_id)
1005 uclamp_rq_dec_id(rq, p, clamp_id);
1006}
1007
1008int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1009 void __user *buffer, size_t *lenp,
1010 loff_t *ppos)
1011{
1012 int old_min, old_max;
1013 static DEFINE_MUTEX(mutex);
1014 int result;
1015
1016 mutex_lock(&mutex);
1017 old_min = sysctl_sched_uclamp_util_min;
1018 old_max = sysctl_sched_uclamp_util_max;
1019
1020 result = proc_dointvec(table, write, buffer, lenp, ppos);
1021 if (result)
1022 goto undo;
1023 if (!write)
1024 goto done;
1025
1026 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1027 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1028 result = -EINVAL;
1029 goto undo;
1030 }
1031
1032 if (old_min != sysctl_sched_uclamp_util_min) {
1033 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1034 sysctl_sched_uclamp_util_min, false);
1035 }
1036 if (old_max != sysctl_sched_uclamp_util_max) {
1037 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1038 sysctl_sched_uclamp_util_max, false);
1039 }
1040
1041 /*
1042 * Updating all the RUNNABLE task is expensive, keep it simple and do
1043 * just a lazy update at each next enqueue time.
1044 */
1045 goto done;
1046
1047undo:
1048 sysctl_sched_uclamp_util_min = old_min;
1049 sysctl_sched_uclamp_util_max = old_max;
1050done:
1051 mutex_unlock(&mutex);
1052
1053 return result;
1054}
1055
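
For completeness, a rough sketch of driving these system-wide defaults from userspace. The procfs paths are assumed here (the sysctl table entries backing sysctl_sched_uclamp_util_min/max live in kernel/sysctl.c, outside this hunk), so treat the names as illustrative rather than authoritative.

        /* Sketch: adjust the system-default clamps (paths assumed, see above). */
        #include <stdio.h>

        static int write_sysctl(const char *path, unsigned int value)
        {
                FILE *f = fopen(path, "w");

                if (!f)
                        return -1;
                fprintf(f, "%u\n", value);
                return fclose(f);
        }

        int main(void)
        {
                /* Must hold: util_min <= util_max <= SCHED_CAPACITY_SCALE (1024). */
                write_sysctl("/proc/sys/kernel/sched_util_clamp_min", 128);
                write_sysctl("/proc/sys/kernel/sched_util_clamp_max", 1024);
                return 0;
        }

As the comment in the handler notes, a successful write does not walk already RUNNABLE tasks; the new defaults are picked up lazily at each task's next enqueue.
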
1056static int uclamp_validate(struct task_struct *p,
1057 const struct sched_attr *attr)
1058{
1059 unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1060 unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1061
1062 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1063 lower_bound = attr->sched_util_min;
1064 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1065 upper_bound = attr->sched_util_max;
1066
1067 if (lower_bound > upper_bound)
1068 return -EINVAL;
1069 if (upper_bound > SCHED_CAPACITY_SCALE)
1070 return -EINVAL;
1071
1072 return 0;
1073}
1074
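
uclamp_validate() is the first thing a sched_setattr() caller hits, so a hedged userspace sketch may help. glibc provides no sched_setattr() wrapper, and the struct layout and flag values below merely mirror the uapi additions of this series (include/uapi/linux/sched.h); they are restated for illustration and should be double-checked against the headers actually installed.

        /* Sketch: request a per-task utilization boost via sched_setattr(). */
        #define _GNU_SOURCE
        #include <stdint.h>
        #include <stdio.h>
        #include <unistd.h>
        #include <sys/syscall.h>

        /* Mirrors the uapi layout extended by this series (SCHED_ATTR_SIZE_VER1). */
        struct sched_attr {
                uint32_t size;
                uint32_t sched_policy;
                uint64_t sched_flags;
                int32_t  sched_nice;
                uint32_t sched_priority;
                uint64_t sched_runtime;
                uint64_t sched_deadline;
                uint64_t sched_period;
                uint32_t sched_util_min;        /* new in VER1 */
                uint32_t sched_util_max;        /* new in VER1 */
        };

        #define SCHED_FLAG_KEEP_POLICY          0x08
        #define SCHED_FLAG_KEEP_PARAMS          0x10
        #define SCHED_FLAG_UTIL_CLAMP_MIN       0x20
        #define SCHED_FLAG_UTIL_CLAMP_MAX       0x40

        int main(void)
        {
                struct sched_attr attr = {
                        .size           = sizeof(attr),
                        /* Only touch the clamps; keep policy and params as they are. */
                        .sched_flags    = SCHED_FLAG_KEEP_POLICY |
                                          SCHED_FLAG_KEEP_PARAMS |
                                          SCHED_FLAG_UTIL_CLAMP_MIN |
                                          SCHED_FLAG_UTIL_CLAMP_MAX,
                        .sched_util_min = 512,  /* boost: behave as if >= 50% busy */
                        .sched_util_max = 1024, /* no cap */
                };

                /* pid 0 == current task; the final argument is unused and must be 0. */
                if (syscall(SYS_sched_setattr, 0, &attr, 0))
                        perror("sched_setattr");
                return 0;
        }

With SCHED_FLAG_KEEP_POLICY and SCHED_FLAG_KEEP_PARAMS also set (both handled later in this patch), such a call changes only the clamps, and the sched_getattr() hunk further down reports the requested values back.
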
1075static void __setscheduler_uclamp(struct task_struct *p,
1076 const struct sched_attr *attr)
1077{
1078 unsigned int clamp_id;
1079
1080 /*
1081 * On scheduling class change, reset to default clamps for tasks
1082 * without a task-specific value.
1083 */
1084 for_each_clamp_id(clamp_id) {
1085 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1086 unsigned int clamp_value = uclamp_none(clamp_id);
1087
1088 /* Keep using defined clamps across class changes */
1089 if (uc_se->user_defined)
1090 continue;
1091
1092 /* By default, RT tasks always get 100% boost */
1093 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1094 clamp_value = uclamp_none(UCLAMP_MAX);
1095
1096 uclamp_se_set(uc_se, clamp_value, false);
1097 }
1098
1099 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1100 return;
1101
1102 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1103 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1104 attr->sched_util_min, true);
1105 }
1106
1107 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1108 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1109 attr->sched_util_max, true);
1110 }
1111}
1112
1113static void uclamp_fork(struct task_struct *p)
1114{
1115 unsigned int clamp_id;
1116
1117 for_each_clamp_id(clamp_id)
1118 p->uclamp[clamp_id].active = false;
1119
1120 if (likely(!p->sched_reset_on_fork))
1121 return;
1122
1123 for_each_clamp_id(clamp_id) {
1124 unsigned int clamp_value = uclamp_none(clamp_id);
1125
1126 /* By default, RT tasks always get 100% boost */
1127 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1128 clamp_value = uclamp_none(UCLAMP_MAX);
1129
1130 uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
1131 }
1132}
1133
1134static void __init init_uclamp(void)
1135{
1136 struct uclamp_se uc_max = {};
1137 unsigned int clamp_id;
1138 int cpu;
1139
1140 for_each_possible_cpu(cpu) {
1141 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
1142 cpu_rq(cpu)->uclamp_flags = 0;
1143 }
1144
1145 for_each_clamp_id(clamp_id) {
1146 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1147 uclamp_none(clamp_id), false);
1148 }
1149
1150 /* System defaults allow max clamp values for both indexes */
1151 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1152 for_each_clamp_id(clamp_id)
1153 uclamp_default[clamp_id] = uc_max;
1154}
1155
1156#else /* CONFIG_UCLAMP_TASK */
1157static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1158static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1159static inline int uclamp_validate(struct task_struct *p,
1160 const struct sched_attr *attr)
1161{
1162 return -EOPNOTSUPP;
1163}
1164static void __setscheduler_uclamp(struct task_struct *p,
1165 const struct sched_attr *attr) { }
1166static inline void uclamp_fork(struct task_struct *p) { }
1167static inline void init_uclamp(void) { }
1168#endif /* CONFIG_UCLAMP_TASK */
1169
764static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1170static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
765{ 1171{
766 if (!(flags & ENQUEUE_NOCLOCK)) 1172 if (!(flags & ENQUEUE_NOCLOCK))
@@ -771,6 +1177,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
771 psi_enqueue(p, flags & ENQUEUE_WAKEUP); 1177 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
772 } 1178 }
773 1179
1180 uclamp_rq_inc(rq, p);
774 p->sched_class->enqueue_task(rq, p, flags); 1181 p->sched_class->enqueue_task(rq, p, flags);
775} 1182}
776 1183
@@ -784,6 +1191,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
784 psi_dequeue(p, flags & DEQUEUE_SLEEP); 1191 psi_dequeue(p, flags & DEQUEUE_SLEEP);
785 } 1192 }
786 1193
1194 uclamp_rq_dec(rq, p);
787 p->sched_class->dequeue_task(rq, p, flags); 1195 p->sched_class->dequeue_task(rq, p, flags);
788} 1196}
789 1197
@@ -930,7 +1338,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
930 */ 1338 */
931static inline bool is_cpu_allowed(struct task_struct *p, int cpu) 1339static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
932{ 1340{
933 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 1341 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
934 return false; 1342 return false;
935 1343
936 if (is_per_cpu_kthread(p)) 1344 if (is_per_cpu_kthread(p))
@@ -1025,7 +1433,7 @@ static int migration_cpu_stop(void *data)
1025 local_irq_disable(); 1433 local_irq_disable();
1026 /* 1434 /*
1027 * We need to explicitly wake pending tasks before running 1435 * We need to explicitly wake pending tasks before running
1028 * __migrate_task() such that we will not miss enforcing cpus_allowed 1436 * __migrate_task() such that we will not miss enforcing cpus_ptr
1029 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 1437 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
1030 */ 1438 */
1031 sched_ttwu_pending(); 1439 sched_ttwu_pending();
@@ -1056,7 +1464,7 @@ static int migration_cpu_stop(void *data)
1056 */ 1464 */
1057void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) 1465void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1058{ 1466{
1059 cpumask_copy(&p->cpus_allowed, new_mask); 1467 cpumask_copy(&p->cpus_mask, new_mask);
1060 p->nr_cpus_allowed = cpumask_weight(new_mask); 1468 p->nr_cpus_allowed = cpumask_weight(new_mask);
1061} 1469}
1062 1470
@@ -1126,7 +1534,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
1126 goto out; 1534 goto out;
1127 } 1535 }
1128 1536
1129 if (cpumask_equal(&p->cpus_allowed, new_mask)) 1537 if (cpumask_equal(p->cpus_ptr, new_mask))
1130 goto out; 1538 goto out;
1131 1539
1132 if (!cpumask_intersects(new_mask, cpu_valid_mask)) { 1540 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -1286,10 +1694,10 @@ static int migrate_swap_stop(void *data)
1286 if (task_cpu(arg->src_task) != arg->src_cpu) 1694 if (task_cpu(arg->src_task) != arg->src_cpu)
1287 goto unlock; 1695 goto unlock;
1288 1696
1289 if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) 1697 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
1290 goto unlock; 1698 goto unlock;
1291 1699
1292 if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) 1700 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
1293 goto unlock; 1701 goto unlock;
1294 1702
1295 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1703 __migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1331,10 +1739,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
1331 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1739 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1332 goto out; 1740 goto out;
1333 1741
1334 if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) 1742 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
1335 goto out; 1743 goto out;
1336 1744
1337 if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) 1745 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
1338 goto out; 1746 goto out;
1339 1747
1340 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1748 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1479,7 +1887,7 @@ void kick_process(struct task_struct *p)
1479EXPORT_SYMBOL_GPL(kick_process); 1887EXPORT_SYMBOL_GPL(kick_process);
1480 1888
1481/* 1889/*
1482 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1890 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
1483 * 1891 *
1484 * A few notes on cpu_active vs cpu_online: 1892 * A few notes on cpu_active vs cpu_online:
1485 * 1893 *
@@ -1519,14 +1927,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1519 for_each_cpu(dest_cpu, nodemask) { 1927 for_each_cpu(dest_cpu, nodemask) {
1520 if (!cpu_active(dest_cpu)) 1928 if (!cpu_active(dest_cpu))
1521 continue; 1929 continue;
1522 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 1930 if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
1523 return dest_cpu; 1931 return dest_cpu;
1524 } 1932 }
1525 } 1933 }
1526 1934
1527 for (;;) { 1935 for (;;) {
1528 /* Any allowed, online CPU? */ 1936 /* Any allowed, online CPU? */
1529 for_each_cpu(dest_cpu, &p->cpus_allowed) { 1937 for_each_cpu(dest_cpu, p->cpus_ptr) {
1530 if (!is_cpu_allowed(p, dest_cpu)) 1938 if (!is_cpu_allowed(p, dest_cpu))
1531 continue; 1939 continue;
1532 1940
@@ -1570,7 +1978,7 @@ out:
1570} 1978}
1571 1979
1572/* 1980/*
1573 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1981 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
1574 */ 1982 */
1575static inline 1983static inline
1576int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1984int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1580,11 +1988,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1580 if (p->nr_cpus_allowed > 1) 1988 if (p->nr_cpus_allowed > 1)
1581 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1989 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1582 else 1990 else
1583 cpu = cpumask_any(&p->cpus_allowed); 1991 cpu = cpumask_any(p->cpus_ptr);
1584 1992
1585 /* 1993 /*
1586 * In order not to call set_task_cpu() on a blocking task we need 1994 * In order not to call set_task_cpu() on a blocking task we need
1587 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1995 * to rely on ttwu() to place the task on a valid ->cpus_ptr
1588 * CPU. 1996 * CPU.
1589 * 1997 *
1590 * Since this is common to all placement strategies, this lives here. 1998 * Since this is common to all placement strategies, this lives here.
@@ -1991,6 +2399,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1991 unsigned long flags; 2399 unsigned long flags;
1992 int cpu, success = 0; 2400 int cpu, success = 0;
1993 2401
2402 if (p == current) {
2403 /*
2404 * We're waking current, which means 'p->on_rq' and 'task_cpu(p)
2405 * == smp_processor_id()'. Together this means we can special
2406 * case the whole 'p->on_rq && ttwu_remote()' case below
2407 * without taking any locks.
2408 *
2409 * In particular:
2410 * - we rely on Program-Order guarantees for all the ordering,
2411 * - we're serialized against set_special_state() by virtue of
2412 * it disabling IRQs (this allows not taking ->pi_lock).
2413 */
2414 if (!(p->state & state))
2415 return false;
2416
2417 success = 1;
2418 cpu = task_cpu(p);
2419 trace_sched_waking(p);
2420 p->state = TASK_RUNNING;
2421 trace_sched_wakeup(p);
2422 goto out;
2423 }
2424
1994 /* 2425 /*
1995 * If we are going to wake up a thread waiting for CONDITION we 2426 * If we are going to wake up a thread waiting for CONDITION we
1996 * need to ensure that CONDITION=1 done by the caller can not be 2427 * need to ensure that CONDITION=1 done by the caller can not be
@@ -2000,7 +2431,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2000 raw_spin_lock_irqsave(&p->pi_lock, flags); 2431 raw_spin_lock_irqsave(&p->pi_lock, flags);
2001 smp_mb__after_spinlock(); 2432 smp_mb__after_spinlock();
2002 if (!(p->state & state)) 2433 if (!(p->state & state))
2003 goto out; 2434 goto unlock;
2004 2435
2005 trace_sched_waking(p); 2436 trace_sched_waking(p);
2006 2437
@@ -2030,7 +2461,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2030 */ 2461 */
2031 smp_rmb(); 2462 smp_rmb();
2032 if (p->on_rq && ttwu_remote(p, wake_flags)) 2463 if (p->on_rq && ttwu_remote(p, wake_flags))
2033 goto stat; 2464 goto unlock;
2034 2465
2035#ifdef CONFIG_SMP 2466#ifdef CONFIG_SMP
2036 /* 2467 /*
@@ -2090,10 +2521,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2090#endif /* CONFIG_SMP */ 2521#endif /* CONFIG_SMP */
2091 2522
2092 ttwu_queue(p, cpu, wake_flags); 2523 ttwu_queue(p, cpu, wake_flags);
2093stat: 2524unlock:
2094 ttwu_stat(p, cpu, wake_flags);
2095out:
2096 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2525 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2526out:
2527 if (success)
2528 ttwu_stat(p, cpu, wake_flags);
2097 2529
2098 return success; 2530 return success;
2099} 2531}
@@ -2300,6 +2732,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2300 */ 2732 */
2301 p->prio = current->normal_prio; 2733 p->prio = current->normal_prio;
2302 2734
2735 uclamp_fork(p);
2736
2303 /* 2737 /*
2304 * Revert to default priority/policy on fork if requested. 2738 * Revert to default priority/policy on fork if requested.
2305 */ 2739 */
@@ -2395,7 +2829,7 @@ void wake_up_new_task(struct task_struct *p)
2395#ifdef CONFIG_SMP 2829#ifdef CONFIG_SMP
2396 /* 2830 /*
2397 * Fork balancing, do it here and not earlier because: 2831 * Fork balancing, do it here and not earlier because:
2398 * - cpus_allowed can change in the fork path 2832 * - cpus_ptr can change in the fork path
2399 * - any previously selected CPU might disappear through hotplug 2833 * - any previously selected CPU might disappear through hotplug
2400 * 2834 *
2401 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, 2835 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -3033,7 +3467,6 @@ void scheduler_tick(void)
3033 3467
3034 update_rq_clock(rq); 3468 update_rq_clock(rq);
3035 curr->sched_class->task_tick(rq, curr, 0); 3469 curr->sched_class->task_tick(rq, curr, 0);
3036 cpu_load_update_active(rq);
3037 calc_global_load_tick(rq); 3470 calc_global_load_tick(rq);
3038 psi_task_tick(rq); 3471 psi_task_tick(rq);
3039 3472
@@ -4071,6 +4504,13 @@ static void __setscheduler_params(struct task_struct *p,
4071static void __setscheduler(struct rq *rq, struct task_struct *p, 4504static void __setscheduler(struct rq *rq, struct task_struct *p,
4072 const struct sched_attr *attr, bool keep_boost) 4505 const struct sched_attr *attr, bool keep_boost)
4073{ 4506{
4507 /*
4508 * If params can't change, scheduling class changes aren't allowed
4509 * either.
4510 */
4511 if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4512 return;
4513
4074 __setscheduler_params(p, attr); 4514 __setscheduler_params(p, attr);
4075 4515
4076 /* 4516 /*
@@ -4208,6 +4648,13 @@ recheck:
4208 return retval; 4648 return retval;
4209 } 4649 }
4210 4650
4651 /* Update task specific "requested" clamps */
4652 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
4653 retval = uclamp_validate(p, attr);
4654 if (retval)
4655 return retval;
4656 }
4657
4211 /* 4658 /*
4212 * Make sure no PI-waiters arrive (or leave) while we are 4659 * Make sure no PI-waiters arrive (or leave) while we are
4213 * changing the priority of the task: 4660 * changing the priority of the task:
@@ -4237,6 +4684,8 @@ recheck:
4237 goto change; 4684 goto change;
4238 if (dl_policy(policy) && dl_param_changed(p, attr)) 4685 if (dl_policy(policy) && dl_param_changed(p, attr))
4239 goto change; 4686 goto change;
4687 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
4688 goto change;
4240 4689
4241 p->sched_reset_on_fork = reset_on_fork; 4690 p->sched_reset_on_fork = reset_on_fork;
4242 task_rq_unlock(rq, p, &rf); 4691 task_rq_unlock(rq, p, &rf);
@@ -4267,7 +4716,7 @@ change:
4267 * the entire root_domain to become SCHED_DEADLINE. We 4716 * the entire root_domain to become SCHED_DEADLINE. We
4268 * will also fail if there's no bandwidth available. 4717 * will also fail if there's no bandwidth available.
4269 */ 4718 */
4270 if (!cpumask_subset(span, &p->cpus_allowed) || 4719 if (!cpumask_subset(span, p->cpus_ptr) ||
4271 rq->rd->dl_bw.bw == 0) { 4720 rq->rd->dl_bw.bw == 0) {
4272 task_rq_unlock(rq, p, &rf); 4721 task_rq_unlock(rq, p, &rf);
4273 return -EPERM; 4722 return -EPERM;
@@ -4317,7 +4766,9 @@ change:
4317 put_prev_task(rq, p); 4766 put_prev_task(rq, p);
4318 4767
4319 prev_class = p->sched_class; 4768 prev_class = p->sched_class;
4769
4320 __setscheduler(rq, p, attr, pi); 4770 __setscheduler(rq, p, attr, pi);
4771 __setscheduler_uclamp(p, attr);
4321 4772
4322 if (queued) { 4773 if (queued) {
4323 /* 4774 /*
@@ -4493,6 +4944,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
4493 if (ret) 4944 if (ret)
4494 return -EFAULT; 4945 return -EFAULT;
4495 4946
4947 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
4948 size < SCHED_ATTR_SIZE_VER1)
4949 return -EINVAL;
4950
4496 /* 4951 /*
4497 * XXX: Do we want to be lenient like existing syscalls; or do we want 4952 * XXX: Do we want to be lenient like existing syscalls; or do we want
4498 * to be strict and return an error on out-of-bounds values? 4953 * to be strict and return an error on out-of-bounds values?
@@ -4556,14 +5011,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4556 5011
4557 if ((int)attr.sched_policy < 0) 5012 if ((int)attr.sched_policy < 0)
4558 return -EINVAL; 5013 return -EINVAL;
5014 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
5015 attr.sched_policy = SETPARAM_POLICY;
4559 5016
4560 rcu_read_lock(); 5017 rcu_read_lock();
4561 retval = -ESRCH; 5018 retval = -ESRCH;
4562 p = find_process_by_pid(pid); 5019 p = find_process_by_pid(pid);
4563 if (p != NULL) 5020 if (likely(p))
4564 retval = sched_setattr(p, &attr); 5021 get_task_struct(p);
4565 rcu_read_unlock(); 5022 rcu_read_unlock();
4566 5023
5024 if (likely(p)) {
5025 retval = sched_setattr(p, &attr);
5026 put_task_struct(p);
5027 }
5028
4567 return retval; 5029 return retval;
4568} 5030}
4569 5031
@@ -4714,6 +5176,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4714 else 5176 else
4715 attr.sched_nice = task_nice(p); 5177 attr.sched_nice = task_nice(p);
4716 5178
5179#ifdef CONFIG_UCLAMP_TASK
5180 attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
5181 attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
5182#endif
5183
4717 rcu_read_unlock(); 5184 rcu_read_unlock();
4718 5185
4719 retval = sched_read_attr(uattr, &attr, size); 5186 retval = sched_read_attr(uattr, &attr, size);
@@ -4866,7 +5333,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
4866 goto out_unlock; 5333 goto out_unlock;
4867 5334
4868 raw_spin_lock_irqsave(&p->pi_lock, flags); 5335 raw_spin_lock_irqsave(&p->pi_lock, flags);
4869 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 5336 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
4870 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5337 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4871 5338
4872out_unlock: 5339out_unlock:
@@ -5123,7 +5590,7 @@ long __sched io_schedule_timeout(long timeout)
5123} 5590}
5124EXPORT_SYMBOL(io_schedule_timeout); 5591EXPORT_SYMBOL(io_schedule_timeout);
5125 5592
5126void io_schedule(void) 5593void __sched io_schedule(void)
5127{ 5594{
5128 int token; 5595 int token;
5129 5596
@@ -5443,7 +5910,7 @@ int task_can_attach(struct task_struct *p,
5443 * allowed nodes is unnecessary. Thus, cpusets are not 5910 * allowed nodes is unnecessary. Thus, cpusets are not
5444 * applicable for such threads. This prevents checking for 5911 * applicable for such threads. This prevents checking for
5445 * success of set_cpus_allowed_ptr() on all attached tasks 5912 * success of set_cpus_allowed_ptr() on all attached tasks
5446 * before cpus_allowed may be changed. 5913 * before cpus_mask may be changed.
5447 */ 5914 */
5448 if (p->flags & PF_NO_SETAFFINITY) { 5915 if (p->flags & PF_NO_SETAFFINITY) {
5449 ret = -EINVAL; 5916 ret = -EINVAL;
@@ -5470,7 +5937,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
5470 if (curr_cpu == target_cpu) 5937 if (curr_cpu == target_cpu)
5471 return 0; 5938 return 0;
5472 5939
5473 if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) 5940 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
5474 return -EINVAL; 5941 return -EINVAL;
5475 5942
5476 /* TODO: This is not properly updating schedstats */ 5943 /* TODO: This is not properly updating schedstats */
@@ -5608,7 +6075,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5608 put_prev_task(rq, next); 6075 put_prev_task(rq, next);
5609 6076
5610 /* 6077 /*
5611 * Rules for changing task_struct::cpus_allowed are holding 6078 * Rules for changing task_struct::cpus_mask are holding
5612 * both pi_lock and rq->lock, such that holding either 6079 * both pi_lock and rq->lock, such that holding either
5613 * stabilizes the mask. 6080 * stabilizes the mask.
5614 * 6081 *
@@ -5902,8 +6369,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
5902 6369
5903void __init sched_init(void) 6370void __init sched_init(void)
5904{ 6371{
5905 int i, j;
5906 unsigned long alloc_size = 0, ptr; 6372 unsigned long alloc_size = 0, ptr;
6373 int i;
5907 6374
5908 wait_bit_init(); 6375 wait_bit_init();
5909 6376
@@ -6005,10 +6472,6 @@ void __init sched_init(void)
6005#ifdef CONFIG_RT_GROUP_SCHED 6472#ifdef CONFIG_RT_GROUP_SCHED
6006 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6473 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6007#endif 6474#endif
6008
6009 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6010 rq->cpu_load[j] = 0;
6011
6012#ifdef CONFIG_SMP 6475#ifdef CONFIG_SMP
6013 rq->sd = NULL; 6476 rq->sd = NULL;
6014 rq->rd = NULL; 6477 rq->rd = NULL;
@@ -6063,6 +6526,8 @@ void __init sched_init(void)
6063 6526
6064 psi_init(); 6527 psi_init();
6065 6528
6529 init_uclamp();
6530
6066 scheduler_running = 1; 6531 scheduler_running = 1;
6067} 6532}
6068 6533
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index ec4e4a9aab5f..5cc4012572ec 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -120,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
120 const struct sched_dl_entity *dl_se = &p->dl; 120 const struct sched_dl_entity *dl_se = &p->dl;
121 121
122 if (later_mask && 122 if (later_mask &&
123 cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { 123 cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
124 return 1; 124 return 1;
125 } else { 125 } else {
126 int best_cpu = cpudl_maximum(cp); 126 int best_cpu = cpudl_maximum(cp);
127 127
128 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 128 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
129 129
130 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && 130 if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
131 dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 131 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
132 if (later_mask) 132 if (later_mask)
133 cpumask_set_cpu(best_cpu, later_mask); 133 cpumask_set_cpu(best_cpu, later_mask);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 962cf343f798..636ca6f88c8e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -196,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
196 * based on the task model parameters and gives the minimal utilization 196 * based on the task model parameters and gives the minimal utilization
197 * required to meet deadlines. 197 * required to meet deadlines.
198 */ 198 */
199unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, 199unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
200 unsigned long max, enum schedutil_type type) 200 unsigned long max, enum schedutil_type type,
201 struct task_struct *p)
201{ 202{
202 unsigned long dl_util, util, irq; 203 unsigned long dl_util, util, irq;
203 struct rq *rq = cpu_rq(cpu); 204 struct rq *rq = cpu_rq(cpu);
204 205
205 if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) 206 if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
207 type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
206 return max; 208 return max;
209 }
207 210
208 /* 211 /*
209 * Early check to see if IRQ/steal time saturates the CPU, can be 212 * Early check to see if IRQ/steal time saturates the CPU, can be
@@ -219,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
219 * CFS tasks and we use the same metric to track the effective 222 * CFS tasks and we use the same metric to track the effective
220 * utilization (PELT windows are synchronized) we can directly add them 223 * utilization (PELT windows are synchronized) we can directly add them
221 * to obtain the CPU's actual utilization. 224 * to obtain the CPU's actual utilization.
225 *
226 * CFS and RT utilization can be boosted or capped, depending on
227 * utilization clamp constraints requested by currently RUNNABLE
228 * tasks.
229 * When there are no CFS RUNNABLE tasks, clamps are released and
230 * frequency will be gracefully reduced with the utilization decay.
222 */ 231 */
223 util = util_cfs; 232 util = util_cfs + cpu_util_rt(rq);
224 util += cpu_util_rt(rq); 233 if (type == FREQUENCY_UTIL)
234 util = uclamp_util_with(rq, util, p);
225 235
226 dl_util = cpu_util_dl(rq); 236 dl_util = cpu_util_dl(rq);
227 237
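
The new comment above is the heart of the cpufreq integration: for FREQUENCY_UTIL requests the aggregated CFS+RT utilization is clamped by the RUNNABLE tasks' clamp aggregate before DL and IRQ pressure are factored in. A toy sketch of that clamping step, approximating uclamp_util_with() (defined elsewhere in this series, and also folding in the clamps of the task p being woken) with a plain clamp against the rq-level min/max:

        /* Sketch: effective frequency-request utilization under uclamp (simplified). */
        #include <stdio.h>

        static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
        {
                return v < lo ? lo : (v > hi ? hi : v);
        }

        /* util_cfs + util_rt clamped into the rq's aggregated [min, max] window. */
        static unsigned long freq_util_sketch(unsigned long util_cfs, unsigned long util_rt,
                                              unsigned long rq_min, unsigned long rq_max)
        {
                return clamp_ul(util_cfs + util_rt, rq_min, rq_max);
        }

        int main(void)
        {
                /* Nearly idle CPU, but a RUNNABLE task is boosted to 512: */
                printf("%lu\n", freq_util_sketch(60, 20, 512, 1024));   /* -> 512 */
                /* Busy CPU whose RUNNABLE tasks are all capped at 256: */
                printf("%lu\n", freq_util_sketch(700, 100, 0, 256));    /* -> 256 */
                return 0;
        }
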
@@ -276,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
276{ 286{
277 struct rq *rq = cpu_rq(sg_cpu->cpu); 287 struct rq *rq = cpu_rq(sg_cpu->cpu);
278 unsigned long util = cpu_util_cfs(rq); 288 unsigned long util = cpu_util_cfs(rq);
279 unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); 289 unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
280 290
281 sg_cpu->max = max; 291 sg_cpu->max = max;
282 sg_cpu->bw_dl = cpu_bw_dl(rq); 292 sg_cpu->bw_dl = cpu_bw_dl(rq);
283 293
284 return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); 294 return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
285} 295}
286 296
287/** 297/**
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 9c6480e6d62d..b7abca987d94 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -94,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
94 if (skip) 94 if (skip)
95 continue; 95 continue;
96 96
97 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 97 if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
98 continue; 98 continue;
99 99
100 if (lowest_mask) { 100 if (lowest_mask) {
101 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 101 cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
102 102
103 /* 103 /*
104 * We have to ensure that we have at least one bit 104 * We have to ensure that we have at least one bit
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 43901fa3f269..8b5bb2ac16e2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
538 * If we cannot preempt any rq, fall back to pick any 538 * If we cannot preempt any rq, fall back to pick any
539 * online CPU: 539 * online CPU:
540 */ 540 */
541 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 541 cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
542 if (cpu >= nr_cpu_ids) { 542 if (cpu >= nr_cpu_ids) {
543 /* 543 /*
544 * Failed to find any suitable CPU. 544 * Failed to find any suitable CPU.
@@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq)
1195 &curr->dl); 1195 &curr->dl);
1196 } else { 1196 } else {
1197 unsigned long scale_freq = arch_scale_freq_capacity(cpu); 1197 unsigned long scale_freq = arch_scale_freq_capacity(cpu);
1198 unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); 1198 unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
1199 1199
1200 scaled_delta_exec = cap_scale(delta_exec, scale_freq); 1200 scaled_delta_exec = cap_scale(delta_exec, scale_freq);
1201 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); 1201 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
@@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq)
1824static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1824static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1825{ 1825{
1826 if (!task_running(rq, p) && 1826 if (!task_running(rq, p) &&
1827 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1827 cpumask_test_cpu(cpu, p->cpus_ptr))
1828 return 1; 1828 return 1;
1829 return 0; 1829 return 0;
1830} 1830}
@@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1974 /* Retry if something changed. */ 1974 /* Retry if something changed. */
1975 if (double_lock_balance(rq, later_rq)) { 1975 if (double_lock_balance(rq, later_rq)) {
1976 if (unlikely(task_rq(task) != rq || 1976 if (unlikely(task_rq(task) != rq ||
1977 !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || 1977 !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
1978 task_running(rq, task) || 1978 task_running(rq, task) ||
1979 !dl_task(task) || 1979 !dl_task(task) ||
1980 !task_on_rq_queued(task))) { 1980 !task_on_rq_queued(task))) {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 14c6a8716ba1..f7e4579e746c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -233,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
233 *tablep = NULL; 233 *tablep = NULL;
234} 234}
235 235
236static int min_load_idx = 0;
237static int max_load_idx = CPU_LOAD_IDX_MAX-1;
238
239static void 236static void
240set_table_entry(struct ctl_table *entry, 237set_table_entry(struct ctl_table *entry,
241 const char *procname, void *data, int maxlen, 238 const char *procname, void *data, int maxlen,
242 umode_t mode, proc_handler *proc_handler, 239 umode_t mode, proc_handler *proc_handler)
243 bool load_idx)
244{ 240{
245 entry->procname = procname; 241 entry->procname = procname;
246 entry->data = data; 242 entry->data = data;
247 entry->maxlen = maxlen; 243 entry->maxlen = maxlen;
248 entry->mode = mode; 244 entry->mode = mode;
249 entry->proc_handler = proc_handler; 245 entry->proc_handler = proc_handler;
250
251 if (load_idx) {
252 entry->extra1 = &min_load_idx;
253 entry->extra2 = &max_load_idx;
254 }
255} 246}
256 247
257static struct ctl_table * 248static struct ctl_table *
258sd_alloc_ctl_domain_table(struct sched_domain *sd) 249sd_alloc_ctl_domain_table(struct sched_domain *sd)
259{ 250{
260 struct ctl_table *table = sd_alloc_ctl_entry(14); 251 struct ctl_table *table = sd_alloc_ctl_entry(9);
261 252
262 if (table == NULL) 253 if (table == NULL)
263 return NULL; 254 return NULL;
264 255
265 set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); 256 set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
266 set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); 257 set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
267 set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 258 set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
268 set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 259 set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
269 set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 260 set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
270 set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 261 set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
271 set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 262 set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
272 set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); 263 set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
273 set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); 264 /* &table[8] is terminator */
274 set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
275 set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
276 set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
277 set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
278 /* &table[13] is terminator */
279 265
280 return table; 266 return table;
281} 267}
@@ -653,8 +639,6 @@ do { \
653 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 639 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
654 640
655 P(nr_running); 641 P(nr_running);
656 SEQ_printf(m, " .%-30s: %lu\n", "load",
657 rq->load.weight);
658 P(nr_switches); 642 P(nr_switches);
659 P(nr_load_updates); 643 P(nr_load_updates);
660 P(nr_uninterruptible); 644 P(nr_uninterruptible);
@@ -662,11 +646,6 @@ do { \
662 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); 646 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
663 PN(clock); 647 PN(clock);
664 PN(clock_task); 648 PN(clock_task);
665 P(cpu_load[0]);
666 P(cpu_load[1]);
667 P(cpu_load[2]);
668 P(cpu_load[3]);
669 P(cpu_load[4]);
670#undef P 649#undef P
671#undef PN 650#undef PN
672 651
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8591529e1753..036be95a87e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
275 return grp->my_q; 275 return grp->my_q;
276} 276}
277 277
278static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
279{
280 if (!path)
281 return;
282
283 if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
284 autogroup_path(cfs_rq->tg, path, len);
285 else if (cfs_rq && cfs_rq->tg->css.cgroup)
286 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
287 else
288 strlcpy(path, "(null)", len);
289}
290
278static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 291static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
279{ 292{
280 struct rq *rq = rq_of(cfs_rq); 293 struct rq *rq = rq_of(cfs_rq);
@@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
449 return NULL; 462 return NULL;
450} 463}
451 464
465static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
466{
467 if (path)
468 strlcpy(path, "(null)", len);
469}
470
452static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 471static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
453{ 472{
454 return true; 473 return true;
@@ -764,7 +783,7 @@ void post_init_entity_util_avg(struct task_struct *p)
764 struct sched_entity *se = &p->se; 783 struct sched_entity *se = &p->se;
765 struct cfs_rq *cfs_rq = cfs_rq_of(se); 784 struct cfs_rq *cfs_rq = cfs_rq_of(se);
766 struct sched_avg *sa = &se->avg; 785 struct sched_avg *sa = &se->avg;
767 long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); 786 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
768 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; 787 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
769 788
770 if (cap > 0) { 789 if (cap > 0) {
@@ -1466,9 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1466 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 1485 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1467} 1486}
1468 1487
1469static unsigned long weighted_cpuload(struct rq *rq); 1488static unsigned long cpu_runnable_load(struct rq *rq);
1470static unsigned long source_load(int cpu, int type);
1471static unsigned long target_load(int cpu, int type);
1472 1489
1473/* Cached statistics for all CPUs within a node */ 1490/* Cached statistics for all CPUs within a node */
1474struct numa_stats { 1491struct numa_stats {
@@ -1489,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1489 for_each_cpu(cpu, cpumask_of_node(nid)) { 1506 for_each_cpu(cpu, cpumask_of_node(nid)) {
1490 struct rq *rq = cpu_rq(cpu); 1507 struct rq *rq = cpu_rq(cpu);
1491 1508
1492 ns->load += weighted_cpuload(rq); 1509 ns->load += cpu_runnable_load(rq);
1493 ns->compute_capacity += capacity_of(cpu); 1510 ns->compute_capacity += capacity_of(cpu);
1494 } 1511 }
1495 1512
@@ -1621,7 +1638,7 @@ static void task_numa_compare(struct task_numa_env *env,
1621 * be incurred if the tasks were swapped. 1638 * be incurred if the tasks were swapped.
1622 */ 1639 */
1623 /* Skip this swap candidate if cannot move to the source cpu */ 1640 /* Skip this swap candidate if cannot move to the source cpu */
1624 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1641 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1625 goto unlock; 1642 goto unlock;
1626 1643
1627 /* 1644 /*
@@ -1718,7 +1735,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
1718 1735
1719 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 1736 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1720 /* Skip this CPU if the source task cannot migrate */ 1737 /* Skip this CPU if the source task cannot migrate */
1721 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) 1738 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1722 continue; 1739 continue;
1723 1740
1724 env->dst_cpu = cpu; 1741 env->dst_cpu = cpu;
@@ -2686,8 +2703,6 @@ static void
2686account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2703account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2687{ 2704{
2688 update_load_add(&cfs_rq->load, se->load.weight); 2705 update_load_add(&cfs_rq->load, se->load.weight);
2689 if (!parent_entity(se))
2690 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2691#ifdef CONFIG_SMP 2706#ifdef CONFIG_SMP
2692 if (entity_is_task(se)) { 2707 if (entity_is_task(se)) {
2693 struct rq *rq = rq_of(cfs_rq); 2708 struct rq *rq = rq_of(cfs_rq);
@@ -2703,8 +2718,6 @@ static void
2703account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2718account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2704{ 2719{
2705 update_load_sub(&cfs_rq->load, se->load.weight); 2720 update_load_sub(&cfs_rq->load, se->load.weight);
2706 if (!parent_entity(se))
2707 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2708#ifdef CONFIG_SMP 2721#ifdef CONFIG_SMP
2709 if (entity_is_task(se)) { 2722 if (entity_is_task(se)) {
2710 account_numa_dequeue(rq_of(cfs_rq), task_of(se)); 2723 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
@@ -3334,6 +3347,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
3334 update_tg_cfs_util(cfs_rq, se, gcfs_rq); 3347 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3335 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); 3348 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3336 3349
3350 trace_pelt_cfs_tp(cfs_rq);
3351 trace_pelt_se_tp(se);
3352
3337 return 1; 3353 return 1;
3338} 3354}
3339 3355
@@ -3486,6 +3502,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3486 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3502 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3487 3503
3488 cfs_rq_util_change(cfs_rq, flags); 3504 cfs_rq_util_change(cfs_rq, flags);
3505
3506 trace_pelt_cfs_tp(cfs_rq);
3489} 3507}
3490 3508
3491/** 3509/**
@@ -3505,6 +3523,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3505 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3523 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3506 3524
3507 cfs_rq_util_change(cfs_rq, 0); 3525 cfs_rq_util_change(cfs_rq, 0);
3526
3527 trace_pelt_cfs_tp(cfs_rq);
3508} 3528}
3509 3529
3510/* 3530/*
@@ -4100,7 +4120,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4100 * least twice that of our own weight (i.e. dont track it 4120 * least twice that of our own weight (i.e. dont track it
4101 * when there are only lesser-weight tasks around): 4121 * when there are only lesser-weight tasks around):
4102 */ 4122 */
4103 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 4123 if (schedstat_enabled() &&
4124 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4104 schedstat_set(se->statistics.slice_max, 4125 schedstat_set(se->statistics.slice_max,
4105 max((u64)schedstat_val(se->statistics.slice_max), 4126 max((u64)schedstat_val(se->statistics.slice_max),
4106 se->sum_exec_runtime - se->prev_sum_exec_runtime)); 4127 se->sum_exec_runtime - se->prev_sum_exec_runtime));
@@ -4734,6 +4755,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4734 if (runtime_refresh_within(cfs_b, min_left)) 4755 if (runtime_refresh_within(cfs_b, min_left))
4735 return; 4756 return;
4736 4757
4758 /* don't push forward an existing deferred unthrottle */
4759 if (cfs_b->slack_started)
4760 return;
4761 cfs_b->slack_started = true;
4762
4737 hrtimer_start(&cfs_b->slack_timer, 4763 hrtimer_start(&cfs_b->slack_timer,
4738 ns_to_ktime(cfs_bandwidth_slack_period), 4764 ns_to_ktime(cfs_bandwidth_slack_period),
4739 HRTIMER_MODE_REL); 4765 HRTIMER_MODE_REL);
@@ -4787,6 +4813,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4787 4813
4788 /* confirm we're still not at a refresh boundary */ 4814 /* confirm we're still not at a refresh boundary */
4789 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4815 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4816 cfs_b->slack_started = false;
4790 if (cfs_b->distribute_running) { 4817 if (cfs_b->distribute_running) {
4791 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 4818 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4792 return; 4819 return;
@@ -4950,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4950 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4977 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4951 cfs_b->slack_timer.function = sched_cfs_slack_timer; 4978 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4952 cfs_b->distribute_running = 0; 4979 cfs_b->distribute_running = 0;
4980 cfs_b->slack_started = false;
4953} 4981}
4954 4982
4955static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4983static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -5153,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu)
5153 5181
5154static inline void update_overutilized_status(struct rq *rq) 5182static inline void update_overutilized_status(struct rq *rq)
5155{ 5183{
5156 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) 5184 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5157 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); 5185 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5186 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5187 }
5158} 5188}
5159#else 5189#else
5160static inline void update_overutilized_status(struct rq *rq) { } 5190static inline void update_overutilized_status(struct rq *rq) { }
@@ -5325,71 +5355,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5325DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); 5355DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5326 5356
5327#ifdef CONFIG_NO_HZ_COMMON 5357#ifdef CONFIG_NO_HZ_COMMON
5328/*
5329 * per rq 'load' arrray crap; XXX kill this.
5330 */
5331
5332/*
5333 * The exact cpuload calculated at every tick would be:
5334 *
5335 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5336 *
5337 * If a CPU misses updates for n ticks (as it was idle) and update gets
5338 * called on the n+1-th tick when CPU may be busy, then we have:
5339 *
5340 * load_n = (1 - 1/2^i)^n * load_0
5341 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
5342 *
5343 * decay_load_missed() below does efficient calculation of
5344 *
5345 * load' = (1 - 1/2^i)^n * load
5346 *
5347 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5348 * This allows us to precompute the above in said factors, thereby allowing the
5349 * reduction of an arbitrary n in O(log_2 n) steps. (See also
5350 * fixed_power_int())
5351 *
5352 * The calculation is approximated on a 128 point scale.
5353 */
5354#define DEGRADE_SHIFT 7
5355
5356static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5357static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5358 { 0, 0, 0, 0, 0, 0, 0, 0 },
5359 { 64, 32, 8, 0, 0, 0, 0, 0 },
5360 { 96, 72, 40, 12, 1, 0, 0, 0 },
5361 { 112, 98, 75, 43, 15, 1, 0, 0 },
5362 { 120, 112, 98, 76, 45, 16, 2, 0 }
5363};
5364
5365/*
5366 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5367 * would be when CPU is idle and so we just decay the old load without
5368 * adding any new load.
5369 */
5370static unsigned long
5371decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5372{
5373 int j = 0;
5374
5375 if (!missed_updates)
5376 return load;
5377
5378 if (missed_updates >= degrade_zero_ticks[idx])
5379 return 0;
5380
5381 if (idx == 1)
5382 return load >> missed_updates;
5383
5384 while (missed_updates) {
5385 if (missed_updates % 2)
5386 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5387
5388 missed_updates >>= 1;
5389 j++;
5390 }
5391 return load;
5392}
5393 5358
5394static struct { 5359static struct {
5395 cpumask_var_t idle_cpus_mask; 5360 cpumask_var_t idle_cpus_mask;
@@ -5401,234 +5366,11 @@ static struct {
5401 5366
5402#endif /* CONFIG_NO_HZ_COMMON */ 5367#endif /* CONFIG_NO_HZ_COMMON */
5403 5368
5404/** 5369static unsigned long cpu_runnable_load(struct rq *rq)
5405 * __cpu_load_update - update the rq->cpu_load[] statistics
5406 * @this_rq: The rq to update statistics for
5407 * @this_load: The current load
5408 * @pending_updates: The number of missed updates
5409 *
5410 * Update rq->cpu_load[] statistics. This function is usually called every
5411 * scheduler tick (TICK_NSEC).
5412 *
5413 * This function computes a decaying average:
5414 *
5415 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5416 *
5417 * Because of NOHZ it might not get called on every tick which gives need for
5418 * the @pending_updates argument.
5419 *
5420 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5421 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5422 * = A * (A * load[i]_n-2 + B) + B
5423 * = A * (A * (A * load[i]_n-3 + B) + B) + B
5424 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5425 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5426 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5427 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5428 *
5429 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5430 * any change in load would have resulted in the tick being turned back on.
5431 *
5432 * For regular NOHZ, this reduces to:
5433 *
5434 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5435 *
5436 * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
5437 * term.
5438 */
5439static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5440 unsigned long pending_updates)
5441{
5442 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5443 int i, scale;
5444
5445 this_rq->nr_load_updates++;
5446
5447 /* Update our load: */
5448 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5449 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5450 unsigned long old_load, new_load;
5451
5452 /* scale is effectively 1 << i now, and >> i divides by scale */
5453
5454 old_load = this_rq->cpu_load[i];
5455#ifdef CONFIG_NO_HZ_COMMON
5456 old_load = decay_load_missed(old_load, pending_updates - 1, i);
5457 if (tickless_load) {
5458 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5459 /*
5460 * old_load can never be a negative value because a
5461 * decayed tickless_load cannot be greater than the
5462 * original tickless_load.
5463 */
5464 old_load += tickless_load;
5465 }
5466#endif
5467 new_load = this_load;
5468 /*
5469 * Round up the averaging division if load is increasing. This
5470 * prevents us from getting stuck on 9 if the load is 10, for
5471 * example.
5472 */
5473 if (new_load > old_load)
5474 new_load += scale - 1;
5475
5476 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5477 }
5478}
5479
5480/* Used instead of source_load when we know the type == 0 */
5481static unsigned long weighted_cpuload(struct rq *rq)
5482{ 5370{
5483 return cfs_rq_runnable_load_avg(&rq->cfs); 5371 return cfs_rq_runnable_load_avg(&rq->cfs);
5484} 5372}
5485 5373
5486#ifdef CONFIG_NO_HZ_COMMON
5487/*
5488 * There is no sane way to deal with nohz on smp when using jiffies because the
5489 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5490 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5491 *
5492 * Therefore we need to avoid the delta approach from the regular tick when
5493 * possible since that would seriously skew the load calculation. This is why we
5494 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5495 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5496 * loop exit, nohz_idle_balance, nohz full exit...)
5497 *
5498 * This means we might still be one tick off for nohz periods.
5499 */
5500
5501static void cpu_load_update_nohz(struct rq *this_rq,
5502 unsigned long curr_jiffies,
5503 unsigned long load)
5504{
5505 unsigned long pending_updates;
5506
5507 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5508 if (pending_updates) {
5509 this_rq->last_load_update_tick = curr_jiffies;
5510 /*
5511 * In the regular NOHZ case, we were idle, this means load 0.
5512 * In the NOHZ_FULL case, we were non-idle, we should consider
5513 * its weighted load.
5514 */
5515 cpu_load_update(this_rq, load, pending_updates);
5516 }
5517}
5518
5519/*
5520 * Called from nohz_idle_balance() to update the load ratings before doing the
5521 * idle balance.
5522 */
5523static void cpu_load_update_idle(struct rq *this_rq)
5524{
5525 /*
5526 * bail if there's load or we're actually up-to-date.
5527 */
5528 if (weighted_cpuload(this_rq))
5529 return;
5530
5531 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5532}
5533
5534/*
5535 * Record CPU load on nohz entry so we know the tickless load to account
5536 * on nohz exit. cpu_load[0] happens then to be updated more frequently
5537 * than other cpu_load[idx] but it should be fine as cpu_load readers
5538 * shouldn't rely into synchronized cpu_load[*] updates.
5539 */
5540void cpu_load_update_nohz_start(void)
5541{
5542 struct rq *this_rq = this_rq();
5543
5544 /*
5545 * This is all lockless but should be fine. If weighted_cpuload changes
5546 * concurrently we'll exit nohz. And cpu_load write can race with
5547 * cpu_load_update_idle() but both updater would be writing the same.
5548 */
5549 this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5550}
5551
5552/*
5553 * Account the tickless load in the end of a nohz frame.
5554 */
5555void cpu_load_update_nohz_stop(void)
5556{
5557 unsigned long curr_jiffies = READ_ONCE(jiffies);
5558 struct rq *this_rq = this_rq();
5559 unsigned long load;
5560 struct rq_flags rf;
5561
5562 if (curr_jiffies == this_rq->last_load_update_tick)
5563 return;
5564
5565 load = weighted_cpuload(this_rq);
5566 rq_lock(this_rq, &rf);
5567 update_rq_clock(this_rq);
5568 cpu_load_update_nohz(this_rq, curr_jiffies, load);
5569 rq_unlock(this_rq, &rf);
5570}
5571#else /* !CONFIG_NO_HZ_COMMON */
5572static inline void cpu_load_update_nohz(struct rq *this_rq,
5573 unsigned long curr_jiffies,
5574 unsigned long load) { }
5575#endif /* CONFIG_NO_HZ_COMMON */
5576
5577static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5578{
5579#ifdef CONFIG_NO_HZ_COMMON
5580 /* See the mess around cpu_load_update_nohz(). */
5581 this_rq->last_load_update_tick = READ_ONCE(jiffies);
5582#endif
5583 cpu_load_update(this_rq, load, 1);
5584}
5585
5586/*
5587 * Called from scheduler_tick()
5588 */
5589void cpu_load_update_active(struct rq *this_rq)
5590{
5591 unsigned long load = weighted_cpuload(this_rq);
5592
5593 if (tick_nohz_tick_stopped())
5594 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5595 else
5596 cpu_load_update_periodic(this_rq, load);
5597}
5598
5599/*
5600 * Return a low guess at the load of a migration-source CPU weighted
5601 * according to the scheduling class and "nice" value.
5602 *
5603 * We want to under-estimate the load of migration sources, to
5604 * balance conservatively.
5605 */
5606static unsigned long source_load(int cpu, int type)
5607{
5608 struct rq *rq = cpu_rq(cpu);
5609 unsigned long total = weighted_cpuload(rq);
5610
5611 if (type == 0 || !sched_feat(LB_BIAS))
5612 return total;
5613
5614 return min(rq->cpu_load[type-1], total);
5615}
5616
5617/*
5618 * Return a high guess at the load of a migration-target CPU weighted
5619 * according to the scheduling class and "nice" value.
5620 */
5621static unsigned long target_load(int cpu, int type)
5622{
5623 struct rq *rq = cpu_rq(cpu);
5624 unsigned long total = weighted_cpuload(rq);
5625
5626 if (type == 0 || !sched_feat(LB_BIAS))
5627 return total;
5628
5629 return max(rq->cpu_load[type-1], total);
5630}
5631
5632static unsigned long capacity_of(int cpu) 5374static unsigned long capacity_of(int cpu)
5633{ 5375{
5634 return cpu_rq(cpu)->cpu_capacity; 5376 return cpu_rq(cpu)->cpu_capacity;
@@ -5638,7 +5380,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
5638{ 5380{
5639 struct rq *rq = cpu_rq(cpu); 5381 struct rq *rq = cpu_rq(cpu);
5640 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); 5382 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5641 unsigned long load_avg = weighted_cpuload(rq); 5383 unsigned long load_avg = cpu_runnable_load(rq);
5642 5384
5643 if (nr_running) 5385 if (nr_running)
5644 return load_avg / nr_running; 5386 return load_avg / nr_running;
@@ -5736,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5736 s64 this_eff_load, prev_eff_load; 5478 s64 this_eff_load, prev_eff_load;
5737 unsigned long task_load; 5479 unsigned long task_load;
5738 5480
5739 this_eff_load = target_load(this_cpu, sd->wake_idx); 5481 this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
5740 5482
5741 if (sync) { 5483 if (sync) {
5742 unsigned long current_load = task_h_load(current); 5484 unsigned long current_load = task_h_load(current);
@@ -5754,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5754 this_eff_load *= 100; 5496 this_eff_load *= 100;
5755 this_eff_load *= capacity_of(prev_cpu); 5497 this_eff_load *= capacity_of(prev_cpu);
5756 5498
5757 prev_eff_load = source_load(prev_cpu, sd->wake_idx); 5499 prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
5758 prev_eff_load -= task_load; 5500 prev_eff_load -= task_load;
5759 if (sched_feat(WA_BIAS)) 5501 if (sched_feat(WA_BIAS))
5760 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5502 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5815,14 +5557,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5815 unsigned long this_runnable_load = ULONG_MAX; 5557 unsigned long this_runnable_load = ULONG_MAX;
5816 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; 5558 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5817 unsigned long most_spare = 0, this_spare = 0; 5559 unsigned long most_spare = 0, this_spare = 0;
5818 int load_idx = sd->forkexec_idx;
5819 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; 5560 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5820 unsigned long imbalance = scale_load_down(NICE_0_LOAD) * 5561 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5821 (sd->imbalance_pct-100) / 100; 5562 (sd->imbalance_pct-100) / 100;
5822 5563
5823 if (sd_flag & SD_BALANCE_WAKE)
5824 load_idx = sd->wake_idx;
5825
5826 do { 5564 do {
5827 unsigned long load, avg_load, runnable_load; 5565 unsigned long load, avg_load, runnable_load;
5828 unsigned long spare_cap, max_spare_cap; 5566 unsigned long spare_cap, max_spare_cap;
@@ -5831,7 +5569,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5831 5569
5832 /* Skip over this group if it has no CPUs allowed */ 5570 /* Skip over this group if it has no CPUs allowed */
5833 if (!cpumask_intersects(sched_group_span(group), 5571 if (!cpumask_intersects(sched_group_span(group),
5834 &p->cpus_allowed)) 5572 p->cpus_ptr))
5835 continue; 5573 continue;
5836 5574
5837 local_group = cpumask_test_cpu(this_cpu, 5575 local_group = cpumask_test_cpu(this_cpu,
@@ -5846,12 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5846 max_spare_cap = 0; 5584 max_spare_cap = 0;
5847 5585
5848 for_each_cpu(i, sched_group_span(group)) { 5586 for_each_cpu(i, sched_group_span(group)) {
5849 /* Bias balancing toward CPUs of our domain */ 5587 load = cpu_runnable_load(cpu_rq(i));
5850 if (local_group)
5851 load = source_load(i, load_idx);
5852 else
5853 load = target_load(i, load_idx);
5854
5855 runnable_load += load; 5588 runnable_load += load;
5856 5589
5857 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); 5590 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
@@ -5963,7 +5696,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5963 return cpumask_first(sched_group_span(group)); 5696 return cpumask_first(sched_group_span(group));
5964 5697
5965 /* Traverse only the allowed CPUs */ 5698 /* Traverse only the allowed CPUs */
5966 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { 5699 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5967 if (available_idle_cpu(i)) { 5700 if (available_idle_cpu(i)) {
5968 struct rq *rq = cpu_rq(i); 5701 struct rq *rq = cpu_rq(i);
5969 struct cpuidle_state *idle = idle_get_state(rq); 5702 struct cpuidle_state *idle = idle_get_state(rq);
@@ -5987,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5987 shallowest_idle_cpu = i; 5720 shallowest_idle_cpu = i;
5988 } 5721 }
5989 } else if (shallowest_idle_cpu == -1) { 5722 } else if (shallowest_idle_cpu == -1) {
5990 load = weighted_cpuload(cpu_rq(i)); 5723 load = cpu_runnable_load(cpu_rq(i));
5991 if (load < min_load) { 5724 if (load < min_load) {
5992 min_load = load; 5725 min_load = load;
5993 least_loaded_cpu = i; 5726 least_loaded_cpu = i;
@@ -6003,7 +5736,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6003{ 5736{
6004 int new_cpu = cpu; 5737 int new_cpu = cpu;
6005 5738
6006 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) 5739 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
6007 return prev_cpu; 5740 return prev_cpu;
6008 5741
6009 /* 5742 /*
@@ -6120,7 +5853,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
6120 if (!test_idle_cores(target, false)) 5853 if (!test_idle_cores(target, false))
6121 return -1; 5854 return -1;
6122 5855
6123 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); 5856 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6124 5857
6125 for_each_cpu_wrap(core, cpus, target) { 5858 for_each_cpu_wrap(core, cpus, target) {
6126 bool idle = true; 5859 bool idle = true;
@@ -6154,7 +5887,7 @@ static int select_idle_smt(struct task_struct *p, int target)
6154 return -1; 5887 return -1;
6155 5888
6156 for_each_cpu(cpu, cpu_smt_mask(target)) { 5889 for_each_cpu(cpu, cpu_smt_mask(target)) {
6157 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 5890 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6158 continue; 5891 continue;
6159 if (available_idle_cpu(cpu)) 5892 if (available_idle_cpu(cpu))
6160 return cpu; 5893 return cpu;
@@ -6218,7 +5951,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
6218 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { 5951 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
6219 if (!--nr) 5952 if (!--nr)
6220 return -1; 5953 return -1;
6221 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 5954 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6222 continue; 5955 continue;
6223 if (available_idle_cpu(cpu)) 5956 if (available_idle_cpu(cpu))
6224 break; 5957 break;
@@ -6255,7 +5988,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6255 recent_used_cpu != target && 5988 recent_used_cpu != target &&
6256 cpus_share_cache(recent_used_cpu, target) && 5989 cpus_share_cache(recent_used_cpu, target) &&
6257 available_idle_cpu(recent_used_cpu) && 5990 available_idle_cpu(recent_used_cpu) &&
6258 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 5991 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6259 /* 5992 /*
6260 * Replace recent_used_cpu with prev as it is a potential 5993 * Replace recent_used_cpu with prev as it is a potential
6261 * candidate for the next wake: 5994 * candidate for the next wake:
@@ -6499,11 +6232,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6499static long 6232static long
6500compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) 6233compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6501{ 6234{
6502 long util, max_util, sum_util, energy = 0; 6235 unsigned int max_util, util_cfs, cpu_util, cpu_cap;
6236 unsigned long sum_util, energy = 0;
6237 struct task_struct *tsk;
6503 int cpu; 6238 int cpu;
6504 6239
6505 for (; pd; pd = pd->next) { 6240 for (; pd; pd = pd->next) {
6241 struct cpumask *pd_mask = perf_domain_span(pd);
6242
6243 /*
6244 * The energy model mandates all the CPUs of a performance
6245 * domain have the same capacity.
6246 */
6247 cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6506 max_util = sum_util = 0; 6248 max_util = sum_util = 0;
6249
6507 /* 6250 /*
6508 * The capacity state of CPUs of the current rd can be driven by 6251 * The capacity state of CPUs of the current rd can be driven by
6509 * CPUs of another rd if they belong to the same performance 6252 * CPUs of another rd if they belong to the same performance
@@ -6514,11 +6257,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6514 * it will not appear in its pd list and will not be accounted 6257 * it will not appear in its pd list and will not be accounted
6515 * by compute_energy(). 6258 * by compute_energy().
6516 */ 6259 */
6517 for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { 6260 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6518 util = cpu_util_next(cpu, p, dst_cpu); 6261 util_cfs = cpu_util_next(cpu, p, dst_cpu);
6519 util = schedutil_energy_util(cpu, util); 6262
6520 max_util = max(util, max_util); 6263 /*
6521 sum_util += util; 6264 * Busy time computation: utilization clamping is not
6265 * required since the ratio (sum_util / cpu_capacity)
6266 * is already enough to scale the EM reported power
6267 * consumption at the (eventually clamped) cpu_capacity.
6268 */
6269 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6270 ENERGY_UTIL, NULL);
6271
6272 /*
6273 * Performance domain frequency: utilization clamping
6274 * must be considered since it affects the selection
6275 * of the performance domain frequency.
6276 * NOTE: in case RT tasks are running, by default the
6277 * FREQUENCY_UTIL's utilization can be max OPP.
6278 */
6279 tsk = cpu == dst_cpu ? p : NULL;
6280 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6281 FREQUENCY_UTIL, tsk);
6282 max_util = max(max_util, cpu_util);
6522 } 6283 }
6523 6284
6524 energy += em_pd_energy(pd->em_pd, max_util, sum_util); 6285 energy += em_pd_energy(pd->em_pd, max_util, sum_util);
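
A stand-alone sketch of the reworked compute_energy() loop, with made-up utilization and clamp values standing in for schedutil_cpu_util() and a toy cost function standing in for em_pd_energy() (the real energy model is table-driven):

#include <stdio.h>

#define PD_CPUS 4

/* Made-up per-CPU CFS utilization and clamps for one performance domain. */
static const unsigned long util_cfs[PD_CPUS]   = {  100,  300,   50,    0 };
static const unsigned long uclamp_min[PD_CPUS] = {    0,  512,    0,    0 };
static const unsigned long uclamp_max[PD_CPUS] = { 1024, 1024, 1024, 1024 };
static const unsigned long cpu_cap = 1024;      /* same across the domain */

static unsigned long clampv(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        unsigned long max_util = 0, sum_util = 0;
        int cpu;

        for (cpu = 0; cpu < PD_CPUS; cpu++) {
                /* ENERGY_UTIL: busy time; the sum_util / cpu_cap ratio already
                 * scales the reported power, so no clamping is needed. */
                sum_util += util_cfs[cpu];

                /* FREQUENCY_UTIL: clamped, since it drives the domain's OPP. */
                unsigned long f = clampv(util_cfs[cpu],
                                         uclamp_min[cpu], uclamp_max[cpu]);
                if (f > max_util)
                        max_util = f;
        }

        /* Toy cost model: power grows with the selected OPP (max_util),
         * scaled by the fraction of time the domain is busy. */
        printf("sum_util=%lu max_util=%lu energy~=%lu\n",
               sum_util, max_util, max_util * sum_util / cpu_cap);
        return 0;
}
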
@@ -6601,7 +6362,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6601 int max_spare_cap_cpu = -1; 6362 int max_spare_cap_cpu = -1;
6602 6363
6603 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { 6364 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6604 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 6365 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6605 continue; 6366 continue;
6606 6367
6607 /* Skip CPUs that will be overutilized. */ 6368 /* Skip CPUs that will be overutilized. */
@@ -6690,7 +6451,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6690 } 6451 }
6691 6452
6692 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && 6453 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
6693 cpumask_test_cpu(cpu, &p->cpus_allowed); 6454 cpumask_test_cpu(cpu, p->cpus_ptr);
6694 } 6455 }
6695 6456
6696 rcu_read_lock(); 6457 rcu_read_lock();
@@ -7446,14 +7207,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7446 /* 7207 /*
7447 * We do not migrate tasks that are: 7208 * We do not migrate tasks that are:
7448 * 1) throttled_lb_pair, or 7209 * 1) throttled_lb_pair, or
7449 * 2) cannot be migrated to this CPU due to cpus_allowed, or 7210 * 2) cannot be migrated to this CPU due to cpus_ptr, or
7450 * 3) running (obviously), or 7211 * 3) running (obviously), or
7451 * 4) are cache-hot on their current CPU. 7212 * 4) are cache-hot on their current CPU.
7452 */ 7213 */
7453 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 7214 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7454 return 0; 7215 return 0;
7455 7216
7456 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { 7217 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7457 int cpu; 7218 int cpu;
7458 7219
7459 schedstat_inc(p->se.statistics.nr_failed_migrations_affine); 7220 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -7473,7 +7234,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7473 7234
7474 /* Prevent to re-select dst_cpu via env's CPUs: */ 7235 /* Prevent to re-select dst_cpu via env's CPUs: */
7475 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7236 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7476 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7237 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7477 env->flags |= LBF_DST_PINNED; 7238 env->flags |= LBF_DST_PINNED;
7478 env->new_dst_cpu = cpu; 7239 env->new_dst_cpu = cpu;
7479 break; 7240 break;
@@ -7559,7 +7320,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
7559static const unsigned int sched_nr_migrate_break = 32; 7320static const unsigned int sched_nr_migrate_break = 32;
7560 7321
7561/* 7322/*
7562 * detach_tasks() -- tries to detach up to imbalance weighted load from 7323 * detach_tasks() -- tries to detach up to imbalance runnable load from
7563 * busiest_rq, as part of a balancing operation within domain "sd". 7324 * busiest_rq, as part of a balancing operation within domain "sd".
7564 * 7325 *
7565 * Returns number of detached tasks if successful and 0 otherwise. 7326 * Returns number of detached tasks if successful and 0 otherwise.
@@ -7627,7 +7388,7 @@ static int detach_tasks(struct lb_env *env)
7627 7388
7628 /* 7389 /*
7629 * We only want to steal up to the prescribed amount of 7390 * We only want to steal up to the prescribed amount of
7630 * weighted load. 7391 * runnable load.
7631 */ 7392 */
7632 if (env->imbalance <= 0) 7393 if (env->imbalance <= 0)
7633 break; 7394 break;
@@ -7696,6 +7457,7 @@ static void attach_tasks(struct lb_env *env)
7696 rq_unlock(env->dst_rq, &rf); 7457 rq_unlock(env->dst_rq, &rf);
7697} 7458}
7698 7459
7460#ifdef CONFIG_NO_HZ_COMMON
7699static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) 7461static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7700{ 7462{
7701 if (cfs_rq->avg.load_avg) 7463 if (cfs_rq->avg.load_avg)
@@ -7723,6 +7485,19 @@ static inline bool others_have_blocked(struct rq *rq)
7723 return false; 7485 return false;
7724} 7486}
7725 7487
7488static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
7489{
7490 rq->last_blocked_load_update_tick = jiffies;
7491
7492 if (!has_blocked)
7493 rq->has_blocked_load = 0;
7494}
7495#else
7496static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
7497static inline bool others_have_blocked(struct rq *rq) { return false; }
7498static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
7499#endif
7500
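
The new update_blocked_load_status() helper plus the !CONFIG_NO_HZ_COMMON stubs above let both update_blocked_averages() variants further down drop their open-coded #ifdef blocks. The pattern in miniature, as a user-space sketch with time(NULL) standing in for jiffies:

#include <stdio.h>
#include <time.h>

#define HAVE_NOHZ 1             /* flip to 0 to mimic !CONFIG_NO_HZ_COMMON */

#if HAVE_NOHZ
static time_t last_blocked_load_update_tick;
static int has_blocked_load = 1;

static inline void update_blocked_load_status(int has_blocked)
{
        last_blocked_load_update_tick = time(NULL);   /* the jiffies stamp */
        if (!has_blocked)
                has_blocked_load = 0;
}
#else
/* No-op stub: callers need no #ifdef of their own. */
static inline void update_blocked_load_status(int has_blocked) { (void)has_blocked; }
#endif

int main(void)
{
        update_blocked_load_status(0);
#if HAVE_NOHZ
        printf("has_blocked_load=%d\n", has_blocked_load);
#endif
        return 0;
}
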
7726#ifdef CONFIG_FAIR_GROUP_SCHED 7501#ifdef CONFIG_FAIR_GROUP_SCHED
7727 7502
7728static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 7503static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7788,11 +7563,7 @@ static void update_blocked_averages(int cpu)
7788 if (others_have_blocked(rq)) 7563 if (others_have_blocked(rq))
7789 done = false; 7564 done = false;
7790 7565
7791#ifdef CONFIG_NO_HZ_COMMON 7566 update_blocked_load_status(rq, !done);
7792 rq->last_blocked_load_update_tick = jiffies;
7793 if (done)
7794 rq->has_blocked_load = 0;
7795#endif
7796 rq_unlock_irqrestore(rq, &rf); 7567 rq_unlock_irqrestore(rq, &rf);
7797} 7568}
7798 7569
@@ -7858,11 +7629,7 @@ static inline void update_blocked_averages(int cpu)
7858 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); 7629 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
7859 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); 7630 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
7860 update_irq_load_avg(rq, 0); 7631 update_irq_load_avg(rq, 0);
7861#ifdef CONFIG_NO_HZ_COMMON 7632 update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
7862 rq->last_blocked_load_update_tick = jiffies;
7863 if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
7864 rq->has_blocked_load = 0;
7865#endif
7866 rq_unlock_irqrestore(rq, &rf); 7633 rq_unlock_irqrestore(rq, &rf);
7867} 7634}
7868 7635
@@ -7880,7 +7647,6 @@ static unsigned long task_h_load(struct task_struct *p)
7880struct sg_lb_stats { 7647struct sg_lb_stats {
7881 unsigned long avg_load; /*Avg load across the CPUs of the group */ 7648 unsigned long avg_load; /*Avg load across the CPUs of the group */
7882 unsigned long group_load; /* Total load over the CPUs of the group */ 7649 unsigned long group_load; /* Total load over the CPUs of the group */
7883 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
7884 unsigned long load_per_task; 7650 unsigned long load_per_task;
7885 unsigned long group_capacity; 7651 unsigned long group_capacity;
7886 unsigned long group_util; /* Total utilization of the group */ 7652 unsigned long group_util; /* Total utilization of the group */
@@ -7934,38 +7700,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7934 }; 7700 };
7935} 7701}
7936 7702
7937/**
7938 * get_sd_load_idx - Obtain the load index for a given sched domain.
7939 * @sd: The sched_domain whose load_idx is to be obtained.
7940 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
7941 *
7942 * Return: The load index.
7943 */
7944static inline int get_sd_load_idx(struct sched_domain *sd,
7945 enum cpu_idle_type idle)
7946{
7947 int load_idx;
7948
7949 switch (idle) {
7950 case CPU_NOT_IDLE:
7951 load_idx = sd->busy_idx;
7952 break;
7953
7954 case CPU_NEWLY_IDLE:
7955 load_idx = sd->newidle_idx;
7956 break;
7957 default:
7958 load_idx = sd->idle_idx;
7959 break;
7960 }
7961
7962 return load_idx;
7963}
7964
7965static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) 7703static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
7966{ 7704{
7967 struct rq *rq = cpu_rq(cpu); 7705 struct rq *rq = cpu_rq(cpu);
7968 unsigned long max = arch_scale_cpu_capacity(sd, cpu); 7706 unsigned long max = arch_scale_cpu_capacity(cpu);
7969 unsigned long used, free; 7707 unsigned long used, free;
7970 unsigned long irq; 7708 unsigned long irq;
7971 7709
@@ -7990,7 +7728,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7990 unsigned long capacity = scale_rt_capacity(sd, cpu); 7728 unsigned long capacity = scale_rt_capacity(sd, cpu);
7991 struct sched_group *sdg = sd->groups; 7729 struct sched_group *sdg = sd->groups;
7992 7730
7993 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); 7731 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
7994 7732
7995 if (!capacity) 7733 if (!capacity)
7996 capacity = 1; 7734 capacity = 1;
@@ -8100,7 +7838,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8100 7838
8101/* 7839/*
8102 * Group imbalance indicates (and tries to solve) the problem where balancing 7840 * Group imbalance indicates (and tries to solve) the problem where balancing
8103 * groups is inadequate due to ->cpus_allowed constraints. 7841 * groups is inadequate due to ->cpus_ptr constraints.
8104 * 7842 *
8105 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a 7843 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
8106 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. 7844 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
@@ -8250,9 +7988,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8250 struct sg_lb_stats *sgs, 7988 struct sg_lb_stats *sgs,
8251 int *sg_status) 7989 int *sg_status)
8252{ 7990{
8253 int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8254 int load_idx = get_sd_load_idx(env->sd, env->idle);
8255 unsigned long load;
8256 int i, nr_running; 7991 int i, nr_running;
8257 7992
8258 memset(sgs, 0, sizeof(*sgs)); 7993 memset(sgs, 0, sizeof(*sgs));
@@ -8263,13 +7998,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8263 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) 7998 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8264 env->flags |= LBF_NOHZ_AGAIN; 7999 env->flags |= LBF_NOHZ_AGAIN;
8265 8000
8266 /* Bias balancing toward CPUs of our domain: */ 8001 sgs->group_load += cpu_runnable_load(rq);
8267 if (local_group)
8268 load = target_load(i, load_idx);
8269 else
8270 load = source_load(i, load_idx);
8271
8272 sgs->group_load += load;
8273 sgs->group_util += cpu_util(i); 8002 sgs->group_util += cpu_util(i);
8274 sgs->sum_nr_running += rq->cfs.h_nr_running; 8003 sgs->sum_nr_running += rq->cfs.h_nr_running;
8275 8004
@@ -8284,7 +8013,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8284 sgs->nr_numa_running += rq->nr_numa_running; 8013 sgs->nr_numa_running += rq->nr_numa_running;
8285 sgs->nr_preferred_running += rq->nr_preferred_running; 8014 sgs->nr_preferred_running += rq->nr_preferred_running;
8286#endif 8015#endif
8287 sgs->sum_weighted_load += weighted_cpuload(rq);
8288 /* 8016 /*
8289 * No need to call idle_cpu() if nr_running is not 0 8017 * No need to call idle_cpu() if nr_running is not 0
8290 */ 8018 */
@@ -8303,7 +8031,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8303 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; 8031 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8304 8032
8305 if (sgs->sum_nr_running) 8033 if (sgs->sum_nr_running)
8306 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 8034 sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
8307 8035
8308 sgs->group_weight = group->group_weight; 8036 sgs->group_weight = group->group_weight;
8309 8037
@@ -8517,8 +8245,12 @@ next_group:
8517 8245
8518 /* Update over-utilization (tipping point, U >= 0) indicator */ 8246 /* Update over-utilization (tipping point, U >= 0) indicator */
8519 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); 8247 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
8248 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
8520 } else if (sg_status & SG_OVERUTILIZED) { 8249 } else if (sg_status & SG_OVERUTILIZED) {
8521 WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); 8250 struct root_domain *rd = env->dst_rq->rd;
8251
8252 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
8253 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
8522 } 8254 }
8523} 8255}
8524 8256
@@ -8724,7 +8456,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8724 * find_busiest_group - Returns the busiest group within the sched_domain 8456 * find_busiest_group - Returns the busiest group within the sched_domain
8725 * if there is an imbalance. 8457 * if there is an imbalance.
8726 * 8458 *
8727 * Also calculates the amount of weighted load which should be moved 8459 * Also calculates the amount of runnable load which should be moved
8728 * to restore balance. 8460 * to restore balance.
8729 * 8461 *
8730 * @env: The load balancing environment. 8462 * @env: The load balancing environment.
@@ -8769,7 +8501,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8769 /* 8501 /*
8770 * If the busiest group is imbalanced the below checks don't 8502 * If the busiest group is imbalanced the below checks don't
8771 * work because they assume all things are equal, which typically 8503 * work because they assume all things are equal, which typically
8772 * isn't true due to cpus_allowed constraints and the like. 8504 * isn't true due to cpus_ptr constraints and the like.
8773 */ 8505 */
8774 if (busiest->group_type == group_imbalanced) 8506 if (busiest->group_type == group_imbalanced)
8775 goto force_balance; 8507 goto force_balance;
@@ -8843,7 +8575,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8843 int i; 8575 int i;
8844 8576
8845 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 8577 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8846 unsigned long capacity, wl; 8578 unsigned long capacity, load;
8847 enum fbq_type rt; 8579 enum fbq_type rt;
8848 8580
8849 rq = cpu_rq(i); 8581 rq = cpu_rq(i);
@@ -8897,30 +8629,30 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8897 rq->nr_running == 1) 8629 rq->nr_running == 1)
8898 continue; 8630 continue;
8899 8631
8900 wl = weighted_cpuload(rq); 8632 load = cpu_runnable_load(rq);
8901 8633
8902 /* 8634 /*
8903 * When comparing with imbalance, use weighted_cpuload() 8635 * When comparing with imbalance, use cpu_runnable_load()
8904 * which is not scaled with the CPU capacity. 8636 * which is not scaled with the CPU capacity.
8905 */ 8637 */
8906 8638
8907 if (rq->nr_running == 1 && wl > env->imbalance && 8639 if (rq->nr_running == 1 && load > env->imbalance &&
8908 !check_cpu_capacity(rq, env->sd)) 8640 !check_cpu_capacity(rq, env->sd))
8909 continue; 8641 continue;
8910 8642
8911 /* 8643 /*
8912 * For the load comparisons with the other CPU's, consider 8644 * For the load comparisons with the other CPU's, consider
8913 * the weighted_cpuload() scaled with the CPU capacity, so 8645 * the cpu_runnable_load() scaled with the CPU capacity, so
8914 * that the load can be moved away from the CPU that is 8646 * that the load can be moved away from the CPU that is
8915 * potentially running at a lower capacity. 8647 * potentially running at a lower capacity.
8916 * 8648 *
8917 * Thus we're looking for max(wl_i / capacity_i), crosswise 8649 * Thus we're looking for max(load_i / capacity_i), crosswise
8918 * multiplication to rid ourselves of the division works out 8650 * multiplication to rid ourselves of the division works out
8919 * to: wl_i * capacity_j > wl_j * capacity_i; where j is 8651 * to: load_i * capacity_j > load_j * capacity_i; where j is
8920 * our previous maximum. 8652 * our previous maximum.
8921 */ 8653 */
8922 if (wl * busiest_capacity > busiest_load * capacity) { 8654 if (load * busiest_capacity > busiest_load * capacity) {
8923 busiest_load = wl; 8655 busiest_load = load;
8924 busiest_capacity = capacity; 8656 busiest_capacity = capacity;
8925 busiest = rq; 8657 busiest = rq;
8926 } 8658 }
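
The crosswise-multiplication comment above deserves a worked example: with integer arithmetic, dividing load by capacity truncates and can no longer rank CPUs, while the cross-multiplied form keeps full precision (numbers made up):

#include <stdio.h>

int main(void)
{
        unsigned long load_i = 300, cap_i = 1024;   /* candidate CPU */
        unsigned long load_j = 290, cap_j = 1000;   /* current busiest */

        /* Integer division truncates both ratios to 0 and cannot rank them. */
        printf("load_i/cap_i=%lu  load_j/cap_j=%lu\n",
               load_i / cap_i, load_j / cap_j);

        /* Cross-multiplication keeps full precision:
         * load_i/cap_i > load_j/cap_j  <=>  load_i*cap_j > load_j*cap_i */
        if (load_i * cap_j > load_j * cap_i)
                printf("CPU i becomes busiest (%lu > %lu)\n",
                       load_i * cap_j, load_j * cap_i);
        else
                printf("CPU j stays busiest\n");
        return 0;
}
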
@@ -9211,7 +8943,7 @@ more_balance:
9211 * if the curr task on busiest CPU can't be 8943 * if the curr task on busiest CPU can't be
9212 * moved to this_cpu: 8944 * moved to this_cpu:
9213 */ 8945 */
9214 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 8946 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
9215 raw_spin_unlock_irqrestore(&busiest->lock, 8947 raw_spin_unlock_irqrestore(&busiest->lock,
9216 flags); 8948 flags);
9217 env.flags |= LBF_ALL_PINNED; 8949 env.flags |= LBF_ALL_PINNED;
@@ -9880,7 +9612,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9880 9612
9881 rq_lock_irqsave(rq, &rf); 9613 rq_lock_irqsave(rq, &rf);
9882 update_rq_clock(rq); 9614 update_rq_clock(rq);
9883 cpu_load_update_idle(rq);
9884 rq_unlock_irqrestore(rq, &rf); 9615 rq_unlock_irqrestore(rq, &rf);
9885 9616
9886 if (flags & NOHZ_BALANCE_KICK) 9617 if (flags & NOHZ_BALANCE_KICK)
@@ -10691,6 +10422,10 @@ const struct sched_class fair_sched_class = {
10691#ifdef CONFIG_FAIR_GROUP_SCHED 10422#ifdef CONFIG_FAIR_GROUP_SCHED
10692 .task_change_group = task_change_group_fair, 10423 .task_change_group = task_change_group_fair,
10693#endif 10424#endif
10425
10426#ifdef CONFIG_UCLAMP_TASK
10427 .uclamp_enabled = 1,
10428#endif
10694}; 10429};
10695 10430
10696#ifdef CONFIG_SCHED_DEBUG 10431#ifdef CONFIG_SCHED_DEBUG
@@ -10738,3 +10473,83 @@ __init void init_sched_fair_class(void)
10738#endif /* SMP */ 10473#endif /* SMP */
10739 10474
10740} 10475}
10476
10477/*
10478 * Helper functions to facilitate extracting info from tracepoints.
10479 */
10480
10481const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
10482{
10483#ifdef CONFIG_SMP
10484 return cfs_rq ? &cfs_rq->avg : NULL;
10485#else
10486 return NULL;
10487#endif
10488}
10489EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
10490
10491char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
10492{
10493 if (!cfs_rq) {
10494 if (str)
10495 strlcpy(str, "(null)", len);
10496 else
10497 return NULL;
10498 }
10499
10500 cfs_rq_tg_path(cfs_rq, str, len);
10501 return str;
10502}
10503EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
10504
10505int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
10506{
10507 return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
10508}
10509EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
10510
10511const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
10512{
10513#ifdef CONFIG_SMP
10514 return rq ? &rq->avg_rt : NULL;
10515#else
10516 return NULL;
10517#endif
10518}
10519EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
10520
10521const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
10522{
10523#ifdef CONFIG_SMP
10524 return rq ? &rq->avg_dl : NULL;
10525#else
10526 return NULL;
10527#endif
10528}
10529EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
10530
10531const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
10532{
10533#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
10534 return rq ? &rq->avg_irq : NULL;
10535#else
10536 return NULL;
10537#endif
10538}
10539EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
10540
10541int sched_trace_rq_cpu(struct rq *rq)
10542{
10543 return rq ? cpu_of(rq) : -1;
10544}
10545EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
10546
10547const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
10548{
10549#ifdef CONFIG_SMP
10550 return rd ? rd->span : NULL;
10551#else
10552 return NULL;
10553#endif
10554}
10555EXPORT_SYMBOL_GPL(sched_trace_rd_span);
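
The EXPORT_SYMBOL_GPL() helpers above exist so that modules attaching to the new bare tracepoints can decode the otherwise opaque pointers. A hypothetical consumer sketch, assuming the register_trace_pelt_cfs_tp()/unregister_trace_pelt_cfs_tp() handlers that DECLARE_TRACE() generates for trace_pelt_cfs_tp() and the load_avg/util_avg fields of struct sched_avg:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical out-of-tree sketch, not part of this series. */
#include <linux/module.h>
#include <trace/events/sched.h>

static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
        char path[64];
        const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);

        if (!avg)
                return;

        pr_info("cpu=%d path=%s load_avg=%lu util_avg=%lu\n",
                sched_trace_cfs_rq_cpu(cfs_rq),
                sched_trace_cfs_rq_path(cfs_rq, path, sizeof(path)),
                avg->load_avg, avg->util_avg);
}

static int __init pelt_probe_init(void)
{
        return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
        unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
        tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");
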
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 858589b83377..2410db5e9a35 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
39 39
40SCHED_FEAT(HRTICK, false) 40SCHED_FEAT(HRTICK, false)
41SCHED_FEAT(DOUBLE_TICK, false) 41SCHED_FEAT(DOUBLE_TICK, false)
42SCHED_FEAT(LB_BIAS, false)
43 42
44/* 43/*
45 * Decrement CPU capacity based on time not spent running tasks 44 * Decrement CPU capacity based on time not spent running tasks
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index befce29bd882..a96db50d40e0 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,6 +28,8 @@
28#include "sched.h" 28#include "sched.h"
29#include "pelt.h" 29#include "pelt.h"
30 30
31#include <trace/events/sched.h>
32
31/* 33/*
32 * Approximate: 34 * Approximate:
33 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) 35 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -265,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
265{ 267{
266 if (___update_load_sum(now, &se->avg, 0, 0, 0)) { 268 if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
267 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 269 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
270 trace_pelt_se_tp(se);
268 return 1; 271 return 1;
269 } 272 }
270 273
@@ -278,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
278 281
279 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 282 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
280 cfs_se_util_change(&se->avg); 283 cfs_se_util_change(&se->avg);
284 trace_pelt_se_tp(se);
281 return 1; 285 return 1;
282 } 286 }
283 287
@@ -292,6 +296,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
292 cfs_rq->curr != NULL)) { 296 cfs_rq->curr != NULL)) {
293 297
294 ___update_load_avg(&cfs_rq->avg, 1, 1); 298 ___update_load_avg(&cfs_rq->avg, 1, 1);
299 trace_pelt_cfs_tp(cfs_rq);
295 return 1; 300 return 1;
296 } 301 }
297 302
@@ -317,6 +322,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
317 running)) { 322 running)) {
318 323
319 ___update_load_avg(&rq->avg_rt, 1, 1); 324 ___update_load_avg(&rq->avg_rt, 1, 1);
325 trace_pelt_rt_tp(rq);
320 return 1; 326 return 1;
321 } 327 }
322 328
@@ -340,6 +346,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
340 running)) { 346 running)) {
341 347
342 ___update_load_avg(&rq->avg_dl, 1, 1); 348 ___update_load_avg(&rq->avg_dl, 1, 1);
349 trace_pelt_dl_tp(rq);
343 return 1; 350 return 1;
344 } 351 }
345 352
@@ -366,7 +373,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
366 * reflect the real amount of computation 373 * reflect the real amount of computation
367 */ 374 */
368 running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); 375 running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
369 running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); 376 running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
370 377
371 /* 378 /*
372 * We know the time that has been used by interrupt since last update 379 * We know the time that has been used by interrupt since last update
@@ -388,8 +395,10 @@ int update_irq_load_avg(struct rq *rq, u64 running)
388 1, 395 1,
389 1); 396 1);
390 397
391 if (ret) 398 if (ret) {
392 ___update_load_avg(&rq->avg_irq, 1, 1); 399 ___update_load_avg(&rq->avg_irq, 1, 1);
400 trace_pelt_irq_tp(rq);
401 }
393 402
394 return ret; 403 return ret;
395} 404}
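
The decay these tracepoints observe is the geometric series described at the top of this file ("val * y^n, where y^32 ~= 0.5"); a quick double-precision sanity check of that constant, outside the kernel:

#include <math.h>
#include <stdio.h>

/* build: cc decay.c -lm */
int main(void)
{
        /* PELT decay factor: chosen so a contribution halves every 32 periods. */
        double y = pow(0.5, 1.0 / 32.0);
        double val = 1024.0;            /* e.g. one fully-busy ~1 ms segment */
        int n;

        printf("y = %.6f, y^32 = %.6f\n", y, pow(y, 32));

        for (n = 0; n <= 96; n += 32)
                printf("after %2d periods: %.1f\n", n, val * pow(y, n));
        return 0;
}
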
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7489d5f56960..afff644da065 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
79 * Scale the elapsed time to reflect the real amount of 79 * Scale the elapsed time to reflect the real amount of
80 * computation 80 * computation
81 */ 81 */
82 delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); 82 delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
83 delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); 83 delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
84 84
85 rq->clock_pelt += delta; 85 rq->clock_pelt += delta;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1e6b909dca36..a532558a5176 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1614static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1614static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1615{ 1615{
1616 if (!task_running(rq, p) && 1616 if (!task_running(rq, p) &&
1617 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1617 cpumask_test_cpu(cpu, p->cpus_ptr))
1618 return 1; 1618 return 1;
1619 1619
1620 return 0; 1620 return 0;
@@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1751 * Also make sure that it wasn't scheduled on its rq. 1751 * Also make sure that it wasn't scheduled on its rq.
1752 */ 1752 */
1753 if (unlikely(task_rq(task) != rq || 1753 if (unlikely(task_rq(task) != rq ||
1754 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || 1754 !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
1755 task_running(rq, task) || 1755 task_running(rq, task) ||
1756 !rt_task(task) || 1756 !rt_task(task) ||
1757 !task_on_rq_queued(task))) { 1757 !task_on_rq_queued(task))) {
@@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = {
2400 .switched_to = switched_to_rt, 2400 .switched_to = switched_to_rt,
2401 2401
2402 .update_curr = update_curr_rt, 2402 .update_curr = update_curr_rt,
2403
2404#ifdef CONFIG_UCLAMP_TASK
2405 .uclamp_enabled = 1,
2406#endif
2403}; 2407};
2404 2408
2405#ifdef CONFIG_RT_GROUP_SCHED 2409#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index a26473674fb7..c529706bed11 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,7 +1,7 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2/* Generated by Documentation/scheduler/sched-pelt; do not modify. */ 2/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
3 3
4static const u32 runnable_avg_yN_inv[] = { 4static const u32 runnable_avg_yN_inv[] __maybe_unused = {
5 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 5 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
6 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 6 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
7 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, 7 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1ada0be..802b1f3405f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks;
96extern void calc_global_load_tick(struct rq *this_rq); 96extern void calc_global_load_tick(struct rq *this_rq);
97extern long calc_load_fold_active(struct rq *this_rq, long adjust); 97extern long calc_load_fold_active(struct rq *this_rq, long adjust);
98 98
99#ifdef CONFIG_SMP
100extern void cpu_load_update_active(struct rq *this_rq);
101#else
102static inline void cpu_load_update_active(struct rq *this_rq) { }
103#endif
104
105/* 99/*
106 * Helpers for converting nanosecond timing to jiffy resolution 100 * Helpers for converting nanosecond timing to jiffy resolution
107 */ 101 */
@@ -344,8 +338,10 @@ struct cfs_bandwidth {
344 u64 runtime_expires; 338 u64 runtime_expires;
345 int expires_seq; 339 int expires_seq;
346 340
347 short idle; 341 u8 idle;
348 short period_active; 342 u8 period_active;
343 u8 distribute_running;
344 u8 slack_started;
349 struct hrtimer period_timer; 345 struct hrtimer period_timer;
350 struct hrtimer slack_timer; 346 struct hrtimer slack_timer;
351 struct list_head throttled_cfs_rq; 347 struct list_head throttled_cfs_rq;
@@ -354,8 +350,6 @@ struct cfs_bandwidth {
354 int nr_periods; 350 int nr_periods;
355 int nr_throttled; 351 int nr_throttled;
356 u64 throttled_time; 352 u64 throttled_time;
357
358 bool distribute_running;
359#endif 353#endif
360}; 354};
361 355
@@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
797#endif 791#endif
798#endif /* CONFIG_SMP */ 792#endif /* CONFIG_SMP */
799 793
794#ifdef CONFIG_UCLAMP_TASK
795/*
796 * struct uclamp_bucket - Utilization clamp bucket
797 * @value: utilization clamp value for tasks on this clamp bucket
798 * @tasks: number of RUNNABLE tasks on this clamp bucket
799 *
800 * Keep track of how many tasks are RUNNABLE for a given utilization
801 * clamp value.
802 */
803struct uclamp_bucket {
804 unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
805 unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
806};
807
808/*
809 * struct uclamp_rq - rq's utilization clamp
810 * @value: currently active clamp values for a rq
811 * @bucket: utilization clamp buckets affecting a rq
812 *
813 * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
814 * A clamp value is affecting a rq when there is at least one task RUNNABLE
815 * (or actually running) with that value.
816 *
817 * There are up to UCLAMP_CNT possible different clamp values, currently there
818 * are only two: minimum utilization and maximum utilization.
819 *
820 * All utilization clamping values are MAX aggregated, since:
821 * - for util_min: we want to run the CPU at least at the max of the minimum
822 * utilization required by its currently RUNNABLE tasks.
823 * - for util_max: we want to allow the CPU to run up to the max of the
824 * maximum utilization allowed by its currently RUNNABLE tasks.
825 *
826 * Since on each system we expect only a limited number of different
827 * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
828 * the metrics required to compute all the per-rq utilization clamp values.
829 */
830struct uclamp_rq {
831 unsigned int value;
832 struct uclamp_bucket bucket[UCLAMP_BUCKETS];
833};
834#endif /* CONFIG_UCLAMP_TASK */
835
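
A stand-alone illustration of the bucket layout above: bits_per(SCHED_CAPACITY_SCALE) should come out to 11 for the default scale of 1024, leaving the rest of the word for the task refcount, so a bucket still fits in one unsigned long. The field widths are hard-coded here and assume a 64-bit long:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024
#define VALUE_BITS 11                    /* bits_per(SCHED_CAPACITY_SCALE) */
#define TASK_BITS  (64 - VALUE_BITS)     /* BITS_PER_LONG - value bits */

struct uclamp_bucket_sketch {
        unsigned long value : VALUE_BITS;   /* clamp value, 0..SCHED_CAPACITY_SCALE */
        unsigned long tasks : TASK_BITS;    /* RUNNABLE tasks refcounted on it */
};

int main(void)
{
        struct uclamp_bucket_sketch b = {
                .value = SCHED_CAPACITY_SCALE,
                .tasks = 3,
        };

        printf("sizeof(bucket) = %zu bytes\n", sizeof(b));
        printf("value=%lu tasks=%lu\n",
               (unsigned long)b.value, (unsigned long)b.tasks);
        return 0;
}
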
800/* 836/*
801 * This is the main, per-CPU runqueue data structure. 837 * This is the main, per-CPU runqueue data structure.
802 * 838 *
@@ -818,8 +854,6 @@ struct rq {
818 unsigned int nr_preferred_running; 854 unsigned int nr_preferred_running;
819 unsigned int numa_migrate_on; 855 unsigned int numa_migrate_on;
820#endif 856#endif
821 #define CPU_LOAD_IDX_MAX 5
822 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
823#ifdef CONFIG_NO_HZ_COMMON 857#ifdef CONFIG_NO_HZ_COMMON
824#ifdef CONFIG_SMP 858#ifdef CONFIG_SMP
825 unsigned long last_load_update_tick; 859 unsigned long last_load_update_tick;
@@ -830,11 +864,16 @@ struct rq {
830 atomic_t nohz_flags; 864 atomic_t nohz_flags;
831#endif /* CONFIG_NO_HZ_COMMON */ 865#endif /* CONFIG_NO_HZ_COMMON */
832 866
833 /* capture load from *all* tasks on this CPU: */
834 struct load_weight load;
835 unsigned long nr_load_updates; 867 unsigned long nr_load_updates;
836 u64 nr_switches; 868 u64 nr_switches;
837 869
870#ifdef CONFIG_UCLAMP_TASK
871 /* Utilization clamp values based on CPU's RUNNABLE tasks */
872 struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
873 unsigned int uclamp_flags;
874#define UCLAMP_FLAG_IDLE 0x01
875#endif
876
838 struct cfs_rq cfs; 877 struct cfs_rq cfs;
839 struct rt_rq rt; 878 struct rt_rq rt;
840 struct dl_rq dl; 879 struct dl_rq dl;
@@ -1649,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40];
1649struct sched_class { 1688struct sched_class {
1650 const struct sched_class *next; 1689 const struct sched_class *next;
1651 1690
1691#ifdef CONFIG_UCLAMP_TASK
1692 int uclamp_enabled;
1693#endif
1694
1652 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1695 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1653 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1696 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1654 void (*yield_task) (struct rq *rq); 1697 void (*yield_task) (struct rq *rq);
@@ -2222,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
2222static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} 2265static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2223#endif /* CONFIG_CPU_FREQ */ 2266#endif /* CONFIG_CPU_FREQ */
2224 2267
2268#ifdef CONFIG_UCLAMP_TASK
2269unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
2270
2271static __always_inline
2272unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
2273 struct task_struct *p)
2274{
2275 unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
2276 unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
2277
2278 if (p) {
2279 min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
2280 max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
2281 }
2282
2283 /*
2284 * Since CPU's {min,max}_util clamps are MAX aggregated considering
2285 * RUNNABLE tasks with _different_ clamps, we can end up with an
2286 * inversion. Fix it now when the clamps are applied.
2287 */
2288 if (unlikely(min_util >= max_util))
2289 return min_util;
2290
2291 return clamp(util, min_util, max_util);
2292}
2293
2294static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
2295{
2296 return uclamp_util_with(rq, util, NULL);
2297}
2298#else /* CONFIG_UCLAMP_TASK */
2299static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
2300 struct task_struct *p)
2301{
2302 return util;
2303}
2304static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
2305{
2306 return util;
2307}
2308#endif /* CONFIG_UCLAMP_TASK */
2309
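
The clamp application above is easy to exercise in isolation; a user-space mirror of the uclamp_util_with() logic with made-up rq and task clamp values, including the min >= max inversion case:

#include <stdio.h>

static unsigned int clampv(unsigned int v, unsigned int lo, unsigned int hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/* MAX-aggregate the rq and task clamps, then handle a possible
 * min/max inversion before clamping the utilization. */
static unsigned int uclamp_util_sketch(unsigned int rq_min, unsigned int rq_max,
                                       unsigned int tsk_min, unsigned int tsk_max,
                                       unsigned int util)
{
        unsigned int min_util = rq_min > tsk_min ? rq_min : tsk_min;
        unsigned int max_util = rq_max > tsk_max ? rq_max : tsk_max;

        if (min_util >= max_util)
                return min_util;

        return clampv(util, min_util, max_util);
}

int main(void)
{
        /* Ordinary case: util boosted up to the aggregated minimum. */
        printf("%u\n", uclamp_util_sketch(0, 512, 300, 1024, 100));   /* -> 300 */

        /* Inversion: aggregated min exceeds aggregated max, min wins. */
        printf("%u\n", uclamp_util_sketch(800, 200, 0, 100, 50));     /* -> 800 */
        return 0;
}
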
2225#ifdef arch_scale_freq_capacity 2310#ifdef arch_scale_freq_capacity
2226# ifndef arch_scale_freq_invariant 2311# ifndef arch_scale_freq_invariant
2227# define arch_scale_freq_invariant() true 2312# define arch_scale_freq_invariant() true
@@ -2237,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu)
2237} 2322}
2238#endif 2323#endif
2239 2324
2240#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2241/** 2325/**
2242 * enum schedutil_type - CPU utilization type 2326 * enum schedutil_type - CPU utilization type
2243 * @FREQUENCY_UTIL: Utilization used to select frequency 2327 * @FREQUENCY_UTIL: Utilization used to select frequency
@@ -2253,15 +2337,11 @@ enum schedutil_type {
2253 ENERGY_UTIL, 2337 ENERGY_UTIL,
2254}; 2338};
2255 2339
2256unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, 2340#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2257 unsigned long max, enum schedutil_type type);
2258
2259static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
2260{
2261 unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
2262 2341
2263 return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); 2342unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
2264} 2343 unsigned long max, enum schedutil_type type,
2344 struct task_struct *p);
2265 2345
2266static inline unsigned long cpu_bw_dl(struct rq *rq) 2346static inline unsigned long cpu_bw_dl(struct rq *rq)
2267{ 2347{
@@ -2290,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
2290 return READ_ONCE(rq->avg_rt.util_avg); 2370 return READ_ONCE(rq->avg_rt.util_avg);
2291} 2371}
2292#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ 2372#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2293static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) 2373static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
2374 unsigned long max, enum schedutil_type type,
2375 struct task_struct *p)
2294{ 2376{
2295 return cfs; 2377 return 0;
2296} 2378}
2297#endif 2379#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2298 2380
2299#ifdef CONFIG_HAVE_SCHED_AVG_IRQ 2381#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
2300static inline unsigned long cpu_util_irq(struct rq *rq) 2382static inline unsigned long cpu_util_irq(struct rq *rq)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f53f89df837d..f751ce0b783e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl,
1344 .imbalance_pct = 125, 1344 .imbalance_pct = 125,
1345 1345
1346 .cache_nice_tries = 0, 1346 .cache_nice_tries = 0,
1347 .busy_idx = 0,
1348 .idle_idx = 0,
1349 .newidle_idx = 0,
1350 .wake_idx = 0,
1351 .forkexec_idx = 0,
1352 1347
1353 .flags = 1*SD_LOAD_BALANCE 1348 .flags = 1*SD_LOAD_BALANCE
1354 | 1*SD_BALANCE_NEWIDLE 1349 | 1*SD_BALANCE_NEWIDLE
@@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl,
1400 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 1395 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1401 sd->imbalance_pct = 117; 1396 sd->imbalance_pct = 117;
1402 sd->cache_nice_tries = 1; 1397 sd->cache_nice_tries = 1;
1403 sd->busy_idx = 2;
1404 1398
1405#ifdef CONFIG_NUMA 1399#ifdef CONFIG_NUMA
1406 } else if (sd->flags & SD_NUMA) { 1400 } else if (sd->flags & SD_NUMA) {
1407 sd->cache_nice_tries = 2; 1401 sd->cache_nice_tries = 2;
1408 sd->busy_idx = 3;
1409 sd->idle_idx = 2;
1410 1402
1411 sd->flags &= ~SD_PREFER_SIBLING; 1403 sd->flags &= ~SD_PREFER_SIBLING;
1412 sd->flags |= SD_SERIALIZE; 1404 sd->flags |= SD_SERIALIZE;
@@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl,
1419#endif 1411#endif
1420 } else { 1412 } else {
1421 sd->cache_nice_tries = 1; 1413 sd->cache_nice_tries = 1;
1422 sd->busy_idx = 2;
1423 sd->idle_idx = 1;
1424 } 1414 }
1425 1415
1426 /* 1416 /*
@@ -1884,10 +1874,10 @@ static struct sched_domain_topology_level
1884 unsigned long cap; 1874 unsigned long cap;
1885 1875
1886 /* Is there any asymmetry? */ 1876 /* Is there any asymmetry? */
1887 cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); 1877 cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
1888 1878
1889 for_each_cpu(i, cpu_map) { 1879 for_each_cpu(i, cpu_map) {
1890 if (arch_scale_cpu_capacity(NULL, i) != cap) { 1880 if (arch_scale_cpu_capacity(i) != cap) {
1891 asym = true; 1881 asym = true;
1892 break; 1882 break;
1893 } 1883 }
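
With the sched_domain argument dropped, arch_scale_cpu_capacity() is keyed on the CPU alone, and the asymmetry check above reduces to comparing per-CPU capacities against the first one. A minimal stand-alone version, with made-up big.LITTLE-style capacities:

#include <stdbool.h>
#include <stdio.h>

/* Made-up per-CPU capacities for an asymmetric system. */
static const unsigned long cpu_capacity[] = { 446, 446, 1024, 1024 };
#define NR_CPUS (sizeof(cpu_capacity) / sizeof(cpu_capacity[0]))

int main(void)
{
        unsigned long cap = cpu_capacity[0];
        bool asym = false;
        size_t i;

        for (i = 1; i < NR_CPUS; i++) {
                if (cpu_capacity[i] != cap) {
                        asym = true;
                        break;
                }
        }

        printf("asymmetric capacities: %s\n", asym ? "yes" : "no");
        return 0;
}
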
@@ -1902,7 +1892,7 @@ static struct sched_domain_topology_level
1902 * to everyone. 1892 * to everyone.
1903 */ 1893 */
1904 for_each_cpu(i, cpu_map) { 1894 for_each_cpu(i, cpu_map) {
1905 unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); 1895 unsigned long max_capacity = arch_scale_cpu_capacity(i);
1906 int tl_id = 0; 1896 int tl_id = 0;
1907 1897
1908 for_each_sd_topology(tl) { 1898 for_each_sd_topology(tl) {
@@ -1912,7 +1902,7 @@ static struct sched_domain_topology_level
1912 for_each_cpu_and(j, tl->mask(i), cpu_map) { 1902 for_each_cpu_and(j, tl->mask(i), cpu_map) {
1913 unsigned long capacity; 1903 unsigned long capacity;
1914 1904
1915 capacity = arch_scale_cpu_capacity(NULL, j); 1905 capacity = arch_scale_cpu_capacity(j);
1916 1906
1917 if (capacity <= max_capacity) 1907 if (capacity <= max_capacity)
1918 continue; 1908 continue;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index fa0f9adfb752..c1e566a114ca 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -118,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
118 bookmark.func = NULL; 118 bookmark.func = NULL;
119 INIT_LIST_HEAD(&bookmark.entry); 119 INIT_LIST_HEAD(&bookmark.entry);
120 120
121 spin_lock_irqsave(&wq_head->lock, flags); 121 do {
122 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
123 spin_unlock_irqrestore(&wq_head->lock, flags);
124
125 while (bookmark.flags & WQ_FLAG_BOOKMARK) {
126 spin_lock_irqsave(&wq_head->lock, flags); 122 spin_lock_irqsave(&wq_head->lock, flags);
127 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, 123 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
128 wake_flags, key, &bookmark); 124 wake_flags, key, &bookmark);
129 spin_unlock_irqrestore(&wq_head->lock, flags); 125 spin_unlock_irqrestore(&wq_head->lock, flags);
130 } 126 } while (bookmark.flags & WQ_FLAG_BOOKMARK);
131} 127}
132 128
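
The refactor above folds the duplicated lock/scan/unlock sequence into a single do/while that keeps resuming while the bookmark flag is set. The shape of the change, with the wait-queue specifics replaced by a toy bounded scan:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* build: cc -pthread dedup.c */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int remaining = 3;       /* stand-in for a bookmarked scan that resumes */

/* One bounded pass over the "queue"; returns true while more work is left,
 * mirroring the WQ_FLAG_BOOKMARK check. */
static bool wake_pass(void)
{
        printf("pass, %d chunk(s) left\n", remaining);
        return --remaining > 0;
}

int main(void)
{
        bool more;

        /* Before: the locked section appeared once ahead of a while loop and
         * once inside it. After: a single do/while around the locked section. */
        do {
                pthread_mutex_lock(&lock);
                more = wake_pass();
                pthread_mutex_unlock(&lock);
        } while (more);

        return 0;
}
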
133/** 129/**
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1beca96fb625..1c1ad1e14f21 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -452,6 +452,22 @@ static struct ctl_table kern_table[] = {
452 .mode = 0644, 452 .mode = 0644,
453 .proc_handler = sched_rr_handler, 453 .proc_handler = sched_rr_handler,
454 }, 454 },
455#ifdef CONFIG_UCLAMP_TASK
456 {
457 .procname = "sched_util_clamp_min",
458 .data = &sysctl_sched_uclamp_util_min,
459 .maxlen = sizeof(unsigned int),
460 .mode = 0644,
461 .proc_handler = sysctl_sched_uclamp_handler,
462 },
463 {
464 .procname = "sched_util_clamp_max",
465 .data = &sysctl_sched_uclamp_util_max,
466 .maxlen = sizeof(unsigned int),
467 .mode = 0644,
468 .proc_handler = sysctl_sched_uclamp_handler,
469 },
470#endif
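
The two entries above sit in kern_table, so with CONFIG_UCLAMP_TASK they should surface as /proc/sys/kernel/sched_util_clamp_min and /proc/sys/kernel/sched_util_clamp_max (paths inferred from the procnames; not shown in this hunk). A small sketch that reads them back:

#include <stdio.h>

static void show(const char *path)
{
        char buf[32];
        FILE *f = fopen(path, "r");

        if (f && fgets(buf, sizeof(buf), f))
                printf("%s = %s", path, buf);
        if (f)
                fclose(f);
}

int main(void)
{
        show("/proc/sys/kernel/sched_util_clamp_min");
        show("/proc/sys/kernel/sched_util_clamp_max");

        /* Writing needs root, e.g.:
         *   echo 512 > /proc/sys/kernel/sched_util_clamp_max
         */
        return 0;
}
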
455#ifdef CONFIG_SCHED_AUTOGROUP 471#ifdef CONFIG_SCHED_AUTOGROUP
456 { 472 {
457 .procname = "sched_autogroup_enabled", 473 .procname = "sched_autogroup_enabled",
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4ee1a3428ae..be9707f68024 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
782 */ 782 */
783 if (!ts->tick_stopped) { 783 if (!ts->tick_stopped) {
784 calc_load_nohz_start(); 784 calc_load_nohz_start();
785 cpu_load_update_nohz_start();
786 quiet_vmstat(); 785 quiet_vmstat();
787 786
788 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 787 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
829{ 828{
830 /* Update jiffies first */ 829 /* Update jiffies first */
831 tick_do_update_jiffies64(now); 830 tick_do_update_jiffies64(now);
832 cpu_load_update_nohz_stop();
833 831
834 /* 832 /*
835 * Clear the timer idle flag, so we avoid IPIs on remote queueing and 833 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 1e6db9cbe4dc..fa95139445b2 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -277,7 +277,7 @@ static void move_to_next_cpu(void)
277 * of this thread, than stop migrating for the duration 277 * of this thread, than stop migrating for the duration
278 * of the current test. 278 * of the current test.
279 */ 279 */
280 if (!cpumask_equal(current_mask, &current->cpus_allowed)) 280 if (!cpumask_equal(current_mask, current->cpus_ptr))
281 goto disable; 281 goto disable;
282 282
283 get_online_cpus(); 283 get_online_cpus();
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 157d9e31f6c2..60ba93fc42ce 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -23,7 +23,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
23 * Kernel threads bound to a single CPU can safely use 23 * Kernel threads bound to a single CPU can safely use
24 * smp_processor_id(): 24 * smp_processor_id():
25 */ 25 */
26 if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu))) 26 if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
27 goto out; 27 goto out;
28 28
29 /* 29 /*
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index 1da597aa6141..1a72b7d95cdc 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -34,7 +34,7 @@ static void simple_thread_func(int cnt)
34 34
35 /* Silly tracepoints */ 35 /* Silly tracepoints */
36 trace_foo_bar("hello", cnt, array, random_strings[len], 36 trace_foo_bar("hello", cnt, array, random_strings[len],
37 &current->cpus_allowed); 37 current->cpus_ptr);
38 38
39 trace_foo_with_template_simple("HELLO", cnt); 39 trace_foo_with_template_simple("HELLO", cnt);
40 40