author     Linus Torvalds <torvalds@linux-foundation.org>  2019-07-08 19:39:53 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-07-08 19:39:53 -0400
commit     dad1c12ed831a7a89cc01e5582cd0b81a4be7f19 (patch)
tree       7a84799d3108bd9d3f1d4b530afd3ff9300db982
parent     090bc5a2a91499c1fd64b78d125daa6ca5531d38 (diff)
parent     af24bde8df2029f067dc46aff0393c8f18ff6e2f (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Remove the unused per rq load array and all its infrastructure, by
   Dietmar Eggemann.

 - Add utilization clamping support by Patrick Bellasi. This is a
   refinement of the energy aware scheduling framework with support for
   boosting of interactive and capping of background workloads: to make
   sure critical GUI threads get maximum frequency ASAP, and to make
   sure background processing doesn't unnecessarily move the cpufreq
   governor to higher frequencies and less energy efficient CPU modes.

 - Add the bare minimum of tracepoints required for LISA EAS regression
   testing, by Qais Yousef - which allows automated testing of various
   power management features, including energy aware scheduling.

 - Restructure the former tsk_nr_cpus_allowed() facility that the -rt
   kernel used to modify the scheduler's CPU affinity logic such as
   migrate_disable() - introduce the task->cpus_ptr value instead of
   taking the address of &task->cpus_allowed directly - by Sebastian
   Andrzej Siewior.

 - Misc optimizations, fixes, cleanups and small enhancements - see the
   Git log for details.

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  sched/uclamp: Add uclamp support to energy_compute()
  sched/uclamp: Add uclamp_util_with()
  sched/cpufreq, sched/uclamp: Add clamps for FAIR and RT tasks
  sched/uclamp: Set default clamps for RT tasks
  sched/uclamp: Reset uclamp values on RESET_ON_FORK
  sched/uclamp: Extend sched_setattr() to support utilization clamping
  sched/core: Allow sched_setattr() to use the current policy
  sched/uclamp: Add system default clamps
  sched/uclamp: Enforce last task's UCLAMP_MAX
  sched/uclamp: Add bucket local max tracking
  sched/uclamp: Add CPU's clamp buckets refcounting
  sched/fair: Rename weighted_cpuload() to cpu_runnable_load()
  sched/debug: Export the newly added tracepoints
  sched/debug: Add sched_overutilized tracepoint
  sched/debug: Add new tracepoint to track PELT at se level
  sched/debug: Add new tracepoints to track PELT at rq level
  sched/debug: Add a new sched_trace_*() helper functions
  sched/autogroup: Make autogroup_path() always available
  sched/wait: Deduplicate code with do-while
  sched/topology: Remove unused 'sd' parameter from arch_scale_cpu_capacity()
  ...
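As a point of reference for the sched_setattr() extension mentioned above, here is a minimal user-space sketch (not part of this merge) of how a task could request utilization clamps once the series is applied. The local struct mirrors the SCHED_ATTR_SIZE_VER1 layout added to include/uapi/linux/sched/types.h below; the clamp values 256 and 768 are arbitrary illustrations.

/*
 * Hypothetical user-space sketch: request a utilization clamp for the
 * calling task via the extended sched_setattr() interface. Struct layout
 * and flag values mirror the uapi changes in this series.
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr_ver1 {		/* local copy of the VER1 layout (56 bytes) */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* utilization hints added by this series */
	uint32_t sched_util_max;
};

#define SCHED_FLAG_KEEP_ALL		0x18	/* KEEP_POLICY | KEEP_PARAMS */
#define SCHED_FLAG_UTIL_CLAMP_MIN	0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX	0x40

int main(void)
{
	struct sched_attr_ver1 attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);			/* SCHED_ATTR_SIZE_VER1 */
	attr.sched_flags = SCHED_FLAG_KEEP_ALL |	/* keep current policy/params */
			   SCHED_FLAG_UTIL_CLAMP_MIN |
			   SCHED_FLAG_UTIL_CLAMP_MAX;
	attr.sched_util_min = 256;			/* boost to at least ~25% of 1024 */
	attr.sched_util_max = 768;			/* cap at ~75% of 1024 */

	if (syscall(SYS_sched_setattr, 0, &attr, 0))	/* pid 0 == current task */
		perror("sched_setattr");

	return 0;
}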
-rw-r--r--  Documentation/scheduler/sched-pelt.c | 3
-rw-r--r--  arch/arm/kernel/topology.c | 2
-rw-r--r--  arch/ia64/kernel/mca.c | 2
-rw-r--r--  arch/mips/include/asm/switch_to.h | 4
-rw-r--r--  arch/mips/kernel/mips-mt-fpaff.c | 2
-rw-r--r--  arch/mips/kernel/traps.c | 6
-rw-r--r--  arch/powerpc/platforms/cell/spufs/sched.c | 2
-rw-r--r--  arch/x86/kernel/cpu/resctrl/pseudo_lock.c | 2
-rw-r--r--  drivers/base/arch_topology.c | 6
-rw-r--r--  drivers/infiniband/hw/hfi1/affinity.c | 6
-rw-r--r--  drivers/infiniband/hw/hfi1/sdma.c | 3
-rw-r--r--  drivers/infiniband/hw/qib/qib_file_ops.c | 7
-rw-r--r--  fs/proc/array.c | 4
-rw-r--r--  include/linux/arch_topology.h | 2
-rw-r--r--  include/linux/energy_model.h | 2
-rw-r--r--  include/linux/log2.h | 34
-rw-r--r--  include/linux/sched.h | 79
-rw-r--r--  include/linux/sched/nohz.h | 8
-rw-r--r--  include/linux/sched/sysctl.h | 11
-rw-r--r--  include/linux/sched/topology.h | 25
-rw-r--r--  include/trace/events/sched.h | 31
-rw-r--r--  include/uapi/linux/sched.h | 14
-rw-r--r--  include/uapi/linux/sched/types.h | 66
-rw-r--r--  init/Kconfig | 53
-rw-r--r--  init/init_task.c | 3
-rw-r--r--  kernel/cgroup/cpuset.c | 2
-rw-r--r--  kernel/fork.c | 2
-rw-r--r--  kernel/power/energy_model.c | 2
-rw-r--r--  kernel/sched/autogroup.c | 2
-rw-r--r--  kernel/sched/core.c | 533
-rw-r--r--  kernel/sched/cpudeadline.c | 4
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 24
-rw-r--r--  kernel/sched/cpupri.c | 4
-rw-r--r--  kernel/sched/deadline.c | 8
-rw-r--r--  kernel/sched/debug.c | 43
-rw-r--r--  kernel/sched/fair.c | 623
-rw-r--r--  kernel/sched/features.h | 1
-rw-r--r--  kernel/sched/pelt.c | 13
-rw-r--r--  kernel/sched/pelt.h | 2
-rw-r--r--  kernel/sched/rt.c | 8
-rw-r--r--  kernel/sched/sched-pelt.h | 2
-rw-r--r--  kernel/sched/sched.h | 134
-rw-r--r--  kernel/sched/topology.c | 18
-rw-r--r--  kernel/sched/wait.c | 8
-rw-r--r--  kernel/sysctl.c | 16
-rw-r--r--  kernel/time/tick-sched.c | 2
-rw-r--r--  kernel/trace/trace_hwlat.c | 2
-rw-r--r--  lib/smp_processor_id.c | 2
-rw-r--r--  samples/trace_events/trace-events-sample.c | 2
49 files changed, 1216 insertions, 618 deletions
diff --git a/Documentation/scheduler/sched-pelt.c b/Documentation/scheduler/sched-pelt.c
index e4219139386a..7238b355919c 100644
--- a/Documentation/scheduler/sched-pelt.c
+++ b/Documentation/scheduler/sched-pelt.c
@@ -20,7 +20,8 @@ void calc_runnable_avg_yN_inv(void)
20 int i; 20 int i;
21 unsigned int x; 21 unsigned int x;
22 22
23 printf("static const u32 runnable_avg_yN_inv[] = {"); 23 /* To silence -Wunused-but-set-variable warnings. */
24 printf("static const u32 runnable_avg_yN_inv[] __maybe_unused = {");
24 for (i = 0; i < HALFLIFE; i++) { 25 for (i = 0; i < HALFLIFE; i++) {
25 x = ((1UL<<32)-1)*pow(y, i); 26 x = ((1UL<<32)-1)*pow(y, i);
26 27
diff --git a/arch/arm/kernel/topology.c b/arch/arm/kernel/topology.c
index 60e375ce1ab2..d17cb1e6d679 100644
--- a/arch/arm/kernel/topology.c
+++ b/arch/arm/kernel/topology.c
@@ -169,7 +169,7 @@ static void update_cpu_capacity(unsigned int cpu)
169 topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity); 169 topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity);
170 170
171 pr_info("CPU%u: update cpu_capacity %lu\n", 171 pr_info("CPU%u: update cpu_capacity %lu\n",
172 cpu, topology_get_cpu_scale(NULL, cpu)); 172 cpu, topology_get_cpu_scale(cpu));
173} 173}
174 174
175#else 175#else
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index 6a52d761854b..79190d877fa7 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1831,7 +1831,7 @@ format_mca_init_stack(void *mca_data, unsigned long offset,
1831 ti->cpu = cpu; 1831 ti->cpu = cpu;
1832 p->stack = ti; 1832 p->stack = ti;
1833 p->state = TASK_UNINTERRUPTIBLE; 1833 p->state = TASK_UNINTERRUPTIBLE;
1834 cpumask_set_cpu(cpu, &p->cpus_allowed); 1834 cpumask_set_cpu(cpu, &p->cpus_mask);
1835 INIT_LIST_HEAD(&p->tasks); 1835 INIT_LIST_HEAD(&p->tasks);
1836 p->parent = p->real_parent = p->group_leader = p; 1836 p->parent = p->real_parent = p->group_leader = p;
1837 INIT_LIST_HEAD(&p->children); 1837 INIT_LIST_HEAD(&p->children);
diff --git a/arch/mips/include/asm/switch_to.h b/arch/mips/include/asm/switch_to.h
index 0f813bb753c6..09cbe9042828 100644
--- a/arch/mips/include/asm/switch_to.h
+++ b/arch/mips/include/asm/switch_to.h
@@ -42,7 +42,7 @@ extern struct task_struct *ll_task;
42 * inline to try to keep the overhead down. If we have been forced to run on 42 * inline to try to keep the overhead down. If we have been forced to run on
43 * a "CPU" with an FPU because of a previous high level of FP computation, 43 * a "CPU" with an FPU because of a previous high level of FP computation,
44 * but did not actually use the FPU during the most recent time-slice (CU1 44 * but did not actually use the FPU during the most recent time-slice (CU1
45 * isn't set), we undo the restriction on cpus_allowed. 45 * isn't set), we undo the restriction on cpus_mask.
46 * 46 *
47 * We're not calling set_cpus_allowed() here, because we have no need to 47 * We're not calling set_cpus_allowed() here, because we have no need to
48 * force prompt migration - we're already switching the current CPU to a 48 * force prompt migration - we're already switching the current CPU to a
@@ -57,7 +57,7 @@ do { \
57 test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \ 57 test_ti_thread_flag(__prev_ti, TIF_FPUBOUND) && \
58 (!(KSTK_STATUS(prev) & ST0_CU1))) { \ 58 (!(KSTK_STATUS(prev) & ST0_CU1))) { \
59 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \ 59 clear_ti_thread_flag(__prev_ti, TIF_FPUBOUND); \
60 prev->cpus_allowed = prev->thread.user_cpus_allowed; \ 60 prev->cpus_mask = prev->thread.user_cpus_allowed; \
61 } \ 61 } \
62 next->thread.emulated_fp = 0; \ 62 next->thread.emulated_fp = 0; \
63} while(0) 63} while(0)
diff --git a/arch/mips/kernel/mips-mt-fpaff.c b/arch/mips/kernel/mips-mt-fpaff.c
index a7c0f97e4b0d..1a08428eedcf 100644
--- a/arch/mips/kernel/mips-mt-fpaff.c
+++ b/arch/mips/kernel/mips-mt-fpaff.c
@@ -177,7 +177,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
177 if (retval) 177 if (retval)
178 goto out_unlock; 178 goto out_unlock;
179 179
180 cpumask_or(&allowed, &p->thread.user_cpus_allowed, &p->cpus_allowed); 180 cpumask_or(&allowed, &p->thread.user_cpus_allowed, p->cpus_ptr);
181 cpumask_and(&mask, &allowed, cpu_active_mask); 181 cpumask_and(&mask, &allowed, cpu_active_mask);
182 182
183out_unlock: 183out_unlock:
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index c52766a5b85f..ac7159263da0 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -891,12 +891,12 @@ static void mt_ase_fp_affinity(void)
891 * restricted the allowed set to exclude any CPUs with FPUs, 891 * restricted the allowed set to exclude any CPUs with FPUs,
892 * we'll skip the procedure. 892 * we'll skip the procedure.
893 */ 893 */
894 if (cpumask_intersects(&current->cpus_allowed, &mt_fpu_cpumask)) { 894 if (cpumask_intersects(&current->cpus_mask, &mt_fpu_cpumask)) {
895 cpumask_t tmask; 895 cpumask_t tmask;
896 896
897 current->thread.user_cpus_allowed 897 current->thread.user_cpus_allowed
898 = current->cpus_allowed; 898 = current->cpus_mask;
899 cpumask_and(&tmask, &current->cpus_allowed, 899 cpumask_and(&tmask, &current->cpus_mask,
900 &mt_fpu_cpumask); 900 &mt_fpu_cpumask);
901 set_cpus_allowed_ptr(current, &tmask); 901 set_cpus_allowed_ptr(current, &tmask);
902 set_thread_flag(TIF_FPUBOUND); 902 set_thread_flag(TIF_FPUBOUND);
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index e56b553de27b..f18d5067cd0f 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -128,7 +128,7 @@ void __spu_update_sched_info(struct spu_context *ctx)
128 * runqueue. The context will be rescheduled on the proper node 128 * runqueue. The context will be rescheduled on the proper node
129 * if it is timesliced or preempted. 129 * if it is timesliced or preempted.
130 */ 130 */
131 cpumask_copy(&ctx->cpus_allowed, &current->cpus_allowed); 131 cpumask_copy(&ctx->cpus_allowed, current->cpus_ptr);
132 132
133 /* Save the current cpu id for spu interrupt routing. */ 133 /* Save the current cpu id for spu interrupt routing. */
134 ctx->last_ran = raw_smp_processor_id(); 134 ctx->last_ran = raw_smp_processor_id();
diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
index 604c0e3bcc83..f68baccc69f0 100644
--- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
+++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c
@@ -1503,7 +1503,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
1503 * may be scheduled elsewhere and invalidate entries in the 1503 * may be scheduled elsewhere and invalidate entries in the
1504 * pseudo-locked region. 1504 * pseudo-locked region.
1505 */ 1505 */
1506 if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) { 1506 if (!cpumask_subset(current->cpus_ptr, &plr->d->cpu_mask)) {
1507 mutex_unlock(&rdtgroup_mutex); 1507 mutex_unlock(&rdtgroup_mutex);
1508 return -EINVAL; 1508 return -EINVAL;
1509 } 1509 }
diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c
index 1739d7e1952a..9b09e31ae82f 100644
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -43,7 +43,7 @@ static ssize_t cpu_capacity_show(struct device *dev,
43{ 43{
44 struct cpu *cpu = container_of(dev, struct cpu, dev); 44 struct cpu *cpu = container_of(dev, struct cpu, dev);
45 45
46 return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id)); 46 return sprintf(buf, "%lu\n", topology_get_cpu_scale(cpu->dev.id));
47} 47}
48 48
49static void update_topology_flags_workfn(struct work_struct *work); 49static void update_topology_flags_workfn(struct work_struct *work);
@@ -116,7 +116,7 @@ void topology_normalize_cpu_scale(void)
116 / capacity_scale; 116 / capacity_scale;
117 topology_set_cpu_scale(cpu, capacity); 117 topology_set_cpu_scale(cpu, capacity);
118 pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", 118 pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
119 cpu, topology_get_cpu_scale(NULL, cpu)); 119 cpu, topology_get_cpu_scale(cpu));
120 } 120 }
121} 121}
122 122
@@ -185,7 +185,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
185 cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); 185 cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus);
186 186
187 for_each_cpu(cpu, policy->related_cpus) { 187 for_each_cpu(cpu, policy->related_cpus) {
188 raw_capacity[cpu] = topology_get_cpu_scale(NULL, cpu) * 188 raw_capacity[cpu] = topology_get_cpu_scale(cpu) *
189 policy->cpuinfo.max_freq / 1000UL; 189 policy->cpuinfo.max_freq / 1000UL;
190 capacity_scale = max(raw_capacity[cpu], capacity_scale); 190 capacity_scale = max(raw_capacity[cpu], capacity_scale);
191 } 191 }
diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c
index 4fe662c3bbc1..c142b23bb401 100644
--- a/drivers/infiniband/hw/hfi1/affinity.c
+++ b/drivers/infiniband/hw/hfi1/affinity.c
@@ -1038,7 +1038,7 @@ int hfi1_get_proc_affinity(int node)
1038 struct hfi1_affinity_node *entry; 1038 struct hfi1_affinity_node *entry;
1039 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask; 1039 cpumask_var_t diff, hw_thread_mask, available_mask, intrs_mask;
1040 const struct cpumask *node_mask, 1040 const struct cpumask *node_mask,
1041 *proc_mask = &current->cpus_allowed; 1041 *proc_mask = current->cpus_ptr;
1042 struct hfi1_affinity_node_list *affinity = &node_affinity; 1042 struct hfi1_affinity_node_list *affinity = &node_affinity;
1043 struct cpu_mask_set *set = &affinity->proc; 1043 struct cpu_mask_set *set = &affinity->proc;
1044 1044
@@ -1046,7 +1046,7 @@ int hfi1_get_proc_affinity(int node)
1046 * check whether process/context affinity has already 1046 * check whether process/context affinity has already
1047 * been set 1047 * been set
1048 */ 1048 */
1049 if (cpumask_weight(proc_mask) == 1) { 1049 if (current->nr_cpus_allowed == 1) {
1050 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl", 1050 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU %*pbl",
1051 current->pid, current->comm, 1051 current->pid, current->comm,
1052 cpumask_pr_args(proc_mask)); 1052 cpumask_pr_args(proc_mask));
@@ -1057,7 +1057,7 @@ int hfi1_get_proc_affinity(int node)
1057 cpu = cpumask_first(proc_mask); 1057 cpu = cpumask_first(proc_mask);
1058 cpumask_set_cpu(cpu, &set->used); 1058 cpumask_set_cpu(cpu, &set->used);
1059 goto done; 1059 goto done;
1060 } else if (cpumask_weight(proc_mask) < cpumask_weight(&set->mask)) { 1060 } else if (current->nr_cpus_allowed < cpumask_weight(&set->mask)) {
1061 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl", 1061 hfi1_cdbg(PROC, "PID %u %s affinity set to CPU set(s) %*pbl",
1062 current->pid, current->comm, 1062 current->pid, current->comm,
1063 cpumask_pr_args(proc_mask)); 1063 cpumask_pr_args(proc_mask));
diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c
index 28b66bd70b74..2395fd4233a7 100644
--- a/drivers/infiniband/hw/hfi1/sdma.c
+++ b/drivers/infiniband/hw/hfi1/sdma.c
@@ -869,14 +869,13 @@ struct sdma_engine *sdma_select_user_engine(struct hfi1_devdata *dd,
869{ 869{
870 struct sdma_rht_node *rht_node; 870 struct sdma_rht_node *rht_node;
871 struct sdma_engine *sde = NULL; 871 struct sdma_engine *sde = NULL;
872 const struct cpumask *current_mask = &current->cpus_allowed;
873 unsigned long cpu_id; 872 unsigned long cpu_id;
874 873
875 /* 874 /*
876 * To ensure that always the same sdma engine(s) will be 875 * To ensure that always the same sdma engine(s) will be
877 * selected make sure the process is pinned to this CPU only. 876 * selected make sure the process is pinned to this CPU only.
878 */ 877 */
879 if (cpumask_weight(current_mask) != 1) 878 if (current->nr_cpus_allowed != 1)
880 goto out; 879 goto out;
881 880
882 cpu_id = smp_processor_id(); 881 cpu_id = smp_processor_id();
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index 78fa634de98a..27b6e664e59d 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -1142,7 +1142,7 @@ static __poll_t qib_poll(struct file *fp, struct poll_table_struct *pt)
1142static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd) 1142static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
1143{ 1143{
1144 struct qib_filedata *fd = fp->private_data; 1144 struct qib_filedata *fd = fp->private_data;
1145 const unsigned int weight = cpumask_weight(&current->cpus_allowed); 1145 const unsigned int weight = current->nr_cpus_allowed;
1146 const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus); 1146 const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
1147 int local_cpu; 1147 int local_cpu;
1148 1148
@@ -1623,9 +1623,8 @@ static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
1623 ret = find_free_ctxt(i_minor - 1, fp, uinfo); 1623 ret = find_free_ctxt(i_minor - 1, fp, uinfo);
1624 else { 1624 else {
1625 int unit; 1625 int unit;
1626 const unsigned int cpu = cpumask_first(&current->cpus_allowed); 1626 const unsigned int cpu = cpumask_first(current->cpus_ptr);
1627 const unsigned int weight = 1627 const unsigned int weight = current->nr_cpus_allowed;
1628 cpumask_weight(&current->cpus_allowed);
1629 1628
1630 if (weight == 1 && !test_bit(cpu, qib_cpulist)) 1629 if (weight == 1 && !test_bit(cpu, qib_cpulist))
1631 if (!find_hca(cpu, &unit) && unit >= 0) 1630 if (!find_hca(cpu, &unit) && unit >= 0)
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 55180501b915..46dcb6f0eccf 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -381,9 +381,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
381static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 381static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
382{ 382{
383 seq_printf(m, "Cpus_allowed:\t%*pb\n", 383 seq_printf(m, "Cpus_allowed:\t%*pb\n",
384 cpumask_pr_args(&task->cpus_allowed)); 384 cpumask_pr_args(task->cpus_ptr));
385 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", 385 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
386 cpumask_pr_args(&task->cpus_allowed)); 386 cpumask_pr_args(task->cpus_ptr));
387} 387}
388 388
389static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) 389static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h
index d9bdc1a7f4e7..1cfe05ea1d89 100644
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -18,7 +18,7 @@ DECLARE_PER_CPU(unsigned long, cpu_scale);
18 18
19struct sched_domain; 19struct sched_domain;
20static inline 20static inline
21unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu) 21unsigned long topology_get_cpu_scale(int cpu)
22{ 22{
23 return per_cpu(cpu_scale, cpu); 23 return per_cpu(cpu_scale, cpu);
24} 24}
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index aa027f7bcb3e..73f8c3cb9588 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -89,7 +89,7 @@ static inline unsigned long em_pd_energy(struct em_perf_domain *pd,
89 * like schedutil. 89 * like schedutil.
90 */ 90 */
91 cpu = cpumask_first(to_cpumask(pd->cpus)); 91 cpu = cpumask_first(to_cpumask(pd->cpus));
92 scale_cpu = arch_scale_cpu_capacity(NULL, cpu); 92 scale_cpu = arch_scale_cpu_capacity(cpu);
93 cs = &pd->table[pd->nr_cap_states - 1]; 93 cs = &pd->table[pd->nr_cap_states - 1];
94 freq = map_util_freq(max_util, cs->frequency, scale_cpu); 94 freq = map_util_freq(max_util, cs->frequency, scale_cpu);
95 95
diff --git a/include/linux/log2.h b/include/linux/log2.h
index 1aec01365ed4..83a4a3ca3e8a 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -220,4 +220,38 @@ int __order_base_2(unsigned long n)
220 ilog2((n) - 1) + 1) : \ 220 ilog2((n) - 1) + 1) : \
221 __order_base_2(n) \ 221 __order_base_2(n) \
222) 222)
223
224static inline __attribute__((const))
225int __bits_per(unsigned long n)
226{
227 if (n < 2)
228 return 1;
229 if (is_power_of_2(n))
230 return order_base_2(n) + 1;
231 return order_base_2(n);
232}
233
234/**
235 * bits_per - calculate the number of bits required for the argument
236 * @n: parameter
237 *
238 * This is constant-capable and can be used for compile time
239 * initializations, e.g bitfields.
240 *
241 * The first few values calculated by this routine:
242 * bf(0) = 1
243 * bf(1) = 1
244 * bf(2) = 2
245 * bf(3) = 2
246 * bf(4) = 3
247 * ... and so on.
248 */
249#define bits_per(n) \
250( \
251 __builtin_constant_p(n) ? ( \
252 ((n) == 0 || (n) == 1) \
253 ? 1 : ilog2(n) + 1 \
254 ) : \
255 __bits_per(n) \
256)
223#endif /* _LINUX_LOG2_H */ 257#endif /* _LINUX_LOG2_H */
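For a quick sanity check of the values documented in the new bits_per() comment above, here is an assumed-equivalent user-space sketch (not the kernel implementation itself); bits_per(SCHED_CAPACITY_SCALE) == 11 is what the uclamp_se bitfields in include/linux/sched.h below rely on.

/*
 * User-space sketch of the bits_per() semantics documented above: the
 * number of bits needed to represent n, with bits_per(0) == bits_per(1) == 1.
 * Assumed-equivalent form, not the kernel macro.
 */
#include <assert.h>

static unsigned int bits_per(unsigned long n)
{
	return n < 2 ? 1 : (unsigned int)(8 * sizeof(n)) - __builtin_clzl(n);
}

int main(void)
{
	assert(bits_per(0) == 1);
	assert(bits_per(1) == 1);
	assert(bits_per(2) == 2);
	assert(bits_per(3) == 2);
	assert(bits_per(4) == 3);
	assert(bits_per(1024) == 11);	/* bits_per(SCHED_CAPACITY_SCALE) */
	return 0;
}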
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 76adce49b5ad..459d95e4a574 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -35,6 +35,7 @@ struct audit_context;
35struct backing_dev_info; 35struct backing_dev_info;
36struct bio_list; 36struct bio_list;
37struct blk_plug; 37struct blk_plug;
38struct capture_control;
38struct cfs_rq; 39struct cfs_rq;
39struct fs_struct; 40struct fs_struct;
40struct futex_pi_state; 41struct futex_pi_state;
@@ -47,8 +48,9 @@ struct pid_namespace;
47struct pipe_inode_info; 48struct pipe_inode_info;
48struct rcu_node; 49struct rcu_node;
49struct reclaim_state; 50struct reclaim_state;
50struct capture_control;
51struct robust_list_head; 51struct robust_list_head;
52struct root_domain;
53struct rq;
52struct sched_attr; 54struct sched_attr;
53struct sched_param; 55struct sched_param;
54struct seq_file; 56struct seq_file;
@@ -281,6 +283,18 @@ struct vtime {
281 u64 gtime; 283 u64 gtime;
282}; 284};
283 285
286/*
287 * Utilization clamp constraints.
288 * @UCLAMP_MIN: Minimum utilization
289 * @UCLAMP_MAX: Maximum utilization
290 * @UCLAMP_CNT: Utilization clamp constraints count
291 */
292enum uclamp_id {
293 UCLAMP_MIN = 0,
294 UCLAMP_MAX,
295 UCLAMP_CNT
296};
297
284struct sched_info { 298struct sched_info {
285#ifdef CONFIG_SCHED_INFO 299#ifdef CONFIG_SCHED_INFO
286 /* Cumulative counters: */ 300 /* Cumulative counters: */
@@ -312,6 +326,10 @@ struct sched_info {
312# define SCHED_FIXEDPOINT_SHIFT 10 326# define SCHED_FIXEDPOINT_SHIFT 10
313# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) 327# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
314 328
329/* Increase resolution of cpu_capacity calculations */
330# define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
331# define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
332
315struct load_weight { 333struct load_weight {
316 unsigned long weight; 334 unsigned long weight;
317 u32 inv_weight; 335 u32 inv_weight;
@@ -560,6 +578,41 @@ struct sched_dl_entity {
560 struct hrtimer inactive_timer; 578 struct hrtimer inactive_timer;
561}; 579};
562 580
581#ifdef CONFIG_UCLAMP_TASK
582/* Number of utilization clamp buckets (shorter alias) */
583#define UCLAMP_BUCKETS CONFIG_UCLAMP_BUCKETS_COUNT
584
585/*
586 * Utilization clamp for a scheduling entity
587 * @value: clamp value "assigned" to a se
588 * @bucket_id: bucket index corresponding to the "assigned" value
589 * @active: the se is currently refcounted in a rq's bucket
590 * @user_defined: the requested clamp value comes from user-space
591 *
592 * The bucket_id is the index of the clamp bucket matching the clamp value
593 * which is pre-computed and stored to avoid expensive integer divisions from
594 * the fast path.
595 *
596 * The active bit is set whenever a task has got an "effective" value assigned,
597 * which can be different from the clamp value "requested" from user-space.
598 * This allows to know a task is refcounted in the rq's bucket corresponding
599 * to the "effective" bucket_id.
600 *
601 * The user_defined bit is set whenever a task has got a task-specific clamp
602 * value requested from userspace, i.e. the system defaults apply to this task
603 * just as a restriction. This allows to relax default clamps when a less
604 * restrictive task-specific value has been requested, thus allowing to
605 * implement a "nice" semantic. For example, a task running with a 20%
606 * default boost can still drop its own boosting to 0%.
607 */
608struct uclamp_se {
609 unsigned int value : bits_per(SCHED_CAPACITY_SCALE);
610 unsigned int bucket_id : bits_per(UCLAMP_BUCKETS);
611 unsigned int active : 1;
612 unsigned int user_defined : 1;
613};
614#endif /* CONFIG_UCLAMP_TASK */
615
563union rcu_special { 616union rcu_special {
564 struct { 617 struct {
565 u8 blocked; 618 u8 blocked;
@@ -640,6 +693,13 @@ struct task_struct {
640#endif 693#endif
641 struct sched_dl_entity dl; 694 struct sched_dl_entity dl;
642 695
696#ifdef CONFIG_UCLAMP_TASK
697 /* Clamp values requested for a scheduling entity */
698 struct uclamp_se uclamp_req[UCLAMP_CNT];
699 /* Effective clamp values used for a scheduling entity */
700 struct uclamp_se uclamp[UCLAMP_CNT];
701#endif
702
643#ifdef CONFIG_PREEMPT_NOTIFIERS 703#ifdef CONFIG_PREEMPT_NOTIFIERS
644 /* List of struct preempt_notifier: */ 704 /* List of struct preempt_notifier: */
645 struct hlist_head preempt_notifiers; 705 struct hlist_head preempt_notifiers;
@@ -651,7 +711,8 @@ struct task_struct {
651 711
652 unsigned int policy; 712 unsigned int policy;
653 int nr_cpus_allowed; 713 int nr_cpus_allowed;
654 cpumask_t cpus_allowed; 714 const cpumask_t *cpus_ptr;
715 cpumask_t cpus_mask;
655 716
656#ifdef CONFIG_PREEMPT_RCU 717#ifdef CONFIG_PREEMPT_RCU
657 int rcu_read_lock_nesting; 718 int rcu_read_lock_nesting;
@@ -1399,7 +1460,7 @@ extern struct pid *cad_pid;
1399#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ 1460#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
1400#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */ 1461#define PF_MEMSTALL 0x01000000 /* Stalled due to lack of memory */
1401#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */ 1462#define PF_UMH 0x02000000 /* I'm an Usermodehelper process */
1402#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ 1463#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
1403#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ 1464#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
1404#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */ 1465#define PF_MEMALLOC_NOCMA 0x10000000 /* All allocation request will have _GFP_MOVABLE cleared */
1405#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ 1466#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
@@ -1915,4 +1976,16 @@ static inline void rseq_syscall(struct pt_regs *regs)
1915 1976
1916#endif 1977#endif
1917 1978
1979const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq);
1980char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len);
1981int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq);
1982
1983const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq);
1984const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq);
1985const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq);
1986
1987int sched_trace_rq_cpu(struct rq *rq);
1988
1989const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
1990
1918#endif 1991#endif
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index b36f4cf38111..1abe91ff6e4a 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -7,14 +7,6 @@
7 */ 7 */
8 8
9#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) 9#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
10extern void cpu_load_update_nohz_start(void);
11extern void cpu_load_update_nohz_stop(void);
12#else
13static inline void cpu_load_update_nohz_start(void) { }
14static inline void cpu_load_update_nohz_stop(void) { }
15#endif
16
17#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
18extern void nohz_balance_enter_idle(int cpu); 10extern void nohz_balance_enter_idle(int cpu);
19extern int get_nohz_timer_target(void); 11extern int get_nohz_timer_target(void);
20#else 12#else
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 99ce6d728df7..d4f6215ee03f 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -56,6 +56,11 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
56extern unsigned int sysctl_sched_rt_period; 56extern unsigned int sysctl_sched_rt_period;
57extern int sysctl_sched_rt_runtime; 57extern int sysctl_sched_rt_runtime;
58 58
59#ifdef CONFIG_UCLAMP_TASK
60extern unsigned int sysctl_sched_uclamp_util_min;
61extern unsigned int sysctl_sched_uclamp_util_max;
62#endif
63
59#ifdef CONFIG_CFS_BANDWIDTH 64#ifdef CONFIG_CFS_BANDWIDTH
60extern unsigned int sysctl_sched_cfs_bandwidth_slice; 65extern unsigned int sysctl_sched_cfs_bandwidth_slice;
61#endif 66#endif
@@ -75,6 +80,12 @@ extern int sched_rt_handler(struct ctl_table *table, int write,
75 void __user *buffer, size_t *lenp, 80 void __user *buffer, size_t *lenp,
76 loff_t *ppos); 81 loff_t *ppos);
77 82
83#ifdef CONFIG_UCLAMP_TASK
84extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
85 void __user *buffer, size_t *lenp,
86 loff_t *ppos);
87#endif
88
78extern int sysctl_numa_balancing(struct ctl_table *table, int write, 89extern int sysctl_numa_balancing(struct ctl_table *table, int write,
79 void __user *buffer, size_t *lenp, 90 void __user *buffer, size_t *lenp,
80 loff_t *ppos); 91 loff_t *ppos);
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index cfc0a89a7159..7863bb62d2ab 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -7,12 +7,6 @@
7#include <linux/sched/idle.h> 7#include <linux/sched/idle.h>
8 8
9/* 9/*
10 * Increase resolution of cpu_capacity calculations
11 */
12#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
13#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
14
15/*
16 * sched-domains (multiprocessor balancing) declarations: 10 * sched-domains (multiprocessor balancing) declarations:
17 */ 11 */
18#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
@@ -84,11 +78,6 @@ struct sched_domain {
84 unsigned int busy_factor; /* less balancing by factor if busy */ 78 unsigned int busy_factor; /* less balancing by factor if busy */
85 unsigned int imbalance_pct; /* No balance until over watermark */ 79 unsigned int imbalance_pct; /* No balance until over watermark */
86 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ 80 unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
87 unsigned int busy_idx;
88 unsigned int idle_idx;
89 unsigned int newidle_idx;
90 unsigned int wake_idx;
91 unsigned int forkexec_idx;
92 81
93 int nohz_idle; /* NOHZ IDLE status */ 82 int nohz_idle; /* NOHZ IDLE status */
94 int flags; /* See SD_* */ 83 int flags; /* See SD_* */
@@ -201,14 +190,6 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl);
201# define SD_INIT_NAME(type) 190# define SD_INIT_NAME(type)
202#endif 191#endif
203 192
204#ifndef arch_scale_cpu_capacity
205static __always_inline
206unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
207{
208 return SCHED_CAPACITY_SCALE;
209}
210#endif
211
212#else /* CONFIG_SMP */ 193#else /* CONFIG_SMP */
213 194
214struct sched_domain_attr; 195struct sched_domain_attr;
@@ -224,16 +205,16 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
224 return true; 205 return true;
225} 206}
226 207
208#endif /* !CONFIG_SMP */
209
227#ifndef arch_scale_cpu_capacity 210#ifndef arch_scale_cpu_capacity
228static __always_inline 211static __always_inline
229unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) 212unsigned long arch_scale_cpu_capacity(int cpu)
230{ 213{
231 return SCHED_CAPACITY_SCALE; 214 return SCHED_CAPACITY_SCALE;
232} 215}
233#endif 216#endif
234 217
235#endif /* !CONFIG_SMP */
236
237static inline int task_node(const struct task_struct *p) 218static inline int task_node(const struct task_struct *p)
238{ 219{
239 return cpu_to_node(task_cpu(p)); 220 return cpu_to_node(task_cpu(p));
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index c8c7c7efb487..420e80e56e55 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -594,6 +594,37 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
594 594
595 TP_printk("cpu=%d", __entry->cpu) 595 TP_printk("cpu=%d", __entry->cpu)
596); 596);
597
598/*
599 * Following tracepoints are not exported in tracefs and provide hooking
600 * mechanisms only for testing and debugging purposes.
601 *
602 * Postfixed with _tp to make them easily identifiable in the code.
603 */
604DECLARE_TRACE(pelt_cfs_tp,
605 TP_PROTO(struct cfs_rq *cfs_rq),
606 TP_ARGS(cfs_rq));
607
608DECLARE_TRACE(pelt_rt_tp,
609 TP_PROTO(struct rq *rq),
610 TP_ARGS(rq));
611
612DECLARE_TRACE(pelt_dl_tp,
613 TP_PROTO(struct rq *rq),
614 TP_ARGS(rq));
615
616DECLARE_TRACE(pelt_irq_tp,
617 TP_PROTO(struct rq *rq),
618 TP_ARGS(rq));
619
620DECLARE_TRACE(pelt_se_tp,
621 TP_PROTO(struct sched_entity *se),
622 TP_ARGS(se));
623
624DECLARE_TRACE(sched_overutilized_tp,
625 TP_PROTO(struct root_domain *rd, bool overutilized),
626 TP_ARGS(rd, overutilized));
627
597#endif /* _TRACE_SCHED_H */ 628#endif /* _TRACE_SCHED_H */
598 629
599/* This part must be outside protection */ 630/* This part must be outside protection */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index ed4ee170bee2..617bb59aa8ba 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -51,9 +51,21 @@
51#define SCHED_FLAG_RESET_ON_FORK 0x01 51#define SCHED_FLAG_RESET_ON_FORK 0x01
52#define SCHED_FLAG_RECLAIM 0x02 52#define SCHED_FLAG_RECLAIM 0x02
53#define SCHED_FLAG_DL_OVERRUN 0x04 53#define SCHED_FLAG_DL_OVERRUN 0x04
54#define SCHED_FLAG_KEEP_POLICY 0x08
55#define SCHED_FLAG_KEEP_PARAMS 0x10
56#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
57#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40
58
59#define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \
60 SCHED_FLAG_KEEP_PARAMS)
61
62#define SCHED_FLAG_UTIL_CLAMP (SCHED_FLAG_UTIL_CLAMP_MIN | \
63 SCHED_FLAG_UTIL_CLAMP_MAX)
54 64
55#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \ 65#define SCHED_FLAG_ALL (SCHED_FLAG_RESET_ON_FORK | \
56 SCHED_FLAG_RECLAIM | \ 66 SCHED_FLAG_RECLAIM | \
57 SCHED_FLAG_DL_OVERRUN) 67 SCHED_FLAG_DL_OVERRUN | \
68 SCHED_FLAG_KEEP_ALL | \
69 SCHED_FLAG_UTIL_CLAMP)
58 70
59#endif /* _UAPI_LINUX_SCHED_H */ 71#endif /* _UAPI_LINUX_SCHED_H */
diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h
index 10fbb8031930..c852153ddb0d 100644
--- a/include/uapi/linux/sched/types.h
+++ b/include/uapi/linux/sched/types.h
@@ -9,6 +9,7 @@ struct sched_param {
9}; 9};
10 10
11#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ 11#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
12#define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */
12 13
13/* 14/*
14 * Extended scheduling parameters data structure. 15 * Extended scheduling parameters data structure.
@@ -21,8 +22,33 @@ struct sched_param {
21 * the tasks may be useful for a wide variety of application fields, e.g., 22 * the tasks may be useful for a wide variety of application fields, e.g.,
22 * multimedia, streaming, automation and control, and many others. 23 * multimedia, streaming, automation and control, and many others.
23 * 24 *
24 * This variant (sched_attr) is meant at describing a so-called 25 * This variant (sched_attr) allows to define additional attributes to
25 * sporadic time-constrained task. In such model a task is specified by: 26 * improve the scheduler knowledge about task requirements.
27 *
28 * Scheduling Class Attributes
29 * ===========================
30 *
31 * A subset of sched_attr attributes specifies the
32 * scheduling policy and relative POSIX attributes:
33 *
34 * @size size of the structure, for fwd/bwd compat.
35 *
36 * @sched_policy task's scheduling policy
37 * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
38 * @sched_priority task's static priority (SCHED_FIFO/RR)
39 *
40 * Certain more advanced scheduling features can be controlled by a
41 * predefined set of flags via the attribute:
42 *
43 * @sched_flags for customizing the scheduler behaviour
44 *
45 * Sporadic Time-Constrained Task Attributes
46 * =========================================
47 *
48 * A subset of sched_attr attributes allows to describe a so-called
49 * sporadic time-constrained task.
50 *
51 * In such a model a task is specified by:
26 * - the activation period or minimum instance inter-arrival time; 52 * - the activation period or minimum instance inter-arrival time;
27 * - the maximum (or average, depending on the actual scheduling 53 * - the maximum (or average, depending on the actual scheduling
28 * discipline) computation time of all instances, a.k.a. runtime; 54 * discipline) computation time of all instances, a.k.a. runtime;
@@ -34,14 +60,8 @@ struct sched_param {
34 * than the runtime and must be completed by time instant t equal to 60 * than the runtime and must be completed by time instant t equal to
35 * the instance activation time + the deadline. 61 * the instance activation time + the deadline.
36 * 62 *
37 * This is reflected by the actual fields of the sched_attr structure: 63 * This is reflected by the following fields of the sched_attr structure:
38 * 64 *
39 * @size size of the structure, for fwd/bwd compat.
40 *
41 * @sched_policy task's scheduling policy
42 * @sched_flags for customizing the scheduler behaviour
43 * @sched_nice task's nice value (SCHED_NORMAL/BATCH)
44 * @sched_priority task's static priority (SCHED_FIFO/RR)
45 * @sched_deadline representative of the task's deadline 65 * @sched_deadline representative of the task's deadline
46 * @sched_runtime representative of the task's runtime 66 * @sched_runtime representative of the task's runtime
47 * @sched_period representative of the task's period 67 * @sched_period representative of the task's period
@@ -53,6 +73,29 @@ struct sched_param {
53 * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the 73 * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
54 * only user of this new interface. More information about the algorithm 74 * only user of this new interface. More information about the algorithm
55 * available in the scheduling class file or in Documentation/. 75 * available in the scheduling class file or in Documentation/.
76 *
77 * Task Utilization Attributes
78 * ===========================
79 *
80 * A subset of sched_attr attributes allows to specify the utilization
81 * expected for a task. These attributes allow to inform the scheduler about
82 * the utilization boundaries within which it should schedule the task. These
83 * boundaries are valuable hints to support scheduler decisions on both task
84 * placement and frequency selection.
85 *
86 * @sched_util_min represents the minimum utilization
87 * @sched_util_max represents the maximum utilization
88 *
89 * Utilization is a value in the range [0..SCHED_CAPACITY_SCALE]. It
90 * represents the percentage of CPU time used by a task when running at the
91 * maximum frequency on the highest capacity CPU of the system. For example, a
92 * 20% utilization task is a task running for 2ms every 10ms at maximum
93 * frequency.
94 *
95 * A task with a min utilization value bigger than 0 is more likely scheduled
96 * on a CPU with a capacity big enough to fit the specified value.
97 * A task with a max utilization value smaller than 1024 is more likely
98 * scheduled on a CPU with no more capacity than the specified value.
56 */ 99 */
57struct sched_attr { 100struct sched_attr {
58 __u32 size; 101 __u32 size;
@@ -70,6 +113,11 @@ struct sched_attr {
70 __u64 sched_runtime; 113 __u64 sched_runtime;
71 __u64 sched_deadline; 114 __u64 sched_deadline;
72 __u64 sched_period; 115 __u64 sched_period;
116
117 /* Utilization hints */
118 __u32 sched_util_min;
119 __u32 sched_util_max;
120
73}; 121};
74 122
75#endif /* _UAPI_LINUX_SCHED_TYPES_H */ 123#endif /* _UAPI_LINUX_SCHED_TYPES_H */
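To make the utilization scale documented above concrete, a small illustrative helper (not part of the patch) converting the duty-cycle example from the comment into a value on the [0..SCHED_CAPACITY_SCALE] range:

/*
 * Illustrative only: utilization on the [0..1024] scale for a task that is
 * busy for busy_us out of every period_us at maximum frequency on the
 * highest-capacity CPU, per the documentation above.
 */
static inline unsigned int util_from_duty_cycle(unsigned int busy_us,
						unsigned int period_us)
{
	return 1024 * busy_us / period_us;	/* e.g. 2000/10000 -> ~204 (~20%) */
}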
diff --git a/init/Kconfig b/init/Kconfig
index 0e2344389501..c88289c18d59 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -677,6 +677,59 @@ config HAVE_UNSTABLE_SCHED_CLOCK
677config GENERIC_SCHED_CLOCK 677config GENERIC_SCHED_CLOCK
678 bool 678 bool
679 679
680menu "Scheduler features"
681
682config UCLAMP_TASK
683 bool "Enable utilization clamping for RT/FAIR tasks"
684 depends on CPU_FREQ_GOV_SCHEDUTIL
685 help
686 This feature enables the scheduler to track the clamped utilization
687 of each CPU based on RUNNABLE tasks scheduled on that CPU.
688
689 With this option, the user can specify the min and max CPU
690 utilization allowed for RUNNABLE tasks. The max utilization defines
691 the maximum frequency a task should use while the min utilization
692 defines the minimum frequency it should use.
693
694 Both min and max utilization clamp values are hints to the scheduler,
695 aiming at improving its frequency selection policy, but they do not
696 enforce or grant any specific bandwidth for tasks.
697
698 If in doubt, say N.
699
700config UCLAMP_BUCKETS_COUNT
701 int "Number of supported utilization clamp buckets"
702 range 5 20
703 default 5
704 depends on UCLAMP_TASK
705 help
706 Defines the number of clamp buckets to use. The range of each bucket
707 will be SCHED_CAPACITY_SCALE/UCLAMP_BUCKETS_COUNT. The higher the
708 number of clamp buckets the finer their granularity and the higher
709 the precision of clamping aggregation and tracking at run-time.
710
711 For example, with the minimum configuration value we will have 5
712 clamp buckets tracking 20% utilization each. A 25% boosted tasks will
713 be refcounted in the [20..39]% bucket and will set the bucket clamp
714 effective value to 25%.
715 If a second 30% boosted task should be co-scheduled on the same CPU,
716 that task will be refcounted in the same bucket of the first task and
717 it will boost the bucket clamp effective value to 30%.
718 The clamp effective value of a bucket is reset to its nominal value
719 (20% in the example above) when there are no more tasks refcounted in
720 that bucket.
721
722 An additional boost/capping margin can be added to some tasks. In the
723 example above the 25% task will be boosted to 30% until it exits the
724 CPU. If that should be considered not acceptable on certain systems,
725 it's always possible to reduce the margin by increasing the number of
726 clamp buckets to trade off used memory for run-time tracking
727 precision.
728
729 If in doubt, use the default value.
730
731endmenu
732
680# 733#
681# For architectures that want to enable the support for NUMA-affine scheduler 734# For architectures that want to enable the support for NUMA-affine scheduler
682# balancing logic: 735# balancing logic:
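The bucket arithmetic described in the UCLAMP_BUCKETS_COUNT help text can be checked with a small user-space sketch that mirrors uclamp_bucket_id() from kernel/sched/core.c further down; the 5-bucket configuration and the 256 (25%) boost value are taken from the example in the help text and are illustrative only.

/*
 * Sketch of the clamp-bucket arithmetic from the help text above, assuming
 * the default CONFIG_UCLAMP_BUCKETS_COUNT=5. Mirrors uclamp_bucket_id().
 */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define UCLAMP_BUCKETS		5
/* DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS) == 205 */
#define UCLAMP_BUCKET_DELTA	((SCHED_CAPACITY_SCALE + UCLAMP_BUCKETS / 2) / UCLAMP_BUCKETS)

static unsigned int uclamp_bucket_id(unsigned int clamp_value)
{
	return clamp_value / UCLAMP_BUCKET_DELTA;
}

int main(void)
{
	unsigned int boost = 256;	/* the 25% boosted task from the example */

	/* 256 / 205 == 1: the task is refcounted in the [20..39]% bucket. */
	printf("clamp %u -> bucket %u (bucket width %u)\n",
	       boost, uclamp_bucket_id(boost), UCLAMP_BUCKET_DELTA);
	return 0;
}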
diff --git a/init/init_task.c b/init/init_task.c
index afa6ad795355..7ab773b9b3cd 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -72,7 +72,8 @@ struct task_struct init_task
72 .static_prio = MAX_PRIO - 20, 72 .static_prio = MAX_PRIO - 20,
73 .normal_prio = MAX_PRIO - 20, 73 .normal_prio = MAX_PRIO - 20,
74 .policy = SCHED_NORMAL, 74 .policy = SCHED_NORMAL,
75 .cpus_allowed = CPU_MASK_ALL, 75 .cpus_ptr = &init_task.cpus_mask,
76 .cpus_mask = CPU_MASK_ALL,
76 .nr_cpus_allowed= NR_CPUS, 77 .nr_cpus_allowed= NR_CPUS,
77 .mm = NULL, 78 .mm = NULL,
78 .active_mm = &init_mm, 79 .active_mm = &init_mm,
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 515525ff1cfd..a1590e244f5f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2829,7 +2829,7 @@ static void cpuset_fork(struct task_struct *task)
2829 if (task_css_is_root(task, cpuset_cgrp_id)) 2829 if (task_css_is_root(task, cpuset_cgrp_id))
2830 return; 2830 return;
2831 2831
2832 set_cpus_allowed_ptr(task, &current->cpus_allowed); 2832 set_cpus_allowed_ptr(task, current->cpus_ptr);
2833 task->mems_allowed = current->mems_allowed; 2833 task->mems_allowed = current->mems_allowed;
2834} 2834}
2835 2835
diff --git a/kernel/fork.c b/kernel/fork.c
index d18e343d4aab..847dd147b068 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -898,6 +898,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
898#ifdef CONFIG_STACKPROTECTOR 898#ifdef CONFIG_STACKPROTECTOR
899 tsk->stack_canary = get_random_canary(); 899 tsk->stack_canary = get_random_canary();
900#endif 900#endif
901 if (orig->cpus_ptr == &orig->cpus_mask)
902 tsk->cpus_ptr = &tsk->cpus_mask;
901 903
902 /* 904 /*
903 * One for us, one for whoever does the "release_task()" (usually 905 * One for us, one for whoever does the "release_task()" (usually
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 7d66ee68aaaf..0a9326f5f421 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -223,7 +223,7 @@ int em_register_perf_domain(cpumask_t *span, unsigned int nr_states,
223 * All CPUs of a domain must have the same micro-architecture 223 * All CPUs of a domain must have the same micro-architecture
224 * since they all share the same table. 224 * since they all share the same table.
225 */ 225 */
226 cap = arch_scale_cpu_capacity(NULL, cpu); 226 cap = arch_scale_cpu_capacity(cpu);
227 if (prev_cap && prev_cap != cap) { 227 if (prev_cap && prev_cap != cap) {
228 pr_err("CPUs of %*pbl must have the same capacity\n", 228 pr_err("CPUs of %*pbl must have the same capacity\n",
229 cpumask_pr_args(span)); 229 cpumask_pr_args(span));
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2d4ff5353ded..2067080bb235 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -259,7 +259,6 @@ out:
259} 259}
260#endif /* CONFIG_PROC_FS */ 260#endif /* CONFIG_PROC_FS */
261 261
262#ifdef CONFIG_SCHED_DEBUG
263int autogroup_path(struct task_group *tg, char *buf, int buflen) 262int autogroup_path(struct task_group *tg, char *buf, int buflen)
264{ 263{
265 if (!task_group_is_autogroup(tg)) 264 if (!task_group_is_autogroup(tg))
@@ -267,4 +266,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
267 266
268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 267 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
269} 268}
270#endif
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 874c427742a9..fa43ce3962e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -23,6 +23,17 @@
23#define CREATE_TRACE_POINTS 23#define CREATE_TRACE_POINTS
24#include <trace/events/sched.h> 24#include <trace/events/sched.h>
25 25
26/*
27 * Export tracepoints that act as a bare tracehook (ie: have no trace event
28 * associated with them) to allow external modules to probe them.
29 */
30EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
31EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
32EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
33EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
34EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
35EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
36
26DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 37DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
27 38
28#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL) 39#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_JUMP_LABEL)
@@ -761,6 +772,401 @@ static void set_load_weight(struct task_struct *p, bool update_load)
761 } 772 }
762} 773}
763 774
775#ifdef CONFIG_UCLAMP_TASK
776/* Max allowed minimum utilization */
777unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
778
779/* Max allowed maximum utilization */
780unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
781
782/* All clamps are required to be less or equal than these values */
783static struct uclamp_se uclamp_default[UCLAMP_CNT];
784
785/* Integer rounded range for each bucket */
786#define UCLAMP_BUCKET_DELTA DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
787
788#define for_each_clamp_id(clamp_id) \
789 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
790
791static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
792{
793 return clamp_value / UCLAMP_BUCKET_DELTA;
794}
795
796static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
797{
798 return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
799}
800
801static inline unsigned int uclamp_none(int clamp_id)
802{
803 if (clamp_id == UCLAMP_MIN)
804 return 0;
805 return SCHED_CAPACITY_SCALE;
806}
807
808static inline void uclamp_se_set(struct uclamp_se *uc_se,
809 unsigned int value, bool user_defined)
810{
811 uc_se->value = value;
812 uc_se->bucket_id = uclamp_bucket_id(value);
813 uc_se->user_defined = user_defined;
814}
815
816static inline unsigned int
817uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
818 unsigned int clamp_value)
819{
820 /*
821 * Avoid blocked utilization pushing up the frequency when we go
822 * idle (which drops the max-clamp) by retaining the last known
823 * max-clamp.
824 */
825 if (clamp_id == UCLAMP_MAX) {
826 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
827 return clamp_value;
828 }
829
830 return uclamp_none(UCLAMP_MIN);
831}
832
833static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
834 unsigned int clamp_value)
835{
836 /* Reset max-clamp retention only on idle exit */
837 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE))
838 return;
839
840 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
841}
842
843static inline
844unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
845 unsigned int clamp_value)
846{
847 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
848 int bucket_id = UCLAMP_BUCKETS - 1;
849
850 /*
851 * Since both min and max clamps are max aggregated, find the
852 * top most bucket with tasks in.
853 */
854 for ( ; bucket_id >= 0; bucket_id--) {
855 if (!bucket[bucket_id].tasks)
856 continue;
857 return bucket[bucket_id].value;
858 }
859
860 /* No tasks -- default clamp values */
861 return uclamp_idle_value(rq, clamp_id, clamp_value);
862}
863
864/*
865 * The effective clamp bucket index of a task depends on, by increasing
866 * priority:
867 * - the task specific clamp value, when explicitly requested from userspace
868 * - the system default clamp value, defined by the sysadmin
869 */
870static inline struct uclamp_se
871uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
872{
873 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
874 struct uclamp_se uc_max = uclamp_default[clamp_id];
875
876 /* System default restrictions always apply */
877 if (unlikely(uc_req.value > uc_max.value))
878 return uc_max;
879
880 return uc_req;
881}
882
883unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
884{
885 struct uclamp_se uc_eff;
886
887 /* Task currently refcounted: use back-annotated (effective) value */
888 if (p->uclamp[clamp_id].active)
889 return p->uclamp[clamp_id].value;
890
891 uc_eff = uclamp_eff_get(p, clamp_id);
892
893 return uc_eff.value;
894}
895
896/*
897 * When a task is enqueued on a rq, the clamp bucket currently defined by the
898 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
899 * updates the rq's clamp value if required.
900 *
901 * Tasks can have a task-specific value requested from user-space, track
902 * within each bucket the maximum value for tasks refcounted in it.
903 * This "local max aggregation" allows to track the exact "requested" value
904 * for each bucket when all its RUNNABLE tasks require the same clamp.
905 */
906static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
907 unsigned int clamp_id)
908{
909 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
910 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
911 struct uclamp_bucket *bucket;
912
913 lockdep_assert_held(&rq->lock);
914
915 /* Update task effective clamp */
916 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
917
918 bucket = &uc_rq->bucket[uc_se->bucket_id];
919 bucket->tasks++;
920 uc_se->active = true;
921
922 uclamp_idle_reset(rq, clamp_id, uc_se->value);
923
924 /*
925 * Local max aggregation: rq buckets always track the max
926 * "requested" clamp value of its RUNNABLE tasks.
927 */
928 if (bucket->tasks == 1 || uc_se->value > bucket->value)
929 bucket->value = uc_se->value;
930
931 if (uc_se->value > READ_ONCE(uc_rq->value))
932 WRITE_ONCE(uc_rq->value, uc_se->value);
933}
934
935/*
936 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
937 * is released. If this is the last task reference counting the rq's max
938 * active clamp value, then the rq's clamp value is updated.
939 *
940 * Both refcounted tasks and rq's cached clamp values are expected to be
941 * always valid. If it's detected they are not, as defensive programming,
942 * enforce the expected state and warn.
943 */
944static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
945 unsigned int clamp_id)
946{
947 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
948 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
949 struct uclamp_bucket *bucket;
950 unsigned int bkt_clamp;
951 unsigned int rq_clamp;
952
953 lockdep_assert_held(&rq->lock);
954
955 bucket = &uc_rq->bucket[uc_se->bucket_id];
956 SCHED_WARN_ON(!bucket->tasks);
957 if (likely(bucket->tasks))
958 bucket->tasks--;
959 uc_se->active = false;
960
961 /*
962 * Keep "local max aggregation" simple and accept to (possibly)
963 * overboost some RUNNABLE tasks in the same bucket.
964 * The rq clamp bucket value is reset to its base value whenever
965 * there are no more RUNNABLE tasks refcounting it.
966 */
967 if (likely(bucket->tasks))
968 return;
969
970 rq_clamp = READ_ONCE(uc_rq->value);
971 /*
972 * Defensive programming: this should never happen. If it happens,
973 * e.g. due to future modification, warn and fixup the expected value.
974 */
975 SCHED_WARN_ON(bucket->value > rq_clamp);
976 if (bucket->value >= rq_clamp) {
977 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
978 WRITE_ONCE(uc_rq->value, bkt_clamp);
979 }
980}
981
982static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
983{
984 unsigned int clamp_id;
985
986 if (unlikely(!p->sched_class->uclamp_enabled))
987 return;
988
989 for_each_clamp_id(clamp_id)
990 uclamp_rq_inc_id(rq, p, clamp_id);
991
992 /* Reset clamp idle holding when there is one RUNNABLE task */
993 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
994 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
995}
996
997static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
998{
999 unsigned int clamp_id;
1000
1001 if (unlikely(!p->sched_class->uclamp_enabled))
1002 return;
1003
1004 for_each_clamp_id(clamp_id)
1005 uclamp_rq_dec_id(rq, p, clamp_id);
1006}
1007
1008int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1009 void __user *buffer, size_t *lenp,
1010 loff_t *ppos)
1011{
1012 int old_min, old_max;
1013 static DEFINE_MUTEX(mutex);
1014 int result;
1015
1016 mutex_lock(&mutex);
1017 old_min = sysctl_sched_uclamp_util_min;
1018 old_max = sysctl_sched_uclamp_util_max;
1019
1020 result = proc_dointvec(table, write, buffer, lenp, ppos);
1021 if (result)
1022 goto undo;
1023 if (!write)
1024 goto done;
1025
1026 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1027 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE) {
1028 result = -EINVAL;
1029 goto undo;
1030 }
1031
1032 if (old_min != sysctl_sched_uclamp_util_min) {
1033 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1034 sysctl_sched_uclamp_util_min, false);
1035 }
1036 if (old_max != sysctl_sched_uclamp_util_max) {
1037 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1038 sysctl_sched_uclamp_util_max, false);
1039 }
1040
1041 /*
1042 * Updating all the RUNNABLE task is expensive, keep it simple and do
1043 * just a lazy update at each next enqueue time.
1044 */
1045 goto done;
1046
1047undo:
1048 sysctl_sched_uclamp_util_min = old_min;
1049 sysctl_sched_uclamp_util_max = old_max;
1050done:
1051 mutex_unlock(&mutex);
1052
1053 return result;
1054}
1055
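
For completeness, a rough sketch of driving these system-wide defaults from userspace. The procfs paths are assumed here (the sysctl table entries backing sysctl_sched_uclamp_util_min/max live in kernel/sysctl.c, outside this hunk), so treat the names as illustrative rather than authoritative.

        /* Sketch: adjust the system-default clamps (paths assumed, see above). */
        #include <stdio.h>

        static int write_sysctl(const char *path, unsigned int value)
        {
                FILE *f = fopen(path, "w");

                if (!f)
                        return -1;
                fprintf(f, "%u\n", value);
                return fclose(f);
        }

        int main(void)
        {
                /* Must hold: util_min <= util_max <= SCHED_CAPACITY_SCALE (1024). */
                write_sysctl("/proc/sys/kernel/sched_util_clamp_min", 128);
                write_sysctl("/proc/sys/kernel/sched_util_clamp_max", 1024);
                return 0;
        }

As the comment in the handler notes, a successful write does not walk already RUNNABLE tasks; the new defaults are picked up lazily at each task's next enqueue.
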
1056static int uclamp_validate(struct task_struct *p,
1057 const struct sched_attr *attr)
1058{
1059 unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1060 unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1061
1062 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
1063 lower_bound = attr->sched_util_min;
1064 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
1065 upper_bound = attr->sched_util_max;
1066
1067 if (lower_bound > upper_bound)
1068 return -EINVAL;
1069 if (upper_bound > SCHED_CAPACITY_SCALE)
1070 return -EINVAL;
1071
1072 return 0;
1073}
1074
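
uclamp_validate() is the first thing a sched_setattr() caller hits, so a hedged userspace sketch may help. glibc provides no sched_setattr() wrapper, and the struct layout and flag values below merely mirror the uapi additions of this series (include/uapi/linux/sched.h); they are restated for illustration and should be double-checked against the headers actually installed.

        /* Sketch: request a per-task utilization boost via sched_setattr(). */
        #define _GNU_SOURCE
        #include <stdint.h>
        #include <stdio.h>
        #include <unistd.h>
        #include <sys/syscall.h>

        /* Mirrors the uapi layout extended by this series (SCHED_ATTR_SIZE_VER1). */
        struct sched_attr {
                uint32_t size;
                uint32_t sched_policy;
                uint64_t sched_flags;
                int32_t  sched_nice;
                uint32_t sched_priority;
                uint64_t sched_runtime;
                uint64_t sched_deadline;
                uint64_t sched_period;
                uint32_t sched_util_min;        /* new in VER1 */
                uint32_t sched_util_max;        /* new in VER1 */
        };

        #define SCHED_FLAG_KEEP_POLICY          0x08
        #define SCHED_FLAG_KEEP_PARAMS          0x10
        #define SCHED_FLAG_UTIL_CLAMP_MIN       0x20
        #define SCHED_FLAG_UTIL_CLAMP_MAX       0x40

        int main(void)
        {
                struct sched_attr attr = {
                        .size           = sizeof(attr),
                        /* Only touch the clamps; keep policy and params as they are. */
                        .sched_flags    = SCHED_FLAG_KEEP_POLICY |
                                          SCHED_FLAG_KEEP_PARAMS |
                                          SCHED_FLAG_UTIL_CLAMP_MIN |
                                          SCHED_FLAG_UTIL_CLAMP_MAX,
                        .sched_util_min = 512,  /* boost: behave as if >= 50% busy */
                        .sched_util_max = 1024, /* no cap */
                };

                /* pid 0 == current task; the final argument is unused and must be 0. */
                if (syscall(SYS_sched_setattr, 0, &attr, 0))
                        perror("sched_setattr");
                return 0;
        }

With SCHED_FLAG_KEEP_POLICY and SCHED_FLAG_KEEP_PARAMS also set (both handled later in this patch), such a call changes only the clamps, and the sched_getattr() hunk further down reports the requested values back.
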
1075static void __setscheduler_uclamp(struct task_struct *p,
1076 const struct sched_attr *attr)
1077{
1078 unsigned int clamp_id;
1079
1080 /*
1081 * On scheduling class change, reset to default clamps for tasks
1082 * without a task-specific value.
1083 */
1084 for_each_clamp_id(clamp_id) {
1085 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1086 unsigned int clamp_value = uclamp_none(clamp_id);
1087
1088 /* Keep using defined clamps across class changes */
1089 if (uc_se->user_defined)
1090 continue;
1091
1092 /* By default, RT tasks always get 100% boost */
1093 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1094 clamp_value = uclamp_none(UCLAMP_MAX);
1095
1096 uclamp_se_set(uc_se, clamp_value, false);
1097 }
1098
1099 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
1100 return;
1101
1102 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1103 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
1104 attr->sched_util_min, true);
1105 }
1106
1107 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1108 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
1109 attr->sched_util_max, true);
1110 }
1111}
1112
1113static void uclamp_fork(struct task_struct *p)
1114{
1115 unsigned int clamp_id;
1116
1117 for_each_clamp_id(clamp_id)
1118 p->uclamp[clamp_id].active = false;
1119
1120 if (likely(!p->sched_reset_on_fork))
1121 return;
1122
1123 for_each_clamp_id(clamp_id) {
1124 unsigned int clamp_value = uclamp_none(clamp_id);
1125
1126 /* By default, RT tasks always get 100% boost */
1127 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
1128 clamp_value = uclamp_none(UCLAMP_MAX);
1129
1130 uclamp_se_set(&p->uclamp_req[clamp_id], clamp_value, false);
1131 }
1132}
1133
1134static void __init init_uclamp(void)
1135{
1136 struct uclamp_se uc_max = {};
1137 unsigned int clamp_id;
1138 int cpu;
1139
1140 for_each_possible_cpu(cpu) {
1141 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
1142 cpu_rq(cpu)->uclamp_flags = 0;
1143 }
1144
1145 for_each_clamp_id(clamp_id) {
1146 uclamp_se_set(&init_task.uclamp_req[clamp_id],
1147 uclamp_none(clamp_id), false);
1148 }
1149
1150 /* System defaults allow max clamp values for both indexes */
1151 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1152 for_each_clamp_id(clamp_id)
1153 uclamp_default[clamp_id] = uc_max;
1154}
1155
1156#else /* CONFIG_UCLAMP_TASK */
1157static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) { }
1158static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) { }
1159static inline int uclamp_validate(struct task_struct *p,
1160 const struct sched_attr *attr)
1161{
1162 return -EOPNOTSUPP;
1163}
1164static void __setscheduler_uclamp(struct task_struct *p,
1165 const struct sched_attr *attr) { }
1166static inline void uclamp_fork(struct task_struct *p) { }
1167static inline void init_uclamp(void) { }
1168#endif /* CONFIG_UCLAMP_TASK */
1169
764static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 1170static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
765{ 1171{
766 if (!(flags & ENQUEUE_NOCLOCK)) 1172 if (!(flags & ENQUEUE_NOCLOCK))
@@ -771,6 +1177,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
771 psi_enqueue(p, flags & ENQUEUE_WAKEUP); 1177 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
772 } 1178 }
773 1179
1180 uclamp_rq_inc(rq, p);
774 p->sched_class->enqueue_task(rq, p, flags); 1181 p->sched_class->enqueue_task(rq, p, flags);
775} 1182}
776 1183
@@ -784,6 +1191,7 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
784 psi_dequeue(p, flags & DEQUEUE_SLEEP); 1191 psi_dequeue(p, flags & DEQUEUE_SLEEP);
785 } 1192 }
786 1193
1194 uclamp_rq_dec(rq, p);
787 p->sched_class->dequeue_task(rq, p, flags); 1195 p->sched_class->dequeue_task(rq, p, flags);
788} 1196}
789 1197
@@ -930,7 +1338,7 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
930 */ 1338 */
931static inline bool is_cpu_allowed(struct task_struct *p, int cpu) 1339static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
932{ 1340{
933 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 1341 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
934 return false; 1342 return false;
935 1343
936 if (is_per_cpu_kthread(p)) 1344 if (is_per_cpu_kthread(p))
@@ -1025,7 +1433,7 @@ static int migration_cpu_stop(void *data)
1025 local_irq_disable(); 1433 local_irq_disable();
1026 /* 1434 /*
1027 * We need to explicitly wake pending tasks before running 1435 * We need to explicitly wake pending tasks before running
1028 * __migrate_task() such that we will not miss enforcing cpus_allowed 1436 * __migrate_task() such that we will not miss enforcing cpus_ptr
1029 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 1437 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
1030 */ 1438 */
1031 sched_ttwu_pending(); 1439 sched_ttwu_pending();
@@ -1056,7 +1464,7 @@ static int migration_cpu_stop(void *data)
1056 */ 1464 */
1057void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) 1465void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1058{ 1466{
1059 cpumask_copy(&p->cpus_allowed, new_mask); 1467 cpumask_copy(&p->cpus_mask, new_mask);
1060 p->nr_cpus_allowed = cpumask_weight(new_mask); 1468 p->nr_cpus_allowed = cpumask_weight(new_mask);
1061} 1469}
1062 1470
@@ -1126,7 +1534,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
1126 goto out; 1534 goto out;
1127 } 1535 }
1128 1536
1129 if (cpumask_equal(&p->cpus_allowed, new_mask)) 1537 if (cpumask_equal(p->cpus_ptr, new_mask))
1130 goto out; 1538 goto out;
1131 1539
1132 if (!cpumask_intersects(new_mask, cpu_valid_mask)) { 1540 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -1286,10 +1694,10 @@ static int migrate_swap_stop(void *data)
1286 if (task_cpu(arg->src_task) != arg->src_cpu) 1694 if (task_cpu(arg->src_task) != arg->src_cpu)
1287 goto unlock; 1695 goto unlock;
1288 1696
1289 if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed)) 1697 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr))
1290 goto unlock; 1698 goto unlock;
1291 1699
1292 if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed)) 1700 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr))
1293 goto unlock; 1701 goto unlock;
1294 1702
1295 __migrate_swap_task(arg->src_task, arg->dst_cpu); 1703 __migrate_swap_task(arg->src_task, arg->dst_cpu);
@@ -1331,10 +1739,10 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
1331 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) 1739 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1332 goto out; 1740 goto out;
1333 1741
1334 if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed)) 1742 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr))
1335 goto out; 1743 goto out;
1336 1744
1337 if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed)) 1745 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr))
1338 goto out; 1746 goto out;
1339 1747
1340 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu); 1748 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
@@ -1479,7 +1887,7 @@ void kick_process(struct task_struct *p)
1479EXPORT_SYMBOL_GPL(kick_process); 1887EXPORT_SYMBOL_GPL(kick_process);
1480 1888
1481/* 1889/*
1482 * ->cpus_allowed is protected by both rq->lock and p->pi_lock 1890 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
1483 * 1891 *
1484 * A few notes on cpu_active vs cpu_online: 1892 * A few notes on cpu_active vs cpu_online:
1485 * 1893 *
@@ -1519,14 +1927,14 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
1519 for_each_cpu(dest_cpu, nodemask) { 1927 for_each_cpu(dest_cpu, nodemask) {
1520 if (!cpu_active(dest_cpu)) 1928 if (!cpu_active(dest_cpu))
1521 continue; 1929 continue;
1522 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 1930 if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
1523 return dest_cpu; 1931 return dest_cpu;
1524 } 1932 }
1525 } 1933 }
1526 1934
1527 for (;;) { 1935 for (;;) {
1528 /* Any allowed, online CPU? */ 1936 /* Any allowed, online CPU? */
1529 for_each_cpu(dest_cpu, &p->cpus_allowed) { 1937 for_each_cpu(dest_cpu, p->cpus_ptr) {
1530 if (!is_cpu_allowed(p, dest_cpu)) 1938 if (!is_cpu_allowed(p, dest_cpu))
1531 continue; 1939 continue;
1532 1940
@@ -1570,7 +1978,7 @@ out:
1570} 1978}
1571 1979
1572/* 1980/*
1573 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1981 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
1574 */ 1982 */
1575static inline 1983static inline
1576int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) 1984int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
@@ -1580,11 +1988,11 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1580 if (p->nr_cpus_allowed > 1) 1988 if (p->nr_cpus_allowed > 1)
1581 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); 1989 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1582 else 1990 else
1583 cpu = cpumask_any(&p->cpus_allowed); 1991 cpu = cpumask_any(p->cpus_ptr);
1584 1992
1585 /* 1993 /*
1586 * In order not to call set_task_cpu() on a blocking task we need 1994 * In order not to call set_task_cpu() on a blocking task we need
1587 * to rely on ttwu() to place the task on a valid ->cpus_allowed 1995 * to rely on ttwu() to place the task on a valid ->cpus_ptr
1588 * CPU. 1996 * CPU.
1589 * 1997 *
1590 * Since this is common to all placement strategies, this lives here. 1998 * Since this is common to all placement strategies, this lives here.
@@ -1991,6 +2399,29 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1991 unsigned long flags; 2399 unsigned long flags;
1992 int cpu, success = 0; 2400 int cpu, success = 0;
1993 2401
2402 if (p == current) {
2403 /*
2404 * We're waking current, which means 'p->on_rq' and 'task_cpu(p)
2405 * == smp_processor_id()'. Together this means we can special
2406 * case the whole 'p->on_rq && ttwu_remote()' case below
2407 * without taking any locks.
2408 *
2409 * In particular:
2410 * - we rely on Program-Order guarantees for all the ordering,
2411 * - we're serialized against set_special_state() by virtue of
2412 * it disabling IRQs (this allows not taking ->pi_lock).
2413 */
2414 if (!(p->state & state))
2415 return false;
2416
2417 success = 1;
2418 cpu = task_cpu(p);
2419 trace_sched_waking(p);
2420 p->state = TASK_RUNNING;
2421 trace_sched_wakeup(p);
2422 goto out;
2423 }
2424
1994 /* 2425 /*
1995 * If we are going to wake up a thread waiting for CONDITION we 2426 * If we are going to wake up a thread waiting for CONDITION we
1996 * need to ensure that CONDITION=1 done by the caller can not be 2427 * need to ensure that CONDITION=1 done by the caller can not be
@@ -2000,7 +2431,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2000 raw_spin_lock_irqsave(&p->pi_lock, flags); 2431 raw_spin_lock_irqsave(&p->pi_lock, flags);
2001 smp_mb__after_spinlock(); 2432 smp_mb__after_spinlock();
2002 if (!(p->state & state)) 2433 if (!(p->state & state))
2003 goto out; 2434 goto unlock;
2004 2435
2005 trace_sched_waking(p); 2436 trace_sched_waking(p);
2006 2437
@@ -2030,7 +2461,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2030 */ 2461 */
2031 smp_rmb(); 2462 smp_rmb();
2032 if (p->on_rq && ttwu_remote(p, wake_flags)) 2463 if (p->on_rq && ttwu_remote(p, wake_flags))
2033 goto stat; 2464 goto unlock;
2034 2465
2035#ifdef CONFIG_SMP 2466#ifdef CONFIG_SMP
2036 /* 2467 /*
@@ -2090,10 +2521,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2090#endif /* CONFIG_SMP */ 2521#endif /* CONFIG_SMP */
2091 2522
2092 ttwu_queue(p, cpu, wake_flags); 2523 ttwu_queue(p, cpu, wake_flags);
2093stat: 2524unlock:
2094 ttwu_stat(p, cpu, wake_flags);
2095out:
2096 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2525 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2526out:
2527 if (success)
2528 ttwu_stat(p, cpu, wake_flags);
2097 2529
2098 return success; 2530 return success;
2099} 2531}
@@ -2300,6 +2732,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2300 */ 2732 */
2301 p->prio = current->normal_prio; 2733 p->prio = current->normal_prio;
2302 2734
2735 uclamp_fork(p);
2736
2303 /* 2737 /*
2304 * Revert to default priority/policy on fork if requested. 2738 * Revert to default priority/policy on fork if requested.
2305 */ 2739 */
@@ -2395,7 +2829,7 @@ void wake_up_new_task(struct task_struct *p)
2395#ifdef CONFIG_SMP 2829#ifdef CONFIG_SMP
2396 /* 2830 /*
2397 * Fork balancing, do it here and not earlier because: 2831 * Fork balancing, do it here and not earlier because:
2398 * - cpus_allowed can change in the fork path 2832 * - cpus_ptr can change in the fork path
2399 * - any previously selected CPU might disappear through hotplug 2833 * - any previously selected CPU might disappear through hotplug
2400 * 2834 *
2401 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, 2835 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
@@ -3033,7 +3467,6 @@ void scheduler_tick(void)
3033 3467
3034 update_rq_clock(rq); 3468 update_rq_clock(rq);
3035 curr->sched_class->task_tick(rq, curr, 0); 3469 curr->sched_class->task_tick(rq, curr, 0);
3036 cpu_load_update_active(rq);
3037 calc_global_load_tick(rq); 3470 calc_global_load_tick(rq);
3038 psi_task_tick(rq); 3471 psi_task_tick(rq);
3039 3472
@@ -4071,6 +4504,13 @@ static void __setscheduler_params(struct task_struct *p,
4071static void __setscheduler(struct rq *rq, struct task_struct *p, 4504static void __setscheduler(struct rq *rq, struct task_struct *p,
4072 const struct sched_attr *attr, bool keep_boost) 4505 const struct sched_attr *attr, bool keep_boost)
4073{ 4506{
4507 /*
4508 * If params can't change, scheduling class changes aren't allowed
4509 * either.
4510 */
4511 if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)
4512 return;
4513
4074 __setscheduler_params(p, attr); 4514 __setscheduler_params(p, attr);
4075 4515
4076 /* 4516 /*
@@ -4208,6 +4648,13 @@ recheck:
4208 return retval; 4648 return retval;
4209 } 4649 }
4210 4650
4651 /* Update task specific "requested" clamps */
4652 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
4653 retval = uclamp_validate(p, attr);
4654 if (retval)
4655 return retval;
4656 }
4657
4211 /* 4658 /*
4212 * Make sure no PI-waiters arrive (or leave) while we are 4659 * Make sure no PI-waiters arrive (or leave) while we are
4213 * changing the priority of the task: 4660 * changing the priority of the task:
@@ -4237,6 +4684,8 @@ recheck:
4237 goto change; 4684 goto change;
4238 if (dl_policy(policy) && dl_param_changed(p, attr)) 4685 if (dl_policy(policy) && dl_param_changed(p, attr))
4239 goto change; 4686 goto change;
4687 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
4688 goto change;
4240 4689
4241 p->sched_reset_on_fork = reset_on_fork; 4690 p->sched_reset_on_fork = reset_on_fork;
4242 task_rq_unlock(rq, p, &rf); 4691 task_rq_unlock(rq, p, &rf);
@@ -4267,7 +4716,7 @@ change:
4267 * the entire root_domain to become SCHED_DEADLINE. We 4716 * the entire root_domain to become SCHED_DEADLINE. We
4268 * will also fail if there's no bandwidth available. 4717 * will also fail if there's no bandwidth available.
4269 */ 4718 */
4270 if (!cpumask_subset(span, &p->cpus_allowed) || 4719 if (!cpumask_subset(span, p->cpus_ptr) ||
4271 rq->rd->dl_bw.bw == 0) { 4720 rq->rd->dl_bw.bw == 0) {
4272 task_rq_unlock(rq, p, &rf); 4721 task_rq_unlock(rq, p, &rf);
4273 return -EPERM; 4722 return -EPERM;
@@ -4317,7 +4766,9 @@ change:
4317 put_prev_task(rq, p); 4766 put_prev_task(rq, p);
4318 4767
4319 prev_class = p->sched_class; 4768 prev_class = p->sched_class;
4769
4320 __setscheduler(rq, p, attr, pi); 4770 __setscheduler(rq, p, attr, pi);
4771 __setscheduler_uclamp(p, attr);
4321 4772
4322 if (queued) { 4773 if (queued) {
4323 /* 4774 /*
@@ -4493,6 +4944,10 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a
4493 if (ret) 4944 if (ret)
4494 return -EFAULT; 4945 return -EFAULT;
4495 4946
4947 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
4948 size < SCHED_ATTR_SIZE_VER1)
4949 return -EINVAL;
4950
4496 /* 4951 /*
4497 * XXX: Do we want to be lenient like existing syscalls; or do we want 4952 * XXX: Do we want to be lenient like existing syscalls; or do we want
4498 * to be strict and return an error on out-of-bounds values? 4953 * to be strict and return an error on out-of-bounds values?
@@ -4556,14 +5011,21 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4556 5011
4557 if ((int)attr.sched_policy < 0) 5012 if ((int)attr.sched_policy < 0)
4558 return -EINVAL; 5013 return -EINVAL;
5014 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
5015 attr.sched_policy = SETPARAM_POLICY;
4559 5016
4560 rcu_read_lock(); 5017 rcu_read_lock();
4561 retval = -ESRCH; 5018 retval = -ESRCH;
4562 p = find_process_by_pid(pid); 5019 p = find_process_by_pid(pid);
4563 if (p != NULL) 5020 if (likely(p))
4564 retval = sched_setattr(p, &attr); 5021 get_task_struct(p);
4565 rcu_read_unlock(); 5022 rcu_read_unlock();
4566 5023
5024 if (likely(p)) {
5025 retval = sched_setattr(p, &attr);
5026 put_task_struct(p);
5027 }
5028
4567 return retval; 5029 return retval;
4568} 5030}
4569 5031
@@ -4714,6 +5176,11 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4714 else 5176 else
4715 attr.sched_nice = task_nice(p); 5177 attr.sched_nice = task_nice(p);
4716 5178
5179#ifdef CONFIG_UCLAMP_TASK
5180 attr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
5181 attr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
5182#endif
5183
4717 rcu_read_unlock(); 5184 rcu_read_unlock();
4718 5185
4719 retval = sched_read_attr(uattr, &attr, size); 5186 retval = sched_read_attr(uattr, &attr, size);
@@ -4866,7 +5333,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
4866 goto out_unlock; 5333 goto out_unlock;
4867 5334
4868 raw_spin_lock_irqsave(&p->pi_lock, flags); 5335 raw_spin_lock_irqsave(&p->pi_lock, flags);
4869 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); 5336 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
4870 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5337 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4871 5338
4872out_unlock: 5339out_unlock:
@@ -5123,7 +5590,7 @@ long __sched io_schedule_timeout(long timeout)
5123} 5590}
5124EXPORT_SYMBOL(io_schedule_timeout); 5591EXPORT_SYMBOL(io_schedule_timeout);
5125 5592
5126void io_schedule(void) 5593void __sched io_schedule(void)
5127{ 5594{
5128 int token; 5595 int token;
5129 5596
@@ -5443,7 +5910,7 @@ int task_can_attach(struct task_struct *p,
5443 * allowed nodes is unnecessary. Thus, cpusets are not 5910 * allowed nodes is unnecessary. Thus, cpusets are not
5444 * applicable for such threads. This prevents checking for 5911 * applicable for such threads. This prevents checking for
5445 * success of set_cpus_allowed_ptr() on all attached tasks 5912 * success of set_cpus_allowed_ptr() on all attached tasks
5446 * before cpus_allowed may be changed. 5913 * before cpus_mask may be changed.
5447 */ 5914 */
5448 if (p->flags & PF_NO_SETAFFINITY) { 5915 if (p->flags & PF_NO_SETAFFINITY) {
5449 ret = -EINVAL; 5916 ret = -EINVAL;
@@ -5470,7 +5937,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
5470 if (curr_cpu == target_cpu) 5937 if (curr_cpu == target_cpu)
5471 return 0; 5938 return 0;
5472 5939
5473 if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed)) 5940 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr))
5474 return -EINVAL; 5941 return -EINVAL;
5475 5942
5476 /* TODO: This is not properly updating schedstats */ 5943 /* TODO: This is not properly updating schedstats */
@@ -5608,7 +6075,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5608 put_prev_task(rq, next); 6075 put_prev_task(rq, next);
5609 6076
5610 /* 6077 /*
5611 * Rules for changing task_struct::cpus_allowed are holding 6078 * Rules for changing task_struct::cpus_mask are holding
5612 * both pi_lock and rq->lock, such that holding either 6079 * both pi_lock and rq->lock, such that holding either
5613 * stabilizes the mask. 6080 * stabilizes the mask.
5614 * 6081 *
@@ -5902,8 +6369,8 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
5902 6369
5903void __init sched_init(void) 6370void __init sched_init(void)
5904{ 6371{
5905 int i, j;
5906 unsigned long alloc_size = 0, ptr; 6372 unsigned long alloc_size = 0, ptr;
6373 int i;
5907 6374
5908 wait_bit_init(); 6375 wait_bit_init();
5909 6376
@@ -6005,10 +6472,6 @@ void __init sched_init(void)
6005#ifdef CONFIG_RT_GROUP_SCHED 6472#ifdef CONFIG_RT_GROUP_SCHED
6006 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); 6473 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
6007#endif 6474#endif
6008
6009 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6010 rq->cpu_load[j] = 0;
6011
6012#ifdef CONFIG_SMP 6475#ifdef CONFIG_SMP
6013 rq->sd = NULL; 6476 rq->sd = NULL;
6014 rq->rd = NULL; 6477 rq->rd = NULL;
@@ -6063,6 +6526,8 @@ void __init sched_init(void)
6063 6526
6064 psi_init(); 6527 psi_init();
6065 6528
6529 init_uclamp();
6530
6066 scheduler_running = 1; 6531 scheduler_running = 1;
6067} 6532}
6068 6533
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index ec4e4a9aab5f..5cc4012572ec 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -120,14 +120,14 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
120 const struct sched_dl_entity *dl_se = &p->dl; 120 const struct sched_dl_entity *dl_se = &p->dl;
121 121
122 if (later_mask && 122 if (later_mask &&
123 cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) { 123 cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
124 return 1; 124 return 1;
125 } else { 125 } else {
126 int best_cpu = cpudl_maximum(cp); 126 int best_cpu = cpudl_maximum(cp);
127 127
128 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 128 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
129 129
130 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && 130 if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
131 dl_time_before(dl_se->deadline, cp->elements[0].dl)) { 131 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
132 if (later_mask) 132 if (later_mask)
133 cpumask_set_cpu(best_cpu, later_mask); 133 cpumask_set_cpu(best_cpu, later_mask);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 962cf343f798..636ca6f88c8e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -196,14 +196,17 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
196 * based on the task model parameters and gives the minimal utilization 196 * based on the task model parameters and gives the minimal utilization
197 * required to meet deadlines. 197 * required to meet deadlines.
198 */ 198 */
199unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, 199unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
200 unsigned long max, enum schedutil_type type) 200 unsigned long max, enum schedutil_type type,
201 struct task_struct *p)
201{ 202{
202 unsigned long dl_util, util, irq; 203 unsigned long dl_util, util, irq;
203 struct rq *rq = cpu_rq(cpu); 204 struct rq *rq = cpu_rq(cpu);
204 205
205 if (type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) 206 if (!IS_BUILTIN(CONFIG_UCLAMP_TASK) &&
207 type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) {
206 return max; 208 return max;
209 }
207 210
208 /* 211 /*
209 * Early check to see if IRQ/steal time saturates the CPU, can be 212 * Early check to see if IRQ/steal time saturates the CPU, can be
@@ -219,9 +222,16 @@ unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs,
219 * CFS tasks and we use the same metric to track the effective 222 * CFS tasks and we use the same metric to track the effective
220 * utilization (PELT windows are synchronized) we can directly add them 223 * utilization (PELT windows are synchronized) we can directly add them
221 * to obtain the CPU's actual utilization. 224 * to obtain the CPU's actual utilization.
225 *
226 * CFS and RT utilization can be boosted or capped, depending on
227 * utilization clamp constraints requested by currently RUNNABLE
228 * tasks.
229 * When there are no CFS RUNNABLE tasks, clamps are released and
230 * frequency will be gracefully reduced with the utilization decay.
222 */ 231 */
223 util = util_cfs; 232 util = util_cfs + cpu_util_rt(rq);
224 util += cpu_util_rt(rq); 233 if (type == FREQUENCY_UTIL)
234 util = uclamp_util_with(rq, util, p);
225 235
226 dl_util = cpu_util_dl(rq); 236 dl_util = cpu_util_dl(rq);
227 237
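
The new comment above is the heart of the cpufreq integration: for FREQUENCY_UTIL requests the aggregated CFS+RT utilization is clamped by the RUNNABLE tasks' clamp aggregate before DL and IRQ pressure are factored in. A toy sketch of that clamping step, approximating uclamp_util_with() (defined elsewhere in this series, and also folding in the clamps of the task p being woken) with a plain clamp against the rq-level min/max:

        /* Sketch: effective frequency-request utilization under uclamp (simplified). */
        #include <stdio.h>

        static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
        {
                return v < lo ? lo : (v > hi ? hi : v);
        }

        /* util_cfs + util_rt clamped into the rq's aggregated [min, max] window. */
        static unsigned long freq_util_sketch(unsigned long util_cfs, unsigned long util_rt,
                                              unsigned long rq_min, unsigned long rq_max)
        {
                return clamp_ul(util_cfs + util_rt, rq_min, rq_max);
        }

        int main(void)
        {
                /* Nearly idle CPU, but a RUNNABLE task is boosted to 512: */
                printf("%lu\n", freq_util_sketch(60, 20, 512, 1024));   /* -> 512 */
                /* Busy CPU whose RUNNABLE tasks are all capped at 256: */
                printf("%lu\n", freq_util_sketch(700, 100, 0, 256));    /* -> 256 */
                return 0;
        }
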
@@ -276,12 +286,12 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
276{ 286{
277 struct rq *rq = cpu_rq(sg_cpu->cpu); 287 struct rq *rq = cpu_rq(sg_cpu->cpu);
278 unsigned long util = cpu_util_cfs(rq); 288 unsigned long util = cpu_util_cfs(rq);
279 unsigned long max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); 289 unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);
280 290
281 sg_cpu->max = max; 291 sg_cpu->max = max;
282 sg_cpu->bw_dl = cpu_bw_dl(rq); 292 sg_cpu->bw_dl = cpu_bw_dl(rq);
283 293
284 return schedutil_freq_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL); 294 return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL);
285} 295}
286 296
287/** 297/**
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 9c6480e6d62d..b7abca987d94 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -94,11 +94,11 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
94 if (skip) 94 if (skip)
95 continue; 95 continue;
96 96
97 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 97 if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
98 continue; 98 continue;
99 99
100 if (lowest_mask) { 100 if (lowest_mask) {
101 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 101 cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
102 102
103 /* 103 /*
104 * We have to ensure that we have at least one bit 104 * We have to ensure that we have at least one bit
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 43901fa3f269..8b5bb2ac16e2 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -538,7 +538,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
538 * If we cannot preempt any rq, fall back to pick any 538 * If we cannot preempt any rq, fall back to pick any
539 * online CPU: 539 * online CPU:
540 */ 540 */
541 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 541 cpu = cpumask_any_and(cpu_active_mask, p->cpus_ptr);
542 if (cpu >= nr_cpu_ids) { 542 if (cpu >= nr_cpu_ids) {
543 /* 543 /*
544 * Failed to find any suitable CPU. 544 * Failed to find any suitable CPU.
@@ -1195,7 +1195,7 @@ static void update_curr_dl(struct rq *rq)
1195 &curr->dl); 1195 &curr->dl);
1196 } else { 1196 } else {
1197 unsigned long scale_freq = arch_scale_freq_capacity(cpu); 1197 unsigned long scale_freq = arch_scale_freq_capacity(cpu);
1198 unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); 1198 unsigned long scale_cpu = arch_scale_cpu_capacity(cpu);
1199 1199
1200 scaled_delta_exec = cap_scale(delta_exec, scale_freq); 1200 scaled_delta_exec = cap_scale(delta_exec, scale_freq);
1201 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); 1201 scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu);
@@ -1824,7 +1824,7 @@ static void set_curr_task_dl(struct rq *rq)
1824static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1824static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1825{ 1825{
1826 if (!task_running(rq, p) && 1826 if (!task_running(rq, p) &&
1827 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1827 cpumask_test_cpu(cpu, p->cpus_ptr))
1828 return 1; 1828 return 1;
1829 return 0; 1829 return 0;
1830} 1830}
@@ -1974,7 +1974,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1974 /* Retry if something changed. */ 1974 /* Retry if something changed. */
1975 if (double_lock_balance(rq, later_rq)) { 1975 if (double_lock_balance(rq, later_rq)) {
1976 if (unlikely(task_rq(task) != rq || 1976 if (unlikely(task_rq(task) != rq ||
1977 !cpumask_test_cpu(later_rq->cpu, &task->cpus_allowed) || 1977 !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
1978 task_running(rq, task) || 1978 task_running(rq, task) ||
1979 !dl_task(task) || 1979 !dl_task(task) ||
1980 !task_on_rq_queued(task))) { 1980 !task_on_rq_queued(task))) {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 14c6a8716ba1..f7e4579e746c 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -233,49 +233,35 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
233 *tablep = NULL; 233 *tablep = NULL;
234} 234}
235 235
236static int min_load_idx = 0;
237static int max_load_idx = CPU_LOAD_IDX_MAX-1;
238
239static void 236static void
240set_table_entry(struct ctl_table *entry, 237set_table_entry(struct ctl_table *entry,
241 const char *procname, void *data, int maxlen, 238 const char *procname, void *data, int maxlen,
242 umode_t mode, proc_handler *proc_handler, 239 umode_t mode, proc_handler *proc_handler)
243 bool load_idx)
244{ 240{
245 entry->procname = procname; 241 entry->procname = procname;
246 entry->data = data; 242 entry->data = data;
247 entry->maxlen = maxlen; 243 entry->maxlen = maxlen;
248 entry->mode = mode; 244 entry->mode = mode;
249 entry->proc_handler = proc_handler; 245 entry->proc_handler = proc_handler;
250
251 if (load_idx) {
252 entry->extra1 = &min_load_idx;
253 entry->extra2 = &max_load_idx;
254 }
255} 246}
256 247
257static struct ctl_table * 248static struct ctl_table *
258sd_alloc_ctl_domain_table(struct sched_domain *sd) 249sd_alloc_ctl_domain_table(struct sched_domain *sd)
259{ 250{
260 struct ctl_table *table = sd_alloc_ctl_entry(14); 251 struct ctl_table *table = sd_alloc_ctl_entry(9);
261 252
262 if (table == NULL) 253 if (table == NULL)
263 return NULL; 254 return NULL;
264 255
265 set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); 256 set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
266 set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); 257 set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
267 set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 258 set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
268 set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 259 set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
269 set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 260 set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
270 set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 261 set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax);
271 set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); 262 set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
272 set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); 263 set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
273 set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); 264 /* &table[8] is terminator */
274 set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
275 set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
276 set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
277 set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
278 /* &table[13] is terminator */
279 265
280 return table; 266 return table;
281} 267}
@@ -653,8 +639,6 @@ do { \
653 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 639 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
654 640
655 P(nr_running); 641 P(nr_running);
656 SEQ_printf(m, " .%-30s: %lu\n", "load",
657 rq->load.weight);
658 P(nr_switches); 642 P(nr_switches);
659 P(nr_load_updates); 643 P(nr_load_updates);
660 P(nr_uninterruptible); 644 P(nr_uninterruptible);
@@ -662,11 +646,6 @@ do { \
662 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); 646 SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr)));
663 PN(clock); 647 PN(clock);
664 PN(clock_task); 648 PN(clock_task);
665 P(cpu_load[0]);
666 P(cpu_load[1]);
667 P(cpu_load[2]);
668 P(cpu_load[3]);
669 P(cpu_load[4]);
670#undef P 649#undef P
671#undef PN 650#undef PN
672 651
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8591529e1753..036be95a87e9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -275,6 +275,19 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
275 return grp->my_q; 275 return grp->my_q;
276} 276}
277 277
278static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
279{
280 if (!path)
281 return;
282
283 if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
284 autogroup_path(cfs_rq->tg, path, len);
285 else if (cfs_rq && cfs_rq->tg->css.cgroup)
286 cgroup_path(cfs_rq->tg->css.cgroup, path, len);
287 else
288 strlcpy(path, "(null)", len);
289}
290
278static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 291static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
279{ 292{
280 struct rq *rq = rq_of(cfs_rq); 293 struct rq *rq = rq_of(cfs_rq);
@@ -449,6 +462,12 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
449 return NULL; 462 return NULL;
450} 463}
451 464
465static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
466{
467 if (path)
468 strlcpy(path, "(null)", len);
469}
470
452static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 471static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
453{ 472{
454 return true; 473 return true;
@@ -764,7 +783,7 @@ void post_init_entity_util_avg(struct task_struct *p)
764 struct sched_entity *se = &p->se; 783 struct sched_entity *se = &p->se;
765 struct cfs_rq *cfs_rq = cfs_rq_of(se); 784 struct cfs_rq *cfs_rq = cfs_rq_of(se);
766 struct sched_avg *sa = &se->avg; 785 struct sched_avg *sa = &se->avg;
767 long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); 786 long cpu_scale = arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)));
768 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; 787 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
769 788
770 if (cap > 0) { 789 if (cap > 0) {
@@ -1466,9 +1485,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1466 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; 1485 group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
1467} 1486}
1468 1487
1469static unsigned long weighted_cpuload(struct rq *rq); 1488static unsigned long cpu_runnable_load(struct rq *rq);
1470static unsigned long source_load(int cpu, int type);
1471static unsigned long target_load(int cpu, int type);
1472 1489
1473/* Cached statistics for all CPUs within a node */ 1490/* Cached statistics for all CPUs within a node */
1474struct numa_stats { 1491struct numa_stats {
@@ -1489,7 +1506,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1489 for_each_cpu(cpu, cpumask_of_node(nid)) { 1506 for_each_cpu(cpu, cpumask_of_node(nid)) {
1490 struct rq *rq = cpu_rq(cpu); 1507 struct rq *rq = cpu_rq(cpu);
1491 1508
1492 ns->load += weighted_cpuload(rq); 1509 ns->load += cpu_runnable_load(rq);
1493 ns->compute_capacity += capacity_of(cpu); 1510 ns->compute_capacity += capacity_of(cpu);
1494 } 1511 }
1495 1512
@@ -1621,7 +1638,7 @@ static void task_numa_compare(struct task_numa_env *env,
1621 * be incurred if the tasks were swapped. 1638 * be incurred if the tasks were swapped.
1622 */ 1639 */
1623 /* Skip this swap candidate if cannot move to the source cpu */ 1640 /* Skip this swap candidate if cannot move to the source cpu */
1624 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1641 if (!cpumask_test_cpu(env->src_cpu, cur->cpus_ptr))
1625 goto unlock; 1642 goto unlock;
1626 1643
1627 /* 1644 /*
@@ -1718,7 +1735,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
1718 1735
1719 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 1736 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1720 /* Skip this CPU if the source task cannot migrate */ 1737 /* Skip this CPU if the source task cannot migrate */
1721 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) 1738 if (!cpumask_test_cpu(cpu, env->p->cpus_ptr))
1722 continue; 1739 continue;
1723 1740
1724 env->dst_cpu = cpu; 1741 env->dst_cpu = cpu;
@@ -2686,8 +2703,6 @@ static void
2686account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2703account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2687{ 2704{
2688 update_load_add(&cfs_rq->load, se->load.weight); 2705 update_load_add(&cfs_rq->load, se->load.weight);
2689 if (!parent_entity(se))
2690 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2691#ifdef CONFIG_SMP 2706#ifdef CONFIG_SMP
2692 if (entity_is_task(se)) { 2707 if (entity_is_task(se)) {
2693 struct rq *rq = rq_of(cfs_rq); 2708 struct rq *rq = rq_of(cfs_rq);
@@ -2703,8 +2718,6 @@ static void
2703account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 2718account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2704{ 2719{
2705 update_load_sub(&cfs_rq->load, se->load.weight); 2720 update_load_sub(&cfs_rq->load, se->load.weight);
2706 if (!parent_entity(se))
2707 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2708#ifdef CONFIG_SMP 2721#ifdef CONFIG_SMP
2709 if (entity_is_task(se)) { 2722 if (entity_is_task(se)) {
2710 account_numa_dequeue(rq_of(cfs_rq), task_of(se)); 2723 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
@@ -3334,6 +3347,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
3334 update_tg_cfs_util(cfs_rq, se, gcfs_rq); 3347 update_tg_cfs_util(cfs_rq, se, gcfs_rq);
3335 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq); 3348 update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
3336 3349
3350 trace_pelt_cfs_tp(cfs_rq);
3351 trace_pelt_se_tp(se);
3352
3337 return 1; 3353 return 1;
3338} 3354}
3339 3355
@@ -3486,6 +3502,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3486 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum); 3502 add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
3487 3503
3488 cfs_rq_util_change(cfs_rq, flags); 3504 cfs_rq_util_change(cfs_rq, flags);
3505
3506 trace_pelt_cfs_tp(cfs_rq);
3489} 3507}
3490 3508
3491/** 3509/**
@@ -3505,6 +3523,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
3505 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); 3523 add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
3506 3524
3507 cfs_rq_util_change(cfs_rq, 0); 3525 cfs_rq_util_change(cfs_rq, 0);
3526
3527 trace_pelt_cfs_tp(cfs_rq);
3508} 3528}
3509 3529
3510/* 3530/*
@@ -4100,7 +4120,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
4100 * least twice that of our own weight (i.e. dont track it 4120 * least twice that of our own weight (i.e. dont track it
4101 * when there are only lesser-weight tasks around): 4121 * when there are only lesser-weight tasks around):
4102 */ 4122 */
4103 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 4123 if (schedstat_enabled() &&
4124 rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) {
4104 schedstat_set(se->statistics.slice_max, 4125 schedstat_set(se->statistics.slice_max,
4105 max((u64)schedstat_val(se->statistics.slice_max), 4126 max((u64)schedstat_val(se->statistics.slice_max),
4106 se->sum_exec_runtime - se->prev_sum_exec_runtime)); 4127 se->sum_exec_runtime - se->prev_sum_exec_runtime));
@@ -4734,6 +4755,11 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4734 if (runtime_refresh_within(cfs_b, min_left)) 4755 if (runtime_refresh_within(cfs_b, min_left))
4735 return; 4756 return;
4736 4757
4758 /* don't push forward an existing deferred unthrottle */
4759 if (cfs_b->slack_started)
4760 return;
4761 cfs_b->slack_started = true;
4762
4737 hrtimer_start(&cfs_b->slack_timer, 4763 hrtimer_start(&cfs_b->slack_timer,
4738 ns_to_ktime(cfs_bandwidth_slack_period), 4764 ns_to_ktime(cfs_bandwidth_slack_period),
4739 HRTIMER_MODE_REL); 4765 HRTIMER_MODE_REL);
@@ -4787,6 +4813,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4787 4813
4788 /* confirm we're still not at a refresh boundary */ 4814 /* confirm we're still not at a refresh boundary */
4789 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4815 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4816 cfs_b->slack_started = false;
4790 if (cfs_b->distribute_running) { 4817 if (cfs_b->distribute_running) {
4791 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 4818 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4792 return; 4819 return;
@@ -4950,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4950 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 4977 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4951 cfs_b->slack_timer.function = sched_cfs_slack_timer; 4978 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4952 cfs_b->distribute_running = 0; 4979 cfs_b->distribute_running = 0;
4980 cfs_b->slack_started = false;
4953} 4981}
4954 4982
4955static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) 4983static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -5153,8 +5181,10 @@ static inline bool cpu_overutilized(int cpu)
5153 5181
5154static inline void update_overutilized_status(struct rq *rq) 5182static inline void update_overutilized_status(struct rq *rq)
5155{ 5183{
5156 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) 5184 if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu)) {
5157 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED); 5185 WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
5186 trace_sched_overutilized_tp(rq->rd, SG_OVERUTILIZED);
5187 }
5158} 5188}
5159#else 5189#else
5160static inline void update_overutilized_status(struct rq *rq) { } 5190static inline void update_overutilized_status(struct rq *rq) { }
@@ -5325,71 +5355,6 @@ DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
5325DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); 5355DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5326 5356
5327#ifdef CONFIG_NO_HZ_COMMON 5357#ifdef CONFIG_NO_HZ_COMMON
5328/*
5329 * per rq 'load' arrray crap; XXX kill this.
5330 */
5331
5332/*
5333 * The exact cpuload calculated at every tick would be:
5334 *
5335 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5336 *
5337 * If a CPU misses updates for n ticks (as it was idle) and update gets
5338 * called on the n+1-th tick when CPU may be busy, then we have:
5339 *
5340 * load_n = (1 - 1/2^i)^n * load_0
5341 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
5342 *
5343 * decay_load_missed() below does efficient calculation of
5344 *
5345 * load' = (1 - 1/2^i)^n * load
5346 *
5347 * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors.
5348 * This allows us to precompute the above in said factors, thereby allowing the
5349 * reduction of an arbitrary n in O(log_2 n) steps. (See also
5350 * fixed_power_int())
5351 *
5352 * The calculation is approximated on a 128 point scale.
5353 */
5354#define DEGRADE_SHIFT 7
5355
5356static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
5357static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
5358 { 0, 0, 0, 0, 0, 0, 0, 0 },
5359 { 64, 32, 8, 0, 0, 0, 0, 0 },
5360 { 96, 72, 40, 12, 1, 0, 0, 0 },
5361 { 112, 98, 75, 43, 15, 1, 0, 0 },
5362 { 120, 112, 98, 76, 45, 16, 2, 0 }
5363};
5364
5365/*
5366 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
5367 * would be when CPU is idle and so we just decay the old load without
5368 * adding any new load.
5369 */
5370static unsigned long
5371decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
5372{
5373 int j = 0;
5374
5375 if (!missed_updates)
5376 return load;
5377
5378 if (missed_updates >= degrade_zero_ticks[idx])
5379 return 0;
5380
5381 if (idx == 1)
5382 return load >> missed_updates;
5383
5384 while (missed_updates) {
5385 if (missed_updates % 2)
5386 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5387
5388 missed_updates >>= 1;
5389 j++;
5390 }
5391 return load;
5392}
5393 5358
5394static struct { 5359static struct {
5395 cpumask_var_t idle_cpus_mask; 5360 cpumask_var_t idle_cpus_mask;
@@ -5401,234 +5366,11 @@ static struct {
5401 5366
5402#endif /* CONFIG_NO_HZ_COMMON */ 5367#endif /* CONFIG_NO_HZ_COMMON */
5403 5368
5404/** 5369static unsigned long cpu_runnable_load(struct rq *rq)
5405 * __cpu_load_update - update the rq->cpu_load[] statistics
5406 * @this_rq: The rq to update statistics for
5407 * @this_load: The current load
5408 * @pending_updates: The number of missed updates
5409 *
5410 * Update rq->cpu_load[] statistics. This function is usually called every
5411 * scheduler tick (TICK_NSEC).
5412 *
5413 * This function computes a decaying average:
5414 *
5415 * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load
5416 *
5417 * Because of NOHZ it might not get called on every tick which gives need for
5418 * the @pending_updates argument.
5419 *
5420 * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1
5421 * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load
5422 * = A * (A * load[i]_n-2 + B) + B
5423 * = A * (A * (A * load[i]_n-3 + B) + B) + B
5424 * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B
5425 * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B
5426 * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B
5427 * = (1 - 1/2^i)^n * (load[i]_0 - load) + load
5428 *
5429 * In the above we've assumed load_n := load, which is true for NOHZ_FULL as
5430 * any change in load would have resulted in the tick being turned back on.
5431 *
5432 * For regular NOHZ, this reduces to:
5433 *
5434 * load[i]_n = (1 - 1/2^i)^n * load[i]_0
5435 *
5436 * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra
5437 * term.
5438 */
5439static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5440 unsigned long pending_updates)
5441{
5442 unsigned long __maybe_unused tickless_load = this_rq->cpu_load[0];
5443 int i, scale;
5444
5445 this_rq->nr_load_updates++;
5446
5447 /* Update our load: */
5448 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5449 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5450 unsigned long old_load, new_load;
5451
5452 /* scale is effectively 1 << i now, and >> i divides by scale */
5453
5454 old_load = this_rq->cpu_load[i];
5455#ifdef CONFIG_NO_HZ_COMMON
5456 old_load = decay_load_missed(old_load, pending_updates - 1, i);
5457 if (tickless_load) {
5458 old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
5459 /*
5460 * old_load can never be a negative value because a
5461 * decayed tickless_load cannot be greater than the
5462 * original tickless_load.
5463 */
5464 old_load += tickless_load;
5465 }
5466#endif
5467 new_load = this_load;
5468 /*
5469 * Round up the averaging division if load is increasing. This
5470 * prevents us from getting stuck on 9 if the load is 10, for
5471 * example.
5472 */
5473 if (new_load > old_load)
5474 new_load += scale - 1;
5475
5476 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5477 }
5478}
5479
5480/* Used instead of source_load when we know the type == 0 */
5481static unsigned long weighted_cpuload(struct rq *rq)
5482{ 5370{
5483 return cfs_rq_runnable_load_avg(&rq->cfs); 5371 return cfs_rq_runnable_load_avg(&rq->cfs);
5484} 5372}
5485 5373
5486#ifdef CONFIG_NO_HZ_COMMON
5487/*
5488 * There is no sane way to deal with nohz on smp when using jiffies because the
5489 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5490 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5491 *
5492 * Therefore we need to avoid the delta approach from the regular tick when
5493 * possible since that would seriously skew the load calculation. This is why we
5494 * use cpu_load_update_periodic() for CPUs out of nohz. However we'll rely on
5495 * jiffies deltas for updates happening while in nohz mode (idle ticks, idle
5496 * loop exit, nohz_idle_balance, nohz full exit...)
5497 *
5498 * This means we might still be one tick off for nohz periods.
5499 */
5500
5501static void cpu_load_update_nohz(struct rq *this_rq,
5502 unsigned long curr_jiffies,
5503 unsigned long load)
5504{
5505 unsigned long pending_updates;
5506
5507 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5508 if (pending_updates) {
5509 this_rq->last_load_update_tick = curr_jiffies;
5510 /*
5511 * In the regular NOHZ case, we were idle, this means load 0.
5512 * In the NOHZ_FULL case, we were non-idle, we should consider
5513 * its weighted load.
5514 */
5515 cpu_load_update(this_rq, load, pending_updates);
5516 }
5517}
5518
5519/*
5520 * Called from nohz_idle_balance() to update the load ratings before doing the
5521 * idle balance.
5522 */
5523static void cpu_load_update_idle(struct rq *this_rq)
5524{
5525 /*
5526 * bail if there's load or we're actually up-to-date.
5527 */
5528 if (weighted_cpuload(this_rq))
5529 return;
5530
5531 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
5532}
5533
5534/*
5535 * Record CPU load on nohz entry so we know the tickless load to account
5536 * on nohz exit. cpu_load[0] happens then to be updated more frequently
5537 * than other cpu_load[idx] but it should be fine as cpu_load readers
5538 * shouldn't rely into synchronized cpu_load[*] updates.
5539 */
5540void cpu_load_update_nohz_start(void)
5541{
5542 struct rq *this_rq = this_rq();
5543
5544 /*
5545 * This is all lockless but should be fine. If weighted_cpuload changes
5546 * concurrently we'll exit nohz. And cpu_load write can race with
5547 * cpu_load_update_idle() but both updater would be writing the same.
5548 */
5549 this_rq->cpu_load[0] = weighted_cpuload(this_rq);
5550}
5551
5552/*
5553 * Account the tickless load in the end of a nohz frame.
5554 */
5555void cpu_load_update_nohz_stop(void)
5556{
5557 unsigned long curr_jiffies = READ_ONCE(jiffies);
5558 struct rq *this_rq = this_rq();
5559 unsigned long load;
5560 struct rq_flags rf;
5561
5562 if (curr_jiffies == this_rq->last_load_update_tick)
5563 return;
5564
5565 load = weighted_cpuload(this_rq);
5566 rq_lock(this_rq, &rf);
5567 update_rq_clock(this_rq);
5568 cpu_load_update_nohz(this_rq, curr_jiffies, load);
5569 rq_unlock(this_rq, &rf);
5570}
5571#else /* !CONFIG_NO_HZ_COMMON */
5572static inline void cpu_load_update_nohz(struct rq *this_rq,
5573 unsigned long curr_jiffies,
5574 unsigned long load) { }
5575#endif /* CONFIG_NO_HZ_COMMON */
5576
5577static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
5578{
5579#ifdef CONFIG_NO_HZ_COMMON
5580 /* See the mess around cpu_load_update_nohz(). */
5581 this_rq->last_load_update_tick = READ_ONCE(jiffies);
5582#endif
5583 cpu_load_update(this_rq, load, 1);
5584}
5585
5586/*
5587 * Called from scheduler_tick()
5588 */
5589void cpu_load_update_active(struct rq *this_rq)
5590{
5591 unsigned long load = weighted_cpuload(this_rq);
5592
5593 if (tick_nohz_tick_stopped())
5594 cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
5595 else
5596 cpu_load_update_periodic(this_rq, load);
5597}
5598
5599/*
5600 * Return a low guess at the load of a migration-source CPU weighted
5601 * according to the scheduling class and "nice" value.
5602 *
5603 * We want to under-estimate the load of migration sources, to
5604 * balance conservatively.
5605 */
5606static unsigned long source_load(int cpu, int type)
5607{
5608 struct rq *rq = cpu_rq(cpu);
5609 unsigned long total = weighted_cpuload(rq);
5610
5611 if (type == 0 || !sched_feat(LB_BIAS))
5612 return total;
5613
5614 return min(rq->cpu_load[type-1], total);
5615}
5616
5617/*
5618 * Return a high guess at the load of a migration-target CPU weighted
5619 * according to the scheduling class and "nice" value.
5620 */
5621static unsigned long target_load(int cpu, int type)
5622{
5623 struct rq *rq = cpu_rq(cpu);
5624 unsigned long total = weighted_cpuload(rq);
5625
5626 if (type == 0 || !sched_feat(LB_BIAS))
5627 return total;
5628
5629 return max(rq->cpu_load[type-1], total);
5630}
5631
5632static unsigned long capacity_of(int cpu) 5374static unsigned long capacity_of(int cpu)
5633{ 5375{
5634 return cpu_rq(cpu)->cpu_capacity; 5376 return cpu_rq(cpu)->cpu_capacity;
@@ -5638,7 +5380,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
5638{ 5380{
5639 struct rq *rq = cpu_rq(cpu); 5381 struct rq *rq = cpu_rq(cpu);
5640 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); 5382 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5641 unsigned long load_avg = weighted_cpuload(rq); 5383 unsigned long load_avg = cpu_runnable_load(rq);
5642 5384
5643 if (nr_running) 5385 if (nr_running)
5644 return load_avg / nr_running; 5386 return load_avg / nr_running;
@@ -5736,7 +5478,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5736 s64 this_eff_load, prev_eff_load; 5478 s64 this_eff_load, prev_eff_load;
5737 unsigned long task_load; 5479 unsigned long task_load;
5738 5480
5739 this_eff_load = target_load(this_cpu, sd->wake_idx); 5481 this_eff_load = cpu_runnable_load(cpu_rq(this_cpu));
5740 5482
5741 if (sync) { 5483 if (sync) {
5742 unsigned long current_load = task_h_load(current); 5484 unsigned long current_load = task_h_load(current);
@@ -5754,7 +5496,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5754 this_eff_load *= 100; 5496 this_eff_load *= 100;
5755 this_eff_load *= capacity_of(prev_cpu); 5497 this_eff_load *= capacity_of(prev_cpu);
5756 5498
5757 prev_eff_load = source_load(prev_cpu, sd->wake_idx); 5499 prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu));
5758 prev_eff_load -= task_load; 5500 prev_eff_load -= task_load;
5759 if (sched_feat(WA_BIAS)) 5501 if (sched_feat(WA_BIAS))
5760 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5502 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -5815,14 +5557,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5815 unsigned long this_runnable_load = ULONG_MAX; 5557 unsigned long this_runnable_load = ULONG_MAX;
5816 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; 5558 unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
5817 unsigned long most_spare = 0, this_spare = 0; 5559 unsigned long most_spare = 0, this_spare = 0;
5818 int load_idx = sd->forkexec_idx;
5819 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; 5560 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5820 unsigned long imbalance = scale_load_down(NICE_0_LOAD) * 5561 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5821 (sd->imbalance_pct-100) / 100; 5562 (sd->imbalance_pct-100) / 100;
5822 5563
5823 if (sd_flag & SD_BALANCE_WAKE)
5824 load_idx = sd->wake_idx;
5825
5826 do { 5564 do {
5827 unsigned long load, avg_load, runnable_load; 5565 unsigned long load, avg_load, runnable_load;
5828 unsigned long spare_cap, max_spare_cap; 5566 unsigned long spare_cap, max_spare_cap;
@@ -5831,7 +5569,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5831 5569
5832 /* Skip over this group if it has no CPUs allowed */ 5570 /* Skip over this group if it has no CPUs allowed */
5833 if (!cpumask_intersects(sched_group_span(group), 5571 if (!cpumask_intersects(sched_group_span(group),
5834 &p->cpus_allowed)) 5572 p->cpus_ptr))
5835 continue; 5573 continue;
5836 5574
5837 local_group = cpumask_test_cpu(this_cpu, 5575 local_group = cpumask_test_cpu(this_cpu,
@@ -5846,12 +5584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5846 max_spare_cap = 0; 5584 max_spare_cap = 0;
5847 5585
5848 for_each_cpu(i, sched_group_span(group)) { 5586 for_each_cpu(i, sched_group_span(group)) {
5849 /* Bias balancing toward CPUs of our domain */ 5587 load = cpu_runnable_load(cpu_rq(i));
5850 if (local_group)
5851 load = source_load(i, load_idx);
5852 else
5853 load = target_load(i, load_idx);
5854
5855 runnable_load += load; 5588 runnable_load += load;
5856 5589
5857 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); 5590 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
@@ -5963,7 +5696,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5963 return cpumask_first(sched_group_span(group)); 5696 return cpumask_first(sched_group_span(group));
5964 5697
5965 /* Traverse only the allowed CPUs */ 5698 /* Traverse only the allowed CPUs */
5966 for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { 5699 for_each_cpu_and(i, sched_group_span(group), p->cpus_ptr) {
5967 if (available_idle_cpu(i)) { 5700 if (available_idle_cpu(i)) {
5968 struct rq *rq = cpu_rq(i); 5701 struct rq *rq = cpu_rq(i);
5969 struct cpuidle_state *idle = idle_get_state(rq); 5702 struct cpuidle_state *idle = idle_get_state(rq);
@@ -5987,7 +5720,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5987 shallowest_idle_cpu = i; 5720 shallowest_idle_cpu = i;
5988 } 5721 }
5989 } else if (shallowest_idle_cpu == -1) { 5722 } else if (shallowest_idle_cpu == -1) {
5990 load = weighted_cpuload(cpu_rq(i)); 5723 load = cpu_runnable_load(cpu_rq(i));
5991 if (load < min_load) { 5724 if (load < min_load) {
5992 min_load = load; 5725 min_load = load;
5993 least_loaded_cpu = i; 5726 least_loaded_cpu = i;
@@ -6003,7 +5736,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6003{ 5736{
6004 int new_cpu = cpu; 5737 int new_cpu = cpu;
6005 5738
6006 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) 5739 if (!cpumask_intersects(sched_domain_span(sd), p->cpus_ptr))
6007 return prev_cpu; 5740 return prev_cpu;
6008 5741
6009 /* 5742 /*
@@ -6120,7 +5853,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
6120 if (!test_idle_cores(target, false)) 5853 if (!test_idle_cores(target, false))
6121 return -1; 5854 return -1;
6122 5855
6123 cpumask_and(cpus, sched_domain_span(sd), &p->cpus_allowed); 5856 cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
6124 5857
6125 for_each_cpu_wrap(core, cpus, target) { 5858 for_each_cpu_wrap(core, cpus, target) {
6126 bool idle = true; 5859 bool idle = true;
@@ -6154,7 +5887,7 @@ static int select_idle_smt(struct task_struct *p, int target)
6154 return -1; 5887 return -1;
6155 5888
6156 for_each_cpu(cpu, cpu_smt_mask(target)) { 5889 for_each_cpu(cpu, cpu_smt_mask(target)) {
6157 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 5890 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6158 continue; 5891 continue;
6159 if (available_idle_cpu(cpu)) 5892 if (available_idle_cpu(cpu))
6160 return cpu; 5893 return cpu;
@@ -6218,7 +5951,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
6218 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { 5951 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
6219 if (!--nr) 5952 if (!--nr)
6220 return -1; 5953 return -1;
6221 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 5954 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6222 continue; 5955 continue;
6223 if (available_idle_cpu(cpu)) 5956 if (available_idle_cpu(cpu))
6224 break; 5957 break;
@@ -6255,7 +5988,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6255 recent_used_cpu != target && 5988 recent_used_cpu != target &&
6256 cpus_share_cache(recent_used_cpu, target) && 5989 cpus_share_cache(recent_used_cpu, target) &&
6257 available_idle_cpu(recent_used_cpu) && 5990 available_idle_cpu(recent_used_cpu) &&
6258 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 5991 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6259 /* 5992 /*
6260 * Replace recent_used_cpu with prev as it is a potential 5993 * Replace recent_used_cpu with prev as it is a potential
6261 * candidate for the next wake: 5994 * candidate for the next wake:
@@ -6499,11 +6232,21 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6499static long 6232static long
6500compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) 6233compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6501{ 6234{
6502 long util, max_util, sum_util, energy = 0; 6235 unsigned int max_util, util_cfs, cpu_util, cpu_cap;
6236 unsigned long sum_util, energy = 0;
6237 struct task_struct *tsk;
6503 int cpu; 6238 int cpu;
6504 6239
6505 for (; pd; pd = pd->next) { 6240 for (; pd; pd = pd->next) {
6241 struct cpumask *pd_mask = perf_domain_span(pd);
6242
6243 /*
6244 * The energy model mandates all the CPUs of a performance
6245 * domain have the same capacity.
6246 */
6247 cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6506 max_util = sum_util = 0; 6248 max_util = sum_util = 0;
6249
6507 /* 6250 /*
6508 * The capacity state of CPUs of the current rd can be driven by 6251 * The capacity state of CPUs of the current rd can be driven by
6509 * CPUs of another rd if they belong to the same performance 6252 * CPUs of another rd if they belong to the same performance
@@ -6514,11 +6257,29 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6514 * it will not appear in its pd list and will not be accounted 6257 * it will not appear in its pd list and will not be accounted
6515 * by compute_energy(). 6258 * by compute_energy().
6516 */ 6259 */
6517 for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) { 6260 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6518 util = cpu_util_next(cpu, p, dst_cpu); 6261 util_cfs = cpu_util_next(cpu, p, dst_cpu);
6519 util = schedutil_energy_util(cpu, util); 6262
6520 max_util = max(util, max_util); 6263 /*
6521 sum_util += util; 6264 * Busy time computation: utilization clamping is not
6265 * required since the ratio (sum_util / cpu_capacity)
6266 * is already enough to scale the EM reported power
6267 * consumption at the (eventually clamped) cpu_capacity.
6268 */
6269 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6270 ENERGY_UTIL, NULL);
6271
6272 /*
6273 * Performance domain frequency: utilization clamping
6274 * must be considered since it affects the selection
6275 * of the performance domain frequency.
6276 * NOTE: in case RT tasks are running, by default the
6277 * FREQUENCY_UTIL's utilization can be max OPP.
6278 */
6279 tsk = cpu == dst_cpu ? p : NULL;
6280 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6281 FREQUENCY_UTIL, tsk);
6282 max_util = max(max_util, cpu_util);
6522 } 6283 }
6523 6284
6524 energy += em_pd_energy(pd->em_pd, max_util, sum_util); 6285 energy += em_pd_energy(pd->em_pd, max_util, sum_util);
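
A stand-alone sketch of the reworked compute_energy() loop, with made-up utilization and clamp values standing in for schedutil_cpu_util() and a toy cost function standing in for em_pd_energy() (the real energy model is table-driven):

#include <stdio.h>

#define PD_CPUS 4

/* Made-up per-CPU CFS utilization and clamps for one performance domain. */
static const unsigned long util_cfs[PD_CPUS]   = {  100,  300,   50,    0 };
static const unsigned long uclamp_min[PD_CPUS] = {    0,  512,    0,    0 };
static const unsigned long uclamp_max[PD_CPUS] = { 1024, 1024, 1024, 1024 };
static const unsigned long cpu_cap = 1024;      /* same across the domain */

static unsigned long clampv(unsigned long v, unsigned long lo, unsigned long hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
        unsigned long max_util = 0, sum_util = 0;
        int cpu;

        for (cpu = 0; cpu < PD_CPUS; cpu++) {
                /* ENERGY_UTIL: busy time; the sum_util / cpu_cap ratio already
                 * scales the reported power, so no clamping is needed. */
                sum_util += util_cfs[cpu];

                /* FREQUENCY_UTIL: clamped, since it drives the domain's OPP. */
                unsigned long f = clampv(util_cfs[cpu],
                                         uclamp_min[cpu], uclamp_max[cpu]);
                if (f > max_util)
                        max_util = f;
        }

        /* Toy cost model: power grows with the selected OPP (max_util),
         * scaled by the fraction of time the domain is busy. */
        printf("sum_util=%lu max_util=%lu energy~=%lu\n",
               sum_util, max_util, max_util * sum_util / cpu_cap);
        return 0;
}
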
@@ -6601,7 +6362,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6601 int max_spare_cap_cpu = -1; 6362 int max_spare_cap_cpu = -1;
6602 6363
6603 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { 6364 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6604 if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) 6365 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6605 continue; 6366 continue;
6606 6367
6607 /* Skip CPUs that will be overutilized. */ 6368 /* Skip CPUs that will be overutilized. */
@@ -6690,7 +6451,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6690 } 6451 }
6691 6452
6692 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) && 6453 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
6693 cpumask_test_cpu(cpu, &p->cpus_allowed); 6454 cpumask_test_cpu(cpu, p->cpus_ptr);
6694 } 6455 }
6695 6456
6696 rcu_read_lock(); 6457 rcu_read_lock();
@@ -7446,14 +7207,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7446 /* 7207 /*
7447 * We do not migrate tasks that are: 7208 * We do not migrate tasks that are:
7448 * 1) throttled_lb_pair, or 7209 * 1) throttled_lb_pair, or
7449 * 2) cannot be migrated to this CPU due to cpus_allowed, or 7210 * 2) cannot be migrated to this CPU due to cpus_ptr, or
7450 * 3) running (obviously), or 7211 * 3) running (obviously), or
7451 * 4) are cache-hot on their current CPU. 7212 * 4) are cache-hot on their current CPU.
7452 */ 7213 */
7453 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) 7214 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7454 return 0; 7215 return 0;
7455 7216
7456 if (!cpumask_test_cpu(env->dst_cpu, &p->cpus_allowed)) { 7217 if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
7457 int cpu; 7218 int cpu;
7458 7219
7459 schedstat_inc(p->se.statistics.nr_failed_migrations_affine); 7220 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
@@ -7473,7 +7234,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7473 7234
7474 /* Prevent to re-select dst_cpu via env's CPUs: */ 7235 /* Prevent to re-select dst_cpu via env's CPUs: */
7475 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7236 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7476 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7237 if (cpumask_test_cpu(cpu, p->cpus_ptr)) {
7477 env->flags |= LBF_DST_PINNED; 7238 env->flags |= LBF_DST_PINNED;
7478 env->new_dst_cpu = cpu; 7239 env->new_dst_cpu = cpu;
7479 break; 7240 break;
@@ -7559,7 +7320,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
7559static const unsigned int sched_nr_migrate_break = 32; 7320static const unsigned int sched_nr_migrate_break = 32;
7560 7321
7561/* 7322/*
7562 * detach_tasks() -- tries to detach up to imbalance weighted load from 7323 * detach_tasks() -- tries to detach up to imbalance runnable load from
7563 * busiest_rq, as part of a balancing operation within domain "sd". 7324 * busiest_rq, as part of a balancing operation within domain "sd".
7564 * 7325 *
7565 * Returns number of detached tasks if successful and 0 otherwise. 7326 * Returns number of detached tasks if successful and 0 otherwise.
@@ -7627,7 +7388,7 @@ static int detach_tasks(struct lb_env *env)
7627 7388
7628 /* 7389 /*
7629 * We only want to steal up to the prescribed amount of 7390 * We only want to steal up to the prescribed amount of
7630 * weighted load. 7391 * runnable load.
7631 */ 7392 */
7632 if (env->imbalance <= 0) 7393 if (env->imbalance <= 0)
7633 break; 7394 break;
@@ -7696,6 +7457,7 @@ static void attach_tasks(struct lb_env *env)
7696 rq_unlock(env->dst_rq, &rf); 7457 rq_unlock(env->dst_rq, &rf);
7697} 7458}
7698 7459
7460#ifdef CONFIG_NO_HZ_COMMON
7699static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) 7461static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7700{ 7462{
7701 if (cfs_rq->avg.load_avg) 7463 if (cfs_rq->avg.load_avg)
@@ -7723,6 +7485,19 @@ static inline bool others_have_blocked(struct rq *rq)
7723 return false; 7485 return false;
7724} 7486}
7725 7487
7488static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
7489{
7490 rq->last_blocked_load_update_tick = jiffies;
7491
7492 if (!has_blocked)
7493 rq->has_blocked_load = 0;
7494}
7495#else
7496static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
7497static inline bool others_have_blocked(struct rq *rq) { return false; }
7498static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
7499#endif
7500
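
The new update_blocked_load_status() helper plus the !CONFIG_NO_HZ_COMMON stubs above let both update_blocked_averages() variants further down drop their open-coded #ifdef blocks. The pattern in miniature, as a user-space sketch with time(NULL) standing in for jiffies:

#include <stdio.h>
#include <time.h>

#define HAVE_NOHZ 1             /* flip to 0 to mimic !CONFIG_NO_HZ_COMMON */

#if HAVE_NOHZ
static time_t last_blocked_load_update_tick;
static int has_blocked_load = 1;

static inline void update_blocked_load_status(int has_blocked)
{
        last_blocked_load_update_tick = time(NULL);   /* the jiffies stamp */
        if (!has_blocked)
                has_blocked_load = 0;
}
#else
/* No-op stub: callers need no #ifdef of their own. */
static inline void update_blocked_load_status(int has_blocked) { (void)has_blocked; }
#endif

int main(void)
{
        update_blocked_load_status(0);
#if HAVE_NOHZ
        printf("has_blocked_load=%d\n", has_blocked_load);
#endif
        return 0;
}
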
7726#ifdef CONFIG_FAIR_GROUP_SCHED 7501#ifdef CONFIG_FAIR_GROUP_SCHED
7727 7502
7728static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 7503static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7788,11 +7563,7 @@ static void update_blocked_averages(int cpu)
7788 if (others_have_blocked(rq)) 7563 if (others_have_blocked(rq))
7789 done = false; 7564 done = false;
7790 7565
7791#ifdef CONFIG_NO_HZ_COMMON 7566 update_blocked_load_status(rq, !done);
7792 rq->last_blocked_load_update_tick = jiffies;
7793 if (done)
7794 rq->has_blocked_load = 0;
7795#endif
7796 rq_unlock_irqrestore(rq, &rf); 7567 rq_unlock_irqrestore(rq, &rf);
7797} 7568}
7798 7569
@@ -7858,11 +7629,7 @@ static inline void update_blocked_averages(int cpu)
7858 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class); 7629 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
7859 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class); 7630 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
7860 update_irq_load_avg(rq, 0); 7631 update_irq_load_avg(rq, 0);
7861#ifdef CONFIG_NO_HZ_COMMON 7632 update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
7862 rq->last_blocked_load_update_tick = jiffies;
7863 if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
7864 rq->has_blocked_load = 0;
7865#endif
7866 rq_unlock_irqrestore(rq, &rf); 7633 rq_unlock_irqrestore(rq, &rf);
7867} 7634}
7868 7635
@@ -7880,7 +7647,6 @@ static unsigned long task_h_load(struct task_struct *p)
7880struct sg_lb_stats { 7647struct sg_lb_stats {
7881 unsigned long avg_load; /*Avg load across the CPUs of the group */ 7648 unsigned long avg_load; /*Avg load across the CPUs of the group */
7882 unsigned long group_load; /* Total load over the CPUs of the group */ 7649 unsigned long group_load; /* Total load over the CPUs of the group */
7883 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
7884 unsigned long load_per_task; 7650 unsigned long load_per_task;
7885 unsigned long group_capacity; 7651 unsigned long group_capacity;
7886 unsigned long group_util; /* Total utilization of the group */ 7652 unsigned long group_util; /* Total utilization of the group */
@@ -7934,38 +7700,10 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7934 }; 7700 };
7935} 7701}
7936 7702
7937/**
7938 * get_sd_load_idx - Obtain the load index for a given sched domain.
7939 * @sd: The sched_domain whose load_idx is to be obtained.
7940 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
7941 *
7942 * Return: The load index.
7943 */
7944static inline int get_sd_load_idx(struct sched_domain *sd,
7945 enum cpu_idle_type idle)
7946{
7947 int load_idx;
7948
7949 switch (idle) {
7950 case CPU_NOT_IDLE:
7951 load_idx = sd->busy_idx;
7952 break;
7953
7954 case CPU_NEWLY_IDLE:
7955 load_idx = sd->newidle_idx;
7956 break;
7957 default:
7958 load_idx = sd->idle_idx;
7959 break;
7960 }
7961
7962 return load_idx;
7963}
7964
7965static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) 7703static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
7966{ 7704{
7967 struct rq *rq = cpu_rq(cpu); 7705 struct rq *rq = cpu_rq(cpu);
7968 unsigned long max = arch_scale_cpu_capacity(sd, cpu); 7706 unsigned long max = arch_scale_cpu_capacity(cpu);
7969 unsigned long used, free; 7707 unsigned long used, free;
7970 unsigned long irq; 7708 unsigned long irq;
7971 7709
@@ -7990,7 +7728,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7990 unsigned long capacity = scale_rt_capacity(sd, cpu); 7728 unsigned long capacity = scale_rt_capacity(sd, cpu);
7991 struct sched_group *sdg = sd->groups; 7729 struct sched_group *sdg = sd->groups;
7992 7730
7993 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu); 7731 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
7994 7732
7995 if (!capacity) 7733 if (!capacity)
7996 capacity = 1; 7734 capacity = 1;
@@ -8100,7 +7838,7 @@ static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
8100 7838
8101/* 7839/*
8102 * Group imbalance indicates (and tries to solve) the problem where balancing 7840 * Group imbalance indicates (and tries to solve) the problem where balancing
8103 * groups is inadequate due to ->cpus_allowed constraints. 7841 * groups is inadequate due to ->cpus_ptr constraints.
8104 * 7842 *
8105 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a 7843 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
8106 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. 7844 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
@@ -8250,9 +7988,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8250 struct sg_lb_stats *sgs, 7988 struct sg_lb_stats *sgs,
8251 int *sg_status) 7989 int *sg_status)
8252{ 7990{
8253 int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
8254 int load_idx = get_sd_load_idx(env->sd, env->idle);
8255 unsigned long load;
8256 int i, nr_running; 7991 int i, nr_running;
8257 7992
8258 memset(sgs, 0, sizeof(*sgs)); 7993 memset(sgs, 0, sizeof(*sgs));
@@ -8263,13 +7998,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8263 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false)) 7998 if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
8264 env->flags |= LBF_NOHZ_AGAIN; 7999 env->flags |= LBF_NOHZ_AGAIN;
8265 8000
8266 /* Bias balancing toward CPUs of our domain: */ 8001 sgs->group_load += cpu_runnable_load(rq);
8267 if (local_group)
8268 load = target_load(i, load_idx);
8269 else
8270 load = source_load(i, load_idx);
8271
8272 sgs->group_load += load;
8273 sgs->group_util += cpu_util(i); 8002 sgs->group_util += cpu_util(i);
8274 sgs->sum_nr_running += rq->cfs.h_nr_running; 8003 sgs->sum_nr_running += rq->cfs.h_nr_running;
8275 8004
@@ -8284,7 +8013,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8284 sgs->nr_numa_running += rq->nr_numa_running; 8013 sgs->nr_numa_running += rq->nr_numa_running;
8285 sgs->nr_preferred_running += rq->nr_preferred_running; 8014 sgs->nr_preferred_running += rq->nr_preferred_running;
8286#endif 8015#endif
8287 sgs->sum_weighted_load += weighted_cpuload(rq);
8288 /* 8016 /*
8289 * No need to call idle_cpu() if nr_running is not 0 8017 * No need to call idle_cpu() if nr_running is not 0
8290 */ 8018 */
@@ -8303,7 +8031,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
8303 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity; 8031 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8304 8032
8305 if (sgs->sum_nr_running) 8033 if (sgs->sum_nr_running)
8306 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 8034 sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
8307 8035
8308 sgs->group_weight = group->group_weight; 8036 sgs->group_weight = group->group_weight;
8309 8037
@@ -8517,8 +8245,12 @@ next_group:
8517 8245
8518 /* Update over-utilization (tipping point, U >= 0) indicator */ 8246 /* Update over-utilization (tipping point, U >= 0) indicator */
8519 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED); 8247 WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
8248 trace_sched_overutilized_tp(rd, sg_status & SG_OVERUTILIZED);
8520 } else if (sg_status & SG_OVERUTILIZED) { 8249 } else if (sg_status & SG_OVERUTILIZED) {
8521 WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED); 8250 struct root_domain *rd = env->dst_rq->rd;
8251
8252 WRITE_ONCE(rd->overutilized, SG_OVERUTILIZED);
8253 trace_sched_overutilized_tp(rd, SG_OVERUTILIZED);
8522 } 8254 }
8523} 8255}
8524 8256
@@ -8724,7 +8456,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8724 * find_busiest_group - Returns the busiest group within the sched_domain 8456 * find_busiest_group - Returns the busiest group within the sched_domain
8725 * if there is an imbalance. 8457 * if there is an imbalance.
8726 * 8458 *
8727 * Also calculates the amount of weighted load which should be moved 8459 * Also calculates the amount of runnable load which should be moved
8728 * to restore balance. 8460 * to restore balance.
8729 * 8461 *
8730 * @env: The load balancing environment. 8462 * @env: The load balancing environment.
@@ -8769,7 +8501,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8769 /* 8501 /*
8770 * If the busiest group is imbalanced the below checks don't 8502 * If the busiest group is imbalanced the below checks don't
8771 * work because they assume all things are equal, which typically 8503 * work because they assume all things are equal, which typically
8772 * isn't true due to cpus_allowed constraints and the like. 8504 * isn't true due to cpus_ptr constraints and the like.
8773 */ 8505 */
8774 if (busiest->group_type == group_imbalanced) 8506 if (busiest->group_type == group_imbalanced)
8775 goto force_balance; 8507 goto force_balance;
@@ -8843,7 +8575,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8843 int i; 8575 int i;
8844 8576
8845 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 8577 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
8846 unsigned long capacity, wl; 8578 unsigned long capacity, load;
8847 enum fbq_type rt; 8579 enum fbq_type rt;
8848 8580
8849 rq = cpu_rq(i); 8581 rq = cpu_rq(i);
@@ -8897,30 +8629,30 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8897 rq->nr_running == 1) 8629 rq->nr_running == 1)
8898 continue; 8630 continue;
8899 8631
8900 wl = weighted_cpuload(rq); 8632 load = cpu_runnable_load(rq);
8901 8633
8902 /* 8634 /*
8903 * When comparing with imbalance, use weighted_cpuload() 8635 * When comparing with imbalance, use cpu_runnable_load()
8904 * which is not scaled with the CPU capacity. 8636 * which is not scaled with the CPU capacity.
8905 */ 8637 */
8906 8638
8907 if (rq->nr_running == 1 && wl > env->imbalance && 8639 if (rq->nr_running == 1 && load > env->imbalance &&
8908 !check_cpu_capacity(rq, env->sd)) 8640 !check_cpu_capacity(rq, env->sd))
8909 continue; 8641 continue;
8910 8642
8911 /* 8643 /*
8912 * For the load comparisons with the other CPU's, consider 8644 * For the load comparisons with the other CPU's, consider
8913 * the weighted_cpuload() scaled with the CPU capacity, so 8645 * the cpu_runnable_load() scaled with the CPU capacity, so
8914 * that the load can be moved away from the CPU that is 8646 * that the load can be moved away from the CPU that is
8915 * potentially running at a lower capacity. 8647 * potentially running at a lower capacity.
8916 * 8648 *
8917 * Thus we're looking for max(wl_i / capacity_i), crosswise 8649 * Thus we're looking for max(load_i / capacity_i), crosswise
8918 * multiplication to rid ourselves of the division works out 8650 * multiplication to rid ourselves of the division works out
8919 * to: wl_i * capacity_j > wl_j * capacity_i; where j is 8651 * to: load_i * capacity_j > load_j * capacity_i; where j is
8920 * our previous maximum. 8652 * our previous maximum.
8921 */ 8653 */
8922 if (wl * busiest_capacity > busiest_load * capacity) { 8654 if (load * busiest_capacity > busiest_load * capacity) {
8923 busiest_load = wl; 8655 busiest_load = load;
8924 busiest_capacity = capacity; 8656 busiest_capacity = capacity;
8925 busiest = rq; 8657 busiest = rq;
8926 } 8658 }
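
The crosswise-multiplication comment above deserves a worked example: with integer arithmetic, dividing load by capacity truncates and can no longer rank CPUs, while the cross-multiplied form keeps full precision (numbers made up):

#include <stdio.h>

int main(void)
{
        unsigned long load_i = 300, cap_i = 1024;   /* candidate CPU */
        unsigned long load_j = 290, cap_j = 1000;   /* current busiest */

        /* Integer division truncates both ratios to 0 and cannot rank them. */
        printf("load_i/cap_i=%lu  load_j/cap_j=%lu\n",
               load_i / cap_i, load_j / cap_j);

        /* Cross-multiplication keeps full precision:
         * load_i/cap_i > load_j/cap_j  <=>  load_i*cap_j > load_j*cap_i */
        if (load_i * cap_j > load_j * cap_i)
                printf("CPU i becomes busiest (%lu > %lu)\n",
                       load_i * cap_j, load_j * cap_i);
        else
                printf("CPU j stays busiest\n");
        return 0;
}
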
@@ -9211,7 +8943,7 @@ more_balance:
9211 * if the curr task on busiest CPU can't be 8943 * if the curr task on busiest CPU can't be
9212 * moved to this_cpu: 8944 * moved to this_cpu:
9213 */ 8945 */
9214 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 8946 if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) {
9215 raw_spin_unlock_irqrestore(&busiest->lock, 8947 raw_spin_unlock_irqrestore(&busiest->lock,
9216 flags); 8948 flags);
9217 env.flags |= LBF_ALL_PINNED; 8949 env.flags |= LBF_ALL_PINNED;
@@ -9880,7 +9612,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
9880 9612
9881 rq_lock_irqsave(rq, &rf); 9613 rq_lock_irqsave(rq, &rf);
9882 update_rq_clock(rq); 9614 update_rq_clock(rq);
9883 cpu_load_update_idle(rq);
9884 rq_unlock_irqrestore(rq, &rf); 9615 rq_unlock_irqrestore(rq, &rf);
9885 9616
9886 if (flags & NOHZ_BALANCE_KICK) 9617 if (flags & NOHZ_BALANCE_KICK)
@@ -10691,6 +10422,10 @@ const struct sched_class fair_sched_class = {
10691#ifdef CONFIG_FAIR_GROUP_SCHED 10422#ifdef CONFIG_FAIR_GROUP_SCHED
10692 .task_change_group = task_change_group_fair, 10423 .task_change_group = task_change_group_fair,
10693#endif 10424#endif
10425
10426#ifdef CONFIG_UCLAMP_TASK
10427 .uclamp_enabled = 1,
10428#endif
10694}; 10429};
10695 10430
10696#ifdef CONFIG_SCHED_DEBUG 10431#ifdef CONFIG_SCHED_DEBUG
@@ -10738,3 +10473,83 @@ __init void init_sched_fair_class(void)
10738#endif /* SMP */ 10473#endif /* SMP */
10739 10474
10740} 10475}
10476
10477/*
10478 * Helper functions to facilitate extracting info from tracepoints.
10479 */
10480
10481const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
10482{
10483#ifdef CONFIG_SMP
10484 return cfs_rq ? &cfs_rq->avg : NULL;
10485#else
10486 return NULL;
10487#endif
10488}
10489EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
10490
10491char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
10492{
10493 if (!cfs_rq) {
10494 if (str)
10495 strlcpy(str, "(null)", len);
10496 else
10497 return NULL;
10498 }
10499
10500 cfs_rq_tg_path(cfs_rq, str, len);
10501 return str;
10502}
10503EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
10504
10505int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
10506{
10507 return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
10508}
10509EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
10510
10511const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
10512{
10513#ifdef CONFIG_SMP
10514 return rq ? &rq->avg_rt : NULL;
10515#else
10516 return NULL;
10517#endif
10518}
10519EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
10520
10521const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
10522{
10523#ifdef CONFIG_SMP
10524 return rq ? &rq->avg_dl : NULL;
10525#else
10526 return NULL;
10527#endif
10528}
10529EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
10530
10531const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
10532{
10533#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
10534 return rq ? &rq->avg_irq : NULL;
10535#else
10536 return NULL;
10537#endif
10538}
10539EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
10540
10541int sched_trace_rq_cpu(struct rq *rq)
10542{
10543 return rq ? cpu_of(rq) : -1;
10544}
10545EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
10546
10547const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
10548{
10549#ifdef CONFIG_SMP
10550 return rd ? rd->span : NULL;
10551#else
10552 return NULL;
10553#endif
10554}
10555EXPORT_SYMBOL_GPL(sched_trace_rd_span);
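
The EXPORT_SYMBOL_GPL() helpers above exist so that modules attaching to the new bare tracepoints can decode the otherwise opaque pointers. A hypothetical consumer sketch, assuming the register_trace_pelt_cfs_tp()/unregister_trace_pelt_cfs_tp() handlers that DECLARE_TRACE() generates for trace_pelt_cfs_tp() and the load_avg/util_avg fields of struct sched_avg:

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical out-of-tree sketch, not part of this series. */
#include <linux/module.h>
#include <trace/events/sched.h>

static void probe_pelt_cfs(void *data, struct cfs_rq *cfs_rq)
{
        char path[64];
        const struct sched_avg *avg = sched_trace_cfs_rq_avg(cfs_rq);

        if (!avg)
                return;

        pr_info("cpu=%d path=%s load_avg=%lu util_avg=%lu\n",
                sched_trace_cfs_rq_cpu(cfs_rq),
                sched_trace_cfs_rq_path(cfs_rq, path, sizeof(path)),
                avg->load_avg, avg->util_avg);
}

static int __init pelt_probe_init(void)
{
        return register_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
}

static void __exit pelt_probe_exit(void)
{
        unregister_trace_pelt_cfs_tp(probe_pelt_cfs, NULL);
        tracepoint_synchronize_unregister();
}

module_init(pelt_probe_init);
module_exit(pelt_probe_exit);
MODULE_LICENSE("GPL");
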
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 858589b83377..2410db5e9a35 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -39,7 +39,6 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
39 39
40SCHED_FEAT(HRTICK, false) 40SCHED_FEAT(HRTICK, false)
41SCHED_FEAT(DOUBLE_TICK, false) 41SCHED_FEAT(DOUBLE_TICK, false)
42SCHED_FEAT(LB_BIAS, false)
43 42
44/* 43/*
45 * Decrement CPU capacity based on time not spent running tasks 44 * Decrement CPU capacity based on time not spent running tasks
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
index befce29bd882..a96db50d40e0 100644
--- a/kernel/sched/pelt.c
+++ b/kernel/sched/pelt.c
@@ -28,6 +28,8 @@
28#include "sched.h" 28#include "sched.h"
29#include "pelt.h" 29#include "pelt.h"
30 30
31#include <trace/events/sched.h>
32
31/* 33/*
32 * Approximate: 34 * Approximate:
33 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) 35 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
@@ -265,6 +267,7 @@ int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
265{ 267{
266 if (___update_load_sum(now, &se->avg, 0, 0, 0)) { 268 if (___update_load_sum(now, &se->avg, 0, 0, 0)) {
267 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 269 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
270 trace_pelt_se_tp(se);
268 return 1; 271 return 1;
269 } 272 }
270 273
@@ -278,6 +281,7 @@ int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se
278 281
279 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se)); 282 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
280 cfs_se_util_change(&se->avg); 283 cfs_se_util_change(&se->avg);
284 trace_pelt_se_tp(se);
281 return 1; 285 return 1;
282 } 286 }
283 287
@@ -292,6 +296,7 @@ int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
292 cfs_rq->curr != NULL)) { 296 cfs_rq->curr != NULL)) {
293 297
294 ___update_load_avg(&cfs_rq->avg, 1, 1); 298 ___update_load_avg(&cfs_rq->avg, 1, 1);
299 trace_pelt_cfs_tp(cfs_rq);
295 return 1; 300 return 1;
296 } 301 }
297 302
@@ -317,6 +322,7 @@ int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
317 running)) { 322 running)) {
318 323
319 ___update_load_avg(&rq->avg_rt, 1, 1); 324 ___update_load_avg(&rq->avg_rt, 1, 1);
325 trace_pelt_rt_tp(rq);
320 return 1; 326 return 1;
321 } 327 }
322 328
@@ -340,6 +346,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
340 running)) { 346 running)) {
341 347
342 ___update_load_avg(&rq->avg_dl, 1, 1); 348 ___update_load_avg(&rq->avg_dl, 1, 1);
349 trace_pelt_dl_tp(rq);
343 return 1; 350 return 1;
344 } 351 }
345 352
@@ -366,7 +373,7 @@ int update_irq_load_avg(struct rq *rq, u64 running)
366 * reflect the real amount of computation 373 * reflect the real amount of computation
367 */ 374 */
368 running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq))); 375 running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
369 running = cap_scale(running, arch_scale_cpu_capacity(NULL, cpu_of(rq))); 376 running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
370 377
371 /* 378 /*
372 * We know the time that has been used by interrupt since last update 379 * We know the time that has been used by interrupt since last update
@@ -388,8 +395,10 @@ int update_irq_load_avg(struct rq *rq, u64 running)
388 1, 395 1,
389 1); 396 1);
390 397
391 if (ret) 398 if (ret) {
392 ___update_load_avg(&rq->avg_irq, 1, 1); 399 ___update_load_avg(&rq->avg_irq, 1, 1);
400 trace_pelt_irq_tp(rq);
401 }
393 402
394 return ret; 403 return ret;
395} 404}
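
The decay these tracepoints observe is the geometric series described at the top of this file ("val * y^n, where y^32 ~= 0.5"); a quick double-precision sanity check of that constant, outside the kernel:

#include <math.h>
#include <stdio.h>

/* build: cc decay.c -lm */
int main(void)
{
        /* PELT decay factor: chosen so a contribution halves every 32 periods. */
        double y = pow(0.5, 1.0 / 32.0);
        double val = 1024.0;            /* e.g. one fully-busy ~1 ms segment */
        int n;

        printf("y = %.6f, y^32 = %.6f\n", y, pow(y, 32));

        for (n = 0; n <= 96; n += 32)
                printf("after %2d periods: %.1f\n", n, val * pow(y, n));
        return 0;
}
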
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index 7489d5f56960..afff644da065 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -79,7 +79,7 @@ static inline void update_rq_clock_pelt(struct rq *rq, s64 delta)
79 * Scale the elapsed time to reflect the real amount of 79 * Scale the elapsed time to reflect the real amount of
80 * computation 80 * computation
81 */ 81 */
82 delta = cap_scale(delta, arch_scale_cpu_capacity(NULL, cpu_of(rq))); 82 delta = cap_scale(delta, arch_scale_cpu_capacity(cpu_of(rq)));
83 delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq))); 83 delta = cap_scale(delta, arch_scale_freq_capacity(cpu_of(rq)));
84 84
85 rq->clock_pelt += delta; 85 rq->clock_pelt += delta;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1e6b909dca36..a532558a5176 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1614,7 +1614,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1614static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1614static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1615{ 1615{
1616 if (!task_running(rq, p) && 1616 if (!task_running(rq, p) &&
1617 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1617 cpumask_test_cpu(cpu, p->cpus_ptr))
1618 return 1; 1618 return 1;
1619 1619
1620 return 0; 1620 return 0;
@@ -1751,7 +1751,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1751 * Also make sure that it wasn't scheduled on its rq. 1751 * Also make sure that it wasn't scheduled on its rq.
1752 */ 1752 */
1753 if (unlikely(task_rq(task) != rq || 1753 if (unlikely(task_rq(task) != rq ||
1754 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) || 1754 !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
1755 task_running(rq, task) || 1755 task_running(rq, task) ||
1756 !rt_task(task) || 1756 !rt_task(task) ||
1757 !task_on_rq_queued(task))) { 1757 !task_on_rq_queued(task))) {
@@ -2400,6 +2400,10 @@ const struct sched_class rt_sched_class = {
2400 .switched_to = switched_to_rt, 2400 .switched_to = switched_to_rt,
2401 2401
2402 .update_curr = update_curr_rt, 2402 .update_curr = update_curr_rt,
2403
2404#ifdef CONFIG_UCLAMP_TASK
2405 .uclamp_enabled = 1,
2406#endif
2403}; 2407};
2404 2408
2405#ifdef CONFIG_RT_GROUP_SCHED 2409#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
index a26473674fb7..c529706bed11 100644
--- a/kernel/sched/sched-pelt.h
+++ b/kernel/sched/sched-pelt.h
@@ -1,7 +1,7 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2/* Generated by Documentation/scheduler/sched-pelt; do not modify. */ 2/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
3 3
4static const u32 runnable_avg_yN_inv[] = { 4static const u32 runnable_avg_yN_inv[] __maybe_unused = {
5 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 5 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
6 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, 6 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
7 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, 7 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b52ed1ada0be..802b1f3405f2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -96,12 +96,6 @@ extern atomic_long_t calc_load_tasks;
96extern void calc_global_load_tick(struct rq *this_rq); 96extern void calc_global_load_tick(struct rq *this_rq);
97extern long calc_load_fold_active(struct rq *this_rq, long adjust); 97extern long calc_load_fold_active(struct rq *this_rq, long adjust);
98 98
99#ifdef CONFIG_SMP
100extern void cpu_load_update_active(struct rq *this_rq);
101#else
102static inline void cpu_load_update_active(struct rq *this_rq) { }
103#endif
104
105/* 99/*
106 * Helpers for converting nanosecond timing to jiffy resolution 100 * Helpers for converting nanosecond timing to jiffy resolution
107 */ 101 */
@@ -344,8 +338,10 @@ struct cfs_bandwidth {
344 u64 runtime_expires; 338 u64 runtime_expires;
345 int expires_seq; 339 int expires_seq;
346 340
347 short idle; 341 u8 idle;
348 short period_active; 342 u8 period_active;
343 u8 distribute_running;
344 u8 slack_started;
349 struct hrtimer period_timer; 345 struct hrtimer period_timer;
350 struct hrtimer slack_timer; 346 struct hrtimer slack_timer;
351 struct list_head throttled_cfs_rq; 347 struct list_head throttled_cfs_rq;
@@ -354,8 +350,6 @@ struct cfs_bandwidth {
354 int nr_periods; 350 int nr_periods;
355 int nr_throttled; 351 int nr_throttled;
356 u64 throttled_time; 352 u64 throttled_time;
357
358 bool distribute_running;
359#endif 353#endif
360}; 354};
361 355
@@ -797,6 +791,48 @@ extern void rto_push_irq_work_func(struct irq_work *work);
797#endif 791#endif
798#endif /* CONFIG_SMP */ 792#endif /* CONFIG_SMP */
799 793
794#ifdef CONFIG_UCLAMP_TASK
795/*
796 * struct uclamp_bucket - Utilization clamp bucket
797 * @value: utilization clamp value for tasks on this clamp bucket
798 * @tasks: number of RUNNABLE tasks on this clamp bucket
799 *
800 * Keep track of how many tasks are RUNNABLE for a given utilization
801 * clamp value.
802 */
803struct uclamp_bucket {
804 unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
805 unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
806};
807
808/*
809 * struct uclamp_rq - rq's utilization clamp
810 * @value: currently active clamp values for a rq
811 * @bucket: utilization clamp buckets affecting a rq
812 *
813 * Keep track of RUNNABLE tasks on a rq to aggregate their clamp values.
814 * A clamp value is affecting a rq when there is at least one task RUNNABLE
815 * (or actually running) with that value.
816 *
817 * There are up to UCLAMP_CNT possible different clamp values, currently there
818 * are only two: minimum utilization and maximum utilization.
819 *
820 * All utilization clamping values are MAX aggregated, since:
821 * - for util_min: we want to run the CPU at least at the max of the minimum
822 * utilization required by its currently RUNNABLE tasks.
823 * - for util_max: we want to allow the CPU to run up to the max of the
824 * maximum utilization allowed by its currently RUNNABLE tasks.
825 *
826 * Since on each system we expect only a limited number of different
827 * utilization clamp values (UCLAMP_BUCKETS), use a simple array to track
828 * the metrics required to compute all the per-rq utilization clamp values.
829 */
830struct uclamp_rq {
831 unsigned int value;
832 struct uclamp_bucket bucket[UCLAMP_BUCKETS];
833};
834#endif /* CONFIG_UCLAMP_TASK */
835
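
A stand-alone illustration of the bucket layout above: bits_per(SCHED_CAPACITY_SCALE) should come out to 11 for the default scale of 1024, leaving the rest of the word for the task refcount, so a bucket still fits in one unsigned long. The field widths are hard-coded here and assume a 64-bit long:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024
#define VALUE_BITS 11                    /* bits_per(SCHED_CAPACITY_SCALE) */
#define TASK_BITS  (64 - VALUE_BITS)     /* BITS_PER_LONG - value bits */

struct uclamp_bucket_sketch {
        unsigned long value : VALUE_BITS;   /* clamp value, 0..SCHED_CAPACITY_SCALE */
        unsigned long tasks : TASK_BITS;    /* RUNNABLE tasks refcounted on it */
};

int main(void)
{
        struct uclamp_bucket_sketch b = {
                .value = SCHED_CAPACITY_SCALE,
                .tasks = 3,
        };

        printf("sizeof(bucket) = %zu bytes\n", sizeof(b));
        printf("value=%lu tasks=%lu\n",
               (unsigned long)b.value, (unsigned long)b.tasks);
        return 0;
}
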
800/* 836/*
801 * This is the main, per-CPU runqueue data structure. 837 * This is the main, per-CPU runqueue data structure.
802 * 838 *
@@ -818,8 +854,6 @@ struct rq {
818 unsigned int nr_preferred_running; 854 unsigned int nr_preferred_running;
819 unsigned int numa_migrate_on; 855 unsigned int numa_migrate_on;
820#endif 856#endif
821 #define CPU_LOAD_IDX_MAX 5
822 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
823#ifdef CONFIG_NO_HZ_COMMON 857#ifdef CONFIG_NO_HZ_COMMON
824#ifdef CONFIG_SMP 858#ifdef CONFIG_SMP
825 unsigned long last_load_update_tick; 859 unsigned long last_load_update_tick;
@@ -830,11 +864,16 @@ struct rq {
830 atomic_t nohz_flags; 864 atomic_t nohz_flags;
831#endif /* CONFIG_NO_HZ_COMMON */ 865#endif /* CONFIG_NO_HZ_COMMON */
832 866
833 /* capture load from *all* tasks on this CPU: */
834 struct load_weight load;
835 unsigned long nr_load_updates; 867 unsigned long nr_load_updates;
836 u64 nr_switches; 868 u64 nr_switches;
837 869
870#ifdef CONFIG_UCLAMP_TASK
871 /* Utilization clamp values based on CPU's RUNNABLE tasks */
872 struct uclamp_rq uclamp[UCLAMP_CNT] ____cacheline_aligned;
873 unsigned int uclamp_flags;
874#define UCLAMP_FLAG_IDLE 0x01
875#endif
876
838 struct cfs_rq cfs; 877 struct cfs_rq cfs;
839 struct rt_rq rt; 878 struct rt_rq rt;
840 struct dl_rq dl; 879 struct dl_rq dl;
@@ -1649,6 +1688,10 @@ extern const u32 sched_prio_to_wmult[40];
1649struct sched_class { 1688struct sched_class {
1650 const struct sched_class *next; 1689 const struct sched_class *next;
1651 1690
1691#ifdef CONFIG_UCLAMP_TASK
1692 int uclamp_enabled;
1693#endif
1694
1652 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1695 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1653 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1696 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1654 void (*yield_task) (struct rq *rq); 1697 void (*yield_task) (struct rq *rq);
@@ -2222,6 +2265,48 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
2222static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} 2265static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2223#endif /* CONFIG_CPU_FREQ */ 2266#endif /* CONFIG_CPU_FREQ */
2224 2267
2268#ifdef CONFIG_UCLAMP_TASK
2269unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id);
2270
2271static __always_inline
2272unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
2273 struct task_struct *p)
2274{
2275 unsigned int min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
2276 unsigned int max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
2277
2278 if (p) {
2279 min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
2280 max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
2281 }
2282
2283 /*
2284 * Since CPU's {min,max}_util clamps are MAX aggregated considering
2285 * RUNNABLE tasks with _different_ clamps, we can end up with an
2286 * inversion. Fix it now when the clamps are applied.
2287 */
2288 if (unlikely(min_util >= max_util))
2289 return min_util;
2290
2291 return clamp(util, min_util, max_util);
2292}
2293
2294static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
2295{
2296 return uclamp_util_with(rq, util, NULL);
2297}
2298#else /* CONFIG_UCLAMP_TASK */
2299static inline unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
2300 struct task_struct *p)
2301{
2302 return util;
2303}
2304static inline unsigned int uclamp_util(struct rq *rq, unsigned int util)
2305{
2306 return util;
2307}
2308#endif /* CONFIG_UCLAMP_TASK */
2309
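
The clamp application above is easy to exercise in isolation; a user-space mirror of the uclamp_util_with() logic with made-up rq and task clamp values, including the min >= max inversion case:

#include <stdio.h>

static unsigned int clampv(unsigned int v, unsigned int lo, unsigned int hi)
{
        return v < lo ? lo : (v > hi ? hi : v);
}

/* MAX-aggregate the rq and task clamps, then handle a possible
 * min/max inversion before clamping the utilization. */
static unsigned int uclamp_util_sketch(unsigned int rq_min, unsigned int rq_max,
                                       unsigned int tsk_min, unsigned int tsk_max,
                                       unsigned int util)
{
        unsigned int min_util = rq_min > tsk_min ? rq_min : tsk_min;
        unsigned int max_util = rq_max > tsk_max ? rq_max : tsk_max;

        if (min_util >= max_util)
                return min_util;

        return clampv(util, min_util, max_util);
}

int main(void)
{
        /* Ordinary case: util boosted up to the aggregated minimum. */
        printf("%u\n", uclamp_util_sketch(0, 512, 300, 1024, 100));   /* -> 300 */

        /* Inversion: aggregated min exceeds aggregated max, min wins. */
        printf("%u\n", uclamp_util_sketch(800, 200, 0, 100, 50));     /* -> 800 */
        return 0;
}
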
2225#ifdef arch_scale_freq_capacity 2310#ifdef arch_scale_freq_capacity
2226# ifndef arch_scale_freq_invariant 2311# ifndef arch_scale_freq_invariant
2227# define arch_scale_freq_invariant() true 2312# define arch_scale_freq_invariant() true
@@ -2237,7 +2322,6 @@ static inline unsigned long capacity_orig_of(int cpu)
2237} 2322}
2238#endif 2323#endif
2239 2324
2240#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2241/** 2325/**
2242 * enum schedutil_type - CPU utilization type 2326 * enum schedutil_type - CPU utilization type
2243 * @FREQUENCY_UTIL: Utilization used to select frequency 2327 * @FREQUENCY_UTIL: Utilization used to select frequency
@@ -2253,15 +2337,11 @@ enum schedutil_type {
2253 ENERGY_UTIL, 2337 ENERGY_UTIL,
2254}; 2338};
2255 2339
2256unsigned long schedutil_freq_util(int cpu, unsigned long util_cfs, 2340#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2257 unsigned long max, enum schedutil_type type);
2258
2259static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs)
2260{
2261 unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
2262 2341
2263 return schedutil_freq_util(cpu, cfs, max, ENERGY_UTIL); 2342unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
2264} 2343 unsigned long max, enum schedutil_type type,
2344 struct task_struct *p);
2265 2345
2266static inline unsigned long cpu_bw_dl(struct rq *rq) 2346static inline unsigned long cpu_bw_dl(struct rq *rq)
2267{ 2347{
@@ -2290,11 +2370,13 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
2290 return READ_ONCE(rq->avg_rt.util_avg); 2370 return READ_ONCE(rq->avg_rt.util_avg);
2291} 2371}
2292#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ 2372#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2293static inline unsigned long schedutil_energy_util(int cpu, unsigned long cfs) 2373static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
2374 unsigned long max, enum schedutil_type type,
2375 struct task_struct *p)
2294{ 2376{
2295 return cfs; 2377 return 0;
2296} 2378}
2297#endif 2379#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
2298 2380
2299#ifdef CONFIG_HAVE_SCHED_AVG_IRQ 2381#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
2300static inline unsigned long cpu_util_irq(struct rq *rq) 2382static inline unsigned long cpu_util_irq(struct rq *rq)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f53f89df837d..f751ce0b783e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1344,11 +1344,6 @@ sd_init(struct sched_domain_topology_level *tl,
1344 .imbalance_pct = 125, 1344 .imbalance_pct = 125,
1345 1345
1346 .cache_nice_tries = 0, 1346 .cache_nice_tries = 0,
1347 .busy_idx = 0,
1348 .idle_idx = 0,
1349 .newidle_idx = 0,
1350 .wake_idx = 0,
1351 .forkexec_idx = 0,
1352 1347
1353 .flags = 1*SD_LOAD_BALANCE 1348 .flags = 1*SD_LOAD_BALANCE
1354 | 1*SD_BALANCE_NEWIDLE 1349 | 1*SD_BALANCE_NEWIDLE
@@ -1400,13 +1395,10 @@ sd_init(struct sched_domain_topology_level *tl,
1400 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { 1395 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1401 sd->imbalance_pct = 117; 1396 sd->imbalance_pct = 117;
1402 sd->cache_nice_tries = 1; 1397 sd->cache_nice_tries = 1;
1403 sd->busy_idx = 2;
1404 1398
1405#ifdef CONFIG_NUMA 1399#ifdef CONFIG_NUMA
1406 } else if (sd->flags & SD_NUMA) { 1400 } else if (sd->flags & SD_NUMA) {
1407 sd->cache_nice_tries = 2; 1401 sd->cache_nice_tries = 2;
1408 sd->busy_idx = 3;
1409 sd->idle_idx = 2;
1410 1402
1411 sd->flags &= ~SD_PREFER_SIBLING; 1403 sd->flags &= ~SD_PREFER_SIBLING;
1412 sd->flags |= SD_SERIALIZE; 1404 sd->flags |= SD_SERIALIZE;
@@ -1419,8 +1411,6 @@ sd_init(struct sched_domain_topology_level *tl,
1419#endif 1411#endif
1420 } else { 1412 } else {
1421 sd->cache_nice_tries = 1; 1413 sd->cache_nice_tries = 1;
1422 sd->busy_idx = 2;
1423 sd->idle_idx = 1;
1424 } 1414 }
1425 1415
1426 /* 1416 /*
@@ -1884,10 +1874,10 @@ static struct sched_domain_topology_level
1884 unsigned long cap; 1874 unsigned long cap;
1885 1875
1886 /* Is there any asymmetry? */ 1876 /* Is there any asymmetry? */
1887 cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map)); 1877 cap = arch_scale_cpu_capacity(cpumask_first(cpu_map));
1888 1878
1889 for_each_cpu(i, cpu_map) { 1879 for_each_cpu(i, cpu_map) {
1890 if (arch_scale_cpu_capacity(NULL, i) != cap) { 1880 if (arch_scale_cpu_capacity(i) != cap) {
1891 asym = true; 1881 asym = true;
1892 break; 1882 break;
1893 } 1883 }
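
With the sched_domain argument dropped, arch_scale_cpu_capacity() is keyed on the CPU alone, and the asymmetry check above reduces to comparing per-CPU capacities against the first one. A minimal stand-alone version, with made-up big.LITTLE-style capacities:

#include <stdbool.h>
#include <stdio.h>

/* Made-up per-CPU capacities for an asymmetric system. */
static const unsigned long cpu_capacity[] = { 446, 446, 1024, 1024 };
#define NR_CPUS (sizeof(cpu_capacity) / sizeof(cpu_capacity[0]))

int main(void)
{
        unsigned long cap = cpu_capacity[0];
        bool asym = false;
        size_t i;

        for (i = 1; i < NR_CPUS; i++) {
                if (cpu_capacity[i] != cap) {
                        asym = true;
                        break;
                }
        }

        printf("asymmetric capacities: %s\n", asym ? "yes" : "no");
        return 0;
}
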
@@ -1902,7 +1892,7 @@ static struct sched_domain_topology_level
1902 * to everyone. 1892 * to everyone.
1903 */ 1893 */
1904 for_each_cpu(i, cpu_map) { 1894 for_each_cpu(i, cpu_map) {
1905 unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i); 1895 unsigned long max_capacity = arch_scale_cpu_capacity(i);
1906 int tl_id = 0; 1896 int tl_id = 0;
1907 1897
1908 for_each_sd_topology(tl) { 1898 for_each_sd_topology(tl) {
@@ -1912,7 +1902,7 @@ static struct sched_domain_topology_level
1912 for_each_cpu_and(j, tl->mask(i), cpu_map) { 1902 for_each_cpu_and(j, tl->mask(i), cpu_map) {
1913 unsigned long capacity; 1903 unsigned long capacity;
1914 1904
1915 capacity = arch_scale_cpu_capacity(NULL, j); 1905 capacity = arch_scale_cpu_capacity(j);
1916 1906
1917 if (capacity <= max_capacity) 1907 if (capacity <= max_capacity)
1918 continue; 1908 continue;
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index fa0f9adfb752..c1e566a114ca 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -118,16 +118,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
118 bookmark.func = NULL; 118 bookmark.func = NULL;
119 INIT_LIST_HEAD(&bookmark.entry); 119 INIT_LIST_HEAD(&bookmark.entry);
120 120
121 spin_lock_irqsave(&wq_head->lock, flags); 121 do {
122 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, wake_flags, key, &bookmark);
123 spin_unlock_irqrestore(&wq_head->lock, flags);
124
125 while (bookmark.flags & WQ_FLAG_BOOKMARK) {
126 spin_lock_irqsave(&wq_head->lock, flags); 122 spin_lock_irqsave(&wq_head->lock, flags);
127 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive, 123 nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
128 wake_flags, key, &bookmark); 124 wake_flags, key, &bookmark);
129 spin_unlock_irqrestore(&wq_head->lock, flags); 125 spin_unlock_irqrestore(&wq_head->lock, flags);
130 } 126 } while (bookmark.flags & WQ_FLAG_BOOKMARK);
131} 127}
132 128
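
The refactor above folds the duplicated lock/scan/unlock sequence into a single do/while that keeps resuming while the bookmark flag is set. The shape of the change, with the wait-queue specifics replaced by a toy bounded scan:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* build: cc -pthread dedup.c */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int remaining = 3;       /* stand-in for a bookmarked scan that resumes */

/* One bounded pass over the "queue"; returns true while more work is left,
 * mirroring the WQ_FLAG_BOOKMARK check. */
static bool wake_pass(void)
{
        printf("pass, %d chunk(s) left\n", remaining);
        return --remaining > 0;
}

int main(void)
{
        bool more;

        /* Before: the locked section appeared once ahead of a while loop and
         * once inside it. After: a single do/while around the locked section. */
        do {
                pthread_mutex_lock(&lock);
                more = wake_pass();
                pthread_mutex_unlock(&lock);
        } while (more);

        return 0;
}
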
133/** 129/**
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 1beca96fb625..1c1ad1e14f21 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -452,6 +452,22 @@ static struct ctl_table kern_table[] = {
452 .mode = 0644, 452 .mode = 0644,
453 .proc_handler = sched_rr_handler, 453 .proc_handler = sched_rr_handler,
454 }, 454 },
455#ifdef CONFIG_UCLAMP_TASK
456 {
457 .procname = "sched_util_clamp_min",
458 .data = &sysctl_sched_uclamp_util_min,
459 .maxlen = sizeof(unsigned int),
460 .mode = 0644,
461 .proc_handler = sysctl_sched_uclamp_handler,
462 },
463 {
464 .procname = "sched_util_clamp_max",
465 .data = &sysctl_sched_uclamp_util_max,
466 .maxlen = sizeof(unsigned int),
467 .mode = 0644,
468 .proc_handler = sysctl_sched_uclamp_handler,
469 },
470#endif
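
The two entries above sit in kern_table, so with CONFIG_UCLAMP_TASK they should surface as /proc/sys/kernel/sched_util_clamp_min and /proc/sys/kernel/sched_util_clamp_max (paths inferred from the procnames; not shown in this hunk). A small sketch that reads them back:

#include <stdio.h>

static void show(const char *path)
{
        char buf[32];
        FILE *f = fopen(path, "r");

        if (f && fgets(buf, sizeof(buf), f))
                printf("%s = %s", path, buf);
        if (f)
                fclose(f);
}

int main(void)
{
        show("/proc/sys/kernel/sched_util_clamp_min");
        show("/proc/sys/kernel/sched_util_clamp_max");

        /* Writing needs root, e.g.:
         *   echo 512 > /proc/sys/kernel/sched_util_clamp_max
         */
        return 0;
}
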
455#ifdef CONFIG_SCHED_AUTOGROUP 471#ifdef CONFIG_SCHED_AUTOGROUP
456 { 472 {
457 .procname = "sched_autogroup_enabled", 473 .procname = "sched_autogroup_enabled",
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f4ee1a3428ae..be9707f68024 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -782,7 +782,6 @@ static void tick_nohz_stop_tick(struct tick_sched *ts, int cpu)
782 */ 782 */
783 if (!ts->tick_stopped) { 783 if (!ts->tick_stopped) {
784 calc_load_nohz_start(); 784 calc_load_nohz_start();
785 cpu_load_update_nohz_start();
786 quiet_vmstat(); 785 quiet_vmstat();
787 786
788 ts->last_tick = hrtimer_get_expires(&ts->sched_timer); 787 ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
@@ -829,7 +828,6 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
829{ 828{
830 /* Update jiffies first */ 829 /* Update jiffies first */
831 tick_do_update_jiffies64(now); 830 tick_do_update_jiffies64(now);
832 cpu_load_update_nohz_stop();
833 831
834 /* 832 /*
835 * Clear the timer idle flag, so we avoid IPIs on remote queueing and 833 * Clear the timer idle flag, so we avoid IPIs on remote queueing and
diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c
index 1e6db9cbe4dc..fa95139445b2 100644
--- a/kernel/trace/trace_hwlat.c
+++ b/kernel/trace/trace_hwlat.c
@@ -277,7 +277,7 @@ static void move_to_next_cpu(void)
277 * of this thread, than stop migrating for the duration 277 * of this thread, than stop migrating for the duration
278 * of the current test. 278 * of the current test.
279 */ 279 */
280 if (!cpumask_equal(current_mask, &current->cpus_allowed)) 280 if (!cpumask_equal(current_mask, current->cpus_ptr))
281 goto disable; 281 goto disable;
282 282
283 get_online_cpus(); 283 get_online_cpus();
diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c
index 157d9e31f6c2..60ba93fc42ce 100644
--- a/lib/smp_processor_id.c
+++ b/lib/smp_processor_id.c
@@ -23,7 +23,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
23 * Kernel threads bound to a single CPU can safely use 23 * Kernel threads bound to a single CPU can safely use
24 * smp_processor_id(): 24 * smp_processor_id():
25 */ 25 */
26 if (cpumask_equal(&current->cpus_allowed, cpumask_of(this_cpu))) 26 if (cpumask_equal(current->cpus_ptr, cpumask_of(this_cpu)))
27 goto out; 27 goto out;
28 28
29 /* 29 /*
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index 1da597aa6141..1a72b7d95cdc 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -34,7 +34,7 @@ static void simple_thread_func(int cnt)
34 34
35 /* Silly tracepoints */ 35 /* Silly tracepoints */
36 trace_foo_bar("hello", cnt, array, random_strings[len], 36 trace_foo_bar("hello", cnt, array, random_strings[len],
37 &current->cpus_allowed); 37 current->cpus_ptr);
38 38
39 trace_foo_with_template_simple("HELLO", cnt); 39 trace_foo_with_template_simple("HELLO", cnt);
40 40