author     Linus Torvalds <torvalds@linux-foundation.org>  2019-09-16 20:25:49 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2019-09-16 20:25:49 -0400
commit     7e67a859997aad47727aff9c5a32e160da079ce3
tree       96f53425c2834de5b3276d7598782ab6412e4d5e
parent     772c1d06bd402f7ee72c61a18c2db74cd74b6758
parent     563c4f85f9f0d63b712081d5b4522152cdcb8b6b
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - MAINTAINERS: Add Mark Rutland as perf submaintainer, Juri Lelli and
   Vincent Guittot as scheduler submaintainers. Add Dietmar Eggemann,
   Steven Rostedt, Ben Segall and Mel Gorman as scheduler reviewers.

   As perf and the scheduler is getting bigger and more complex,
   document the status quo of current responsibilities and interests,
   and spread the review pain^H^H^H^H fun via an increase in the Cc:
   linecount generated by scripts/get_maintainer.pl. :-)

 - Add another series of patches that brings the -rt (PREEMPT_RT) tree
   closer to mainline: split the monolithic CONFIG_PREEMPT dependencies
   into a new CONFIG_PREEMPTION category that will allow the eventual
   introduction of CONFIG_PREEMPT_RT. Still a few more hundred patches
   to go though.

 - Extend the CPU cgroup controller with uclamp.min and uclamp.max to
   allow the finer shaping of CPU bandwidth usage.

 - Micro-optimize energy-aware wake-ups from O(CPUS^2) to O(CPUS).

 - Improve the behavior of high CPU count, high thread count
   applications running under cpu.cfs_quota_us constraints.

 - Improve balancing with SCHED_IDLE (SCHED_BATCH) tasks present.

 - Improve CPU isolation housekeeping CPU allocation NUMA locality.

 - Fix deadline scheduler bandwidth calculations and logic when cpusets
   rebuilds the topology, or when it gets deadline-throttled while it's
   being offlined.

 - Convert the cpuset_mutex to percpu_rwsem, to allow it to be used
   from setscheduler() system calls without creating global
   serialization. Add new synchronization between cpuset
   topology-changing events and the deadline acceptance tests in
   setscheduler(), which were broken before.

 - Rework the active_mm state machine to be less confusing and more
   optimal.

 - Rework (simplify) the pick_next_task() slowpath.

 - Improve load-balancing on AMD EPYC systems.

 - ... and misc cleanups, smaller fixes and improvements - please see
   the Git log for more details.

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (53 commits)
  sched/psi: Correct overly pessimistic size calculation
  sched/fair: Speed-up energy-aware wake-ups
  sched/uclamp: Always use 'enum uclamp_id' for clamp_id values
  sched/uclamp: Update CPU's refcount on TG's clamp changes
  sched/uclamp: Use TG's clamps to restrict TASK's clamps
  sched/uclamp: Propagate system defaults to the root group
  sched/uclamp: Propagate parent clamps
  sched/uclamp: Extend CPU's cgroup controller
  sched/topology: Improve load balancing on AMD EPYC systems
  arch, ia64: Make NUMA select SMP
  sched, perf: MAINTAINERS update, add submaintainers and reviewers
  sched/fair: Use rq_lock/unlock in online_fair_sched_group
  cpufreq: schedutil: fix equation in comment
  sched: Rework pick_next_task() slow-path
  sched: Allow put_prev_task() to drop rq->lock
  sched/fair: Expose newidle_balance()
  sched: Add task_struct pointer to sched_class::set_curr_task
  sched: Rework CPU hotplug task selection
  sched/{rt,deadline}: Fix set_next_task vs pick_next_task
  sched: Fix kerneldoc comment for ia64_set_curr_task
  ...
-rw-r--r--  Documentation/admin-guide/cgroup-v2.rst | 34
-rw-r--r--  Documentation/scheduler/sched-bwc.rst | 74
-rw-r--r--  MAINTAINERS | 7
-rw-r--r--  arch/Kconfig | 2
-rw-r--r--  arch/ia64/Kconfig | 1
-rw-r--r--  arch/x86/entry/entry_32.S | 6
-rw-r--r--  arch/x86/entry/entry_64.S | 4
-rw-r--r--  arch/x86/entry/thunk_32.S | 2
-rw-r--r--  arch/x86/entry/thunk_64.S | 4
-rw-r--r--  arch/x86/include/asm/preempt.h | 2
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 5
-rw-r--r--  arch/x86/kernel/dumpstack.c | 7
-rw-r--r--  arch/x86/kernel/kprobes/core.c | 2
-rw-r--r--  arch/x86/kernel/kvm.c | 2
-rw-r--r--  include/asm-generic/preempt.h | 4
-rw-r--r--  include/linux/cgroup.h | 1
-rw-r--r--  include/linux/cpuset.h | 13
-rw-r--r--  include/linux/preempt.h | 6
-rw-r--r--  include/linux/rcupdate.h | 2
-rw-r--r--  include/linux/rcutree.h | 2
-rw-r--r--  include/linux/sched.h | 11
-rw-r--r--  include/linux/sched/deadline.h | 8
-rw-r--r--  include/linux/sched/task.h | 6
-rw-r--r--  include/linux/sched/topology.h | 10
-rw-r--r--  include/linux/spinlock.h | 2
-rw-r--r--  include/linux/spinlock_api_smp.h | 2
-rw-r--r--  include/linux/topology.h | 14
-rw-r--r--  include/linux/torture.h | 2
-rw-r--r--  init/Kconfig | 22
-rw-r--r--  init/init_task.c | 2
-rw-r--r--  init/main.c | 2
-rw-r--r--  kernel/cgroup/cgroup.c | 2
-rw-r--r--  kernel/cgroup/cpuset.c | 163
-rw-r--r--  kernel/events/core.c | 9
-rw-r--r--  kernel/irq/manage.c | 3
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/locking/rtmutex.c | 6
-rw-r--r--  kernel/rcu/Kconfig | 8
-rw-r--r--  kernel/rcu/tree.c | 12
-rw-r--r--  kernel/rcu/tree_stall.h | 6
-rw-r--r--  kernel/sched/core.c | 561
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 6
-rw-r--r--  kernel/sched/deadline.c | 134
-rw-r--r--  kernel/sched/fair.c | 409
-rw-r--r--  kernel/sched/idle.c | 31
-rw-r--r--  kernel/sched/isolation.c | 12
-rw-r--r--  kernel/sched/psi.c | 2
-rw-r--r--  kernel/sched/rt.c | 74
-rw-r--r--  kernel/sched/sched.h | 63
-rw-r--r--  kernel/sched/stats.h | 7
-rw-r--r--  kernel/sched/stop_task.c | 22
-rw-r--r--  kernel/sched/topology.c | 53
-rw-r--r--  kernel/stop_machine.c | 2
-rw-r--r--  kernel/trace/Kconfig | 6
-rw-r--r--  kernel/trace/ftrace.c | 2
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 2
-rw-r--r--  kernel/trace/trace_events.c | 4
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 3
-rw-r--r--  mm/khugepaged.c | 2
-rw-r--r--  mm/page_alloc.c | 2
60 files changed, 1274 insertions, 595 deletions
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 3b29005aa981..5f1c266131b0 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -951,6 +951,13 @@ controller implements weight and absolute bandwidth limit models for
 normal scheduling policy and absolute bandwidth allocation model for
 realtime scheduling policy.
 
+In all the above models, cycles distribution is defined only on a temporal
+base and it does not account for the frequency at which tasks are executed.
+The (optional) utilization clamping support allows to hint the schedutil
+cpufreq governor about the minimum desired frequency which should always be
+provided by a CPU, as well as the maximum desired frequency, which should not
+be exceeded by a CPU.
+
 WARNING: cgroup2 doesn't yet support control of realtime processes and
 the cpu controller can only be enabled when all RT processes are in
 the root cgroup. Be aware that system management software may already
@@ -1016,6 +1023,33 @@ All time durations are in microseconds.
        Shows pressure stall information for CPU. See
        Documentation/accounting/psi.rst for details.
 
+  cpu.uclamp.min
+        A read-write single value file which exists on non-root cgroups.
+        The default is "0", i.e. no utilization boosting.
+
+        The requested minimum utilization (protection) as a percentage
+        rational number, e.g. 12.34 for 12.34%.
+
+        This interface allows reading and setting minimum utilization clamp
+        values similar to the sched_setattr(2). This minimum utilization
+        value is used to clamp the task specific minimum utilization clamp.
+
+        The requested minimum utilization (protection) is always capped by
+        the current value for the maximum utilization (limit), i.e.
+        `cpu.uclamp.max`.
+
+  cpu.uclamp.max
+        A read-write single value file which exists on non-root cgroups.
+        The default is "max". i.e. no utilization capping
+
+        The requested maximum utilization (limit) as a percentage rational
+        number, e.g. 98.76 for 98.76%.
+
+        This interface allows reading and setting maximum utilization clamp
+        values similar to the sched_setattr(2). This maximum utilization
+        value is used to clamp the task specific maximum utilization clamp.
+
+
 
 Memory
 ------
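The hunk above documents the new cgroup-level clamps; the per-task counterpart it points at is sched_setattr(2). As a minimal userspace sketch of that per-task interface (not part of this diff), the following assumes the struct sched_attr layout and SCHED_FLAG_UTIL_CLAMP_* values from the v5.3+ uapi headers; note the syscall takes values on the kernel's 0..1024 capacity scale, whereas cpu.uclamp.min/max take percentages:

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Flag values as defined by the v5.3+ uapi (assumption, reproduced here). */
#define SCHED_FLAG_UTIL_CLAMP_MIN 0x20
#define SCHED_FLAG_UTIL_CLAMP_MAX 0x40

struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
        uint32_t sched_util_min;   /* boost: don't run below this utilization */
        uint32_t sched_util_max;   /* cap: don't run above this utilization */
};

int main(void)
{
        struct sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.sched_policy = 0;                  /* SCHED_NORMAL */
        attr.sched_flags = SCHED_FLAG_UTIL_CLAMP_MIN | SCHED_FLAG_UTIL_CLAMP_MAX;
        attr.sched_util_min = 128;              /* roughly 12.5% of capacity */
        attr.sched_util_max = 512;              /* roughly 50% of capacity */

        /* pid 0 means the calling task. */
        if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1)
                perror("sched_setattr");
        return 0;
}

At the group level, the equivalent would be writing a percentage (e.g. "12.34") into cpu.uclamp.min of the target cgroup, with cpu.uclamp.max capping it from above as described in the hunk.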
diff --git a/Documentation/scheduler/sched-bwc.rst b/Documentation/scheduler/sched-bwc.rst
index 3a9064219656..9801d6b284b1 100644
--- a/Documentation/scheduler/sched-bwc.rst
+++ b/Documentation/scheduler/sched-bwc.rst
@@ -9,15 +9,16 @@ CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
 specification of the maximum CPU bandwidth available to a group or hierarchy.
 
 The bandwidth allowed for a group is specified using a quota and period. Within
-each given "period" (microseconds), a group is allowed to consume only up to
-"quota" microseconds of CPU time. When the CPU bandwidth consumption of a
-group exceeds this limit (for that period), the tasks belonging to its
-hierarchy will be throttled and are not allowed to run again until the next
-period.
-
-A group's unused runtime is globally tracked, being refreshed with quota units
-above at each period boundary. As threads consume this bandwidth it is
-transferred to cpu-local "silos" on a demand basis. The amount transferred
+each given "period" (microseconds), a task group is allocated up to "quota"
+microseconds of CPU time. That quota is assigned to per-cpu run queues in
+slices as threads in the cgroup become runnable. Once all quota has been
+assigned any additional requests for quota will result in those threads being
+throttled. Throttled threads will not be able to run again until the next
+period when the quota is replenished.
+
+A group's unassigned quota is globally tracked, being refreshed back to
+cfs_quota units at each period boundary. As threads consume this bandwidth it
+is transferred to cpu-local "silos" on a demand basis. The amount transferred
 within each of these updates is tunable and described as the "slice".
 
 Management
@@ -35,12 +36,12 @@ The default values are::
 
 A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
 bandwidth restriction in place, such a group is described as an unconstrained
 bandwidth group. This represents the traditional work-conserving behavior for
 CFS.
 
 Writing any (valid) positive value(s) will enact the specified bandwidth limit.
 The minimum quota allowed for the quota or period is 1ms. There is also an
 upper bound on the period length of 1s. Additional restrictions exist when
 bandwidth limits are used in a hierarchical fashion, these are explained in
 more detail below.
 
@@ -53,8 +54,8 @@ unthrottled if it is in a constrained state.
 System wide settings
 --------------------
 For efficiency run-time is transferred between the global pool and CPU local
 "silos" in a batch fashion. This greatly reduces global accounting pressure
 on large systems. The amount transferred each time such an update is required
 is described as the "slice".
 
 This is tunable via procfs::
@@ -97,6 +98,51 @@ There are two ways in which a group may become throttled:
 In case b) above, even though the child may have runtime remaining it will not
 be allowed to until the parent's runtime is refreshed.
 
+CFS Bandwidth Quota Caveats
+---------------------------
+Once a slice is assigned to a cpu it does not expire. However all but 1ms of
+the slice may be returned to the global pool if all threads on that cpu become
+unrunnable. This is configured at compile time by the min_cfs_rq_runtime
+variable. This is a performance tweak that helps prevent added contention on
+the global lock.
+
+The fact that cpu-local slices do not expire results in some interesting corner
+cases that should be understood.
+
+For cgroup cpu constrained applications that are cpu limited this is a
+relatively moot point because they will naturally consume the entirety of their
+quota as well as the entirety of each cpu-local slice in each period. As a
+result it is expected that nr_periods roughly equal nr_throttled, and that
+cpuacct.usage will increase roughly equal to cfs_quota_us in each period.
+
+For highly-threaded, non-cpu bound applications this non-expiration nuance
+allows applications to briefly burst past their quota limits by the amount of
+unused slice on each cpu that the task group is running on (typically at most
+1ms per cpu or as defined by min_cfs_rq_runtime). This slight burst only
+applies if quota had been assigned to a cpu and then not fully used or returned
+in previous periods. This burst amount will not be transferred between cores.
+As a result, this mechanism still strictly limits the task group to quota
+average usage, albeit over a longer time window than a single period. This
+also limits the burst ability to no more than 1ms per cpu. This provides
+better more predictable user experience for highly threaded applications with
+small quota limits on high core count machines. It also eliminates the
+propensity to throttle these applications while simultanously using less than
+quota amounts of cpu. Another way to say this, is that by allowing the unused
+portion of a slice to remain valid across periods we have decreased the
+possibility of wastefully expiring quota on cpu-local silos that don't need a
+full slice's amount of cpu time.
+
+The interaction between cpu-bound and non-cpu-bound-interactive applications
+should also be considered, especially when single core usage hits 100%. If you
+gave each of these applications half of a cpu-core and they both got scheduled
+on the same CPU it is theoretically possible that the non-cpu bound application
+will use up to 1ms additional quota in some periods, thereby preventing the
+cpu-bound application from fully using its quota by that same amount. In these
+instances it will be up to the CFS algorithm (see sched-design-CFS.rst) to
+decide which application is chosen to run, as they will both be runnable and
+have remaining quota. This runtime discrepancy will be made up in the following
+periods when the interactive application idles.
+
 Examples
 --------
 1. Limit a group to 1 CPU worth of runtime::
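To make the management interface described in the hunk above concrete, here is a small sketch that applies a half-CPU limit by writing cpu.cfs_period_us and cpu.cfs_quota_us; the cgroup path "/sys/fs/cgroup/cpu/app" is an assumed example, not something defined by this patch:

#include <stdio.h>

/* Write a single integer value into a cgroup control file. */
static int write_val(const char *path, long long val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return -1;
        }
        fprintf(f, "%lld\n", val);
        return fclose(f);
}

int main(void)
{
        /* Allow 250ms of run-time every 500ms period: half a CPU on average. */
        write_val("/sys/fs/cgroup/cpu/app/cpu.cfs_period_us", 500000);
        write_val("/sys/fs/cgroup/cpu/app/cpu.cfs_quota_us",  250000);
        return 0;
}

Per the caveats section added above, such a group may also retain up to roughly 1ms of previously assigned but unused slice per CPU across period boundaries, so short bursts slightly above 50% are possible while the long-run average stays within the configured quota.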
diff --git a/MAINTAINERS b/MAINTAINERS
index cbe625343277..49f75d1b7b51 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -12578,6 +12578,7 @@ PERFORMANCE EVENTS SUBSYSTEM
 M:	Peter Zijlstra <peterz@infradead.org>
 M:	Ingo Molnar <mingo@redhat.com>
 M:	Arnaldo Carvalho de Melo <acme@kernel.org>
+R:	Mark Rutland <mark.rutland@arm.com>
 R:	Alexander Shishkin <alexander.shishkin@linux.intel.com>
 R:	Jiri Olsa <jolsa@redhat.com>
 R:	Namhyung Kim <namhyung@kernel.org>
@@ -14175,6 +14176,12 @@ F: drivers/watchdog/sc1200wdt.c
 SCHEDULER
 M:	Ingo Molnar <mingo@redhat.com>
 M:	Peter Zijlstra <peterz@infradead.org>
+M:	Juri Lelli <juri.lelli@redhat.com> (SCHED_DEADLINE)
+M:	Vincent Guittot <vincent.guittot@linaro.org> (SCHED_NORMAL)
+R:	Dietmar Eggemann <dietmar.eggemann@arm.com> (SCHED_NORMAL)
+R:	Steven Rostedt <rostedt@goodmis.org> (SCHED_FIFO/SCHED_RR)
+R:	Ben Segall <bsegall@google.com> (CONFIG_CFS_BANDWIDTH)
+R:	Mel Gorman <mgorman@suse.de> (CONFIG_NUMA_BALANCING)
 L:	linux-kernel@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core
 S:	Maintained
diff --git a/arch/Kconfig b/arch/Kconfig
index 71d9ae0c0ea1..6baedab10dca 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -106,7 +106,7 @@ config STATIC_KEYS_SELFTEST
 config OPTPROBES
 	def_bool y
 	depends on KPROBES && HAVE_OPTPROBES
-	select TASKS_RCU if PREEMPT
+	select TASKS_RCU if PREEMPTION
 
 config KPROBES_ON_FTRACE
 	def_bool y
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 13d49c232556..9711cf730929 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -311,6 +311,7 @@ config ARCH_DISCONTIGMEM_DEFAULT
 config NUMA
 	bool "NUMA support"
 	depends on !FLATMEM
+	select SMP
 	help
 	  Say Y to compile the kernel to support NUMA (Non-Uniform Memory
 	  Access). This option is for configuring high-end multiprocessor
diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4f86928246e7..f83ca5aa8b77 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -63,7 +63,7 @@
63 * enough to patch inline, increasing performance. 63 * enough to patch inline, increasing performance.
64 */ 64 */
65 65
66#ifdef CONFIG_PREEMPT 66#ifdef CONFIG_PREEMPTION
67# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF 67# define preempt_stop(clobbers) DISABLE_INTERRUPTS(clobbers); TRACE_IRQS_OFF
68#else 68#else
69# define preempt_stop(clobbers) 69# define preempt_stop(clobbers)
@@ -1084,7 +1084,7 @@ restore_all:
1084 INTERRUPT_RETURN 1084 INTERRUPT_RETURN
1085 1085
1086restore_all_kernel: 1086restore_all_kernel:
1087#ifdef CONFIG_PREEMPT 1087#ifdef CONFIG_PREEMPTION
1088 DISABLE_INTERRUPTS(CLBR_ANY) 1088 DISABLE_INTERRUPTS(CLBR_ANY)
1089 cmpl $0, PER_CPU_VAR(__preempt_count) 1089 cmpl $0, PER_CPU_VAR(__preempt_count)
1090 jnz .Lno_preempt 1090 jnz .Lno_preempt
@@ -1364,7 +1364,7 @@ ENTRY(xen_hypervisor_callback)
1364ENTRY(xen_do_upcall) 1364ENTRY(xen_do_upcall)
13651: mov %esp, %eax 13651: mov %esp, %eax
1366 call xen_evtchn_do_upcall 1366 call xen_evtchn_do_upcall
1367#ifndef CONFIG_PREEMPT 1367#ifndef CONFIG_PREEMPTION
1368 call xen_maybe_preempt_hcall 1368 call xen_maybe_preempt_hcall
1369#endif 1369#endif
1370 jmp ret_from_intr 1370 jmp ret_from_intr
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index be9ca198c581..af077ded1969 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -664,7 +664,7 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
664 664
665/* Returning to kernel space */ 665/* Returning to kernel space */
666retint_kernel: 666retint_kernel:
667#ifdef CONFIG_PREEMPT 667#ifdef CONFIG_PREEMPTION
668 /* Interrupts are off */ 668 /* Interrupts are off */
669 /* Check if we need preemption */ 669 /* Check if we need preemption */
670 btl $9, EFLAGS(%rsp) /* were interrupts off? */ 670 btl $9, EFLAGS(%rsp) /* were interrupts off? */
@@ -1115,7 +1115,7 @@ ENTRY(xen_do_hypervisor_callback) /* do_hypervisor_callback(struct *pt_regs) */
1115 call xen_evtchn_do_upcall 1115 call xen_evtchn_do_upcall
1116 LEAVE_IRQ_STACK 1116 LEAVE_IRQ_STACK
1117 1117
1118#ifndef CONFIG_PREEMPT 1118#ifndef CONFIG_PREEMPTION
1119 call xen_maybe_preempt_hcall 1119 call xen_maybe_preempt_hcall
1120#endif 1120#endif
1121 jmp error_exit 1121 jmp error_exit
diff --git a/arch/x86/entry/thunk_32.S b/arch/x86/entry/thunk_32.S
index cb3464525b37..2713490611a3 100644
--- a/arch/x86/entry/thunk_32.S
+++ b/arch/x86/entry/thunk_32.S
@@ -34,7 +34,7 @@
34 THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1 34 THUNK trace_hardirqs_off_thunk,trace_hardirqs_off_caller,1
35#endif 35#endif
36 36
37#ifdef CONFIG_PREEMPT 37#ifdef CONFIG_PREEMPTION
38 THUNK ___preempt_schedule, preempt_schedule 38 THUNK ___preempt_schedule, preempt_schedule
39 THUNK ___preempt_schedule_notrace, preempt_schedule_notrace 39 THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
40 EXPORT_SYMBOL(___preempt_schedule) 40 EXPORT_SYMBOL(___preempt_schedule)
diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S
index cc20465b2867..ea5c4167086c 100644
--- a/arch/x86/entry/thunk_64.S
+++ b/arch/x86/entry/thunk_64.S
@@ -46,7 +46,7 @@
46 THUNK lockdep_sys_exit_thunk,lockdep_sys_exit 46 THUNK lockdep_sys_exit_thunk,lockdep_sys_exit
47#endif 47#endif
48 48
49#ifdef CONFIG_PREEMPT 49#ifdef CONFIG_PREEMPTION
50 THUNK ___preempt_schedule, preempt_schedule 50 THUNK ___preempt_schedule, preempt_schedule
51 THUNK ___preempt_schedule_notrace, preempt_schedule_notrace 51 THUNK ___preempt_schedule_notrace, preempt_schedule_notrace
52 EXPORT_SYMBOL(___preempt_schedule) 52 EXPORT_SYMBOL(___preempt_schedule)
@@ -55,7 +55,7 @@
55 55
56#if defined(CONFIG_TRACE_IRQFLAGS) \ 56#if defined(CONFIG_TRACE_IRQFLAGS) \
57 || defined(CONFIG_DEBUG_LOCK_ALLOC) \ 57 || defined(CONFIG_DEBUG_LOCK_ALLOC) \
58 || defined(CONFIG_PREEMPT) 58 || defined(CONFIG_PREEMPTION)
59.L_restore: 59.L_restore:
60 popq %r11 60 popq %r11
61 popq %r10 61 popq %r10
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 99a7fa9ab0a3..3d4cb83a8828 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -102,7 +102,7 @@ static __always_inline bool should_resched(int preempt_offset)
102 return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); 102 return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset);
103} 103}
104 104
105#ifdef CONFIG_PREEMPT 105#ifdef CONFIG_PREEMPTION
106 extern asmlinkage void ___preempt_schedule(void); 106 extern asmlinkage void ___preempt_schedule(void);
107# define __preempt_schedule() \ 107# define __preempt_schedule() \
108 asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT) 108 asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 68c363c341bf..7d6e0efcc2db 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/random.h>
+#include <linux/topology.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cacheinfo.h>
@@ -889,6 +890,10 @@ static void init_amd_zn(struct cpuinfo_x86 *c)
 {
 	set_cpu_cap(c, X86_FEATURE_ZEN);
 
+#ifdef CONFIG_NUMA
+	node_reclaim_distance = 32;
+#endif
+
 	/*
 	 * Fix erratum 1076: CPB feature bit not being set in CPUID.
 	 * Always set it, except when running under a hypervisor.
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2b5886401e5f..e07424e19274 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -367,13 +367,18 @@ NOKPROBE_SYMBOL(oops_end);
 
 int __die(const char *str, struct pt_regs *regs, long err)
 {
+	const char *pr = "";
+
 	/* Save the regs of the first oops for the executive summary later. */
 	if (!die_counter)
 		exec_summary_regs = *regs;
 
+	if (IS_ENABLED(CONFIG_PREEMPTION))
+		pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT";
+
 	printk(KERN_DEFAULT
 	       "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter,
-	       IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "",
+	       pr,
 	       IS_ENABLED(CONFIG_SMP)     ? " SMP" : "",
 	       debug_pagealloc_enabled()  ? " DEBUG_PAGEALLOC" : "",
 	       IS_ENABLED(CONFIG_KASAN)   ? " KASAN" : "",
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 0e0b08008b5a..43fc13c831af 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -580,7 +580,7 @@ static void setup_singlestep(struct kprobe *p, struct pt_regs *regs,
580 if (setup_detour_execution(p, regs, reenter)) 580 if (setup_detour_execution(p, regs, reenter))
581 return; 581 return;
582 582
583#if !defined(CONFIG_PREEMPT) 583#if !defined(CONFIG_PREEMPTION)
584 if (p->ainsn.boostable && !p->post_handler) { 584 if (p->ainsn.boostable && !p->post_handler) {
585 /* Boost up -- we can execute copied instructions directly */ 585 /* Boost up -- we can execute copied instructions directly */
586 if (!reenter) 586 if (!reenter)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 4ab377c9fffe..4cc967178bf9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -311,7 +311,7 @@ static void kvm_guest_cpu_init(void)
311 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { 311 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
312 u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason)); 312 u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
313 313
314#ifdef CONFIG_PREEMPT 314#ifdef CONFIG_PREEMPTION
315 pa |= KVM_ASYNC_PF_SEND_ALWAYS; 315 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
316#endif 316#endif
317 pa |= KVM_ASYNC_PF_ENABLED; 317 pa |= KVM_ASYNC_PF_ENABLED;
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index c3046c920063..d683f5e6d791 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -78,11 +78,11 @@ static __always_inline bool should_resched(int preempt_offset)
78 tif_need_resched()); 78 tif_need_resched());
79} 79}
80 80
81#ifdef CONFIG_PREEMPT 81#ifdef CONFIG_PREEMPTION
82extern asmlinkage void preempt_schedule(void); 82extern asmlinkage void preempt_schedule(void);
83#define __preempt_schedule() preempt_schedule() 83#define __preempt_schedule() preempt_schedule()
84extern asmlinkage void preempt_schedule_notrace(void); 84extern asmlinkage void preempt_schedule_notrace(void);
85#define __preempt_schedule_notrace() preempt_schedule_notrace() 85#define __preempt_schedule_notrace() preempt_schedule_notrace()
86#endif /* CONFIG_PREEMPT */ 86#endif /* CONFIG_PREEMPTION */
87 87
88#endif /* __ASM_PREEMPT_H */ 88#endif /* __ASM_PREEMPT_H */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index f6b048902d6c..3ba3e6da13a6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -150,6 +150,7 @@ struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset,
150struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, 150struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
151 struct cgroup_subsys_state **dst_cssp); 151 struct cgroup_subsys_state **dst_cssp);
152 152
153void cgroup_enable_task_cg_lists(void);
153void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, 154void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
154 struct css_task_iter *it); 155 struct css_task_iter *it);
155struct task_struct *css_task_iter_next(struct css_task_iter *it); 156struct task_struct *css_task_iter_next(struct css_task_iter *it);
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 934633a05d20..04c20de66afc 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void)
40 40
41static inline void cpuset_inc(void) 41static inline void cpuset_inc(void)
42{ 42{
43 static_branch_inc(&cpusets_pre_enable_key); 43 static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
44 static_branch_inc(&cpusets_enabled_key); 44 static_branch_inc_cpuslocked(&cpusets_enabled_key);
45} 45}
46 46
47static inline void cpuset_dec(void) 47static inline void cpuset_dec(void)
48{ 48{
49 static_branch_dec(&cpusets_enabled_key); 49 static_branch_dec_cpuslocked(&cpusets_enabled_key);
50 static_branch_dec(&cpusets_pre_enable_key); 50 static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
51} 51}
52 52
53extern int cpuset_init(void); 53extern int cpuset_init(void);
@@ -55,6 +55,8 @@ extern void cpuset_init_smp(void);
55extern void cpuset_force_rebuild(void); 55extern void cpuset_force_rebuild(void);
56extern void cpuset_update_active_cpus(void); 56extern void cpuset_update_active_cpus(void);
57extern void cpuset_wait_for_hotplug(void); 57extern void cpuset_wait_for_hotplug(void);
58extern void cpuset_read_lock(void);
59extern void cpuset_read_unlock(void);
58extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); 60extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
59extern void cpuset_cpus_allowed_fallback(struct task_struct *p); 61extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
60extern nodemask_t cpuset_mems_allowed(struct task_struct *p); 62extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -176,6 +178,9 @@ static inline void cpuset_update_active_cpus(void)
176 178
177static inline void cpuset_wait_for_hotplug(void) { } 179static inline void cpuset_wait_for_hotplug(void) { }
178 180
181static inline void cpuset_read_lock(void) { }
182static inline void cpuset_read_unlock(void) { }
183
179static inline void cpuset_cpus_allowed(struct task_struct *p, 184static inline void cpuset_cpus_allowed(struct task_struct *p,
180 struct cpumask *mask) 185 struct cpumask *mask)
181{ 186{
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index dd92b1a93919..bbb68dba37cc 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -182,7 +182,7 @@ do { \
182 182
183#define preemptible() (preempt_count() == 0 && !irqs_disabled()) 183#define preemptible() (preempt_count() == 0 && !irqs_disabled())
184 184
185#ifdef CONFIG_PREEMPT 185#ifdef CONFIG_PREEMPTION
186#define preempt_enable() \ 186#define preempt_enable() \
187do { \ 187do { \
188 barrier(); \ 188 barrier(); \
@@ -203,7 +203,7 @@ do { \
203 __preempt_schedule(); \ 203 __preempt_schedule(); \
204} while (0) 204} while (0)
205 205
206#else /* !CONFIG_PREEMPT */ 206#else /* !CONFIG_PREEMPTION */
207#define preempt_enable() \ 207#define preempt_enable() \
208do { \ 208do { \
209 barrier(); \ 209 barrier(); \
@@ -217,7 +217,7 @@ do { \
217} while (0) 217} while (0)
218 218
219#define preempt_check_resched() do { } while (0) 219#define preempt_check_resched() do { } while (0)
220#endif /* CONFIG_PREEMPT */ 220#endif /* CONFIG_PREEMPTION */
221 221
222#define preempt_disable_notrace() \ 222#define preempt_disable_notrace() \
223do { \ 223do { \
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 80d6056f5855..75a2eded7aa2 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -585,7 +585,7 @@ do { \
585 * 585 *
586 * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU), 586 * In non-preemptible RCU implementations (TREE_RCU and TINY_RCU),
587 * it is illegal to block while in an RCU read-side critical section. 587 * it is illegal to block while in an RCU read-side critical section.
588 * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPT 588 * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION
589 * kernel builds, RCU read-side critical sections may be preempted, 589 * kernel builds, RCU read-side critical sections may be preempted,
590 * but explicit blocking is illegal. Finally, in preemptible RCU 590 * but explicit blocking is illegal. Finally, in preemptible RCU
591 * implementations in real-time (with -rt patchset) kernel builds, RCU 591 * implementations in real-time (with -rt patchset) kernel builds, RCU
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 735601ac27d3..18b1ed9864b0 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -53,7 +53,7 @@ void rcu_scheduler_starting(void);
53extern int rcu_scheduler_active __read_mostly; 53extern int rcu_scheduler_active __read_mostly;
54void rcu_end_inkernel_boot(void); 54void rcu_end_inkernel_boot(void);
55bool rcu_is_watching(void); 55bool rcu_is_watching(void);
56#ifndef CONFIG_PREEMPT 56#ifndef CONFIG_PREEMPTION
57void rcu_all_qs(void); 57void rcu_all_qs(void);
58#endif 58#endif
59 59
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9f51932bd543..f0edee94834a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -295,6 +295,11 @@ enum uclamp_id {
295 UCLAMP_CNT 295 UCLAMP_CNT
296}; 296};
297 297
298#ifdef CONFIG_SMP
299extern struct root_domain def_root_domain;
300extern struct mutex sched_domains_mutex;
301#endif
302
298struct sched_info { 303struct sched_info {
299#ifdef CONFIG_SCHED_INFO 304#ifdef CONFIG_SCHED_INFO
300 /* Cumulative counters: */ 305 /* Cumulative counters: */
@@ -1767,7 +1772,7 @@ static inline int test_tsk_need_resched(struct task_struct *tsk)
1767 * value indicates whether a reschedule was done in fact. 1772 * value indicates whether a reschedule was done in fact.
1768 * cond_resched_lock() will drop the spinlock before scheduling, 1773 * cond_resched_lock() will drop the spinlock before scheduling,
1769 */ 1774 */
1770#ifndef CONFIG_PREEMPT 1775#ifndef CONFIG_PREEMPTION
1771extern int _cond_resched(void); 1776extern int _cond_resched(void);
1772#else 1777#else
1773static inline int _cond_resched(void) { return 0; } 1778static inline int _cond_resched(void) { return 0; }
@@ -1796,12 +1801,12 @@ static inline void cond_resched_rcu(void)
1796 1801
1797/* 1802/*
1798 * Does a critical section need to be broken due to another 1803 * Does a critical section need to be broken due to another
1799 * task waiting?: (technically does not depend on CONFIG_PREEMPT, 1804 * task waiting?: (technically does not depend on CONFIG_PREEMPTION,
1800 * but a general need for low latency) 1805 * but a general need for low latency)
1801 */ 1806 */
1802static inline int spin_needbreak(spinlock_t *lock) 1807static inline int spin_needbreak(spinlock_t *lock)
1803{ 1808{
1804#ifdef CONFIG_PREEMPT 1809#ifdef CONFIG_PREEMPTION
1805 return spin_is_contended(lock); 1810 return spin_is_contended(lock);
1806#else 1811#else
1807 return 0; 1812 return 0;
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 0cb034331cbb..1aff00b65f3c 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -24,3 +24,11 @@ static inline bool dl_time_before(u64 a, u64 b)
24{ 24{
25 return (s64)(a - b) < 0; 25 return (s64)(a - b) < 0;
26} 26}
27
28#ifdef CONFIG_SMP
29
30struct root_domain;
31extern void dl_add_task_root_domain(struct task_struct *p);
32extern void dl_clear_root_domain(struct root_domain *rd);
33
34#endif /* CONFIG_SMP */
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 0497091e40c1..3d90ed8f75f0 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -105,7 +105,11 @@ extern void sched_exec(void);
 #define sched_exec()   {}
 #endif
 
-#define get_task_struct(tsk) do { refcount_inc(&(tsk)->usage); } while(0)
+static inline struct task_struct *get_task_struct(struct task_struct *t)
+{
+	refcount_inc(&t->usage);
+	return t;
+}
 
 extern void __put_task_struct(struct task_struct *t);
 
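The hunk above turns get_task_struct() from a statement-like macro into an inline that hands the task pointer back, so a reference can be taken inside an expression. A hypothetical kernel-side fragment, purely to illustrate the new calling convention (not a function from this series):

#include <linux/sched.h>
#include <linux/sched/task.h>

/* Take a reference on the current task and return it in one expression. */
static struct task_struct *grab_current_task_ref(void)
{
	return get_task_struct(current);	/* bumps ->usage, returns the task */
}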
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 7863bb62d2ab..f341163fedc9 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -150,6 +150,10 @@ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
150 return to_cpumask(sd->span); 150 return to_cpumask(sd->span);
151} 151}
152 152
153extern void partition_sched_domains_locked(int ndoms_new,
154 cpumask_var_t doms_new[],
155 struct sched_domain_attr *dattr_new);
156
153extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 157extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
154 struct sched_domain_attr *dattr_new); 158 struct sched_domain_attr *dattr_new);
155 159
@@ -195,6 +199,12 @@ extern void set_sched_topology(struct sched_domain_topology_level *tl);
195struct sched_domain_attr; 199struct sched_domain_attr;
196 200
197static inline void 201static inline void
202partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
203 struct sched_domain_attr *dattr_new)
204{
205}
206
207static inline void
198partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 208partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
199 struct sched_domain_attr *dattr_new) 209 struct sched_domain_attr *dattr_new)
200{ 210{
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index ed7c4d6b8235..031ce8617df8 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -214,7 +214,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
214 214
215/* 215/*
216 * Define the various spin_lock methods. Note we define these 216 * Define the various spin_lock methods. Note we define these
217 * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The 217 * regardless of whether CONFIG_SMP or CONFIG_PREEMPTION are set. The
218 * various methods are defined as nops in the case they are not 218 * various methods are defined as nops in the case they are not
219 * required. 219 * required.
220 */ 220 */
diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h
index 42dfab89e740..b762eaba4cdf 100644
--- a/include/linux/spinlock_api_smp.h
+++ b/include/linux/spinlock_api_smp.h
@@ -96,7 +96,7 @@ static inline int __raw_spin_trylock(raw_spinlock_t *lock)
96 96
97/* 97/*
98 * If lockdep is enabled then we use the non-preemption spin-ops 98 * If lockdep is enabled then we use the non-preemption spin-ops
99 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 99 * even on CONFIG_PREEMPTION, because lockdep assumes that interrupts are
100 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 100 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
101 */ 101 */
102#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) 102#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
diff --git a/include/linux/topology.h b/include/linux/topology.h
index 2a19d196af28..eb2fe6edd73c 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -60,6 +60,20 @@ int arch_update_cpu_topology(void);
  */
 #define RECLAIM_DISTANCE 30
 #endif
+
+/*
+ * The following tunable allows platforms to override the default node
+ * reclaim distance (RECLAIM_DISTANCE) if remote memory accesses are
+ * sufficiently fast that the default value actually hurts
+ * performance.
+ *
+ * AMD EPYC machines use this because even though the 2-hop distance
+ * is 32 (3.2x slower than a local memory access) performance actually
+ * *improves* if allowed to reclaim memory and load balance tasks
+ * between NUMA nodes 2-hops apart.
+ */
+extern int __read_mostly node_reclaim_distance;
+
 #ifndef PENALTY_FOR_NODE_WITH_CPUS
 #define PENALTY_FOR_NODE_WITH_CPUS	(1)
 #endif
diff --git a/include/linux/torture.h b/include/linux/torture.h
index a620118385bb..6241f59e2d6f 100644
--- a/include/linux/torture.h
+++ b/include/linux/torture.h
@@ -86,7 +86,7 @@ void _torture_stop_kthread(char *m, struct task_struct **tp);
86#define torture_stop_kthread(n, tp) \ 86#define torture_stop_kthread(n, tp) \
87 _torture_stop_kthread("Stopping " #n " task", &(tp)) 87 _torture_stop_kthread("Stopping " #n " task", &(tp))
88 88
89#ifdef CONFIG_PREEMPT 89#ifdef CONFIG_PREEMPTION
90#define torture_preempt_schedule() preempt_schedule() 90#define torture_preempt_schedule() preempt_schedule()
91#else 91#else
92#define torture_preempt_schedule() 92#define torture_preempt_schedule()
diff --git a/init/Kconfig b/init/Kconfig
index d96127ebc44e..ec1021fd3371 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -931,6 +931,28 @@ config RT_GROUP_SCHED
 
 endif #CGROUP_SCHED
 
+config UCLAMP_TASK_GROUP
+	bool "Utilization clamping per group of tasks"
+	depends on CGROUP_SCHED
+	depends on UCLAMP_TASK
+	default n
+	help
+	  This feature enables the scheduler to track the clamped utilization
+	  of each CPU based on RUNNABLE tasks currently scheduled on that CPU.
+
+	  When this option is enabled, the user can specify a min and max
+	  CPU bandwidth which is allowed for each single task in a group.
+	  The max bandwidth allows to clamp the maximum frequency a task
+	  can use, while the min bandwidth allows to define a minimum
+	  frequency a task will always use.
+
+	  When task group based utilization clamping is enabled, an eventually
+	  specified task-specific clamp value is constrained by the cgroup
+	  specified clamp value. Both minimum and maximum task clamping cannot
+	  be bigger than the corresponding clamping defined at task group level.
+
+	  If in doubt, say N.
+
 config CGROUP_PIDS
 	bool "PIDs controller"
 	help
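The help text above describes how a task's own clamp request interacts with its group's: neither the minimum nor the maximum task clamp may exceed the corresponding group value. A simplified model of that rule, illustrative only and not the kernel's uclamp code:

/*
 * Effective clamp as described in the UCLAMP_TASK_GROUP help text: the
 * task's requested value is restricted by the task group's clamp for the
 * same clamp index (min or max).
 */
static unsigned int effective_uclamp(unsigned int task_clamp,
				     unsigned int tg_clamp)
{
	return task_clamp < tg_clamp ? task_clamp : tg_clamp;
}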
diff --git a/init/init_task.c b/init/init_task.c
index 7ab773b9b3cd..bfe06c53b14e 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -174,7 +174,7 @@ struct task_struct init_task
174#ifdef CONFIG_FUNCTION_GRAPH_TRACER 174#ifdef CONFIG_FUNCTION_GRAPH_TRACER
175 .ret_stack = NULL, 175 .ret_stack = NULL,
176#endif 176#endif
177#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) 177#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION)
178 .trace_recursion = 0, 178 .trace_recursion = 0,
179#endif 179#endif
180#ifdef CONFIG_LIVEPATCH 180#ifdef CONFIG_LIVEPATCH
diff --git a/init/main.c b/init/main.c
index 96f8d5af52d6..653693da8da6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -433,7 +433,7 @@ noinline void __ref rest_init(void)
433 433
434 /* 434 /*
435 * Enable might_sleep() and smp_processor_id() checks. 435 * Enable might_sleep() and smp_processor_id() checks.
436 * They cannot be enabled earlier because with CONFIG_PREEMPT=y 436 * They cannot be enabled earlier because with CONFIG_PREEMPTION=y
437 * kernel_thread() would trigger might_sleep() splats. With 437 * kernel_thread() would trigger might_sleep() splats. With
438 * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled 438 * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
439 * already, but it's stuck on the kthreadd_done completion. 439 * already, but it's stuck on the kthreadd_done completion.
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8be1da1ebd9a..a7ce73a2c401 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1891,7 +1891,7 @@ static int cgroup_reconfigure(struct fs_context *fc)
1891 */ 1891 */
1892static bool use_task_css_set_links __read_mostly; 1892static bool use_task_css_set_links __read_mostly;
1893 1893
1894static void cgroup_enable_task_cg_lists(void) 1894void cgroup_enable_task_cg_lists(void)
1895{ 1895{
1896 struct task_struct *p, *g; 1896 struct task_struct *p, *g;
1897 1897
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 5aa37531ce76..c52bc91f882b 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -45,6 +45,7 @@
45#include <linux/proc_fs.h> 45#include <linux/proc_fs.h>
46#include <linux/rcupdate.h> 46#include <linux/rcupdate.h>
47#include <linux/sched.h> 47#include <linux/sched.h>
48#include <linux/sched/deadline.h>
48#include <linux/sched/mm.h> 49#include <linux/sched/mm.h>
49#include <linux/sched/task.h> 50#include <linux/sched/task.h>
50#include <linux/seq_file.h> 51#include <linux/seq_file.h>
@@ -332,7 +333,18 @@ static struct cpuset top_cpuset = {
332 * guidelines for accessing subsystem state in kernel/cgroup.c 333 * guidelines for accessing subsystem state in kernel/cgroup.c
333 */ 334 */
334 335
335static DEFINE_MUTEX(cpuset_mutex); 336DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
337
338void cpuset_read_lock(void)
339{
340 percpu_down_read(&cpuset_rwsem);
341}
342
343void cpuset_read_unlock(void)
344{
345 percpu_up_read(&cpuset_rwsem);
346}
347
336static DEFINE_SPINLOCK(callback_lock); 348static DEFINE_SPINLOCK(callback_lock);
337 349
338static struct workqueue_struct *cpuset_migrate_mm_wq; 350static struct workqueue_struct *cpuset_migrate_mm_wq;
@@ -894,6 +906,67 @@ done:
894 return ndoms; 906 return ndoms;
895} 907}
896 908
909static void update_tasks_root_domain(struct cpuset *cs)
910{
911 struct css_task_iter it;
912 struct task_struct *task;
913
914 css_task_iter_start(&cs->css, 0, &it);
915
916 while ((task = css_task_iter_next(&it)))
917 dl_add_task_root_domain(task);
918
919 css_task_iter_end(&it);
920}
921
922static void rebuild_root_domains(void)
923{
924 struct cpuset *cs = NULL;
925 struct cgroup_subsys_state *pos_css;
926
927 percpu_rwsem_assert_held(&cpuset_rwsem);
928 lockdep_assert_cpus_held();
929 lockdep_assert_held(&sched_domains_mutex);
930
931 cgroup_enable_task_cg_lists();
932
933 rcu_read_lock();
934
935 /*
936 * Clear default root domain DL accounting, it will be computed again
937 * if a task belongs to it.
938 */
939 dl_clear_root_domain(&def_root_domain);
940
941 cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
942
943 if (cpumask_empty(cs->effective_cpus)) {
944 pos_css = css_rightmost_descendant(pos_css);
945 continue;
946 }
947
948 css_get(&cs->css);
949
950 rcu_read_unlock();
951
952 update_tasks_root_domain(cs);
953
954 rcu_read_lock();
955 css_put(&cs->css);
956 }
957 rcu_read_unlock();
958}
959
960static void
961partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
962 struct sched_domain_attr *dattr_new)
963{
964 mutex_lock(&sched_domains_mutex);
965 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
966 rebuild_root_domains();
967 mutex_unlock(&sched_domains_mutex);
968}
969
897/* 970/*
898 * Rebuild scheduler domains. 971 * Rebuild scheduler domains.
899 * 972 *
@@ -911,8 +984,8 @@ static void rebuild_sched_domains_locked(void)
911 cpumask_var_t *doms; 984 cpumask_var_t *doms;
912 int ndoms; 985 int ndoms;
913 986
914 lockdep_assert_held(&cpuset_mutex); 987 lockdep_assert_cpus_held();
915 get_online_cpus(); 988 percpu_rwsem_assert_held(&cpuset_rwsem);
916 989
917 /* 990 /*
918 * We have raced with CPU hotplug. Don't do anything to avoid 991 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -921,19 +994,17 @@ static void rebuild_sched_domains_locked(void)
921 */ 994 */
922 if (!top_cpuset.nr_subparts_cpus && 995 if (!top_cpuset.nr_subparts_cpus &&
923 !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) 996 !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
924 goto out; 997 return;
925 998
926 if (top_cpuset.nr_subparts_cpus && 999 if (top_cpuset.nr_subparts_cpus &&
927 !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) 1000 !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
928 goto out; 1001 return;
929 1002
930 /* Generate domain masks and attrs */ 1003 /* Generate domain masks and attrs */
931 ndoms = generate_sched_domains(&doms, &attr); 1004 ndoms = generate_sched_domains(&doms, &attr);
932 1005
933 /* Have scheduler rebuild the domains */ 1006 /* Have scheduler rebuild the domains */
934 partition_sched_domains(ndoms, doms, attr); 1007 partition_and_rebuild_sched_domains(ndoms, doms, attr);
935out:
936 put_online_cpus();
937} 1008}
938#else /* !CONFIG_SMP */ 1009#else /* !CONFIG_SMP */
939static void rebuild_sched_domains_locked(void) 1010static void rebuild_sched_domains_locked(void)
@@ -943,9 +1014,11 @@ static void rebuild_sched_domains_locked(void)
943 1014
944void rebuild_sched_domains(void) 1015void rebuild_sched_domains(void)
945{ 1016{
946 mutex_lock(&cpuset_mutex); 1017 get_online_cpus();
1018 percpu_down_write(&cpuset_rwsem);
947 rebuild_sched_domains_locked(); 1019 rebuild_sched_domains_locked();
948 mutex_unlock(&cpuset_mutex); 1020 percpu_up_write(&cpuset_rwsem);
1021 put_online_cpus();
949} 1022}
950 1023
951/** 1024/**
@@ -1051,7 +1124,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
1051 int deleting; /* Moving cpus from subparts_cpus to effective_cpus */ 1124 int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
1052 bool part_error = false; /* Partition error? */ 1125 bool part_error = false; /* Partition error? */
1053 1126
1054 lockdep_assert_held(&cpuset_mutex); 1127 percpu_rwsem_assert_held(&cpuset_rwsem);
1055 1128
1056 /* 1129 /*
1057 * The parent must be a partition root. 1130 * The parent must be a partition root.
@@ -2039,7 +2112,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
2039 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); 2112 cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
2040 cs = css_cs(css); 2113 cs = css_cs(css);
2041 2114
2042 mutex_lock(&cpuset_mutex); 2115 percpu_down_write(&cpuset_rwsem);
2043 2116
2044 /* allow moving tasks into an empty cpuset if on default hierarchy */ 2117 /* allow moving tasks into an empty cpuset if on default hierarchy */
2045 ret = -ENOSPC; 2118 ret = -ENOSPC;
@@ -2063,7 +2136,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
2063 cs->attach_in_progress++; 2136 cs->attach_in_progress++;
2064 ret = 0; 2137 ret = 0;
2065out_unlock: 2138out_unlock:
2066 mutex_unlock(&cpuset_mutex); 2139 percpu_up_write(&cpuset_rwsem);
2067 return ret; 2140 return ret;
2068} 2141}
2069 2142
@@ -2073,9 +2146,9 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
2073 2146
2074 cgroup_taskset_first(tset, &css); 2147 cgroup_taskset_first(tset, &css);
2075 2148
2076 mutex_lock(&cpuset_mutex); 2149 percpu_down_write(&cpuset_rwsem);
2077 css_cs(css)->attach_in_progress--; 2150 css_cs(css)->attach_in_progress--;
2078 mutex_unlock(&cpuset_mutex); 2151 percpu_up_write(&cpuset_rwsem);
2079} 2152}
2080 2153
2081/* 2154/*
@@ -2098,7 +2171,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
2098 cgroup_taskset_first(tset, &css); 2171 cgroup_taskset_first(tset, &css);
2099 cs = css_cs(css); 2172 cs = css_cs(css);
2100 2173
2101 mutex_lock(&cpuset_mutex); 2174 percpu_down_write(&cpuset_rwsem);
2102 2175
2103 /* prepare for attach */ 2176 /* prepare for attach */
2104 if (cs == &top_cpuset) 2177 if (cs == &top_cpuset)
@@ -2152,7 +2225,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
2152 if (!cs->attach_in_progress) 2225 if (!cs->attach_in_progress)
2153 wake_up(&cpuset_attach_wq); 2226 wake_up(&cpuset_attach_wq);
2154 2227
2155 mutex_unlock(&cpuset_mutex); 2228 percpu_up_write(&cpuset_rwsem);
2156} 2229}
2157 2230
2158/* The various types of files and directories in a cpuset file system */ 2231/* The various types of files and directories in a cpuset file system */
@@ -2183,7 +2256,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
2183 cpuset_filetype_t type = cft->private; 2256 cpuset_filetype_t type = cft->private;
2184 int retval = 0; 2257 int retval = 0;
2185 2258
2186 mutex_lock(&cpuset_mutex); 2259 get_online_cpus();
2260 percpu_down_write(&cpuset_rwsem);
2187 if (!is_cpuset_online(cs)) { 2261 if (!is_cpuset_online(cs)) {
2188 retval = -ENODEV; 2262 retval = -ENODEV;
2189 goto out_unlock; 2263 goto out_unlock;
@@ -2219,7 +2293,8 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
2219 break; 2293 break;
2220 } 2294 }
2221out_unlock: 2295out_unlock:
2222 mutex_unlock(&cpuset_mutex); 2296 percpu_up_write(&cpuset_rwsem);
2297 put_online_cpus();
2223 return retval; 2298 return retval;
2224} 2299}
2225 2300
@@ -2230,7 +2305,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
2230 cpuset_filetype_t type = cft->private; 2305 cpuset_filetype_t type = cft->private;
2231 int retval = -ENODEV; 2306 int retval = -ENODEV;
2232 2307
2233 mutex_lock(&cpuset_mutex); 2308 get_online_cpus();
2309 percpu_down_write(&cpuset_rwsem);
2234 if (!is_cpuset_online(cs)) 2310 if (!is_cpuset_online(cs))
2235 goto out_unlock; 2311 goto out_unlock;
2236 2312
@@ -2243,7 +2319,8 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
2243 break; 2319 break;
2244 } 2320 }
2245out_unlock: 2321out_unlock:
2246 mutex_unlock(&cpuset_mutex); 2322 percpu_up_write(&cpuset_rwsem);
2323 put_online_cpus();
2247 return retval; 2324 return retval;
2248} 2325}
2249 2326
@@ -2282,7 +2359,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
2282 kernfs_break_active_protection(of->kn); 2359 kernfs_break_active_protection(of->kn);
2283 flush_work(&cpuset_hotplug_work); 2360 flush_work(&cpuset_hotplug_work);
2284 2361
2285 mutex_lock(&cpuset_mutex); 2362 get_online_cpus();
2363 percpu_down_write(&cpuset_rwsem);
2286 if (!is_cpuset_online(cs)) 2364 if (!is_cpuset_online(cs))
2287 goto out_unlock; 2365 goto out_unlock;
2288 2366
@@ -2306,7 +2384,8 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
2306 2384
2307 free_cpuset(trialcs); 2385 free_cpuset(trialcs);
2308out_unlock: 2386out_unlock:
2309 mutex_unlock(&cpuset_mutex); 2387 percpu_up_write(&cpuset_rwsem);
2388 put_online_cpus();
2310 kernfs_unbreak_active_protection(of->kn); 2389 kernfs_unbreak_active_protection(of->kn);
2311 css_put(&cs->css); 2390 css_put(&cs->css);
2312 flush_workqueue(cpuset_migrate_mm_wq); 2391 flush_workqueue(cpuset_migrate_mm_wq);
@@ -2437,13 +2516,15 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
2437 return -EINVAL; 2516 return -EINVAL;
2438 2517
2439 css_get(&cs->css); 2518 css_get(&cs->css);
2440 mutex_lock(&cpuset_mutex); 2519 get_online_cpus();
2520 percpu_down_write(&cpuset_rwsem);
2441 if (!is_cpuset_online(cs)) 2521 if (!is_cpuset_online(cs))
2442 goto out_unlock; 2522 goto out_unlock;
2443 2523
2444 retval = update_prstate(cs, val); 2524 retval = update_prstate(cs, val);
2445out_unlock: 2525out_unlock:
2446 mutex_unlock(&cpuset_mutex); 2526 percpu_up_write(&cpuset_rwsem);
2527 put_online_cpus();
2447 css_put(&cs->css); 2528 css_put(&cs->css);
2448 return retval ?: nbytes; 2529 return retval ?: nbytes;
2449} 2530}
@@ -2649,7 +2730,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
2649 if (!parent) 2730 if (!parent)
2650 return 0; 2731 return 0;
2651 2732
2652 mutex_lock(&cpuset_mutex); 2733 get_online_cpus();
2734 percpu_down_write(&cpuset_rwsem);
2653 2735
2654 set_bit(CS_ONLINE, &cs->flags); 2736 set_bit(CS_ONLINE, &cs->flags);
2655 if (is_spread_page(parent)) 2737 if (is_spread_page(parent))
@@ -2700,7 +2782,8 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
2700 cpumask_copy(cs->effective_cpus, parent->cpus_allowed); 2782 cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
2701 spin_unlock_irq(&callback_lock); 2783 spin_unlock_irq(&callback_lock);
2702out_unlock: 2784out_unlock:
2703 mutex_unlock(&cpuset_mutex); 2785 percpu_up_write(&cpuset_rwsem);
2786 put_online_cpus();
2704 return 0; 2787 return 0;
2705} 2788}
2706 2789
@@ -2719,7 +2802,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
2719{ 2802{
2720 struct cpuset *cs = css_cs(css); 2803 struct cpuset *cs = css_cs(css);
2721 2804
2722 mutex_lock(&cpuset_mutex); 2805 get_online_cpus();
2806 percpu_down_write(&cpuset_rwsem);
2723 2807
2724 if (is_partition_root(cs)) 2808 if (is_partition_root(cs))
2725 update_prstate(cs, 0); 2809 update_prstate(cs, 0);
@@ -2738,7 +2822,8 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
2738 cpuset_dec(); 2822 cpuset_dec();
2739 clear_bit(CS_ONLINE, &cs->flags); 2823 clear_bit(CS_ONLINE, &cs->flags);
2740 2824
2741 mutex_unlock(&cpuset_mutex); 2825 percpu_up_write(&cpuset_rwsem);
2826 put_online_cpus();
2742} 2827}
2743 2828
2744static void cpuset_css_free(struct cgroup_subsys_state *css) 2829static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2750,7 +2835,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
2750 2835
2751static void cpuset_bind(struct cgroup_subsys_state *root_css) 2836static void cpuset_bind(struct cgroup_subsys_state *root_css)
2752{ 2837{
2753 mutex_lock(&cpuset_mutex); 2838 percpu_down_write(&cpuset_rwsem);
2754 spin_lock_irq(&callback_lock); 2839 spin_lock_irq(&callback_lock);
2755 2840
2756 if (is_in_v2_mode()) { 2841 if (is_in_v2_mode()) {
@@ -2763,7 +2848,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
2763 } 2848 }
2764 2849
2765 spin_unlock_irq(&callback_lock); 2850 spin_unlock_irq(&callback_lock);
2766 mutex_unlock(&cpuset_mutex); 2851 percpu_up_write(&cpuset_rwsem);
2767} 2852}
2768 2853
2769/* 2854/*
@@ -2805,6 +2890,8 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
2805 2890
2806int __init cpuset_init(void) 2891int __init cpuset_init(void)
2807{ 2892{
2893 BUG_ON(percpu_init_rwsem(&cpuset_rwsem));
2894
2808 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); 2895 BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
2809 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); 2896 BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
2810 BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL)); 2897 BUG_ON(!zalloc_cpumask_var(&top_cpuset.subparts_cpus, GFP_KERNEL));
@@ -2876,7 +2963,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
2876 is_empty = cpumask_empty(cs->cpus_allowed) || 2963 is_empty = cpumask_empty(cs->cpus_allowed) ||
2877 nodes_empty(cs->mems_allowed); 2964 nodes_empty(cs->mems_allowed);
2878 2965
2879 mutex_unlock(&cpuset_mutex); 2966 percpu_up_write(&cpuset_rwsem);
2880 2967
2881 /* 2968 /*
2882 * Move tasks to the nearest ancestor with execution resources, 2969 * Move tasks to the nearest ancestor with execution resources,
@@ -2886,7 +2973,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
2886 if (is_empty) 2973 if (is_empty)
2887 remove_tasks_in_empty_cpuset(cs); 2974 remove_tasks_in_empty_cpuset(cs);
2888 2975
2889 mutex_lock(&cpuset_mutex); 2976 percpu_down_write(&cpuset_rwsem);
2890} 2977}
2891 2978
2892static void 2979static void
@@ -2936,14 +3023,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
2936retry: 3023retry:
2937 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0); 3024 wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
2938 3025
2939 mutex_lock(&cpuset_mutex); 3026 percpu_down_write(&cpuset_rwsem);
2940 3027
2941 /* 3028 /*
2942 * We have raced with task attaching. We wait until attaching 3029 * We have raced with task attaching. We wait until attaching
2943 * is finished, so we won't attach a task to an empty cpuset. 3030 * is finished, so we won't attach a task to an empty cpuset.
2944 */ 3031 */
2945 if (cs->attach_in_progress) { 3032 if (cs->attach_in_progress) {
2946 mutex_unlock(&cpuset_mutex); 3033 percpu_up_write(&cpuset_rwsem);
2947 goto retry; 3034 goto retry;
2948 } 3035 }
2949 3036
@@ -3011,7 +3098,7 @@ update_tasks:
3011 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems, 3098 hotplug_update_tasks_legacy(cs, &new_cpus, &new_mems,
3012 cpus_updated, mems_updated); 3099 cpus_updated, mems_updated);
3013 3100
3014 mutex_unlock(&cpuset_mutex); 3101 percpu_up_write(&cpuset_rwsem);
3015} 3102}
3016 3103
3017/** 3104/**
@@ -3041,7 +3128,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
3041 if (on_dfl && !alloc_cpumasks(NULL, &tmp)) 3128 if (on_dfl && !alloc_cpumasks(NULL, &tmp))
3042 ptmp = &tmp; 3129 ptmp = &tmp;
3043 3130
3044 mutex_lock(&cpuset_mutex); 3131 percpu_down_write(&cpuset_rwsem);
3045 3132
3046 /* fetch the available cpus/mems and find out which changed how */ 3133 /* fetch the available cpus/mems and find out which changed how */
3047 cpumask_copy(&new_cpus, cpu_active_mask); 3134 cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3091,7 +3178,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
3091 update_tasks_nodemask(&top_cpuset); 3178 update_tasks_nodemask(&top_cpuset);
3092 } 3179 }
3093 3180
3094 mutex_unlock(&cpuset_mutex); 3181 percpu_up_write(&cpuset_rwsem);
3095 3182
3096 /* if cpus or mems changed, we need to propagate to descendants */ 3183 /* if cpus or mems changed, we need to propagate to descendants */
3097 if (cpus_updated || mems_updated) { 3184 if (cpus_updated || mems_updated) {
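Every cpuset call site converted above follows the same four-step shape. A minimal sketch of that pattern, kept deliberately generic: cpuset_modify() and its update callback are hypothetical names for illustration, while get_online_cpus()/put_online_cpus(), percpu_down_write()/percpu_up_write() and is_cpuset_online() are the primitives the patch itself uses.

/*
 * Hypothetical helper showing the locking sequence the hunks above
 * standardize on: pin CPU hotplug first, then take the new cpuset_rwsem
 * for writing in place of the old cpuset_mutex, and release in reverse.
 */
static int cpuset_modify(struct cpuset *cs, int (*update)(struct cpuset *cs))
{
	int retval = -ENODEV;

	get_online_cpus();			/* block CPU hotplug */
	percpu_down_write(&cpuset_rwsem);	/* was: mutex_lock(&cpuset_mutex) */
	if (!is_cpuset_online(cs))
		goto out_unlock;

	retval = update(cs);
out_unlock:
	percpu_up_write(&cpuset_rwsem);		/* was: mutex_unlock(&cpuset_mutex) */
	put_online_cpus();
	return retval;
}

The heavyweight write side stays here in cpuset code; cpuset_read_lock()/cpuset_read_unlock(), seen later in the kernel/sched/core.c hunks, presumably wrap the read side of the same rwsem so setscheduler() can synchronize against cpuset topology changes without a global mutex.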
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 2aad959e6def..1c414b8866b4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4174,10 +4174,8 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task)
4174 return NULL; 4174 return NULL;
4175 4175
4176 __perf_event_init_context(ctx); 4176 __perf_event_init_context(ctx);
4177 if (task) { 4177 if (task)
4178 ctx->task = task; 4178 ctx->task = get_task_struct(task);
4179 get_task_struct(task);
4180 }
4181 ctx->pmu = pmu; 4179 ctx->pmu = pmu;
4182 4180
4183 return ctx; 4181 return ctx;
@@ -10440,8 +10438,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
10440 * and we cannot use the ctx information because we need the 10438 * and we cannot use the ctx information because we need the
10441 * pmu before we get a ctx. 10439 * pmu before we get a ctx.
10442 */ 10440 */
10443 get_task_struct(task); 10441 event->hw.target = get_task_struct(task);
10444 event->hw.target = task;
10445 } 10442 }
10446 10443
10447 event->clock = &local_clock; 10444 event->clock = &local_clock;
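Both conversions above, and the identical ones in kernel/irq/manage.c and kernel/locking/rtmutex.c below, rely on get_task_struct() handing its argument back. A sketch of the assumed shape of that helper (paraphrased from include/linux/sched/task.h of this era; the exact definition may differ):

/*
 * Taking the reference also returns the pointer, so the old two-step
 * "grab reference, then assign" collapses into a single expression.
 */
static inline struct task_struct *get_task_struct(struct task_struct *t)
{
	refcount_inc(&t->usage);
	return t;
}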
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e8f7f179bf77..9d50fbe5531a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1255,8 +1255,7 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
1255 * the thread dies to avoid that the interrupt code 1255 * the thread dies to avoid that the interrupt code
1256 * references an already freed task_struct. 1256 * references an already freed task_struct.
1257 */ 1257 */
1258 get_task_struct(t); 1258 new->thread = get_task_struct(t);
1259 new->thread = t;
1260 /* 1259 /*
1261 * Tell the thread to set its affinity. This is 1260 * Tell the thread to set its affinity. This is
1262 * important for shared interrupt handlers as we do 1261 * important for shared interrupt handlers as we do
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ebe8315a756a..1b66ccbb744a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1907,7 +1907,7 @@ int register_kretprobe(struct kretprobe *rp)
1907 1907
1908 /* Pre-allocate memory for max kretprobe instances */ 1908 /* Pre-allocate memory for max kretprobe instances */
1909 if (rp->maxactive <= 0) { 1909 if (rp->maxactive <= 0) {
1910#ifdef CONFIG_PREEMPT 1910#ifdef CONFIG_PREEMPTION
1911 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus()); 1911 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
1912#else 1912#else
1913 rp->maxactive = num_possible_cpus(); 1913 rp->maxactive = num_possible_cpus();
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index fa83d36e30c6..2874bf556162 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -628,8 +628,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
628 } 628 }
629 629
630 /* [10] Grab the next task, i.e. owner of @lock */ 630 /* [10] Grab the next task, i.e. owner of @lock */
631 task = rt_mutex_owner(lock); 631 task = get_task_struct(rt_mutex_owner(lock));
632 get_task_struct(task);
633 raw_spin_lock(&task->pi_lock); 632 raw_spin_lock(&task->pi_lock);
634 633
635 /* 634 /*
@@ -709,8 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
709 } 708 }
710 709
711 /* [10] Grab the next task, i.e. the owner of @lock */ 710 /* [10] Grab the next task, i.e. the owner of @lock */
712 task = rt_mutex_owner(lock); 711 task = get_task_struct(rt_mutex_owner(lock));
713 get_task_struct(task);
714 raw_spin_lock(&task->pi_lock); 712 raw_spin_lock(&task->pi_lock);
715 713
716 /* [11] requeue the pi waiters if necessary */ 714 /* [11] requeue the pi waiters if necessary */
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index 480edf328b51..7644eda17d62 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -7,7 +7,7 @@ menu "RCU Subsystem"
7 7
8config TREE_RCU 8config TREE_RCU
9 bool 9 bool
10 default y if !PREEMPT && SMP 10 default y if !PREEMPTION && SMP
11 help 11 help
12 This option selects the RCU implementation that is 12 This option selects the RCU implementation that is
13 designed for very large SMP system with hundreds or 13 designed for very large SMP system with hundreds or
@@ -16,7 +16,7 @@ config TREE_RCU
16 16
17config PREEMPT_RCU 17config PREEMPT_RCU
18 bool 18 bool
19 default y if PREEMPT 19 default y if PREEMPTION
20 help 20 help
21 This option selects the RCU implementation that is 21 This option selects the RCU implementation that is
22 designed for very large SMP systems with hundreds or 22 designed for very large SMP systems with hundreds or
@@ -28,7 +28,7 @@ config PREEMPT_RCU
28 28
29config TINY_RCU 29config TINY_RCU
30 bool 30 bool
31 default y if !PREEMPT && !SMP 31 default y if !PREEMPTION && !SMP
32 help 32 help
33 This option selects the RCU implementation that is 33 This option selects the RCU implementation that is
34 designed for UP systems from which real-time response 34 designed for UP systems from which real-time response
@@ -70,7 +70,7 @@ config TREE_SRCU
70 This option selects the full-fledged version of SRCU. 70 This option selects the full-fledged version of SRCU.
71 71
72config TASKS_RCU 72config TASKS_RCU
73 def_bool PREEMPT 73 def_bool PREEMPTION
74 select SRCU 74 select SRCU
75 help 75 help
76 This option enables a task-based RCU implementation that uses 76 This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 71395e91b876..81105141b6a8 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1912,7 +1912,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
1912 struct rcu_node *rnp_p; 1912 struct rcu_node *rnp_p;
1913 1913
1914 raw_lockdep_assert_held_rcu_node(rnp); 1914 raw_lockdep_assert_held_rcu_node(rnp);
1915 if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)) || 1915 if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPTION)) ||
1916 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) || 1916 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)) ||
1917 rnp->qsmask != 0) { 1917 rnp->qsmask != 0) {
1918 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1918 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
@@ -2266,7 +2266,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp))
2266 mask = 0; 2266 mask = 0;
2267 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2267 raw_spin_lock_irqsave_rcu_node(rnp, flags);
2268 if (rnp->qsmask == 0) { 2268 if (rnp->qsmask == 0) {
2269 if (!IS_ENABLED(CONFIG_PREEMPT) || 2269 if (!IS_ENABLED(CONFIG_PREEMPTION) ||
2270 rcu_preempt_blocked_readers_cgp(rnp)) { 2270 rcu_preempt_blocked_readers_cgp(rnp)) {
2271 /* 2271 /*
2272 * No point in scanning bits because they 2272 * No point in scanning bits because they
@@ -2681,7 +2681,7 @@ static int rcu_blocking_is_gp(void)
2681{ 2681{
2682 int ret; 2682 int ret;
2683 2683
2684 if (IS_ENABLED(CONFIG_PREEMPT)) 2684 if (IS_ENABLED(CONFIG_PREEMPTION))
2685 return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; 2685 return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE;
2686 might_sleep(); /* Check for RCU read-side critical section. */ 2686 might_sleep(); /* Check for RCU read-side critical section. */
2687 preempt_disable(); 2687 preempt_disable();
@@ -3297,13 +3297,13 @@ static int __init rcu_spawn_gp_kthread(void)
3297 t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name); 3297 t = kthread_create(rcu_gp_kthread, NULL, "%s", rcu_state.name);
3298 if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__)) 3298 if (WARN_ONCE(IS_ERR(t), "%s: Could not start grace-period kthread, OOM is now expected behavior\n", __func__))
3299 return 0; 3299 return 0;
3300 rnp = rcu_get_root();
3301 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3302 rcu_state.gp_kthread = t;
3303 if (kthread_prio) { 3300 if (kthread_prio) {
3304 sp.sched_priority = kthread_prio; 3301 sp.sched_priority = kthread_prio;
3305 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 3302 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
3306 } 3303 }
3304 rnp = rcu_get_root();
3305 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3306 rcu_state.gp_kthread = t;
3307 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3307 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3308 wake_up_process(t); 3308 wake_up_process(t);
3309 rcu_spawn_nocb_kthreads(); 3309 rcu_spawn_nocb_kthreads();
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 841ab43f3e60..c0b8c458d8a6 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -163,7 +163,7 @@ static void rcu_iw_handler(struct irq_work *iwp)
163// 163//
164// Printing RCU CPU stall warnings 164// Printing RCU CPU stall warnings
165 165
166#ifdef CONFIG_PREEMPT 166#ifdef CONFIG_PREEMPTION
167 167
168/* 168/*
169 * Dump detailed information for all tasks blocking the current RCU 169 * Dump detailed information for all tasks blocking the current RCU
@@ -215,7 +215,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
215 return ndetected; 215 return ndetected;
216} 216}
217 217
218#else /* #ifdef CONFIG_PREEMPT */ 218#else /* #ifdef CONFIG_PREEMPTION */
219 219
220/* 220/*
221 * Because preemptible RCU does not exist, we never have to check for 221 * Because preemptible RCU does not exist, we never have to check for
@@ -233,7 +233,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp)
233{ 233{
234 return 0; 234 return 0;
235} 235}
236#endif /* #else #ifdef CONFIG_PREEMPT */ 236#endif /* #else #ifdef CONFIG_PREEMPTION */
237 237
238/* 238/*
239 * Dump stacks of all tasks running on stalled CPUs. First try using 239 * Dump stacks of all tasks running on stalled CPUs. First try using
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7fa8e74ad2ab..06961b997ed6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -773,6 +773,18 @@ static void set_load_weight(struct task_struct *p, bool update_load)
773} 773}
774 774
775#ifdef CONFIG_UCLAMP_TASK 775#ifdef CONFIG_UCLAMP_TASK
776/*
777 * Serializes updates of utilization clamp values
778 *
779 * The (slow-path) user-space triggers utilization clamp value updates which
780 * can require updates on (fast-path) scheduler's data structures used to
781 * support enqueue/dequeue operations.
782 * While the per-CPU rq lock protects fast-path update operations, user-space
783 * requests are serialized using a mutex to reduce the risk of conflicting
784 * updates or API abuses.
785 */
786static DEFINE_MUTEX(uclamp_mutex);
787
776/* Max allowed minimum utilization */ 788/* Max allowed minimum utilization */
777unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE; 789unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
778 790
@@ -798,7 +810,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
798 return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); 810 return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
799} 811}
800 812
801static inline unsigned int uclamp_none(int clamp_id) 813static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
802{ 814{
803 if (clamp_id == UCLAMP_MIN) 815 if (clamp_id == UCLAMP_MIN)
804 return 0; 816 return 0;
@@ -814,7 +826,7 @@ static inline void uclamp_se_set(struct uclamp_se *uc_se,
814} 826}
815 827
816static inline unsigned int 828static inline unsigned int
817uclamp_idle_value(struct rq *rq, unsigned int clamp_id, 829uclamp_idle_value(struct rq *rq, enum uclamp_id clamp_id,
818 unsigned int clamp_value) 830 unsigned int clamp_value)
819{ 831{
820 /* 832 /*
@@ -830,7 +842,7 @@ uclamp_idle_value(struct rq *rq, unsigned int clamp_id,
830 return uclamp_none(UCLAMP_MIN); 842 return uclamp_none(UCLAMP_MIN);
831} 843}
832 844
833static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id, 845static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
834 unsigned int clamp_value) 846 unsigned int clamp_value)
835{ 847{
836 /* Reset max-clamp retention only on idle exit */ 848 /* Reset max-clamp retention only on idle exit */
@@ -841,8 +853,8 @@ static inline void uclamp_idle_reset(struct rq *rq, unsigned int clamp_id,
841} 853}
842 854
843static inline 855static inline
844unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id, 856enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
845 unsigned int clamp_value) 857 unsigned int clamp_value)
846{ 858{
847 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; 859 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
848 int bucket_id = UCLAMP_BUCKETS - 1; 860 int bucket_id = UCLAMP_BUCKETS - 1;
@@ -861,16 +873,42 @@ unsigned int uclamp_rq_max_value(struct rq *rq, unsigned int clamp_id,
861 return uclamp_idle_value(rq, clamp_id, clamp_value); 873 return uclamp_idle_value(rq, clamp_id, clamp_value);
862} 874}
863 875
876static inline struct uclamp_se
877uclamp_tg_restrict(struct task_struct *p, enum uclamp_id clamp_id)
878{
879 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
880#ifdef CONFIG_UCLAMP_TASK_GROUP
881 struct uclamp_se uc_max;
882
883 /*
884 * Tasks in autogroups or root task group will be
885 * restricted by system defaults.
886 */
887 if (task_group_is_autogroup(task_group(p)))
888 return uc_req;
889 if (task_group(p) == &root_task_group)
890 return uc_req;
891
892 uc_max = task_group(p)->uclamp[clamp_id];
893 if (uc_req.value > uc_max.value || !uc_req.user_defined)
894 return uc_max;
895#endif
896
897 return uc_req;
898}
899
864/* 900/*
865 * The effective clamp bucket index of a task depends on, by increasing 901 * The effective clamp bucket index of a task depends on, by increasing
866 * priority: 902 * priority:
867 * - the task specific clamp value, when explicitly requested from userspace 903 * - the task specific clamp value, when explicitly requested from userspace
904 * - the task group effective clamp value, for tasks not either in the root
905 * group or in an autogroup
868 * - the system default clamp value, defined by the sysadmin 906 * - the system default clamp value, defined by the sysadmin
869 */ 907 */
870static inline struct uclamp_se 908static inline struct uclamp_se
871uclamp_eff_get(struct task_struct *p, unsigned int clamp_id) 909uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
872{ 910{
873 struct uclamp_se uc_req = p->uclamp_req[clamp_id]; 911 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
874 struct uclamp_se uc_max = uclamp_default[clamp_id]; 912 struct uclamp_se uc_max = uclamp_default[clamp_id];
875 913
876 /* System default restrictions always apply */ 914 /* System default restrictions always apply */
@@ -880,7 +918,7 @@ uclamp_eff_get(struct task_struct *p, unsigned int clamp_id)
880 return uc_req; 918 return uc_req;
881} 919}
882 920
883unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id) 921enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
884{ 922{
885 struct uclamp_se uc_eff; 923 struct uclamp_se uc_eff;
886 924
@@ -904,7 +942,7 @@ unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id)
904 * for each bucket when all its RUNNABLE tasks require the same clamp. 942 * for each bucket when all its RUNNABLE tasks require the same clamp.
905 */ 943 */
906static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, 944static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
907 unsigned int clamp_id) 945 enum uclamp_id clamp_id)
908{ 946{
909 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; 947 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
910 struct uclamp_se *uc_se = &p->uclamp[clamp_id]; 948 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -942,7 +980,7 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
942 * enforce the expected state and warn. 980 * enforce the expected state and warn.
943 */ 981 */
944static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, 982static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
945 unsigned int clamp_id) 983 enum uclamp_id clamp_id)
946{ 984{
947 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id]; 985 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
948 struct uclamp_se *uc_se = &p->uclamp[clamp_id]; 986 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
@@ -981,7 +1019,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
981 1019
982static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) 1020static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
983{ 1021{
984 unsigned int clamp_id; 1022 enum uclamp_id clamp_id;
985 1023
986 if (unlikely(!p->sched_class->uclamp_enabled)) 1024 if (unlikely(!p->sched_class->uclamp_enabled))
987 return; 1025 return;
@@ -996,7 +1034,7 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
996 1034
997static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) 1035static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
998{ 1036{
999 unsigned int clamp_id; 1037 enum uclamp_id clamp_id;
1000 1038
1001 if (unlikely(!p->sched_class->uclamp_enabled)) 1039 if (unlikely(!p->sched_class->uclamp_enabled))
1002 return; 1040 return;
@@ -1005,15 +1043,82 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1005 uclamp_rq_dec_id(rq, p, clamp_id); 1043 uclamp_rq_dec_id(rq, p, clamp_id);
1006} 1044}
1007 1045
1046static inline void
1047uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
1048{
1049 struct rq_flags rf;
1050 struct rq *rq;
1051
1052 /*
1053 * Lock the task and the rq where the task is (or was) queued.
1054 *
1055 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
1056 * price to pay to safely serialize util_{min,max} updates with
1057 * enqueues, dequeues and migration operations.
1058 * This is the same locking schema used by __set_cpus_allowed_ptr().
1059 */
1060 rq = task_rq_lock(p, &rf);
1061
1062 /*
1063 * Setting the clamp bucket is serialized by task_rq_lock().
1064 * If the task is not yet RUNNABLE and its task_struct is not
1065 * affecting a valid clamp bucket, the next time it's enqueued,
1066 * it will already see the updated clamp bucket value.
1067 */
1068 if (!p->uclamp[clamp_id].active) {
1069 uclamp_rq_dec_id(rq, p, clamp_id);
1070 uclamp_rq_inc_id(rq, p, clamp_id);
1071 }
1072
1073 task_rq_unlock(rq, p, &rf);
1074}
1075
1076static inline void
1077uclamp_update_active_tasks(struct cgroup_subsys_state *css,
1078 unsigned int clamps)
1079{
1080 enum uclamp_id clamp_id;
1081 struct css_task_iter it;
1082 struct task_struct *p;
1083
1084 css_task_iter_start(css, 0, &it);
1085 while ((p = css_task_iter_next(&it))) {
1086 for_each_clamp_id(clamp_id) {
1087 if ((0x1 << clamp_id) & clamps)
1088 uclamp_update_active(p, clamp_id);
1089 }
1090 }
1091 css_task_iter_end(&it);
1092}
1093
1094#ifdef CONFIG_UCLAMP_TASK_GROUP
1095static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1096static void uclamp_update_root_tg(void)
1097{
1098 struct task_group *tg = &root_task_group;
1099
1100 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN],
1101 sysctl_sched_uclamp_util_min, false);
1102 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX],
1103 sysctl_sched_uclamp_util_max, false);
1104
1105 rcu_read_lock();
1106 cpu_util_update_eff(&root_task_group.css);
1107 rcu_read_unlock();
1108}
1109#else
1110static void uclamp_update_root_tg(void) { }
1111#endif
1112
1008int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, 1113int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1009 void __user *buffer, size_t *lenp, 1114 void __user *buffer, size_t *lenp,
1010 loff_t *ppos) 1115 loff_t *ppos)
1011{ 1116{
1117 bool update_root_tg = false;
1012 int old_min, old_max; 1118 int old_min, old_max;
1013 static DEFINE_MUTEX(mutex);
1014 int result; 1119 int result;
1015 1120
1016 mutex_lock(&mutex); 1121 mutex_lock(&uclamp_mutex);
1017 old_min = sysctl_sched_uclamp_util_min; 1122 old_min = sysctl_sched_uclamp_util_min;
1018 old_max = sysctl_sched_uclamp_util_max; 1123 old_max = sysctl_sched_uclamp_util_max;
1019 1124
@@ -1032,23 +1137,30 @@ int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1032 if (old_min != sysctl_sched_uclamp_util_min) { 1137 if (old_min != sysctl_sched_uclamp_util_min) {
1033 uclamp_se_set(&uclamp_default[UCLAMP_MIN], 1138 uclamp_se_set(&uclamp_default[UCLAMP_MIN],
1034 sysctl_sched_uclamp_util_min, false); 1139 sysctl_sched_uclamp_util_min, false);
1140 update_root_tg = true;
1035 } 1141 }
1036 if (old_max != sysctl_sched_uclamp_util_max) { 1142 if (old_max != sysctl_sched_uclamp_util_max) {
1037 uclamp_se_set(&uclamp_default[UCLAMP_MAX], 1143 uclamp_se_set(&uclamp_default[UCLAMP_MAX],
1038 sysctl_sched_uclamp_util_max, false); 1144 sysctl_sched_uclamp_util_max, false);
1145 update_root_tg = true;
1039 } 1146 }
1040 1147
1148 if (update_root_tg)
1149 uclamp_update_root_tg();
1150
1041 /* 1151 /*
1042 * Updating all the RUNNABLE task is expensive, keep it simple and do 1152 * We update all RUNNABLE tasks only when task groups are in use.
1043 * just a lazy update at each next enqueue time. 1153 * Otherwise, keep it simple and do just a lazy update at each next
1154 * task enqueue time.
1044 */ 1155 */
1156
1045 goto done; 1157 goto done;
1046 1158
1047undo: 1159undo:
1048 sysctl_sched_uclamp_util_min = old_min; 1160 sysctl_sched_uclamp_util_min = old_min;
1049 sysctl_sched_uclamp_util_max = old_max; 1161 sysctl_sched_uclamp_util_max = old_max;
1050done: 1162done:
1051 mutex_unlock(&mutex); 1163 mutex_unlock(&uclamp_mutex);
1052 1164
1053 return result; 1165 return result;
1054} 1166}
@@ -1075,7 +1187,7 @@ static int uclamp_validate(struct task_struct *p,
1075static void __setscheduler_uclamp(struct task_struct *p, 1187static void __setscheduler_uclamp(struct task_struct *p,
1076 const struct sched_attr *attr) 1188 const struct sched_attr *attr)
1077{ 1189{
1078 unsigned int clamp_id; 1190 enum uclamp_id clamp_id;
1079 1191
1080 /* 1192 /*
1081 * On scheduling class change, reset to default clamps for tasks 1193 * On scheduling class change, reset to default clamps for tasks
@@ -1112,7 +1224,7 @@ static void __setscheduler_uclamp(struct task_struct *p,
1112 1224
1113static void uclamp_fork(struct task_struct *p) 1225static void uclamp_fork(struct task_struct *p)
1114{ 1226{
1115 unsigned int clamp_id; 1227 enum uclamp_id clamp_id;
1116 1228
1117 for_each_clamp_id(clamp_id) 1229 for_each_clamp_id(clamp_id)
1118 p->uclamp[clamp_id].active = false; 1230 p->uclamp[clamp_id].active = false;
@@ -1134,9 +1246,11 @@ static void uclamp_fork(struct task_struct *p)
1134static void __init init_uclamp(void) 1246static void __init init_uclamp(void)
1135{ 1247{
1136 struct uclamp_se uc_max = {}; 1248 struct uclamp_se uc_max = {};
1137 unsigned int clamp_id; 1249 enum uclamp_id clamp_id;
1138 int cpu; 1250 int cpu;
1139 1251
1252 mutex_init(&uclamp_mutex);
1253
1140 for_each_possible_cpu(cpu) { 1254 for_each_possible_cpu(cpu) {
1141 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq)); 1255 memset(&cpu_rq(cpu)->uclamp, 0, sizeof(struct uclamp_rq));
1142 cpu_rq(cpu)->uclamp_flags = 0; 1256 cpu_rq(cpu)->uclamp_flags = 0;
@@ -1149,8 +1263,13 @@ static void __init init_uclamp(void)
1149 1263
1150 /* System defaults allow max clamp values for both indexes */ 1264 /* System defaults allow max clamp values for both indexes */
1151 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false); 1265 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1152 for_each_clamp_id(clamp_id) 1266 for_each_clamp_id(clamp_id) {
1153 uclamp_default[clamp_id] = uc_max; 1267 uclamp_default[clamp_id] = uc_max;
1268#ifdef CONFIG_UCLAMP_TASK_GROUP
1269 root_task_group.uclamp_req[clamp_id] = uc_max;
1270 root_task_group.uclamp[clamp_id] = uc_max;
1271#endif
1272 }
1154} 1273}
1155 1274
1156#else /* CONFIG_UCLAMP_TASK */ 1275#else /* CONFIG_UCLAMP_TASK */
@@ -1494,7 +1613,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1494 if (queued) 1613 if (queued)
1495 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 1614 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
1496 if (running) 1615 if (running)
1497 set_curr_task(rq, p); 1616 set_next_task(rq, p);
1498} 1617}
1499 1618
1500/* 1619/*
@@ -3214,12 +3333,8 @@ static __always_inline struct rq *
3214context_switch(struct rq *rq, struct task_struct *prev, 3333context_switch(struct rq *rq, struct task_struct *prev,
3215 struct task_struct *next, struct rq_flags *rf) 3334 struct task_struct *next, struct rq_flags *rf)
3216{ 3335{
3217 struct mm_struct *mm, *oldmm;
3218
3219 prepare_task_switch(rq, prev, next); 3336 prepare_task_switch(rq, prev, next);
3220 3337
3221 mm = next->mm;
3222 oldmm = prev->active_mm;
3223 /* 3338 /*
3224 * For paravirt, this is coupled with an exit in switch_to to 3339 * For paravirt, this is coupled with an exit in switch_to to
3225 * combine the page table reload and the switch backend into 3340 * combine the page table reload and the switch backend into
@@ -3228,22 +3343,37 @@ context_switch(struct rq *rq, struct task_struct *prev,
3228 arch_start_context_switch(prev); 3343 arch_start_context_switch(prev);
3229 3344
3230 /* 3345 /*
3231 * If mm is non-NULL, we pass through switch_mm(). If mm is 3346 * kernel -> kernel lazy + transfer active
3232 * NULL, we will pass through mmdrop() in finish_task_switch(). 3347 * user -> kernel lazy + mmgrab() active
3233 * Both of these contain the full memory barrier required by 3348 *
3234 * membarrier after storing to rq->curr, before returning to 3349 * kernel -> user switch + mmdrop() active
3235 * user-space. 3350 * user -> user switch
3236 */ 3351 */
3237 if (!mm) { 3352 if (!next->mm) { // to kernel
3238 next->active_mm = oldmm; 3353 enter_lazy_tlb(prev->active_mm, next);
3239 mmgrab(oldmm); 3354
3240 enter_lazy_tlb(oldmm, next); 3355 next->active_mm = prev->active_mm;
3241 } else 3356 if (prev->mm) // from user
3242 switch_mm_irqs_off(oldmm, mm, next); 3357 mmgrab(prev->active_mm);
3358 else
3359 prev->active_mm = NULL;
3360 } else { // to user
3361 /*
3362 * sys_membarrier() requires an smp_mb() between setting
3363 * rq->curr and returning to userspace.
3364 *
3365 * The below provides this either through switch_mm(), or in
3366 * case 'prev->active_mm == next->mm' through
3367 * finish_task_switch()'s mmdrop().
3368 */
3369
3370 switch_mm_irqs_off(prev->active_mm, next->mm, next);
3243 3371
3244 if (!prev->mm) { 3372 if (!prev->mm) { // from kernel
3245 prev->active_mm = NULL; 3373 /* will mmdrop() in finish_task_switch(). */
3246 rq->prev_mm = oldmm; 3374 rq->prev_mm = prev->active_mm;
3375 prev->active_mm = NULL;
3376 }
3247 } 3377 }
3248 3378
3249 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); 3379 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
@@ -3622,7 +3752,7 @@ static inline void sched_tick_start(int cpu) { }
3622static inline void sched_tick_stop(int cpu) { } 3752static inline void sched_tick_stop(int cpu) { }
3623#endif 3753#endif
3624 3754
3625#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3755#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \
3626 defined(CONFIG_TRACE_PREEMPT_TOGGLE)) 3756 defined(CONFIG_TRACE_PREEMPT_TOGGLE))
3627/* 3757/*
3628 * If the value passed in is equal to the current preempt count 3758 * If the value passed in is equal to the current preempt count
@@ -3780,7 +3910,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
3780 3910
3781 p = fair_sched_class.pick_next_task(rq, prev, rf); 3911 p = fair_sched_class.pick_next_task(rq, prev, rf);
3782 if (unlikely(p == RETRY_TASK)) 3912 if (unlikely(p == RETRY_TASK))
3783 goto again; 3913 goto restart;
3784 3914
3785 /* Assumes fair_sched_class->next == idle_sched_class */ 3915 /* Assumes fair_sched_class->next == idle_sched_class */
3786 if (unlikely(!p)) 3916 if (unlikely(!p))
@@ -3789,14 +3919,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
3789 return p; 3919 return p;
3790 } 3920 }
3791 3921
3792again: 3922restart:
3923 /*
3924 * Ensure that we put DL/RT tasks before the pick loop, such that they
3925 * can PULL higher prio tasks when we lower the RQ 'priority'.
3926 */
3927 prev->sched_class->put_prev_task(rq, prev, rf);
3928 if (!rq->nr_running)
3929 newidle_balance(rq, rf);
3930
3793 for_each_class(class) { 3931 for_each_class(class) {
3794 p = class->pick_next_task(rq, prev, rf); 3932 p = class->pick_next_task(rq, NULL, NULL);
3795 if (p) { 3933 if (p)
3796 if (unlikely(p == RETRY_TASK))
3797 goto again;
3798 return p; 3934 return p;
3799 }
3800 } 3935 }
3801 3936
3802 /* The idle class should always have a runnable task: */ 3937 /* The idle class should always have a runnable task: */
@@ -3823,7 +3958,7 @@ again:
3823 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets 3958 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
3824 * called on the nearest possible occasion: 3959 * called on the nearest possible occasion:
3825 * 3960 *
3826 * - If the kernel is preemptible (CONFIG_PREEMPT=y): 3961 * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
3827 * 3962 *
3828 * - in syscall or exception context, at the next outmost 3963 * - in syscall or exception context, at the next outmost
3829 * preempt_enable(). (this might be as soon as the wake_up()'s 3964 * preempt_enable(). (this might be as soon as the wake_up()'s
@@ -3832,7 +3967,7 @@ again:
3832 * - in IRQ context, return from interrupt-handler to 3967 * - in IRQ context, return from interrupt-handler to
3833 * preemptible context 3968 * preemptible context
3834 * 3969 *
3835 * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) 3970 * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
3836 * then at the next: 3971 * then at the next:
3837 * 3972 *
3838 * - cond_resched() call 3973 * - cond_resched() call
@@ -4077,7 +4212,7 @@ static void __sched notrace preempt_schedule_common(void)
4077 } while (need_resched()); 4212 } while (need_resched());
4078} 4213}
4079 4214
4080#ifdef CONFIG_PREEMPT 4215#ifdef CONFIG_PREEMPTION
4081/* 4216/*
4082 * this is the entry point to schedule() from in-kernel preemption 4217 * this is the entry point to schedule() from in-kernel preemption
4083 * off of preempt_enable. Kernel preemptions off return from interrupt 4218 * off of preempt_enable. Kernel preemptions off return from interrupt
@@ -4149,7 +4284,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
4149} 4284}
4150EXPORT_SYMBOL_GPL(preempt_schedule_notrace); 4285EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
4151 4286
4152#endif /* CONFIG_PREEMPT */ 4287#endif /* CONFIG_PREEMPTION */
4153 4288
4154/* 4289/*
4155 * this is the entry point to schedule() from kernel preemption 4290 * this is the entry point to schedule() from kernel preemption
@@ -4317,7 +4452,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
4317 if (queued) 4452 if (queued)
4318 enqueue_task(rq, p, queue_flag); 4453 enqueue_task(rq, p, queue_flag);
4319 if (running) 4454 if (running)
4320 set_curr_task(rq, p); 4455 set_next_task(rq, p);
4321 4456
4322 check_class_changed(rq, p, prev_class, oldprio); 4457 check_class_changed(rq, p, prev_class, oldprio);
4323out_unlock: 4458out_unlock:
@@ -4384,7 +4519,7 @@ void set_user_nice(struct task_struct *p, long nice)
4384 resched_curr(rq); 4519 resched_curr(rq);
4385 } 4520 }
4386 if (running) 4521 if (running)
4387 set_curr_task(rq, p); 4522 set_next_task(rq, p);
4388out_unlock: 4523out_unlock:
4389 task_rq_unlock(rq, p, &rf); 4524 task_rq_unlock(rq, p, &rf);
4390} 4525}
@@ -4701,6 +4836,9 @@ recheck:
4701 return retval; 4836 return retval;
4702 } 4837 }
4703 4838
4839 if (pi)
4840 cpuset_read_lock();
4841
4704 /* 4842 /*
4705 * Make sure no PI-waiters arrive (or leave) while we are 4843 * Make sure no PI-waiters arrive (or leave) while we are
4706 * changing the priority of the task: 4844 * changing the priority of the task:
@@ -4715,8 +4853,8 @@ recheck:
4715 * Changing the policy of the stop threads its a very bad idea: 4853 * Changing the policy of the stop threads its a very bad idea:
4716 */ 4854 */
4717 if (p == rq->stop) { 4855 if (p == rq->stop) {
4718 task_rq_unlock(rq, p, &rf); 4856 retval = -EINVAL;
4719 return -EINVAL; 4857 goto unlock;
4720 } 4858 }
4721 4859
4722 /* 4860 /*
@@ -4734,8 +4872,8 @@ recheck:
4734 goto change; 4872 goto change;
4735 4873
4736 p->sched_reset_on_fork = reset_on_fork; 4874 p->sched_reset_on_fork = reset_on_fork;
4737 task_rq_unlock(rq, p, &rf); 4875 retval = 0;
4738 return 0; 4876 goto unlock;
4739 } 4877 }
4740change: 4878change:
4741 4879
@@ -4748,8 +4886,8 @@ change:
4748 if (rt_bandwidth_enabled() && rt_policy(policy) && 4886 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4749 task_group(p)->rt_bandwidth.rt_runtime == 0 && 4887 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4750 !task_group_is_autogroup(task_group(p))) { 4888 !task_group_is_autogroup(task_group(p))) {
4751 task_rq_unlock(rq, p, &rf); 4889 retval = -EPERM;
4752 return -EPERM; 4890 goto unlock;
4753 } 4891 }
4754#endif 4892#endif
4755#ifdef CONFIG_SMP 4893#ifdef CONFIG_SMP
@@ -4764,8 +4902,8 @@ change:
4764 */ 4902 */
4765 if (!cpumask_subset(span, p->cpus_ptr) || 4903 if (!cpumask_subset(span, p->cpus_ptr) ||
4766 rq->rd->dl_bw.bw == 0) { 4904 rq->rd->dl_bw.bw == 0) {
4767 task_rq_unlock(rq, p, &rf); 4905 retval = -EPERM;
4768 return -EPERM; 4906 goto unlock;
4769 } 4907 }
4770 } 4908 }
4771#endif 4909#endif
@@ -4775,6 +4913,8 @@ change:
4775 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4913 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4776 policy = oldpolicy = -1; 4914 policy = oldpolicy = -1;
4777 task_rq_unlock(rq, p, &rf); 4915 task_rq_unlock(rq, p, &rf);
4916 if (pi)
4917 cpuset_read_unlock();
4778 goto recheck; 4918 goto recheck;
4779 } 4919 }
4780 4920
@@ -4784,8 +4924,8 @@ change:
4784 * is available. 4924 * is available.
4785 */ 4925 */
4786 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) { 4926 if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
4787 task_rq_unlock(rq, p, &rf); 4927 retval = -EBUSY;
4788 return -EBUSY; 4928 goto unlock;
4789 } 4929 }
4790 4930
4791 p->sched_reset_on_fork = reset_on_fork; 4931 p->sched_reset_on_fork = reset_on_fork;
@@ -4827,7 +4967,7 @@ change:
4827 enqueue_task(rq, p, queue_flags); 4967 enqueue_task(rq, p, queue_flags);
4828 } 4968 }
4829 if (running) 4969 if (running)
4830 set_curr_task(rq, p); 4970 set_next_task(rq, p);
4831 4971
4832 check_class_changed(rq, p, prev_class, oldprio); 4972 check_class_changed(rq, p, prev_class, oldprio);
4833 4973
@@ -4835,14 +4975,22 @@ change:
4835 preempt_disable(); 4975 preempt_disable();
4836 task_rq_unlock(rq, p, &rf); 4976 task_rq_unlock(rq, p, &rf);
4837 4977
4838 if (pi) 4978 if (pi) {
4979 cpuset_read_unlock();
4839 rt_mutex_adjust_pi(p); 4980 rt_mutex_adjust_pi(p);
4981 }
4840 4982
4841 /* Run balance callbacks after we've adjusted the PI chain: */ 4983 /* Run balance callbacks after we've adjusted the PI chain: */
4842 balance_callback(rq); 4984 balance_callback(rq);
4843 preempt_enable(); 4985 preempt_enable();
4844 4986
4845 return 0; 4987 return 0;
4988
4989unlock:
4990 task_rq_unlock(rq, p, &rf);
4991 if (pi)
4992 cpuset_read_unlock();
4993 return retval;
4846} 4994}
4847 4995
4848static int _sched_setscheduler(struct task_struct *p, int policy, 4996static int _sched_setscheduler(struct task_struct *p, int policy,
@@ -4926,10 +5074,15 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4926 rcu_read_lock(); 5074 rcu_read_lock();
4927 retval = -ESRCH; 5075 retval = -ESRCH;
4928 p = find_process_by_pid(pid); 5076 p = find_process_by_pid(pid);
4929 if (p != NULL) 5077 if (likely(p))
4930 retval = sched_setscheduler(p, policy, &lparam); 5078 get_task_struct(p);
4931 rcu_read_unlock(); 5079 rcu_read_unlock();
4932 5080
5081 if (likely(p)) {
5082 retval = sched_setscheduler(p, policy, &lparam);
5083 put_task_struct(p);
5084 }
5085
4933 return retval; 5086 return retval;
4934} 5087}
4935 5088
@@ -5460,7 +5613,7 @@ SYSCALL_DEFINE0(sched_yield)
5460 return 0; 5613 return 0;
5461} 5614}
5462 5615
5463#ifndef CONFIG_PREEMPT 5616#ifndef CONFIG_PREEMPTION
5464int __sched _cond_resched(void) 5617int __sched _cond_resched(void)
5465{ 5618{
5466 if (should_resched(0)) { 5619 if (should_resched(0)) {
@@ -5477,7 +5630,7 @@ EXPORT_SYMBOL(_cond_resched);
5477 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 5630 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
5478 * call schedule, and on return reacquire the lock. 5631 * call schedule, and on return reacquire the lock.
5479 * 5632 *
5480 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 5633 * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level
5481 * operations here to prevent schedule() from being called twice (once via 5634 * operations here to prevent schedule() from being called twice (once via
5482 * spin_unlock(), once by hand). 5635 * spin_unlock(), once by hand).
5483 */ 5636 */
@@ -6016,7 +6169,7 @@ void sched_setnuma(struct task_struct *p, int nid)
6016 if (queued) 6169 if (queued)
6017 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); 6170 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
6018 if (running) 6171 if (running)
6019 set_curr_task(rq, p); 6172 set_next_task(rq, p);
6020 task_rq_unlock(rq, p, &rf); 6173 task_rq_unlock(rq, p, &rf);
6021} 6174}
6022#endif /* CONFIG_NUMA_BALANCING */ 6175#endif /* CONFIG_NUMA_BALANCING */
@@ -6056,21 +6209,22 @@ static void calc_load_migrate(struct rq *rq)
6056 atomic_long_add(delta, &calc_load_tasks); 6209 atomic_long_add(delta, &calc_load_tasks);
6057} 6210}
6058 6211
6059static void put_prev_task_fake(struct rq *rq, struct task_struct *prev) 6212static struct task_struct *__pick_migrate_task(struct rq *rq)
6060{ 6213{
6061} 6214 const struct sched_class *class;
6215 struct task_struct *next;
6062 6216
6063static const struct sched_class fake_sched_class = { 6217 for_each_class(class) {
6064 .put_prev_task = put_prev_task_fake, 6218 next = class->pick_next_task(rq, NULL, NULL);
6065}; 6219 if (next) {
6220 next->sched_class->put_prev_task(rq, next, NULL);
6221 return next;
6222 }
6223 }
6066 6224
6067static struct task_struct fake_task = { 6225 /* The idle class should always have a runnable task */
6068 /* 6226 BUG();
6069 * Avoid pull_{rt,dl}_task() 6227}
6070 */
6071 .prio = MAX_PRIO + 1,
6072 .sched_class = &fake_sched_class,
6073};
6074 6228
6075/* 6229/*
6076 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6230 * Migrate all tasks from the rq, sleeping tasks will be migrated by
@@ -6113,12 +6267,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
6113 if (rq->nr_running == 1) 6267 if (rq->nr_running == 1)
6114 break; 6268 break;
6115 6269
6116 /* 6270 next = __pick_migrate_task(rq);
6117 * pick_next_task() assumes pinned rq->lock:
6118 */
6119 next = pick_next_task(rq, &fake_task, rf);
6120 BUG_ON(!next);
6121 put_prev_task(rq, next);
6122 6271
6123 /* 6272 /*
6124 * Rules for changing task_struct::cpus_mask are holding 6273 * Rules for changing task_struct::cpus_mask are holding
@@ -6415,19 +6564,19 @@ DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
6415 6564
6416void __init sched_init(void) 6565void __init sched_init(void)
6417{ 6566{
6418 unsigned long alloc_size = 0, ptr; 6567 unsigned long ptr = 0;
6419 int i; 6568 int i;
6420 6569
6421 wait_bit_init(); 6570 wait_bit_init();
6422 6571
6423#ifdef CONFIG_FAIR_GROUP_SCHED 6572#ifdef CONFIG_FAIR_GROUP_SCHED
6424 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6573 ptr += 2 * nr_cpu_ids * sizeof(void **);
6425#endif 6574#endif
6426#ifdef CONFIG_RT_GROUP_SCHED 6575#ifdef CONFIG_RT_GROUP_SCHED
6427 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 6576 ptr += 2 * nr_cpu_ids * sizeof(void **);
6428#endif 6577#endif
6429 if (alloc_size) { 6578 if (ptr) {
6430 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 6579 ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
6431 6580
6432#ifdef CONFIG_FAIR_GROUP_SCHED 6581#ifdef CONFIG_FAIR_GROUP_SCHED
6433 root_task_group.se = (struct sched_entity **)ptr; 6582 root_task_group.se = (struct sched_entity **)ptr;
@@ -6746,7 +6895,7 @@ struct task_struct *curr_task(int cpu)
6746 6895
6747#ifdef CONFIG_IA64 6896#ifdef CONFIG_IA64
6748/** 6897/**
6749 * set_curr_task - set the current task for a given CPU. 6898 * ia64_set_curr_task - set the current task for a given CPU.
6750 * @cpu: the processor in question. 6899 * @cpu: the processor in question.
6751 * @p: the task pointer to set. 6900 * @p: the task pointer to set.
6752 * 6901 *
@@ -6771,6 +6920,20 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
6771/* task_group_lock serializes the addition/removal of task groups */ 6920/* task_group_lock serializes the addition/removal of task groups */
6772static DEFINE_SPINLOCK(task_group_lock); 6921static DEFINE_SPINLOCK(task_group_lock);
6773 6922
6923static inline void alloc_uclamp_sched_group(struct task_group *tg,
6924 struct task_group *parent)
6925{
6926#ifdef CONFIG_UCLAMP_TASK_GROUP
6927 enum uclamp_id clamp_id;
6928
6929 for_each_clamp_id(clamp_id) {
6930 uclamp_se_set(&tg->uclamp_req[clamp_id],
6931 uclamp_none(clamp_id), false);
6932 tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
6933 }
6934#endif
6935}
6936
6774static void sched_free_group(struct task_group *tg) 6937static void sched_free_group(struct task_group *tg)
6775{ 6938{
6776 free_fair_sched_group(tg); 6939 free_fair_sched_group(tg);
@@ -6794,6 +6957,8 @@ struct task_group *sched_create_group(struct task_group *parent)
6794 if (!alloc_rt_sched_group(tg, parent)) 6957 if (!alloc_rt_sched_group(tg, parent))
6795 goto err; 6958 goto err;
6796 6959
6960 alloc_uclamp_sched_group(tg, parent);
6961
6797 return tg; 6962 return tg;
6798 6963
6799err: 6964err:
@@ -6897,7 +7062,7 @@ void sched_move_task(struct task_struct *tsk)
6897 if (queued) 7062 if (queued)
6898 enqueue_task(rq, tsk, queue_flags); 7063 enqueue_task(rq, tsk, queue_flags);
6899 if (running) 7064 if (running)
6900 set_curr_task(rq, tsk); 7065 set_next_task(rq, tsk);
6901 7066
6902 task_rq_unlock(rq, tsk, &rf); 7067 task_rq_unlock(rq, tsk, &rf);
6903} 7068}
@@ -6980,10 +7145,6 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
6980#ifdef CONFIG_RT_GROUP_SCHED 7145#ifdef CONFIG_RT_GROUP_SCHED
6981 if (!sched_rt_can_attach(css_tg(css), task)) 7146 if (!sched_rt_can_attach(css_tg(css), task))
6982 return -EINVAL; 7147 return -EINVAL;
6983#else
6984 /* We don't support RT-tasks being in separate groups */
6985 if (task->sched_class != &fair_sched_class)
6986 return -EINVAL;
6987#endif 7148#endif
6988 /* 7149 /*
6989 * Serialize against wake_up_new_task() such that if its 7150 * Serialize against wake_up_new_task() such that if its
@@ -7014,6 +7175,178 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
7014 sched_move_task(task); 7175 sched_move_task(task);
7015} 7176}
7016 7177
7178#ifdef CONFIG_UCLAMP_TASK_GROUP
7179static void cpu_util_update_eff(struct cgroup_subsys_state *css)
7180{
7181 struct cgroup_subsys_state *top_css = css;
7182 struct uclamp_se *uc_parent = NULL;
7183 struct uclamp_se *uc_se = NULL;
7184 unsigned int eff[UCLAMP_CNT];
7185 enum uclamp_id clamp_id;
7186 unsigned int clamps;
7187
7188 css_for_each_descendant_pre(css, top_css) {
7189 uc_parent = css_tg(css)->parent
7190 ? css_tg(css)->parent->uclamp : NULL;
7191
7192 for_each_clamp_id(clamp_id) {
7193 /* Assume effective clamps matches requested clamps */
7194 eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
7195 /* Cap effective clamps with parent's effective clamps */
7196 if (uc_parent &&
7197 eff[clamp_id] > uc_parent[clamp_id].value) {
7198 eff[clamp_id] = uc_parent[clamp_id].value;
7199 }
7200 }
7201 /* Ensure protection is always capped by limit */
7202 eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
7203
7204 /* Propagate most restrictive effective clamps */
7205 clamps = 0x0;
7206 uc_se = css_tg(css)->uclamp;
7207 for_each_clamp_id(clamp_id) {
7208 if (eff[clamp_id] == uc_se[clamp_id].value)
7209 continue;
7210 uc_se[clamp_id].value = eff[clamp_id];
7211 uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
7212 clamps |= (0x1 << clamp_id);
7213 }
7214 if (!clamps) {
7215 css = css_rightmost_descendant(css);
7216 continue;
7217 }
7218
7219 /* Immediately update descendants RUNNABLE tasks */
7220 uclamp_update_active_tasks(css, clamps);
7221 }
7222}
7223
7224/*
7225 * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
7226 * C expression. Since there is no way to convert a macro argument (N) into a
7227 * character constant, use two levels of macros.
7228 */
7229#define _POW10(exp) ((unsigned int)1e##exp)
7230#define POW10(exp) _POW10(exp)
7231
7232struct uclamp_request {
7233#define UCLAMP_PERCENT_SHIFT 2
7234#define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
7235 s64 percent;
7236 u64 util;
7237 int ret;
7238};
7239
7240static inline struct uclamp_request
7241capacity_from_percent(char *buf)
7242{
7243 struct uclamp_request req = {
7244 .percent = UCLAMP_PERCENT_SCALE,
7245 .util = SCHED_CAPACITY_SCALE,
7246 .ret = 0,
7247 };
7248
7249 buf = strim(buf);
7250 if (strcmp(buf, "max")) {
7251 req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT,
7252 &req.percent);
7253 if (req.ret)
7254 return req;
7255 if (req.percent > UCLAMP_PERCENT_SCALE) {
7256 req.ret = -ERANGE;
7257 return req;
7258 }
7259
7260 req.util = req.percent << SCHED_CAPACITY_SHIFT;
7261 req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
7262 }
7263
7264 return req;
7265}
7266
7267static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
7268 size_t nbytes, loff_t off,
7269 enum uclamp_id clamp_id)
7270{
7271 struct uclamp_request req;
7272 struct task_group *tg;
7273
7274 req = capacity_from_percent(buf);
7275 if (req.ret)
7276 return req.ret;
7277
7278 mutex_lock(&uclamp_mutex);
7279 rcu_read_lock();
7280
7281 tg = css_tg(of_css(of));
7282 if (tg->uclamp_req[clamp_id].value != req.util)
7283 uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
7284
7285 /*
7286 * Because of not recoverable conversion rounding we keep track of the
7287 * exact requested value
7288 */
7289 tg->uclamp_pct[clamp_id] = req.percent;
7290
7291 /* Update effective clamps to track the most restrictive value */
7292 cpu_util_update_eff(of_css(of));
7293
7294 rcu_read_unlock();
7295 mutex_unlock(&uclamp_mutex);
7296
7297 return nbytes;
7298}
7299
7300static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of,
7301 char *buf, size_t nbytes,
7302 loff_t off)
7303{
7304 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
7305}
7306
7307static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of,
7308 char *buf, size_t nbytes,
7309 loff_t off)
7310{
7311 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
7312}
7313
7314static inline void cpu_uclamp_print(struct seq_file *sf,
7315 enum uclamp_id clamp_id)
7316{
7317 struct task_group *tg;
7318 u64 util_clamp;
7319 u64 percent;
7320 u32 rem;
7321
7322 rcu_read_lock();
7323 tg = css_tg(seq_css(sf));
7324 util_clamp = tg->uclamp_req[clamp_id].value;
7325 rcu_read_unlock();
7326
7327 if (util_clamp == SCHED_CAPACITY_SCALE) {
7328 seq_puts(sf, "max\n");
7329 return;
7330 }
7331
7332 percent = tg->uclamp_pct[clamp_id];
7333 percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
7334 seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
7335}
7336
7337static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
7338{
7339 cpu_uclamp_print(sf, UCLAMP_MIN);
7340 return 0;
7341}
7342
7343static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
7344{
7345 cpu_uclamp_print(sf, UCLAMP_MAX);
7346 return 0;
7347}
7348#endif /* CONFIG_UCLAMP_TASK_GROUP */
7349
7017#ifdef CONFIG_FAIR_GROUP_SCHED 7350#ifdef CONFIG_FAIR_GROUP_SCHED
7018static int cpu_shares_write_u64(struct cgroup_subsys_state *css, 7351static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
7019 struct cftype *cftype, u64 shareval) 7352 struct cftype *cftype, u64 shareval)
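The conversion done by capacity_from_percent() above is easy to check in isolation. Below is a standalone userspace re-creation of the arithmetic, with the kernel constants assumed (SCHED_CAPACITY_SHIFT = 10, percentages carried with two fractional digits, so UCLAMP_PERCENT_SCALE = 10000); percent_to_util() is illustrative only.

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT	10		/* SCHED_CAPACITY_SCALE = 1024 */
#define UCLAMP_PERCENT_SCALE	(100 * 100)	/* 100.00% with 2 fractional digits */

/* Same math as capacity_from_percent(): percent is what cgroup_parse_float()
 * produces, e.g. "50" -> 5000, "12.34" -> 1234. */
static unsigned long percent_to_util(unsigned long long percent)
{
	unsigned long long util = percent << SCHED_CAPACITY_SHIFT;

	/* DIV_ROUND_CLOSEST_ULL() equivalent */
	return (util + UCLAMP_PERCENT_SCALE / 2) / UCLAMP_PERCENT_SCALE;
}

int main(void)
{
	printf("50.00%%  -> util %lu\n", percent_to_util(5000));	/* 512 */
	printf("12.34%%  -> util %lu\n", percent_to_util(1234));	/* 126 */
	printf("100.00%% -> util %lu\n", percent_to_util(10000));	/* 1024 */
	return 0;
}

cpu_uclamp_print() goes the other way: it reports the exact tg->uclamp_pct value that was written, so "12.34" reads back as "12.34" even though the effective utilization clamp was rounded to 126.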
@@ -7359,6 +7692,20 @@ static struct cftype cpu_legacy_files[] = {
7359 .write_u64 = cpu_rt_period_write_uint, 7692 .write_u64 = cpu_rt_period_write_uint,
7360 }, 7693 },
7361#endif 7694#endif
7695#ifdef CONFIG_UCLAMP_TASK_GROUP
7696 {
7697 .name = "uclamp.min",
7698 .flags = CFTYPE_NOT_ON_ROOT,
7699 .seq_show = cpu_uclamp_min_show,
7700 .write = cpu_uclamp_min_write,
7701 },
7702 {
7703 .name = "uclamp.max",
7704 .flags = CFTYPE_NOT_ON_ROOT,
7705 .seq_show = cpu_uclamp_max_show,
7706 .write = cpu_uclamp_max_write,
7707 },
7708#endif
7362 { } /* Terminate */ 7709 { } /* Terminate */
7363}; 7710};
7364 7711
@@ -7526,6 +7873,20 @@ static struct cftype cpu_files[] = {
7526 .write = cpu_max_write, 7873 .write = cpu_max_write,
7527 }, 7874 },
7528#endif 7875#endif
7876#ifdef CONFIG_UCLAMP_TASK_GROUP
7877 {
7878 .name = "uclamp.min",
7879 .flags = CFTYPE_NOT_ON_ROOT,
7880 .seq_show = cpu_uclamp_min_show,
7881 .write = cpu_uclamp_min_write,
7882 },
7883 {
7884 .name = "uclamp.max",
7885 .flags = CFTYPE_NOT_ON_ROOT,
7886 .seq_show = cpu_uclamp_max_show,
7887 .write = cpu_uclamp_max_write,
7888 },
7889#endif
7529 { } /* terminate */ 7890 { } /* terminate */
7530}; 7891};
7531 7892
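A hedged usage sketch for the two attributes registered above. The paths are assumptions: cgroup v2 mounted at /sys/fs/cgroup, a child group "demo" already created, and "cpu" enabled in the parent's cgroup.subtree_control; write_attr() is a throwaway helper.

#include <stdio.h>

/* Write a utilization-clamp request into the new cgroup attributes.  Values
 * are percentages of CPU capacity ("50", "12.34") or the literal "max". */
static int write_attr(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	/* Boost: tasks in "demo" are treated as using at least ~50% capacity. */
	write_attr("/sys/fs/cgroup/demo/cpu.uclamp.min", "50");
	/* Cap: their utilization is clamped to at most ~80% of capacity. */
	write_attr("/sys/fs/cgroup/demo/cpu.uclamp.max", "80");
	return 0;
}

Per cpu_util_update_eff() above, the values that take effect are capped by the parent's effective clamps, and uclamp.min is never allowed to exceed uclamp.max.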
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 867b4bb6d4be..fdce9cfaca05 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -263,9 +263,9 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs,
263 * irq metric. Because IRQ/steal time is hidden from the task clock we 263 * irq metric. Because IRQ/steal time is hidden from the task clock we
264 * need to scale the task numbers: 264 * need to scale the task numbers:
265 * 265 *
266 * 1 - irq 266 * max - irq
267 * U' = irq + ------- * U 267 * U' = irq + --------- * U
268 * max 268 * max
269 */ 269 */
270 util = scale_irq_capacity(util, irq, max); 270 util = scale_irq_capacity(util, irq, max);
271 util += irq; 271 util += irq;
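The corrected comment is easier to trust with numbers plugged in. A standalone check, assuming scale_irq_capacity() computes util * (max - irq) / max as in kernel/sched/sched.h:

#include <stdio.h>

/* Assumed equivalent of the kernel's scale_irq_capacity(). */
static unsigned long scale_irq_capacity(unsigned long util,
					unsigned long irq, unsigned long max)
{
	util *= (max - irq);
	util /= max;
	return util;
}

int main(void)
{
	unsigned long max = 1024, irq = 256, util = 512;

	/* U' = irq + (max - irq) / max * U, per the fixed comment */
	util = scale_irq_capacity(util, irq, max);	/* 512 * 768 / 1024 = 384 */
	util += irq;					/* 384 + 256 = 640 */
	printf("U' = %lu\n", util);
	return 0;
}

The result matches the corrected formula; the old "1 - irq" numerator did not describe what the code computes.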
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 46122edd8552..39dc9f74f289 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -529,6 +529,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
529static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) 529static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
530{ 530{
531 struct rq *later_rq = NULL; 531 struct rq *later_rq = NULL;
532 struct dl_bw *dl_b;
532 533
533 later_rq = find_lock_later_rq(p, rq); 534 later_rq = find_lock_later_rq(p, rq);
534 if (!later_rq) { 535 if (!later_rq) {
@@ -557,6 +558,38 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
557 double_lock_balance(rq, later_rq); 558 double_lock_balance(rq, later_rq);
558 } 559 }
559 560
561 if (p->dl.dl_non_contending || p->dl.dl_throttled) {
562 /*
563 * Inactive timer is armed (or callback is running, but
564 * waiting for us to release rq locks). In any case, when it
565 * will fire (or continue), it will see running_bw of this
566 * task migrated to later_rq (and correctly handle it).
567 */
568 sub_running_bw(&p->dl, &rq->dl);
569 sub_rq_bw(&p->dl, &rq->dl);
570
571 add_rq_bw(&p->dl, &later_rq->dl);
572 add_running_bw(&p->dl, &later_rq->dl);
573 } else {
574 sub_rq_bw(&p->dl, &rq->dl);
575 add_rq_bw(&p->dl, &later_rq->dl);
576 }
577
578 /*
579 * And we finally need to fixup root_domain(s) bandwidth accounting,
580 * since p is still hanging out in the old (now moved to default) root
581 * domain.
582 */
583 dl_b = &rq->rd->dl_bw;
584 raw_spin_lock(&dl_b->lock);
585 __dl_sub(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
586 raw_spin_unlock(&dl_b->lock);
587
588 dl_b = &later_rq->rd->dl_bw;
589 raw_spin_lock(&dl_b->lock);
590 __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(later_rq->rd->span));
591 raw_spin_unlock(&dl_b->lock);
592
560 set_task_cpu(p, later_rq->cpu); 593 set_task_cpu(p, later_rq->cpu);
561 double_unlock_balance(later_rq, rq); 594 double_unlock_balance(later_rq, rq);
562 595
@@ -1694,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
1694} 1727}
1695#endif 1728#endif
1696 1729
1697static inline void set_next_task(struct rq *rq, struct task_struct *p) 1730static void set_next_task_dl(struct rq *rq, struct task_struct *p)
1698{ 1731{
1699 p->se.exec_start = rq_clock_task(rq); 1732 p->se.exec_start = rq_clock_task(rq);
1700 1733
1701 /* You can't push away the running task */ 1734 /* You can't push away the running task */
1702 dequeue_pushable_dl_task(rq, p); 1735 dequeue_pushable_dl_task(rq, p);
1736
1737 if (hrtick_enabled(rq))
1738 start_hrtick_dl(rq, p);
1739
1740 if (rq->curr->sched_class != &dl_sched_class)
1741 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1742
1743 deadline_queue_push_tasks(rq);
1703} 1744}
1704 1745
1705static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, 1746static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1720,64 +1761,42 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1720 struct task_struct *p; 1761 struct task_struct *p;
1721 struct dl_rq *dl_rq; 1762 struct dl_rq *dl_rq;
1722 1763
1723 dl_rq = &rq->dl; 1764 WARN_ON_ONCE(prev || rf);
1724 1765
1725 if (need_pull_dl_task(rq, prev)) { 1766 dl_rq = &rq->dl;
1726 /*
1727 * This is OK, because current is on_cpu, which avoids it being
1728 * picked for load-balance and preemption/IRQs are still
1729 * disabled avoiding further scheduler activity on it and we're
1730 * being very careful to re-start the picking loop.
1731 */
1732 rq_unpin_lock(rq, rf);
1733 pull_dl_task(rq);
1734 rq_repin_lock(rq, rf);
1735 /*
1736 * pull_dl_task() can drop (and re-acquire) rq->lock; this
1737 * means a stop task can slip in, in which case we need to
1738 * re-start task selection.
1739 */
1740 if (rq->stop && task_on_rq_queued(rq->stop))
1741 return RETRY_TASK;
1742 }
1743
1744 /*
1745 * When prev is DL, we may throttle it in put_prev_task().
1746 * So, we update time before we check for dl_nr_running.
1747 */
1748 if (prev->sched_class == &dl_sched_class)
1749 update_curr_dl(rq);
1750 1767
1751 if (unlikely(!dl_rq->dl_nr_running)) 1768 if (unlikely(!dl_rq->dl_nr_running))
1752 return NULL; 1769 return NULL;
1753 1770
1754 put_prev_task(rq, prev);
1755
1756 dl_se = pick_next_dl_entity(rq, dl_rq); 1771 dl_se = pick_next_dl_entity(rq, dl_rq);
1757 BUG_ON(!dl_se); 1772 BUG_ON(!dl_se);
1758 1773
1759 p = dl_task_of(dl_se); 1774 p = dl_task_of(dl_se);
1760 1775
1761 set_next_task(rq, p); 1776 set_next_task_dl(rq, p);
1762
1763 if (hrtick_enabled(rq))
1764 start_hrtick_dl(rq, p);
1765
1766 deadline_queue_push_tasks(rq);
1767
1768 if (rq->curr->sched_class != &dl_sched_class)
1769 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1770 1777
1771 return p; 1778 return p;
1772} 1779}
1773 1780
1774static void put_prev_task_dl(struct rq *rq, struct task_struct *p) 1781static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1775{ 1782{
1776 update_curr_dl(rq); 1783 update_curr_dl(rq);
1777 1784
1778 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); 1785 update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
1779 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) 1786 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
1780 enqueue_pushable_dl_task(rq, p); 1787 enqueue_pushable_dl_task(rq, p);
1788
1789 if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
1790 /*
1791 * This is OK, because current is on_cpu, which avoids it being
1792 * picked for load-balance and preemption/IRQs are still
1793 * disabled avoiding further scheduler activity on it and we've
1794 * not yet started the picking loop.
1795 */
1796 rq_unpin_lock(rq, rf);
1797 pull_dl_task(rq);
1798 rq_repin_lock(rq, rf);
1799 }
1781} 1800}
1782 1801
1783/* 1802/*
@@ -1811,11 +1830,6 @@ static void task_fork_dl(struct task_struct *p)
1811 */ 1830 */
1812} 1831}
1813 1832
1814static void set_curr_task_dl(struct rq *rq)
1815{
1816 set_next_task(rq, rq->curr);
1817}
1818
1819#ifdef CONFIG_SMP 1833#ifdef CONFIG_SMP
1820 1834
1821/* Only try algorithms three times */ 1835/* Only try algorithms three times */
@@ -2275,6 +2289,36 @@ void __init init_sched_dl_class(void)
2275 GFP_KERNEL, cpu_to_node(i)); 2289 GFP_KERNEL, cpu_to_node(i));
2276} 2290}
2277 2291
2292void dl_add_task_root_domain(struct task_struct *p)
2293{
2294 struct rq_flags rf;
2295 struct rq *rq;
2296 struct dl_bw *dl_b;
2297
2298 rq = task_rq_lock(p, &rf);
2299 if (!dl_task(p))
2300 goto unlock;
2301
2302 dl_b = &rq->rd->dl_bw;
2303 raw_spin_lock(&dl_b->lock);
2304
2305 __dl_add(dl_b, p->dl.dl_bw, cpumask_weight(rq->rd->span));
2306
2307 raw_spin_unlock(&dl_b->lock);
2308
2309unlock:
2310 task_rq_unlock(rq, p, &rf);
2311}
2312
2313void dl_clear_root_domain(struct root_domain *rd)
2314{
2315 unsigned long flags;
2316
2317 raw_spin_lock_irqsave(&rd->dl_bw.lock, flags);
2318 rd->dl_bw.total_bw = 0;
2319 raw_spin_unlock_irqrestore(&rd->dl_bw.lock, flags);
2320}
2321
2278#endif /* CONFIG_SMP */ 2322#endif /* CONFIG_SMP */
2279 2323
2280static void switched_from_dl(struct rq *rq, struct task_struct *p) 2324static void switched_from_dl(struct rq *rq, struct task_struct *p)
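
The new dl_clear_root_domain() and dl_add_task_root_domain() give the topology-rebuild path a way to zero a root domain's accounted deadline bandwidth and then re-add each DL task's share under dl_bw->lock, mirroring the __dl_sub()/__dl_add() pair used in the offline-migration fixup earlier in this file. A userspace toy of that clear-and-repopulate bookkeeping; the lock and structures are simplified stand-ins, not the kernel's:

#include <stdio.h>
#include <pthread.h>

struct dl_bw {
        pthread_mutex_t lock;
        unsigned long long total_bw;    /* sum of admitted task bandwidth */
};

static void dl_add(struct dl_bw *b, unsigned long long tsk_bw)
{
        pthread_mutex_lock(&b->lock);
        b->total_bw += tsk_bw;
        pthread_mutex_unlock(&b->lock);
}

static void dl_clear(struct dl_bw *b)
{
        pthread_mutex_lock(&b->lock);
        b->total_bw = 0;                /* rebuild starts from a clean slate */
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
        struct dl_bw rd = { .lock = PTHREAD_MUTEX_INITIALIZER };
        unsigned long long task_bw[] = { 100000, 250000, 50000 };

        /* Topology rebuild: clear, then walk the DL tasks and re-account them. */
        dl_clear(&rd);
        for (unsigned int i = 0; i < sizeof(task_bw) / sizeof(task_bw[0]); i++)
                dl_add(&rd, task_bw[i]);

        printf("re-accounted bandwidth: %llu\n", rd.total_bw);  /* 400000 */
        return 0;
}
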
@@ -2395,6 +2439,7 @@ const struct sched_class dl_sched_class = {
2395 2439
2396 .pick_next_task = pick_next_task_dl, 2440 .pick_next_task = pick_next_task_dl,
2397 .put_prev_task = put_prev_task_dl, 2441 .put_prev_task = put_prev_task_dl,
2442 .set_next_task = set_next_task_dl,
2398 2443
2399#ifdef CONFIG_SMP 2444#ifdef CONFIG_SMP
2400 .select_task_rq = select_task_rq_dl, 2445 .select_task_rq = select_task_rq_dl,
@@ -2405,7 +2450,6 @@ const struct sched_class dl_sched_class = {
2405 .task_woken = task_woken_dl, 2450 .task_woken = task_woken_dl,
2406#endif 2451#endif
2407 2452
2408 .set_curr_task = set_curr_task_dl,
2409 .task_tick = task_tick_dl, 2453 .task_tick = task_tick_dl,
2410 .task_fork = task_fork_dl, 2454 .task_fork = task_fork_dl,
2411 2455
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 500f5db0de0b..d4bbf68c3161 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -96,12 +96,12 @@ int __weak arch_asym_cpu_priority(int cpu)
96} 96}
97 97
98/* 98/*
99 * The margin used when comparing utilization with CPU capacity: 99 * The margin used when comparing utilization with CPU capacity.
100 * util * margin < capacity * 1024
101 * 100 *
102 * (default: ~20%) 101 * (default: ~20%)
103 */ 102 */
104static unsigned int capacity_margin = 1280; 103#define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)
104
105#endif 105#endif
106 106
107#ifdef CONFIG_CFS_BANDWIDTH 107#ifdef CONFIG_CFS_BANDWIDTH
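
capacity_margin is folded into the new fits_capacity() helper above: a utilization "fits" a capacity only while it stays below 1024/1280 = 80% of it, i.e. leaves roughly 20% headroom. A quick standalone check of that cut-off (the macro mirrors the hunk above; the values are arbitrary):

#include <stdio.h>

#define fits_capacity(cap, max) ((cap) * 1280UL < (max) * 1024UL)

int main(void)
{
        unsigned long max = 1024;       /* a full-capacity CPU */

        printf("%d\n", fits_capacity(819UL, max));  /* 1: below the 80% cutoff      */
        printf("%d\n", fits_capacity(820UL, max));  /* 0: under 20% headroom left   */
        return 0;
}
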
@@ -1188,47 +1188,6 @@ static unsigned int task_scan_max(struct task_struct *p)
1188 return max(smin, smax); 1188 return max(smin, smax);
1189} 1189}
1190 1190
1191void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
1192{
1193 int mm_users = 0;
1194 struct mm_struct *mm = p->mm;
1195
1196 if (mm) {
1197 mm_users = atomic_read(&mm->mm_users);
1198 if (mm_users == 1) {
1199 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1200 mm->numa_scan_seq = 0;
1201 }
1202 }
1203 p->node_stamp = 0;
1204 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
1205 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1206 p->numa_work.next = &p->numa_work;
1207 p->numa_faults = NULL;
1208 RCU_INIT_POINTER(p->numa_group, NULL);
1209 p->last_task_numa_placement = 0;
1210 p->last_sum_exec_runtime = 0;
1211
1212 /* New address space, reset the preferred nid */
1213 if (!(clone_flags & CLONE_VM)) {
1214 p->numa_preferred_nid = NUMA_NO_NODE;
1215 return;
1216 }
1217
1218 /*
1219 * New thread, keep existing numa_preferred_nid which should be copied
1220 * already by arch_dup_task_struct but stagger when scans start.
1221 */
1222 if (mm) {
1223 unsigned int delay;
1224
1225 delay = min_t(unsigned int, task_scan_max(current),
1226 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
1227 delay += 2 * TICK_NSEC;
1228 p->node_stamp = delay;
1229 }
1230}
1231
1232static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 1191static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1233{ 1192{
1234 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE); 1193 rq->nr_numa_running += (p->numa_preferred_nid != NUMA_NO_NODE);
@@ -2523,7 +2482,7 @@ static void reset_ptenuma_scan(struct task_struct *p)
2523 * The expensive part of numa migration is done from task_work context. 2482 * The expensive part of numa migration is done from task_work context.
2524 * Triggered from task_tick_numa(). 2483 * Triggered from task_tick_numa().
2525 */ 2484 */
2526void task_numa_work(struct callback_head *work) 2485static void task_numa_work(struct callback_head *work)
2527{ 2486{
2528 unsigned long migrate, next_scan, now = jiffies; 2487 unsigned long migrate, next_scan, now = jiffies;
2529 struct task_struct *p = current; 2488 struct task_struct *p = current;
@@ -2536,7 +2495,7 @@ void task_numa_work(struct callback_head *work)
2536 2495
2537 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); 2496 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2538 2497
2539 work->next = work; /* protect against double add */ 2498 work->next = work;
2540 /* 2499 /*
2541 * Who cares about NUMA placement when they're dying. 2500 * Who cares about NUMA placement when they're dying.
2542 * 2501 *
@@ -2665,6 +2624,50 @@ out:
2665 } 2624 }
2666} 2625}
2667 2626
2627void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
2628{
2629 int mm_users = 0;
2630 struct mm_struct *mm = p->mm;
2631
2632 if (mm) {
2633 mm_users = atomic_read(&mm->mm_users);
2634 if (mm_users == 1) {
2635 mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2636 mm->numa_scan_seq = 0;
2637 }
2638 }
2639 p->node_stamp = 0;
2640 p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
2641 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2642 /* Protect against double add, see task_tick_numa and task_numa_work */
2643 p->numa_work.next = &p->numa_work;
2644 p->numa_faults = NULL;
2645 RCU_INIT_POINTER(p->numa_group, NULL);
2646 p->last_task_numa_placement = 0;
2647 p->last_sum_exec_runtime = 0;
2648
2649 init_task_work(&p->numa_work, task_numa_work);
2650
2651 /* New address space, reset the preferred nid */
2652 if (!(clone_flags & CLONE_VM)) {
2653 p->numa_preferred_nid = NUMA_NO_NODE;
2654 return;
2655 }
2656
2657 /*
2658 * New thread, keep existing numa_preferred_nid which should be copied
2659 * already by arch_dup_task_struct but stagger when scans start.
2660 */
2661 if (mm) {
2662 unsigned int delay;
2663
2664 delay = min_t(unsigned int, task_scan_max(current),
2665 current->numa_scan_period * mm_users * NSEC_PER_MSEC);
2666 delay += 2 * TICK_NSEC;
2667 p->node_stamp = delay;
2668 }
2669}
2670
2668/* 2671/*
2669 * Drive the periodic memory faults.. 2672 * Drive the periodic memory faults..
2670 */ 2673 */
@@ -2693,10 +2696,8 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2693 curr->numa_scan_period = task_scan_start(curr); 2696 curr->numa_scan_period = task_scan_start(curr);
2694 curr->node_stamp += period; 2697 curr->node_stamp += period;
2695 2698
2696 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 2699 if (!time_before(jiffies, curr->mm->numa_next_scan))
2697 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2698 task_work_add(curr, work, true); 2700 task_work_add(curr, work, true);
2699 }
2700 } 2701 }
2701} 2702}
2702 2703
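
init_task_work() now runs once from init_numa_balancing() instead of on every tick, and the "numa_work.next points at itself while idle" convention is what keeps task_tick_numa() from queueing the same callback twice. A userspace toy of that sentinel idiom; the list and callback types here are invented stand-ins, not the kernel's callback_head/task_work API:

#include <stdio.h>

struct work {
        struct work *next;              /* next == self means "not queued" */
        void (*func)(struct work *);
};

static struct work *pending;            /* pending work (LIFO, single consumer) */

static void work_init(struct work *w, void (*func)(struct work *))
{
        w->next = w;
        w->func = func;
}

static int work_add(struct work *w)
{
        if (w->next != w)
                return -1;              /* already queued: reject the double add */
        w->next = pending;
        pending = w;
        return 0;
}

static void work_run_all(void)
{
        while (pending) {
                struct work *w = pending;

                pending = w->next;
                w->next = w;            /* mark idle again before the callback */
                w->func(w);
        }
}

static void numa_scan(struct work *w) { (void)w; puts("scan"); }

int main(void)
{
        struct work w;

        work_init(&w, numa_scan);
        printf("first add:  %d\n", work_add(&w));   /*  0 */
        printf("second add: %d\n", work_add(&w));   /* -1, double add rejected */
        work_run_all();
        printf("re-add:     %d\n", work_add(&w));   /*  0, idle again after running */
        return 0;
}
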
@@ -3689,8 +3690,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3689 return cfs_rq->avg.load_avg; 3690 return cfs_rq->avg.load_avg;
3690} 3691}
3691 3692
3692static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
3693
3694static inline unsigned long task_util(struct task_struct *p) 3693static inline unsigned long task_util(struct task_struct *p)
3695{ 3694{
3696 return READ_ONCE(p->se.avg.util_avg); 3695 return READ_ONCE(p->se.avg.util_avg);
@@ -3807,7 +3806,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
3807 3806
3808static inline int task_fits_capacity(struct task_struct *p, long capacity) 3807static inline int task_fits_capacity(struct task_struct *p, long capacity)
3809{ 3808{
3810 return capacity * 1024 > task_util_est(p) * capacity_margin; 3809 return fits_capacity(task_util_est(p), capacity);
3811} 3810}
3812 3811
3813static inline void update_misfit_status(struct task_struct *p, struct rq *rq) 3812static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4370,8 +4369,6 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
4370 4369
4371 now = sched_clock_cpu(smp_processor_id()); 4370 now = sched_clock_cpu(smp_processor_id());
4372 cfs_b->runtime = cfs_b->quota; 4371 cfs_b->runtime = cfs_b->quota;
4373 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
4374 cfs_b->expires_seq++;
4375} 4372}
4376 4373
4377static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) 4374static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4393,8 +4390,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4393{ 4390{
4394 struct task_group *tg = cfs_rq->tg; 4391 struct task_group *tg = cfs_rq->tg;
4395 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 4392 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
4396 u64 amount = 0, min_amount, expires; 4393 u64 amount = 0, min_amount;
4397 int expires_seq;
4398 4394
4399 /* note: this is a positive sum as runtime_remaining <= 0 */ 4395 /* note: this is a positive sum as runtime_remaining <= 0 */
4400 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining; 4396 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4411,61 +4407,17 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4411 cfs_b->idle = 0; 4407 cfs_b->idle = 0;
4412 } 4408 }
4413 } 4409 }
4414 expires_seq = cfs_b->expires_seq;
4415 expires = cfs_b->runtime_expires;
4416 raw_spin_unlock(&cfs_b->lock); 4410 raw_spin_unlock(&cfs_b->lock);
4417 4411
4418 cfs_rq->runtime_remaining += amount; 4412 cfs_rq->runtime_remaining += amount;
4419 /*
4420 * we may have advanced our local expiration to account for allowed
4421 * spread between our sched_clock and the one on which runtime was
4422 * issued.
4423 */
4424 if (cfs_rq->expires_seq != expires_seq) {
4425 cfs_rq->expires_seq = expires_seq;
4426 cfs_rq->runtime_expires = expires;
4427 }
4428 4413
4429 return cfs_rq->runtime_remaining > 0; 4414 return cfs_rq->runtime_remaining > 0;
4430} 4415}
4431 4416
4432/*
4433 * Note: This depends on the synchronization provided by sched_clock and the
4434 * fact that rq->clock snapshots this value.
4435 */
4436static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4437{
4438 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4439
4440 /* if the deadline is ahead of our clock, nothing to do */
4441 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
4442 return;
4443
4444 if (cfs_rq->runtime_remaining < 0)
4445 return;
4446
4447 /*
4448 * If the local deadline has passed we have to consider the
4449 * possibility that our sched_clock is 'fast' and the global deadline
4450 * has not truly expired.
4451 *
4452 * Fortunately we can check determine whether this the case by checking
4453 * whether the global deadline(cfs_b->expires_seq) has advanced.
4454 */
4455 if (cfs_rq->expires_seq == cfs_b->expires_seq) {
4456 /* extend local deadline, drift is bounded above by 2 ticks */
4457 cfs_rq->runtime_expires += TICK_NSEC;
4458 } else {
4459 /* global deadline is ahead, expiration has passed */
4460 cfs_rq->runtime_remaining = 0;
4461 }
4462}
4463
4464static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) 4417static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4465{ 4418{
4466 /* dock delta_exec before expiring quota (as it could span periods) */ 4419 /* dock delta_exec before expiring quota (as it could span periods) */
4467 cfs_rq->runtime_remaining -= delta_exec; 4420 cfs_rq->runtime_remaining -= delta_exec;
4468 expire_cfs_rq_runtime(cfs_rq);
4469 4421
4470 if (likely(cfs_rq->runtime_remaining > 0)) 4422 if (likely(cfs_rq->runtime_remaining > 0))
4471 return; 4423 return;
@@ -4556,7 +4508,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4556 struct rq *rq = rq_of(cfs_rq); 4508 struct rq *rq = rq_of(cfs_rq);
4557 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 4509 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4558 struct sched_entity *se; 4510 struct sched_entity *se;
4559 long task_delta, dequeue = 1; 4511 long task_delta, idle_task_delta, dequeue = 1;
4560 bool empty; 4512 bool empty;
4561 4513
4562 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; 4514 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4567,6 +4519,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4567 rcu_read_unlock(); 4519 rcu_read_unlock();
4568 4520
4569 task_delta = cfs_rq->h_nr_running; 4521 task_delta = cfs_rq->h_nr_running;
4522 idle_task_delta = cfs_rq->idle_h_nr_running;
4570 for_each_sched_entity(se) { 4523 for_each_sched_entity(se) {
4571 struct cfs_rq *qcfs_rq = cfs_rq_of(se); 4524 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4572 /* throttled entity or throttle-on-deactivate */ 4525 /* throttled entity or throttle-on-deactivate */
@@ -4576,6 +4529,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4576 if (dequeue) 4529 if (dequeue)
4577 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); 4530 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4578 qcfs_rq->h_nr_running -= task_delta; 4531 qcfs_rq->h_nr_running -= task_delta;
4532 qcfs_rq->idle_h_nr_running -= idle_task_delta;
4579 4533
4580 if (qcfs_rq->load.weight) 4534 if (qcfs_rq->load.weight)
4581 dequeue = 0; 4535 dequeue = 0;
@@ -4615,7 +4569,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4615 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 4569 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4616 struct sched_entity *se; 4570 struct sched_entity *se;
4617 int enqueue = 1; 4571 int enqueue = 1;
4618 long task_delta; 4572 long task_delta, idle_task_delta;
4619 4573
4620 se = cfs_rq->tg->se[cpu_of(rq)]; 4574 se = cfs_rq->tg->se[cpu_of(rq)];
4621 4575
@@ -4635,6 +4589,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4635 return; 4589 return;
4636 4590
4637 task_delta = cfs_rq->h_nr_running; 4591 task_delta = cfs_rq->h_nr_running;
4592 idle_task_delta = cfs_rq->idle_h_nr_running;
4638 for_each_sched_entity(se) { 4593 for_each_sched_entity(se) {
4639 if (se->on_rq) 4594 if (se->on_rq)
4640 enqueue = 0; 4595 enqueue = 0;
@@ -4643,6 +4598,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4643 if (enqueue) 4598 if (enqueue)
4644 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); 4599 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4645 cfs_rq->h_nr_running += task_delta; 4600 cfs_rq->h_nr_running += task_delta;
4601 cfs_rq->idle_h_nr_running += idle_task_delta;
4646 4602
4647 if (cfs_rq_throttled(cfs_rq)) 4603 if (cfs_rq_throttled(cfs_rq))
4648 break; 4604 break;
@@ -4658,8 +4614,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4658 resched_curr(rq); 4614 resched_curr(rq);
4659} 4615}
4660 4616
4661static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, 4617static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining)
4662 u64 remaining, u64 expires)
4663{ 4618{
4664 struct cfs_rq *cfs_rq; 4619 struct cfs_rq *cfs_rq;
4665 u64 runtime; 4620 u64 runtime;
@@ -4684,7 +4639,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4684 remaining -= runtime; 4639 remaining -= runtime;
4685 4640
4686 cfs_rq->runtime_remaining += runtime; 4641 cfs_rq->runtime_remaining += runtime;
4687 cfs_rq->runtime_expires = expires;
4688 4642
4689 /* we check whether we're throttled above */ 4643 /* we check whether we're throttled above */
4690 if (cfs_rq->runtime_remaining > 0) 4644 if (cfs_rq->runtime_remaining > 0)
@@ -4709,7 +4663,7 @@ next:
4709 */ 4663 */
4710static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) 4664static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags)
4711{ 4665{
4712 u64 runtime, runtime_expires; 4666 u64 runtime;
4713 int throttled; 4667 int throttled;
4714 4668
4715 /* no need to continue the timer with no bandwidth constraint */ 4669 /* no need to continue the timer with no bandwidth constraint */
@@ -4737,8 +4691,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
4737 /* account preceding periods in which throttling occurred */ 4691 /* account preceding periods in which throttling occurred */
4738 cfs_b->nr_throttled += overrun; 4692 cfs_b->nr_throttled += overrun;
4739 4693
4740 runtime_expires = cfs_b->runtime_expires;
4741
4742 /* 4694 /*
4743 * This check is repeated as we are holding onto the new bandwidth while 4695 * This check is repeated as we are holding onto the new bandwidth while
4744 * we unthrottle. This can potentially race with an unthrottled group 4696 * we unthrottle. This can potentially race with an unthrottled group
@@ -4751,8 +4703,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
4751 cfs_b->distribute_running = 1; 4703 cfs_b->distribute_running = 1;
4752 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 4704 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4753 /* we can't nest cfs_b->lock while distributing bandwidth */ 4705 /* we can't nest cfs_b->lock while distributing bandwidth */
4754 runtime = distribute_cfs_runtime(cfs_b, runtime, 4706 runtime = distribute_cfs_runtime(cfs_b, runtime);
4755 runtime_expires);
4756 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4707 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4757 4708
4758 cfs_b->distribute_running = 0; 4709 cfs_b->distribute_running = 0;
@@ -4834,8 +4785,7 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4834 return; 4785 return;
4835 4786
4836 raw_spin_lock(&cfs_b->lock); 4787 raw_spin_lock(&cfs_b->lock);
4837 if (cfs_b->quota != RUNTIME_INF && 4788 if (cfs_b->quota != RUNTIME_INF) {
4838 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4839 cfs_b->runtime += slack_runtime; 4789 cfs_b->runtime += slack_runtime;
4840 4790
4841 /* we are under rq->lock, defer unthrottling using a timer */ 4791 /* we are under rq->lock, defer unthrottling using a timer */
@@ -4868,7 +4818,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4868{ 4818{
4869 u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); 4819 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4870 unsigned long flags; 4820 unsigned long flags;
4871 u64 expires;
4872 4821
4873 /* confirm we're still not at a refresh boundary */ 4822 /* confirm we're still not at a refresh boundary */
4874 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4823 raw_spin_lock_irqsave(&cfs_b->lock, flags);
@@ -4886,7 +4835,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4886 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) 4835 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4887 runtime = cfs_b->runtime; 4836 runtime = cfs_b->runtime;
4888 4837
4889 expires = cfs_b->runtime_expires;
4890 if (runtime) 4838 if (runtime)
4891 cfs_b->distribute_running = 1; 4839 cfs_b->distribute_running = 1;
4892 4840
@@ -4895,11 +4843,10 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4895 if (!runtime) 4843 if (!runtime)
4896 return; 4844 return;
4897 4845
4898 runtime = distribute_cfs_runtime(cfs_b, runtime, expires); 4846 runtime = distribute_cfs_runtime(cfs_b, runtime);
4899 4847
4900 raw_spin_lock_irqsave(&cfs_b->lock, flags); 4848 raw_spin_lock_irqsave(&cfs_b->lock, flags);
4901 if (expires == cfs_b->runtime_expires) 4849 lsub_positive(&cfs_b->runtime, runtime);
4902 lsub_positive(&cfs_b->runtime, runtime);
4903 cfs_b->distribute_running = 0; 4850 cfs_b->distribute_running = 0;
4904 raw_spin_unlock_irqrestore(&cfs_b->lock, flags); 4851 raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
4905} 4852}
@@ -5056,8 +5003,6 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5056 5003
5057 cfs_b->period_active = 1; 5004 cfs_b->period_active = 1;
5058 overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); 5005 overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
5059 cfs_b->runtime_expires += (overrun + 1) * ktime_to_ns(cfs_b->period);
5060 cfs_b->expires_seq++;
5061 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); 5006 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
5062} 5007}
5063 5008
@@ -5235,7 +5180,7 @@ static inline unsigned long cpu_util(int cpu);
5235 5180
5236static inline bool cpu_overutilized(int cpu) 5181static inline bool cpu_overutilized(int cpu)
5237{ 5182{
5238 return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); 5183 return !fits_capacity(cpu_util(cpu), capacity_of(cpu));
5239} 5184}
5240 5185
5241static inline void update_overutilized_status(struct rq *rq) 5186static inline void update_overutilized_status(struct rq *rq)
@@ -5259,6 +5204,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5259{ 5204{
5260 struct cfs_rq *cfs_rq; 5205 struct cfs_rq *cfs_rq;
5261 struct sched_entity *se = &p->se; 5206 struct sched_entity *se = &p->se;
5207 int idle_h_nr_running = task_has_idle_policy(p);
5262 5208
5263 /* 5209 /*
5264 * The code below (indirectly) updates schedutil which looks at 5210 * The code below (indirectly) updates schedutil which looks at
@@ -5291,6 +5237,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5291 if (cfs_rq_throttled(cfs_rq)) 5237 if (cfs_rq_throttled(cfs_rq))
5292 break; 5238 break;
5293 cfs_rq->h_nr_running++; 5239 cfs_rq->h_nr_running++;
5240 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5294 5241
5295 flags = ENQUEUE_WAKEUP; 5242 flags = ENQUEUE_WAKEUP;
5296 } 5243 }
@@ -5298,6 +5245,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5298 for_each_sched_entity(se) { 5245 for_each_sched_entity(se) {
5299 cfs_rq = cfs_rq_of(se); 5246 cfs_rq = cfs_rq_of(se);
5300 cfs_rq->h_nr_running++; 5247 cfs_rq->h_nr_running++;
5248 cfs_rq->idle_h_nr_running += idle_h_nr_running;
5301 5249
5302 if (cfs_rq_throttled(cfs_rq)) 5250 if (cfs_rq_throttled(cfs_rq))
5303 break; 5251 break;
@@ -5359,6 +5307,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5359 struct cfs_rq *cfs_rq; 5307 struct cfs_rq *cfs_rq;
5360 struct sched_entity *se = &p->se; 5308 struct sched_entity *se = &p->se;
5361 int task_sleep = flags & DEQUEUE_SLEEP; 5309 int task_sleep = flags & DEQUEUE_SLEEP;
5310 int idle_h_nr_running = task_has_idle_policy(p);
5362 5311
5363 for_each_sched_entity(se) { 5312 for_each_sched_entity(se) {
5364 cfs_rq = cfs_rq_of(se); 5313 cfs_rq = cfs_rq_of(se);
@@ -5373,6 +5322,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5373 if (cfs_rq_throttled(cfs_rq)) 5322 if (cfs_rq_throttled(cfs_rq))
5374 break; 5323 break;
5375 cfs_rq->h_nr_running--; 5324 cfs_rq->h_nr_running--;
5325 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5376 5326
5377 /* Don't dequeue parent if it has other entities besides us */ 5327 /* Don't dequeue parent if it has other entities besides us */
5378 if (cfs_rq->load.weight) { 5328 if (cfs_rq->load.weight) {
@@ -5392,6 +5342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
5392 for_each_sched_entity(se) { 5342 for_each_sched_entity(se) {
5393 cfs_rq = cfs_rq_of(se); 5343 cfs_rq = cfs_rq_of(se);
5394 cfs_rq->h_nr_running--; 5344 cfs_rq->h_nr_running--;
5345 cfs_rq->idle_h_nr_running -= idle_h_nr_running;
5395 5346
5396 if (cfs_rq_throttled(cfs_rq)) 5347 if (cfs_rq_throttled(cfs_rq))
5397 break; 5348 break;
@@ -5425,6 +5376,15 @@ static struct {
5425 5376
5426#endif /* CONFIG_NO_HZ_COMMON */ 5377#endif /* CONFIG_NO_HZ_COMMON */
5427 5378
5379/* CPU only has SCHED_IDLE tasks enqueued */
5380static int sched_idle_cpu(int cpu)
5381{
5382 struct rq *rq = cpu_rq(cpu);
5383
5384 return unlikely(rq->nr_running == rq->cfs.idle_h_nr_running &&
5385 rq->nr_running);
5386}
5387
5428static unsigned long cpu_runnable_load(struct rq *rq) 5388static unsigned long cpu_runnable_load(struct rq *rq)
5429{ 5389{
5430 return cfs_rq_runnable_load_avg(&rq->cfs); 5390 return cfs_rq_runnable_load_avg(&rq->cfs);
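
The new idle_h_nr_running counters and sched_idle_cpu() above treat a CPU whose runnable tasks are all SCHED_IDLE as a wake-up target nearly as good as a truly idle one. For context, this is how a task ends up in that class from userspace; a minimal sketch using the standard sched_setscheduler() wrapper, with error handling kept to a bare minimum:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param param = { .sched_priority = 0 };

        /* SCHED_IDLE tasks only run when nothing else wants the CPU. */
        if (sched_setscheduler(0, SCHED_IDLE, &param)) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("policy now: %d (SCHED_IDLE is %d)\n",
               sched_getscheduler(0), SCHED_IDLE);
        return 0;
}
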
@@ -5747,7 +5707,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5747 unsigned int min_exit_latency = UINT_MAX; 5707 unsigned int min_exit_latency = UINT_MAX;
5748 u64 latest_idle_timestamp = 0; 5708 u64 latest_idle_timestamp = 0;
5749 int least_loaded_cpu = this_cpu; 5709 int least_loaded_cpu = this_cpu;
5750 int shallowest_idle_cpu = -1; 5710 int shallowest_idle_cpu = -1, si_cpu = -1;
5751 int i; 5711 int i;
5752 5712
5753 /* Check if we have any choice: */ 5713 /* Check if we have any choice: */
@@ -5778,7 +5738,12 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5778 latest_idle_timestamp = rq->idle_stamp; 5738 latest_idle_timestamp = rq->idle_stamp;
5779 shallowest_idle_cpu = i; 5739 shallowest_idle_cpu = i;
5780 } 5740 }
5781 } else if (shallowest_idle_cpu == -1) { 5741 } else if (shallowest_idle_cpu == -1 && si_cpu == -1) {
5742 if (sched_idle_cpu(i)) {
5743 si_cpu = i;
5744 continue;
5745 }
5746
5782 load = cpu_runnable_load(cpu_rq(i)); 5747 load = cpu_runnable_load(cpu_rq(i));
5783 if (load < min_load) { 5748 if (load < min_load) {
5784 min_load = load; 5749 min_load = load;
@@ -5787,7 +5752,11 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
5787 } 5752 }
5788 } 5753 }
5789 5754
5790 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; 5755 if (shallowest_idle_cpu != -1)
5756 return shallowest_idle_cpu;
5757 if (si_cpu != -1)
5758 return si_cpu;
5759 return least_loaded_cpu;
5791} 5760}
5792 5761
5793static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, 5762static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
@@ -5940,7 +5909,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
5940 */ 5909 */
5941static int select_idle_smt(struct task_struct *p, int target) 5910static int select_idle_smt(struct task_struct *p, int target)
5942{ 5911{
5943 int cpu; 5912 int cpu, si_cpu = -1;
5944 5913
5945 if (!static_branch_likely(&sched_smt_present)) 5914 if (!static_branch_likely(&sched_smt_present))
5946 return -1; 5915 return -1;
@@ -5950,9 +5919,11 @@ static int select_idle_smt(struct task_struct *p, int target)
5950 continue; 5919 continue;
5951 if (available_idle_cpu(cpu)) 5920 if (available_idle_cpu(cpu))
5952 return cpu; 5921 return cpu;
5922 if (si_cpu == -1 && sched_idle_cpu(cpu))
5923 si_cpu = cpu;
5953 } 5924 }
5954 5925
5955 return -1; 5926 return si_cpu;
5956} 5927}
5957 5928
5958#else /* CONFIG_SCHED_SMT */ 5929#else /* CONFIG_SCHED_SMT */
@@ -5980,8 +5951,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
5980 u64 avg_cost, avg_idle; 5951 u64 avg_cost, avg_idle;
5981 u64 time, cost; 5952 u64 time, cost;
5982 s64 delta; 5953 s64 delta;
5983 int cpu, nr = INT_MAX;
5984 int this = smp_processor_id(); 5954 int this = smp_processor_id();
5955 int cpu, nr = INT_MAX, si_cpu = -1;
5985 5956
5986 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); 5957 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5987 if (!this_sd) 5958 if (!this_sd)
@@ -6009,11 +5980,13 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
6009 5980
6010 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) { 5981 for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
6011 if (!--nr) 5982 if (!--nr)
6012 return -1; 5983 return si_cpu;
6013 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 5984 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6014 continue; 5985 continue;
6015 if (available_idle_cpu(cpu)) 5986 if (available_idle_cpu(cpu))
6016 break; 5987 break;
5988 if (si_cpu == -1 && sched_idle_cpu(cpu))
5989 si_cpu = cpu;
6017 } 5990 }
6018 5991
6019 time = cpu_clock(this) - time; 5992 time = cpu_clock(this) - time;
@@ -6032,13 +6005,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6032 struct sched_domain *sd; 6005 struct sched_domain *sd;
6033 int i, recent_used_cpu; 6006 int i, recent_used_cpu;
6034 6007
6035 if (available_idle_cpu(target)) 6008 if (available_idle_cpu(target) || sched_idle_cpu(target))
6036 return target; 6009 return target;
6037 6010
6038 /* 6011 /*
6039 * If the previous CPU is cache affine and idle, don't be stupid: 6012 * If the previous CPU is cache affine and idle, don't be stupid:
6040 */ 6013 */
6041 if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) 6014 if (prev != target && cpus_share_cache(prev, target) &&
6015 (available_idle_cpu(prev) || sched_idle_cpu(prev)))
6042 return prev; 6016 return prev;
6043 6017
6044 /* Check a recently used CPU as a potential idle candidate: */ 6018 /* Check a recently used CPU as a potential idle candidate: */
@@ -6046,7 +6020,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6046 if (recent_used_cpu != prev && 6020 if (recent_used_cpu != prev &&
6047 recent_used_cpu != target && 6021 recent_used_cpu != target &&
6048 cpus_share_cache(recent_used_cpu, target) && 6022 cpus_share_cache(recent_used_cpu, target) &&
6049 available_idle_cpu(recent_used_cpu) && 6023 (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
6050 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { 6024 cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
6051 /* 6025 /*
6052 * Replace recent_used_cpu with prev as it is a potential 6026 * Replace recent_used_cpu with prev as it is a potential
@@ -6282,69 +6256,55 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
6282} 6256}
6283 6257
6284/* 6258/*
6285 * compute_energy(): Estimates the energy that would be consumed if @p was 6259 * compute_energy(): Estimates the energy that @pd would consume if @p was
6286 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization 6260 * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
6287 * landscape of the * CPUs after the task migration, and uses the Energy Model 6261 * landscape of @pd's CPUs after the task migration, and uses the Energy Model
6288 * to compute what would be the energy if we decided to actually migrate that 6262 * to compute what would be the energy if we decided to actually migrate that
6289 * task. 6263 * task.
6290 */ 6264 */
6291static long 6265static long
6292compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) 6266compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6293{ 6267{
6294 unsigned int max_util, util_cfs, cpu_util, cpu_cap; 6268 struct cpumask *pd_mask = perf_domain_span(pd);
6295 unsigned long sum_util, energy = 0; 6269 unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask));
6296 struct task_struct *tsk; 6270 unsigned long max_util = 0, sum_util = 0;
6297 int cpu; 6271 int cpu;
6298 6272
6299 for (; pd; pd = pd->next) { 6273 /*
6300 struct cpumask *pd_mask = perf_domain_span(pd); 6274 * The capacity state of CPUs of the current rd can be driven by CPUs
6275 * of another rd if they belong to the same pd. So, account for the
6276 * utilization of these CPUs too by masking pd with cpu_online_mask
6277 * instead of the rd span.
6278 *
6279 * If an entire pd is outside of the current rd, it will not appear in
6280 * its pd list and will not be accounted by compute_energy().
6281 */
6282 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
6283 unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
6284 struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
6301 6285
6302 /* 6286 /*
6303 * The energy model mandates all the CPUs of a performance 6287 * Busy time computation: utilization clamping is not
6304 * domain have the same capacity. 6288 * required since the ratio (sum_util / cpu_capacity)
6289 * is already enough to scale the EM reported power
6290 * consumption at the (eventually clamped) cpu_capacity.
6305 */ 6291 */
6306 cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); 6292 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6307 max_util = sum_util = 0; 6293 ENERGY_UTIL, NULL);
6308 6294
6309 /* 6295 /*
6310 * The capacity state of CPUs of the current rd can be driven by 6296 * Performance domain frequency: utilization clamping
6311 * CPUs of another rd if they belong to the same performance 6297 * must be considered since it affects the selection
6312 * domain. So, account for the utilization of these CPUs too 6298 * of the performance domain frequency.
6313 * by masking pd with cpu_online_mask instead of the rd span. 6299 * NOTE: in case RT tasks are running, by default the
6314 * 6300 * FREQUENCY_UTIL's utilization can be max OPP.
6315 * If an entire performance domain is outside of the current rd,
6316 * it will not appear in its pd list and will not be accounted
6317 * by compute_energy().
6318 */ 6301 */
6319 for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { 6302 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6320 util_cfs = cpu_util_next(cpu, p, dst_cpu); 6303 FREQUENCY_UTIL, tsk);
6321 6304 max_util = max(max_util, cpu_util);
6322 /*
6323 * Busy time computation: utilization clamping is not
6324 * required since the ratio (sum_util / cpu_capacity)
6325 * is already enough to scale the EM reported power
6326 * consumption at the (eventually clamped) cpu_capacity.
6327 */
6328 sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6329 ENERGY_UTIL, NULL);
6330
6331 /*
6332 * Performance domain frequency: utilization clamping
6333 * must be considered since it affects the selection
6334 * of the performance domain frequency.
6335 * NOTE: in case RT tasks are running, by default the
6336 * FREQUENCY_UTIL's utilization can be max OPP.
6337 */
6338 tsk = cpu == dst_cpu ? p : NULL;
6339 cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap,
6340 FREQUENCY_UTIL, tsk);
6341 max_util = max(max_util, cpu_util);
6342 }
6343
6344 energy += em_pd_energy(pd->em_pd, max_util, sum_util);
6345 } 6305 }
6346 6306
6347 return energy; 6307 return em_pd_energy(pd->em_pd, max_util, sum_util);
6348} 6308}
6349 6309
6350/* 6310/*
@@ -6386,21 +6346,19 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
6386 * other use-cases too. So, until someone finds a better way to solve this, 6346 * other use-cases too. So, until someone finds a better way to solve this,
6387 * let's keep things simple by re-using the existing slow path. 6347 * let's keep things simple by re-using the existing slow path.
6388 */ 6348 */
6389
6390static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) 6349static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6391{ 6350{
6392 unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX; 6351 unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX;
6393 struct root_domain *rd = cpu_rq(smp_processor_id())->rd; 6352 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6353 unsigned long cpu_cap, util, base_energy = 0;
6394 int cpu, best_energy_cpu = prev_cpu; 6354 int cpu, best_energy_cpu = prev_cpu;
6395 struct perf_domain *head, *pd;
6396 unsigned long cpu_cap, util;
6397 struct sched_domain *sd; 6355 struct sched_domain *sd;
6356 struct perf_domain *pd;
6398 6357
6399 rcu_read_lock(); 6358 rcu_read_lock();
6400 pd = rcu_dereference(rd->pd); 6359 pd = rcu_dereference(rd->pd);
6401 if (!pd || READ_ONCE(rd->overutilized)) 6360 if (!pd || READ_ONCE(rd->overutilized))
6402 goto fail; 6361 goto fail;
6403 head = pd;
6404 6362
6405 /* 6363 /*
6406 * Energy-aware wake-up happens on the lowest sched_domain starting 6364 * Energy-aware wake-up happens on the lowest sched_domain starting
@@ -6417,9 +6375,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6417 goto unlock; 6375 goto unlock;
6418 6376
6419 for (; pd; pd = pd->next) { 6377 for (; pd; pd = pd->next) {
6420 unsigned long cur_energy, spare_cap, max_spare_cap = 0; 6378 unsigned long cur_delta, spare_cap, max_spare_cap = 0;
6379 unsigned long base_energy_pd;
6421 int max_spare_cap_cpu = -1; 6380 int max_spare_cap_cpu = -1;
6422 6381
6382 /* Compute the 'base' energy of the pd, without @p */
6383 base_energy_pd = compute_energy(p, -1, pd);
6384 base_energy += base_energy_pd;
6385
6423 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { 6386 for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
6424 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) 6387 if (!cpumask_test_cpu(cpu, p->cpus_ptr))
6425 continue; 6388 continue;
@@ -6427,14 +6390,14 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6427 /* Skip CPUs that will be overutilized. */ 6390 /* Skip CPUs that will be overutilized. */
6428 util = cpu_util_next(cpu, p, cpu); 6391 util = cpu_util_next(cpu, p, cpu);
6429 cpu_cap = capacity_of(cpu); 6392 cpu_cap = capacity_of(cpu);
6430 if (cpu_cap * 1024 < util * capacity_margin) 6393 if (!fits_capacity(util, cpu_cap))
6431 continue; 6394 continue;
6432 6395
6433 /* Always use prev_cpu as a candidate. */ 6396 /* Always use prev_cpu as a candidate. */
6434 if (cpu == prev_cpu) { 6397 if (cpu == prev_cpu) {
6435 prev_energy = compute_energy(p, prev_cpu, head); 6398 prev_delta = compute_energy(p, prev_cpu, pd);
6436 best_energy = min(best_energy, prev_energy); 6399 prev_delta -= base_energy_pd;
6437 continue; 6400 best_delta = min(best_delta, prev_delta);
6438 } 6401 }
6439 6402
6440 /* 6403 /*
@@ -6450,9 +6413,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
6450 6413
6451 /* Evaluate the energy impact of using this CPU. */ 6414 /* Evaluate the energy impact of using this CPU. */
6452 if (max_spare_cap_cpu >= 0) { 6415 if (max_spare_cap_cpu >= 0) {
6453 cur_energy = compute_energy(p, max_spare_cap_cpu, head); 6416 cur_delta = compute_energy(p, max_spare_cap_cpu, pd);
6454 if (cur_energy < best_energy) { 6417 cur_delta -= base_energy_pd;
6455 best_energy = cur_energy; 6418 if (cur_delta < best_delta) {
6419 best_delta = cur_delta;
6456 best_energy_cpu = max_spare_cap_cpu; 6420 best_energy_cpu = max_spare_cap_cpu;
6457 } 6421 }
6458 } 6422 }
@@ -6464,10 +6428,10 @@ unlock:
6464 * Pick the best CPU if prev_cpu cannot be used, or if it saves at 6428 * Pick the best CPU if prev_cpu cannot be used, or if it saves at
6465 * least 6% of the energy used by prev_cpu. 6429 * least 6% of the energy used by prev_cpu.
6466 */ 6430 */
6467 if (prev_energy == ULONG_MAX) 6431 if (prev_delta == ULONG_MAX)
6468 return best_energy_cpu; 6432 return best_energy_cpu;
6469 6433
6470 if ((prev_energy - best_energy) > (prev_energy >> 4)) 6434 if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4))
6471 return best_energy_cpu; 6435 return best_energy_cpu;
6472 6436
6473 return prev_cpu; 6437 return prev_cpu;
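
The rework above computes, per performance domain, a baseline energy without the task (base_energy_pd) and then ranks candidate CPUs by their energy delta against that baseline, so each candidate needs a single compute_energy() call for its own domain instead of a walk over every domain; the final 6%-savings cut-off is likewise evaluated against prev_delta and (prev_delta + base_energy) >> 4 rather than an absolute prev_energy. A toy model of that loop shape follows, with an invented energy function and made-up utilization numbers; nothing here is the kernel's Energy Model:

#include <stdio.h>
#include <limits.h>

struct pd {
        unsigned long busy[2];  /* per-CPU utilization, 0..1024 */
        int nr_cpus;
};

/* Crude stand-in for em_pd_energy(): the domain clocks up for its busiest CPU. */
static unsigned long pd_energy(const struct pd *pd, int dst_cpu, unsigned long extra)
{
        unsigned long max = 0, sum = 0;

        for (int i = 0; i < pd->nr_cpus; i++) {
                unsigned long u = pd->busy[i] + (i == dst_cpu ? extra : 0);

                sum += u;
                if (u > max)
                        max = u;
        }
        return max * sum / 1024;
}

int main(void)
{
        struct pd pds[2] = {
                { .busy = { 600, 500 }, .nr_cpus = 2 },  /* busy "big" domain     */
                { .busy = { 100,  50 }, .nr_cpus = 2 },  /* quiet "little" domain */
        };
        unsigned long task_util = 120;
        unsigned long best_delta = ULONG_MAX, base_energy = 0;
        int best_pd = -1, best_cpu = -1;

        for (int d = 0; d < 2; d++) {
                /* One baseline per domain, computed without the waking task. */
                unsigned long base_pd = pd_energy(&pds[d], -1, 0);

                base_energy += base_pd;

                for (int cpu = 0; cpu < pds[d].nr_cpus; cpu++) {
                        unsigned long delta =
                                pd_energy(&pds[d], cpu, task_util) - base_pd;

                        if (delta < best_delta) {
                                best_delta = delta;
                                best_pd = d;
                                best_cpu = cpu;
                        }
                }
        }

        printf("cheapest placement: pd %d cpu %d, delta %lu (base %lu)\n",
               best_pd, best_cpu, best_delta, base_energy);
        return 0;
}
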
@@ -6801,7 +6765,7 @@ again:
6801 goto idle; 6765 goto idle;
6802 6766
6803#ifdef CONFIG_FAIR_GROUP_SCHED 6767#ifdef CONFIG_FAIR_GROUP_SCHED
6804 if (prev->sched_class != &fair_sched_class) 6768 if (!prev || prev->sched_class != &fair_sched_class)
6805 goto simple; 6769 goto simple;
6806 6770
6807 /* 6771 /*
@@ -6878,8 +6842,8 @@ again:
6878 goto done; 6842 goto done;
6879simple: 6843simple:
6880#endif 6844#endif
6881 6845 if (prev)
6882 put_prev_task(rq, prev); 6846 put_prev_task(rq, prev);
6883 6847
6884 do { 6848 do {
6885 se = pick_next_entity(cfs_rq, NULL); 6849 se = pick_next_entity(cfs_rq, NULL);
@@ -6907,11 +6871,13 @@ done: __maybe_unused;
6907 return p; 6871 return p;
6908 6872
6909idle: 6873idle:
6910 update_misfit_status(NULL, rq); 6874 if (!rf)
6911 new_tasks = idle_balance(rq, rf); 6875 return NULL;
6876
6877 new_tasks = newidle_balance(rq, rf);
6912 6878
6913 /* 6879 /*
6914 * Because idle_balance() releases (and re-acquires) rq->lock, it is 6880 * Because newidle_balance() releases (and re-acquires) rq->lock, it is
6915 * possible for any higher priority task to appear. In that case we 6881 * possible for any higher priority task to appear. In that case we
6916 * must re-start the pick_next_entity() loop. 6882 * must re-start the pick_next_entity() loop.
6917 */ 6883 */
@@ -6933,7 +6899,7 @@ idle:
6933/* 6899/*
6934 * Account for a descheduled task: 6900 * Account for a descheduled task:
6935 */ 6901 */
6936static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) 6902static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
6937{ 6903{
6938 struct sched_entity *se = &prev->se; 6904 struct sched_entity *se = &prev->se;
6939 struct cfs_rq *cfs_rq; 6905 struct cfs_rq *cfs_rq;
@@ -7435,7 +7401,7 @@ static int detach_tasks(struct lb_env *env)
7435 detached++; 7401 detached++;
7436 env->imbalance -= load; 7402 env->imbalance -= load;
7437 7403
7438#ifdef CONFIG_PREEMPT 7404#ifdef CONFIG_PREEMPTION
7439 /* 7405 /*
7440 * NEWIDLE balancing is a source of latency, so preemptible 7406 * NEWIDLE balancing is a source of latency, so preemptible
7441 * kernels will stop after the first task is detached to minimize 7407 * kernels will stop after the first task is detached to minimize
@@ -7982,8 +7948,7 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
7982static inline bool 7948static inline bool
7983group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref) 7949group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7984{ 7950{
7985 return sg->sgc->min_capacity * capacity_margin < 7951 return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
7986 ref->sgc->min_capacity * 1024;
7987} 7952}
7988 7953
7989/* 7954/*
@@ -7993,8 +7958,7 @@ group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7993static inline bool 7958static inline bool
7994group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref) 7959group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7995{ 7960{
7996 return sg->sgc->max_capacity * capacity_margin < 7961 return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
7997 ref->sgc->max_capacity * 1024;
7998} 7962}
7999 7963
8000static inline enum 7964static inline enum
@@ -9052,9 +9016,10 @@ more_balance:
9052out_balanced: 9016out_balanced:
9053 /* 9017 /*
9054 * We reach balance although we may have faced some affinity 9018 * We reach balance although we may have faced some affinity
9055 * constraints. Clear the imbalance flag if it was set. 9019 * constraints. Clear the imbalance flag only if other tasks got
9020 * a chance to move and fix the imbalance.
9056 */ 9021 */
9057 if (sd_parent) { 9022 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
9058 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 9023 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9059 9024
9060 if (*group_imbalance) 9025 if (*group_imbalance)
@@ -9075,10 +9040,10 @@ out_one_pinned:
9075 ld_moved = 0; 9040 ld_moved = 0;
9076 9041
9077 /* 9042 /*
9078 * idle_balance() disregards balance intervals, so we could repeatedly 9043 * newidle_balance() disregards balance intervals, so we could
9079 * reach this code, which would lead to balance_interval skyrocketting 9044 * repeatedly reach this code, which would lead to balance_interval
9080 * in a short amount of time. Skip the balance_interval increase logic 9045 * skyrocketting in a short amount of time. Skip the balance_interval
9081 * to avoid that. 9046 * increase logic to avoid that.
9082 */ 9047 */
9083 if (env.idle == CPU_NEWLY_IDLE) 9048 if (env.idle == CPU_NEWLY_IDLE)
9084 goto out; 9049 goto out;
@@ -9788,7 +9753,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
9788 * idle_balance is called by schedule() if this_cpu is about to become 9753 * idle_balance is called by schedule() if this_cpu is about to become
9789 * idle. Attempts to pull tasks from other CPUs. 9754 * idle. Attempts to pull tasks from other CPUs.
9790 */ 9755 */
9791static int idle_balance(struct rq *this_rq, struct rq_flags *rf) 9756int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
9792{ 9757{
9793 unsigned long next_balance = jiffies + HZ; 9758 unsigned long next_balance = jiffies + HZ;
9794 int this_cpu = this_rq->cpu; 9759 int this_cpu = this_rq->cpu;
@@ -9796,6 +9761,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
9796 int pulled_task = 0; 9761 int pulled_task = 0;
9797 u64 curr_cost = 0; 9762 u64 curr_cost = 0;
9798 9763
9764 update_misfit_status(NULL, this_rq);
9799 /* 9765 /*
9800 * We must set idle_stamp _before_ calling idle_balance(), such that we 9766 * We must set idle_stamp _before_ calling idle_balance(), such that we
9801 * measure the duration of idle_balance() as idle time. 9767 * measure the duration of idle_balance() as idle time.
@@ -10180,9 +10146,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p)
10180 * This routine is mostly called to set cfs_rq->curr field when a task 10146 * This routine is mostly called to set cfs_rq->curr field when a task
10181 * migrates between groups/classes. 10147 * migrates between groups/classes.
10182 */ 10148 */
10183static void set_curr_task_fair(struct rq *rq) 10149static void set_next_task_fair(struct rq *rq, struct task_struct *p)
10184{ 10150{
10185 struct sched_entity *se = &rq->curr->se; 10151 struct sched_entity *se = &p->se;
10152
10153#ifdef CONFIG_SMP
10154 if (task_on_rq_queued(p)) {
10155 /*
10156 * Move the next running task to the front of the list, so our
10157 * cfs_tasks list becomes MRU one.
10158 */
10159 list_move(&se->group_node, &rq->cfs_tasks);
10160 }
10161#endif
10186 10162
10187 for_each_sched_entity(se) { 10163 for_each_sched_entity(se) {
10188 struct cfs_rq *cfs_rq = cfs_rq_of(se); 10164 struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -10300,18 +10276,18 @@ err:
10300void online_fair_sched_group(struct task_group *tg) 10276void online_fair_sched_group(struct task_group *tg)
10301{ 10277{
10302 struct sched_entity *se; 10278 struct sched_entity *se;
10279 struct rq_flags rf;
10303 struct rq *rq; 10280 struct rq *rq;
10304 int i; 10281 int i;
10305 10282
10306 for_each_possible_cpu(i) { 10283 for_each_possible_cpu(i) {
10307 rq = cpu_rq(i); 10284 rq = cpu_rq(i);
10308 se = tg->se[i]; 10285 se = tg->se[i];
10309 10286 rq_lock_irq(rq, &rf);
10310 raw_spin_lock_irq(&rq->lock);
10311 update_rq_clock(rq); 10287 update_rq_clock(rq);
10312 attach_entity_cfs_rq(se); 10288 attach_entity_cfs_rq(se);
10313 sync_throttle(tg, i); 10289 sync_throttle(tg, i);
10314 raw_spin_unlock_irq(&rq->lock); 10290 rq_unlock_irq(rq, &rf);
10315 } 10291 }
10316} 10292}
10317 10293
@@ -10453,7 +10429,9 @@ const struct sched_class fair_sched_class = {
10453 .check_preempt_curr = check_preempt_wakeup, 10429 .check_preempt_curr = check_preempt_wakeup,
10454 10430
10455 .pick_next_task = pick_next_task_fair, 10431 .pick_next_task = pick_next_task_fair,
10432
10456 .put_prev_task = put_prev_task_fair, 10433 .put_prev_task = put_prev_task_fair,
10434 .set_next_task = set_next_task_fair,
10457 10435
10458#ifdef CONFIG_SMP 10436#ifdef CONFIG_SMP
10459 .select_task_rq = select_task_rq_fair, 10437 .select_task_rq = select_task_rq_fair,
@@ -10466,7 +10444,6 @@ const struct sched_class fair_sched_class = {
10466 .set_cpus_allowed = set_cpus_allowed_common, 10444 .set_cpus_allowed = set_cpus_allowed_common,
10467#endif 10445#endif
10468 10446
10469 .set_curr_task = set_curr_task_fair,
10470 .task_tick = task_tick_fair, 10447 .task_tick = task_tick_fair,
10471 .task_fork = task_fork_fair, 10448 .task_fork = task_fork_fair,
10472 10449
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index e4bc4aa739b8..8bfeb6395bdd 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -375,14 +375,27 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
375 resched_curr(rq); 375 resched_curr(rq);
376} 376}
377 377
378static struct task_struct * 378static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
379pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 379{
380}
381
382static void set_next_task_idle(struct rq *rq, struct task_struct *next)
380{ 383{
381 put_prev_task(rq, prev);
382 update_idle_core(rq); 384 update_idle_core(rq);
383 schedstat_inc(rq->sched_goidle); 385 schedstat_inc(rq->sched_goidle);
386}
387
388static struct task_struct *
389pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
390{
391 struct task_struct *next = rq->idle;
392
393 if (prev)
394 put_prev_task(rq, prev);
395
396 set_next_task_idle(rq, next);
384 397
385 return rq->idle; 398 return next;
386} 399}
387 400
388/* 401/*
@@ -398,10 +411,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
398 raw_spin_lock_irq(&rq->lock); 411 raw_spin_lock_irq(&rq->lock);
399} 412}
400 413
401static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
402{
403}
404
405/* 414/*
406 * scheduler tick hitting a task of our scheduling class. 415 * scheduler tick hitting a task of our scheduling class.
407 * 416 *
@@ -414,10 +423,6 @@ static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
414{ 423{
415} 424}
416 425
417static void set_curr_task_idle(struct rq *rq)
418{
419}
420
421static void switched_to_idle(struct rq *rq, struct task_struct *p) 426static void switched_to_idle(struct rq *rq, struct task_struct *p)
422{ 427{
423 BUG(); 428 BUG();
@@ -452,13 +457,13 @@ const struct sched_class idle_sched_class = {
452 457
453 .pick_next_task = pick_next_task_idle, 458 .pick_next_task = pick_next_task_idle,
454 .put_prev_task = put_prev_task_idle, 459 .put_prev_task = put_prev_task_idle,
460 .set_next_task = set_next_task_idle,
455 461
456#ifdef CONFIG_SMP 462#ifdef CONFIG_SMP
457 .select_task_rq = select_task_rq_idle, 463 .select_task_rq = select_task_rq_idle,
458 .set_cpus_allowed = set_cpus_allowed_common, 464 .set_cpus_allowed = set_cpus_allowed_common,
459#endif 465#endif
460 466
461 .set_curr_task = set_curr_task_idle,
462 .task_tick = task_tick_idle, 467 .task_tick = task_tick_idle,
463 468
464 .get_rr_interval = get_rr_interval_idle, 469 .get_rr_interval = get_rr_interval_idle,
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index ccb28085b114..9fcb2a695a41 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -22,9 +22,17 @@ EXPORT_SYMBOL_GPL(housekeeping_enabled);
22 22
23int housekeeping_any_cpu(enum hk_flags flags) 23int housekeeping_any_cpu(enum hk_flags flags)
24{ 24{
25 if (static_branch_unlikely(&housekeeping_overridden)) 25 int cpu;
26 if (housekeeping_flags & flags) 26
27 if (static_branch_unlikely(&housekeeping_overridden)) {
28 if (housekeeping_flags & flags) {
29 cpu = sched_numa_find_closest(housekeeping_mask, smp_processor_id());
30 if (cpu < nr_cpu_ids)
31 return cpu;
32
27 return cpumask_any_and(housekeeping_mask, cpu_online_mask); 33 return cpumask_any_and(housekeeping_mask, cpu_online_mask);
34 }
35 }
28 return smp_processor_id(); 36 return smp_processor_id();
29} 37}
30EXPORT_SYMBOL_GPL(housekeeping_any_cpu); 38EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
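
housekeeping_any_cpu() now prefers a housekeeping CPU that is NUMA-close to the caller and only falls back to an arbitrary online one. A toy model of that preference order; the real code uses sched_numa_find_closest(), and the node/CPU tables below are invented:

#include <stdio.h>

#define NR_CPUS 8

static const int cpu_node[NR_CPUS]     = { 0, 0, 0, 0, 1, 1, 1, 1 };
static const int housekeeping[NR_CPUS] = { 0, 0, 0, 0, 0, 1, 1, 0 };

static int housekeeping_closest(int this_cpu)
{
        int fallback = -1;

        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!housekeeping[cpu])
                        continue;
                if (cpu_node[cpu] == cpu_node[this_cpu])
                        return cpu;     /* same node: best choice */
                if (fallback < 0)
                        fallback = cpu; /* remember any housekeeping CPU */
        }
        return fallback;
}

int main(void)
{
        printf("caller on CPU 6 -> %d\n", housekeeping_closest(6)); /* 5: same node  */
        printf("caller on CPU 1 -> %d\n", housekeeping_closest(1)); /* 5: remote node */
        return 0;
}
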
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 6e52b67b420e..517e3719027e 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1198,7 +1198,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
1198 if (static_branch_likely(&psi_disabled)) 1198 if (static_branch_likely(&psi_disabled))
1199 return -EOPNOTSUPP; 1199 return -EOPNOTSUPP;
1200 1200
1201 buf_size = min(nbytes, (sizeof(buf) - 1)); 1201 buf_size = min(nbytes, sizeof(buf));
1202 if (copy_from_user(buf, user_buf, buf_size)) 1202 if (copy_from_user(buf, user_buf, buf_size))
1203 return -EFAULT; 1203 return -EFAULT;
1204 1204
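
The psi.c change widens the accepted write size from sizeof(buf) - 1 to sizeof(buf). That is safe because psi_write() terminates the string in place after the copy (it writes a NUL over the last copied byte rather than appending one), so reserving an extra byte only cost one usable character of the trigger string. A small hedged demonstration of that bound-then-terminate pattern, with arbitrary sizes and names:

    #include <stdio.h>
    #include <string.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    /* models the psi_write() bounding: copy, then terminate in place */
    static void bound_copy(const char *user_buf, size_t nbytes)
    {
        char buf[32];
        size_t buf_size = MIN(nbytes, sizeof(buf));   /* was sizeof(buf) - 1 */

        memcpy(buf, user_buf, buf_size);              /* stands in for copy_from_user() */
        buf[buf_size - 1] = '\0';                     /* last byte is always the NUL */
        printf("kept %zu bytes: \"%s\"\n", buf_size, buf);
    }

    int main(void)
    {
        char input[64];

        memset(input, 'x', sizeof(input));
        input[63] = '\0';
        bound_copy(input, strlen(input) + 1);         /* 64 bytes offered, 32 kept */
        return 0;
    }
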
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a532558a5176..858c4cc6f99b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1498,12 +1498,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
1498#endif 1498#endif
1499} 1499}
1500 1500
1501static inline void set_next_task(struct rq *rq, struct task_struct *p) 1501static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
1502{ 1502{
1503 p->se.exec_start = rq_clock_task(rq); 1503 p->se.exec_start = rq_clock_task(rq);
1504 1504
1505 /* The running task is never eligible for pushing */ 1505 /* The running task is never eligible for pushing */
1506 dequeue_pushable_task(rq, p); 1506 dequeue_pushable_task(rq, p);
1507
1508 /*
1509 * If prev task was rt, put_prev_task() has already updated the
1510	 * utilization. We only care about the case where we start to schedule
1511	 * an rt task
1512 */
1513 if (rq->curr->sched_class != &rt_sched_class)
1514 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1515
1516 rt_queue_push_tasks(rq);
1507} 1517}
1508 1518
1509static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, 1519static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -1543,56 +1553,19 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1543 struct task_struct *p; 1553 struct task_struct *p;
1544 struct rt_rq *rt_rq = &rq->rt; 1554 struct rt_rq *rt_rq = &rq->rt;
1545 1555
1546 if (need_pull_rt_task(rq, prev)) { 1556 WARN_ON_ONCE(prev || rf);
1547 /*
1548 * This is OK, because current is on_cpu, which avoids it being
1549 * picked for load-balance and preemption/IRQs are still
1550 * disabled avoiding further scheduler activity on it and we're
1551 * being very careful to re-start the picking loop.
1552 */
1553 rq_unpin_lock(rq, rf);
1554 pull_rt_task(rq);
1555 rq_repin_lock(rq, rf);
1556 /*
1557 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1558 * means a dl or stop task can slip in, in which case we need
1559 * to re-start task selection.
1560 */
1561 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1562 rq->dl.dl_nr_running))
1563 return RETRY_TASK;
1564 }
1565
1566 /*
1567 * We may dequeue prev's rt_rq in put_prev_task().
1568 * So, we update time before rt_queued check.
1569 */
1570 if (prev->sched_class == &rt_sched_class)
1571 update_curr_rt(rq);
1572 1557
1573 if (!rt_rq->rt_queued) 1558 if (!rt_rq->rt_queued)
1574 return NULL; 1559 return NULL;
1575 1560
1576 put_prev_task(rq, prev);
1577
1578 p = _pick_next_task_rt(rq); 1561 p = _pick_next_task_rt(rq);
1579 1562
1580 set_next_task(rq, p); 1563 set_next_task_rt(rq, p);
1581
1582 rt_queue_push_tasks(rq);
1583
1584 /*
1585 * If prev task was rt, put_prev_task() has already updated the
1586 * utilization. We only care of the case where we start to schedule a
1587 * rt task
1588 */
1589 if (rq->curr->sched_class != &rt_sched_class)
1590 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
1591 1564
1592 return p; 1565 return p;
1593} 1566}
1594 1567
1595static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1568static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
1596{ 1569{
1597 update_curr_rt(rq); 1570 update_curr_rt(rq);
1598 1571
@@ -1604,6 +1577,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1604 */ 1577 */
1605 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) 1578 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1606 enqueue_pushable_task(rq, p); 1579 enqueue_pushable_task(rq, p);
1580
1581 if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
1582 /*
1583 * This is OK, because current is on_cpu, which avoids it being
1584 * picked for load-balance and preemption/IRQs are still
1585 * disabled avoiding further scheduler activity on it and we've
1586 * not yet started the picking loop.
1587 */
1588 rq_unpin_lock(rq, rf);
1589 pull_rt_task(rq);
1590 rq_repin_lock(rq, rf);
1591 }
1607} 1592}
1608 1593
1609#ifdef CONFIG_SMP 1594#ifdef CONFIG_SMP
@@ -2354,11 +2339,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2354 } 2339 }
2355} 2340}
2356 2341
2357static void set_curr_task_rt(struct rq *rq)
2358{
2359 set_next_task(rq, rq->curr);
2360}
2361
2362static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 2342static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2363{ 2343{
2364 /* 2344 /*
@@ -2380,6 +2360,7 @@ const struct sched_class rt_sched_class = {
2380 2360
2381 .pick_next_task = pick_next_task_rt, 2361 .pick_next_task = pick_next_task_rt,
2382 .put_prev_task = put_prev_task_rt, 2362 .put_prev_task = put_prev_task_rt,
2363 .set_next_task = set_next_task_rt,
2383 2364
2384#ifdef CONFIG_SMP 2365#ifdef CONFIG_SMP
2385 .select_task_rq = select_task_rq_rt, 2366 .select_task_rq = select_task_rq_rt,
@@ -2391,7 +2372,6 @@ const struct sched_class rt_sched_class = {
2391 .switched_from = switched_from_rt, 2372 .switched_from = switched_from_rt,
2392#endif 2373#endif
2393 2374
2394 .set_curr_task = set_curr_task_rt,
2395 .task_tick = task_tick_rt, 2375 .task_tick = task_tick_rt,
2396 2376
2397 .get_rr_interval = get_rr_interval_rt, 2377 .get_rr_interval = get_rr_interval_rt,
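
The rt.c hunks redistribute the old pick_next_task_rt() slow path: the pull_rt_task() work, with its unpin/repin of the rq lock, now runs from put_prev_task_rt() when the caller supplies rq_flags, the outgoing task is no longer on the rt runqueue and a pull is warranted; the PELT update and rt_queue_push_tasks() move into set_next_task_rt(); and pick_next_task_rt() shrinks to the fast path, warning if it is ever handed a prev task or rq_flags. The sketch below is a loose userspace model of why pulling at put_prev time removes the RETRY_TASK loop; it ignores locking, overload tracking and the real need_pull_rt_task() test, and every name is a stand-in.

    #include <stdio.h>

    #define QLEN 4

    struct toy_rq {
        int rt_tasks[QLEN];
        int nr;
    };

    /* models pull_rt_task(): steal one queued task from a busier runqueue */
    static void pull_rt_task(struct toy_rq *dst, struct toy_rq *src)
    {
        if (dst->nr == 0 && src->nr > 0)
            dst->rt_tasks[dst->nr++] = src->rt_tasks[--src->nr];
    }

    /*
     * Models the new put_prev_task_rt(): when called from the schedule()
     * slow path (have_rf) and the departing task will not stay runnable
     * here, try to pull work before the next pick happens.
     */
    static void put_prev_task_rt(struct toy_rq *rq, struct toy_rq *busiest,
                                 int prev_still_queued, int have_rf)
    {
        if (have_rf && !prev_still_queued)
            pull_rt_task(rq, busiest);
    }

    /* models pick_next_task_rt(): pure fast path, never balances */
    static int pick_next_task_rt(struct toy_rq *rq)
    {
        return rq->nr ? rq->rt_tasks[rq->nr - 1] : -1;
    }

    int main(void)
    {
        struct toy_rq this_rq = { .nr = 0 };
        struct toy_rq other   = { .rt_tasks = { 7, 9 }, .nr = 2 };

        put_prev_task_rt(&this_rq, &other, /*prev_still_queued=*/0, /*have_rf=*/1);
        printf("picked rt task id %d\n", pick_next_task_rt(&this_rq));
        return 0;
    }

Because the balancing now happens while the previous task is being put, anything pulled in is simply found by the ordinary pick that follows, instead of forcing the pick to restart with RETRY_TASK.
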
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 802b1f3405f2..b3cb895d14a2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -335,8 +335,6 @@ struct cfs_bandwidth {
335 u64 quota; 335 u64 quota;
336 u64 runtime; 336 u64 runtime;
337 s64 hierarchical_quota; 337 s64 hierarchical_quota;
338 u64 runtime_expires;
339 int expires_seq;
340 338
341 u8 idle; 339 u8 idle;
342 u8 period_active; 340 u8 period_active;
@@ -393,6 +391,16 @@ struct task_group {
393#endif 391#endif
394 392
395 struct cfs_bandwidth cfs_bandwidth; 393 struct cfs_bandwidth cfs_bandwidth;
394
395#ifdef CONFIG_UCLAMP_TASK_GROUP
396 /* The two decimal precision [%] value requested from user-space */
397 unsigned int uclamp_pct[UCLAMP_CNT];
398 /* Clamp values requested for a task group */
399 struct uclamp_se uclamp_req[UCLAMP_CNT];
400 /* Effective clamp values used for a task group */
401 struct uclamp_se uclamp[UCLAMP_CNT];
402#endif
403
396}; 404};
397 405
398#ifdef CONFIG_FAIR_GROUP_SCHED 406#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -483,7 +491,8 @@ struct cfs_rq {
483 struct load_weight load; 491 struct load_weight load;
484 unsigned long runnable_weight; 492 unsigned long runnable_weight;
485 unsigned int nr_running; 493 unsigned int nr_running;
486 unsigned int h_nr_running; 494 unsigned int h_nr_running; /* SCHED_{NORMAL,BATCH,IDLE} */
495 unsigned int idle_h_nr_running; /* SCHED_IDLE */
487 496
488 u64 exec_clock; 497 u64 exec_clock;
489 u64 min_vruntime; 498 u64 min_vruntime;
@@ -556,8 +565,6 @@ struct cfs_rq {
556 565
557#ifdef CONFIG_CFS_BANDWIDTH 566#ifdef CONFIG_CFS_BANDWIDTH
558 int runtime_enabled; 567 int runtime_enabled;
559 int expires_seq;
560 u64 runtime_expires;
561 s64 runtime_remaining; 568 s64 runtime_remaining;
562 569
563 u64 throttled_clock; 570 u64 throttled_clock;
@@ -777,9 +784,6 @@ struct root_domain {
777 struct perf_domain __rcu *pd; 784 struct perf_domain __rcu *pd;
778}; 785};
779 786
780extern struct root_domain def_root_domain;
781extern struct mutex sched_domains_mutex;
782
783extern void init_defrootdomain(void); 787extern void init_defrootdomain(void);
784extern int sched_init_domains(const struct cpumask *cpu_map); 788extern int sched_init_domains(const struct cpumask *cpu_map);
785extern void rq_attach_root(struct rq *rq, struct root_domain *rd); 789extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
@@ -1261,16 +1265,18 @@ enum numa_topology_type {
1261extern enum numa_topology_type sched_numa_topology_type; 1265extern enum numa_topology_type sched_numa_topology_type;
1262extern int sched_max_numa_distance; 1266extern int sched_max_numa_distance;
1263extern bool find_numa_distance(int distance); 1267extern bool find_numa_distance(int distance);
1264#endif
1265
1266#ifdef CONFIG_NUMA
1267extern void sched_init_numa(void); 1268extern void sched_init_numa(void);
1268extern void sched_domains_numa_masks_set(unsigned int cpu); 1269extern void sched_domains_numa_masks_set(unsigned int cpu);
1269extern void sched_domains_numa_masks_clear(unsigned int cpu); 1270extern void sched_domains_numa_masks_clear(unsigned int cpu);
1271extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu);
1270#else 1272#else
1271static inline void sched_init_numa(void) { } 1273static inline void sched_init_numa(void) { }
1272static inline void sched_domains_numa_masks_set(unsigned int cpu) { } 1274static inline void sched_domains_numa_masks_set(unsigned int cpu) { }
1273static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } 1275static inline void sched_domains_numa_masks_clear(unsigned int cpu) { }
1276static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
1277{
1278 return nr_cpu_ids;
1279}
1274#endif 1280#endif
1275 1281
1276#ifdef CONFIG_NUMA_BALANCING 1282#ifdef CONFIG_NUMA_BALANCING
@@ -1449,10 +1455,14 @@ static inline void unregister_sched_domain_sysctl(void)
1449} 1455}
1450#endif 1456#endif
1451 1457
1458extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
1459
1452#else 1460#else
1453 1461
1454static inline void sched_ttwu_pending(void) { } 1462static inline void sched_ttwu_pending(void) { }
1455 1463
1464static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; }
1465
1456#endif /* CONFIG_SMP */ 1466#endif /* CONFIG_SMP */
1457 1467
1458#include "stats.h" 1468#include "stats.h"
@@ -1700,17 +1710,21 @@ struct sched_class {
1700 void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); 1710 void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
1701 1711
1702 /* 1712 /*
1703 * It is the responsibility of the pick_next_task() method that will 1713 * Both @prev and @rf are optional and may be NULL, in which case the
1704 * return the next task to call put_prev_task() on the @prev task or 1714 * caller must already have invoked put_prev_task(rq, prev, rf).
1705 * something equivalent. 1715 *
1716 * Otherwise it is the responsibility of the pick_next_task() to call
1717 * put_prev_task() on the @prev task or something equivalent, IFF it
1718 * returns a next task.
1706 * 1719 *
1707 * May return RETRY_TASK when it finds a higher prio class has runnable 1720 * In that case (@rf != NULL) it may return RETRY_TASK when it finds a
1708 * tasks. 1721 * higher prio class has runnable tasks.
1709 */ 1722 */
1710 struct task_struct * (*pick_next_task)(struct rq *rq, 1723 struct task_struct * (*pick_next_task)(struct rq *rq,
1711 struct task_struct *prev, 1724 struct task_struct *prev,
1712 struct rq_flags *rf); 1725 struct rq_flags *rf);
1713 void (*put_prev_task)(struct rq *rq, struct task_struct *p); 1726 void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct rq_flags *rf);
1727 void (*set_next_task)(struct rq *rq, struct task_struct *p);
1714 1728
1715#ifdef CONFIG_SMP 1729#ifdef CONFIG_SMP
1716 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1730 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
@@ -1725,7 +1739,6 @@ struct sched_class {
1725 void (*rq_offline)(struct rq *rq); 1739 void (*rq_offline)(struct rq *rq);
1726#endif 1740#endif
1727 1741
1728 void (*set_curr_task)(struct rq *rq);
1729 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); 1742 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
1730 void (*task_fork)(struct task_struct *p); 1743 void (*task_fork)(struct task_struct *p);
1731 void (*task_dead)(struct task_struct *p); 1744 void (*task_dead)(struct task_struct *p);
@@ -1755,12 +1768,14 @@ struct sched_class {
1755 1768
1756static inline void put_prev_task(struct rq *rq, struct task_struct *prev) 1769static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1757{ 1770{
1758 prev->sched_class->put_prev_task(rq, prev); 1771 WARN_ON_ONCE(rq->curr != prev);
1772 prev->sched_class->put_prev_task(rq, prev, NULL);
1759} 1773}
1760 1774
1761static inline void set_curr_task(struct rq *rq, struct task_struct *curr) 1775static inline void set_next_task(struct rq *rq, struct task_struct *next)
1762{ 1776{
1763 curr->sched_class->set_curr_task(rq); 1777 WARN_ON_ONCE(rq->curr != next);
1778 next->sched_class->set_next_task(rq, next);
1764} 1779}
1765 1780
1766#ifdef CONFIG_SMP 1781#ifdef CONFIG_SMP
@@ -1943,7 +1958,7 @@ unsigned long arch_scale_freq_capacity(int cpu)
1943#endif 1958#endif
1944 1959
1945#ifdef CONFIG_SMP 1960#ifdef CONFIG_SMP
1946#ifdef CONFIG_PREEMPT 1961#ifdef CONFIG_PREEMPTION
1947 1962
1948static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); 1963static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
1949 1964
@@ -1995,7 +2010,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1995 return ret; 2010 return ret;
1996} 2011}
1997 2012
1998#endif /* CONFIG_PREEMPT */ 2013#endif /* CONFIG_PREEMPTION */
1999 2014
2000/* 2015/*
2001 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2016 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
@@ -2266,7 +2281,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2266#endif /* CONFIG_CPU_FREQ */ 2281#endif /* CONFIG_CPU_FREQ */
2267 2282
2268#ifdef CONFIG_UCLAMP_TASK 2283#ifdef CONFIG_UCLAMP_TASK
2269unsigned int uclamp_eff_value(struct task_struct *p, unsigned int clamp_id); 2284enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
2270 2285
2271static __always_inline 2286static __always_inline
2272unsigned int uclamp_util_with(struct rq *rq, unsigned int util, 2287unsigned int uclamp_util_with(struct rq *rq, unsigned int util,
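
The sched.h hunk is where the new contract is written down: put_prev_task() callbacks take an optional rq_flags pointer, a set_next_task() callback replaces set_curr_task() and names the task explicitly, and the generic put_prev_task()/set_next_task() helpers warn if applied to anything other than rq->curr. Below is a minimal, self-contained C model of that three-callback interface and the two helpers; it illustrates the API shape only (toy types, asserts in place of WARN_ON_ONCE(), no real policy).

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    struct toy_rq;
    struct toy_task;

    struct toy_sched_class {
        struct toy_task *(*pick_next_task)(struct toy_rq *rq,
                                           struct toy_task *prev, void *rf);
        void (*put_prev_task)(struct toy_rq *rq, struct toy_task *prev, void *rf);
        void (*set_next_task)(struct toy_rq *rq, struct toy_task *next);
    };

    struct toy_task {
        const char *name;
        const struct toy_sched_class *sched_class;
    };

    struct toy_rq { struct toy_task *curr; };

    /* models the generic helper: only ever applied to the running task */
    static void put_prev_task(struct toy_rq *rq, struct toy_task *prev)
    {
        assert(rq->curr == prev);            /* WARN_ON_ONCE() in the kernel */
        prev->sched_class->put_prev_task(rq, prev, NULL);
    }

    /* models the replacement for the old set_curr_task() helper */
    static void set_next_task(struct toy_rq *rq, struct toy_task *next)
    {
        assert(rq->curr == next);
        next->sched_class->set_next_task(rq, next);
    }

    static void put_prev_toy(struct toy_rq *rq, struct toy_task *prev, void *rf)
    {
        (void)rq; (void)rf;
        printf("put_prev(%s)\n", prev->name);
    }

    static void set_next_toy(struct toy_rq *rq, struct toy_task *next)
    {
        (void)rq;
        printf("set_next(%s)\n", next->name);
    }

    static struct toy_task *pick_next_toy(struct toy_rq *rq,
                                          struct toy_task *prev, void *rf)
    {
        (void)prev; (void)rf;
        return rq->curr;                     /* toy policy: keep the current task */
    }

    static const struct toy_sched_class toy_class = {
        .pick_next_task = pick_next_toy,
        .put_prev_task  = put_prev_toy,
        .set_next_task  = set_next_toy,
    };

    int main(void)
    {
        struct toy_task t = { "curr", &toy_class };
        struct toy_rq rq = { .curr = &t };
        struct toy_task *next;

        put_prev_task(&rq, &t);              /* caller puts prev up front... */
        next = t.sched_class->pick_next_task(&rq, NULL, NULL);  /* ...so NULL/NULL here */
        set_next_task(&rq, next);
        return 0;
    }

Per the rewritten comment in the hunk, a class that is instead called with non-NULL @prev and @rf owns the put_prev_task() call itself and may return RETRY_TASK.
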
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index aa0de240fb41..ba683fe81a6e 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -157,9 +157,10 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
157{ 157{
158 unsigned long long now = rq_clock(rq), delta = 0; 158 unsigned long long now = rq_clock(rq), delta = 0;
159 159
160 if (unlikely(sched_info_on())) 160 if (sched_info_on()) {
161 if (t->sched_info.last_queued) 161 if (t->sched_info.last_queued)
162 delta = now - t->sched_info.last_queued; 162 delta = now - t->sched_info.last_queued;
163 }
163 sched_info_reset_dequeued(t); 164 sched_info_reset_dequeued(t);
164 t->sched_info.run_delay += delta; 165 t->sched_info.run_delay += delta;
165 166
@@ -192,7 +193,7 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
192 */ 193 */
193static inline void sched_info_queued(struct rq *rq, struct task_struct *t) 194static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
194{ 195{
195 if (unlikely(sched_info_on())) { 196 if (sched_info_on()) {
196 if (!t->sched_info.last_queued) 197 if (!t->sched_info.last_queued)
197 t->sched_info.last_queued = rq_clock(rq); 198 t->sched_info.last_queued = rq_clock(rq);
198 } 199 }
@@ -239,7 +240,7 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
239static inline void 240static inline void
240sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) 241sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
241{ 242{
242 if (unlikely(sched_info_on())) 243 if (sched_info_on())
243 __sched_info_switch(rq, prev, next); 244 __sched_info_switch(rq, prev, next);
244} 245}
245 246
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index c183b790ca54..7e1cee4e65b2 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,17 +23,22 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
23 /* we're never preempted */ 23 /* we're never preempted */
24} 24}
25 25
26static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
27{
28 stop->se.exec_start = rq_clock_task(rq);
29}
30
26static struct task_struct * 31static struct task_struct *
27pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) 32pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
28{ 33{
29 struct task_struct *stop = rq->stop; 34 struct task_struct *stop = rq->stop;
30 35
36 WARN_ON_ONCE(prev || rf);
37
31 if (!stop || !task_on_rq_queued(stop)) 38 if (!stop || !task_on_rq_queued(stop))
32 return NULL; 39 return NULL;
33 40
34 put_prev_task(rq, prev); 41 set_next_task_stop(rq, stop);
35
36 stop->se.exec_start = rq_clock_task(rq);
37 42
38 return stop; 43 return stop;
39} 44}
@@ -55,7 +60,7 @@ static void yield_task_stop(struct rq *rq)
55 BUG(); /* the stop task should never yield, its pointless. */ 60 BUG(); /* the stop task should never yield, its pointless. */
56} 61}
57 62
58static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) 63static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
59{ 64{
60 struct task_struct *curr = rq->curr; 65 struct task_struct *curr = rq->curr;
61 u64 delta_exec; 66 u64 delta_exec;
@@ -86,13 +91,6 @@ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
86{ 91{
87} 92}
88 93
89static void set_curr_task_stop(struct rq *rq)
90{
91 struct task_struct *stop = rq->stop;
92
93 stop->se.exec_start = rq_clock_task(rq);
94}
95
96static void switched_to_stop(struct rq *rq, struct task_struct *p) 94static void switched_to_stop(struct rq *rq, struct task_struct *p)
97{ 95{
98 BUG(); /* its impossible to change to this class */ 96 BUG(); /* its impossible to change to this class */
@@ -128,13 +126,13 @@ const struct sched_class stop_sched_class = {
128 126
129 .pick_next_task = pick_next_task_stop, 127 .pick_next_task = pick_next_task_stop,
130 .put_prev_task = put_prev_task_stop, 128 .put_prev_task = put_prev_task_stop,
129 .set_next_task = set_next_task_stop,
131 130
132#ifdef CONFIG_SMP 131#ifdef CONFIG_SMP
133 .select_task_rq = select_task_rq_stop, 132 .select_task_rq = select_task_rq_stop,
134 .set_cpus_allowed = set_cpus_allowed_common, 133 .set_cpus_allowed = set_cpus_allowed_common,
135#endif 134#endif
136 135
137 .set_curr_task = set_curr_task_stop,
138 .task_tick = task_tick_stop, 136 .task_tick = task_tick_stop,
139 137
140 .get_rr_interval = get_rr_interval_stop, 138 .get_rr_interval = get_rr_interval_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index f751ce0b783e..b5667a273bf6 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1284,6 +1284,7 @@ static int sched_domains_curr_level;
1284int sched_max_numa_distance; 1284int sched_max_numa_distance;
1285static int *sched_domains_numa_distance; 1285static int *sched_domains_numa_distance;
1286static struct cpumask ***sched_domains_numa_masks; 1286static struct cpumask ***sched_domains_numa_masks;
1287int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
1287#endif 1288#endif
1288 1289
1289/* 1290/*
@@ -1402,7 +1403,7 @@ sd_init(struct sched_domain_topology_level *tl,
1402 1403
1403 sd->flags &= ~SD_PREFER_SIBLING; 1404 sd->flags &= ~SD_PREFER_SIBLING;
1404 sd->flags |= SD_SERIALIZE; 1405 sd->flags |= SD_SERIALIZE;
1405 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) { 1406 if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) {
1406 sd->flags &= ~(SD_BALANCE_EXEC | 1407 sd->flags &= ~(SD_BALANCE_EXEC |
1407 SD_BALANCE_FORK | 1408 SD_BALANCE_FORK |
1408 SD_WAKE_AFFINE); 1409 SD_WAKE_AFFINE);
@@ -1724,6 +1725,26 @@ void sched_domains_numa_masks_clear(unsigned int cpu)
1724 } 1725 }
1725} 1726}
1726 1727
1728/*
1729 * sched_numa_find_closest() - given the NUMA topology, find the cpu
1730 * closest to @cpu from @cpumask.
1731 * cpumask: cpumask to find a cpu from
1732 * cpu: cpu to be close to
1733 *
1734 * returns: cpu, or nr_cpu_ids when nothing found.
1735 */
1736int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
1737{
1738 int i, j = cpu_to_node(cpu);
1739
1740 for (i = 0; i < sched_domains_numa_levels; i++) {
1741 cpu = cpumask_any_and(cpus, sched_domains_numa_masks[i][j]);
1742 if (cpu < nr_cpu_ids)
1743 return cpu;
1744 }
1745 return nr_cpu_ids;
1746}
1747
1727#endif /* CONFIG_NUMA */ 1748#endif /* CONFIG_NUMA */
1728 1749
1729static int __sdt_alloc(const struct cpumask *cpu_map) 1750static int __sdt_alloc(const struct cpumask *cpu_map)
@@ -2149,16 +2170,16 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
2149 * ndoms_new == 0 is a special case for destroying existing domains, 2170 * ndoms_new == 0 is a special case for destroying existing domains,
2150 * and it will not create the default domain. 2171 * and it will not create the default domain.
2151 * 2172 *
2152 * Call with hotplug lock held 2173 * Call with hotplug lock and sched_domains_mutex held
2153 */ 2174 */
2154void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], 2175void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
2155 struct sched_domain_attr *dattr_new) 2176 struct sched_domain_attr *dattr_new)
2156{ 2177{
2157 bool __maybe_unused has_eas = false; 2178 bool __maybe_unused has_eas = false;
2158 int i, j, n; 2179 int i, j, n;
2159 int new_topology; 2180 int new_topology;
2160 2181
2161 mutex_lock(&sched_domains_mutex); 2182 lockdep_assert_held(&sched_domains_mutex);
2162 2183
2163 /* Always unregister in case we don't destroy any domains: */ 2184 /* Always unregister in case we don't destroy any domains: */
2164 unregister_sched_domain_sysctl(); 2185 unregister_sched_domain_sysctl();
@@ -2183,8 +2204,19 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
2183 for (i = 0; i < ndoms_cur; i++) { 2204 for (i = 0; i < ndoms_cur; i++) {
2184 for (j = 0; j < n && !new_topology; j++) { 2205 for (j = 0; j < n && !new_topology; j++) {
2185 if (cpumask_equal(doms_cur[i], doms_new[j]) && 2206 if (cpumask_equal(doms_cur[i], doms_new[j]) &&
2186 dattrs_equal(dattr_cur, i, dattr_new, j)) 2207 dattrs_equal(dattr_cur, i, dattr_new, j)) {
2208 struct root_domain *rd;
2209
2210 /*
2211 * This domain won't be destroyed and as such
2212 * its dl_bw->total_bw needs to be cleared. It
2213 * will be recomputed in function
2214 * update_tasks_root_domain().
2215 */
2216 rd = cpu_rq(cpumask_any(doms_cur[i]))->rd;
2217 dl_clear_root_domain(rd);
2187 goto match1; 2218 goto match1;
2219 }
2188 } 2220 }
2189 /* No match - a current sched domain not in new doms_new[] */ 2221 /* No match - a current sched domain not in new doms_new[] */
2190 detach_destroy_domains(doms_cur[i]); 2222 detach_destroy_domains(doms_cur[i]);
@@ -2241,6 +2273,15 @@ match3:
2241 ndoms_cur = ndoms_new; 2273 ndoms_cur = ndoms_new;
2242 2274
2243 register_sched_domain_sysctl(); 2275 register_sched_domain_sysctl();
2276}
2244 2277
2278/*
2279 * Call with hotplug lock held
2280 */
2281void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
2282 struct sched_domain_attr *dattr_new)
2283{
2284 mutex_lock(&sched_domains_mutex);
2285 partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
2245 mutex_unlock(&sched_domains_mutex); 2286 mutex_unlock(&sched_domains_mutex);
2246} 2287}
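
Besides introducing node_reclaim_distance and sched_numa_find_closest(), the topology.c hunks split partition_sched_domains() into partition_sched_domains_locked(), which only asserts that sched_domains_mutex is held, plus a thin wrapper that takes and releases the mutex; callers that already hold the mutex can now use the locked variant directly. Surviving root domains also get dl_clear_root_domain() so their deadline bandwidth is recomputed instead of accumulated. The locked/unlocked API split is a common kernel pattern; a generic pthread-based model (all names hypothetical) looks like:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t domains_mutex = PTHREAD_MUTEX_INITIALIZER;

    /* the real work; the caller must hold domains_mutex (lockdep would assert) */
    static void rebuild_domains_locked(int ndoms)
    {
        printf("rebuilding %d domain(s) with the mutex held\n", ndoms);
    }

    /* convenience wrapper for callers that do not hold the mutex yet */
    static void rebuild_domains(int ndoms)
    {
        pthread_mutex_lock(&domains_mutex);
        rebuild_domains_locked(ndoms);
        pthread_mutex_unlock(&domains_mutex);
    }

    int main(void)
    {
        rebuild_domains(2);                     /* unlocked caller */

        pthread_mutex_lock(&domains_mutex);     /* caller already serialized */
        rebuild_domains_locked(1);
        pthread_mutex_unlock(&domains_mutex);
        return 0;
    }
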
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b4f83f7bdf86..c7031a22aa7b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -383,6 +383,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
383 */ 383 */
384 preempt_disable(); 384 preempt_disable();
385 stop_cpus_in_progress = true; 385 stop_cpus_in_progress = true;
386 barrier();
386 for_each_cpu(cpu, cpumask) { 387 for_each_cpu(cpu, cpumask) {
387 work = &per_cpu(cpu_stopper.stop_work, cpu); 388 work = &per_cpu(cpu_stopper.stop_work, cpu);
388 work->fn = fn; 389 work->fn = fn;
@@ -391,6 +392,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
391 if (cpu_stop_queue_work(cpu, work)) 392 if (cpu_stop_queue_work(cpu, work))
392 queued = true; 393 queued = true;
393 } 394 }
395 barrier();
394 stop_cpus_in_progress = false; 396 stop_cpus_in_progress = false;
395 preempt_enable(); 397 preempt_enable();
396 398
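
stop_machine.c only gains two barrier() calls around the queueing loop. barrier() is a compiler-only barrier, so the point is ordering as the compiler sees it: the store setting stop_cpus_in_progress must not be sunk past the loop that queues the per-CPU stopper work, and the store clearing it must not be hoisted above that loop. A tiny stand-alone illustration of the construct; the flag and the work are stand-ins, and the macro below is the usual GCC-style definition rather than the kernel's.

    #include <stdio.h>

    #define barrier() __asm__ __volatile__("" : : : "memory")

    static int stop_in_progress;

    static void queue_work(int cpu)
    {
        printf("queued stopper work on CPU %d\n", cpu);
    }

    int main(void)
    {
        int cpu;

        stop_in_progress = 1;
        barrier();          /* flag is set before any work is queued */
        for (cpu = 0; cpu < 4; cpu++)
            queue_work(cpu);
        barrier();          /* all queueing is done before the flag is cleared */
        stop_in_progress = 0;

        printf("stop_in_progress=%d\n", stop_in_progress);
        return 0;
    }
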
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 98da8998c25c..6a64d7772870 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -146,7 +146,7 @@ config FUNCTION_TRACER
146 select GENERIC_TRACER 146 select GENERIC_TRACER
147 select CONTEXT_SWITCH_TRACER 147 select CONTEXT_SWITCH_TRACER
148 select GLOB 148 select GLOB
149 select TASKS_RCU if PREEMPT 149 select TASKS_RCU if PREEMPTION
150 help 150 help
151 Enable the kernel to trace every kernel function. This is done 151 Enable the kernel to trace every kernel function. This is done
152 by using a compiler feature to insert a small, 5-byte No-Operation 152 by using a compiler feature to insert a small, 5-byte No-Operation
@@ -179,7 +179,7 @@ config TRACE_PREEMPT_TOGGLE
179config PREEMPTIRQ_EVENTS 179config PREEMPTIRQ_EVENTS
180 bool "Enable trace events for preempt and irq disable/enable" 180 bool "Enable trace events for preempt and irq disable/enable"
181 select TRACE_IRQFLAGS 181 select TRACE_IRQFLAGS
182 select TRACE_PREEMPT_TOGGLE if PREEMPT 182 select TRACE_PREEMPT_TOGGLE if PREEMPTION
183 select GENERIC_TRACER 183 select GENERIC_TRACER
184 default n 184 default n
185 help 185 help
@@ -214,7 +214,7 @@ config PREEMPT_TRACER
214 bool "Preemption-off Latency Tracer" 214 bool "Preemption-off Latency Tracer"
215 default n 215 default n
216 depends on !ARCH_USES_GETTIMEOFFSET 216 depends on !ARCH_USES_GETTIMEOFFSET
217 depends on PREEMPT 217 depends on PREEMPTION
218 select GENERIC_TRACER 218 select GENERIC_TRACER
219 select TRACER_MAX_TRACE 219 select TRACER_MAX_TRACE
220 select RING_BUFFER_ALLOW_SWAP 220 select RING_BUFFER_ALLOW_SWAP
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f9821a3374e9..356b848c697a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2814,7 +2814,7 @@ int ftrace_shutdown(struct ftrace_ops *ops, int command)
2814 * synchornize_rcu_tasks() will wait for those tasks to 2814 * synchornize_rcu_tasks() will wait for those tasks to
2815 * execute and either schedule voluntarily or enter user space. 2815 * execute and either schedule voluntarily or enter user space.
2816 */ 2816 */
2817 if (IS_ENABLED(CONFIG_PREEMPT)) 2817 if (IS_ENABLED(CONFIG_PREEMPTION))
2818 synchronize_rcu_tasks(); 2818 synchronize_rcu_tasks();
2819 2819
2820 free_ops: 2820 free_ops:
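
The kernel/trace hunks above and below are part of the CONFIG_PREEMPT -> CONFIG_PREEMPTION rename: compile-time checks that really mean "any preemptible kernel" now test the broader CONFIG_PREEMPTION symbol rather than CONFIG_PREEMPT. A stand-alone sketch of the IS_ENABLED() usage pattern follows; the IS_ENABLED() and CONFIG_PREEMPTION definitions here are simplified stand-ins for the kernel's Kconfig machinery, not the real thing.

    #include <stdio.h>

    /*
     * Simplified stand-ins: in the kernel, CONFIG_PREEMPTION comes from
     * Kconfig and IS_ENABLED() expands to 1 or 0 depending on whether the
     * symbol is set.  Here both are hard-coded for the demonstration.
     */
    #define CONFIG_PREEMPTION 1
    #define IS_ENABLED(option) (option)

    static void ftrace_like_shutdown(void)
    {
        if (IS_ENABLED(CONFIG_PREEMPTION))
            printf("preemptible kernel: wait for tasks to reach a safe point\n");
        else
            printf("non-preemptible kernel: nothing extra to wait for\n");
    }

    int main(void)
    {
        ftrace_like_shutdown();
        return 0;
    }
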
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0564f6db0561..09b0b49f346e 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -267,7 +267,7 @@ static void ring_buffer_producer(void)
267 if (consumer && !(cnt % wakeup_interval)) 267 if (consumer && !(cnt % wakeup_interval))
268 wake_up_process(consumer); 268 wake_up_process(consumer);
269 269
270#ifndef CONFIG_PREEMPT 270#ifndef CONFIG_PREEMPTION
271 /* 271 /*
272 * If we are a non preempt kernel, the 10 second run will 272 * If we are a non preempt kernel, the 10 second run will
273 * stop everything while it runs. Instead, we will call 273 * stop everything while it runs. Instead, we will call
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 648930823b57..b89cdfe20bc1 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -255,12 +255,12 @@ void *trace_event_buffer_reserve(struct trace_event_buffer *fbuffer,
255 local_save_flags(fbuffer->flags); 255 local_save_flags(fbuffer->flags);
256 fbuffer->pc = preempt_count(); 256 fbuffer->pc = preempt_count();
257 /* 257 /*
258 * If CONFIG_PREEMPT is enabled, then the tracepoint itself disables 258 * If CONFIG_PREEMPTION is enabled, then the tracepoint itself disables
259 * preemption (adding one to the preempt_count). Since we are 259 * preemption (adding one to the preempt_count). Since we are
260 * interested in the preempt_count at the time the tracepoint was 260 * interested in the preempt_count at the time the tracepoint was
261 * hit, we need to subtract one to offset the increment. 261 * hit, we need to subtract one to offset the increment.
262 */ 262 */
263 if (IS_ENABLED(CONFIG_PREEMPT)) 263 if (IS_ENABLED(CONFIG_PREEMPTION))
264 fbuffer->pc--; 264 fbuffer->pc--;
265 fbuffer->trace_file = trace_file; 265 fbuffer->trace_file = trace_file;
266 266
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 743b2b520d34..5e43b9664eca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -579,8 +579,7 @@ probe_wakeup(void *ignore, struct task_struct *p)
579 else 579 else
580 tracing_dl = 0; 580 tracing_dl = 0;
581 581
582 wakeup_task = p; 582 wakeup_task = get_task_struct(p);
583 get_task_struct(wakeup_task);
584 583
585 local_save_flags(flags); 584 local_save_flags(flags);
586 585
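
The trace_sched_wakeup.c change works because get_task_struct() returns the task it was passed, so taking the reference and recording the pointer collapse into a single assignment. A toy model of a get-style helper returning its argument for that kind of chaining (the refcount field and the names are simplified):

    #include <stdio.h>

    struct toy_task {
        const char *comm;
        int usage;                       /* models the task refcount */
    };

    /* models get_task_struct(): take a reference and hand the pointer back */
    static inline struct toy_task *get_toy_task(struct toy_task *t)
    {
        t->usage++;
        return t;
    }

    static struct toy_task *wakeup_task;

    int main(void)
    {
        struct toy_task t = { .comm = "probe-target", .usage = 1 };

        wakeup_task = get_toy_task(&t);  /* reference taken in the same statement */
        printf("%s usage=%d\n", wakeup_task->comm, wakeup_task->usage);
        return 0;
    }
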
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index eaaa21b23215..ccede2425c3f 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -710,7 +710,7 @@ static bool khugepaged_scan_abort(int nid)
710 for (i = 0; i < MAX_NUMNODES; i++) { 710 for (i = 0; i < MAX_NUMNODES; i++) {
711 if (!khugepaged_node_load[i]) 711 if (!khugepaged_node_load[i])
712 continue; 712 continue;
713 if (node_distance(nid, i) > RECLAIM_DISTANCE) 713 if (node_distance(nid, i) > node_reclaim_distance)
714 return true; 714 return true;
715 } 715 }
716 return false; 716 return false;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9c9194959271..6991ccec9c32 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3511,7 +3511,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3511static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3511static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3512{ 3512{
3513 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= 3513 return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3514 RECLAIM_DISTANCE; 3514 node_reclaim_distance;
3515} 3515}
3516#else /* CONFIG_NUMA */ 3516#else /* CONFIG_NUMA */
3517static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) 3517static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
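
These last two hunks, together with the topology.c hunk that defines it, replace the compile-time RECLAIM_DISTANCE threshold in these heuristics with the node_reclaim_distance variable, which starts out at RECLAIM_DISTANCE but can be adjusted at runtime or by platform code when the default does not match the machine's internode distances. A compact model of turning that comparison into a runtime decision; the distance table is invented.

    #include <stdio.h>

    #define RECLAIM_DISTANCE 30                     /* old compile-time default */

    static int node_reclaim_distance = RECLAIM_DISTANCE;   /* now a variable */

    /* hypothetical distance table for a two-node box with distant nodes */
    static int node_distance(int a, int b)
    {
        return a == b ? 10 : 32;
    }

    static int zone_allows_reclaim(int local_node, int node)
    {
        return node_distance(local_node, node) <= node_reclaim_distance;
    }

    int main(void)
    {
        printf("default: reclaim from node 1 allowed on node 0? %d\n",
               zone_allows_reclaim(0, 1));
        node_reclaim_distance = 32;                 /* e.g. raised for this topology */
        printf("tuned:   reclaim from node 1 allowed on node 0? %d\n",
               zone_allows_reclaim(0, 1));
        return 0;
    }
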