author	Linus Torvalds <torvalds@linux-foundation.org>	2016-12-12 15:15:10 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-12-12 15:15:10 -0500
commit	92c020d08d83673ecd15a9069d4457378668da31 (patch)
tree	3dbc5a9c1ab179f55be49e30e378cc4e650fc20e
parent	bca13ce4554ae9cf5083e5adf395ad2266cb571b (diff)
parent	6b94780e45c17b83e3e75f8aaca5a328db583c74 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main scheduler changes in this cycle were:

   - support Intel Turbo Boost Max Technology 3.0 (TBM3) by introducing
     a notion of 'better cores', which the scheduler will prefer to
     schedule single threaded workloads on. (Tim Chen, Srinivas Pandruvada)

   - enhance the handling of asymmetric capacity CPUs further (Morten Rasmussen)

   - improve/fix load handling when moving tasks between task groups (Vincent Guittot)

   - simplify and clean up the cputime code (Stanislaw Gruszka)

   - improve mass fork()ed task spread a.k.a. hackbench speedup (Vincent Guittot)

   - make struct kthread kmalloc()ed and related fixes (Oleg Nesterov)

   - add uaccess atomicity debugging (when using access_ok() in the
     wrong context), under CONFIG_DEBUG_ATOMIC_SLEEP=y (Peter Zijlstra)

   - implement various fixes, cleanups and other enhancements (Daniel
     Bristot de Oliveira, Martin Schwidefsky, Rafael J. Wysocki)"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
  sched/core: Use load_avg for selecting idlest group
  sched/core: Fix find_idlest_group() for fork
  kthread: Don't abuse kthread_create_on_cpu() in __kthread_create_worker()
  kthread: Don't use to_live_kthread() in kthread_[un]park()
  kthread: Don't use to_live_kthread() in kthread_stop()
  Revert "kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function"
  kthread: Make struct kthread kmalloc'ed
  x86/uaccess, sched/preempt: Verify access_ok() context
  sched/x86: Make CONFIG_SCHED_MC_PRIO=y easier to enable
  sched/x86: Change CONFIG_SCHED_ITMT to CONFIG_SCHED_MC_PRIO
  x86/sched: Use #include <linux/mutex.h> instead of #include <asm/mutex.h>
  cpufreq/intel_pstate: Use CPPC to get max performance
  acpi/bus: Set _OSC for diverse core support
  acpi/bus: Enable HWP CPPC objects
  x86/sched: Add SD_ASYM_PACKING flags to x86 ITMT CPU
  x86/sysctl: Add sysctl for ITMT scheduling feature
  x86: Enable Intel Turbo Boost Max Technology 3.0
  x86/topology: Define x86's arch_update_cpu_topology
  sched: Extend scheduler's asym packing
  sched/fair: Clean up the tunable parameter definitions
  ...
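The ITMT plumbing below boils down to one comparison: each CPU gets an integer priority from the pstate driver, and the asym-packing code prefers the CPU with the higher value (see the sched_asym_prefer() call in the kernel/sched/core.c hunk). A minimal sketch of that comparison, assuming it is defined roughly like this; the real definition lives in kernel/sched/sched.h, which is not among the hunks shown here:

/* Illustrative sketch only; not the in-tree definition. */
extern int arch_asym_cpu_priority(int cpu);	/* per-CPU ITMT priority, see itmt.c below */

static inline bool sched_asym_prefer(int a, int b)
{
	/* Prefer the CPU whose core the platform marked as faster. */
	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}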
-rw-r--r--  arch/Kconfig | 3
-rw-r--r--  arch/ia64/kernel/time.c | 4
-rw-r--r--  arch/powerpc/Kconfig | 1
-rw-r--r--  arch/powerpc/include/asm/cputime.h | 14
-rw-r--r--  arch/powerpc/kernel/time.c | 8
-rw-r--r--  arch/s390/Kconfig | 1
-rw-r--r--  arch/s390/kernel/vtime.c | 9
-rw-r--r--  arch/x86/Kconfig | 21
-rw-r--r--  arch/x86/include/asm/preempt.h | 8
-rw-r--r--  arch/x86/include/asm/topology.h | 32
-rw-r--r--  arch/x86/include/asm/uaccess.h | 13
-rw-r--r--  arch/x86/kernel/Makefile | 1
-rw-r--r--  arch/x86/kernel/apm_32.c | 4
-rw-r--r--  arch/x86/kernel/itmt.c | 215
-rw-r--r--  arch/x86/kernel/smpboot.c | 39
-rw-r--r--  drivers/acpi/bus.c | 10
-rw-r--r--  drivers/cpufreq/Kconfig.x86 | 1
-rw-r--r--  drivers/cpufreq/intel_pstate.c | 56
-rw-r--r--  include/asm-generic/cputime_jiffies.h | 1
-rw-r--r--  include/asm-generic/cputime_nsecs.h | 1
-rw-r--r--  include/linux/acpi.h | 1
-rw-r--r--  include/linux/kernel_stat.h | 4
-rw-r--r--  include/linux/kthread.h | 1
-rw-r--r--  include/linux/preempt.h | 21
-rw-r--r--  include/linux/sched.h | 89
-rw-r--r--  include/linux/sched/sysctl.h | 1
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/kthread.c | 144
-rw-r--r--  kernel/sched/core.c | 34
-rw-r--r--  kernel/sched/cpuacct.c | 2
-rw-r--r--  kernel/sched/cputime.c | 124
-rw-r--r--  kernel/sched/deadline.c | 4
-rw-r--r--  kernel/sched/fair.c | 665
-rw-r--r--  kernel/sched/sched.h | 11
-rw-r--r--  kernel/sysctl.c | 7
-rw-r--r--  kernel/time/posix-cpu-timers.c | 4
36 files changed, 1152 insertions, 406 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 44a44b49eb3a..835d55d52104 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -513,6 +513,9 @@ config HAVE_CONTEXT_TRACKING
 config HAVE_VIRT_CPU_ACCOUNTING
 	bool
 
+config ARCH_HAS_SCALED_CPUTIME
+	bool
+
 config HAVE_VIRT_CPU_ACCOUNTING_GEN
 	bool
 	default y if 64BIT
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 6f892b94e906..021f44ab4bfb 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -68,7 +68,7 @@ void vtime_account_user(struct task_struct *tsk)
 
 	if (ti->ac_utime) {
 		delta_utime = cycle_to_cputime(ti->ac_utime);
-		account_user_time(tsk, delta_utime, delta_utime);
+		account_user_time(tsk, delta_utime);
 		ti->ac_utime = 0;
 	}
 }
@@ -112,7 +112,7 @@ void vtime_account_system(struct task_struct *tsk)
 {
 	cputime_t delta = vtime_delta(tsk);
 
-	account_system_time(tsk, 0, delta, delta);
+	account_system_time(tsk, 0, delta);
 }
 EXPORT_SYMBOL_GPL(vtime_account_system);
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 65fba4c34cd7..c7f120aaa98f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -160,6 +160,7 @@ config PPC
 	select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select GENERIC_CPU_AUTOPROBE
 	select HAVE_VIRT_CPU_ACCOUNTING
+	select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE
 	select HAVE_ARCH_HARDENED_USERCOPY
 	select HAVE_KERNEL_GZIP
 
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 4f60db074725..aa2e6a34b872 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -46,26 +46,12 @@ extern cputime_t cputime_one_jiffy;
  * Convert cputime <-> jiffies
  */
 extern u64 __cputime_jiffies_factor;
-DECLARE_PER_CPU(unsigned long, cputime_last_delta);
-DECLARE_PER_CPU(unsigned long, cputime_scaled_last_delta);
 
 static inline unsigned long cputime_to_jiffies(const cputime_t ct)
 {
 	return mulhdu((__force u64) ct, __cputime_jiffies_factor);
 }
 
-/* Estimate the scaled cputime by scaling the real cputime based on
- * the last scaled to real ratio */
-static inline cputime_t cputime_to_scaled(const cputime_t ct)
-{
-	if (cpu_has_feature(CPU_FTR_SPURR) &&
-	    __this_cpu_read(cputime_last_delta))
-		return (__force u64) ct *
-			__this_cpu_read(cputime_scaled_last_delta) /
-			__this_cpu_read(cputime_last_delta);
-	return ct;
-}
-
 static inline cputime_t jiffies_to_cputime(const unsigned long jif)
 {
 	u64 ct;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index bc3f7d0d7b79..be9751f1cb2a 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -164,8 +164,6 @@ u64 __cputime_sec_factor;
 EXPORT_SYMBOL(__cputime_sec_factor);
 u64 __cputime_clockt_factor;
 EXPORT_SYMBOL(__cputime_clockt_factor);
-DEFINE_PER_CPU(unsigned long, cputime_last_delta);
-DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta);
 
 cputime_t cputime_one_jiffy;
 
@@ -360,7 +358,8 @@ void vtime_account_system(struct task_struct *tsk)
 	unsigned long delta, sys_scaled, stolen;
 
 	delta = vtime_delta(tsk, &sys_scaled, &stolen);
-	account_system_time(tsk, 0, delta, sys_scaled);
+	account_system_time(tsk, 0, delta);
+	tsk->stimescaled += sys_scaled;
 	if (stolen)
 		account_steal_time(stolen);
 }
@@ -393,7 +392,8 @@ void vtime_account_user(struct task_struct *tsk)
 	acct->user_time = 0;
 	acct->user_time_scaled = 0;
 	acct->utime_sspurr = 0;
-	account_user_time(tsk, utime, utimescaled);
+	account_user_time(tsk, utime);
+	tsk->utimescaled += utimescaled;
 }
 
 #ifdef CONFIG_PPC32
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 426481d4cc86..028f97be5bae 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -171,6 +171,7 @@ config S390
 	select SYSCTL_EXCEPTION_TRACE
 	select TTY
 	select VIRT_CPU_ACCOUNTING
+	select ARCH_HAS_SCALED_CPUTIME
 	select VIRT_TO_BUS
 	select HAVE_NMI
 
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 856e30d8463f..1bd5dde2d5a9 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -137,8 +137,10 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
 		user_scaled = (user_scaled * mult) / div;
 		system_scaled = (system_scaled * mult) / div;
 	}
-	account_user_time(tsk, user, user_scaled);
-	account_system_time(tsk, hardirq_offset, system, system_scaled);
+	account_user_time(tsk, user);
+	tsk->utimescaled += user_scaled;
+	account_system_time(tsk, hardirq_offset, system);
+	tsk->stimescaled += system_scaled;
 
 	steal = S390_lowcore.steal_timer;
 	if ((s64) steal > 0) {
@@ -202,7 +204,8 @@ void vtime_account_irq_enter(struct task_struct *tsk)
 
 		system_scaled = (system_scaled * mult) / div;
 	}
-	account_system_time(tsk, 0, system, system_scaled);
+	account_system_time(tsk, 0, system);
+	tsk->stimescaled += system_scaled;
 
 	virt_timer_forward(system);
 }
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bada636d1065..b50e5eeefd21 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -939,6 +939,27 @@ config SCHED_MC
 	  making when dealing with multi-core CPU chips at a cost of slightly
 	  increased overhead in some places. If unsure say N here.
 
+config SCHED_MC_PRIO
+	bool "CPU core priorities scheduler support"
+	depends on SCHED_MC && CPU_SUP_INTEL
+	select X86_INTEL_PSTATE
+	select CPU_FREQ
+	default y
+	---help---
+	  Intel Turbo Boost Max Technology 3.0 enabled CPUs have a
+	  core ordering determined at manufacturing time, which allows
+	  certain cores to reach higher turbo frequencies (when running
+	  single threaded workloads) than others.
+
+	  Enabling this kernel feature teaches the scheduler about
+	  the TBM3 (aka ITMT) priority order of the CPU cores and adjusts the
+	  scheduler's CPU selection logic accordingly, so that higher
+	  overall system performance can be achieved.
+
+	  This feature will have no effect on CPUs without this feature.
+
+	  If unsure say Y here.
+
 source "kernel/Kconfig.preempt"
 
 config UP_LATE_INIT
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 17f218645701..ec1f3c651150 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -24,7 +24,13 @@ static __always_inline int preempt_count(void)
 
 static __always_inline void preempt_count_set(int pc)
 {
-	raw_cpu_write_4(__preempt_count, pc);
+	int old, new;
+
+	do {
+		old = raw_cpu_read_4(__preempt_count);
+		new = (old & PREEMPT_NEED_RESCHED) |
+			(pc & ~PREEMPT_NEED_RESCHED);
+	} while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old);
 }
 
 /*
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index cf75871d2f81..6358a85e2270 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -146,4 +146,36 @@ struct pci_bus;
 int x86_pci_root_bus_node(int bus);
 void x86_pci_root_bus_resources(int bus, struct list_head *resources);
 
+extern bool x86_topology_update;
+
+#ifdef CONFIG_SCHED_MC_PRIO
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+extern unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+/* Interface to set priority of a cpu */
+void sched_set_itmt_core_prio(int prio, int core_cpu);
+
+/* Interface to notify scheduler that system supports ITMT */
+int sched_set_itmt_support(void);
+
+/* Interface to notify scheduler that system revokes ITMT support */
+void sched_clear_itmt_support(void);
+
+#else /* CONFIG_SCHED_MC_PRIO */
+
+#define sysctl_sched_itmt_enabled	0
+static inline void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+}
+static inline int sched_set_itmt_support(void)
+{
+	return 0;
+}
+static inline void sched_clear_itmt_support(void)
+{
+}
+#endif /* CONFIG_SCHED_MC_PRIO */
+
 #endif /* _ASM_X86_TOPOLOGY_H */
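As a usage sketch of the interface declared above: a platform driver first publishes per-core priorities and then announces ITMT support, which registers the sysctl and triggers a sched-domain rebuild. This mirrors what the intel_pstate hunks later in this diff do; the driver below and its example_read_core_perf() helper are hypothetical, only the sched_set_itmt_*() calls are real.

/* Hypothetical driver snippet, for illustration only. */
#include <linux/cpumask.h>
#include <linux/printk.h>
#include <asm/topology.h>

/* Hypothetical per-core performance value; a real driver reads CPPC or MSRs. */
static int example_read_core_perf(int cpu)
{
	return 256 + cpu;	/* placeholder value */
}

static void example_enable_itmt(void)
{
	int cpu;

	/* Priorities first: a higher value marks a faster (preferred) core. */
	for_each_possible_cpu(cpu)
		sched_set_itmt_core_prio(example_read_core_perf(cpu), cpu);

	/*
	 * Then announce support; this registers kernel.sched_itmt_enabled
	 * and rebuilds the sched domains.  Must not be called with the
	 * CPU hotplug lock held.
	 */
	if (sched_set_itmt_support())
		pr_warn("could not enable ITMT scheduling\n");
}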
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index faf3687f1035..ea148313570f 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -68,6 +68,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 	__chk_range_not_ok((unsigned long __force)(addr), size, limit);	\
 })
 
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+# define WARN_ON_IN_IRQ()	WARN_ON_ONCE(!in_task())
+#else
+# define WARN_ON_IN_IRQ()
+#endif
+
 /**
  * access_ok: - Checks if a user space pointer is valid
  * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
@@ -88,8 +94,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  * checks that the pointer is in the user space range - after calling
  * this function, memory access functions may still return -EFAULT.
  */
-#define access_ok(type, addr, size) \
-	likely(!__range_not_ok(addr, size, user_addr_max()))
+#define access_ok(type, addr, size)					\
+({									\
+	WARN_ON_IN_IRQ();						\
+	likely(!__range_not_ok(addr, size, user_addr_max()));		\
+})
 
 /*
  * These are the main single-value transfer routines. They automatically
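With CONFIG_DEBUG_ATOMIC_SLEEP=y, the WARN_ON_IN_IRQ() added above turns a misuse pattern into a one-time warning: calling access_ok() outside of task context, where checking against the current task's address limit is meaningless. A contrived, hypothetical example of the kind of code the new check would flag:

/* Hypothetical example of the misuse WARN_ON_IN_IRQ() is meant to catch. */
#include <linux/interrupt.h>
#include <linux/uaccess.h>

static void __user *stashed_user_ptr;	/* hypothetical, set elsewhere */

static irqreturn_t example_irq_handler(int irq, void *dev_id)
{
	/*
	 * Wrong context: in_task() is false here, so this access_ok()
	 * now triggers WARN_ON_ONCE() instead of silently "succeeding".
	 */
	if (!access_ok(VERIFY_WRITE, stashed_user_ptr, sizeof(int)))
		return IRQ_HANDLED;
	/* ...any user access from IRQ context would be just as wrong... */
	return IRQ_HANDLED;
}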
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 79076d75bdbf..05110c1097ae 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
 
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
+obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o
 
 ifdef CONFIG_FRAME_POINTER
 obj-y					+= unwind_frame.o
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 51287cd90bf6..643818a7688b 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -906,14 +906,14 @@ static int apm_cpu_idle(struct cpuidle_device *dev,
 	static int use_apm_idle; /* = 0 */
 	static unsigned int last_jiffies; /* = 0 */
 	static unsigned int last_stime; /* = 0 */
-	cputime_t stime;
+	cputime_t stime, utime;
 
 	int apm_idle_done = 0;
 	unsigned int jiffies_since_last_check = jiffies - last_jiffies;
 	unsigned int bucket;
 
 recalc:
-	task_cputime(current, NULL, &stime);
+	task_cputime(current, &utime, &stime);
 	if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
 		use_apm_idle = 0;
 	} else if (jiffies_since_last_check > idle_period) {
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
new file mode 100644
index 000000000000..cb9c1ed1d391
--- /dev/null
+++ b/arch/x86/kernel/itmt.c
@@ -0,0 +1,215 @@
+/*
+ * itmt.c: Support Intel Turbo Boost Max Technology 3.0
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT),
+ * the maximum turbo frequencies of some cores in a CPU package may be
+ * higher than for the other cores in the same package.  In that case,
+ * better performance can be achieved by making the scheduler prefer
+ * to run tasks on the CPUs with higher max turbo frequencies.
+ *
+ * This file provides functions and data structures for enabling the
+ * scheduler to favor scheduling on cores can be boosted to a higher
+ * frequency under ITMT.
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/nodemask.h>
+
+static DEFINE_MUTEX(itmt_update_mutex);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+
+/* Boolean to track if system has ITMT capabilities */
+static bool __read_mostly sched_itmt_capable;
+
+/*
+ * Boolean to control whether we want to move processes to cpu capable
+ * of higher turbo frequency for cpus supporting Intel Turbo Boost Max
+ * Technology 3.0.
+ *
+ * It can be set via /proc/sys/kernel/sched_itmt_enabled
+ */
+unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+static int sched_itmt_update_handler(struct ctl_table *table, int write,
+				     void __user *buffer, size_t *lenp,
+				     loff_t *ppos)
+{
+	unsigned int old_sysctl;
+	int ret;
+
+	mutex_lock(&itmt_update_mutex);
+
+	if (!sched_itmt_capable) {
+		mutex_unlock(&itmt_update_mutex);
+		return -EINVAL;
+	}
+
+	old_sysctl = sysctl_sched_itmt_enabled;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) {
+		x86_topology_update = true;
+		rebuild_sched_domains();
+	}
+
+	mutex_unlock(&itmt_update_mutex);
+
+	return ret;
+}
+
+static unsigned int zero;
+static unsigned int one = 1;
+static struct ctl_table itmt_kern_table[] = {
+	{
+		.procname	= "sched_itmt_enabled",
+		.data		= &sysctl_sched_itmt_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_itmt_update_handler,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{}
+};
+
+static struct ctl_table itmt_root_table[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= itmt_kern_table,
+	},
+	{}
+};
+
+static struct ctl_table_header *itmt_sysctl_header;
+
+/**
+ * sched_set_itmt_support() - Indicate platform supports ITMT
+ *
+ * This function is used by the OS to indicate to scheduler that the platform
+ * is capable of supporting the ITMT feature.
+ *
+ * The current scheme has the pstate driver detects if the system
+ * is ITMT capable and call sched_set_itmt_support.
+ *
+ * This must be done only after sched_set_itmt_core_prio
+ * has been called to set the cpus' priorities.
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ *
+ * Return: 0 on success
+ */
+int sched_set_itmt_support(void)
+{
+	mutex_lock(&itmt_update_mutex);
+
+	if (sched_itmt_capable) {
+		mutex_unlock(&itmt_update_mutex);
+		return 0;
+	}
+
+	itmt_sysctl_header = register_sysctl_table(itmt_root_table);
+	if (!itmt_sysctl_header) {
+		mutex_unlock(&itmt_update_mutex);
+		return -ENOMEM;
+	}
+
+	sched_itmt_capable = true;
+
+	sysctl_sched_itmt_enabled = 1;
+
+	if (sysctl_sched_itmt_enabled) {
+		x86_topology_update = true;
+		rebuild_sched_domains();
+	}
+
+	mutex_unlock(&itmt_update_mutex);
+
+	return 0;
+}
+
+/**
+ * sched_clear_itmt_support() - Revoke platform's support of ITMT
+ *
+ * This function is used by the OS to indicate that it has
+ * revoked the platform's support of ITMT feature.
+ *
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ */
+void sched_clear_itmt_support(void)
+{
+	mutex_lock(&itmt_update_mutex);
+
+	if (!sched_itmt_capable) {
+		mutex_unlock(&itmt_update_mutex);
+		return;
+	}
+	sched_itmt_capable = false;
+
+	if (itmt_sysctl_header) {
+		unregister_sysctl_table(itmt_sysctl_header);
+		itmt_sysctl_header = NULL;
+	}
+
+	if (sysctl_sched_itmt_enabled) {
+		/* disable sched_itmt if we are no longer ITMT capable */
+		sysctl_sched_itmt_enabled = 0;
+		x86_topology_update = true;
+		rebuild_sched_domains();
+	}
+
+	mutex_unlock(&itmt_update_mutex);
+}
+
+int arch_asym_cpu_priority(int cpu)
+{
+	return per_cpu(sched_core_priority, cpu);
+}
+
+/**
+ * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+ * @prio:	Priority of cpu core
+ * @core_cpu:	The cpu number associated with the core
+ *
+ * The pstate driver will find out the max boost frequency
+ * and call this function to set a priority proportional
+ * to the max boost frequency. CPU with higher boost
+ * frequency will receive higher priority.
+ *
+ * No need to rebuild sched domain after updating
+ * the CPU priorities. The sched domains have no
+ * dependency on CPU priorities.
+ */
+void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+	int cpu, i = 1;
+
+	for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
+		int smt_prio;
+
+		/*
+		 * Ensure that the siblings are moved to the end
+		 * of the priority chain and only used when
+		 * all other high priority cpus are out of capacity.
+		 */
+		smt_prio = prio * smp_num_siblings / i;
+		per_cpu(sched_core_priority, cpu) = smt_prio;
+		i++;
+	}
+}
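To make the smt_prio arithmetic at the end of sched_set_itmt_core_prio() concrete: with two SMT siblings per core and a core priority of 40, the first sibling gets 40 * 2 / 1 = 80 and the second 40 * 2 / 2 = 40, pushing second hardware threads toward the end of the preference order. A small stand-alone illustration of the same formula (ordinary userspace C, not kernel code):

/* Stand-alone illustration of the smt_prio formula above; not kernel code. */
#include <stdio.h>

int main(void)
{
	int smp_num_siblings = 2;	/* assumed: 2 hardware threads per core */
	int prio = 40;			/* assumed core priority (e.g. CPPC highest_perf) */
	int i;

	for (i = 1; i <= smp_num_siblings; i++)
		printf("sibling %d -> smt_prio = %d\n", i,
		       prio * smp_num_siblings / i);
	/* prints 80 for the first sibling and 40 for the second */
	return 0;
}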
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b9f02383f372..118e792a7be6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -109,6 +109,17 @@ static bool logical_packages_frozen __read_mostly;
 /* Maximum number of SMT threads on any online core */
 int __max_smt_threads __read_mostly;
 
+/* Flag to indicate if a complete sched domain rebuild is required */
+bool x86_topology_update;
+
+int arch_update_cpu_topology(void)
+{
+	int retval = x86_topology_update;
+
+	x86_topology_update = false;
+	return retval;
+}
+
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
 	unsigned long flags;
@@ -471,22 +482,42 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }
 
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+static inline int x86_sched_itmt_flags(void)
+{
+	return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
+}
+
+#ifdef CONFIG_SCHED_MC
+static int x86_core_flags(void)
+{
+	return cpu_core_flags() | x86_sched_itmt_flags();
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static int x86_smt_flags(void)
+{
+	return cpu_smt_flags() | x86_sched_itmt_flags();
+}
+#endif
+#endif
+
 static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 #endif
 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
 	{ NULL, },
 };
 
 static struct sched_domain_topology_level x86_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 #endif
 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
 	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 	{ NULL, },
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 56190d00fd87..5cbefd7621f0 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -331,6 +331,16 @@ static void acpi_bus_osc_support(void)
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT;
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT;
 
+#ifdef CONFIG_X86
+	if (boot_cpu_has(X86_FEATURE_HWP)) {
+		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT;
+		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT;
+	}
+#endif
+
+	if (IS_ENABLED(CONFIG_SCHED_MC_PRIO))
+		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_DIVERSE_HIGH_SUPPORT;
+
 	if (!ghes_disable)
 		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_APEI_SUPPORT;
 	if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)))
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index adbd1de1cea5..35f71825b7f3 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -6,6 +6,7 @@ config X86_INTEL_PSTATE
 	bool "Intel P state control"
 	depends on X86
 	select ACPI_PROCESSOR if ACPI
+	select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO
 	help
 	  This driver provides a P state for Intel core processors.
 	  The driver implements an internal governor and will become
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4737520ec823..e8dc42fc0915 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -44,6 +44,7 @@
 
 #ifdef CONFIG_ACPI
 #include <acpi/processor.h>
+#include <acpi/cppc_acpi.h>
 #endif
 
 #define FRAC_BITS 8
@@ -379,14 +380,67 @@ static bool intel_pstate_get_ppc_enable_status(void)
 	return acpi_ppc;
 }
 
+#ifdef CONFIG_ACPI_CPPC_LIB
+
+/* The work item is needed to avoid CPU hotplug locking issues */
+static void intel_pstste_sched_itmt_work_fn(struct work_struct *work)
+{
+	sched_set_itmt_support();
+}
+
+static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn);
+
+static void intel_pstate_set_itmt_prio(int cpu)
+{
+	struct cppc_perf_caps cppc_perf;
+	static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
+	int ret;
+
+	ret = cppc_get_perf_caps(cpu, &cppc_perf);
+	if (ret)
+		return;
+
+	/*
+	 * The priorities can be set regardless of whether or not
+	 * sched_set_itmt_support(true) has been called and it is valid to
+	 * update them at any time after it has been called.
+	 */
+	sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);
+
+	if (max_highest_perf <= min_highest_perf) {
+		if (cppc_perf.highest_perf > max_highest_perf)
+			max_highest_perf = cppc_perf.highest_perf;
+
+		if (cppc_perf.highest_perf < min_highest_perf)
+			min_highest_perf = cppc_perf.highest_perf;
+
+		if (max_highest_perf > min_highest_perf) {
+			/*
+			 * This code can be run during CPU online under the
+			 * CPU hotplug locks, so sched_set_itmt_support()
+			 * cannot be called from here.  Queue up a work item
+			 * to invoke it.
+			 */
+			schedule_work(&sched_itmt_work);
+		}
+	}
+}
+#else
+static void intel_pstate_set_itmt_prio(int cpu)
+{
+}
+#endif
+
 static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 {
 	struct cpudata *cpu;
 	int ret;
 	int i;
 
-	if (hwp_active)
+	if (hwp_active) {
+		intel_pstate_set_itmt_prio(policy->cpu);
 		return;
+	}
 
 	if (!intel_pstate_get_ppc_enable_status())
 		return;
diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h
index fe386fc6e85e..6bb8cd45f53b 100644
--- a/include/asm-generic/cputime_jiffies.h
+++ b/include/asm-generic/cputime_jiffies.h
@@ -7,7 +7,6 @@ typedef unsigned long __nocast cputime_t;
 
 #define cputime_one_jiffy		jiffies_to_cputime(1)
 #define cputime_to_jiffies(__ct)	(__force unsigned long)(__ct)
-#define cputime_to_scaled(__ct)	(__ct)
 #define jiffies_to_cputime(__hz)	(__force cputime_t)(__hz)
 
 typedef u64 __nocast cputime64_t;
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index a84e28e0c634..4e3b18e559b1 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -34,7 +34,6 @@ typedef u64 __nocast cputime64_t;
  */
 #define cputime_to_jiffies(__ct)	\
 	cputime_div(__ct, NSEC_PER_SEC / HZ)
-#define cputime_to_scaled(__ct)	(__ct)
 #define jiffies_to_cputime(__jif)	\
 	(__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ))
 #define cputime64_to_jiffies64(__ct)	\
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 61a3d90f32b3..051023756520 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -469,6 +469,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_CPCV2_SUPPORT			0x00000040
 #define OSC_SB_PCLPI_SUPPORT			0x00000080
 #define OSC_SB_OSLPI_SUPPORT			0x00000100
+#define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT		0x00001000
 
 extern bool osc_sb_apei_support_acked;
 extern bool osc_pc_lpi_support_confirmed;
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 44fda64ad434..00f776816aa3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -78,8 +78,8 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 	return kstat_cpu(cpu).irqs_sum;
 }
 
-extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
-extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
+extern void account_user_time(struct task_struct *, cputime_t);
+extern void account_system_time(struct task_struct *, int, cputime_t);
 extern void account_steal_time(cputime_t);
 extern void account_idle_time(cputime_t);
 
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index a6e82a69c363..c1c3e63d52c1 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -48,6 +48,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 	__k;								\
 })
 
+void free_kthread_struct(struct task_struct *k);
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 75e4e30677f1..7eeceac52dea 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -65,19 +65,24 @@
 
 /*
  * Are we doing bottom half or hardware interrupt processing?
- * Are we in a softirq context? Interrupt context?
- * in_softirq - Are we currently processing softirq or have bh disabled?
- * in_serving_softirq - Are we currently processing softirq?
+ *
+ * in_irq()       - We're in (hard) IRQ context
+ * in_softirq()   - We have BH disabled, or are processing softirqs
+ * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled
+ * in_serving_softirq() - We're in softirq context
+ * in_nmi()       - We're in NMI context
+ * in_task()	  - We're in task context
+ *
+ * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really
+ *       should not be used in new code.
  */
 #define in_irq()		(hardirq_count())
 #define in_softirq()		(softirq_count())
 #define in_interrupt()		(irq_count())
 #define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
-
-/*
- * Are we in NMI context?
- */
-#define in_nmi()	(preempt_count() & NMI_MASK)
+#define in_nmi()		(preempt_count() & NMI_MASK)
+#define in_task()		(!(preempt_count() & \
+				   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
 
 /*
  * The preempt_count offset after preempt_disable();
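A short, hypothetical illustration of the new in_task() macro; the access_ok() hunk earlier uses it the same way via WARN_ON_ONCE(!in_task()). The helper below is not part of this patch:

/* Hypothetical helper showing a context-dependent decision via in_task(). */
#include <linux/preempt.h>
#include <linux/slab.h>

static void *example_alloc(size_t size)
{
	/*
	 * Process context may sleep, so GFP_KERNEL is fine; NMI, hard IRQ
	 * and softirq context (everything in_task() excludes) must not.
	 */
	return kmalloc(size, in_task() ? GFP_KERNEL : GFP_ATOMIC);
}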
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8863bdf582d5..7551d3e2ab70 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -262,20 +262,9 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_task_state(tsk, state_value)			\
 	do {							\
 		(tsk)->task_state_change = _THIS_IP_;		\
 		smp_store_mb((tsk)->state, (state_value));	\
 	} while (0)
 
-/*
- * set_current_state() includes a barrier so that the write of current->state
- * is correctly serialised wrt the caller's subsequent test of whether to
- * actually sleep:
- *
- *   set_current_state(TASK_UNINTERRUPTIBLE);
- *   if (do_i_need_to_sleep())
- *          schedule();
- *
- * If the caller does not need such serialisation then use __set_current_state()
- */
 #define __set_current_state(state_value)			\
 	do {							\
 		current->task_state_change = _THIS_IP_;		\
@@ -284,11 +273,19 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_current_state(state_value)				\
 	do {							\
 		current->task_state_change = _THIS_IP_;		\
 		smp_store_mb(current->state, (state_value));	\
 	} while (0)
 
 #else
 
+/*
+ * @tsk had better be current, or you get to keep the pieces.
+ *
+ * The only reason is that computing current can be more expensive than
+ * using a pointer that's already available.
+ *
+ * Therefore, see set_current_state().
+ */
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
 #define set_task_state(tsk, state_value)		\
@@ -299,11 +296,34 @@ extern char ___assert_task_state[1 - 2*!!(
  * is correctly serialised wrt the caller's subsequent test of whether to
  * actually sleep:
  *
+ *   for (;;) {
  *     set_current_state(TASK_UNINTERRUPTIBLE);
- *     if (do_i_need_to_sleep())
- *          schedule();
+ *     if (!need_sleep)
+ *       break;
+ *
+ *     schedule();
+ *   }
+ *   __set_current_state(TASK_RUNNING);
+ *
+ * If the caller does not need such serialisation (because, for instance, the
+ * condition test and condition change and wakeup are under the same lock) then
+ * use __set_current_state().
+ *
+ * The above is typically ordered against the wakeup, which does:
+ *
+ *   need_sleep = false;
+ *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
+ *
+ * Where wake_up_state() (and all other wakeup primitives) imply enough
+ * barriers to order the store of the variable against wakeup.
+ *
+ * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
+ * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
+ * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
  *
- * If the caller does not need such serialisation then use __set_current_state()
+ * This is obviously fine, since they both store the exact same value.
+ *
+ * Also see the comments of try_to_wake_up().
  */
 #define __set_current_state(state_value)		\
 	do { current->state = (state_value); } while (0)
@@ -1057,6 +1077,8 @@ static inline int cpu_numa_flags(void)
 }
 #endif
 
+extern int arch_asym_cpu_priority(int cpu);
+
 struct sched_domain_attr {
 	int relax_domain_level;
 };
@@ -1627,7 +1649,10 @@ struct task_struct {
 	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
-	cputime_t utime, stime, utimescaled, stimescaled;
+	cputime_t utime, stime;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
+	cputime_t utimescaled, stimescaled;
+#endif
 	cputime_t gtime;
 	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -2220,34 +2245,38 @@ struct task_struct *try_get_task_struct(struct task_struct **ptask);
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void task_cputime(struct task_struct *t,
 			 cputime_t *utime, cputime_t *stime);
-extern void task_cputime_scaled(struct task_struct *t,
-				cputime_t *utimescaled, cputime_t *stimescaled);
 extern cputime_t task_gtime(struct task_struct *t);
 #else
 static inline void task_cputime(struct task_struct *t,
 				cputime_t *utime, cputime_t *stime)
 {
-	if (utime)
-		*utime = t->utime;
-	if (stime)
-		*stime = t->stime;
+	*utime = t->utime;
+	*stime = t->stime;
 }
 
+static inline cputime_t task_gtime(struct task_struct *t)
+{
+	return t->gtime;
+}
+#endif
+
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 static inline void task_cputime_scaled(struct task_struct *t,
 					cputime_t *utimescaled,
 					cputime_t *stimescaled)
 {
-	if (utimescaled)
-		*utimescaled = t->utimescaled;
-	if (stimescaled)
-		*stimescaled = t->stimescaled;
+	*utimescaled = t->utimescaled;
+	*stimescaled = t->stimescaled;
 }
-
-static inline cputime_t task_gtime(struct task_struct *t)
+#else
+static inline void task_cputime_scaled(struct task_struct *t,
+					cputime_t *utimescaled,
+					cputime_t *stimescaled)
 {
-	return t->gtime;
+	task_cputime(t, utimescaled, stimescaled);
 }
 #endif
+
 extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 22db1e63707e..441145351301 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -36,7 +36,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_time_avg;
-extern unsigned int sysctl_sched_shares_window;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
diff --git a/kernel/fork.c b/kernel/fork.c
index 997ac1d584f7..7ffa16033ded 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -354,6 +354,8 @@ void free_task(struct task_struct *tsk)
 	ftrace_graph_exit_task(tsk);
 	put_seccomp_filter(tsk);
 	arch_release_task_struct(tsk);
+	if (tsk->flags & PF_KTHREAD)
+		free_kthread_struct(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1551,7 +1553,9 @@ static __latent_entropy struct task_struct *copy_process(
 	init_sigpending(&p->pending);
 
 	p->utime = p->stime = p->gtime = 0;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 	p->utimescaled = p->stimescaled = 0;
+#endif
 	prev_cputime_init(&p->prev_cputime);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
diff --git a/kernel/kthread.c b/kernel/kthread.c
index be2cc1f9dd57..956495f0efaf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -53,20 +53,29 @@ enum KTHREAD_BITS {
 	KTHREAD_IS_PARKED,
 };
 
-#define __to_kthread(vfork)	\
-	container_of(vfork, struct kthread, exited)
+static inline void set_kthread_struct(void *kthread)
+{
+	/*
+	 * We abuse ->set_child_tid to avoid the new member and because it
+	 * can't be wrongly copied by copy_process(). We also rely on fact
+	 * that the caller can't exec, so PF_KTHREAD can't be cleared.
+	 */
+	current->set_child_tid = (__force void __user *)kthread;
+}
 
 static inline struct kthread *to_kthread(struct task_struct *k)
 {
-	return __to_kthread(k->vfork_done);
+	WARN_ON(!(k->flags & PF_KTHREAD));
+	return (__force void *)k->set_child_tid;
 }
 
-static struct kthread *to_live_kthread(struct task_struct *k)
+void free_kthread_struct(struct task_struct *k)
 {
-	struct completion *vfork = ACCESS_ONCE(k->vfork_done);
-	if (likely(vfork) && try_get_task_stack(k))
-		return __to_kthread(vfork);
-	return NULL;
+	/*
+	 * Can be NULL if this kthread was created by kernel_thread()
+	 * or if kmalloc() in kthread() failed.
+	 */
+	kfree(to_kthread(k));
 }
 
 /**
@@ -181,14 +190,11 @@ static int kthread(void *_create)
 	int (*threadfn)(void *data) = create->threadfn;
 	void *data = create->data;
 	struct completion *done;
-	struct kthread self;
+	struct kthread *self;
 	int ret;
 
-	self.flags = 0;
-	self.data = data;
-	init_completion(&self.exited);
-	init_completion(&self.parked);
-	current->vfork_done = &self.exited;
+	self = kmalloc(sizeof(*self), GFP_KERNEL);
+	set_kthread_struct(self);
 
 	/* If user was SIGKILLed, I release the structure. */
 	done = xchg(&create->done, NULL);
@@ -196,6 +202,19 @@ static int kthread(void *_create)
 		kfree(create);
 		do_exit(-EINTR);
 	}
+
+	if (!self) {
+		create->result = ERR_PTR(-ENOMEM);
+		complete(done);
+		do_exit(-ENOMEM);
+	}
+
+	self->flags = 0;
+	self->data = data;
+	init_completion(&self->exited);
+	init_completion(&self->parked);
+	current->vfork_done = &self->exited;
+
 	/* OK, tell user we're spawned, wait for stop or wakeup */
 	__set_current_state(TASK_UNINTERRUPTIBLE);
 	create->result = current;
@@ -203,12 +222,10 @@ static int kthread(void *_create)
 	schedule();
 
 	ret = -EINTR;
-
-	if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
-		__kthread_parkme(&self);
+	if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) {
+		__kthread_parkme(self);
 		ret = threadfn(data);
 	}
-	/* we can't just return, we must preserve "self" on stack */
 	do_exit(ret);
 }
214 231
@@ -409,8 +426,18 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 	return p;
 }
 
-static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
+/**
+ * kthread_unpark - unpark a thread created by kthread_create().
+ * @k:		thread created by kthread_create().
+ *
+ * Sets kthread_should_park() for @k to return false, wakes it, and
+ * waits for it to return. If the thread is marked percpu then its
+ * bound to the cpu again.
+ */
+void kthread_unpark(struct task_struct *k)
 {
+	struct kthread *kthread = to_kthread(k);
+
 	clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
 	/*
 	 * We clear the IS_PARKED bit here as we don't wait
@@ -428,24 +455,6 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
 		wake_up_state(k, TASK_PARKED);
 	}
 }
-
-/**
- * kthread_unpark - unpark a thread created by kthread_create().
- * @k:		thread created by kthread_create().
- *
- * Sets kthread_should_park() for @k to return false, wakes it, and
- * waits for it to return. If the thread is marked percpu then its
- * bound to the cpu again.
- */
-void kthread_unpark(struct task_struct *k)
-{
-	struct kthread *kthread = to_live_kthread(k);
-
-	if (kthread) {
-		__kthread_unpark(k, kthread);
-		put_task_stack(k);
-	}
-}
 EXPORT_SYMBOL_GPL(kthread_unpark);
 
 /**
@@ -462,21 +471,20 @@ EXPORT_SYMBOL_GPL(kthread_unpark);
  */
 int kthread_park(struct task_struct *k)
 {
-	struct kthread *kthread = to_live_kthread(k);
-	int ret = -ENOSYS;
-
-	if (kthread) {
-		if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
-			set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
-			if (k != current) {
-				wake_up_process(k);
-				wait_for_completion(&kthread->parked);
-			}
+	struct kthread *kthread = to_kthread(k);
+
+	if (WARN_ON(k->flags & PF_EXITING))
+		return -ENOSYS;
+
+	if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+		set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+		if (k != current) {
+			wake_up_process(k);
+			wait_for_completion(&kthread->parked);
 		}
-		put_task_stack(k);
-		ret = 0;
 	}
-	return ret;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(kthread_park);
482 490
@@ -503,14 +511,11 @@ int kthread_stop(struct task_struct *k)
 	trace_sched_kthread_stop(k);
 
 	get_task_struct(k);
-	kthread = to_live_kthread(k);
-	if (kthread) {
-		set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
-		__kthread_unpark(k, kthread);
-		wake_up_process(k);
-		wait_for_completion(&kthread->exited);
-		put_task_stack(k);
-	}
+	kthread = to_kthread(k);
+	set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
+	kthread_unpark(k);
+	wake_up_process(k);
+	wait_for_completion(&kthread->exited);
 	ret = k->exit_code;
 	put_task_struct(k);
 
@@ -636,6 +641,7 @@ __kthread_create_worker(int cpu, unsigned int flags,
 {
 	struct kthread_worker *worker;
 	struct task_struct *task;
+	int node = -1;
 
 	worker = kzalloc(sizeof(*worker), GFP_KERNEL);
 	if (!worker)
@@ -643,25 +649,17 @@ __kthread_create_worker(int cpu, unsigned int flags,
 
 	kthread_init_worker(worker);
 
-	if (cpu >= 0) {
-		char name[TASK_COMM_LEN];
-
-		/*
-		 * kthread_create_worker_on_cpu() allows to pass a generic
-		 * namefmt in compare with kthread_create_on_cpu. We need
-		 * to format it here.
-		 */
-		vsnprintf(name, sizeof(name), namefmt, args);
-		task = kthread_create_on_cpu(kthread_worker_fn, worker,
-					     cpu, name);
-	} else {
-		task = __kthread_create_on_node(kthread_worker_fn, worker,
-						-1, namefmt, args);
-	}
+	if (cpu >= 0)
+		node = cpu_to_node(cpu);
 
+	task = __kthread_create_on_node(kthread_worker_fn, worker,
+						node, namefmt, args);
 	if (IS_ERR(task))
 		goto fail_task;
 
+	if (cpu >= 0)
+		kthread_bind(task, cpu);
+
 	worker->flags = flags;
 	worker->task = task;
 	wake_up_process(task);
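For context, the lifetime being reworked above is the one behind the ordinary kthread_run()/kthread_stop() pattern: struct kthread is now kmalloc()ed, found via ->set_child_tid, and freed from free_task() (see the kernel/fork.c hunk). A minimal, conventional user of that API, shown only for illustration:

/* Hypothetical kthread user; standard API, not part of this patch. */
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *example_task;

static int example_thread_fn(void *data)
{
	while (!kthread_should_stop())
		msleep(100);	/* periodic work would go here */
	return 0;
}

static int example_start(void)
{
	example_task = kthread_run(example_thread_fn, NULL, "example");
	return PTR_ERR_OR_ZERO(example_task);
}

static void example_stop(void)
{
	/* Wakes the thread, waits for it to exit, returns its exit code. */
	kthread_stop(example_task);
}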
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8b08fb257856..d18804491d9f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1995,14 +1995,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  * @state: the mask of task states that can be woken
  * @wake_flags: wake modifier flags (WF_*)
  *
- * Put it on the run-queue if it's not already there. The "current"
- * thread is always on the run-queue (except when the actual
- * re-schedule is in progress), and as such you're allowed to do
- * the simpler "current->state = TASK_RUNNING" to mark yourself
- * runnable without the overhead of this.
+ * If (@state & @p->state) @p->state = TASK_RUNNING.
  *
- * Return: %true if @p was woken up, %false if it was already running.
- * or @state didn't match @p's state.
+ * If the task was not queued/runnable, also place it back on a runqueue.
+ *
+ * Atomic against schedule() which would dequeue a task, also see
+ * set_current_state().
+ *
+ * Return: %true if @p->state changes (an actual wakeup was done),
+ *	   %false otherwise.
  */
 static int
 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
@@ -5707,7 +5708,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		printk(KERN_CONT " %*pbl",
 		       cpumask_pr_args(sched_group_cpus(group)));
 		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
-			printk(KERN_CONT " (cpu_capacity = %d)",
+			printk(KERN_CONT " (cpu_capacity = %lu)",
 				group->sgc->capacity);
 		}
 
@@ -6184,6 +6185,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 	 * die on a /0 trap.
 	 */
 	sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+	sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
 
 	/*
 	 * Make sure the first group of this domain contains the
@@ -6301,7 +6303,22 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 	WARN_ON(!sg);
 
 	do {
+		int cpu, max_cpu = -1;
+
 		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+
+		if (!(sd->flags & SD_ASYM_PACKING))
+			goto next;
+
+		for_each_cpu(cpu, sched_group_cpus(sg)) {
+			if (max_cpu < 0)
+				max_cpu = cpu;
+			else if (sched_asym_prefer(cpu, max_cpu))
+				max_cpu = cpu;
+		}
+		sg->asym_prefer_cpu = max_cpu;
+
+next:
 		sg = sg->next;
 	} while (sg != sd->groups);
 
@@ -7602,6 +7619,7 @@ void __init sched_init(void)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+		rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
 		/*
 		 * How much cpu bandwidth does root_task_group get?
 		 *
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index bc0b309c3f19..9add206b5608 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v)
297 for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { 297 for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
298 seq_printf(sf, "%s %lld\n", 298 seq_printf(sf, "%s %lld\n",
299 cpuacct_stat_desc[stat], 299 cpuacct_stat_desc[stat],
300 cputime64_to_clock_t(val[stat])); 300 (long long)cputime64_to_clock_t(val[stat]));
301 } 301 }
302 302
303 return 0; 303 return 0;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 5ebee3164e64..7700a9cba335 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -128,16 +128,13 @@ static inline void task_group_account_field(struct task_struct *p, int index,
128 * Account user cpu time to a process. 128 * Account user cpu time to a process.
129 * @p: the process that the cpu time gets accounted to 129 * @p: the process that the cpu time gets accounted to
130 * @cputime: the cpu time spent in user space since the last update 130 * @cputime: the cpu time spent in user space since the last update
131 * @cputime_scaled: cputime scaled by cpu frequency
132 */ 131 */
133void account_user_time(struct task_struct *p, cputime_t cputime, 132void account_user_time(struct task_struct *p, cputime_t cputime)
134 cputime_t cputime_scaled)
135{ 133{
136 int index; 134 int index;
137 135
138 /* Add user time to process. */ 136 /* Add user time to process. */
139 p->utime += cputime; 137 p->utime += cputime;
140 p->utimescaled += cputime_scaled;
141 account_group_user_time(p, cputime); 138 account_group_user_time(p, cputime);
142 139
143 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; 140 index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
@@ -153,16 +150,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
153 * Account guest cpu time to a process. 150 * Account guest cpu time to a process.
154 * @p: the process that the cpu time gets accounted to 151 * @p: the process that the cpu time gets accounted to
155 * @cputime: the cpu time spent in virtual machine since the last update 152 * @cputime: the cpu time spent in virtual machine since the last update
156 * @cputime_scaled: cputime scaled by cpu frequency
157 */ 153 */
158static void account_guest_time(struct task_struct *p, cputime_t cputime, 154static void account_guest_time(struct task_struct *p, cputime_t cputime)
159 cputime_t cputime_scaled)
160{ 155{
161 u64 *cpustat = kcpustat_this_cpu->cpustat; 156 u64 *cpustat = kcpustat_this_cpu->cpustat;
162 157
163 /* Add guest time to process. */ 158 /* Add guest time to process. */
164 p->utime += cputime; 159 p->utime += cputime;
165 p->utimescaled += cputime_scaled;
166 account_group_user_time(p, cputime); 160 account_group_user_time(p, cputime);
167 p->gtime += cputime; 161 p->gtime += cputime;
168 162
@@ -180,16 +174,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
180 * Account system cpu time to a process and desired cpustat field 174 * Account system cpu time to a process and desired cpustat field
181 * @p: the process that the cpu time gets accounted to 175 * @p: the process that the cpu time gets accounted to
182 * @cputime: the cpu time spent in kernel space since the last update 176 * @cputime: the cpu time spent in kernel space since the last update
183 * @cputime_scaled: cputime scaled by cpu frequency 177 * @index: index of the cpustat field that has to be updated
184 * @target_cputime64: pointer to cpustat field that has to be updated
185 */ 178 */
186static inline 179static inline
187void __account_system_time(struct task_struct *p, cputime_t cputime, 180void __account_system_time(struct task_struct *p, cputime_t cputime, int index)
188 cputime_t cputime_scaled, int index)
189{ 181{
190 /* Add system time to process. */ 182 /* Add system time to process. */
191 p->stime += cputime; 183 p->stime += cputime;
192 p->stimescaled += cputime_scaled;
193 account_group_system_time(p, cputime); 184 account_group_system_time(p, cputime);
194 185
195 /* Add system time to cpustat. */ 186 /* Add system time to cpustat. */
@@ -204,15 +195,14 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
204 * @p: the process that the cpu time gets accounted to 195 * @p: the process that the cpu time gets accounted to
205 * @hardirq_offset: the offset to subtract from hardirq_count() 196 * @hardirq_offset: the offset to subtract from hardirq_count()
206 * @cputime: the cpu time spent in kernel space since the last update 197 * @cputime: the cpu time spent in kernel space since the last update
207 * @cputime_scaled: cputime scaled by cpu frequency
208 */ 198 */
209void account_system_time(struct task_struct *p, int hardirq_offset, 199void account_system_time(struct task_struct *p, int hardirq_offset,
210 cputime_t cputime, cputime_t cputime_scaled) 200 cputime_t cputime)
211{ 201{
212 int index; 202 int index;
213 203
214 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 204 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
215 account_guest_time(p, cputime, cputime_scaled); 205 account_guest_time(p, cputime);
216 return; 206 return;
217 } 207 }
218 208
@@ -223,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
223 else 213 else
224 index = CPUTIME_SYSTEM; 214 index = CPUTIME_SYSTEM;
225 215
226 __account_system_time(p, cputime, cputime_scaled, index); 216 __account_system_time(p, cputime, index);
227} 217}
228 218
229/* 219/*
@@ -390,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
390 struct rq *rq, int ticks) 380 struct rq *rq, int ticks)
391{ 381{
392 u64 cputime = (__force u64) cputime_one_jiffy * ticks; 382 u64 cputime = (__force u64) cputime_one_jiffy * ticks;
393 cputime_t scaled, other; 383 cputime_t other;
394 384
395 /* 385 /*
396 * When returning from idle, many ticks can get accounted at 386 * When returning from idle, many ticks can get accounted at
@@ -403,7 +393,6 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
403 if (other >= cputime) 393 if (other >= cputime)
404 return; 394 return;
405 cputime -= other; 395 cputime -= other;
406 scaled = cputime_to_scaled(cputime);
407 396
408 if (this_cpu_ksoftirqd() == p) { 397 if (this_cpu_ksoftirqd() == p) {
409 /* 398 /*
@@ -411,15 +400,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
411 * So, we have to handle it separately here. 400 * So, we have to handle it separately here.
412 * Also, p->stime needs to be updated for ksoftirqd. 401 * Also, p->stime needs to be updated for ksoftirqd.
413 */ 402 */
414 __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); 403 __account_system_time(p, cputime, CPUTIME_SOFTIRQ);
415 } else if (user_tick) { 404 } else if (user_tick) {
416 account_user_time(p, cputime, scaled); 405 account_user_time(p, cputime);
417 } else if (p == rq->idle) { 406 } else if (p == rq->idle) {
418 account_idle_time(cputime); 407 account_idle_time(cputime);
419 } else if (p->flags & PF_VCPU) { /* System time or guest time */ 408 } else if (p->flags & PF_VCPU) { /* System time or guest time */
420 account_guest_time(p, cputime, scaled); 409 account_guest_time(p, cputime);
421 } else { 410 } else {
422 __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); 411 __account_system_time(p, cputime, CPUTIME_SYSTEM);
423 } 412 }
424} 413}
425 414
@@ -502,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
502 */ 491 */
503void account_process_tick(struct task_struct *p, int user_tick) 492void account_process_tick(struct task_struct *p, int user_tick)
504{ 493{
505 cputime_t cputime, scaled, steal; 494 cputime_t cputime, steal;
506 struct rq *rq = this_rq(); 495 struct rq *rq = this_rq();
507 496
508 if (vtime_accounting_cpu_enabled()) 497 if (vtime_accounting_cpu_enabled())
@@ -520,12 +509,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
520 return; 509 return;
521 510
522 cputime -= steal; 511 cputime -= steal;
523 scaled = cputime_to_scaled(cputime);
524 512
525 if (user_tick) 513 if (user_tick)
526 account_user_time(p, cputime, scaled); 514 account_user_time(p, cputime);
527 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) 515 else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
528 account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); 516 account_system_time(p, HARDIRQ_OFFSET, cputime);
529 else 517 else
530 account_idle_time(cputime); 518 account_idle_time(cputime);
531} 519}
@@ -746,7 +734,7 @@ static void __vtime_account_system(struct task_struct *tsk)
746{ 734{
747 cputime_t delta_cpu = get_vtime_delta(tsk); 735 cputime_t delta_cpu = get_vtime_delta(tsk);
748 736
749 account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); 737 account_system_time(tsk, irq_count(), delta_cpu);
750} 738}
751 739
752void vtime_account_system(struct task_struct *tsk) 740void vtime_account_system(struct task_struct *tsk)
@@ -767,7 +755,7 @@ void vtime_account_user(struct task_struct *tsk)
767 tsk->vtime_snap_whence = VTIME_SYS; 755 tsk->vtime_snap_whence = VTIME_SYS;
768 if (vtime_delta(tsk)) { 756 if (vtime_delta(tsk)) {
769 delta_cpu = get_vtime_delta(tsk); 757 delta_cpu = get_vtime_delta(tsk);
770 account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); 758 account_user_time(tsk, delta_cpu);
771 } 759 }
772 write_seqcount_end(&tsk->vtime_seqcount); 760 write_seqcount_end(&tsk->vtime_seqcount);
773} 761}
@@ -863,29 +851,25 @@ cputime_t task_gtime(struct task_struct *t)
863 * add up the pending nohz execution time since the last 851 * add up the pending nohz execution time since the last
864 * cputime snapshot. 852 * cputime snapshot.
865 */ 853 */
866static void 854void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
867fetch_task_cputime(struct task_struct *t,
868 cputime_t *u_dst, cputime_t *s_dst,
869 cputime_t *u_src, cputime_t *s_src,
870 cputime_t *udelta, cputime_t *sdelta)
871{ 855{
856 cputime_t delta;
872 unsigned int seq; 857 unsigned int seq;
873 unsigned long long delta;
874 858
875 do { 859 if (!vtime_accounting_enabled()) {
876 *udelta = 0; 860 *utime = t->utime;
877 *sdelta = 0; 861 *stime = t->stime;
862 return;
863 }
878 864
865 do {
879 seq = read_seqcount_begin(&t->vtime_seqcount); 866 seq = read_seqcount_begin(&t->vtime_seqcount);
880 867
881 if (u_dst) 868 *utime = t->utime;
882 *u_dst = *u_src; 869 *stime = t->stime;
883 if (s_dst)
884 *s_dst = *s_src;
885 870
886 /* Task is sleeping, nothing to add */ 871 /* Task is sleeping, nothing to add */
887 if (t->vtime_snap_whence == VTIME_INACTIVE || 872 if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
888 is_idle_task(t))
889 continue; 873 continue;
890 874
891 delta = vtime_delta(t); 875 delta = vtime_delta(t);
@@ -894,54 +878,10 @@ fetch_task_cputime(struct task_struct *t,
894 * Task runs either in user or kernel space, add pending nohz time to 878 * Task runs either in user or kernel space, add pending nohz time to
895 * the right place. 879 * the right place.
896 */ 880 */
897 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { 881 if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
898 *udelta = delta; 882 *utime += delta;
899 } else { 883 else if (t->vtime_snap_whence == VTIME_SYS)
900 if (t->vtime_snap_whence == VTIME_SYS) 884 *stime += delta;
901 *sdelta = delta;
902 }
903 } while (read_seqcount_retry(&t->vtime_seqcount, seq)); 885 } while (read_seqcount_retry(&t->vtime_seqcount, seq));
904} 886}
905
906
907void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
908{
909 cputime_t udelta, sdelta;
910
911 if (!vtime_accounting_enabled()) {
912 if (utime)
913 *utime = t->utime;
914 if (stime)
915 *stime = t->stime;
916 return;
917 }
918
919 fetch_task_cputime(t, utime, stime, &t->utime,
920 &t->stime, &udelta, &sdelta);
921 if (utime)
922 *utime += udelta;
923 if (stime)
924 *stime += sdelta;
925}
926
927void task_cputime_scaled(struct task_struct *t,
928 cputime_t *utimescaled, cputime_t *stimescaled)
929{
930 cputime_t udelta, sdelta;
931
932 if (!vtime_accounting_enabled()) {
933 if (utimescaled)
934 *utimescaled = t->utimescaled;
935 if (stimescaled)
936 *stimescaled = t->stimescaled;
937 return;
938 }
939
940 fetch_task_cputime(t, utimescaled, stimescaled,
941 &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
942 if (utimescaled)
943 *utimescaled += cputime_to_scaled(udelta);
944 if (stimescaled)
945 *stimescaled += cputime_to_scaled(sdelta);
946}
947#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ 887#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
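
The consolidated task_cputime() above drops fetch_task_cputime(): it copies the stored utime/stime and, when nohz-full vtime accounting is active, adds the pending delta to exactly one of the two depending on vtime_snap_whence. The single-threaded sketch below models only that accumulation; the seqcount retry loop, cputime_t and the idle-task check are left out, and the struct and function names are made up for illustration.

#include <stdio.h>

enum vtime_state { VTIME_INACTIVE, VTIME_USER, VTIME_SYS };

struct task_times {
	unsigned long long utime, stime;	/* already-accounted time */
	enum vtime_state whence;		/* where the pending delta belongs */
	int vcpu;				/* stands in for PF_VCPU */
};

static void task_cputime_model(const struct task_times *t,
			       unsigned long long delta,	/* pending nohz time */
			       unsigned long long *utime,
			       unsigned long long *stime)
{
	*utime = t->utime;
	*stime = t->stime;

	if (t->whence == VTIME_INACTIVE)	/* task is sleeping, nothing to add */
		return;

	if (t->whence == VTIME_USER || t->vcpu)
		*utime += delta;
	else if (t->whence == VTIME_SYS)
		*stime += delta;
}

int main(void)
{
	struct task_times t = { .utime = 100, .stime = 40, .whence = VTIME_SYS };
	unsigned long long u, s;

	task_cputime_model(&t, 7, &u, &s);
	printf("utime=%llu stime=%llu\n", u, s);	/* utime=100 stime=47 */
	return 0;
}
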
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 37e2449186c4..70ef2b1901e4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -586,7 +586,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
586 586
587 /* 587 /*
588 * The task might have changed its scheduling policy to something 588 * The task might have changed its scheduling policy to something
589 * different than SCHED_DEADLINE (through switched_fromd_dl()). 589 * different than SCHED_DEADLINE (through switched_from_dl()).
590 */ 590 */
591 if (!dl_task(p)) { 591 if (!dl_task(p)) {
592 __dl_clear_params(p); 592 __dl_clear_params(p);
@@ -1137,7 +1137,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo
1137 pull_dl_task(rq); 1137 pull_dl_task(rq);
1138 lockdep_repin_lock(&rq->lock, cookie); 1138 lockdep_repin_lock(&rq->lock, cookie);
1139 /* 1139 /*
1140 * pull_rt_task() can drop (and re-acquire) rq->lock; this 1140 * pull_dl_task() can drop (and re-acquire) rq->lock; this
1141 * means a stop task can slip in, in which case we need to 1141 * means a stop task can slip in, in which case we need to
1142 * re-start task selection. 1142 * re-start task selection.
1143 */ 1143 */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c242944f5cbd..6559d197e08a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -37,7 +37,6 @@
37 37
38/* 38/*
39 * Targeted preemption latency for CPU-bound tasks: 39 * Targeted preemption latency for CPU-bound tasks:
40 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
41 * 40 *
42 * NOTE: this latency value is not the same as the concept of 41 * NOTE: this latency value is not the same as the concept of
43 * 'timeslice length' - timeslices in CFS are of variable length 42 * 'timeslice length' - timeslices in CFS are of variable length
@@ -46,31 +45,35 @@
46 * 45 *
47 * (to see the precise effective timeslice length of your workload, 46 * (to see the precise effective timeslice length of your workload,
48 * run vmstat and monitor the context-switches (cs) field) 47 * run vmstat and monitor the context-switches (cs) field)
48 *
49 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
49 */ 50 */
50unsigned int sysctl_sched_latency = 6000000ULL; 51unsigned int sysctl_sched_latency = 6000000ULL;
51unsigned int normalized_sysctl_sched_latency = 6000000ULL; 52unsigned int normalized_sysctl_sched_latency = 6000000ULL;
52 53
53/* 54/*
54 * The initial- and re-scaling of tunables is configurable 55 * The initial- and re-scaling of tunables is configurable
55 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
56 * 56 *
57 * Options are: 57 * Options are:
58 * SCHED_TUNABLESCALING_NONE - unscaled, always *1 58 *
59 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) 59 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
60 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus 60 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
61 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
62 *
63 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
61 */ 64 */
62enum sched_tunable_scaling sysctl_sched_tunable_scaling 65enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
63 = SCHED_TUNABLESCALING_LOG;
64 66
65/* 67/*
66 * Minimal preemption granularity for CPU-bound tasks: 68 * Minimal preemption granularity for CPU-bound tasks:
69 *
67 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) 70 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
68 */ 71 */
69unsigned int sysctl_sched_min_granularity = 750000ULL; 72unsigned int sysctl_sched_min_granularity = 750000ULL;
70unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; 73unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
71 74
72/* 75/*
73 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 76 * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
74 */ 77 */
75static unsigned int sched_nr_latency = 8; 78static unsigned int sched_nr_latency = 8;
76 79
@@ -82,23 +85,27 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
82 85
83/* 86/*
84 * SCHED_OTHER wake-up granularity. 87 * SCHED_OTHER wake-up granularity.
85 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
86 * 88 *
87 * This option delays the preemption effects of decoupled workloads 89 * This option delays the preemption effects of decoupled workloads
88 * and reduces their over-scheduling. Synchronous workloads will still 90 * and reduces their over-scheduling. Synchronous workloads will still
89 * have immediate wakeup/sleep latencies. 91 * have immediate wakeup/sleep latencies.
92 *
93 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
90 */ 94 */
91unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 95unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
92unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; 96unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
93 97
94const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 98const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
95 99
100#ifdef CONFIG_SMP
96/* 101/*
97 * The exponential sliding window over which load is averaged for shares 102 * For asym packing, by default the lower numbered cpu has higher priority.
98 * distribution.
99 * (default: 10msec)
100 */ 103 */
101unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 104int __weak arch_asym_cpu_priority(int cpu)
105{
106 return -cpu;
107}
108#endif
102 109
103#ifdef CONFIG_CFS_BANDWIDTH 110#ifdef CONFIG_CFS_BANDWIDTH
104/* 111/*
@@ -109,16 +116,18 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
109 * to consumption or the quota being specified to be smaller than the slice) 116 * to consumption or the quota being specified to be smaller than the slice)
110 * we will always only issue the remaining available time. 117 * we will always only issue the remaining available time.
111 * 118 *
112 * default: 5 msec, units: microseconds 119 * (default: 5 msec, units: microseconds)
113 */ 120 */
114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 121unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
115#endif 122#endif
116 123
117/* 124/*
118 * The margin used when comparing utilization with CPU capacity: 125 * The margin used when comparing utilization with CPU capacity:
119 * util * 1024 < capacity * margin 126 * util * margin < capacity * 1024
127 *
128 * (default: ~20%)
120 */ 129 */
121unsigned int capacity_margin = 1280; /* ~20% */ 130unsigned int capacity_margin = 1280;
122 131
123static inline void update_load_add(struct load_weight *lw, unsigned long inc) 132static inline void update_load_add(struct load_weight *lw, unsigned long inc)
124{ 133{
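
The corrected comment above spells the capacity test as util * margin < capacity * 1024, so with the default capacity_margin of 1280 a CPU is considered to fit a load only while utilization stays below capacity * 1024/1280, roughly 80% (about 20% headroom). A small numeric sketch of that inequality; fits_capacity_model() is an illustrative name, not a kernel function.

#include <stdbool.h>
#include <stdio.h>

static const unsigned long capacity_margin = 1280;	/* ~20% headroom */

/* util * margin < capacity * 1024, as in the updated comment. */
static bool fits_capacity_model(unsigned long util, unsigned long capacity)
{
	return util * capacity_margin < capacity * 1024;
}

int main(void)
{
	/* For a full-size CPU (capacity 1024) the cutoff is 1024*1024/1280 = 819.2. */
	printf("%d\n", fits_capacity_model(800, 1024));	/* 1: fits */
	printf("%d\n", fits_capacity_model(820, 1024));	/* 0: too close to capacity */
	return 0;
}
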
@@ -290,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
290static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 299static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
291{ 300{
292 if (!cfs_rq->on_list) { 301 if (!cfs_rq->on_list) {
302 struct rq *rq = rq_of(cfs_rq);
303 int cpu = cpu_of(rq);
293 /* 304 /*
294 * Ensure we either appear before our parent (if already 305 * Ensure we either appear before our parent (if already
295 * enqueued) or force our parent to appear after us when it is 306 * enqueued) or force our parent to appear after us when it is
296 * enqueued. The fact that we always enqueue bottom-up 307 * enqueued. The fact that we always enqueue bottom-up
297 * reduces this to two cases. 308 * reduces this to two cases and a special case for the root
309 * cfs_rq. Furthermore, it also means that we will always reset
310 * tmp_alone_branch either when the branch is connected
311 * to a tree or when we reach the beginning of the tree
298 */ 312 */
299 if (cfs_rq->tg->parent && 313 if (cfs_rq->tg->parent &&
300 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { 314 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
301 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, 315 /*
302 &rq_of(cfs_rq)->leaf_cfs_rq_list); 316 * If parent is already on the list, we add the child
303 } else { 317 * just before. Thanks to circular linked property of
318 * the list, this means to put the child at the tail
319 * of the list that starts by parent.
320 */
304 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, 321 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
305 &rq_of(cfs_rq)->leaf_cfs_rq_list); 322 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
323 /*
324 * The branch is now connected to its tree so we can
325 * reset tmp_alone_branch to the beginning of the
326 * list.
327 */
328 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
329 } else if (!cfs_rq->tg->parent) {
330 /*
331 * cfs rq without parent should be put
332 * at the tail of the list.
333 */
334 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
335 &rq->leaf_cfs_rq_list);
336 /*
337 * We have reached the beginning of a tree so we can reset
338 * tmp_alone_branch to the beginning of the list.
339 */
340 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
341 } else {
342 /*
343 * The parent has not already been added so we want to
344 * make sure that it will be put after us.
345 * tmp_alone_branch points to the beginning of the branch
346 * where we will add parent.
347 */
348 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
349 rq->tmp_alone_branch);
350 /*
351 * Update tmp_alone_branch to point to the new beginning
352 * of the branch
353 */
354 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
306 } 355 }
307 356
308 cfs_rq->on_list = 1; 357 cfs_rq->on_list = 1;
@@ -708,9 +757,7 @@ void init_entity_runnable_average(struct sched_entity *se)
708} 757}
709 758
710static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); 759static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
711static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); 760static void attach_entity_cfs_rq(struct sched_entity *se);
712static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
713static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
714 761
715/* 762/*
716 * With new tasks being created, their initial util_avgs are extrapolated 763 * With new tasks being created, their initial util_avgs are extrapolated
@@ -742,7 +789,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
742 struct cfs_rq *cfs_rq = cfs_rq_of(se); 789 struct cfs_rq *cfs_rq = cfs_rq_of(se);
743 struct sched_avg *sa = &se->avg; 790 struct sched_avg *sa = &se->avg;
744 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 791 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
745 u64 now = cfs_rq_clock_task(cfs_rq);
746 792
747 if (cap > 0) { 793 if (cap > 0) {
748 if (cfs_rq->avg.util_avg != 0) { 794 if (cfs_rq->avg.util_avg != 0) {
@@ -770,14 +816,12 @@ void post_init_entity_util_avg(struct sched_entity *se)
770 * such that the next switched_to_fair() has the 816 * such that the next switched_to_fair() has the
771 * expected state. 817 * expected state.
772 */ 818 */
773 se->avg.last_update_time = now; 819 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
774 return; 820 return;
775 } 821 }
776 } 822 }
777 823
778 update_cfs_rq_load_avg(now, cfs_rq, false); 824 attach_entity_cfs_rq(se);
779 attach_entity_load_avg(cfs_rq, se);
780 update_tg_load_avg(cfs_rq, false);
781} 825}
782 826
783#else /* !CONFIG_SMP */ 827#else /* !CONFIG_SMP */
@@ -2890,6 +2934,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2890 return decayed; 2934 return decayed;
2891} 2935}
2892 2936
2937/*
2938 * Signed add and clamp on underflow.
2939 *
2940 * Explicitly do a load-store to ensure the intermediate value never hits
2941 * memory. This allows lockless observations without ever seeing the negative
2942 * values.
2943 */
2944#define add_positive(_ptr, _val) do { \
2945 typeof(_ptr) ptr = (_ptr); \
2946 typeof(_val) val = (_val); \
2947 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2948 \
2949 res = var + val; \
2950 \
2951 if (val < 0 && res > var) \
2952 res = 0; \
2953 \
2954 WRITE_ONCE(*ptr, res); \
2955} while (0)
2956
2893#ifdef CONFIG_FAIR_GROUP_SCHED 2957#ifdef CONFIG_FAIR_GROUP_SCHED
2894/** 2958/**
2895 * update_tg_load_avg - update the tg's load avg 2959 * update_tg_load_avg - update the tg's load avg
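
add_positive() above performs an explicit load-store so a concurrent lockless reader never observes a negative intermediate value, and clamps the result to zero when the signed addition would underflow the unsigned field. The user-space sketch below reproduces just that clamping; READ_ONCE()/WRITE_ONCE() are reduced to plain accesses since the demo has no concurrency, and it relies on the typeof extension, so build it with GCC or Clang.

#include <stdio.h>

/* Same shape as the kernel macro, minus the volatile accessors. */
#define add_positive(_ptr, _val) do {				\
	typeof(_ptr) ptr = (_ptr);				\
	typeof(_val) val = (_val);				\
	typeof(*ptr) res, var = *ptr;				\
								\
	res = var + val;					\
								\
	/* a negative delta larger than var wrapped around */	\
	if (val < 0 && res > var)				\
		res = 0;					\
								\
	*ptr = res;						\
} while (0)

int main(void)
{
	unsigned long avg = 100;

	add_positive(&avg, (long)-30);	/* 100 - 30 = 70 */
	add_positive(&avg, (long)-90);	/* would underflow: clamped to 0 */
	printf("%lu\n", avg);		/* 0 */
	return 0;
}
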
@@ -2969,8 +3033,138 @@ void set_task_rq_fair(struct sched_entity *se,
2969 se->avg.last_update_time = n_last_update_time; 3033 se->avg.last_update_time = n_last_update_time;
2970 } 3034 }
2971} 3035}
3036
3037/* Take into account change of utilization of a child task group */
3038static inline void
3039update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
3040{
3041 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3042 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
3043
3044 /* Nothing to update */
3045 if (!delta)
3046 return;
3047
3048 /* Set new sched_entity's utilization */
3049 se->avg.util_avg = gcfs_rq->avg.util_avg;
3050 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
3051
3052 /* Update parent cfs_rq utilization */
3053 add_positive(&cfs_rq->avg.util_avg, delta);
3054 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
3055}
3056
3057/* Take into account change of load of a child task group */
3058static inline void
3059update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
3060{
3061 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3062 long delta, load = gcfs_rq->avg.load_avg;
3063
3064 /*
3065 * If the load of group cfs_rq is zero, the load of the
3066 * sched_entity will also be zero so we can skip the formula
3067 */
3068 if (load) {
3069 long tg_load;
3070
3071 /* Get tg's load and ensure tg_load > 0 */
3072 tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
3073
3074 /* Ensure tg_load >= load and is updated with the current load */
3075 tg_load -= gcfs_rq->tg_load_avg_contrib;
3076 tg_load += load;
3077
3078 /*
3079 * We need to compute a correction term in the case that the
3080 * task group is consuming more CPU than a task of equal
3081 * weight. A task with a weight equal to tg->shares will have
3082 * a load less than or equal to scale_load_down(tg->shares).
3083 * Similarly, the sched_entities that represent the task group
3084 * at parent level can't have a load higher than
3085 * scale_load_down(tg->shares). And the sum of sched_entities'
3086 * load must be <= scale_load_down(tg->shares).
3087 */
3088 if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
3089 /* Scale gcfs_rq's load into tg's shares */
3090 load *= scale_load_down(gcfs_rq->tg->shares);
3091 load /= tg_load;
3092 }
3093 }
3094
3095 delta = load - se->avg.load_avg;
3096
3097 /* Nothing to update */
3098 if (!delta)
3099 return;
3100
3101 /* Set new sched_entity's load */
3102 se->avg.load_avg = load;
3103 se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
3104
3105 /* Update parent cfs_rq load */
3106 add_positive(&cfs_rq->avg.load_avg, delta);
3107 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
3108
3109 /*
3110 * If the sched_entity is already enqueued, we also have to update the
3111 * runnable load avg.
3112 */
3113 if (se->on_rq) {
3114 /* Update parent cfs_rq runnable_load_avg */
3115 add_positive(&cfs_rq->runnable_load_avg, delta);
3116 cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
3117 }
3118}
3119
3120static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
3121{
3122 cfs_rq->propagate_avg = 1;
3123}
3124
3125static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
3126{
3127 struct cfs_rq *cfs_rq = group_cfs_rq(se);
3128
3129 if (!cfs_rq->propagate_avg)
3130 return 0;
3131
3132 cfs_rq->propagate_avg = 0;
3133 return 1;
3134}
3135
3136/* Update task and its cfs_rq load average */
3137static inline int propagate_entity_load_avg(struct sched_entity *se)
3138{
3139 struct cfs_rq *cfs_rq;
3140
3141 if (entity_is_task(se))
3142 return 0;
3143
3144 if (!test_and_clear_tg_cfs_propagate(se))
3145 return 0;
3146
3147 cfs_rq = cfs_rq_of(se);
3148
3149 set_tg_cfs_propagate(cfs_rq);
3150
3151 update_tg_cfs_util(cfs_rq, se);
3152 update_tg_cfs_load(cfs_rq, se);
3153
3154 return 1;
3155}
3156
2972#else /* CONFIG_FAIR_GROUP_SCHED */ 3157#else /* CONFIG_FAIR_GROUP_SCHED */
3158
2973static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} 3159static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3160
3161static inline int propagate_entity_load_avg(struct sched_entity *se)
3162{
3163 return 0;
3164}
3165
3166static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
3167
2974#endif /* CONFIG_FAIR_GROUP_SCHED */ 3168#endif /* CONFIG_FAIR_GROUP_SCHED */
2975 3169
2976static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 3170static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
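
update_tg_cfs_load() above caps what it propagates: when the combined group load exceeds the task group's shares, the child cfs_rq load is scaled by shares / tg_load before being copied into the parent-level sched_entity, since that entity can never weigh more than the group's shares. A plain-integer sketch of that correction term; propagated_load() is an illustrative helper and scale_load_down() is folded into the shares argument.

#include <stdio.h>

static long propagated_load(long gcfs_rq_load,	/* gcfs_rq->avg.load_avg */
			    long tg_load_avg,	/* tg->load_avg */
			    long tg_contrib,	/* gcfs_rq->tg_load_avg_contrib */
			    long shares)	/* scale_load_down(tg->shares) */
{
	long load = gcfs_rq_load;
	long tg_load;

	if (!load)
		return 0;

	tg_load = tg_load_avg + 1;	/* ensure tg_load > 0 */
	tg_load -= tg_contrib;		/* replace the stale contribution ... */
	tg_load += load;		/* ... with the current load */

	if (tg_load > shares) {		/* group busier than its shares allow */
		load *= shares;
		load /= tg_load;
	}
	return load;
}

int main(void)
{
	/*
	 * Two CPUs each contribute ~1024 of load to a group with 1024 shares:
	 * each parent-level entity ends up with roughly half the shares.
	 */
	printf("%ld\n", propagated_load(1024, 2048, 1024, 1024));	/* 511 */
	/* Group load fits under the shares: propagated unchanged. */
	printf("%ld\n", propagated_load(300, 600, 300, 1024));		/* 300 */
	return 0;
}
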
@@ -3041,6 +3235,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3041 sub_positive(&sa->load_avg, r); 3235 sub_positive(&sa->load_avg, r);
3042 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); 3236 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
3043 removed_load = 1; 3237 removed_load = 1;
3238 set_tg_cfs_propagate(cfs_rq);
3044 } 3239 }
3045 3240
3046 if (atomic_long_read(&cfs_rq->removed_util_avg)) { 3241 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -3048,6 +3243,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3048 sub_positive(&sa->util_avg, r); 3243 sub_positive(&sa->util_avg, r);
3049 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); 3244 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
3050 removed_util = 1; 3245 removed_util = 1;
3246 set_tg_cfs_propagate(cfs_rq);
3051 } 3247 }
3052 3248
3053 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, 3249 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -3064,23 +3260,35 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3064 return decayed || removed_load; 3260 return decayed || removed_load;
3065} 3261}
3066 3262
3263/*
3264 * Optional action to be done while updating the load average
3265 */
3266#define UPDATE_TG 0x1
3267#define SKIP_AGE_LOAD 0x2
3268
3067/* Update task and its cfs_rq load average */ 3269/* Update task and its cfs_rq load average */
3068static inline void update_load_avg(struct sched_entity *se, int update_tg) 3270static inline void update_load_avg(struct sched_entity *se, int flags)
3069{ 3271{
3070 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3272 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3071 u64 now = cfs_rq_clock_task(cfs_rq); 3273 u64 now = cfs_rq_clock_task(cfs_rq);
3072 struct rq *rq = rq_of(cfs_rq); 3274 struct rq *rq = rq_of(cfs_rq);
3073 int cpu = cpu_of(rq); 3275 int cpu = cpu_of(rq);
3276 int decayed;
3074 3277
3075 /* 3278 /*
3076 * Track task load average for carrying it to new CPU after migrated, and 3279 * Track task load average for carrying it to new CPU after migrated, and
3077 * track group sched_entity load average for task_h_load calc in migration 3280 * track group sched_entity load average for task_h_load calc in migration
3078 */ 3281 */
3079 __update_load_avg(now, cpu, &se->avg, 3282 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
3283 __update_load_avg(now, cpu, &se->avg,
3080 se->on_rq * scale_load_down(se->load.weight), 3284 se->on_rq * scale_load_down(se->load.weight),
3081 cfs_rq->curr == se, NULL); 3285 cfs_rq->curr == se, NULL);
3286 }
3082 3287
3083 if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) 3288 decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
3289 decayed |= propagate_entity_load_avg(se);
3290
3291 if (decayed && (flags & UPDATE_TG))
3084 update_tg_load_avg(cfs_rq, 0); 3292 update_tg_load_avg(cfs_rq, 0);
3085} 3293}
3086 3294
@@ -3094,31 +3302,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
3094 */ 3302 */
3095static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3303static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3096{ 3304{
3097 if (!sched_feat(ATTACH_AGE_LOAD))
3098 goto skip_aging;
3099
3100 /*
3101 * If we got migrated (either between CPUs or between cgroups) we'll
3102 * have aged the average right before clearing @last_update_time.
3103 *
3104 * Or we're fresh through post_init_entity_util_avg().
3105 */
3106 if (se->avg.last_update_time) {
3107 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
3108 &se->avg, 0, 0, NULL);
3109
3110 /*
3111 * XXX: we could have just aged the entire load away if we've been
3112 * absent from the fair class for too long.
3113 */
3114 }
3115
3116skip_aging:
3117 se->avg.last_update_time = cfs_rq->avg.last_update_time; 3305 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3118 cfs_rq->avg.load_avg += se->avg.load_avg; 3306 cfs_rq->avg.load_avg += se->avg.load_avg;
3119 cfs_rq->avg.load_sum += se->avg.load_sum; 3307 cfs_rq->avg.load_sum += se->avg.load_sum;
3120 cfs_rq->avg.util_avg += se->avg.util_avg; 3308 cfs_rq->avg.util_avg += se->avg.util_avg;
3121 cfs_rq->avg.util_sum += se->avg.util_sum; 3309 cfs_rq->avg.util_sum += se->avg.util_sum;
3310 set_tg_cfs_propagate(cfs_rq);
3122 3311
3123 cfs_rq_util_change(cfs_rq); 3312 cfs_rq_util_change(cfs_rq);
3124} 3313}
@@ -3133,14 +3322,12 @@ skip_aging:
3133 */ 3322 */
3134static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3323static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3135{ 3324{
3136 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
3137 &se->avg, se->on_rq * scale_load_down(se->load.weight),
3138 cfs_rq->curr == se, NULL);
3139 3325
3140 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); 3326 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3141 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); 3327 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
3142 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); 3328 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3143 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); 3329 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3330 set_tg_cfs_propagate(cfs_rq);
3144 3331
3145 cfs_rq_util_change(cfs_rq); 3332 cfs_rq_util_change(cfs_rq);
3146} 3333}
@@ -3150,34 +3337,20 @@ static inline void
3150enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3337enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3151{ 3338{
3152 struct sched_avg *sa = &se->avg; 3339 struct sched_avg *sa = &se->avg;
3153 u64 now = cfs_rq_clock_task(cfs_rq);
3154 int migrated, decayed;
3155
3156 migrated = !sa->last_update_time;
3157 if (!migrated) {
3158 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
3159 se->on_rq * scale_load_down(se->load.weight),
3160 cfs_rq->curr == se, NULL);
3161 }
3162
3163 decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
3164 3340
3165 cfs_rq->runnable_load_avg += sa->load_avg; 3341 cfs_rq->runnable_load_avg += sa->load_avg;
3166 cfs_rq->runnable_load_sum += sa->load_sum; 3342 cfs_rq->runnable_load_sum += sa->load_sum;
3167 3343
3168 if (migrated) 3344 if (!sa->last_update_time) {
3169 attach_entity_load_avg(cfs_rq, se); 3345 attach_entity_load_avg(cfs_rq, se);
3170
3171 if (decayed || migrated)
3172 update_tg_load_avg(cfs_rq, 0); 3346 update_tg_load_avg(cfs_rq, 0);
3347 }
3173} 3348}
3174 3349
3175/* Remove the runnable load generated by se from cfs_rq's runnable load average */ 3350/* Remove the runnable load generated by se from cfs_rq's runnable load average */
3176static inline void 3351static inline void
3177dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 3352dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3178{ 3353{
3179 update_load_avg(se, 1);
3180
3181 cfs_rq->runnable_load_avg = 3354 cfs_rq->runnable_load_avg =
3182 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); 3355 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3183 cfs_rq->runnable_load_sum = 3356 cfs_rq->runnable_load_sum =
@@ -3206,13 +3379,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3206#endif 3379#endif
3207 3380
3208/* 3381/*
3382 * Synchronize entity load avg of dequeued entity without locking
3383 * the previous rq.
3384 */
3385void sync_entity_load_avg(struct sched_entity *se)
3386{
3387 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3388 u64 last_update_time;
3389
3390 last_update_time = cfs_rq_last_update_time(cfs_rq);
3391 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
3392}
3393
3394/*
3209 * Task first catches up with cfs_rq, and then subtract 3395 * Task first catches up with cfs_rq, and then subtract
3210 * itself from the cfs_rq (task must be off the queue now). 3396 * itself from the cfs_rq (task must be off the queue now).
3211 */ 3397 */
3212void remove_entity_load_avg(struct sched_entity *se) 3398void remove_entity_load_avg(struct sched_entity *se)
3213{ 3399{
3214 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3400 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3215 u64 last_update_time;
3216 3401
3217 /* 3402 /*
3218 * tasks cannot exit without having gone through wake_up_new_task() -> 3403 * tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3224,9 +3409,7 @@ void remove_entity_load_avg(struct sched_entity *se)
3224 * calls this. 3409 * calls this.
3225 */ 3410 */
3226 3411
3227 last_update_time = cfs_rq_last_update_time(cfs_rq); 3412 sync_entity_load_avg(se);
3228
3229 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
3230 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); 3413 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
3231 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); 3414 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
3232} 3415}
@@ -3251,7 +3434,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3251 return 0; 3434 return 0;
3252} 3435}
3253 3436
3254static inline void update_load_avg(struct sched_entity *se, int not_used) 3437#define UPDATE_TG 0x0
3438#define SKIP_AGE_LOAD 0x0
3439
3440static inline void update_load_avg(struct sched_entity *se, int not_used1)
3255{ 3441{
3256 cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); 3442 cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
3257} 3443}
@@ -3396,6 +3582,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3396 if (renorm && !curr) 3582 if (renorm && !curr)
3397 se->vruntime += cfs_rq->min_vruntime; 3583 se->vruntime += cfs_rq->min_vruntime;
3398 3584
3585 update_load_avg(se, UPDATE_TG);
3399 enqueue_entity_load_avg(cfs_rq, se); 3586 enqueue_entity_load_avg(cfs_rq, se);
3400 account_entity_enqueue(cfs_rq, se); 3587 account_entity_enqueue(cfs_rq, se);
3401 update_cfs_shares(cfs_rq); 3588 update_cfs_shares(cfs_rq);
@@ -3470,6 +3657,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3470 * Update run-time statistics of the 'current'. 3657 * Update run-time statistics of the 'current'.
3471 */ 3658 */
3472 update_curr(cfs_rq); 3659 update_curr(cfs_rq);
3660 update_load_avg(se, UPDATE_TG);
3473 dequeue_entity_load_avg(cfs_rq, se); 3661 dequeue_entity_load_avg(cfs_rq, se);
3474 3662
3475 update_stats_dequeue(cfs_rq, se, flags); 3663 update_stats_dequeue(cfs_rq, se, flags);
@@ -3557,7 +3745,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3557 */ 3745 */
3558 update_stats_wait_end(cfs_rq, se); 3746 update_stats_wait_end(cfs_rq, se);
3559 __dequeue_entity(cfs_rq, se); 3747 __dequeue_entity(cfs_rq, se);
3560 update_load_avg(se, 1); 3748 update_load_avg(se, UPDATE_TG);
3561 } 3749 }
3562 3750
3563 update_stats_curr_start(cfs_rq, se); 3751 update_stats_curr_start(cfs_rq, se);
@@ -3675,7 +3863,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3675 /* 3863 /*
3676 * Ensure that runnable average is periodically updated. 3864 * Ensure that runnable average is periodically updated.
3677 */ 3865 */
3678 update_load_avg(curr, 1); 3866 update_load_avg(curr, UPDATE_TG);
3679 update_cfs_shares(cfs_rq); 3867 update_cfs_shares(cfs_rq);
3680 3868
3681#ifdef CONFIG_SCHED_HRTICK 3869#ifdef CONFIG_SCHED_HRTICK
@@ -4572,7 +4760,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4572 if (cfs_rq_throttled(cfs_rq)) 4760 if (cfs_rq_throttled(cfs_rq))
4573 break; 4761 break;
4574 4762
4575 update_load_avg(se, 1); 4763 update_load_avg(se, UPDATE_TG);
4576 update_cfs_shares(cfs_rq); 4764 update_cfs_shares(cfs_rq);
4577 } 4765 }
4578 4766
@@ -4631,7 +4819,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4631 if (cfs_rq_throttled(cfs_rq)) 4819 if (cfs_rq_throttled(cfs_rq))
4632 break; 4820 break;
4633 4821
4634 update_load_avg(se, 1); 4822 update_load_avg(se, UPDATE_TG);
4635 update_cfs_shares(cfs_rq); 4823 update_cfs_shares(cfs_rq);
4636 } 4824 }
4637 4825
@@ -5199,6 +5387,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5199 return 1; 5387 return 1;
5200} 5388}
5201 5389
5390static inline int task_util(struct task_struct *p);
5391static int cpu_util_wake(int cpu, struct task_struct *p);
5392
5393static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
5394{
5395 return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
5396}
5397
5202/* 5398/*
5203 * find_idlest_group finds and returns the least busy CPU group within the 5399 * find_idlest_group finds and returns the least busy CPU group within the
5204 * domain. 5400 * domain.
@@ -5208,15 +5404,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5208 int this_cpu, int sd_flag) 5404 int this_cpu, int sd_flag)
5209{ 5405{
5210 struct sched_group *idlest = NULL, *group = sd->groups; 5406 struct sched_group *idlest = NULL, *group = sd->groups;
5211 unsigned long min_load = ULONG_MAX, this_load = 0; 5407 struct sched_group *most_spare_sg = NULL;
5408 unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
5409 unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
5410 unsigned long most_spare = 0, this_spare = 0;
5212 int load_idx = sd->forkexec_idx; 5411 int load_idx = sd->forkexec_idx;
5213 int imbalance = 100 + (sd->imbalance_pct-100)/2; 5412 int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
5413 unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
5414 (sd->imbalance_pct-100) / 100;
5214 5415
5215 if (sd_flag & SD_BALANCE_WAKE) 5416 if (sd_flag & SD_BALANCE_WAKE)
5216 load_idx = sd->wake_idx; 5417 load_idx = sd->wake_idx;
5217 5418
5218 do { 5419 do {
5219 unsigned long load, avg_load; 5420 unsigned long load, avg_load, runnable_load;
5421 unsigned long spare_cap, max_spare_cap;
5220 int local_group; 5422 int local_group;
5221 int i; 5423 int i;
5222 5424
@@ -5228,8 +5430,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5228 local_group = cpumask_test_cpu(this_cpu, 5430 local_group = cpumask_test_cpu(this_cpu,
5229 sched_group_cpus(group)); 5431 sched_group_cpus(group));
5230 5432
5231 /* Tally up the load of all CPUs in the group */ 5433 /*
5434 * Tally up the load of all CPUs in the group and find
5435 * the group containing the CPU with most spare capacity.
5436 */
5232 avg_load = 0; 5437 avg_load = 0;
5438 runnable_load = 0;
5439 max_spare_cap = 0;
5233 5440
5234 for_each_cpu(i, sched_group_cpus(group)) { 5441 for_each_cpu(i, sched_group_cpus(group)) {
5235 /* Bias balancing toward cpus of our domain */ 5442 /* Bias balancing toward cpus of our domain */
@@ -5238,22 +5445,84 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5238 else 5445 else
5239 load = target_load(i, load_idx); 5446 load = target_load(i, load_idx);
5240 5447
5241 avg_load += load; 5448 runnable_load += load;
5449
5450 avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
5451
5452 spare_cap = capacity_spare_wake(i, p);
5453
5454 if (spare_cap > max_spare_cap)
5455 max_spare_cap = spare_cap;
5242 } 5456 }
5243 5457
5244 /* Adjust by relative CPU capacity of the group */ 5458 /* Adjust by relative CPU capacity of the group */
5245 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; 5459 avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
5460 group->sgc->capacity;
5461 runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
5462 group->sgc->capacity;
5246 5463
5247 if (local_group) { 5464 if (local_group) {
5248 this_load = avg_load; 5465 this_runnable_load = runnable_load;
5249 } else if (avg_load < min_load) { 5466 this_avg_load = avg_load;
5250 min_load = avg_load; 5467 this_spare = max_spare_cap;
5251 idlest = group; 5468 } else {
5469 if (min_runnable_load > (runnable_load + imbalance)) {
5470 /*
5471 * The runnable load is significantly smaller
5472 * so we can pick this new cpu
5473 */
5474 min_runnable_load = runnable_load;
5475 min_avg_load = avg_load;
5476 idlest = group;
5477 } else if ((runnable_load < (min_runnable_load + imbalance)) &&
5478 (100*min_avg_load > imbalance_scale*avg_load)) {
5479 /*
5480 * The runnable loads are close so take the
5481 * blocked load into account through avg_load.
5482 */
5483 min_avg_load = avg_load;
5484 idlest = group;
5485 }
5486
5487 if (most_spare < max_spare_cap) {
5488 most_spare = max_spare_cap;
5489 most_spare_sg = group;
5490 }
5252 } 5491 }
5253 } while (group = group->next, group != sd->groups); 5492 } while (group = group->next, group != sd->groups);
5254 5493
5255 if (!idlest || 100*this_load < imbalance*min_load) 5494 /*
5495 * The cross-over point between using spare capacity or least load
5496 * is too conservative for high utilization tasks on partially
5497 * utilized systems if we require spare_capacity > task_util(p),
5498 * so we allow for some task stuffing by using
5499 * spare_capacity > task_util(p)/2.
5500 *
5501 * Spare capacity can't be used for fork because the utilization has
5502 * not been set yet, we must first select a rq to compute the initial
5503 * utilization.
5504 */
5505 if (sd_flag & SD_BALANCE_FORK)
5506 goto skip_spare;
5507
5508 if (this_spare > task_util(p) / 2 &&
5509 imbalance_scale*this_spare > 100*most_spare)
5510 return NULL;
5511
5512 if (most_spare > task_util(p) / 2)
5513 return most_spare_sg;
5514
5515skip_spare:
5516 if (!idlest)
5517 return NULL;
5518
5519 if (min_runnable_load > (this_runnable_load + imbalance))
5256 return NULL; 5520 return NULL;
5521
5522 if ((this_runnable_load < (min_runnable_load + imbalance)) &&
5523 (100*this_avg_load < imbalance_scale*min_avg_load))
5524 return NULL;
5525
5257 return idlest; 5526 return idlest;
5258} 5527}
5259 5528
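
find_idlest_group() now tracks both a runnable load and a blocked-inclusive avg_load per group and compares them against two thresholds derived from imbalance_pct: an absolute margin (imbalance) for the runnable load and a relative one (imbalance_scale) for avg_load, with a separate spare-capacity path when a group has more than task_util(p)/2 of spare capacity. The sketch below only works out the two thresholds, assuming imbalance_pct = 125 and scale_load_down(NICE_0_LOAD) = 1024.

#include <stdio.h>

#define NICE_0_LOAD 1024	/* assumed value of scale_load_down(NICE_0_LOAD) */

int main(void)
{
	unsigned int imbalance_pct = 125;	/* a typical sd->imbalance_pct */

	/* Relative threshold used when comparing avg_load between groups. */
	unsigned long imbalance_scale = 100 + (imbalance_pct - 100) / 2;

	/* Absolute threshold used when comparing runnable_load. */
	unsigned long imbalance = NICE_0_LOAD * (imbalance_pct - 100) / 100;

	printf("imbalance_scale=%lu imbalance=%lu\n",
	       imbalance_scale, imbalance);	/* 112 and 256 */

	/*
	 * A remote group only wins on runnable load when it is more than
	 * `imbalance` (256) below the current best; otherwise its avg_load
	 * must be better by more than imbalance_scale/100 (12%) to switch.
	 */
	unsigned long min_runnable_load = 2000, runnable_load = 1800;
	printf("significantly lighter: %d\n",
	       min_runnable_load > runnable_load + imbalance);	/* 0: within the margin */
	return 0;
}
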
@@ -5590,6 +5859,24 @@ static inline int task_util(struct task_struct *p)
5590} 5859}
5591 5860
5592/* 5861/*
5862 * cpu_util_wake: Compute cpu utilization with any contributions from
5863 * the waking task p removed.
5864 */
5865static int cpu_util_wake(int cpu, struct task_struct *p)
5866{
5867 unsigned long util, capacity;
5868
5869 /* Task has no contribution or is new */
5870 if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
5871 return cpu_util(cpu);
5872
5873 capacity = capacity_orig_of(cpu);
5874 util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
5875
5876 return (util >= capacity) ? capacity : util;
5877}
5878
5879/*
5593 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the 5880 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
5594 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. 5881 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
5595 * 5882 *
@@ -5607,6 +5894,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
5607 if (max_cap - min_cap < max_cap >> 3) 5894 if (max_cap - min_cap < max_cap >> 3)
5608 return 0; 5895 return 0;
5609 5896
5897 /* Bring task utilization in sync with prev_cpu */
5898 sync_entity_load_avg(&p->se);
5899
5610 return min_cap * 1024 < task_util(p) * capacity_margin; 5900 return min_cap * 1024 < task_util(p) * capacity_margin;
5611} 5901}
5612 5902
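
cpu_util_wake() above estimates what a CPU's utilization would be once the waking task's own contribution is removed, clamping the result between 0 and the CPU's original capacity. A stand-alone sketch of that arithmetic; the real function also special-cases tasks that are new or not on this CPU, which is omitted here.

#include <stdio.h>

/* Utilization of a CPU with the waking task's contribution subtracted. */
static unsigned long cpu_util_wake_model(unsigned long cfs_util_avg,
					 unsigned long task_util,
					 unsigned long capacity_orig)
{
	long util = (long)cfs_util_avg - (long)task_util;

	if (util < 0)
		util = 0;

	return ((unsigned long)util >= capacity_orig) ? capacity_orig
						      : (unsigned long)util;
}

int main(void)
{
	/* A task worth 200 of the 600 tracked on a 1024-capacity CPU. */
	printf("%lu\n", cpu_util_wake_model(600, 200, 1024));	/* 400 */
	/* Stale averages can undershoot the task's share: clamp at 0. */
	printf("%lu\n", cpu_util_wake_model(150, 200, 1024));	/* 0 */
	return 0;
}
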
@@ -6641,6 +6931,10 @@ static void update_blocked_averages(int cpu)
6641 6931
6642 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) 6932 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
6643 update_tg_load_avg(cfs_rq, 0); 6933 update_tg_load_avg(cfs_rq, 0);
6934
6935 /* Propagate pending load changes to the parent */
6936 if (cfs_rq->tg->se[cpu])
6937 update_load_avg(cfs_rq->tg->se[cpu], 0);
6644 } 6938 }
6645 raw_spin_unlock_irqrestore(&rq->lock, flags); 6939 raw_spin_unlock_irqrestore(&rq->lock, flags);
6646} 6940}
@@ -6845,13 +7139,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
6845 7139
6846 cpu_rq(cpu)->cpu_capacity = capacity; 7140 cpu_rq(cpu)->cpu_capacity = capacity;
6847 sdg->sgc->capacity = capacity; 7141 sdg->sgc->capacity = capacity;
7142 sdg->sgc->min_capacity = capacity;
6848} 7143}
6849 7144
6850void update_group_capacity(struct sched_domain *sd, int cpu) 7145void update_group_capacity(struct sched_domain *sd, int cpu)
6851{ 7146{
6852 struct sched_domain *child = sd->child; 7147 struct sched_domain *child = sd->child;
6853 struct sched_group *group, *sdg = sd->groups; 7148 struct sched_group *group, *sdg = sd->groups;
6854 unsigned long capacity; 7149 unsigned long capacity, min_capacity;
6855 unsigned long interval; 7150 unsigned long interval;
6856 7151
6857 interval = msecs_to_jiffies(sd->balance_interval); 7152 interval = msecs_to_jiffies(sd->balance_interval);
@@ -6864,6 +7159,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6864 } 7159 }
6865 7160
6866 capacity = 0; 7161 capacity = 0;
7162 min_capacity = ULONG_MAX;
6867 7163
6868 if (child->flags & SD_OVERLAP) { 7164 if (child->flags & SD_OVERLAP) {
6869 /* 7165 /*
@@ -6888,11 +7184,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6888 */ 7184 */
6889 if (unlikely(!rq->sd)) { 7185 if (unlikely(!rq->sd)) {
6890 capacity += capacity_of(cpu); 7186 capacity += capacity_of(cpu);
6891 continue; 7187 } else {
7188 sgc = rq->sd->groups->sgc;
7189 capacity += sgc->capacity;
6892 } 7190 }
6893 7191
6894 sgc = rq->sd->groups->sgc; 7192 min_capacity = min(capacity, min_capacity);
6895 capacity += sgc->capacity;
6896 } 7193 }
6897 } else { 7194 } else {
6898 /* 7195 /*
@@ -6902,12 +7199,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6902 7199
6903 group = child->groups; 7200 group = child->groups;
6904 do { 7201 do {
6905 capacity += group->sgc->capacity; 7202 struct sched_group_capacity *sgc = group->sgc;
7203
7204 capacity += sgc->capacity;
7205 min_capacity = min(sgc->min_capacity, min_capacity);
6906 group = group->next; 7206 group = group->next;
6907 } while (group != child->groups); 7207 } while (group != child->groups);
6908 } 7208 }
6909 7209
6910 sdg->sgc->capacity = capacity; 7210 sdg->sgc->capacity = capacity;
7211 sdg->sgc->min_capacity = min_capacity;
6911} 7212}
6912 7213
6913/* 7214/*
@@ -6930,8 +7231,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
6930 * cpumask covering 1 cpu of the first group and 3 cpus of the second group. 7231 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
6931 * Something like: 7232 * Something like:
6932 * 7233 *
6933 * { 0 1 2 3 } { 4 5 6 7 } 7234 * { 0 1 2 3 } { 4 5 6 7 }
6934 * * * * * 7235 * * * * *
6935 * 7236 *
6936 * If we were to balance group-wise we'd place two tasks in the first group and 7237 * If we were to balance group-wise we'd place two tasks in the first group and
6937 * two tasks in the second group. Clearly this is undesired as it will overload 7238 * two tasks in the second group. Clearly this is undesired as it will overload
@@ -7002,6 +7303,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
7002 return false; 7303 return false;
7003} 7304}
7004 7305
7306/*
7307 * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
7308 * per-CPU capacity than sched_group ref.
7309 */
7310static inline bool
7311group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
7312{
7313 return sg->sgc->min_capacity * capacity_margin <
7314 ref->sgc->min_capacity * 1024;
7315}
7316
7005static inline enum 7317static inline enum
7006group_type group_classify(struct sched_group *group, 7318group_type group_classify(struct sched_group *group,
7007 struct sg_lb_stats *sgs) 7319 struct sg_lb_stats *sgs)
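
group_smaller_cpu_capacity() above reuses capacity_margin for a per-CPU comparison between groups: sg only counts as smaller when its weakest CPU has less than 1024/1280, i.e. about 80%, of the reference group's weakest CPU. A quick numeric check with made-up capacities loosely resembling a big.LITTLE split.

#include <stdbool.h>
#include <stdio.h>

static const unsigned long capacity_margin = 1280;

static bool group_smaller_cpu_capacity_model(unsigned long sg_min_capacity,
					     unsigned long ref_min_capacity)
{
	return sg_min_capacity * capacity_margin < ref_min_capacity * 1024;
}

int main(void)
{
	/* LITTLE cluster (min 446) vs big cluster (min 1024): clearly smaller. */
	printf("%d\n", group_smaller_cpu_capacity_model(446, 1024));	/* 1 */
	/* 90% of the reference capacity is inside the ~20% margin: not smaller. */
	printf("%d\n", group_smaller_cpu_capacity_model(922, 1024));	/* 0 */
	return 0;
}
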
@@ -7105,6 +7417,20 @@ static bool update_sd_pick_busiest(struct lb_env *env,
7105 if (sgs->avg_load <= busiest->avg_load) 7417 if (sgs->avg_load <= busiest->avg_load)
7106 return false; 7418 return false;
7107 7419
7420 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
7421 goto asym_packing;
7422
7423 /*
7424 * Candidate sg has no more than one task per CPU and
7425 * has higher per-CPU capacity. Migrating tasks to less
7426 * capable CPUs may harm throughput. Maximize throughput,
7427 * power/energy consequences are not considered.
7428 */
7429 if (sgs->sum_nr_running <= sgs->group_weight &&
7430 group_smaller_cpu_capacity(sds->local, sg))
7431 return false;
7432
7433asym_packing:
7108 /* This is the busiest node in its class. */ 7434 /* This is the busiest node in its class. */
7109 if (!(env->sd->flags & SD_ASYM_PACKING)) 7435 if (!(env->sd->flags & SD_ASYM_PACKING))
7110 return true; 7436 return true;
@@ -7113,16 +7439,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
7113 if (env->idle == CPU_NOT_IDLE) 7439 if (env->idle == CPU_NOT_IDLE)
7114 return true; 7440 return true;
7115 /* 7441 /*
7116 * ASYM_PACKING needs to move all the work to the lowest 7442 * ASYM_PACKING needs to move all the work to the highest
7117 * numbered CPUs in the group, therefore mark all groups 7443 * priority CPUs in the group, therefore mark all groups
7118 * higher than ourself as busy. 7444 * of lower priority than ourself as busy.
7119 */ 7445 */
7120 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { 7446 if (sgs->sum_nr_running &&
7447 sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
7121 if (!sds->busiest) 7448 if (!sds->busiest)
7122 return true; 7449 return true;
7123 7450
7124 /* Prefer to move from highest possible cpu's work */ 7451 /* Prefer to move from lowest priority cpu's work */
7125 if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) 7452 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7453 sg->asym_prefer_cpu))
7126 return true; 7454 return true;
7127 } 7455 }
7128 7456
@@ -7274,8 +7602,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
7274 if (!sds->busiest) 7602 if (!sds->busiest)
7275 return 0; 7603 return 0;
7276 7604
7277 busiest_cpu = group_first_cpu(sds->busiest); 7605 busiest_cpu = sds->busiest->asym_prefer_cpu;
7278 if (env->dst_cpu > busiest_cpu) 7606 if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
7279 return 0; 7607 return 0;
7280 7608
7281 env->imbalance = DIV_ROUND_CLOSEST( 7609 env->imbalance = DIV_ROUND_CLOSEST(
@@ -7613,10 +7941,11 @@ static int need_active_balance(struct lb_env *env)
7613 7941
7614 /* 7942 /*
7615 * ASYM_PACKING needs to force migrate tasks from busy but 7943 * ASYM_PACKING needs to force migrate tasks from busy but
7616 * higher numbered CPUs in order to pack all tasks in the 7944 * lower priority CPUs in order to pack all tasks in the
7617 * lowest numbered CPUs. 7945 * highest priority CPUs.
7618 */ 7946 */
7619 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) 7947 if ((sd->flags & SD_ASYM_PACKING) &&
7948 sched_asym_prefer(env->dst_cpu, env->src_cpu))
7620 return 1; 7949 return 1;
7621 } 7950 }
7622 7951
@@ -8465,7 +8794,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	unsigned long now = jiffies;
 	struct sched_domain_shared *sds;
 	struct sched_domain *sd;
-	int nr_busy, cpu = rq->cpu;
+	int nr_busy, i, cpu = rq->cpu;
 	bool kick = false;
 
 	if (unlikely(rq->idle_balance))
@@ -8516,12 +8845,18 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	}
 
 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
-	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
-				  sched_domain_span(sd)) < cpu)) {
-		kick = true;
-		goto unlock;
-	}
+	if (sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (i == cpu ||
+			    !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+				continue;
 
+			if (sched_asym_prefer(i, cpu)) {
+				kick = true;
+				goto unlock;
+			}
+		}
+	}
 unlock:
 	rcu_read_unlock();
 	return kick;
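With per-CPU priorities, the nohz kick can no longer rely on "any idle CPU with a lower number": the new loop kicks an idle balance only when some idle CPU in the asym domain actually outranks the busy one. A small sketch of that decision, using a hypothetical priority table and idle mask rather than the kernel's state:

/*
 * Illustrative sketch of the new nohz kick test: kick if any idle CPU in
 * the asym domain has higher priority than the CPU currently doing work.
 * cpu_priority[] and cpu_is_idle[] are hypothetical stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static const int cpu_priority[NR_CPUS] = { 1, 3, 2, 0 };	/* assumed ITMT-style priorities */
static const bool cpu_is_idle[NR_CPUS] = { false, true, false, true };

static bool sched_asym_prefer(int a, int b)
{
	return cpu_priority[a] > cpu_priority[b];
}

static bool nohz_kick_needed(int busy_cpu)
{
	for (int i = 0; i < NR_CPUS; i++) {
		if (i == busy_cpu || !cpu_is_idle[i])
			continue;
		if (sched_asym_prefer(i, busy_cpu))
			return true;	/* a higher-priority CPU sits idle: kick */
	}
	return false;
}

int main(void)
{
	printf("kick from CPU0? %d\n", nohz_kick_needed(0));	/* 1: CPU1 is idle and outranks CPU0 */
	printf("kick from CPU1? %d\n", nohz_kick_needed(1));	/* 0: no idle CPU outranks CPU1 */
	return 0;
}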
@@ -8687,32 +9022,45 @@ static inline bool vruntime_normalized(struct task_struct *p)
 	return false;
 }
 
-static void detach_task_cfs_rq(struct task_struct *p)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make it
+ * visible to the root
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
-	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 now = cfs_rq_clock_task(cfs_rq);
+	struct cfs_rq *cfs_rq;
 
-	if (!vruntime_normalized(p)) {
-		/*
-		 * Fix up our vruntime so that the current sleep doesn't
-		 * cause 'unlimited' sleep bonus.
-		 */
-		place_entity(cfs_rq, se, 0);
-		se->vruntime -= cfs_rq->min_vruntime;
+	/* Start to propagate at parent */
+	se = se->parent;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
+		update_load_avg(se, UPDATE_TG);
 	}
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
 	/* Catch up with the cfs_rq and remove our load when we leave */
-	update_cfs_rq_load_avg(now, cfs_rq, false);
+	update_load_avg(se, 0);
 	detach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
+	propagate_entity_cfs_rq(se);
 }
 
-static void attach_task_cfs_rq(struct task_struct *p)
+static void attach_entity_cfs_rq(struct sched_entity *se)
 {
-	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 now = cfs_rq_clock_task(cfs_rq);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
@@ -8722,10 +9070,36 @@ static void attach_task_cfs_rq(struct task_struct *p)
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif
 
-	/* Synchronize task with its cfs_rq */
-	update_cfs_rq_load_avg(now, cfs_rq, false);
+	/* Synchronize entity with its cfs_rq */
+	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
 	attach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
+	propagate_entity_cfs_rq(se);
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	if (!vruntime_normalized(p)) {
+		/*
+		 * Fix up our vruntime so that the current sleep doesn't
+		 * cause 'unlimited' sleep bonus.
+		 */
+		place_entity(cfs_rq, se, 0);
+		se->vruntime -= cfs_rq->min_vruntime;
+	}
+
+	detach_entity_cfs_rq(se);
+}
+
+static void attach_task_cfs_rq(struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	attach_entity_cfs_rq(se);
 
 	if (!vruntime_normalized(p))
 		se->vruntime += cfs_rq->min_vruntime;
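The refactoring above splits the task-level attach/detach paths into entity-level helpers and adds propagate_entity_cfs_rq(), which pushes a group entity's load change up to the root so group load stays consistent when tasks move between task groups. A minimal user-space sketch of that upward walk, with simplified stand-in types rather than the kernel structures:

/*
 * Sketch of propagating a load change up a task-group tree, mirroring the
 * shape of propagate_entity_cfs_rq(): start at the parent entity, walk to
 * the root, refresh each level, and stop early at a throttled group.
 */
#include <stdbool.h>
#include <stdio.h>

struct entity {
	const char *name;
	long load;
	bool throttled;
	struct entity *parent;
};

static void propagate(struct entity *se, long delta)
{
	for (se = se->parent; se; se = se->parent) {
		if (se->throttled)
			break;			/* don't propagate past a throttled group */
		se->load += delta;		/* stands in for update_load_avg() */
		printf("%s load now %ld\n", se->name, se->load);
	}
}

int main(void)
{
	struct entity root = { "root", 100, false, NULL };
	struct entity tg_a = { "tg_a", 40, false, &root };
	struct entity task = { "task", 10, false, &tg_a };

	propagate(&task, +10);	/* a task gained load: tg_a and root both see it */
	return 0;
}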
@@ -8779,6 +9153,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->propagate_avg = 0;
+#endif
 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
 	atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
@@ -8887,7 +9264,7 @@ void online_fair_sched_group(struct task_group *tg)
 		se = tg->se[i];
 
 		raw_spin_lock_irq(&rq->lock);
-		post_init_entity_util_avg(se);
+		attach_entity_cfs_rq(se);
 		sync_throttle(tg, i);
 		raw_spin_unlock_irq(&rq->lock);
 	}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 055f935d4421..7b34c7826ca5 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -404,6 +404,7 @@ struct cfs_rq {
 	unsigned long runnable_load_avg;
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	unsigned long tg_load_avg_contrib;
+	unsigned long propagate_avg;
 #endif
 	atomic_long_t removed_load_avg, removed_util_avg;
 #ifndef CONFIG_64BIT
@@ -539,6 +540,11 @@ struct dl_rq {
 
 #ifdef CONFIG_SMP
 
+static inline bool sched_asym_prefer(int a, int b)
+{
+	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
+}
+
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
@@ -623,6 +629,7 @@ struct rq {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
+	struct list_head *tmp_alone_branch;
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 	/*
@@ -892,7 +899,8 @@ struct sched_group_capacity {
 	 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
 	 * for a single CPU.
 	 */
-	unsigned int capacity;
+	unsigned long capacity;
+	unsigned long min_capacity; /* Min per-CPU capacity in group */
 	unsigned long next_update;
 	int imbalance; /* XXX unrelated to capacity but shared group state */
 
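The new min_capacity field feeds the SD_ASYM_CPUCAPACITY check in update_sd_pick_busiest() above: a group only counts as less capable if its least-capable CPU falls below the reference group's by a margin. The real helper lives elsewhere in fair.c and is not shown in this diff; a sketch of such a margin-based comparison, with the ~20% margin taken as an assumption:

/*
 * Sketch of a margin-based "smaller capacity group" test like the one used
 * with SD_ASYM_CPUCAPACITY. The 1280/1024 margin (~20%) is assumed for
 * illustration; struct group_capacity is a simplified stand-in.
 */
#include <stdbool.h>
#include <stdio.h>

#define CAPACITY_MARGIN	1280	/* assumed, scaled by 1024 */

struct group_capacity {
	unsigned long min_capacity;	/* min per-CPU capacity in the group */
};

static bool group_smaller_cpu_capacity(const struct group_capacity *sg,
				       const struct group_capacity *ref)
{
	return sg->min_capacity * CAPACITY_MARGIN < ref->min_capacity * 1024;
}

int main(void)
{
	struct group_capacity little = { .min_capacity = 430 };
	struct group_capacity big = { .min_capacity = 1024 };

	printf("little smaller than big? %d\n", group_smaller_cpu_capacity(&little, &big));	/* 1 */
	printf("big smaller than little? %d\n", group_smaller_cpu_capacity(&big, &little));	/* 0 */
	return 0;
}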
@@ -905,6 +913,7 @@ struct sched_group {
 
 	unsigned int group_weight;
 	struct sched_group_capacity *sgc;
+	int asym_prefer_cpu;		/* cpu of highest priority in group */
 
 	/*
 	 * The CPUs this group covers.
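Each sched_group now caches asym_prefer_cpu, the CPU with the highest arch priority in the group, so the load balancer can compare groups without rescanning their masks. The field is filled in by the topology/ITMT code outside this diff; a sketch of how such a value would be derived, using a hypothetical priority table:

/*
 * Sketch of deriving a group's asym_prefer_cpu: the member CPU with the
 * highest arch priority. cpu_priority[] is a hypothetical stand-in for
 * arch_asym_cpu_priority(); the real update is not part of this diff.
 */
#include <stdio.h>

static const int cpu_priority[] = { 10, 35, 35, 20 };	/* e.g. ITMT favoring CPUs 1 and 2 */

static int pick_asym_prefer_cpu(const int *cpus, int nr)
{
	int best = cpus[0];

	for (int i = 1; i < nr; i++)
		if (cpu_priority[cpus[i]] > cpu_priority[best])
			best = cpus[i];
	return best;
}

int main(void)
{
	int group[] = { 0, 1, 2, 3 };

	printf("asym_prefer_cpu = %d\n", pick_asym_prefer_cpu(group, 4));	/* 1 */
	return 0;
}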
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 706309f9ed84..739fb17371af 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -347,13 +347,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "sched_shares_window_ns",
-		.data		= &sysctl_sched_shares_window,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
 #ifdef CONFIG_SCHEDSTATS
 	{
 		.procname	= "sched_schedstats",
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 39008d78927a..e887ffc8eef3 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -133,9 +133,9 @@ static inline unsigned long long prof_ticks(struct task_struct *p)
 }
 static inline unsigned long long virt_ticks(struct task_struct *p)
 {
-	cputime_t utime;
+	cputime_t utime, stime;
 
-	task_cputime(p, &utime, NULL);
+	task_cputime(p, &utime, &stime);
 
 	return cputime_to_expires(utime);
 }
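virt_ticks() now fetches stime even though only utime is used, which matches the cputime simplification in this series: task_cputime() appears to fill both output arguments unconditionally instead of accepting NULL for the unwanted one. A sketch of that calling-convention change, with simplified stand-in types rather than the kernel's:

/*
 * Sketch of the virt_ticks() change: a task_cputime() that always writes
 * both outputs forces callers to pass a real stime variable even when they
 * only consume utime. The values and types below are illustrative only.
 */
#include <stdio.h>

typedef unsigned long long cputime_t;

/* Simplified model: dereferences both pointers unconditionally. */
static void task_cputime(cputime_t *utime, cputime_t *stime)
{
	*utime = 123;
	*stime = 456;
}

static cputime_t virt_ticks(void)
{
	cputime_t utime, stime;

	task_cputime(&utime, &stime);	/* passing NULL for stime would now crash */
	return utime;
}

int main(void)
{
	printf("virt_ticks() = %llu\n", virt_ticks());
	return 0;
}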