author    Linus Torvalds <torvalds@linux-foundation.org>    2016-12-12 15:15:10 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>    2016-12-12 15:15:10 -0500
commit    92c020d08d83673ecd15a9069d4457378668da31 (patch)
tree      3dbc5a9c1ab179f55be49e30e378cc4e650fc20e
parent    bca13ce4554ae9cf5083e5adf395ad2266cb571b (diff)
parent    6b94780e45c17b83e3e75f8aaca5a328db583c74 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The main scheduler changes in this cycle were:
- support Intel Turbo Boost Max Technology 3.0 (TBM3) by introducing a
notion of 'better cores', which the scheduler will prefer to
schedule single threaded workloads on. (Tim Chen, Srinivas
Pandruvada)
- enhance the handling of asymmetric capacity CPUs further (Morten
Rasmussen)
- improve/fix load handling when moving tasks between task groups
(Vincent Guittot)
- simplify and clean up the cputime code (Stanislaw Gruszka)
- improve mass fork()ed task spread a.k.a. hackbench speedup (Vincent
Guittot)
- make struct kthread kmalloc()ed and related fixes (Oleg Nesterov)
- add uaccess atomicity debugging (when using access_ok() in the
wrong context), under CONFIG_DEBUG_ATOMIC_SLEEP=y (Peter Zijlstra)
- implement various fixes, cleanups and other enhancements (Daniel
Bristot de Oliveira, Martin Schwidefsky, Rafael J. Wysocki)"
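
The TBM3/ITMT item above maps onto a small platform-facing API added by this merge (declared in arch/x86/include/asm/topology.h below): a driver first records a per-core priority, then announces ITMT support so the scheduler rebuilds its domains with SD_ASYM_PACKING. The sketch that follows is illustrative only and is not code from this merge; example_read_boost_perf() is a hypothetical helper standing in for whatever the platform reports (intel_pstate uses the CPPC highest_perf value), and per the kernel-doc in itmt.c the support call must not be made while holding the CPU hotplug lock.

/* Illustrative sketch only -- not part of this merge. */
#include <linux/cpumask.h>
#include <asm/topology.h>

static int example_enable_itmt(void)
{
	int cpu;

	/* 1. Record how "good" each core is; a higher value means a preferred core. */
	for_each_online_cpu(cpu)
		sched_set_itmt_core_prio(example_read_boost_perf(cpu), cpu); /* hypothetical helper */

	/* 2. Announce ITMT support; this registers the sysctl and rebuilds the sched domains. */
	return sched_set_itmt_support();
}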
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
sched/core: Use load_avg for selecting idlest group
sched/core: Fix find_idlest_group() for fork
kthread: Don't abuse kthread_create_on_cpu() in __kthread_create_worker()
kthread: Don't use to_live_kthread() in kthread_[un]park()
kthread: Don't use to_live_kthread() in kthread_stop()
Revert "kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function"
kthread: Make struct kthread kmalloc'ed
x86/uaccess, sched/preempt: Verify access_ok() context
sched/x86: Make CONFIG_SCHED_MC_PRIO=y easier to enable
sched/x86: Change CONFIG_SCHED_ITMT to CONFIG_SCHED_MC_PRIO
x86/sched: Use #include <linux/mutex.h> instead of #include <asm/mutex.h>
cpufreq/intel_pstate: Use CPPC to get max performance
acpi/bus: Set _OSC for diverse core support
acpi/bus: Enable HWP CPPC objects
x86/sched: Add SD_ASYM_PACKING flags to x86 ITMT CPU
x86/sysctl: Add sysctl for ITMT scheduling feature
x86: Enable Intel Turbo Boost Max Technology 3.0
x86/topology: Define x86's arch_update_cpu_topology
sched: Extend scheduler's asym packing
sched/fair: Clean up the tunable parameter definitions
...
36 files changed, 1152 insertions, 406 deletions
diff --git a/arch/Kconfig b/arch/Kconfig
index 44a44b49eb3a..835d55d52104 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -513,6 +513,9 @@ config HAVE_CONTEXT_TRACKING
 config HAVE_VIRT_CPU_ACCOUNTING
 	bool
 
+config ARCH_HAS_SCALED_CPUTIME
+	bool
+
 config HAVE_VIRT_CPU_ACCOUNTING_GEN
 	bool
 	default y if 64BIT
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index 6f892b94e906..021f44ab4bfb 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -68,7 +68,7 @@ void vtime_account_user(struct task_struct *tsk)
 
 	if (ti->ac_utime) {
 		delta_utime = cycle_to_cputime(ti->ac_utime);
-		account_user_time(tsk, delta_utime, delta_utime);
+		account_user_time(tsk, delta_utime);
 		ti->ac_utime = 0;
 	}
 }
@@ -112,7 +112,7 @@ void vtime_account_system(struct task_struct *tsk)
 {
 	cputime_t delta = vtime_delta(tsk);
 
-	account_system_time(tsk, 0, delta, delta);
+	account_system_time(tsk, 0, delta);
 }
 EXPORT_SYMBOL_GPL(vtime_account_system);
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 65fba4c34cd7..c7f120aaa98f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -160,6 +160,7 @@ config PPC
 	select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS
 	select GENERIC_CPU_AUTOPROBE
 	select HAVE_VIRT_CPU_ACCOUNTING
+	select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE
 	select HAVE_ARCH_HARDENED_USERCOPY
 	select HAVE_KERNEL_GZIP
 
diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h
index 4f60db074725..aa2e6a34b872 100644
--- a/arch/powerpc/include/asm/cputime.h
+++ b/arch/powerpc/include/asm/cputime.h
@@ -46,26 +46,12 @@ extern cputime_t cputime_one_jiffy;
  * Convert cputime <-> jiffies
  */
 extern u64 __cputime_jiffies_factor;
-DECLARE_PER_CPU(unsigned long, cputime_last_delta);
-DECLARE_PER_CPU(unsigned long, cputime_scaled_last_delta);
 
 static inline unsigned long cputime_to_jiffies(const cputime_t ct)
 {
 	return mulhdu((__force u64) ct, __cputime_jiffies_factor);
 }
 
-/* Estimate the scaled cputime by scaling the real cputime based on
- * the last scaled to real ratio */
-static inline cputime_t cputime_to_scaled(const cputime_t ct)
-{
-	if (cpu_has_feature(CPU_FTR_SPURR) &&
-	    __this_cpu_read(cputime_last_delta))
-		return (__force u64) ct *
-			__this_cpu_read(cputime_scaled_last_delta) /
-			__this_cpu_read(cputime_last_delta);
-	return ct;
-}
-
 static inline cputime_t jiffies_to_cputime(const unsigned long jif)
 {
 	u64 ct;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index bc3f7d0d7b79..be9751f1cb2a 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -164,8 +164,6 @@ u64 __cputime_sec_factor;
 EXPORT_SYMBOL(__cputime_sec_factor);
 u64 __cputime_clockt_factor;
 EXPORT_SYMBOL(__cputime_clockt_factor);
-DEFINE_PER_CPU(unsigned long, cputime_last_delta);
-DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta);
 
 cputime_t cputime_one_jiffy;
 
@@ -360,7 +358,8 @@ void vtime_account_system(struct task_struct *tsk)
 	unsigned long delta, sys_scaled, stolen;
 
 	delta = vtime_delta(tsk, &sys_scaled, &stolen);
-	account_system_time(tsk, 0, delta, sys_scaled);
+	account_system_time(tsk, 0, delta);
+	tsk->stimescaled += sys_scaled;
 	if (stolen)
 		account_steal_time(stolen);
 }
@@ -393,7 +392,8 @@ void vtime_account_user(struct task_struct *tsk)
 	acct->user_time = 0;
 	acct->user_time_scaled = 0;
 	acct->utime_sspurr = 0;
-	account_user_time(tsk, utime, utimescaled);
+	account_user_time(tsk, utime);
+	tsk->utimescaled += utimescaled;
 }
 
 #ifdef CONFIG_PPC32
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 426481d4cc86..028f97be5bae 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -171,6 +171,7 @@ config S390
 	select SYSCTL_EXCEPTION_TRACE
 	select TTY
 	select VIRT_CPU_ACCOUNTING
+	select ARCH_HAS_SCALED_CPUTIME
 	select VIRT_TO_BUS
 	select HAVE_NMI
 
diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c
index 856e30d8463f..1bd5dde2d5a9 100644
--- a/arch/s390/kernel/vtime.c
+++ b/arch/s390/kernel/vtime.c
@@ -137,8 +137,10 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset)
 		user_scaled = (user_scaled * mult) / div;
 		system_scaled = (system_scaled * mult) / div;
 	}
-	account_user_time(tsk, user, user_scaled);
-	account_system_time(tsk, hardirq_offset, system, system_scaled);
+	account_user_time(tsk, user);
+	tsk->utimescaled += user_scaled;
+	account_system_time(tsk, hardirq_offset, system);
+	tsk->stimescaled += system_scaled;
 
 	steal = S390_lowcore.steal_timer;
 	if ((s64) steal > 0) {
@@ -202,7 +204,8 @@ void vtime_account_irq_enter(struct task_struct *tsk)
 
 		system_scaled = (system_scaled * mult) / div;
 	}
-	account_system_time(tsk, 0, system, system_scaled);
+	account_system_time(tsk, 0, system);
+	tsk->stimescaled += system_scaled;
 
 	virt_timer_forward(system);
 }
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bada636d1065..b50e5eeefd21 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -939,6 +939,27 @@ config SCHED_MC
 	  making when dealing with multi-core CPU chips at a cost of slightly
 	  increased overhead in some places. If unsure say N here.
 
+config SCHED_MC_PRIO
+	bool "CPU core priorities scheduler support"
+	depends on SCHED_MC && CPU_SUP_INTEL
+	select X86_INTEL_PSTATE
+	select CPU_FREQ
+	default y
+	---help---
+	  Intel Turbo Boost Max Technology 3.0 enabled CPUs have a
+	  core ordering determined at manufacturing time, which allows
+	  certain cores to reach higher turbo frequencies (when running
+	  single threaded workloads) than others.
+
+	  Enabling this kernel feature teaches the scheduler about
+	  the TBM3 (aka ITMT) priority order of the CPU cores and adjusts the
+	  scheduler's CPU selection logic accordingly, so that higher
+	  overall system performance can be achieved.
+
+	  This feature will have no effect on CPUs without this feature.
+
+	  If unsure say Y here.
+
 source "kernel/Kconfig.preempt"
 
 config UP_LATE_INIT
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 17f218645701..ec1f3c651150 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -24,7 +24,13 @@ static __always_inline int preempt_count(void)
 
 static __always_inline void preempt_count_set(int pc)
 {
-	raw_cpu_write_4(__preempt_count, pc);
+	int old, new;
+
+	do {
+		old = raw_cpu_read_4(__preempt_count);
+		new = (old & PREEMPT_NEED_RESCHED) |
+			(pc & ~PREEMPT_NEED_RESCHED);
+	} while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old);
 }
 
 /*
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index cf75871d2f81..6358a85e2270 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -146,4 +146,36 @@ struct pci_bus;
 int x86_pci_root_bus_node(int bus);
 void x86_pci_root_bus_resources(int bus, struct list_head *resources);
 
+extern bool x86_topology_update;
+
+#ifdef CONFIG_SCHED_MC_PRIO
+#include <asm/percpu.h>
+
+DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+extern unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+/* Interface to set priority of a cpu */
+void sched_set_itmt_core_prio(int prio, int core_cpu);
+
+/* Interface to notify scheduler that system supports ITMT */
+int sched_set_itmt_support(void);
+
+/* Interface to notify scheduler that system revokes ITMT support */
+void sched_clear_itmt_support(void);
+
+#else /* CONFIG_SCHED_MC_PRIO */
+
+#define sysctl_sched_itmt_enabled	0
+static inline void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+}
+static inline int sched_set_itmt_support(void)
+{
+	return 0;
+}
+static inline void sched_clear_itmt_support(void)
+{
+}
+#endif /* CONFIG_SCHED_MC_PRIO */
+
 #endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index faf3687f1035..ea148313570f 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -68,6 +68,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 	__chk_range_not_ok((unsigned long __force)(addr), size, limit); \
 })
 
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+# define WARN_ON_IN_IRQ()	WARN_ON_ONCE(!in_task())
+#else
+# define WARN_ON_IN_IRQ()
+#endif
+
 /**
  * access_ok: - Checks if a user space pointer is valid
  * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
@@ -88,8 +94,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
  * checks that the pointer is in the user space range - after calling
  * this function, memory access functions may still return -EFAULT.
  */
-#define access_ok(type, addr, size) \
-	likely(!__range_not_ok(addr, size, user_addr_max()))
+#define access_ok(type, addr, size) \
+({ \
+	WARN_ON_IN_IRQ(); \
+	likely(!__range_not_ok(addr, size, user_addr_max())); \
+})
 
 /*
  * These are the main single-value transfer routines. They automatically
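
For context on what the new WARN_ON_IN_IRQ() is meant to catch: with CONFIG_DEBUG_ATOMIC_SLEEP=y, an access_ok() issued outside task context now fires WARN_ON_ONCE(!in_task()). The fragment below is a hypothetical illustration of such a misuse, not code from this merge.

/* Hypothetical misuse, for illustration only. */
#include <linux/interrupt.h>
#include <linux/uaccess.h>

static irqreturn_t example_irq_handler(int irq, void *dev_id)
{
	void __user *ubuf = dev_id;	/* pretend a user pointer was stashed here */

	/* With CONFIG_DEBUG_ATOMIC_SLEEP=y this now warns: we are not in task context. */
	if (access_ok(VERIFY_READ, ubuf, sizeof(int)))
		pr_warn("access_ok() called from IRQ context\n");

	return IRQ_HANDLED;
}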
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 79076d75bdbf..05110c1097ae 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o
 
 obj-$(CONFIG_PERF_EVENTS)	+= perf_regs.o
 obj-$(CONFIG_TRACING)		+= tracepoint.o
+obj-$(CONFIG_SCHED_MC_PRIO)	+= itmt.o
 
 ifdef CONFIG_FRAME_POINTER
 obj-y				+= unwind_frame.o
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 51287cd90bf6..643818a7688b 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -906,14 +906,14 @@ static int apm_cpu_idle(struct cpuidle_device *dev,
 	static int use_apm_idle; /* = 0 */
 	static unsigned int last_jiffies; /* = 0 */
 	static unsigned int last_stime; /* = 0 */
-	cputime_t stime;
+	cputime_t stime, utime;
 
 	int apm_idle_done = 0;
 	unsigned int jiffies_since_last_check = jiffies - last_jiffies;
 	unsigned int bucket;
 
 recalc:
-	task_cputime(current, NULL, &stime);
+	task_cputime(current, &utime, &stime);
 	if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
 		use_apm_idle = 0;
 	} else if (jiffies_since_last_check > idle_period) {
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
new file mode 100644
index 000000000000..cb9c1ed1d391
--- /dev/null
+++ b/arch/x86/kernel/itmt.c
@@ -0,0 +1,215 @@
+/*
+ * itmt.c: Support Intel Turbo Boost Max Technology 3.0
+ *
+ * (C) Copyright 2016 Intel Corporation
+ * Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ *
+ * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT),
+ * the maximum turbo frequencies of some cores in a CPU package may be
+ * higher than for the other cores in the same package. In that case,
+ * better performance can be achieved by making the scheduler prefer
+ * to run tasks on the CPUs with higher max turbo frequencies.
+ *
+ * This file provides functions and data structures for enabling the
+ * scheduler to favor scheduling on cores can be boosted to a higher
+ * frequency under ITMT.
+ */
+
+#include <linux/sched.h>
+#include <linux/cpumask.h>
+#include <linux/cpuset.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/sysctl.h>
+#include <linux/nodemask.h>
+
+static DEFINE_MUTEX(itmt_update_mutex);
+DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
+
+/* Boolean to track if system has ITMT capabilities */
+static bool __read_mostly sched_itmt_capable;
+
+/*
+ * Boolean to control whether we want to move processes to cpu capable
+ * of higher turbo frequency for cpus supporting Intel Turbo Boost Max
+ * Technology 3.0.
+ *
+ * It can be set via /proc/sys/kernel/sched_itmt_enabled
+ */
+unsigned int __read_mostly sysctl_sched_itmt_enabled;
+
+static int sched_itmt_update_handler(struct ctl_table *table, int write,
+				     void __user *buffer, size_t *lenp,
+				     loff_t *ppos)
+{
+	unsigned int old_sysctl;
+	int ret;
+
+	mutex_lock(&itmt_update_mutex);
+
+	if (!sched_itmt_capable) {
+		mutex_unlock(&itmt_update_mutex);
+		return -EINVAL;
+	}
+
+	old_sysctl = sysctl_sched_itmt_enabled;
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+	if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) {
+		x86_topology_update = true;
+		rebuild_sched_domains();
+	}
+
+	mutex_unlock(&itmt_update_mutex);
+
+	return ret;
+}
+
+static unsigned int zero;
+static unsigned int one = 1;
+static struct ctl_table itmt_kern_table[] = {
+	{
+		.procname	= "sched_itmt_enabled",
+		.data		= &sysctl_sched_itmt_enabled,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= sched_itmt_update_handler,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{}
+};
+
+static struct ctl_table itmt_root_table[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= itmt_kern_table,
+	},
+	{}
+};
+
+static struct ctl_table_header *itmt_sysctl_header;
+
+/**
+ * sched_set_itmt_support() - Indicate platform supports ITMT
+ *
+ * This function is used by the OS to indicate to scheduler that the platform
+ * is capable of supporting the ITMT feature.
+ *
+ * The current scheme has the pstate driver detects if the system
+ * is ITMT capable and call sched_set_itmt_support.
+ *
+ * This must be done only after sched_set_itmt_core_prio
+ * has been called to set the cpus' priorities.
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ *
+ * Return: 0 on success
+ */
+int sched_set_itmt_support(void)
+{
+	mutex_lock(&itmt_update_mutex);
+
+	if (sched_itmt_capable) {
+		mutex_unlock(&itmt_update_mutex);
+		return 0;
+	}
+
+	itmt_sysctl_header = register_sysctl_table(itmt_root_table);
+	if (!itmt_sysctl_header) {
+		mutex_unlock(&itmt_update_mutex);
+		return -ENOMEM;
+	}
+
+	sched_itmt_capable = true;
+
+	sysctl_sched_itmt_enabled = 1;
+
+	if (sysctl_sched_itmt_enabled) {
+		x86_topology_update = true;
+		rebuild_sched_domains();
+	}
+
+	mutex_unlock(&itmt_update_mutex);
+
+	return 0;
+}
+
+/**
+ * sched_clear_itmt_support() - Revoke platform's support of ITMT
+ *
+ * This function is used by the OS to indicate that it has
+ * revoked the platform's support of ITMT feature.
+ *
+ * It must not be called with cpu hot plug lock
+ * held as we need to acquire the lock to rebuild sched domains
+ * later.
+ */
+void sched_clear_itmt_support(void)
+{
+	mutex_lock(&itmt_update_mutex);
+
+	if (!sched_itmt_capable) {
+		mutex_unlock(&itmt_update_mutex);
+		return;
+	}
+	sched_itmt_capable = false;
+
+	if (itmt_sysctl_header) {
+		unregister_sysctl_table(itmt_sysctl_header);
+		itmt_sysctl_header = NULL;
+	}
+
+	if (sysctl_sched_itmt_enabled) {
+		/* disable sched_itmt if we are no longer ITMT capable */
+		sysctl_sched_itmt_enabled = 0;
+		x86_topology_update = true;
+		rebuild_sched_domains();
+	}
+
+	mutex_unlock(&itmt_update_mutex);
+}
+
+int arch_asym_cpu_priority(int cpu)
+{
+	return per_cpu(sched_core_priority, cpu);
+}
+
+/**
+ * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+ * @prio:	Priority of cpu core
+ * @core_cpu:	The cpu number associated with the core
+ *
+ * The pstate driver will find out the max boost frequency
+ * and call this function to set a priority proportional
+ * to the max boost frequency. CPU with higher boost
+ * frequency will receive higher priority.
+ *
+ * No need to rebuild sched domain after updating
+ * the CPU priorities. The sched domains have no
+ * dependency on CPU priorities.
+ */
+void sched_set_itmt_core_prio(int prio, int core_cpu)
+{
+	int cpu, i = 1;
+
+	for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
+		int smt_prio;
+
+		/*
+		 * Ensure that the siblings are moved to the end
+		 * of the priority chain and only used when
+		 * all other high priority cpus are out of capacity.
+		 */
+		smt_prio = prio * smp_num_siblings / i;
+		per_cpu(sched_core_priority, cpu) = smt_prio;
+		i++;
+	}
+}
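
On the consumer side (the "sched: Extend scheduler's asym packing" commit in the shortlog, whose diff is not shown on this page), the scheduler is expected to compare candidate CPUs through the arch_asym_cpu_priority() hook above whenever SD_ASYM_PACKING is set, roughly along the lines of the assumed sketch below; the helper name is illustrative, not quoted from that commit.

/* Assumed shape of the scheduler-side comparison; name is illustrative. */
static inline bool example_asym_prefer(int a, int b)
{
	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}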
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index b9f02383f372..118e792a7be6 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -109,6 +109,17 @@ static bool logical_packages_frozen __read_mostly;
 /* Maximum number of SMT threads on any online core */
 int __max_smt_threads __read_mostly;
 
+/* Flag to indicate if a complete sched domain rebuild is required */
+bool x86_topology_update;
+
+int arch_update_cpu_topology(void)
+{
+	int retval = x86_topology_update;
+
+	x86_topology_update = false;
+	return retval;
+}
+
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
 	unsigned long flags;
@@ -471,22 +482,42 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 	return false;
 }
 
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+static inline int x86_sched_itmt_flags(void)
+{
+	return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0;
+}
+
+#ifdef CONFIG_SCHED_MC
+static int x86_core_flags(void)
+{
+	return cpu_core_flags() | x86_sched_itmt_flags();
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static int x86_smt_flags(void)
+{
+	return cpu_smt_flags() | x86_sched_itmt_flags();
+}
+#endif
+#endif
+
 static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
#endif
 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
 	{ NULL, },
 };
 
 static struct sched_domain_topology_level x86_topology[] = {
 #ifdef CONFIG_SCHED_SMT
-	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+	{ cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) },
 #endif
 #ifdef CONFIG_SCHED_MC
-	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+	{ cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) },
 #endif
 	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 	{ NULL, },
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index 56190d00fd87..5cbefd7621f0 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -331,6 +331,16 @@ static void acpi_bus_osc_support(void)
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_HOTPLUG_OST_SUPPORT;
 	capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_PCLPI_SUPPORT;
 
+#ifdef CONFIG_X86
+	if (boot_cpu_has(X86_FEATURE_HWP)) {
+		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_SUPPORT;
+		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPCV2_SUPPORT;
+	}
+#endif
+
+	if (IS_ENABLED(CONFIG_SCHED_MC_PRIO))
+		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_CPC_DIVERSE_HIGH_SUPPORT;
+
 	if (!ghes_disable)
 		capbuf[OSC_SUPPORT_DWORD] |= OSC_SB_APEI_SUPPORT;
 	if (ACPI_FAILURE(acpi_get_handle(NULL, "\\_SB", &handle)))
diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86
index adbd1de1cea5..35f71825b7f3 100644
--- a/drivers/cpufreq/Kconfig.x86
+++ b/drivers/cpufreq/Kconfig.x86
@@ -6,6 +6,7 @@ config X86_INTEL_PSTATE
 	bool "Intel P state control"
 	depends on X86
 	select ACPI_PROCESSOR if ACPI
+	select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO
 	help
 	  This driver provides a P state for Intel core processors.
 	  The driver implements an internal governor and will become
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 4737520ec823..e8dc42fc0915 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -44,6 +44,7 @@
 
 #ifdef CONFIG_ACPI
 #include <acpi/processor.h>
+#include <acpi/cppc_acpi.h>
 #endif
 
 #define FRAC_BITS 8
@@ -379,14 +380,67 @@ static bool intel_pstate_get_ppc_enable_status(void)
 	return acpi_ppc;
 }
 
+#ifdef CONFIG_ACPI_CPPC_LIB
+
+/* The work item is needed to avoid CPU hotplug locking issues */
+static void intel_pstste_sched_itmt_work_fn(struct work_struct *work)
+{
+	sched_set_itmt_support();
+}
+
+static DECLARE_WORK(sched_itmt_work, intel_pstste_sched_itmt_work_fn);
+
+static void intel_pstate_set_itmt_prio(int cpu)
+{
+	struct cppc_perf_caps cppc_perf;
+	static u32 max_highest_perf = 0, min_highest_perf = U32_MAX;
+	int ret;
+
+	ret = cppc_get_perf_caps(cpu, &cppc_perf);
+	if (ret)
+		return;
+
+	/*
+	 * The priorities can be set regardless of whether or not
+	 * sched_set_itmt_support(true) has been called and it is valid to
+	 * update them at any time after it has been called.
+	 */
+	sched_set_itmt_core_prio(cppc_perf.highest_perf, cpu);
+
+	if (max_highest_perf <= min_highest_perf) {
+		if (cppc_perf.highest_perf > max_highest_perf)
+			max_highest_perf = cppc_perf.highest_perf;
+
+		if (cppc_perf.highest_perf < min_highest_perf)
+			min_highest_perf = cppc_perf.highest_perf;
+
+		if (max_highest_perf > min_highest_perf) {
+			/*
+			 * This code can be run during CPU online under the
+			 * CPU hotplug locks, so sched_set_itmt_support()
+			 * cannot be called from here. Queue up a work item
+			 * to invoke it.
+			 */
+			schedule_work(&sched_itmt_work);
+		}
+	}
+}
+#else
+static void intel_pstate_set_itmt_prio(int cpu)
+{
+}
+#endif
+
 static void intel_pstate_init_acpi_perf_limits(struct cpufreq_policy *policy)
 {
 	struct cpudata *cpu;
 	int ret;
 	int i;
 
-	if (hwp_active)
+	if (hwp_active) {
+		intel_pstate_set_itmt_prio(policy->cpu);
 		return;
+	}
 
 	if (!intel_pstate_get_ppc_enable_status())
 		return;
diff --git a/include/asm-generic/cputime_jiffies.h b/include/asm-generic/cputime_jiffies.h
index fe386fc6e85e..6bb8cd45f53b 100644
--- a/include/asm-generic/cputime_jiffies.h
+++ b/include/asm-generic/cputime_jiffies.h
@@ -7,7 +7,6 @@ typedef unsigned long __nocast cputime_t;
 
 #define cputime_one_jiffy		jiffies_to_cputime(1)
 #define cputime_to_jiffies(__ct)	(__force unsigned long)(__ct)
-#define cputime_to_scaled(__ct)		(__ct)
 #define jiffies_to_cputime(__hz)	(__force cputime_t)(__hz)
 
 typedef u64 __nocast cputime64_t;
diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index a84e28e0c634..4e3b18e559b1 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -34,7 +34,6 @@ typedef u64 __nocast cputime64_t;
  */
 #define cputime_to_jiffies(__ct)	\
 	cputime_div(__ct, NSEC_PER_SEC / HZ)
-#define cputime_to_scaled(__ct)		(__ct)
 #define jiffies_to_cputime(__jif)	\
 	(__force cputime_t)((__jif) * (NSEC_PER_SEC / HZ))
 #define cputime64_to_jiffies64(__ct)	\
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 61a3d90f32b3..051023756520 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -469,6 +469,7 @@ acpi_status acpi_run_osc(acpi_handle handle, struct acpi_osc_context *context);
 #define OSC_SB_CPCV2_SUPPORT			0x00000040
 #define OSC_SB_PCLPI_SUPPORT			0x00000080
 #define OSC_SB_OSLPI_SUPPORT			0x00000100
+#define OSC_SB_CPC_DIVERSE_HIGH_SUPPORT		0x00001000
 
 extern bool osc_sb_apei_support_acked;
 extern bool osc_pc_lpi_support_confirmed;
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 44fda64ad434..00f776816aa3 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -78,8 +78,8 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
 	return kstat_cpu(cpu).irqs_sum;
 }
 
-extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
-extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
+extern void account_user_time(struct task_struct *, cputime_t);
+extern void account_system_time(struct task_struct *, int, cputime_t);
 extern void account_steal_time(cputime_t);
 extern void account_idle_time(cputime_t);
 
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index a6e82a69c363..c1c3e63d52c1 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -48,6 +48,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 	__k;								\
 })
 
+void free_kthread_struct(struct task_struct *k);
 void kthread_bind(struct task_struct *k, unsigned int cpu);
 void kthread_bind_mask(struct task_struct *k, const struct cpumask *mask);
 int kthread_stop(struct task_struct *k);
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index 75e4e30677f1..7eeceac52dea 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h
@@ -65,19 +65,24 @@
 
 /*
  * Are we doing bottom half or hardware interrupt processing?
- * Are we in a softirq context? Interrupt context?
- * in_softirq - Are we currently processing softirq or have bh disabled?
- * in_serving_softirq - Are we currently processing softirq?
+ *
+ * in_irq()       - We're in (hard) IRQ context
+ * in_softirq()   - We have BH disabled, or are processing softirqs
+ * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled
+ * in_serving_softirq() - We're in softirq context
+ * in_nmi()       - We're in NMI context
+ * in_task()	  - We're in task context
+ *
+ * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really
+ *       should not be used in new code.
  */
 #define in_irq()		(hardirq_count())
 #define in_softirq()		(softirq_count())
 #define in_interrupt()		(irq_count())
 #define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)
-
-/*
- * Are we in NMI context?
- */
-#define in_nmi()	(preempt_count() & NMI_MASK)
+#define in_nmi()		(preempt_count() & NMI_MASK)
+#define in_task()		(!(preempt_count() & \
+				   (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
 
 /*
  * The preempt_count offset after preempt_disable();
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8863bdf582d5..7551d3e2ab70 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -262,20 +262,9 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_task_state(tsk, state_value)			\
 	do {							\
 		(tsk)->task_state_change = _THIS_IP_;		\
 		smp_store_mb((tsk)->state, (state_value));	\
 	} while (0)
 
-/*
- * set_current_state() includes a barrier so that the write of current->state
- * is correctly serialised wrt the caller's subsequent test of whether to
- * actually sleep:
- *
- *   set_current_state(TASK_UNINTERRUPTIBLE);
- *   if (do_i_need_to_sleep())
- *       schedule();
- *
- * If the caller does not need such serialisation then use __set_current_state()
- */
 #define __set_current_state(state_value)			\
 	do {							\
 		current->task_state_change = _THIS_IP_;		\
@@ -284,11 +273,19 @@ extern char ___assert_task_state[1 - 2*!!(
 #define set_current_state(state_value)				\
 	do {							\
 		current->task_state_change = _THIS_IP_;		\
 		smp_store_mb(current->state, (state_value));	\
 	} while (0)
 
 #else
 
+/*
+ * @tsk had better be current, or you get to keep the pieces.
+ *
+ * The only reason is that computing current can be more expensive than
+ * using a pointer that's already available.
+ *
+ * Therefore, see set_current_state().
+ */
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
 #define set_task_state(tsk, state_value)		\
@@ -299,11 +296,34 @@ extern char ___assert_task_state[1 - 2*!!(
 * is correctly serialised wrt the caller's subsequent test of whether to
 * actually sleep:
 *
+ *   for (;;) {
 *	set_current_state(TASK_UNINTERRUPTIBLE);
- *   if (do_i_need_to_sleep())
- *       schedule();
+ *	if (!need_sleep)
+ *		break;
+ *
+ *	schedule();
+ *   }
+ *   __set_current_state(TASK_RUNNING);
+ *
+ * If the caller does not need such serialisation (because, for instance, the
+ * condition test and condition change and wakeup are under the same lock) then
+ * use __set_current_state().
+ *
+ * The above is typically ordered against the wakeup, which does:
+ *
+ *   need_sleep = false;
+ *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
+ *
+ * Where wake_up_state() (and all other wakeup primitives) imply enough
+ * barriers to order the store of the variable against wakeup.
+ *
+ * Wakeup will do: if (@state & p->state) p->state = TASK_RUNNING, that is,
+ * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
+ * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
 *
- * If the caller does not need such serialisation then use __set_current_state()
+ * This is obviously fine, since they both store the exact same value.
+ *
+ * Also see the comments of try_to_wake_up().
 */
 #define __set_current_state(state_value)		\
 	do { current->state = (state_value); } while (0)
@@ -1057,6 +1077,8 @@ static inline int cpu_numa_flags(void)
 }
 #endif
 
+extern int arch_asym_cpu_priority(int cpu);
+
 struct sched_domain_attr {
 	int relax_domain_level;
 };
@@ -1627,7 +1649,10 @@ struct task_struct {
 	int __user *set_child_tid;		/* CLONE_CHILD_SETTID */
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
-	cputime_t utime, stime, utimescaled, stimescaled;
+	cputime_t utime, stime;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
+	cputime_t utimescaled, stimescaled;
+#endif
 	cputime_t gtime;
 	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -2220,34 +2245,38 @@ struct task_struct *try_get_task_struct(struct task_struct **ptask);
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
 extern void task_cputime(struct task_struct *t,
 			 cputime_t *utime, cputime_t *stime);
-extern void task_cputime_scaled(struct task_struct *t,
-				cputime_t *utimescaled, cputime_t *stimescaled);
 extern cputime_t task_gtime(struct task_struct *t);
 #else
 static inline void task_cputime(struct task_struct *t,
 				cputime_t *utime, cputime_t *stime)
 {
-	if (utime)
-		*utime = t->utime;
-	if (stime)
-		*stime = t->stime;
+	*utime = t->utime;
+	*stime = t->stime;
 }
 
+static inline cputime_t task_gtime(struct task_struct *t)
+{
+	return t->gtime;
+}
+#endif
+
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 static inline void task_cputime_scaled(struct task_struct *t,
 					cputime_t *utimescaled,
 					cputime_t *stimescaled)
 {
-	if (utimescaled)
-		*utimescaled = t->utimescaled;
-	if (stimescaled)
-		*stimescaled = t->stimescaled;
+	*utimescaled = t->utimescaled;
+	*stimescaled = t->stimescaled;
 }
-
-static inline cputime_t task_gtime(struct task_struct *t)
+#else
+static inline void task_cputime_scaled(struct task_struct *t,
+					cputime_t *utimescaled,
+					cputime_t *stimescaled)
 {
-	return t->gtime;
+	task_cputime(t, utimescaled, stimescaled);
 }
 #endif
+
 extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 22db1e63707e..441145351301 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -36,7 +36,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_time_avg;
-extern unsigned int sysctl_sched_shares_window;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
diff --git a/kernel/fork.c b/kernel/fork.c
index 997ac1d584f7..7ffa16033ded 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -354,6 +354,8 @@ void free_task(struct task_struct *tsk)
 	ftrace_graph_exit_task(tsk);
 	put_seccomp_filter(tsk);
 	arch_release_task_struct(tsk);
+	if (tsk->flags & PF_KTHREAD)
+		free_kthread_struct(tsk);
 	free_task_struct(tsk);
 }
 EXPORT_SYMBOL(free_task);
@@ -1551,7 +1553,9 @@ static __latent_entropy struct task_struct *copy_process(
 	init_sigpending(&p->pending);
 
 	p->utime = p->stime = p->gtime = 0;
+#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
 	p->utimescaled = p->stimescaled = 0;
+#endif
 	prev_cputime_init(&p->prev_cputime);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
diff --git a/kernel/kthread.c b/kernel/kthread.c index be2cc1f9dd57..956495f0efaf 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -53,20 +53,29 @@ enum KTHREAD_BITS { | |||
53 | KTHREAD_IS_PARKED, | 53 | KTHREAD_IS_PARKED, |
54 | }; | 54 | }; |
55 | 55 | ||
56 | #define __to_kthread(vfork) \ | 56 | static inline void set_kthread_struct(void *kthread) |
57 | container_of(vfork, struct kthread, exited) | 57 | { |
58 | /* | ||
59 | * We abuse ->set_child_tid to avoid the new member and because it | ||
60 | * can't be wrongly copied by copy_process(). We also rely on fact | ||
61 | * that the caller can't exec, so PF_KTHREAD can't be cleared. | ||
62 | */ | ||
63 | current->set_child_tid = (__force void __user *)kthread; | ||
64 | } | ||
58 | 65 | ||
59 | static inline struct kthread *to_kthread(struct task_struct *k) | 66 | static inline struct kthread *to_kthread(struct task_struct *k) |
60 | { | 67 | { |
61 | return __to_kthread(k->vfork_done); | 68 | WARN_ON(!(k->flags & PF_KTHREAD)); |
69 | return (__force void *)k->set_child_tid; | ||
62 | } | 70 | } |
63 | 71 | ||
64 | static struct kthread *to_live_kthread(struct task_struct *k) | 72 | void free_kthread_struct(struct task_struct *k) |
65 | { | 73 | { |
66 | struct completion *vfork = ACCESS_ONCE(k->vfork_done); | 74 | /* |
67 | if (likely(vfork) && try_get_task_stack(k)) | 75 | * Can be NULL if this kthread was created by kernel_thread() |
68 | return __to_kthread(vfork); | 76 | * or if kmalloc() in kthread() failed. |
69 | return NULL; | 77 | */ |
78 | kfree(to_kthread(k)); | ||
70 | } | 79 | } |
71 | 80 | ||
72 | /** | 81 | /** |
@@ -181,14 +190,11 @@ static int kthread(void *_create) | |||
181 | int (*threadfn)(void *data) = create->threadfn; | 190 | int (*threadfn)(void *data) = create->threadfn; |
182 | void *data = create->data; | 191 | void *data = create->data; |
183 | struct completion *done; | 192 | struct completion *done; |
184 | struct kthread self; | 193 | struct kthread *self; |
185 | int ret; | 194 | int ret; |
186 | 195 | ||
187 | self.flags = 0; | 196 | self = kmalloc(sizeof(*self), GFP_KERNEL); |
188 | self.data = data; | 197 | set_kthread_struct(self); |
189 | init_completion(&self.exited); | ||
190 | init_completion(&self.parked); | ||
191 | current->vfork_done = &self.exited; | ||
192 | 198 | ||
193 | /* If user was SIGKILLed, I release the structure. */ | 199 | /* If user was SIGKILLed, I release the structure. */ |
194 | done = xchg(&create->done, NULL); | 200 | done = xchg(&create->done, NULL); |
@@ -196,6 +202,19 @@ static int kthread(void *_create) | |||
196 | kfree(create); | 202 | kfree(create); |
197 | do_exit(-EINTR); | 203 | do_exit(-EINTR); |
198 | } | 204 | } |
205 | |||
206 | if (!self) { | ||
207 | create->result = ERR_PTR(-ENOMEM); | ||
208 | complete(done); | ||
209 | do_exit(-ENOMEM); | ||
210 | } | ||
211 | |||
212 | self->flags = 0; | ||
213 | self->data = data; | ||
214 | init_completion(&self->exited); | ||
215 | init_completion(&self->parked); | ||
216 | current->vfork_done = &self->exited; | ||
217 | |||
199 | /* OK, tell user we're spawned, wait for stop or wakeup */ | 218 | /* OK, tell user we're spawned, wait for stop or wakeup */ |
200 | __set_current_state(TASK_UNINTERRUPTIBLE); | 219 | __set_current_state(TASK_UNINTERRUPTIBLE); |
201 | create->result = current; | 220 | create->result = current; |
@@ -203,12 +222,10 @@ static int kthread(void *_create) | |||
203 | schedule(); | 222 | schedule(); |
204 | 223 | ||
205 | ret = -EINTR; | 224 | ret = -EINTR; |
206 | 225 | if (!test_bit(KTHREAD_SHOULD_STOP, &self->flags)) { | |
207 | if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) { | 226 | __kthread_parkme(self); |
208 | __kthread_parkme(&self); | ||
209 | ret = threadfn(data); | 227 | ret = threadfn(data); |
210 | } | 228 | } |
211 | /* we can't just return, we must preserve "self" on stack */ | ||
212 | do_exit(ret); | 229 | do_exit(ret); |
213 | } | 230 | } |
214 | 231 | ||
@@ -409,8 +426,18 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), | |||
409 | return p; | 426 | return p; |
410 | } | 427 | } |
411 | 428 | ||
412 | static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) | 429 | /** |
430 | * kthread_unpark - unpark a thread created by kthread_create(). | ||
431 | * @k: thread created by kthread_create(). | ||
432 | * | ||
433 | * Sets kthread_should_park() for @k to return false, wakes it, and | ||
434 | * waits for it to return. If the thread is marked percpu then its | ||
435 | * bound to the cpu again. | ||
436 | */ | ||
437 | void kthread_unpark(struct task_struct *k) | ||
413 | { | 438 | { |
439 | struct kthread *kthread = to_kthread(k); | ||
440 | |||
414 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | 441 | clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); |
415 | /* | 442 | /* |
416 | * We clear the IS_PARKED bit here as we don't wait | 443 | * We clear the IS_PARKED bit here as we don't wait |
@@ -428,24 +455,6 @@ static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) | |||
428 | wake_up_state(k, TASK_PARKED); | 455 | wake_up_state(k, TASK_PARKED); |
429 | } | 456 | } |
430 | } | 457 | } |
431 | |||
432 | /** | ||
433 | * kthread_unpark - unpark a thread created by kthread_create(). | ||
434 | * @k: thread created by kthread_create(). | ||
435 | * | ||
436 | * Sets kthread_should_park() for @k to return false, wakes it, and | ||
437 | * waits for it to return. If the thread is marked percpu then its | ||
438 | * bound to the cpu again. | ||
439 | */ | ||
440 | void kthread_unpark(struct task_struct *k) | ||
441 | { | ||
442 | struct kthread *kthread = to_live_kthread(k); | ||
443 | |||
444 | if (kthread) { | ||
445 | __kthread_unpark(k, kthread); | ||
446 | put_task_stack(k); | ||
447 | } | ||
448 | } | ||
449 | EXPORT_SYMBOL_GPL(kthread_unpark); | 458 | EXPORT_SYMBOL_GPL(kthread_unpark); |
450 | 459 | ||
451 | /** | 460 | /** |
@@ -462,21 +471,20 @@ EXPORT_SYMBOL_GPL(kthread_unpark); | |||
462 | */ | 471 | */ |
463 | int kthread_park(struct task_struct *k) | 472 | int kthread_park(struct task_struct *k) |
464 | { | 473 | { |
465 | struct kthread *kthread = to_live_kthread(k); | 474 | struct kthread *kthread = to_kthread(k); |
466 | int ret = -ENOSYS; | 475 | |
467 | 476 | if (WARN_ON(k->flags & PF_EXITING)) | |
468 | if (kthread) { | 477 | return -ENOSYS; |
469 | if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { | 478 | |
470 | set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); | 479 | if (!test_bit(KTHREAD_IS_PARKED, &kthread->flags)) { |
471 | if (k != current) { | 480 | set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); |
472 | wake_up_process(k); | 481 | if (k != current) { |
473 | wait_for_completion(&kthread->parked); | 482 | wake_up_process(k); |
474 | } | 483 | wait_for_completion(&kthread->parked); |
475 | } | 484 | } |
476 | put_task_stack(k); | ||
477 | ret = 0; | ||
478 | } | 485 | } |
479 | return ret; | 486 | |
487 | return 0; | ||
480 | } | 488 | } |
481 | EXPORT_SYMBOL_GPL(kthread_park); | 489 | EXPORT_SYMBOL_GPL(kthread_park); |
482 | 490 | ||
@@ -503,14 +511,11 @@ int kthread_stop(struct task_struct *k) | |||
503 | trace_sched_kthread_stop(k); | 511 | trace_sched_kthread_stop(k); |
504 | 512 | ||
505 | get_task_struct(k); | 513 | get_task_struct(k); |
506 | kthread = to_live_kthread(k); | 514 | kthread = to_kthread(k); |
507 | if (kthread) { | 515 | set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); |
508 | set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); | 516 | kthread_unpark(k); |
509 | __kthread_unpark(k, kthread); | 517 | wake_up_process(k); |
510 | wake_up_process(k); | 518 | wait_for_completion(&kthread->exited); |
511 | wait_for_completion(&kthread->exited); | ||
512 | put_task_stack(k); | ||
513 | } | ||
514 | ret = k->exit_code; | 519 | ret = k->exit_code; |
515 | put_task_struct(k); | 520 | put_task_struct(k); |
516 | 521 | ||
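Taken together, the reworked park/unpark/stop paths above are driven from a controller roughly like the sketch below. my_worker, my_start and my_stop are hypothetical names; the thread function is assumed to be a park-aware loop like the one sketched after the kthread() hunk.

#include <linux/kthread.h>
#include <linux/err.h>

int my_threadfn(void *data);            /* the park/stop loop sketched earlier */

static struct task_struct *my_worker;

static int my_start(void *my_data)
{
        my_worker = kthread_run(my_threadfn, my_data, "my-worker");
        if (IS_ERR(my_worker))
                return PTR_ERR(my_worker);

        kthread_park(my_worker);        /* worker ends up sleeping in kthread_parkme() */
        kthread_unpark(my_worker);      /* clears KTHREAD_SHOULD_PARK and wakes it */
        return 0;
}

static void my_stop(void)
{
        /* Sets KTHREAD_SHOULD_STOP, unparks/wakes the thread, waits for its exit. */
        kthread_stop(my_worker);
}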
@@ -636,6 +641,7 @@ __kthread_create_worker(int cpu, unsigned int flags, | |||
636 | { | 641 | { |
637 | struct kthread_worker *worker; | 642 | struct kthread_worker *worker; |
638 | struct task_struct *task; | 643 | struct task_struct *task; |
644 | int node = -1; | ||
639 | 645 | ||
640 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); | 646 | worker = kzalloc(sizeof(*worker), GFP_KERNEL); |
641 | if (!worker) | 647 | if (!worker) |
@@ -643,25 +649,17 @@ __kthread_create_worker(int cpu, unsigned int flags, | |||
643 | 649 | ||
644 | kthread_init_worker(worker); | 650 | kthread_init_worker(worker); |
645 | 651 | ||
646 | if (cpu >= 0) { | 652 | if (cpu >= 0) |
647 | char name[TASK_COMM_LEN]; | 653 | node = cpu_to_node(cpu); |
648 | |||
649 | /* | ||
650 | * kthread_create_worker_on_cpu() allows to pass a generic | ||
651 | * namefmt in compare with kthread_create_on_cpu. We need | ||
652 | * to format it here. | ||
653 | */ | ||
654 | vsnprintf(name, sizeof(name), namefmt, args); | ||
655 | task = kthread_create_on_cpu(kthread_worker_fn, worker, | ||
656 | cpu, name); | ||
657 | } else { | ||
658 | task = __kthread_create_on_node(kthread_worker_fn, worker, | ||
659 | -1, namefmt, args); | ||
660 | } | ||
661 | 654 | ||
655 | task = __kthread_create_on_node(kthread_worker_fn, worker, | ||
656 | node, namefmt, args); | ||
662 | if (IS_ERR(task)) | 657 | if (IS_ERR(task)) |
663 | goto fail_task; | 658 | goto fail_task; |
664 | 659 | ||
660 | if (cpu >= 0) | ||
661 | kthread_bind(task, cpu); | ||
662 | |||
665 | worker->flags = flags; | 663 | worker->flags = flags; |
666 | worker->task = task; | 664 | worker->task = task; |
667 | wake_up_process(task); | 665 | wake_up_process(task); |
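The hunk above only changes how the worker task is created internally (NUMA-local creation followed by an explicit kthread_bind()); the caller-visible kthread_worker API is unchanged. A hedged usage sketch, where demo_work_fn and the choice of CPU 2 are illustrative assumptions:

#include <linux/kthread.h>
#include <linux/err.h>

static void demo_work_fn(struct kthread_work *work)
{
        /* process one queued item */
}

static DEFINE_KTHREAD_WORK(demo_work, demo_work_fn);

static int demo_worker_create(void)
{
        struct kthread_worker *w;

        /* Created on the node of CPU 2, then bound to CPU 2 by the code above. */
        w = kthread_create_worker_on_cpu(2, 0, "demo_worker/2");
        if (IS_ERR(w))
                return PTR_ERR(w);

        kthread_queue_work(w, &demo_work);
        kthread_flush_worker(w);
        kthread_destroy_worker(w);
        return 0;
}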
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8b08fb257856..d18804491d9f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -1995,14 +1995,15 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) | |||
1995 | * @state: the mask of task states that can be woken | 1995 | * @state: the mask of task states that can be woken |
1996 | * @wake_flags: wake modifier flags (WF_*) | 1996 | * @wake_flags: wake modifier flags (WF_*) |
1997 | * | 1997 | * |
1998 | * Put it on the run-queue if it's not already there. The "current" | 1998 | * If (@state & @p->state) @p->state = TASK_RUNNING. |
1999 | * thread is always on the run-queue (except when the actual | ||
2000 | * re-schedule is in progress), and as such you're allowed to do | ||
2001 | * the simpler "current->state = TASK_RUNNING" to mark yourself | ||
2002 | * runnable without the overhead of this. | ||
2003 | * | 1999 | * |
2004 | * Return: %true if @p was woken up, %false if it was already running. | 2000 | * If the task was not queued/runnable, also place it back on a runqueue. |
2005 | * or @state didn't match @p's state. | 2001 | * |
2002 | * Atomic against schedule() which would dequeue a task, also see | ||
2003 | * set_current_state(). | ||
2004 | * | ||
2005 | * Return: %true if @p->state changes (an actual wakeup was done), | ||
2006 | * %false otherwise. | ||
2006 | */ | 2007 | */ |
2007 | static int | 2008 | static int |
2008 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) | 2009 | try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) |
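The rewritten comment refers to the usual pairing between a sleeper's set_current_state() and the waker's wake_up_process(), which ends up in try_to_wake_up(). A minimal sketch of that pattern, with my_cond and my_waiter as illustrative globals rather than anything from the patch:

#include <linux/sched.h>

static bool my_cond;
static struct task_struct *my_waiter;

static void my_wait(void)
{
        for (;;) {
                /* Publish the sleeping state before re-checking the condition ... */
                set_current_state(TASK_UNINTERRUPTIBLE);
                if (READ_ONCE(my_cond))
                        break;
                schedule();
        }
        __set_current_state(TASK_RUNNING);
}

static void my_wake(void)
{
        /* ... so this wakeup cannot be lost between the check and schedule(). */
        WRITE_ONCE(my_cond, true);
        wake_up_process(my_waiter);
}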
@@ -5707,7 +5708,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5707 | printk(KERN_CONT " %*pbl", | 5708 | printk(KERN_CONT " %*pbl", |
5708 | cpumask_pr_args(sched_group_cpus(group))); | 5709 | cpumask_pr_args(sched_group_cpus(group))); |
5709 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { | 5710 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) { |
5710 | printk(KERN_CONT " (cpu_capacity = %d)", | 5711 | printk(KERN_CONT " (cpu_capacity = %lu)", |
5711 | group->sgc->capacity); | 5712 | group->sgc->capacity); |
5712 | } | 5713 | } |
5713 | 5714 | ||
@@ -6184,6 +6185,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6184 | * die on a /0 trap. | 6185 | * die on a /0 trap. |
6185 | */ | 6186 | */ |
6186 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); | 6187 | sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); |
6188 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; | ||
6187 | 6189 | ||
6188 | /* | 6190 | /* |
6189 | * Make sure the first group of this domain contains the | 6191 | * Make sure the first group of this domain contains the |
@@ -6301,7 +6303,22 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) | |||
6301 | WARN_ON(!sg); | 6303 | WARN_ON(!sg); |
6302 | 6304 | ||
6303 | do { | 6305 | do { |
6306 | int cpu, max_cpu = -1; | ||
6307 | |||
6304 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); | 6308 | sg->group_weight = cpumask_weight(sched_group_cpus(sg)); |
6309 | |||
6310 | if (!(sd->flags & SD_ASYM_PACKING)) | ||
6311 | goto next; | ||
6312 | |||
6313 | for_each_cpu(cpu, sched_group_cpus(sg)) { | ||
6314 | if (max_cpu < 0) | ||
6315 | max_cpu = cpu; | ||
6316 | else if (sched_asym_prefer(cpu, max_cpu)) | ||
6317 | max_cpu = cpu; | ||
6318 | } | ||
6319 | sg->asym_prefer_cpu = max_cpu; | ||
6320 | |||
6321 | next: | ||
6305 | sg = sg->next; | 6322 | sg = sg->next; |
6306 | } while (sg != sd->groups); | 6323 | } while (sg != sd->groups); |
6307 | 6324 | ||
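sched_asym_prefer() itself is not shown in this hunk; in this series it reduces to a comparison of the architecture-provided CPU priorities, roughly:

static inline bool sched_asym_prefer(int a, int b)
{
        /* Higher arch priority wins; ties keep the incumbent max_cpu above. */
        return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}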
@@ -7602,6 +7619,7 @@ void __init sched_init(void) | |||
7602 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7619 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7603 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; | 7620 | root_task_group.shares = ROOT_TASK_GROUP_LOAD; |
7604 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7621 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7622 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; | ||
7605 | /* | 7623 | /* |
7606 | * How much cpu bandwidth does root_task_group get? | 7624 | * How much cpu bandwidth does root_task_group get? |
7607 | * | 7625 | * |
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index bc0b309c3f19..9add206b5608 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -297,7 +297,7 @@ static int cpuacct_stats_show(struct seq_file *sf, void *v) | |||
297 | for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { | 297 | for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { |
298 | seq_printf(sf, "%s %lld\n", | 298 | seq_printf(sf, "%s %lld\n", |
299 | cpuacct_stat_desc[stat], | 299 | cpuacct_stat_desc[stat], |
300 | cputime64_to_clock_t(val[stat])); | 300 | (long long)cputime64_to_clock_t(val[stat])); |
301 | } | 301 | } |
302 | 302 | ||
303 | return 0; | 303 | return 0; |
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5ebee3164e64..7700a9cba335 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -128,16 +128,13 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
128 | * Account user cpu time to a process. | 128 | * Account user cpu time to a process. |
129 | * @p: the process that the cpu time gets accounted to | 129 | * @p: the process that the cpu time gets accounted to |
130 | * @cputime: the cpu time spent in user space since the last update | 130 | * @cputime: the cpu time spent in user space since the last update |
131 | * @cputime_scaled: cputime scaled by cpu frequency | ||
132 | */ | 131 | */ |
133 | void account_user_time(struct task_struct *p, cputime_t cputime, | 132 | void account_user_time(struct task_struct *p, cputime_t cputime) |
134 | cputime_t cputime_scaled) | ||
135 | { | 133 | { |
136 | int index; | 134 | int index; |
137 | 135 | ||
138 | /* Add user time to process. */ | 136 | /* Add user time to process. */ |
139 | p->utime += cputime; | 137 | p->utime += cputime; |
140 | p->utimescaled += cputime_scaled; | ||
141 | account_group_user_time(p, cputime); | 138 | account_group_user_time(p, cputime); |
142 | 139 | ||
143 | index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; | 140 | index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; |
@@ -153,16 +150,13 @@ void account_user_time(struct task_struct *p, cputime_t cputime, | |||
153 | * Account guest cpu time to a process. | 150 | * Account guest cpu time to a process. |
154 | * @p: the process that the cpu time gets accounted to | 151 | * @p: the process that the cpu time gets accounted to |
155 | * @cputime: the cpu time spent in virtual machine since the last update | 152 | * @cputime: the cpu time spent in virtual machine since the last update |
156 | * @cputime_scaled: cputime scaled by cpu frequency | ||
157 | */ | 153 | */ |
158 | static void account_guest_time(struct task_struct *p, cputime_t cputime, | 154 | static void account_guest_time(struct task_struct *p, cputime_t cputime) |
159 | cputime_t cputime_scaled) | ||
160 | { | 155 | { |
161 | u64 *cpustat = kcpustat_this_cpu->cpustat; | 156 | u64 *cpustat = kcpustat_this_cpu->cpustat; |
162 | 157 | ||
163 | /* Add guest time to process. */ | 158 | /* Add guest time to process. */ |
164 | p->utime += cputime; | 159 | p->utime += cputime; |
165 | p->utimescaled += cputime_scaled; | ||
166 | account_group_user_time(p, cputime); | 160 | account_group_user_time(p, cputime); |
167 | p->gtime += cputime; | 161 | p->gtime += cputime; |
168 | 162 | ||
@@ -180,16 +174,13 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime, | |||
180 | * Account system cpu time to a process and desired cpustat field | 174 | * Account system cpu time to a process and desired cpustat field |
181 | * @p: the process that the cpu time gets accounted to | 175 | * @p: the process that the cpu time gets accounted to |
182 | * @cputime: the cpu time spent in kernel space since the last update | 176 | * @cputime: the cpu time spent in kernel space since the last update |
183 | * @cputime_scaled: cputime scaled by cpu frequency | 177 | * @index: pointer to cpustat field that has to be updated |
184 | * @target_cputime64: pointer to cpustat field that has to be updated | ||
185 | */ | 178 | */ |
186 | static inline | 179 | static inline |
187 | void __account_system_time(struct task_struct *p, cputime_t cputime, | 180 | void __account_system_time(struct task_struct *p, cputime_t cputime, int index) |
188 | cputime_t cputime_scaled, int index) | ||
189 | { | 181 | { |
190 | /* Add system time to process. */ | 182 | /* Add system time to process. */ |
191 | p->stime += cputime; | 183 | p->stime += cputime; |
192 | p->stimescaled += cputime_scaled; | ||
193 | account_group_system_time(p, cputime); | 184 | account_group_system_time(p, cputime); |
194 | 185 | ||
195 | /* Add system time to cpustat. */ | 186 | /* Add system time to cpustat. */ |
@@ -204,15 +195,14 @@ void __account_system_time(struct task_struct *p, cputime_t cputime, | |||
204 | * @p: the process that the cpu time gets accounted to | 195 | * @p: the process that the cpu time gets accounted to |
205 | * @hardirq_offset: the offset to subtract from hardirq_count() | 196 | * @hardirq_offset: the offset to subtract from hardirq_count() |
206 | * @cputime: the cpu time spent in kernel space since the last update | 197 | * @cputime: the cpu time spent in kernel space since the last update |
207 | * @cputime_scaled: cputime scaled by cpu frequency | ||
208 | */ | 198 | */ |
209 | void account_system_time(struct task_struct *p, int hardirq_offset, | 199 | void account_system_time(struct task_struct *p, int hardirq_offset, |
210 | cputime_t cputime, cputime_t cputime_scaled) | 200 | cputime_t cputime) |
211 | { | 201 | { |
212 | int index; | 202 | int index; |
213 | 203 | ||
214 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { | 204 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { |
215 | account_guest_time(p, cputime, cputime_scaled); | 205 | account_guest_time(p, cputime); |
216 | return; | 206 | return; |
217 | } | 207 | } |
218 | 208 | ||
@@ -223,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
223 | else | 213 | else |
224 | index = CPUTIME_SYSTEM; | 214 | index = CPUTIME_SYSTEM; |
225 | 215 | ||
226 | __account_system_time(p, cputime, cputime_scaled, index); | 216 | __account_system_time(p, cputime, index); |
227 | } | 217 | } |
228 | 218 | ||
229 | /* | 219 | /* |
@@ -390,7 +380,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
390 | struct rq *rq, int ticks) | 380 | struct rq *rq, int ticks) |
391 | { | 381 | { |
392 | u64 cputime = (__force u64) cputime_one_jiffy * ticks; | 382 | u64 cputime = (__force u64) cputime_one_jiffy * ticks; |
393 | cputime_t scaled, other; | 383 | cputime_t other; |
394 | 384 | ||
395 | /* | 385 | /* |
396 | * When returning from idle, many ticks can get accounted at | 386 | * When returning from idle, many ticks can get accounted at |
@@ -403,7 +393,6 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
403 | if (other >= cputime) | 393 | if (other >= cputime) |
404 | return; | 394 | return; |
405 | cputime -= other; | 395 | cputime -= other; |
406 | scaled = cputime_to_scaled(cputime); | ||
407 | 396 | ||
408 | if (this_cpu_ksoftirqd() == p) { | 397 | if (this_cpu_ksoftirqd() == p) { |
409 | /* | 398 | /* |
@@ -411,15 +400,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, | |||
411 | * So, we have to handle it separately here. | 400 | * So, we have to handle it separately here. |
412 | * Also, p->stime needs to be updated for ksoftirqd. | 401 | * Also, p->stime needs to be updated for ksoftirqd. |
413 | */ | 402 | */ |
414 | __account_system_time(p, cputime, scaled, CPUTIME_SOFTIRQ); | 403 | __account_system_time(p, cputime, CPUTIME_SOFTIRQ); |
415 | } else if (user_tick) { | 404 | } else if (user_tick) { |
416 | account_user_time(p, cputime, scaled); | 405 | account_user_time(p, cputime); |
417 | } else if (p == rq->idle) { | 406 | } else if (p == rq->idle) { |
418 | account_idle_time(cputime); | 407 | account_idle_time(cputime); |
419 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ | 408 | } else if (p->flags & PF_VCPU) { /* System time or guest time */ |
420 | account_guest_time(p, cputime, scaled); | 409 | account_guest_time(p, cputime); |
421 | } else { | 410 | } else { |
422 | __account_system_time(p, cputime, scaled, CPUTIME_SYSTEM); | 411 | __account_system_time(p, cputime, CPUTIME_SYSTEM); |
423 | } | 412 | } |
424 | } | 413 | } |
425 | 414 | ||
@@ -502,7 +491,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime | |||
502 | */ | 491 | */ |
503 | void account_process_tick(struct task_struct *p, int user_tick) | 492 | void account_process_tick(struct task_struct *p, int user_tick) |
504 | { | 493 | { |
505 | cputime_t cputime, scaled, steal; | 494 | cputime_t cputime, steal; |
506 | struct rq *rq = this_rq(); | 495 | struct rq *rq = this_rq(); |
507 | 496 | ||
508 | if (vtime_accounting_cpu_enabled()) | 497 | if (vtime_accounting_cpu_enabled()) |
@@ -520,12 +509,11 @@ void account_process_tick(struct task_struct *p, int user_tick) | |||
520 | return; | 509 | return; |
521 | 510 | ||
522 | cputime -= steal; | 511 | cputime -= steal; |
523 | scaled = cputime_to_scaled(cputime); | ||
524 | 512 | ||
525 | if (user_tick) | 513 | if (user_tick) |
526 | account_user_time(p, cputime, scaled); | 514 | account_user_time(p, cputime); |
527 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) | 515 | else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) |
528 | account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); | 516 | account_system_time(p, HARDIRQ_OFFSET, cputime); |
529 | else | 517 | else |
530 | account_idle_time(cputime); | 518 | account_idle_time(cputime); |
531 | } | 519 | } |
@@ -746,7 +734,7 @@ static void __vtime_account_system(struct task_struct *tsk) | |||
746 | { | 734 | { |
747 | cputime_t delta_cpu = get_vtime_delta(tsk); | 735 | cputime_t delta_cpu = get_vtime_delta(tsk); |
748 | 736 | ||
749 | account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu)); | 737 | account_system_time(tsk, irq_count(), delta_cpu); |
750 | } | 738 | } |
751 | 739 | ||
752 | void vtime_account_system(struct task_struct *tsk) | 740 | void vtime_account_system(struct task_struct *tsk) |
@@ -767,7 +755,7 @@ void vtime_account_user(struct task_struct *tsk) | |||
767 | tsk->vtime_snap_whence = VTIME_SYS; | 755 | tsk->vtime_snap_whence = VTIME_SYS; |
768 | if (vtime_delta(tsk)) { | 756 | if (vtime_delta(tsk)) { |
769 | delta_cpu = get_vtime_delta(tsk); | 757 | delta_cpu = get_vtime_delta(tsk); |
770 | account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); | 758 | account_user_time(tsk, delta_cpu); |
771 | } | 759 | } |
772 | write_seqcount_end(&tsk->vtime_seqcount); | 760 | write_seqcount_end(&tsk->vtime_seqcount); |
773 | } | 761 | } |
@@ -863,29 +851,25 @@ cputime_t task_gtime(struct task_struct *t) | |||
863 | * add up the pending nohz execution time since the last | 851 | * add up the pending nohz execution time since the last |
864 | * cputime snapshot. | 852 | * cputime snapshot. |
865 | */ | 853 | */ |
866 | static void | 854 | void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) |
867 | fetch_task_cputime(struct task_struct *t, | ||
868 | cputime_t *u_dst, cputime_t *s_dst, | ||
869 | cputime_t *u_src, cputime_t *s_src, | ||
870 | cputime_t *udelta, cputime_t *sdelta) | ||
871 | { | 855 | { |
856 | cputime_t delta; | ||
872 | unsigned int seq; | 857 | unsigned int seq; |
873 | unsigned long long delta; | ||
874 | 858 | ||
875 | do { | 859 | if (!vtime_accounting_enabled()) { |
876 | *udelta = 0; | 860 | *utime = t->utime; |
877 | *sdelta = 0; | 861 | *stime = t->stime; |
862 | return; | ||
863 | } | ||
878 | 864 | ||
865 | do { | ||
879 | seq = read_seqcount_begin(&t->vtime_seqcount); | 866 | seq = read_seqcount_begin(&t->vtime_seqcount); |
880 | 867 | ||
881 | if (u_dst) | 868 | *utime = t->utime; |
882 | *u_dst = *u_src; | 869 | *stime = t->stime; |
883 | if (s_dst) | ||
884 | *s_dst = *s_src; | ||
885 | 870 | ||
886 | /* Task is sleeping, nothing to add */ | 871 | /* Task is sleeping, nothing to add */ |
887 | if (t->vtime_snap_whence == VTIME_INACTIVE || | 872 | if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) |
888 | is_idle_task(t)) | ||
889 | continue; | 873 | continue; |
890 | 874 | ||
891 | delta = vtime_delta(t); | 875 | delta = vtime_delta(t); |
@@ -894,54 +878,10 @@ fetch_task_cputime(struct task_struct *t, | |||
894 | * Task runs either in user or kernel space, add pending nohz time to | 878 | * Task runs either in user or kernel space, add pending nohz time to |
895 | * the right place. | 879 | * the right place. |
896 | */ | 880 | */ |
897 | if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) { | 881 | if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) |
898 | *udelta = delta; | 882 | *utime += delta; |
899 | } else { | 883 | else if (t->vtime_snap_whence == VTIME_SYS) |
900 | if (t->vtime_snap_whence == VTIME_SYS) | 884 | *stime += delta; |
901 | *sdelta = delta; | ||
902 | } | ||
903 | } while (read_seqcount_retry(&t->vtime_seqcount, seq)); | 885 | } while (read_seqcount_retry(&t->vtime_seqcount, seq)); |
904 | } | 886 | } |
905 | |||
906 | |||
907 | void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) | ||
908 | { | ||
909 | cputime_t udelta, sdelta; | ||
910 | |||
911 | if (!vtime_accounting_enabled()) { | ||
912 | if (utime) | ||
913 | *utime = t->utime; | ||
914 | if (stime) | ||
915 | *stime = t->stime; | ||
916 | return; | ||
917 | } | ||
918 | |||
919 | fetch_task_cputime(t, utime, stime, &t->utime, | ||
920 | &t->stime, &udelta, &sdelta); | ||
921 | if (utime) | ||
922 | *utime += udelta; | ||
923 | if (stime) | ||
924 | *stime += sdelta; | ||
925 | } | ||
926 | |||
927 | void task_cputime_scaled(struct task_struct *t, | ||
928 | cputime_t *utimescaled, cputime_t *stimescaled) | ||
929 | { | ||
930 | cputime_t udelta, sdelta; | ||
931 | |||
932 | if (!vtime_accounting_enabled()) { | ||
933 | if (utimescaled) | ||
934 | *utimescaled = t->utimescaled; | ||
935 | if (stimescaled) | ||
936 | *stimescaled = t->stimescaled; | ||
937 | return; | ||
938 | } | ||
939 | |||
940 | fetch_task_cputime(t, utimescaled, stimescaled, | ||
941 | &t->utimescaled, &t->stimescaled, &udelta, &sdelta); | ||
942 | if (utimescaled) | ||
943 | *utimescaled += cputime_to_scaled(udelta); | ||
944 | if (stimescaled) | ||
945 | *stimescaled += cputime_to_scaled(sdelta); | ||
946 | } | ||
947 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ | 887 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ |
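The rewritten task_cputime() reads the utime/stime pair directly under a seqcount retry loop instead of going through the removed fetch_task_cputime() helper. For reference, the generic read-side pattern it relies on looks like this sketch; my_seq, my_utime and my_stime are illustrative placeholders.

#include <linux/seqlock.h>
#include <linux/types.h>

static seqcount_t my_seq = SEQCNT_ZERO(my_seq);
static u64 my_utime, my_stime;

static void my_snapshot(u64 *utime, u64 *stime)
{
        unsigned int seq;

        do {
                seq = read_seqcount_begin(&my_seq);
                *utime = my_utime;              /* consistent pair ... */
                *stime = my_stime;              /* ... or we retry */
        } while (read_seqcount_retry(&my_seq, seq));
}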
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 37e2449186c4..70ef2b1901e4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -586,7 +586,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) | |||
586 | 586 | ||
587 | /* | 587 | /* |
588 | * The task might have changed its scheduling policy to something | 588 | * The task might have changed its scheduling policy to something |
589 | * different than SCHED_DEADLINE (through switched_fromd_dl()). | 589 | * different than SCHED_DEADLINE (through switched_from_dl()). |
590 | */ | 590 | */ |
591 | if (!dl_task(p)) { | 591 | if (!dl_task(p)) { |
592 | __dl_clear_params(p); | 592 | __dl_clear_params(p); |
@@ -1137,7 +1137,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo | |||
1137 | pull_dl_task(rq); | 1137 | pull_dl_task(rq); |
1138 | lockdep_repin_lock(&rq->lock, cookie); | 1138 | lockdep_repin_lock(&rq->lock, cookie); |
1139 | /* | 1139 | /* |
1140 | * pull_rt_task() can drop (and re-acquire) rq->lock; this | 1140 | * pull_dl_task() can drop (and re-acquire) rq->lock; this |
1141 | * means a stop task can slip in, in which case we need to | 1141 | * means a stop task can slip in, in which case we need to |
1142 | * re-start task selection. | 1142 | * re-start task selection. |
1143 | */ | 1143 | */ |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c242944f5cbd..6559d197e08a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -37,7 +37,6 @@ | |||
37 | 37 | ||
38 | /* | 38 | /* |
39 | * Targeted preemption latency for CPU-bound tasks: | 39 | * Targeted preemption latency for CPU-bound tasks: |
40 | * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) | ||
41 | * | 40 | * |
42 | * NOTE: this latency value is not the same as the concept of | 41 | * NOTE: this latency value is not the same as the concept of |
43 | * 'timeslice length' - timeslices in CFS are of variable length | 42 | * 'timeslice length' - timeslices in CFS are of variable length |
@@ -46,31 +45,35 @@ | |||
46 | * | 45 | * |
47 | * (to see the precise effective timeslice length of your workload, | 46 | * (to see the precise effective timeslice length of your workload, |
48 | * run vmstat and monitor the context-switches (cs) field) | 47 | * run vmstat and monitor the context-switches (cs) field) |
48 | * | ||
49 | * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) | ||
49 | */ | 50 | */ |
50 | unsigned int sysctl_sched_latency = 6000000ULL; | 51 | unsigned int sysctl_sched_latency = 6000000ULL; |
51 | unsigned int normalized_sysctl_sched_latency = 6000000ULL; | 52 | unsigned int normalized_sysctl_sched_latency = 6000000ULL; |
52 | 53 | ||
53 | /* | 54 | /* |
54 | * The initial- and re-scaling of tunables is configurable | 55 | * The initial- and re-scaling of tunables is configurable |
55 | * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) | ||
56 | * | 56 | * |
57 | * Options are: | 57 | * Options are: |
58 | * SCHED_TUNABLESCALING_NONE - unscaled, always *1 | 58 | * |
59 | * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) | 59 | * SCHED_TUNABLESCALING_NONE - unscaled, always *1 |
60 | * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus | 60 | * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) |
60 | SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus | 60 | SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) |
62 | * | ||
63 | * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))) | ||
61 | */ | 64 | */ |
62 | enum sched_tunable_scaling sysctl_sched_tunable_scaling | 65 | enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; |
63 | = SCHED_TUNABLESCALING_LOG; | ||
64 | 66 | ||
65 | /* | 67 | /* |
66 | * Minimal preemption granularity for CPU-bound tasks: | 68 | * Minimal preemption granularity for CPU-bound tasks: |
69 | * | ||
67 | * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) | 70 | * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) |
68 | */ | 71 | */ |
69 | unsigned int sysctl_sched_min_granularity = 750000ULL; | 72 | unsigned int sysctl_sched_min_granularity = 750000ULL; |
70 | unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; | 73 | unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; |
71 | 74 | ||
72 | /* | 75 | /* |
73 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity | 76 | * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity |
74 | */ | 77 | */ |
75 | static unsigned int sched_nr_latency = 8; | 78 | static unsigned int sched_nr_latency = 8; |
76 | 79 | ||
@@ -82,23 +85,27 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; | |||
82 | 85 | ||
83 | /* | 86 | /* |
84 | * SCHED_OTHER wake-up granularity. | 87 | * SCHED_OTHER wake-up granularity. |
85 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) | ||
86 | * | 88 | * |
87 | * This option delays the preemption effects of decoupled workloads | 89 | * This option delays the preemption effects of decoupled workloads |
88 | * and reduces their over-scheduling. Synchronous workloads will still | 90 | * and reduces their over-scheduling. Synchronous workloads will still |
89 | * have immediate wakeup/sleep latencies. | 91 | * have immediate wakeup/sleep latencies. |
92 | * | ||
93 | * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) | ||
90 | */ | 94 | */ |
91 | unsigned int sysctl_sched_wakeup_granularity = 1000000UL; | 95 | unsigned int sysctl_sched_wakeup_granularity = 1000000UL; |
92 | unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; | 96 | unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; |
93 | 97 | ||
94 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 98 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
95 | 99 | ||
100 | #ifdef CONFIG_SMP | ||
96 | /* | 101 | /* |
97 | * The exponential sliding window over which load is averaged for shares | 102 | * For asym packing, by default the lower numbered cpu has higher priority. |
98 | * distribution. | ||
99 | * (default: 10msec) | ||
100 | */ | 103 | */ |
101 | unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | 104 | int __weak arch_asym_cpu_priority(int cpu) |
105 | { | ||
106 | return -cpu; | ||
107 | } | ||
108 | #endif | ||
102 | 109 | ||
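The __weak default above simply prefers lower-numbered CPUs; an architecture can override it with a strong definition along these lines. The per-CPU priority array here is an assumption for illustration, not the actual x86 ITMT implementation from this series.

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, my_cpu_priority);

/* Strong definition: overrides the __weak fallback at link time. */
int arch_asym_cpu_priority(int cpu)
{
        return per_cpu(my_cpu_priority, cpu);   /* larger value = preferred core */
}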
103 | #ifdef CONFIG_CFS_BANDWIDTH | 110 | #ifdef CONFIG_CFS_BANDWIDTH |
104 | /* | 111 | /* |
@@ -109,16 +116,18 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; | |||
109 | * to consumption or the quota being specified to be smaller than the slice) | 116 | * to consumption or the quota being specified to be smaller than the slice) |
110 | * we will always only issue the remaining available time. | 117 | * we will always only issue the remaining available time. |
111 | * | 118 | * |
112 | * default: 5 msec, units: microseconds | 119 | * (default: 5 msec, units: microseconds) |
113 | */ | 120 | */ |
114 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; | 121 | unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; |
115 | #endif | 122 | #endif |
116 | 123 | ||
117 | /* | 124 | /* |
118 | * The margin used when comparing utilization with CPU capacity: | 125 | * The margin used when comparing utilization with CPU capacity: |
119 | * util * 1024 < capacity * margin | 126 | * util * margin < capacity * 1024 |
127 | * | ||
128 | * (default: ~20%) | ||
120 | */ | 129 | */ |
121 | unsigned int capacity_margin = 1280; /* ~20% */ | 130 | unsigned int capacity_margin = 1280; |
122 | 131 | ||
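To make the corrected comparison concrete: with the default margin of 1280, utilization only "fits" a CPU while it stays below roughly 80% of that CPU's capacity. A small stand-alone check in plain userspace C, with illustrative values:

#include <stdio.h>

int main(void)
{
        unsigned long capacity_margin = 1280;   /* ~20% headroom */
        unsigned long capacity = 1024;          /* SCHED_CAPACITY_SCALE */
        unsigned long util;

        for (util = 700; util <= 900; util += 100)
                printf("util=%lu fits=%d\n", util,
                       util * capacity_margin < capacity * 1024);

        /* Prints fits=1 for 700 and 800, fits=0 for 900 (cutoff is ~819). */
        return 0;
}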
123 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) | 132 | static inline void update_load_add(struct load_weight *lw, unsigned long inc) |
124 | { | 133 | { |
@@ -290,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) | |||
290 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) | 299 | static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) |
291 | { | 300 | { |
292 | if (!cfs_rq->on_list) { | 301 | if (!cfs_rq->on_list) { |
302 | struct rq *rq = rq_of(cfs_rq); | ||
303 | int cpu = cpu_of(rq); | ||
293 | /* | 304 | /* |
294 | * Ensure we either appear before our parent (if already | 305 | * Ensure we either appear before our parent (if already |
295 | * enqueued) or force our parent to appear after us when it is | 306 | * enqueued) or force our parent to appear after us when it is |
296 | * enqueued. The fact that we always enqueue bottom-up | 307 | * enqueued. The fact that we always enqueue bottom-up |
297 | * reduces this to two cases. | 308 | * reduces this to two cases and a special case for the root |
309 | * cfs_rq. Furthermore, it also means that we will always reset | ||
310 | * tmp_alone_branch either when the branch is connected | ||
311 | * to a tree or when we reach the beg of the tree | ||
298 | */ | 312 | */ |
299 | if (cfs_rq->tg->parent && | 313 | if (cfs_rq->tg->parent && |
300 | cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { | 314 | cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { |
301 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, | 315 | /* |
302 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | 316 | * If parent is already on the list, we add the child |
303 | } else { | 317 | * just before. Thanks to circular linked property of |
318 | * the list, this means to put the child at the tail | ||
319 | * of the list that starts by parent. | ||
320 | */ | ||
304 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, | 321 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, |
305 | &rq_of(cfs_rq)->leaf_cfs_rq_list); | 322 | &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); |
323 | /* | ||
324 | * The branch is now connected to its tree so we can | ||
325 | * reset tmp_alone_branch to the beginning of the | ||
326 | * list. | ||
327 | */ | ||
328 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; | ||
329 | } else if (!cfs_rq->tg->parent) { | ||
330 | /* | ||
331 | * A cfs_rq without a parent should be put | ||
332 | * at the tail of the list. | ||
333 | */ | ||
334 | list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
335 | &rq->leaf_cfs_rq_list); | ||
336 | /* | ||
337 | * We have reached the beginning of a tree so we can reset | ||
338 | * tmp_alone_branch to the beginning of the list. | ||
339 | */ | ||
340 | rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; | ||
341 | } else { | ||
342 | /* | ||
343 | * The parent has not already been added so we want to | ||
344 | * make sure that it will be put after us. | ||
345 | * tmp_alone_branch points to the beginning of the branch | ||
346 | * where we will add the parent. | ||
347 | */ | ||
348 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, | ||
349 | rq->tmp_alone_branch); | ||
350 | /* | ||
351 | * update tmp_alone_branch to point to the new beginning | ||
352 | * of the branch. | ||
353 | */ | ||
354 | rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; | ||
306 | } | 355 | } |
307 | 356 | ||
308 | cfs_rq->on_list = 1; | 357 | cfs_rq->on_list = 1; |
@@ -708,9 +757,7 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
708 | } | 757 | } |
709 | 758 | ||
710 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | 759 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); |
711 | static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); | 760 | static void attach_entity_cfs_rq(struct sched_entity *se); |
712 | static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); | ||
713 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); | ||
714 | 761 | ||
715 | /* | 762 | /* |
716 | * With new tasks being created, their initial util_avgs are extrapolated | 763 | * With new tasks being created, their initial util_avgs are extrapolated |
@@ -742,7 +789,6 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
742 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 789 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
743 | struct sched_avg *sa = &se->avg; | 790 | struct sched_avg *sa = &se->avg; |
744 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; | 791 | long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; |
745 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
746 | 792 | ||
747 | if (cap > 0) { | 793 | if (cap > 0) { |
748 | if (cfs_rq->avg.util_avg != 0) { | 794 | if (cfs_rq->avg.util_avg != 0) { |
@@ -770,14 +816,12 @@ void post_init_entity_util_avg(struct sched_entity *se) | |||
770 | * such that the next switched_to_fair() has the | 816 | * such that the next switched_to_fair() has the |
771 | * expected state. | 817 | * expected state. |
772 | */ | 818 | */ |
773 | se->avg.last_update_time = now; | 819 | se->avg.last_update_time = cfs_rq_clock_task(cfs_rq); |
774 | return; | 820 | return; |
775 | } | 821 | } |
776 | } | 822 | } |
777 | 823 | ||
778 | update_cfs_rq_load_avg(now, cfs_rq, false); | 824 | attach_entity_cfs_rq(se); |
779 | attach_entity_load_avg(cfs_rq, se); | ||
780 | update_tg_load_avg(cfs_rq, false); | ||
781 | } | 825 | } |
782 | 826 | ||
783 | #else /* !CONFIG_SMP */ | 827 | #else /* !CONFIG_SMP */ |
@@ -2890,6 +2934,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, | |||
2890 | return decayed; | 2934 | return decayed; |
2891 | } | 2935 | } |
2892 | 2936 | ||
2937 | /* | ||
2938 | * Signed add and clamp on underflow. | ||
2939 | * | ||
2940 | * Explicitly do a load-store to ensure the intermediate value never hits | ||
2941 | * memory. This allows lockless observations without ever seeing the negative | ||
2942 | * values. | ||
2943 | */ | ||
2944 | #define add_positive(_ptr, _val) do { \ | ||
2945 | typeof(_ptr) ptr = (_ptr); \ | ||
2946 | typeof(_val) val = (_val); \ | ||
2947 | typeof(*ptr) res, var = READ_ONCE(*ptr); \ | ||
2948 | \ | ||
2949 | res = var + val; \ | ||
2950 | \ | ||
2951 | if (val < 0 && res > var) \ | ||
2952 | res = 0; \ | ||
2953 | \ | ||
2954 | WRITE_ONCE(*ptr, res); \ | ||
2955 | } while (0) | ||
2956 | |||
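A quick stand-alone illustration of the clamp-on-underflow behaviour of add_positive() above. The READ_ONCE/WRITE_ONCE stand-ins below are simplified userspace approximations, not the kernel versions.

#include <stdio.h>

#define READ_ONCE(x)      (*(volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, v)  (*(volatile typeof(x) *)&(x) = (v))

#define add_positive(_ptr, _val) do {                   \
        typeof(_ptr) ptr = (_ptr);                      \
        typeof(_val) val = (_val);                      \
        typeof(*ptr) res, var = READ_ONCE(*ptr);        \
                                                        \
        res = var + val;                                \
                                                        \
        if (val < 0 && res > var)                       \
                res = 0;                                \
                                                        \
        WRITE_ONCE(*ptr, res);                          \
} while (0)

int main(void)
{
        unsigned long avg = 100;

        add_positive(&avg, -300L);      /* would underflow: clamped instead */
        printf("%lu\n", avg);           /* prints 0, not a wrapped huge value */

        add_positive(&avg, 42L);
        printf("%lu\n", avg);           /* prints 42 */
        return 0;
}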
2893 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2957 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2894 | /** | 2958 | /** |
2895 | * update_tg_load_avg - update the tg's load avg | 2959 | * update_tg_load_avg - update the tg's load avg |
@@ -2969,8 +3033,138 @@ void set_task_rq_fair(struct sched_entity *se, | |||
2969 | se->avg.last_update_time = n_last_update_time; | 3033 | se->avg.last_update_time = n_last_update_time; |
2970 | } | 3034 | } |
2971 | } | 3035 | } |
3036 | |||
3037 | /* Take into account change of utilization of a child task group */ | ||
3038 | static inline void | ||
3039 | update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
3040 | { | ||
3041 | struct cfs_rq *gcfs_rq = group_cfs_rq(se); | ||
3042 | long delta = gcfs_rq->avg.util_avg - se->avg.util_avg; | ||
3043 | |||
3044 | /* Nothing to update */ | ||
3045 | if (!delta) | ||
3046 | return; | ||
3047 | |||
3048 | /* Set new sched_entity's utilization */ | ||
3049 | se->avg.util_avg = gcfs_rq->avg.util_avg; | ||
3050 | se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; | ||
3051 | |||
3052 | /* Update parent cfs_rq utilization */ | ||
3053 | add_positive(&cfs_rq->avg.util_avg, delta); | ||
3054 | cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX; | ||
3055 | } | ||
3056 | |||
3057 | /* Take into account change of load of a child task group */ | ||
3058 | static inline void | ||
3059 | update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se) | ||
3060 | { | ||
3061 | struct cfs_rq *gcfs_rq = group_cfs_rq(se); | ||
3062 | long delta, load = gcfs_rq->avg.load_avg; | ||
3063 | |||
3064 | /* | ||
3065 | * If the load of group cfs_rq is null, the load of the | ||
3066 | * sched_entity will also be null so we can skip the formula | ||
3067 | */ | ||
3068 | if (load) { | ||
3069 | long tg_load; | ||
3070 | |||
3071 | /* Get tg's load and ensure tg_load > 0 */ | ||
3072 | tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1; | ||
3073 | |||
3074 | /* Ensure tg_load >= load and is updated with the current load */ | ||
3075 | tg_load -= gcfs_rq->tg_load_avg_contrib; | ||
3076 | tg_load += load; | ||
3077 | |||
3078 | /* | ||
3079 | * We need to compute a correction term in the case that the | ||
3080 | * task group is consuming more CPU than a task of equal | ||
3081 | * weight. A task with a weight equal to tg->shares will have | ||
3082 | * a load less than or equal to scale_load_down(tg->shares). | ||
3083 | * Similarly, the sched_entities that represent the task group | ||
3084 | * at the parent level can't have a load higher than | ||
3085 | * scale_load_down(tg->shares), and the sum of the sched_entities' | ||
3086 | * load must be <= scale_load_down(tg->shares). | ||
3087 | */ | ||
3088 | if (tg_load > scale_load_down(gcfs_rq->tg->shares)) { | ||
3089 | /* Scale gcfs_rq's load into tg's shares */ | ||
3090 | load *= scale_load_down(gcfs_rq->tg->shares); | ||
3091 | load /= tg_load; | ||
3092 | } | ||
3093 | } | ||
3094 | |||
3095 | delta = load - se->avg.load_avg; | ||
3096 | |||
3097 | /* Nothing to update */ | ||
3098 | if (!delta) | ||
3099 | return; | ||
3100 | |||
3101 | /* Set new sched_entity's load */ | ||
3102 | se->avg.load_avg = load; | ||
3103 | se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX; | ||
3104 | |||
3105 | /* Update parent cfs_rq load */ | ||
3106 | add_positive(&cfs_rq->avg.load_avg, delta); | ||
3107 | cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX; | ||
3108 | |||
3109 | /* | ||
3110 | * If the sched_entity is already enqueued, we also have to update the | ||
3111 | * runnable load avg. | ||
3112 | */ | ||
3113 | if (se->on_rq) { | ||
3114 | /* Update parent cfs_rq runnable_load_avg */ | ||
3115 | add_positive(&cfs_rq->runnable_load_avg, delta); | ||
3116 | cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX; | ||
3117 | } | ||
3118 | } | ||
3119 | |||
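A rough numeric walk-through of the correction term in update_tg_cfs_load() above, using made-up values: a group runqueue contributing load 2048 out of a tg-wide load of 4096, with tg->shares scaling down to 1024, propagates 2048 * 1024 / 4096 = 512 to the parent level, so the parent entity never exceeds scale_load_down(tg->shares).

#include <stdio.h>

int main(void)
{
        long shares  = 1024;    /* scale_load_down(tg->shares), illustrative */
        long tg_load = 4096;    /* whole-group load, already includes 'load' */
        long load    = 2048;    /* this cpu's gcfs_rq load contribution */

        if (tg_load > shares) {
                load *= shares;
                load /= tg_load;
        }

        printf("propagated load = %ld\n", load);        /* 512 */
        return 0;
}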
3120 | static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) | ||
3121 | { | ||
3122 | cfs_rq->propagate_avg = 1; | ||
3123 | } | ||
3124 | |||
3125 | static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se) | ||
3126 | { | ||
3127 | struct cfs_rq *cfs_rq = group_cfs_rq(se); | ||
3128 | |||
3129 | if (!cfs_rq->propagate_avg) | ||
3130 | return 0; | ||
3131 | |||
3132 | cfs_rq->propagate_avg = 0; | ||
3133 | return 1; | ||
3134 | } | ||
3135 | |||
3136 | /* Update task and its cfs_rq load average */ | ||
3137 | static inline int propagate_entity_load_avg(struct sched_entity *se) | ||
3138 | { | ||
3139 | struct cfs_rq *cfs_rq; | ||
3140 | |||
3141 | if (entity_is_task(se)) | ||
3142 | return 0; | ||
3143 | |||
3144 | if (!test_and_clear_tg_cfs_propagate(se)) | ||
3145 | return 0; | ||
3146 | |||
3147 | cfs_rq = cfs_rq_of(se); | ||
3148 | |||
3149 | set_tg_cfs_propagate(cfs_rq); | ||
3150 | |||
3151 | update_tg_cfs_util(cfs_rq, se); | ||
3152 | update_tg_cfs_load(cfs_rq, se); | ||
3153 | |||
3154 | return 1; | ||
3155 | } | ||
3156 | |||
2972 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 3157 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
3158 | |||
2973 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} | 3159 | static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} |
3160 | |||
3161 | static inline int propagate_entity_load_avg(struct sched_entity *se) | ||
3162 | { | ||
3163 | return 0; | ||
3164 | } | ||
3165 | |||
3166 | static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {} | ||
3167 | |||
2974 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 3168 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
2975 | 3169 | ||
2976 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) | 3170 | static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) |
@@ -3041,6 +3235,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
3041 | sub_positive(&sa->load_avg, r); | 3235 | sub_positive(&sa->load_avg, r); |
3042 | sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); | 3236 | sub_positive(&sa->load_sum, r * LOAD_AVG_MAX); |
3043 | removed_load = 1; | 3237 | removed_load = 1; |
3238 | set_tg_cfs_propagate(cfs_rq); | ||
3044 | } | 3239 | } |
3045 | 3240 | ||
3046 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { | 3241 | if (atomic_long_read(&cfs_rq->removed_util_avg)) { |
@@ -3048,6 +3243,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
3048 | sub_positive(&sa->util_avg, r); | 3243 | sub_positive(&sa->util_avg, r); |
3049 | sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); | 3244 | sub_positive(&sa->util_sum, r * LOAD_AVG_MAX); |
3050 | removed_util = 1; | 3245 | removed_util = 1; |
3246 | set_tg_cfs_propagate(cfs_rq); | ||
3051 | } | 3247 | } |
3052 | 3248 | ||
3053 | decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, | 3249 | decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, |
@@ -3064,23 +3260,35 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
3064 | return decayed || removed_load; | 3260 | return decayed || removed_load; |
3065 | } | 3261 | } |
3066 | 3262 | ||
3263 | /* | ||
3264 | * Optional action to be done while updating the load average | ||
3265 | */ | ||
3266 | #define UPDATE_TG 0x1 | ||
3267 | #define SKIP_AGE_LOAD 0x2 | ||
3268 | |||
3067 | /* Update task and its cfs_rq load average */ | 3269 | /* Update task and its cfs_rq load average */ |
3068 | static inline void update_load_avg(struct sched_entity *se, int update_tg) | 3270 | static inline void update_load_avg(struct sched_entity *se, int flags) |
3069 | { | 3271 | { |
3070 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3272 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
3071 | u64 now = cfs_rq_clock_task(cfs_rq); | 3273 | u64 now = cfs_rq_clock_task(cfs_rq); |
3072 | struct rq *rq = rq_of(cfs_rq); | 3274 | struct rq *rq = rq_of(cfs_rq); |
3073 | int cpu = cpu_of(rq); | 3275 | int cpu = cpu_of(rq); |
3276 | int decayed; | ||
3074 | 3277 | ||
3075 | /* | 3278 | /* |
3076 | * Track task load average for carrying it to new CPU after migrated, and | 3279 | * Track task load average for carrying it to new CPU after migrated, and |
3077 | * track group sched_entity load average for task_h_load calc in migration | 3280 | * track group sched_entity load average for task_h_load calc in migration |
3078 | */ | 3281 | */ |
3079 | __update_load_avg(now, cpu, &se->avg, | 3282 | if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { |
3283 | __update_load_avg(now, cpu, &se->avg, | ||
3080 | se->on_rq * scale_load_down(se->load.weight), | 3284 | se->on_rq * scale_load_down(se->load.weight), |
3081 | cfs_rq->curr == se, NULL); | 3285 | cfs_rq->curr == se, NULL); |
3286 | } | ||
3082 | 3287 | ||
3083 | if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg) | 3288 | decayed = update_cfs_rq_load_avg(now, cfs_rq, true); |
3289 | decayed |= propagate_entity_load_avg(se); | ||
3290 | |||
3291 | if (decayed && (flags & UPDATE_TG)) | ||
3084 | update_tg_load_avg(cfs_rq, 0); | 3292 | update_tg_load_avg(cfs_rq, 0); |
3085 | } | 3293 | } |
3086 | 3294 | ||
@@ -3094,31 +3302,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) | |||
3094 | */ | 3302 | */ |
3095 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3303 | static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
3096 | { | 3304 | { |
3097 | if (!sched_feat(ATTACH_AGE_LOAD)) | ||
3098 | goto skip_aging; | ||
3099 | |||
3100 | /* | ||
3101 | * If we got migrated (either between CPUs or between cgroups) we'll | ||
3102 | * have aged the average right before clearing @last_update_time. | ||
3103 | * | ||
3104 | * Or we're fresh through post_init_entity_util_avg(). | ||
3105 | */ | ||
3106 | if (se->avg.last_update_time) { | ||
3107 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | ||
3108 | &se->avg, 0, 0, NULL); | ||
3109 | |||
3110 | /* | ||
3111 | * XXX: we could have just aged the entire load away if we've been | ||
3112 | * absent from the fair class for too long. | ||
3113 | */ | ||
3114 | } | ||
3115 | |||
3116 | skip_aging: | ||
3117 | se->avg.last_update_time = cfs_rq->avg.last_update_time; | 3305 | se->avg.last_update_time = cfs_rq->avg.last_update_time; |
3118 | cfs_rq->avg.load_avg += se->avg.load_avg; | 3306 | cfs_rq->avg.load_avg += se->avg.load_avg; |
3119 | cfs_rq->avg.load_sum += se->avg.load_sum; | 3307 | cfs_rq->avg.load_sum += se->avg.load_sum; |
3120 | cfs_rq->avg.util_avg += se->avg.util_avg; | 3308 | cfs_rq->avg.util_avg += se->avg.util_avg; |
3121 | cfs_rq->avg.util_sum += se->avg.util_sum; | 3309 | cfs_rq->avg.util_sum += se->avg.util_sum; |
3310 | set_tg_cfs_propagate(cfs_rq); | ||
3122 | 3311 | ||
3123 | cfs_rq_util_change(cfs_rq); | 3312 | cfs_rq_util_change(cfs_rq); |
3124 | } | 3313 | } |
@@ -3133,14 +3322,12 @@ skip_aging: | |||
3133 | */ | 3322 | */ |
3134 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3323 | static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
3135 | { | 3324 | { |
3136 | __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), | ||
3137 | &se->avg, se->on_rq * scale_load_down(se->load.weight), | ||
3138 | cfs_rq->curr == se, NULL); | ||
3139 | 3325 | ||
3140 | sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); | 3326 | sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); |
3141 | sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); | 3327 | sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum); |
3142 | sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); | 3328 | sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); |
3143 | sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); | 3329 | sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); |
3330 | set_tg_cfs_propagate(cfs_rq); | ||
3144 | 3331 | ||
3145 | cfs_rq_util_change(cfs_rq); | 3332 | cfs_rq_util_change(cfs_rq); |
3146 | } | 3333 | } |
@@ -3150,34 +3337,20 @@ static inline void | |||
3150 | enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3337 | enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
3151 | { | 3338 | { |
3152 | struct sched_avg *sa = &se->avg; | 3339 | struct sched_avg *sa = &se->avg; |
3153 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
3154 | int migrated, decayed; | ||
3155 | |||
3156 | migrated = !sa->last_update_time; | ||
3157 | if (!migrated) { | ||
3158 | __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, | ||
3159 | se->on_rq * scale_load_down(se->load.weight), | ||
3160 | cfs_rq->curr == se, NULL); | ||
3161 | } | ||
3162 | |||
3163 | decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated); | ||
3164 | 3340 | ||
3165 | cfs_rq->runnable_load_avg += sa->load_avg; | 3341 | cfs_rq->runnable_load_avg += sa->load_avg; |
3166 | cfs_rq->runnable_load_sum += sa->load_sum; | 3342 | cfs_rq->runnable_load_sum += sa->load_sum; |
3167 | 3343 | ||
3168 | if (migrated) | 3344 | if (!sa->last_update_time) { |
3169 | attach_entity_load_avg(cfs_rq, se); | 3345 | attach_entity_load_avg(cfs_rq, se); |
3170 | |||
3171 | if (decayed || migrated) | ||
3172 | update_tg_load_avg(cfs_rq, 0); | 3346 | update_tg_load_avg(cfs_rq, 0); |
3347 | } | ||
3173 | } | 3348 | } |
3174 | 3349 | ||
3175 | /* Remove the runnable load generated by se from cfs_rq's runnable load average */ | 3350 | /* Remove the runnable load generated by se from cfs_rq's runnable load average */ |
3176 | static inline void | 3351 | static inline void |
3177 | dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) | 3352 | dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) |
3178 | { | 3353 | { |
3179 | update_load_avg(se, 1); | ||
3180 | |||
3181 | cfs_rq->runnable_load_avg = | 3354 | cfs_rq->runnable_load_avg = |
3182 | max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); | 3355 | max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0); |
3183 | cfs_rq->runnable_load_sum = | 3356 | cfs_rq->runnable_load_sum = |
@@ -3206,13 +3379,25 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) | |||
3206 | #endif | 3379 | #endif |
3207 | 3380 | ||
3208 | /* | 3381 | /* |
3382 | * Synchronize entity load avg of dequeued entity without locking | ||
3383 | * the previous rq. | ||
3384 | */ | ||
3385 | void sync_entity_load_avg(struct sched_entity *se) | ||
3386 | { | ||
3387 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
3388 | u64 last_update_time; | ||
3389 | |||
3390 | last_update_time = cfs_rq_last_update_time(cfs_rq); | ||
3391 | __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); | ||
3392 | } | ||
3393 | |||
3394 | /* | ||
3209 | * Task first catches up with cfs_rq, and then subtract | 3395 | * Task first catches up with cfs_rq, and then subtract |
3210 | * itself from the cfs_rq (task must be off the queue now). | 3396 | * itself from the cfs_rq (task must be off the queue now). |
3211 | */ | 3397 | */ |
3212 | void remove_entity_load_avg(struct sched_entity *se) | 3398 | void remove_entity_load_avg(struct sched_entity *se) |
3213 | { | 3399 | { |
3214 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 3400 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
3215 | u64 last_update_time; | ||
3216 | 3401 | ||
3217 | /* | 3402 | /* |
3218 | * tasks cannot exit without having gone through wake_up_new_task() -> | 3403 | * tasks cannot exit without having gone through wake_up_new_task() -> |
@@ -3224,9 +3409,7 @@ void remove_entity_load_avg(struct sched_entity *se) | |||
3224 | * calls this. | 3409 | * calls this. |
3225 | */ | 3410 | */ |
3226 | 3411 | ||
3227 | last_update_time = cfs_rq_last_update_time(cfs_rq); | 3412 | sync_entity_load_avg(se); |
3228 | |||
3229 | __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); | ||
3230 | atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); | 3413 | atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); |
3231 | atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); | 3414 | atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); |
3232 | } | 3415 | } |
@@ -3251,7 +3434,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) | |||
3251 | return 0; | 3434 | return 0; |
3252 | } | 3435 | } |
3253 | 3436 | ||
3254 | static inline void update_load_avg(struct sched_entity *se, int not_used) | 3437 | #define UPDATE_TG 0x0 |
3438 | #define SKIP_AGE_LOAD 0x0 | ||
3439 | |||
3440 | static inline void update_load_avg(struct sched_entity *se, int not_used1) | ||
3255 | { | 3441 | { |
3256 | cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); | 3442 | cpufreq_update_util(rq_of(cfs_rq_of(se)), 0); |
3257 | } | 3443 | } |
@@ -3396,6 +3582,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3396 | if (renorm && !curr) | 3582 | if (renorm && !curr) |
3397 | se->vruntime += cfs_rq->min_vruntime; | 3583 | se->vruntime += cfs_rq->min_vruntime; |
3398 | 3584 | ||
3585 | update_load_avg(se, UPDATE_TG); | ||
3399 | enqueue_entity_load_avg(cfs_rq, se); | 3586 | enqueue_entity_load_avg(cfs_rq, se); |
3400 | account_entity_enqueue(cfs_rq, se); | 3587 | account_entity_enqueue(cfs_rq, se); |
3401 | update_cfs_shares(cfs_rq); | 3588 | update_cfs_shares(cfs_rq); |
@@ -3470,6 +3657,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
3470 | * Update run-time statistics of the 'current'. | 3657 | * Update run-time statistics of the 'current'. |
3471 | */ | 3658 | */ |
3472 | update_curr(cfs_rq); | 3659 | update_curr(cfs_rq); |
3660 | update_load_avg(se, UPDATE_TG); | ||
3473 | dequeue_entity_load_avg(cfs_rq, se); | 3661 | dequeue_entity_load_avg(cfs_rq, se); |
3474 | 3662 | ||
3475 | update_stats_dequeue(cfs_rq, se, flags); | 3663 | update_stats_dequeue(cfs_rq, se, flags); |
@@ -3557,7 +3745,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
3557 | */ | 3745 | */ |
3558 | update_stats_wait_end(cfs_rq, se); | 3746 | update_stats_wait_end(cfs_rq, se); |
3559 | __dequeue_entity(cfs_rq, se); | 3747 | __dequeue_entity(cfs_rq, se); |
3560 | update_load_avg(se, 1); | 3748 | update_load_avg(se, UPDATE_TG); |
3561 | } | 3749 | } |
3562 | 3750 | ||
3563 | update_stats_curr_start(cfs_rq, se); | 3751 | update_stats_curr_start(cfs_rq, se); |
@@ -3675,7 +3863,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | |||
3675 | /* | 3863 | /* |
3676 | * Ensure that runnable average is periodically updated. | 3864 | * Ensure that runnable average is periodically updated. |
3677 | */ | 3865 | */ |
3678 | update_load_avg(curr, 1); | 3866 | update_load_avg(curr, UPDATE_TG); |
3679 | update_cfs_shares(cfs_rq); | 3867 | update_cfs_shares(cfs_rq); |
3680 | 3868 | ||
3681 | #ifdef CONFIG_SCHED_HRTICK | 3869 | #ifdef CONFIG_SCHED_HRTICK |
@@ -4572,7 +4760,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4572 | if (cfs_rq_throttled(cfs_rq)) | 4760 | if (cfs_rq_throttled(cfs_rq)) |
4573 | break; | 4761 | break; |
4574 | 4762 | ||
4575 | update_load_avg(se, 1); | 4763 | update_load_avg(se, UPDATE_TG); |
4576 | update_cfs_shares(cfs_rq); | 4764 | update_cfs_shares(cfs_rq); |
4577 | } | 4765 | } |
4578 | 4766 | ||
@@ -4631,7 +4819,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
4631 | if (cfs_rq_throttled(cfs_rq)) | 4819 | if (cfs_rq_throttled(cfs_rq)) |
4632 | break; | 4820 | break; |
4633 | 4821 | ||
4634 | update_load_avg(se, 1); | 4822 | update_load_avg(se, UPDATE_TG); |
4635 | update_cfs_shares(cfs_rq); | 4823 | update_cfs_shares(cfs_rq); |
4636 | } | 4824 | } |
4637 | 4825 | ||
@@ -5199,6 +5387,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, | |||
5199 | return 1; | 5387 | return 1; |
5200 | } | 5388 | } |
5201 | 5389 | ||
5390 | static inline int task_util(struct task_struct *p); | ||
5391 | static int cpu_util_wake(int cpu, struct task_struct *p); | ||
5392 | |||
5393 | static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) | ||
5394 | { | ||
5395 | return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); | ||
5396 | } | ||
5397 | |||
5202 | /* | 5398 | /* |
5203 | * find_idlest_group finds and returns the least busy CPU group within the | 5399 | * find_idlest_group finds and returns the least busy CPU group within the |
5204 | * domain. | 5400 | * domain. |
@@ -5208,15 +5404,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5208 | int this_cpu, int sd_flag) | 5404 | int this_cpu, int sd_flag) |
5209 | { | 5405 | { |
5210 | struct sched_group *idlest = NULL, *group = sd->groups; | 5406 | struct sched_group *idlest = NULL, *group = sd->groups; |
5211 | unsigned long min_load = ULONG_MAX, this_load = 0; | 5407 | struct sched_group *most_spare_sg = NULL; |
5408 | unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0; | ||
5409 | unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0; | ||
5410 | unsigned long most_spare = 0, this_spare = 0; | ||
5212 | int load_idx = sd->forkexec_idx; | 5411 | int load_idx = sd->forkexec_idx; |
5213 | int imbalance = 100 + (sd->imbalance_pct-100)/2; | 5412 | int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; |
5413 | unsigned long imbalance = scale_load_down(NICE_0_LOAD) * | ||
5414 | (sd->imbalance_pct-100) / 100; | ||
5214 | 5415 | ||
5215 | if (sd_flag & SD_BALANCE_WAKE) | 5416 | if (sd_flag & SD_BALANCE_WAKE) |
5216 | load_idx = sd->wake_idx; | 5417 | load_idx = sd->wake_idx; |
5217 | 5418 | ||
5218 | do { | 5419 | do { |
5219 | unsigned long load, avg_load; | 5420 | unsigned long load, avg_load, runnable_load; |
5421 | unsigned long spare_cap, max_spare_cap; | ||
5220 | int local_group; | 5422 | int local_group; |
5221 | int i; | 5423 | int i; |
5222 | 5424 | ||
@@ -5228,8 +5430,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5228 | local_group = cpumask_test_cpu(this_cpu, | 5430 | local_group = cpumask_test_cpu(this_cpu, |
5229 | sched_group_cpus(group)); | 5431 | sched_group_cpus(group)); |
5230 | 5432 | ||
5231 | /* Tally up the load of all CPUs in the group */ | 5433 | /* |
5434 | * Tally up the load of all CPUs in the group and find | ||
5435 | * the group containing the CPU with most spare capacity. | ||
5436 | */ | ||
5232 | avg_load = 0; | 5437 | avg_load = 0; |
5438 | runnable_load = 0; | ||
5439 | max_spare_cap = 0; | ||
5233 | 5440 | ||
5234 | for_each_cpu(i, sched_group_cpus(group)) { | 5441 | for_each_cpu(i, sched_group_cpus(group)) { |
5235 | /* Bias balancing toward cpus of our domain */ | 5442 | /* Bias balancing toward cpus of our domain */ |
@@ -5238,22 +5445,84 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5238 | else | 5445 | else |
5239 | load = target_load(i, load_idx); | 5446 | load = target_load(i, load_idx); |
5240 | 5447 | ||
5241 | avg_load += load; | 5448 | runnable_load += load; |
5449 | |||
5450 | avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); | ||
5451 | |||
5452 | spare_cap = capacity_spare_wake(i, p); | ||
5453 | |||
5454 | if (spare_cap > max_spare_cap) | ||
5455 | max_spare_cap = spare_cap; | ||
5242 | } | 5456 | } |
5243 | 5457 | ||
5244 | /* Adjust by relative CPU capacity of the group */ | 5458 | /* Adjust by relative CPU capacity of the group */ |
5245 | avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; | 5459 | avg_load = (avg_load * SCHED_CAPACITY_SCALE) / |
5460 | group->sgc->capacity; | ||
5461 | runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / | ||
5462 | group->sgc->capacity; | ||
5246 | 5463 | ||
5247 | if (local_group) { | 5464 | if (local_group) { |
5248 | this_load = avg_load; | 5465 | this_runnable_load = runnable_load; |
5249 | } else if (avg_load < min_load) { | 5466 | this_avg_load = avg_load; |
5250 | min_load = avg_load; | 5467 | this_spare = max_spare_cap; |
5251 | idlest = group; | 5468 | } else { |
5469 | if (min_runnable_load > (runnable_load + imbalance)) { | ||
5470 | /* | ||
5471 | * The runnable load is significantly smaller | ||
5472 | * so we can pick this new cpu | ||
5473 | */ | ||
5474 | min_runnable_load = runnable_load; | ||
5475 | min_avg_load = avg_load; | ||
5476 | idlest = group; | ||
5477 | } else if ((runnable_load < (min_runnable_load + imbalance)) && | ||
5478 | (100*min_avg_load > imbalance_scale*avg_load)) { | ||
5479 | /* | ||
5480 | * The runnable loads are close so take the | ||
5481 | * blocked load into account through avg_load. | ||
5482 | */ | ||
5483 | min_avg_load = avg_load; | ||
5484 | idlest = group; | ||
5485 | } | ||
5486 | |||
5487 | if (most_spare < max_spare_cap) { | ||
5488 | most_spare = max_spare_cap; | ||
5489 | most_spare_sg = group; | ||
5490 | } | ||
5252 | } | 5491 | } |
5253 | } while (group = group->next, group != sd->groups); | 5492 | } while (group = group->next, group != sd->groups); |
5254 | 5493 | ||
5255 | if (!idlest || 100*this_load < imbalance*min_load) | 5494 | /* |
5495 | * The cross-over point between using spare capacity or least load | ||
5496 | * is too conservative for high utilization tasks on partially | ||
5497 | * utilized systems if we require spare_capacity > task_util(p), | ||
5498 | * so we allow for some task stuffing by using | ||
5499 | * spare_capacity > task_util(p)/2. | ||
5500 | * | ||
5501 | * Spare capacity can't be used for fork because the utilization has | ||
5502 | * not been set yet, we must first select a rq to compute the initial | ||
5503 | * utilization. | ||
5504 | */ | ||
5505 | if (sd_flag & SD_BALANCE_FORK) | ||
5506 | goto skip_spare; | ||
5507 | |||
5508 | if (this_spare > task_util(p) / 2 && | ||
5509 | imbalance_scale*this_spare > 100*most_spare) | ||
5510 | return NULL; | ||
5511 | |||
5512 | if (most_spare > task_util(p) / 2) | ||
5513 | return most_spare_sg; | ||
5514 | |||
5515 | skip_spare: | ||
5516 | if (!idlest) | ||
5517 | return NULL; | ||
5518 | |||
5519 | if (min_runnable_load > (this_runnable_load + imbalance)) | ||
5256 | return NULL; | 5520 | return NULL; |
5521 | |||
5522 | if ((this_runnable_load < (min_runnable_load + imbalance)) && | ||
5523 | (100*this_avg_load < imbalance_scale*min_avg_load)) | ||
5524 | return NULL; | ||
5525 | |||
5257 | return idlest; | 5526 | return idlest; |
5258 | } | 5527 | } |
5259 | 5528 | ||
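The rewritten find_idlest_group() above compares candidate groups on two signals: runnable load, which must win by an absolute margin derived from NICE_0_LOAD and sd->imbalance_pct, and avg_load (runnable plus blocked load), which breaks near-ties using the relative imbalance_scale margin. A group with enough spare capacity (more than half the task's utilization) can short-circuit both tests, except at fork time when the task has no utilization yet. What follows is a minimal standalone C sketch of just the final local-versus-idlest decision, not the kernel code itself; the struct, constants and example numbers are illustrative assumptions.

#include <stdio.h>

/* Illustrative stand-ins for the scheduler's scaling constants. */
#define NICE_0_LOAD	1024UL
#define IMBALANCE_PCT	117UL		/* a typical sd->imbalance_pct */

struct group_stats {
	unsigned long runnable_load;	/* scaled by group capacity */
	unsigned long avg_load;		/* includes blocked load    */
};

/*
 * Mirror of the two checks at the end of find_idlest_group(): keep the
 * local group unless the idlest remote group wins on runnable load by
 * the absolute margin, or wins on avg_load when the runnable loads are
 * within that margin.
 */
static int keep_local_group(const struct group_stats *local,
			    const struct group_stats *idlest)
{
	unsigned long imbalance = NICE_0_LOAD * (IMBALANCE_PCT - 100) / 100;
	unsigned long imbalance_scale = 100 + (IMBALANCE_PCT - 100) / 2;

	/* Idlest is not clearly better on runnable load: stay local. */
	if (idlest->runnable_load > local->runnable_load + imbalance)
		return 1;

	/* Runnable loads are close and local wins once blocked load counts. */
	if (local->runnable_load < idlest->runnable_load + imbalance &&
	    100 * local->avg_load < imbalance_scale * idlest->avg_load)
		return 1;

	return 0;
}

int main(void)
{
	struct group_stats local  = { .runnable_load = 900, .avg_load = 950 };
	struct group_stats idlest = { .runnable_load = 600, .avg_load = 700 };

	printf("pick the %s group\n",
	       keep_local_group(&local, &idlest) ? "local" : "idlest");
	return 0;
}

With these example numbers the remote group wins the runnable-load test outright (900 - 600 exceeds the 174-unit margin), so the sketch prints "pick the idlest group".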
@@ -5590,6 +5859,24 @@ static inline int task_util(struct task_struct *p) | |||
5590 | } | 5859 | } |
5591 | 5860 | ||
5592 | /* | 5861 | /* |
5862 | * cpu_util_wake: Compute cpu utilization with any contributions from | ||
5863 | * the waking task p removed. | ||
5864 | */ | ||
5865 | static int cpu_util_wake(int cpu, struct task_struct *p) | ||
5866 | { | ||
5867 | unsigned long util, capacity; | ||
5868 | |||
5869 | /* Task has no contribution or is new */ | ||
5870 | if (cpu != task_cpu(p) || !p->se.avg.last_update_time) | ||
5871 | return cpu_util(cpu); | ||
5872 | |||
5873 | capacity = capacity_orig_of(cpu); | ||
5874 | util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0); | ||
5875 | |||
5876 | return (util >= capacity) ? capacity : util; | ||
5877 | } | ||
5878 | |||
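cpu_util_wake() and the earlier capacity_spare_wake() exist so that, when ranking wake-up candidates by spare capacity, the waking task's own utilization is not counted against the CPU it is leaving. Below is a standalone sketch of that arithmetic with plain integers standing in for the rq/PELT structures; the data layout is invented for illustration, and only the subtract-and-clamp logic mirrors the patch.

#include <stdio.h>

/* Invented, flattened stand-ins for rq->cfs.avg and capacity_orig_of(). */
struct cpu_state {
	long util_avg;		/* current PELT utilization of the CPU    */
	long capacity_orig;	/* original (maximum) capacity of the CPU */
};

/* Utilization of the CPU with the waking task's contribution removed. */
static long cpu_util_wake_sketch(const struct cpu_state *cpu,
				 long task_util, int task_was_here)
{
	long util;

	if (!task_was_here)	/* task contributes nothing on this CPU */
		return cpu->util_avg < cpu->capacity_orig ?
		       cpu->util_avg : cpu->capacity_orig;

	util = cpu->util_avg - task_util;
	if (util < 0)
		util = 0;
	return util < cpu->capacity_orig ? util : cpu->capacity_orig;
}

/* Spare capacity left once the waking task is discounted. */
static long capacity_spare_wake_sketch(const struct cpu_state *cpu,
				       long task_util, int task_was_here)
{
	return cpu->capacity_orig -
	       cpu_util_wake_sketch(cpu, task_util, task_was_here);
}

int main(void)
{
	struct cpu_state prev = { .util_avg = 700, .capacity_orig = 1024 };

	/* Discounting a 300-unit task leaves 1024 - (700 - 300) = 624 spare. */
	printf("spare = %ld\n", capacity_spare_wake_sketch(&prev, 300, 1));
	return 0;
}

Without the discount, a previously busy prev_cpu would look full even though most of its utilization is the very task that is about to run again.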
5879 | /* | ||
5593 | * Disable WAKE_AFFINE in the case where task @p doesn't fit in the | 5880 | * Disable WAKE_AFFINE in the case where task @p doesn't fit in the |
5594 | * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. | 5881 | * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. |
5595 | * | 5882 | * |
@@ -5607,6 +5894,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) | |||
5607 | if (max_cap - min_cap < max_cap >> 3) | 5894 | if (max_cap - min_cap < max_cap >> 3) |
5608 | return 0; | 5895 | return 0; |
5609 | 5896 | ||
5897 | /* Bring task utilization in sync with prev_cpu */ | ||
5898 | sync_entity_load_avg(&p->se); | ||
5899 | |||
5610 | return min_cap * 1024 < task_util(p) * capacity_margin; | 5900 | return min_cap * 1024 < task_util(p) * capacity_margin; |
5611 | } | 5901 | } |
5612 | 5902 | ||
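The wake_cap() change calls sync_entity_load_avg() so that task_util(p) is reasonably fresh before the fitness test; the test itself is a fixed-point comparison against capacity_margin. A tiny sketch of that comparison, assuming the margin value of 1280 (roughly 20% headroom) that fair.c uses in this era:

#include <stdio.h>

#define CAPACITY_MARGIN 1280UL		/* ~20% headroom, as in fair.c */

/* Non-zero when a task of task_util does not fit on a CPU of min_cap. */
static int task_too_big(unsigned long task_util, unsigned long min_cap)
{
	return min_cap * 1024 < task_util * CAPACITY_MARGIN;
}

int main(void)
{
	/* A 500-unit task on a 512-capacity CPU: 524288 < 640000, so it
	 * does not fit and wake_cap() would disable WAKE_AFFINE. */
	printf("too big: %d\n", task_too_big(500, 512));
	return 0;
}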
@@ -6641,6 +6931,10 @@ static void update_blocked_averages(int cpu) | |||
6641 | 6931 | ||
6642 | if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) | 6932 | if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) |
6643 | update_tg_load_avg(cfs_rq, 0); | 6933 | update_tg_load_avg(cfs_rq, 0); |
6934 | |||
6935 | /* Propagate pending load changes to the parent */ | ||
6936 | if (cfs_rq->tg->se[cpu]) | ||
6937 | update_load_avg(cfs_rq->tg->se[cpu], 0); | ||
6644 | } | 6938 | } |
6645 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6939 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6646 | } | 6940 | } |
@@ -6845,13 +7139,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) | |||
6845 | 7139 | ||
6846 | cpu_rq(cpu)->cpu_capacity = capacity; | 7140 | cpu_rq(cpu)->cpu_capacity = capacity; |
6847 | sdg->sgc->capacity = capacity; | 7141 | sdg->sgc->capacity = capacity; |
7142 | sdg->sgc->min_capacity = capacity; | ||
6848 | } | 7143 | } |
6849 | 7144 | ||
6850 | void update_group_capacity(struct sched_domain *sd, int cpu) | 7145 | void update_group_capacity(struct sched_domain *sd, int cpu) |
6851 | { | 7146 | { |
6852 | struct sched_domain *child = sd->child; | 7147 | struct sched_domain *child = sd->child; |
6853 | struct sched_group *group, *sdg = sd->groups; | 7148 | struct sched_group *group, *sdg = sd->groups; |
6854 | unsigned long capacity; | 7149 | unsigned long capacity, min_capacity; |
6855 | unsigned long interval; | 7150 | unsigned long interval; |
6856 | 7151 | ||
6857 | interval = msecs_to_jiffies(sd->balance_interval); | 7152 | interval = msecs_to_jiffies(sd->balance_interval); |
@@ -6864,6 +7159,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6864 | } | 7159 | } |
6865 | 7160 | ||
6866 | capacity = 0; | 7161 | capacity = 0; |
7162 | min_capacity = ULONG_MAX; | ||
6867 | 7163 | ||
6868 | if (child->flags & SD_OVERLAP) { | 7164 | if (child->flags & SD_OVERLAP) { |
6869 | /* | 7165 | /* |
@@ -6888,11 +7184,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6888 | */ | 7184 | */ |
6889 | if (unlikely(!rq->sd)) { | 7185 | if (unlikely(!rq->sd)) { |
6890 | capacity += capacity_of(cpu); | 7186 | capacity += capacity_of(cpu); |
6891 | continue; | 7187 | } else { |
7188 | sgc = rq->sd->groups->sgc; | ||
7189 | capacity += sgc->capacity; | ||
6892 | } | 7190 | } |
6893 | 7191 | ||
6894 | sgc = rq->sd->groups->sgc; | 7192 | min_capacity = min(capacity, min_capacity); |
6895 | capacity += sgc->capacity; | ||
6896 | } | 7193 | } |
6897 | } else { | 7194 | } else { |
6898 | /* | 7195 | /* |
@@ -6902,12 +7199,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu) | |||
6902 | 7199 | ||
6903 | group = child->groups; | 7200 | group = child->groups; |
6904 | do { | 7201 | do { |
6905 | capacity += group->sgc->capacity; | 7202 | struct sched_group_capacity *sgc = group->sgc; |
7203 | |||
7204 | capacity += sgc->capacity; | ||
7205 | min_capacity = min(sgc->min_capacity, min_capacity); | ||
6906 | group = group->next; | 7206 | group = group->next; |
6907 | } while (group != child->groups); | 7207 | } while (group != child->groups); |
6908 | } | 7208 | } |
6909 | 7209 | ||
6910 | sdg->sgc->capacity = capacity; | 7210 | sdg->sgc->capacity = capacity; |
7211 | sdg->sgc->min_capacity = min_capacity; | ||
6911 | } | 7212 | } |
6912 | 7213 | ||
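update_group_capacity() now keeps a running minimum alongside the running sum so that parent domains can see the weakest per-CPU capacity below them. The reduction itself is trivial; a short sketch over arrays that stand in for the child groups (the array form is an illustrative assumption, not the kernel's data structures):

#include <stdio.h>

struct group_capacity {
	unsigned long capacity;		/* sum of child capacities  */
	unsigned long min_capacity;	/* weakest per-CPU capacity */
};

static struct group_capacity
aggregate_capacity(const unsigned long *child_sum,
		   const unsigned long *child_min, int nr_children)
{
	struct group_capacity gc = { 0, (unsigned long)-1 };
	int i;

	for (i = 0; i < nr_children; i++) {
		gc.capacity += child_sum[i];
		if (child_min[i] < gc.min_capacity)
			gc.min_capacity = child_min[i];
	}
	return gc;
}

int main(void)
{
	/* One child group of two big CPUs (1024 each), one of two little (512). */
	unsigned long child_sum[] = { 2048, 1024 };
	unsigned long child_min[] = { 1024,  512 };
	struct group_capacity gc = aggregate_capacity(child_sum, child_min, 2);

	printf("capacity=%lu min_capacity=%lu\n", gc.capacity, gc.min_capacity);
	return 0;
}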
6913 | /* | 7214 | /* |
@@ -6930,8 +7231,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | |||
6930 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | 7231 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. |
6931 | * Something like: | 7232 | * Something like: |
6932 | * | 7233 | * |
6933 | * { 0 1 2 3 } { 4 5 6 7 } | 7234 | * { 0 1 2 3 } { 4 5 6 7 } |
6934 | * * * * * | 7235 | * * * * * |
6935 | * | 7236 | * |
6936 | * If we were to balance group-wise we'd place two tasks in the first group and | 7237 | * If we were to balance group-wise we'd place two tasks in the first group and |
6937 | * two tasks in the second group. Clearly this is undesired as it will overload | 7238 | * two tasks in the second group. Clearly this is undesired as it will overload |
@@ -7002,6 +7303,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) | |||
7002 | return false; | 7303 | return false; |
7003 | } | 7304 | } |
7004 | 7305 | ||
7306 | /* | ||
7307 | * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller | ||
7308 | * per-CPU capacity than sched_group ref. | ||
7309 | */ | ||
7310 | static inline bool | ||
7311 | group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref) | ||
7312 | { | ||
7313 | return sg->sgc->min_capacity * capacity_margin < | ||
7314 | ref->sgc->min_capacity * 1024; | ||
7315 | } | ||
7316 | |||
7005 | static inline enum | 7317 | static inline enum |
7006 | group_type group_classify(struct sched_group *group, | 7318 | group_type group_classify(struct sched_group *group, |
7007 | struct sg_lb_stats *sgs) | 7319 | struct sg_lb_stats *sgs) |
@@ -7105,6 +7417,20 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
7105 | if (sgs->avg_load <= busiest->avg_load) | 7417 | if (sgs->avg_load <= busiest->avg_load) |
7106 | return false; | 7418 | return false; |
7107 | 7419 | ||
7420 | if (!(env->sd->flags & SD_ASYM_CPUCAPACITY)) | ||
7421 | goto asym_packing; | ||
7422 | |||
7423 | /* | ||
7424 | * Candidate sg has no more than one task per CPU and | ||
7425 | * has higher per-CPU capacity. Migrating tasks to less | ||
7426 | * capable CPUs may harm throughput. Maximize throughput, | ||
7427 | * power/energy consequences are not considered. | ||
7428 | */ | ||
7429 | if (sgs->sum_nr_running <= sgs->group_weight && | ||
7430 | group_smaller_cpu_capacity(sds->local, sg)) | ||
7431 | return false; | ||
7432 | |||
7433 | asym_packing: | ||
7108 | /* This is the busiest node in its class. */ | 7434 | /* This is the busiest node in its class. */ |
7109 | if (!(env->sd->flags & SD_ASYM_PACKING)) | 7435 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
7110 | return true; | 7436 | return true; |
@@ -7113,16 +7439,18 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
7113 | if (env->idle == CPU_NOT_IDLE) | 7439 | if (env->idle == CPU_NOT_IDLE) |
7114 | return true; | 7440 | return true; |
7115 | /* | 7441 | /* |
7116 | * ASYM_PACKING needs to move all the work to the lowest | 7442 | * ASYM_PACKING needs to move all the work to the highest |
7117 | * numbered CPUs in the group, therefore mark all groups | 7443 | * priority CPUs in the group, therefore mark all groups |
7118 | * higher than ourself as busy. | 7444 | * of lower priority than ourself as busy. |
7119 | */ | 7445 | */ |
7120 | if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) { | 7446 | if (sgs->sum_nr_running && |
7447 | sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) { | ||
7121 | if (!sds->busiest) | 7448 | if (!sds->busiest) |
7122 | return true; | 7449 | return true; |
7123 | 7450 | ||
7124 | /* Prefer to move from highest possible cpu's work */ | 7451 | /* Prefer to move from lowest priority cpu's work */ |
7125 | if (group_first_cpu(sds->busiest) < group_first_cpu(sg)) | 7452 | if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, |
7453 | sg->asym_prefer_cpu)) | ||
7126 | return true; | 7454 | return true; |
7127 | } | 7455 | } |
7128 | 7456 | ||
@@ -7274,8 +7602,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
7274 | if (!sds->busiest) | 7602 | if (!sds->busiest) |
7275 | return 0; | 7603 | return 0; |
7276 | 7604 | ||
7277 | busiest_cpu = group_first_cpu(sds->busiest); | 7605 | busiest_cpu = sds->busiest->asym_prefer_cpu; |
7278 | if (env->dst_cpu > busiest_cpu) | 7606 | if (sched_asym_prefer(busiest_cpu, env->dst_cpu)) |
7279 | return 0; | 7607 | return 0; |
7280 | 7608 | ||
7281 | env->imbalance = DIV_ROUND_CLOSEST( | 7609 | env->imbalance = DIV_ROUND_CLOSEST( |
@@ -7613,10 +7941,11 @@ static int need_active_balance(struct lb_env *env) | |||
7613 | 7941 | ||
7614 | /* | 7942 | /* |
7615 | * ASYM_PACKING needs to force migrate tasks from busy but | 7943 | * ASYM_PACKING needs to force migrate tasks from busy but |
7616 | * higher numbered CPUs in order to pack all tasks in the | 7944 | * lower priority CPUs in order to pack all tasks in the |
7617 | * lowest numbered CPUs. | 7945 | * highest priority CPUs. |
7618 | */ | 7946 | */ |
7619 | if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu) | 7947 | if ((sd->flags & SD_ASYM_PACKING) && |
7948 | sched_asym_prefer(env->dst_cpu, env->src_cpu)) | ||
7620 | return 1; | 7949 | return 1; |
7621 | } | 7950 | } |
7622 | 7951 | ||
@@ -8465,7 +8794,7 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
8465 | unsigned long now = jiffies; | 8794 | unsigned long now = jiffies; |
8466 | struct sched_domain_shared *sds; | 8795 | struct sched_domain_shared *sds; |
8467 | struct sched_domain *sd; | 8796 | struct sched_domain *sd; |
8468 | int nr_busy, cpu = rq->cpu; | 8797 | int nr_busy, i, cpu = rq->cpu; |
8469 | bool kick = false; | 8798 | bool kick = false; |
8470 | 8799 | ||
8471 | if (unlikely(rq->idle_balance)) | 8800 | if (unlikely(rq->idle_balance)) |
@@ -8516,12 +8845,18 @@ static inline bool nohz_kick_needed(struct rq *rq) | |||
8516 | } | 8845 | } |
8517 | 8846 | ||
8518 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); | 8847 | sd = rcu_dereference(per_cpu(sd_asym, cpu)); |
8519 | if (sd && (cpumask_first_and(nohz.idle_cpus_mask, | 8848 | if (sd) { |
8520 | sched_domain_span(sd)) < cpu)) { | 8849 | for_each_cpu(i, sched_domain_span(sd)) { |
8521 | kick = true; | 8850 | if (i == cpu || |
8522 | goto unlock; | 8851 | !cpumask_test_cpu(i, nohz.idle_cpus_mask)) |
8523 | } | 8852 | continue; |
8524 | 8853 | ||
8854 | if (sched_asym_prefer(i, cpu)) { | ||
8855 | kick = true; | ||
8856 | goto unlock; | ||
8857 | } | ||
8858 | } | ||
8859 | } | ||
8525 | unlock: | 8860 | unlock: |
8526 | rcu_read_unlock(); | 8861 | rcu_read_unlock(); |
8527 | return kick; | 8862 | return kick; |
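With per-CPU priorities, nohz_kick_needed() can no longer rely on "some idle CPU has a lower number than me"; it has to scan the asym domain for an idle CPU that actually outranks the busy one. A userspace model of that scan, with a made-up priority table standing in for arch_asym_cpu_priority():

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Invented per-CPU priorities; x86 ITMT would fill these from CPPC. */
static const int cpu_priority[NR_CPUS] = { 10, 40, 20, 30 };

static bool asym_prefer(int a, int b)
{
	return cpu_priority[a] > cpu_priority[b];
}

/* Kick nohz balancing iff some *idle* CPU in the domain outranks @cpu. */
static bool kick_needed(int cpu, const bool *idle)
{
	int i;

	for (i = 0; i < NR_CPUS; i++) {
		if (i == cpu || !idle[i])
			continue;
		if (asym_prefer(i, cpu))
			return true;
	}
	return false;
}

int main(void)
{
	bool idle[NR_CPUS] = { false, true, false, false };

	/* CPU1 (prio 40) is idle and outranks busy CPU2 (prio 20): kick. */
	printf("kick=%d\n", kick_needed(2, idle));
	return 0;
}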
@@ -8687,32 +9022,45 @@ static inline bool vruntime_normalized(struct task_struct *p) | |||
8687 | return false; | 9022 | return false; |
8688 | } | 9023 | } |
8689 | 9024 | ||
8690 | static void detach_task_cfs_rq(struct task_struct *p) | 9025 | #ifdef CONFIG_FAIR_GROUP_SCHED |
9026 | /* | ||
9027 | * Propagate the changes of the sched_entity across the tg tree to make it | ||
9028 | * visible to the root | ||
9029 | */ | ||
9030 | static void propagate_entity_cfs_rq(struct sched_entity *se) | ||
8691 | { | 9031 | { |
8692 | struct sched_entity *se = &p->se; | 9032 | struct cfs_rq *cfs_rq; |
8693 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
8694 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
8695 | 9033 | ||
8696 | if (!vruntime_normalized(p)) { | 9034 | /* Start to propagate at parent */ |
8697 | /* | 9035 | se = se->parent; |
8698 | * Fix up our vruntime so that the current sleep doesn't | 9036 | |
8699 | * cause 'unlimited' sleep bonus. | 9037 | for_each_sched_entity(se) { |
8700 | */ | 9038 | cfs_rq = cfs_rq_of(se); |
8701 | place_entity(cfs_rq, se, 0); | 9039 | |
8702 | se->vruntime -= cfs_rq->min_vruntime; | 9040 | if (cfs_rq_throttled(cfs_rq)) |
9041 | break; | ||
9042 | |||
9043 | update_load_avg(se, UPDATE_TG); | ||
8703 | } | 9044 | } |
9045 | } | ||
9046 | #else | ||
9047 | static void propagate_entity_cfs_rq(struct sched_entity *se) { } | ||
9048 | #endif | ||
9049 | |||
9050 | static void detach_entity_cfs_rq(struct sched_entity *se) | ||
9051 | { | ||
9052 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
8704 | 9053 | ||
8705 | /* Catch up with the cfs_rq and remove our load when we leave */ | 9054 | /* Catch up with the cfs_rq and remove our load when we leave */ |
8706 | update_cfs_rq_load_avg(now, cfs_rq, false); | 9055 | update_load_avg(se, 0); |
8707 | detach_entity_load_avg(cfs_rq, se); | 9056 | detach_entity_load_avg(cfs_rq, se); |
8708 | update_tg_load_avg(cfs_rq, false); | 9057 | update_tg_load_avg(cfs_rq, false); |
9058 | propagate_entity_cfs_rq(se); | ||
8709 | } | 9059 | } |
8710 | 9060 | ||
8711 | static void attach_task_cfs_rq(struct task_struct *p) | 9061 | static void attach_entity_cfs_rq(struct sched_entity *se) |
8712 | { | 9062 | { |
8713 | struct sched_entity *se = &p->se; | ||
8714 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | 9063 | struct cfs_rq *cfs_rq = cfs_rq_of(se); |
8715 | u64 now = cfs_rq_clock_task(cfs_rq); | ||
8716 | 9064 | ||
8717 | #ifdef CONFIG_FAIR_GROUP_SCHED | 9065 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8718 | /* | 9066 | /* |
@@ -8722,10 +9070,36 @@ static void attach_task_cfs_rq(struct task_struct *p) | |||
8722 | se->depth = se->parent ? se->parent->depth + 1 : 0; | 9070 | se->depth = se->parent ? se->parent->depth + 1 : 0; |
8723 | #endif | 9071 | #endif |
8724 | 9072 | ||
8725 | /* Synchronize task with its cfs_rq */ | 9073 | /* Synchronize entity with its cfs_rq */ |
8726 | update_cfs_rq_load_avg(now, cfs_rq, false); | 9074 | update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); |
8727 | attach_entity_load_avg(cfs_rq, se); | 9075 | attach_entity_load_avg(cfs_rq, se); |
8728 | update_tg_load_avg(cfs_rq, false); | 9076 | update_tg_load_avg(cfs_rq, false); |
9077 | propagate_entity_cfs_rq(se); | ||
9078 | } | ||
9079 | |||
9080 | static void detach_task_cfs_rq(struct task_struct *p) | ||
9081 | { | ||
9082 | struct sched_entity *se = &p->se; | ||
9083 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
9084 | |||
9085 | if (!vruntime_normalized(p)) { | ||
9086 | /* | ||
9087 | * Fix up our vruntime so that the current sleep doesn't | ||
9088 | * cause 'unlimited' sleep bonus. | ||
9089 | */ | ||
9090 | place_entity(cfs_rq, se, 0); | ||
9091 | se->vruntime -= cfs_rq->min_vruntime; | ||
9092 | } | ||
9093 | |||
9094 | detach_entity_cfs_rq(se); | ||
9095 | } | ||
9096 | |||
9097 | static void attach_task_cfs_rq(struct task_struct *p) | ||
9098 | { | ||
9099 | struct sched_entity *se = &p->se; | ||
9100 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
9101 | |||
9102 | attach_entity_cfs_rq(se); | ||
8729 | 9103 | ||
8730 | if (!vruntime_normalized(p)) | 9104 | if (!vruntime_normalized(p)) |
8731 | se->vruntime += cfs_rq->min_vruntime; | 9105 | se->vruntime += cfs_rq->min_vruntime; |
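For CONFIG_FAIR_GROUP_SCHED, the new attach/detach helpers call propagate_entity_cfs_rq() so that a load change applied to a leaf cfs_rq becomes visible all the way up to the root, stopping early at a throttled level. The traversal is just a walk up the parent chain; a toy sketch follows, where the node layout and the plain 'load' field are mine, standing in for update_load_avg() and the real cfs_rq hierarchy.

#include <stdio.h>

struct entity {
	const char	*name;
	struct entity	*parent;
	int		 throttled;	/* models cfs_rq_throttled()         */
	long		 load;		/* models the per-level load average */
};

/* Walk from the entity's parent to the root, refreshing each level,
 * and stop as soon as a throttled level is hit. */
static void propagate_sketch(struct entity *se, long delta)
{
	for (se = se->parent; se; se = se->parent) {
		if (se->throttled)
			break;
		se->load += delta;	/* stands in for update_load_avg() */
		printf("propagated %+ld to %s (load=%ld)\n",
		       delta, se->name, se->load);
	}
}

int main(void)
{
	struct entity root = { "root", NULL,  0, 100 };
	struct entity tg_a = { "tg_a", &root, 0,  40 };
	struct entity task = { "task", &tg_a, 0,  10 };

	propagate_sketch(&task, 10);	/* e.g. a task attached below tg_a */
	return 0;
}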
@@ -8779,6 +9153,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) | |||
8779 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; | 9153 | cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; |
8780 | #endif | 9154 | #endif |
8781 | #ifdef CONFIG_SMP | 9155 | #ifdef CONFIG_SMP |
9156 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
9157 | cfs_rq->propagate_avg = 0; | ||
9158 | #endif | ||
8782 | atomic_long_set(&cfs_rq->removed_load_avg, 0); | 9159 | atomic_long_set(&cfs_rq->removed_load_avg, 0); |
8783 | atomic_long_set(&cfs_rq->removed_util_avg, 0); | 9160 | atomic_long_set(&cfs_rq->removed_util_avg, 0); |
8784 | #endif | 9161 | #endif |
@@ -8887,7 +9264,7 @@ void online_fair_sched_group(struct task_group *tg) | |||
8887 | se = tg->se[i]; | 9264 | se = tg->se[i]; |
8888 | 9265 | ||
8889 | raw_spin_lock_irq(&rq->lock); | 9266 | raw_spin_lock_irq(&rq->lock); |
8890 | post_init_entity_util_avg(se); | 9267 | attach_entity_cfs_rq(se); |
8891 | sync_throttle(tg, i); | 9268 | sync_throttle(tg, i); |
8892 | raw_spin_unlock_irq(&rq->lock); | 9269 | raw_spin_unlock_irq(&rq->lock); |
8893 | } | 9270 | } |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 055f935d4421..7b34c7826ca5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -404,6 +404,7 @@ struct cfs_rq { | |||
404 | unsigned long runnable_load_avg; | 404 | unsigned long runnable_load_avg; |
405 | #ifdef CONFIG_FAIR_GROUP_SCHED | 405 | #ifdef CONFIG_FAIR_GROUP_SCHED |
406 | unsigned long tg_load_avg_contrib; | 406 | unsigned long tg_load_avg_contrib; |
407 | unsigned long propagate_avg; | ||
407 | #endif | 408 | #endif |
408 | atomic_long_t removed_load_avg, removed_util_avg; | 409 | atomic_long_t removed_load_avg, removed_util_avg; |
409 | #ifndef CONFIG_64BIT | 410 | #ifndef CONFIG_64BIT |
@@ -539,6 +540,11 @@ struct dl_rq { | |||
539 | 540 | ||
540 | #ifdef CONFIG_SMP | 541 | #ifdef CONFIG_SMP |
541 | 542 | ||
543 | static inline bool sched_asym_prefer(int a, int b) | ||
544 | { | ||
545 | return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); | ||
546 | } | ||
547 | |||
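sched_asym_prefer() is the primitive that turns ASYM_PACKING from "prefer lower CPU numbers" into "prefer higher arch-defined priority", which is what lets ITMT steer single-threaded work onto the turbo-favored cores. A userspace illustration with a stub priority function; returning -cpu in the stub is meant to mirror the scheduler's fallback behaviour (lower CPU numbers still win when no arch priority is registered), while x86 ITMT overrides it with CPPC-derived per-core priorities.

#include <stdbool.h>
#include <stdio.h>

/* Stub standing in for arch_asym_cpu_priority(); -cpu preserves the
 * old lowest-number-wins ordering when no arch priority is set. */
static int arch_asym_cpu_priority(int cpu)
{
	return -cpu;
}

static bool sched_asym_prefer(int a, int b)
{
	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}

int main(void)
{
	/* With the default priority, CPU0 is still preferred over CPU3. */
	printf("prefer cpu0 over cpu3: %d\n", sched_asym_prefer(0, 3));
	return 0;
}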
542 | /* | 548 | /* |
543 | * We add the notion of a root-domain which will be used to define per-domain | 549 | * We add the notion of a root-domain which will be used to define per-domain |
544 | * variables. Each exclusive cpuset essentially defines an island domain by | 550 | * variables. Each exclusive cpuset essentially defines an island domain by |
@@ -623,6 +629,7 @@ struct rq { | |||
623 | #ifdef CONFIG_FAIR_GROUP_SCHED | 629 | #ifdef CONFIG_FAIR_GROUP_SCHED |
624 | /* list of leaf cfs_rq on this cpu: */ | 630 | /* list of leaf cfs_rq on this cpu: */ |
625 | struct list_head leaf_cfs_rq_list; | 631 | struct list_head leaf_cfs_rq_list; |
632 | struct list_head *tmp_alone_branch; | ||
626 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 633 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
627 | 634 | ||
628 | /* | 635 | /* |
@@ -892,7 +899,8 @@ struct sched_group_capacity { | |||
892 | * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity | 899 | * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity |
893 | * for a single CPU. | 900 | * for a single CPU. |
894 | */ | 901 | */ |
895 | unsigned int capacity; | 902 | unsigned long capacity; |
903 | unsigned long min_capacity; /* Min per-CPU capacity in group */ | ||
896 | unsigned long next_update; | 904 | unsigned long next_update; |
897 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 905 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
898 | 906 | ||
@@ -905,6 +913,7 @@ struct sched_group { | |||
905 | 913 | ||
906 | unsigned int group_weight; | 914 | unsigned int group_weight; |
907 | struct sched_group_capacity *sgc; | 915 | struct sched_group_capacity *sgc; |
916 | int asym_prefer_cpu; /* cpu of highest priority in group */ | ||
908 | 917 | ||
909 | /* | 918 | /* |
910 | * The CPUs this group covers. | 919 | * The CPUs this group covers. |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 706309f9ed84..739fb17371af 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -347,13 +347,6 @@ static struct ctl_table kern_table[] = { | |||
347 | .mode = 0644, | 347 | .mode = 0644, |
348 | .proc_handler = proc_dointvec, | 348 | .proc_handler = proc_dointvec, |
349 | }, | 349 | }, |
350 | { | ||
351 | .procname = "sched_shares_window_ns", | ||
352 | .data = &sysctl_sched_shares_window, | ||
353 | .maxlen = sizeof(unsigned int), | ||
354 | .mode = 0644, | ||
355 | .proc_handler = proc_dointvec, | ||
356 | }, | ||
357 | #ifdef CONFIG_SCHEDSTATS | 350 | #ifdef CONFIG_SCHEDSTATS |
358 | { | 351 | { |
359 | .procname = "sched_schedstats", | 352 | .procname = "sched_schedstats", |
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 39008d78927a..e887ffc8eef3 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c | |||
@@ -133,9 +133,9 @@ static inline unsigned long long prof_ticks(struct task_struct *p) | |||
133 | } | 133 | } |
134 | static inline unsigned long long virt_ticks(struct task_struct *p) | 134 | static inline unsigned long long virt_ticks(struct task_struct *p) |
135 | { | 135 | { |
136 | cputime_t utime; | 136 | cputime_t utime, stime; |
137 | 137 | ||
138 | task_cputime(p, &utime, NULL); | 138 | task_cputime(p, &utime, &stime); |
139 | 139 | ||
140 | return cputime_to_expires(utime); | 140 | return cputime_to_expires(utime); |
141 | } | 141 | } |