author	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-06 12:39:22 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-08-06 12:39:22 -0400
commit	c4efd6b569b2646e1346a08a4c40286f8bcb5f11 (patch)
tree	bf33e8594ac4e628cc95f2ef25513788b8273601
parent	4aed2fd8e3181fea7c09ba79cf64e7e3f4413bf9 (diff)
parent	0bcfe75807944106a3aa655a54bb610d62f3a7f5 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (27 commits)
  sched: Use correct macro to display sched_child_runs_first in /proc/sched_debug
  sched: No need for bootmem special cases
  sched: Revert nohz_ratelimit() for now
  sched: Reduce update_group_power() calls
  sched: Update rq->clock for nohz balanced cpus
  sched: Fix spelling of sibling
  sched, cpuset: Drop __cpuexit from cpu hotplug callbacks
  sched: Fix the racy usage of thread_group_cputimer() in fastpath_timer_check()
  sched: run_posix_cpu_timers: Don't check ->exit_state, use lock_task_sighand()
  sched: thread_group_cputime: Simplify, document the "alive" check
  sched: Remove the obsolete exit_state/signal hacks
  sched: task_tick_rt: Remove the obsolete ->signal != NULL check
  sched: __sched_setscheduler: Read the RLIMIT_RTPRIO value lockless
  sched: Fix comments to make them DocBook happy
  sched: Fix fix_small_capacity
  powerpc: Exclude arch_sd_sibiling_asym_packing() on UP
  powerpc: Enable asymmetric SMT scheduling on POWER7
  sched: Add asymmetric group packing option for sibling domain
  sched: Fix capacity calculations for SMT4
  sched: Change nohz idle load balancing logic to push model
  ...
-rw-r--r--	arch/parisc/kernel/ftrace.c	|   4
-rw-r--r--	arch/powerpc/include/asm/cputable.h	|   3
-rw-r--r--	arch/powerpc/kernel/process.c	|  11
-rw-r--r--	include/linux/cpu.h	|  25
-rw-r--r--	include/linux/cpuset.h	|   6
-rw-r--r--	include/linux/perf_event.h	|   2
-rw-r--r--	include/linux/sched.h	|  59
-rw-r--r--	include/linux/topology.h	|   1
-rw-r--r--	kernel/cpu.c	|   6
-rw-r--r--	kernel/cpuset.c	|  21
-rw-r--r--	kernel/fork.c	|   2
-rw-r--r--	kernel/hrtimer.c	|   8
-rw-r--r--	kernel/lockdep.c	|   2
-rw-r--r--	kernel/perf_event.c	|   2
-rw-r--r--	kernel/posix-cpu-timers.c	|  36
-rw-r--r--	kernel/rcutorture.c	|   3
-rw-r--r--	kernel/sched.c	| 391
-rw-r--r--	kernel/sched_clock.c	|  95
-rw-r--r--	kernel/sched_cpupri.c	|   8
-rw-r--r--	kernel/sched_cpupri.h	|   2
-rw-r--r--	kernel/sched_debug.c	|   2
-rw-r--r--	kernel/sched_fair.c	| 532
-rw-r--r--	kernel/sched_rt.c	|   3
-rw-r--r--	kernel/sched_stats.h	|  27
-rw-r--r--	kernel/time/tick-sched.c	|  10
-rw-r--r--	kernel/timer.c	|   8
-rw-r--r--	kernel/trace/trace_clock.c	|   2
-rw-r--r--	kernel/workqueue_sched.h	|  16
28 files changed, 877 insertions(+), 410 deletions(-)
diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c
index 9877372ffdba..5beb97bafbb1 100644
--- a/arch/parisc/kernel/ftrace.c
+++ b/arch/parisc/kernel/ftrace.c
@@ -82,7 +82,7 @@ unsigned long ftrace_return_to_handler(unsigned long retval0,
 	unsigned long ret;
 
 	pop_return_trace(&trace, &ret);
-	trace.rettime = cpu_clock(raw_smp_processor_id());
+	trace.rettime = local_clock();
 	ftrace_graph_return(&trace);
 
 	if (unlikely(!ret)) {
@@ -126,7 +126,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		return;
 	}
 
-	calltime = cpu_clock(raw_smp_processor_id());
+	calltime = local_clock();
 
 	if (push_return_trace(old, calltime,
 				self_addr, &trace.depth) == -EBUSY) {
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index 5e2e2cfcc81b..3a40a992e594 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -197,6 +197,7 @@ extern const char *powerpc_base_platform;
 #define CPU_FTR_SAO			LONG_ASM_CONST(0x0020000000000000)
 #define CPU_FTR_CP_USE_DCBTZ		LONG_ASM_CONST(0x0040000000000000)
 #define CPU_FTR_UNALIGNED_LD_STD	LONG_ASM_CONST(0x0080000000000000)
+#define CPU_FTR_ASYM_SMT		LONG_ASM_CONST(0x0100000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -412,7 +413,7 @@ extern const char *powerpc_base_platform;
 	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
 	    CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE | \
 	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
-	    CPU_FTR_DSCR | CPU_FTR_SAO)
+	    CPU_FTR_DSCR | CPU_FTR_SAO | CPU_FTR_ASYM_SMT)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 551f6713ff42..e78a5add7f15 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1299,3 +1299,14 @@ unsigned long randomize_et_dyn(unsigned long base)
 
 	return ret;
 }
+
+#ifdef CONFIG_SMP
+int arch_sd_sibling_asym_packing(void)
+{
+	if (cpu_has_feature(CPU_FTR_ASYM_SMT)) {
+		printk_once(KERN_INFO "Enabling Asymmetric SMT scheduling\n");
+		return SD_ASYM_PACKING;
+	}
+	return 0;
+}
+#endif
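This definition overrides the weak default declared in include/linux/sched.h
(see the hunk below adding "extern int __weak arch_sd_sibling_asym_packing(void)"):
architectures without a strong definition fall back to a stub returning 0, so
in this merge only POWER7-class CPUs (which set CPU_FTR_ASYM_SMT above) end up
contributing SD_ASYM_PACKING. A minimal sketch of the weak-override linkage
pattern, with hypothetical names:

	/* generic.c -- weak fallback, used when no arch override exists */
	int __attribute__((weak)) arch_sd_hook(void)
	{
		return 0;
	}

	/* arch.c -- a strong definition silently wins at link time */
	int arch_sd_hook(void)
	{
		return 1;	/* e.g. an SD_* flag such as SD_ASYM_PACKING */
	}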
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index e287863ac053..de6b1722cdca 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -48,6 +48,31 @@ extern ssize_t arch_cpu_release(const char *, size_t);
 #endif
 struct notifier_block;
 
+/*
+ * CPU notifier priorities.
+ */
+enum {
+	/*
+	 * SCHED_ACTIVE marks a cpu which is coming up active during
+	 * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
+	 * notifier.  CPUSET_ACTIVE adjusts cpuset according to
+	 * cpu_active mask right after SCHED_ACTIVE.  During
+	 * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
+	 * ordered in a similar way.
+	 *
+	 * This ordering guarantees consistent cpu_active mask and
+	 * migration behavior to all cpu notifiers.
+	 */
+	CPU_PRI_SCHED_ACTIVE	= INT_MAX,
+	CPU_PRI_CPUSET_ACTIVE	= INT_MAX - 1,
+	CPU_PRI_SCHED_INACTIVE	= INT_MIN + 1,
+	CPU_PRI_CPUSET_INACTIVE	= INT_MIN,
+
+	/* migration should happen before other stuff but after perf */
+	CPU_PRI_PERF		= 20,
+	CPU_PRI_MIGRATION	= 10,
+};
+
 #ifdef CONFIG_SMP
 /* Need to know about CPUs going up/down? */
 #if defined(CONFIG_HOTPLUG_CPU) || !defined(MODULE)
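Notifier callbacks run in descending priority order, so on CPU_ONLINE the
scheduler's CPU_PRI_SCHED_ACTIVE callback marks the cpu active before anything
else runs, and on CPU_DOWN_PREPARE the near-minimum CPU_PRI_SCHED_INACTIVE
makes the deactivation run close to last. A sketch of hooking in relative to
these priorities (my_cpu_notify and its registration are hypothetical, not
part of this patch):

	#include <linux/cpu.h>
	#include <linux/notifier.h>

	static int my_cpu_notify(struct notifier_block *nb,
				 unsigned long action, void *hcpu)
	{
		switch (action & ~CPU_TASKS_FROZEN) {
		case CPU_ONLINE:
			/* cpu_active is already updated here, because
			 * CPU_PRI_SCHED_ACTIVE (INT_MAX) ran before us */
			return NOTIFY_OK;
		default:
			return NOTIFY_DONE;
		}
	}

	static int __init my_init(void)
	{
		/* priority 0: runs after perf (20) and migration (10) on up */
		hotcpu_notifier(my_cpu_notify, 0);
		return 0;
	}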
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 457ed765a116..f20eb8f16025 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -20,6 +20,7 @@ extern int number_of_cpusets; /* How many cpusets are defined in system? */
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
+extern void cpuset_update_active_cpus(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern int cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -132,6 +133,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
+static inline void cpuset_update_active_cpus(void)
+{
+	partition_sched_domains(1, NULL, NULL);
+}
+
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 937495c25073..716f99b682c1 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1067,7 +1067,7 @@ static inline void perf_event_disable(struct perf_event *event) { }
 #define perf_cpu_notifier(fn)					\
 do {								\
 	static struct notifier_block fn##_nb __cpuinitdata =	\
-		{ .notifier_call = fn, .priority = 20 };	\
+		{ .notifier_call = fn, .priority = CPU_PRI_PERF }; \
 	fn(&fn##_nb, (unsigned long)CPU_UP_PREPARE,		\
 		(void *)(unsigned long)smp_processor_id());	\
 	fn(&fn##_nb, (unsigned long)CPU_STARTING,		\
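Replacing the bare constant with CPU_PRI_PERF keeps perf ahead of the
migration notifier (priority 10), matching the "migration should happen
before other stuff but after perf" rule from cpu.h. A hypothetical user of
the macro, sketched for illustration:

	static int __cpuinit my_pmu_notify(struct notifier_block *nb,
					   unsigned long action, void *hcpu)
	{
		/* set up or tear down per-cpu PMU state here */
		return NOTIFY_OK;
	}

	static int __init my_pmu_init(void)
	{
		/* replays UP_PREPARE/STARTING/ONLINE for the current cpu,
		 * then registers at CPU_PRI_PERF */
		perf_cpu_notifier(my_pmu_notify);
		return 0;
	}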
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3992f50de614..9591907c4f79 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -272,19 +272,10 @@ extern int runqueue_is_locked(int cpu);
 
 extern cpumask_var_t nohz_cpu_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
-extern int select_nohz_load_balancer(int cpu);
-extern int get_nohz_load_balancer(void);
-extern int nohz_ratelimit(int cpu);
+extern void select_nohz_load_balancer(int stop_tick);
+extern int get_nohz_timer_target(void);
 #else
-static inline int select_nohz_load_balancer(int cpu)
-{
-	return 0;
-}
-
-static inline int nohz_ratelimit(int cpu)
-{
-	return 0;
-}
+static inline void select_nohz_load_balancer(int stop_tick) { }
 #endif
 
 /*
@@ -801,7 +792,7 @@ enum cpu_idle_type {
 #define SD_POWERSAVINGS_BALANCE	0x0100	/* Balance for power savings */
 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
-
+#define SD_ASYM_PACKING		0x0800  /* Place busy groups earlier in the domain */
 #define SD_PREFER_SIBLING	0x1000	/* Prefer to place tasks in a sibling domain */
 
 enum powersavings_balance_level {
@@ -836,6 +827,8 @@ static inline int sd_balance_for_package_power(void)
 	return SD_PREFER_SIBLING;
 }
 
+extern int __weak arch_sd_sibling_asym_packing(void);
+
 /*
  * Optimise SD flags for power savings:
  * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
@@ -857,7 +850,7 @@ struct sched_group {
 	 * CPU power of this group, SCHED_LOAD_SCALE being max power for a
 	 * single CPU.
 	 */
-	unsigned int cpu_power;
+	unsigned int cpu_power, cpu_power_orig;
 
 	/*
 	 * The CPUs this group covers.
@@ -1693,6 +1686,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 #define PF_EXITING	0x00000004	/* getting shut down */
 #define PF_EXITPIDONE	0x00000008	/* pi exit done on shut down */
 #define PF_VCPU		0x00000010	/* I'm a virtual CPU */
+#define PF_WQ_WORKER	0x00000020	/* I'm a workqueue worker */
 #define PF_FORKNOEXEC	0x00000040	/* forked but didn't exec */
 #define PF_MCE_PROCESS  0x00000080      /* process policy on mce errors */
 #define PF_SUPERPRIV	0x00000100	/* used super-user privileges */
@@ -1787,20 +1781,23 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 #endif
 
 /*
- * Architectures can set this to 1 if they have specified
- * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
- * but then during bootup it turns out that sched_clock()
- * is reliable after all:
+ * Do not use outside of architecture code which knows its limitations.
+ *
+ * sched_clock() has no promise of monotonicity or bounded drift between
+ * CPUs; its use (which you should not need) requires disabling IRQs.
+ *
+ * Please use one of the three interfaces below.
  */
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-extern int sched_clock_stable;
-#endif
-
-/* ftrace calls sched_clock() directly */
 extern unsigned long long notrace sched_clock(void);
+/*
+ * See the comment in kernel/sched_clock.c
+ */
+extern u64 cpu_clock(int cpu);
+extern u64 local_clock(void);
+extern u64 sched_clock_cpu(int cpu);
+
 
 extern void sched_clock_init(void);
-extern u64 sched_clock_cpu(int cpu);
 
 #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 static inline void sched_clock_tick(void)
@@ -1815,17 +1812,19 @@ static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
 {
 }
 #else
+/*
+ * Architectures can set this to 1 if they have specified
+ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ * but then during bootup it turns out that sched_clock()
+ * is reliable after all:
+ */
+extern int sched_clock_stable;
+
 extern void sched_clock_tick(void);
 extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 #endif
 
-/*
- * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
- * clock constructed from sched_clock():
- */
-extern unsigned long long cpu_clock(int cpu);
-
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
 extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
diff --git a/include/linux/topology.h b/include/linux/topology.h
index c44df50a05ab..b572e432d2f3 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -103,6 +103,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
 				| 0*SD_PREFER_SIBLING			\
+				| arch_sd_sibling_asym_packing()	\
 				,					\
 	.last_balance		= jiffies,				\
 	.balance_interval	= 1,					\
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 97d1b426a4ac..f6e726f18491 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -235,11 +235,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		return -EINVAL;
 
 	cpu_hotplug_begin();
-	set_cpu_active(cpu, false);
 	err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
 	if (err) {
-		set_cpu_active(cpu, true);
-
 		nr_calls--;
 		__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
 		printk("%s: attempt to take down CPU %u failed\n",
@@ -249,7 +246,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
-		set_cpu_active(cpu, true);
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
 
@@ -321,8 +317,6 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
-	set_cpu_active(cpu, true);
-
 	/* Now call notifier in preparation. */
 	cpu_notify(CPU_ONLINE | mod, hcpu);
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7cb37d86a005..b23c0979bbe7 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2113,31 +2113,17 @@ static void scan_for_empty_cpusets(struct cpuset *root)
  * but making no active use of cpusets.
  *
  * This routine ensures that top_cpuset.cpus_allowed tracks
- * cpu_online_map on each CPU hotplug (cpuhp) event.
+ * cpu_active_mask on each CPU hotplug (cpuhp) event.
  *
  * Called within get_online_cpus().  Needs to call cgroup_lock()
  * before calling generate_sched_domains().
  */
-static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-				unsigned long phase, void *unused_cpu)
+void cpuset_update_active_cpus(void)
 {
 	struct sched_domain_attr *attr;
 	cpumask_var_t *doms;
 	int ndoms;
 
-	switch (phase) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		break;
-
-	default:
-		return NOTIFY_DONE;
-	}
-
 	cgroup_lock();
 	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
@@ -2148,8 +2134,6 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 
 	/* Have scheduler rebuild the domains */
 	partition_sched_domains(ndoms, doms, attr);
-
-	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -2203,7 +2187,6 @@ void __init cpuset_init_smp(void)
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 
-	hotcpu_notifier(cpuset_track_online_cpus, 0);
 	hotplug_memory_notifier(cpuset_track_online_nodes, 10);
 
 	cpuset_wq = create_singlethread_workqueue("cpuset");
diff --git a/kernel/fork.c b/kernel/fork.c
index b6cce14ba047..a82a65cef741 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -907,7 +907,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long new_flags = p->flags;
 
-	new_flags &= ~PF_SUPERPRIV;
+	new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
 	new_flags |= PF_FORKNOEXEC;
 	new_flags |= PF_STARTING;
 	p->flags = new_flags;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..e934339fbbef 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -144,12 +144,8 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 static int hrtimer_get_target(int this_cpu, int pinned)
 {
 #ifdef CONFIG_NO_HZ
-	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) {
-		int preferred_cpu = get_nohz_load_balancer();
-
-		if (preferred_cpu >= 0)
-			return preferred_cpu;
-	}
+	if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu))
+		return get_nohz_timer_target();
 #endif
 	return this_cpu;
 }
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 54286798c37b..f2852a510232 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -146,7 +146,7 @@ static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
 
 static inline u64 lockstat_clock(void)
 {
-	return cpu_clock(smp_processor_id());
+	return local_clock();
 }
 
 static int lock_point(unsigned long points[], unsigned long ip)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index c772a3d4000d..403d1804b198 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -214,7 +214,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 
 static inline u64 perf_clock(void)
 {
-	return cpu_clock(raw_smp_processor_id());
+	return local_clock();
 }
 
 /*
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 9829646d399c..f66bdd33a6c6 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -232,31 +232,24 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 {
-	struct sighand_struct *sighand;
-	struct signal_struct *sig;
+	struct signal_struct *sig = tsk->signal;
 	struct task_struct *t;
 
-	*times = INIT_CPUTIME;
+	times->utime = sig->utime;
+	times->stime = sig->stime;
+	times->sum_exec_runtime = sig->sum_sched_runtime;
 
 	rcu_read_lock();
-	sighand = rcu_dereference(tsk->sighand);
-	if (!sighand)
+	/* make sure we can trust tsk->thread_group list */
+	if (!likely(pid_alive(tsk)))
 		goto out;
 
-	sig = tsk->signal;
-
 	t = tsk;
 	do {
 		times->utime = cputime_add(times->utime, t->utime);
 		times->stime = cputime_add(times->stime, t->stime);
 		times->sum_exec_runtime += t->se.sum_exec_runtime;
-
-		t = next_thread(t);
-	} while (t != tsk);
-
-	times->utime = cputime_add(times->utime, sig->utime);
-	times->stime = cputime_add(times->stime, sig->stime);
-	times->sum_exec_runtime += sig->sum_sched_runtime;
+	} while_each_thread(tsk, t);
 out:
 	rcu_read_unlock();
 }
@@ -1279,10 +1272,6 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 {
 	struct signal_struct *sig;
 
-	/* tsk == current, ensure it is safe to use ->signal/sighand */
-	if (unlikely(tsk->exit_state))
-		return 0;
-
 	if (!task_cputime_zero(&tsk->cputime_expires)) {
 		struct task_cputime task_sample = {
 			.utime = tsk->utime,
@@ -1298,7 +1287,10 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	if (sig->cputimer.running) {
 		struct task_cputime group_sample;
 
-		thread_group_cputimer(tsk, &group_sample);
+		spin_lock(&sig->cputimer.lock);
+		group_sample = sig->cputimer.cputime;
+		spin_unlock(&sig->cputimer.lock);
+
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
 	}
@@ -1315,6 +1307,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 {
 	LIST_HEAD(firing);
 	struct k_itimer *timer, *next;
+	unsigned long flags;
 
 	BUG_ON(!irqs_disabled());
 
@@ -1325,7 +1318,8 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	if (!fastpath_timer_check(tsk))
 		return;
 
-	spin_lock(&tsk->sighand->siglock);
+	if (!lock_task_sighand(tsk, &flags))
+		return;
 	/*
 	 * Here we take off tsk->signal->cpu_timers[N] and
 	 * tsk->cpu_timers[N] all the timers that are firing, and
@@ -1347,7 +1341,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	 * that gets the timer lock before we do will give it up and
 	 * spin until we've taken care of that timer below.
 	 */
-	spin_unlock(&tsk->sighand->siglock);
+	unlock_task_sighand(tsk, &flags);
 
 	/*
 	 * Now that all the timers on our list have the firing flag,
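lock_task_sighand() validates under RCU that ->sighand is still present
before taking ->siglock, which is what lets the ->exit_state test above be
deleted: if the task is being reaped, the lock attempt simply fails. The
pattern, roughly as used here:

	unsigned long flags;

	if (!lock_task_sighand(tsk, &flags))
		return;	/* task is exiting; sighand already detached */

	/* ... work protected by tsk->sighand->siglock ... */

	unlock_task_sighand(tsk, &flags);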
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 6535ac8bc6a5..2e2726d790b9 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -239,8 +239,7 @@ static unsigned long
 rcu_random(struct rcu_random_state *rrsp)
 {
 	if (--rrsp->rrs_count < 0) {
-		rrsp->rrs_state +=
-			(unsigned long)cpu_clock(raw_smp_processor_id());
+		rrsp->rrs_state += (unsigned long)local_clock();
 		rrsp->rrs_count = RCU_RANDOM_REFRESH;
 	}
 	rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
diff --git a/kernel/sched.c b/kernel/sched.c
index 265cf3a2b5d8..41541d79e3c8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
+#include "workqueue_sched.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
@@ -456,9 +457,10 @@ struct rq {
 	unsigned long nr_running;
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+	unsigned long last_load_update_tick;
 #ifdef CONFIG_NO_HZ
 	u64 nohz_stamp;
-	unsigned char in_nohz_recently;
+	unsigned char nohz_balance_kick;
 #endif
 	unsigned int skip_clock_update;
 
@@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu)
 
 #ifdef CONFIG_NO_HZ
 /*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu.  This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be up to date wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int cpu = smp_processor_id();
+	int i;
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd))
+			if (!idle_cpu(i))
+				return i;
+	}
+	return cpu;
+}
+/*
  * When add_timer_on() enqueues a timer into the timer wheel of an
  * idle CPU then this timer might expire before the next timer event
  * which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu)
 	smp_send_reschedule(cpu);
 }
 
-int nohz_ratelimit(int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	u64 diff = rq->clock - rq->nohz_stamp;
-
-	rq->nohz_stamp = rq->clock;
-
-	return diff < (NSEC_PER_SEC / HZ) >> 1;
-}
-
 #endif /* CONFIG_NO_HZ */
 
 static u64 sched_avg_period(void)
@@ -1652,7 +1665,7 @@ static void update_shares(struct sched_domain *sd)
 	if (root_task_group_empty())
 		return;
 
-	now = cpu_clock(raw_smp_processor_id());
+	now = local_clock();
 	elapsed = now - sd->last_update;
 
 	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1818,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
+static void update_cpu_load(struct rq *this_rq);
 
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
@@ -2267,11 +2281,55 @@ static void update_avg(u64 *avg, u64 sample)
 }
 #endif
 
-/***
+static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
+				 bool is_sync, bool is_migrate, bool is_local,
+				 unsigned long en_flags)
+{
+	schedstat_inc(p, se.statistics.nr_wakeups);
+	if (is_sync)
+		schedstat_inc(p, se.statistics.nr_wakeups_sync);
+	if (is_migrate)
+		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+	if (is_local)
+		schedstat_inc(p, se.statistics.nr_wakeups_local);
+	else
+		schedstat_inc(p, se.statistics.nr_wakeups_remote);
+
+	activate_task(rq, p, en_flags);
+}
+
+static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
+					int wake_flags, bool success)
+{
+	trace_sched_wakeup(p, success);
+	check_preempt_curr(rq, p, wake_flags);
+
+	p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+	if (p->sched_class->task_woken)
+		p->sched_class->task_woken(rq, p);
+
+	if (unlikely(rq->idle_stamp)) {
+		u64 delta = rq->clock - rq->idle_stamp;
+		u64 max = 2*sysctl_sched_migration_cost;
+
+		if (delta > max)
+			rq->avg_idle = max;
+		else
+			update_avg(&rq->avg_idle, delta);
+		rq->idle_stamp = 0;
+	}
+#endif
+	/* if a worker is waking up, notify workqueue */
+	if ((p->flags & PF_WQ_WORKER) && success)
+		wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/**
  * try_to_wake_up - wake up a thread
- * @p: the to-be-woken-up thread
+ * @p: the thread to be awakened
  * @state: the mask of task states that can be woken
- * @sync: do a synchronous wakeup?
+ * @wake_flags: wake modifier flags (WF_*)
  *
  * Put it on the run-queue if it's not already there. The "current"
  * thread is always on the run-queue (except when the actual
@@ -2279,7 +2337,8 @@ static void update_avg(u64 *avg, u64 sample)
  * the simpler "current->state = TASK_RUNNING" to mark yourself
  * runnable without the overhead of this.
  *
- * returns failure only if the task is already active.
+ * Returns %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
  */
 static int try_to_wake_up(struct task_struct *p, unsigned int state,
 			  int wake_flags)
@@ -2359,38 +2418,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 
 out_activate:
 #endif /* CONFIG_SMP */
-	schedstat_inc(p, se.statistics.nr_wakeups);
-	if (wake_flags & WF_SYNC)
-		schedstat_inc(p, se.statistics.nr_wakeups_sync);
-	if (orig_cpu != cpu)
-		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-	if (cpu == this_cpu)
-		schedstat_inc(p, se.statistics.nr_wakeups_local);
-	else
-		schedstat_inc(p, se.statistics.nr_wakeups_remote);
-	activate_task(rq, p, en_flags);
+	ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
+		      cpu == this_cpu, en_flags);
 	success = 1;
-
 out_running:
-	trace_sched_wakeup(p, success);
-	check_preempt_curr(rq, p, wake_flags);
-
-	p->state = TASK_RUNNING;
-#ifdef CONFIG_SMP
-	if (p->sched_class->task_woken)
-		p->sched_class->task_woken(rq, p);
-
-	if (unlikely(rq->idle_stamp)) {
-		u64 delta = rq->clock - rq->idle_stamp;
-		u64 max = 2*sysctl_sched_migration_cost;
-
-		if (delta > max)
-			rq->avg_idle = max;
-		else
-			update_avg(&rq->avg_idle, delta);
-		rq->idle_stamp = 0;
-	}
-#endif
+	ttwu_post_activation(p, rq, wake_flags, success);
 out:
 	task_rq_unlock(rq, &flags);
 	put_cpu();
@@ -2399,6 +2431,37 @@ out:
 }
 
 /**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there.  The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task.  this_rq() stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+	bool success = false;
+
+	BUG_ON(rq != this_rq());
+	BUG_ON(p == current);
+	lockdep_assert_held(&rq->lock);
+
+	if (!(p->state & TASK_NORMAL))
+		return;
+
+	if (!p->se.on_rq) {
+		if (likely(!task_running(rq, p))) {
+			schedstat_inc(rq, ttwu_count);
+			schedstat_inc(rq, ttwu_local);
+		}
+		ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
+		success = true;
+	}
+	ttwu_post_activation(p, rq, 0, success);
+}
+
+/**
  * wake_up_process - Wake up a specific process
  * @p: The process to be woken up.
  *
@@ -3012,23 +3075,102 @@ static void calc_load_account_active(struct rq *this_rq)
 }
 
 /*
+ * The exact cpuload at various idx values, calculated at every tick would be
+ * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
+ *
+ * If a cpu misses updates for n-1 ticks (as it was idle) and update gets
+ * called on the nth tick when cpu may be busy, then we have:
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
+ *
+ * decay_load_missed() below does efficient calculation of
+ * load = ((2^idx - 1) / 2^idx)^(n-1) * load
+ * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
+ *
+ * The calculation is approximated on a 128 point scale.
+ * degrade_zero_ticks is the number of ticks after which load at any
+ * particular idx is approximated to be zero.
+ * degrade_factor is a precomputed table, a row for each load idx.
+ * Each column corresponds to degradation factor for a power of two ticks,
+ * based on 128 point scale.
+ * Example:
+ * row 2, col 3 (=12) says that the degradation at load idx 2 after
+ * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
+ *
+ * With this power of 2 load factors, we can degrade the load n times
+ * by looking at 1 bits in n and doing as many mult/shift instead of
+ * n mult/shifts needed by the exact degradation.
+ */
+#define DEGRADE_SHIFT		7
+static const unsigned char
+		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
+static const unsigned char
+		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
+					{0, 0, 0, 0, 0, 0, 0, 0},
+					{64, 32, 8, 0, 0, 0, 0, 0},
+					{96, 72, 40, 12, 1, 0, 0},
+					{112, 98, 75, 43, 15, 1, 0},
+					{120, 112, 98, 76, 45, 16, 2} };
+
+/*
+ * Update cpu_load for any missed ticks, due to tickless idle. The backlog
+ * would be when CPU is idle and so we just decay the old load without
+ * adding any new load.
+ */
+static unsigned long
+decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
+{
+	int j = 0;
+
+	if (!missed_updates)
+		return load;
+
+	if (missed_updates >= degrade_zero_ticks[idx])
+		return 0;
+
+	if (idx == 1)
+		return load >> missed_updates;
+
+	while (missed_updates) {
+		if (missed_updates % 2)
+			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
+
+		missed_updates >>= 1;
+		j++;
+	}
+	return load;
+}
+
+/*
  * Update rq->cpu_load[] statistics. This function is usually called every
- * scheduler tick (TICK_NSEC).
+ * scheduler tick (TICK_NSEC). With tickless idle this will not be called
+ * every tick. We fix it up based on jiffies.
  */
 static void update_cpu_load(struct rq *this_rq)
 {
 	unsigned long this_load = this_rq->load.weight;
+	unsigned long curr_jiffies = jiffies;
+	unsigned long pending_updates;
 	int i, scale;
 
 	this_rq->nr_load_updates++;
 
+	/* Avoid repeated calls on same jiffy, when moving in and out of idle */
+	if (curr_jiffies == this_rq->last_load_update_tick)
+		return;
+
+	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
+	this_rq->last_load_update_tick = curr_jiffies;
+
 	/* Update our load: */
-	for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
+	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
+	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
 		unsigned long old_load, new_load;
 
 		/* scale is effectively 1 << i now, and >> i divides by scale */
 
 		old_load = this_rq->cpu_load[i];
+		old_load = decay_load_missed(old_load, pending_updates - 1, i);
 		new_load = this_load;
 		/*
 		 * Round up the averaging division if load is increasing. This
@@ -3036,9 +3178,15 @@ static void update_cpu_load(struct rq *this_rq)
 		 * example.
 		 */
 		if (new_load > old_load)
-			new_load += scale-1;
-		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
+			new_load += scale - 1;
+
+		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
 	}
+}
+
+static void update_cpu_load_active(struct rq *this_rq)
+{
+	update_cpu_load(this_rq);
 
 	calc_load_account_active(this_rq);
 }
@@ -3426,7 +3574,7 @@ void scheduler_tick(void)
 
 	raw_spin_lock(&rq->lock);
 	update_rq_clock(rq);
-	update_cpu_load(rq);
+	update_cpu_load_active(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
 	raw_spin_unlock(&rq->lock);
 
@@ -3598,7 +3746,6 @@ need_resched:
 	rq = cpu_rq(cpu);
 	rcu_note_context_switch(cpu);
 	prev = rq->curr;
-	switch_count = &prev->nivcsw;
 
 	release_kernel_lock(prev);
 need_resched_nonpreemptible:
@@ -3611,11 +3758,26 @@ need_resched_nonpreemptible:
 	raw_spin_lock_irq(&rq->lock);
 	clear_tsk_need_resched(prev);
 
+	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-		if (unlikely(signal_pending_state(prev->state, prev)))
+		if (unlikely(signal_pending_state(prev->state, prev))) {
 			prev->state = TASK_RUNNING;
-		else
+		} else {
+			/*
+			 * If a worker is going to sleep, notify and
+			 * ask workqueue whether it wants to wake up a
+			 * task to maintain concurrency.  If so, wake
+			 * up the task.
+			 */
+			if (prev->flags & PF_WQ_WORKER) {
+				struct task_struct *to_wakeup;
+
+				to_wakeup = wq_worker_sleeping(prev, cpu);
+				if (to_wakeup)
+					try_to_wake_up_local(to_wakeup);
+			}
 			deactivate_task(rq, prev, DEQUEUE_SLEEP);
+		}
 		switch_count = &prev->nvcsw;
 	}
 
@@ -3637,8 +3799,10 @@ need_resched_nonpreemptible:
 
 	context_switch(rq, prev, next); /* unlocks the rq */
 	/*
-	 * the context switch might have flipped the stack from under
-	 * us, hence refresh the local variables.
+	 * The context switch has flipped the stack from under us
+	 * and restored the local variables which were saved when
+	 * this task called schedule() in the past. prev == current
+	 * is still correct, but it can be moved to another cpu/rq.
 	 */
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
@@ -3647,11 +3811,8 @@ need_resched_nonpreemptible:
 
 	post_schedule(rq);
 
-	if (unlikely(reacquire_kernel_lock(current) < 0)) {
-		prev = rq->curr;
-		switch_count = &prev->nivcsw;
+	if (unlikely(reacquire_kernel_lock(prev)))
 		goto need_resched_nonpreemptible;
-	}
 
 	preempt_enable_no_resched();
 	if (need_resched())
@@ -4441,12 +4602,8 @@ recheck:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
 		if (rt_policy(policy)) {
-			unsigned long rlim_rtprio;
-
-			if (!lock_task_sighand(p, &flags))
-				return -ESRCH;
-			rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
-			unlock_task_sighand(p, &flags);
+			unsigned long rlim_rtprio =
+					task_rlimit(p, RLIMIT_RTPRIO);
 
 			/* can't set/change the rt policy */
 			if (policy != p->policy && !rlim_rtprio)
@@ -5816,20 +5973,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
-	.priority = 10
+	.priority = CPU_PRI_MIGRATION,
 };
 
+static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
+				      unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		set_cpu_active((long)hcpu, true);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
+static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
+					unsigned long action, void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		set_cpu_active((long)hcpu, false);
+		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
+
 static int __init migration_init(void)
 {
 	void *cpu = (void *)(long)smp_processor_id();
 	int err;
 
-	/* Start one for the boot CPU: */
+	/* Initialize migration for the boot CPU */
 	err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
 	BUG_ON(err == NOTIFY_BAD);
 	migration_call(&migration_notifier, CPU_ONLINE, cpu);
 	register_cpu_notifier(&migration_notifier);
 
+	/* Register cpu active notifiers */
+	cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
+	cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
+
 	return 0;
 }
 early_initcall(migration_init);
@@ -6064,23 +6250,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 	free_rootdomain(old_rd);
 }
 
-static int init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd)
 {
-	gfp_t gfp = GFP_KERNEL;
-
 	memset(rd, 0, sizeof(*rd));
 
-	if (bootmem)
-		gfp = GFP_NOWAIT;
-
-	if (!alloc_cpumask_var(&rd->span, gfp))
+	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
 		goto out;
-	if (!alloc_cpumask_var(&rd->online, gfp))
+	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
+	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
 		goto free_online;
 
-	if (cpupri_init(&rd->cpupri, bootmem) != 0)
+	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_rto_mask;
 	return 0;
 
@@ -6096,7 +6277,7 @@ out:
 
 static void init_defrootdomain(void)
 {
-	init_rootdomain(&def_root_domain, true);
+	init_rootdomain(&def_root_domain);
 
 	atomic_set(&def_root_domain.refcount, 1);
 }
@@ -6109,7 +6290,7 @@ static struct root_domain *alloc_rootdomain(void)
 	if (!rd)
 		return NULL;
 
-	if (init_rootdomain(rd, false) != 0) {
+	if (init_rootdomain(rd) != 0) {
 		kfree(rd);
 		return NULL;
 	}
@@ -7288,29 +7469,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
 }
 #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
 
-#ifndef CONFIG_CPUSETS
 /*
- * Add online and remove offline CPUs from the scheduler domains.
- * When cpusets are enabled they take over this function.
+ * Update cpusets according to cpu_active mask.  If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
  */
-static int update_sched_domains(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
+static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
+			     void *hcpu)
 {
-	switch (action) {
+	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
 	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-		partition_sched_domains(1, NULL, NULL);
+		cpuset_update_active_cpus();
 		return NOTIFY_OK;
+	default:
+		return NOTIFY_DONE;
+	}
+}
 
+static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
+			       void *hcpu)
+{
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_DOWN_PREPARE:
+		cpuset_update_active_cpus();
+		return NOTIFY_OK;
 	default:
 		return NOTIFY_DONE;
 	}
 }
-#endif
 
 static int update_runtime(struct notifier_block *nfb,
 			  unsigned long action, void *hcpu)
@@ -7356,10 +7543,8 @@ void __init sched_init_smp(void)
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
 
-#ifndef CONFIG_CPUSETS
-	/* XXX: Theoretical race here - CPU may be hotplugged now */
-	hotcpu_notifier(update_sched_domains, 0);
-#endif
+	hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
+	hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
 
 	/* RT runtime code needs to handle some hotplug events */
 	hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +7789,9 @@ void __init sched_init(void)
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
+
+		rq->last_load_update_tick = jiffies;
+
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->rd = NULL;
@@ -7617,6 +7805,10 @@ void __init sched_init(void)
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
 		rq_attach_root(rq, &def_root_domain);
+#ifdef CONFIG_NO_HZ
+		rq->nohz_balance_kick = 0;
+		init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
+#endif
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +7853,11 @@ void __init sched_init(void)
 	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
+	atomic_set(&nohz.load_balancer, nr_cpu_ids);
+	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
+	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
 #endif
 	/* May be allocated at isolcpus cmdline parse time */
 	if (cpu_isolated_map == NULL)
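The degrade_factor table drives decay_load_missed() above: to decay a load
value across n missed ticks it multiplies by one precomputed factor per set
bit of n, instead of n individual (2^idx - 1)/2^idx multiplications. A
self-contained user-space sketch of the same computation, assuming the table
values quoted in the hunk:

	#include <stdio.h>

	#define DEGRADE_SHIFT 7
	static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
		{0, 0, 0, 0, 0, 0, 0, 0},
		{64, 32, 8, 0, 0, 0, 0, 0},
		{96, 72, 40, 12, 1, 0, 0},
		{112, 98, 75, 43, 15, 1, 0},
		{120, 112, 98, 76, 45, 16, 2},
	};

	static unsigned long decay(unsigned long load, unsigned long missed, int idx)
	{
		int j = 0;

		if (idx == 1)
			return load >> missed;	/* (1/2)^missed exactly */
		while (missed) {
			if (missed & 1)
				load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
			missed >>= 1;
			j++;
		}
		return load;
	}

	int main(void)
	{
		/* idx 2 decays by 3/4 per tick: 1024 * (3/4)^8 is about 102;
		 * the table approximates this as 1024 * 12/128 = 96 */
		printf("%lu\n", decay(1024, 8, 2));
		return 0;
	}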
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 906a0f718cb3..52f1a149bfb1 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -10,19 +10,55 @@
10 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 *
14 * - gtod 14 * What:
15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
18 * is monotonic for constant i. The timestamp returned is in nanoseconds.
19 *
20 * ######################### BIG FAT WARNING ##########################
21 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
22 * # go backwards !! #
23 * ####################################################################
24 *
25 * There is no strict promise about the base, although it tends to start
26 * at 0 on boot (but people really shouldn't rely on that).
27 *
28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
30 * local_clock() -- is cpu_clock() on the current cpu.
31 *
32 * How:
33 *
34 * The implementation either uses sched_clock() when
35 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
36 * sched_clock() is assumed to provide these properties (mostly it means
37 * the architecture provides a globally synchronized highres time source).
38 *
39 * Otherwise it tries to create a semi stable clock from a mixture of other
40 * clocks, including:
41 *
42 * - GTOD (clock monotomic)
15 * - sched_clock() 43 * - sched_clock()
16 * - explicit idle events 44 * - explicit idle events
17 * 45 *
18 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 46 * We use GTOD as base and use sched_clock() deltas to improve resolution. The
19 * making it monotonic and keeping it within an expected window. 47 * deltas are filtered to provide monotonicity and keeping it within an
48 * expected window.
20 * 49 *
21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 50 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
22 * that is otherwise invisible (TSC gets stopped). 51 * that is otherwise invisible (TSC gets stopped).
23 * 52 *
24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 53 *
25 * consistent between cpus (never more than 2 jiffies difference). 54 * Notes:
55 *
56 * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
57 * like cpufreq interrupts that can change the base clock (TSC) multiplier
58 * and cause funny jumps in time -- although the filtering provided by
 59 * sched_clock_cpu() should mitigate serious artifacts, we cannot rely on it
60 * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
61 * sched_clock().
26 */ 62 */
27#include <linux/spinlock.h> 63#include <linux/spinlock.h>
28#include <linux/hardirq.h> 64#include <linux/hardirq.h>
@@ -170,6 +206,11 @@ again:
170 return val; 206 return val;
171} 207}
172 208
209/*
210 * Similar to cpu_clock(), but requires local IRQs to be disabled.
211 *
212 * See cpu_clock().
213 */
173u64 sched_clock_cpu(int cpu) 214u64 sched_clock_cpu(int cpu)
174{ 215{
175 struct sched_clock_data *scd; 216 struct sched_clock_data *scd;
@@ -237,9 +278,19 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
237} 278}
238EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 279EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
239 280
240unsigned long long cpu_clock(int cpu) 281/*
282 * As outlined at the top, provides a fast, high resolution, nanosecond
283 * time source that is monotonic per cpu argument and has bounded drift
284 * between cpus.
285 *
286 * ######################### BIG FAT WARNING ##########################
287 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
288 * # go backwards !! #
289 * ####################################################################
290 */
291u64 cpu_clock(int cpu)
241{ 292{
242 unsigned long long clock; 293 u64 clock;
243 unsigned long flags; 294 unsigned long flags;
244 295
245 local_irq_save(flags); 296 local_irq_save(flags);
@@ -249,6 +300,25 @@ unsigned long long cpu_clock(int cpu)
249 return clock; 300 return clock;
250} 301}
251 302
303/*
304 * Similar to cpu_clock() for the current cpu. Time will only be observed
 305 * to be monotonic if care is taken to only compare timestamps taken on the
306 * same CPU.
307 *
308 * See cpu_clock().
309 */
310u64 local_clock(void)
311{
312 u64 clock;
313 unsigned long flags;
314
315 local_irq_save(flags);
316 clock = sched_clock_cpu(smp_processor_id());
317 local_irq_restore(flags);
318
319 return clock;
320}
321
252#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 322#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
253 323
254void sched_clock_init(void) 324void sched_clock_init(void)
@@ -264,12 +334,17 @@ u64 sched_clock_cpu(int cpu)
264 return sched_clock(); 334 return sched_clock();
265} 335}
266 336
267 337u64 cpu_clock(int cpu)
268unsigned long long cpu_clock(int cpu)
269{ 338{
270 return sched_clock_cpu(cpu); 339 return sched_clock_cpu(cpu);
271} 340}
272 341
342u64 local_clock(void)
343{
344 return sched_clock_cpu(0);
345}
346
273#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 347#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
274 348
275EXPORT_SYMBOL_GPL(cpu_clock); 349EXPORT_SYMBOL_GPL(cpu_clock);
350EXPORT_SYMBOL_GPL(local_clock);
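The rewritten comment above pins down the cpu_clock()/local_clock() contract: monotonic for a fixed cpu argument, usable from any context, but comparisons across CPUs may observe time going backwards. A minimal standalone C sketch of the clamping idea the comment describes; the window size, base values, and deltas below are invented for illustration and this is not the kernel implementation:

#include <stdio.h>
#include <stdint.h>

#define WINDOW_NS 2000000ULL            /* illustrative ~2ms tick window */

static uint64_t last_published;         /* last value handed out */

/*
 * Clamp base+delta into the expected window around the stable base,
 * then enforce monotonicity against the last published value.
 */
static uint64_t filter_clock(uint64_t base, uint64_t raw_delta)
{
        uint64_t clock = base + raw_delta;

        if (clock > base + WINDOW_NS)
                clock = base + WINDOW_NS;
        if (clock < last_published)
                clock = last_published;

        return last_published = clock;
}

int main(void)
{
        /* raw deltas jump around, e.g. after a TSC multiplier change */
        uint64_t deltas[] = { 100000, 5000000, 40000, 900000 };
        uint64_t base = 1000000000ULL;
        int i;

        for (i = 0; i < 4; i++, base += 1000000)
                printf("%llu\n",
                       (unsigned long long)filter_clock(base, deltas[i]));
        return 0;
}

The kernel's version keeps equivalent state per cpu and resyncs the base against GTOD on the tick; the sketch shows only the filtering step.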
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index e6871cb3fc83..2722dc1b4138 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -166,14 +166,10 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
166 * 166 *
167 * Returns: -ENOMEM if memory fails. 167 * Returns: -ENOMEM if memory fails.
168 */ 168 */
169int cpupri_init(struct cpupri *cp, bool bootmem) 169int cpupri_init(struct cpupri *cp)
170{ 170{
171 gfp_t gfp = GFP_KERNEL;
172 int i; 171 int i;
173 172
174 if (bootmem)
175 gfp = GFP_NOWAIT;
176
177 memset(cp, 0, sizeof(*cp)); 173 memset(cp, 0, sizeof(*cp));
178 174
179 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -181,7 +177,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
181 177
182 raw_spin_lock_init(&vec->lock); 178 raw_spin_lock_init(&vec->lock);
183 vec->count = 0; 179 vec->count = 0;
184 if (!zalloc_cpumask_var(&vec->mask, gfp)) 180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
185 goto cleanup; 181 goto cleanup;
186 } 182 }
187 183
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 7cb5bb6b95be..9fc7d386fea4 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -27,7 +27,7 @@ struct cpupri {
27int cpupri_find(struct cpupri *cp, 27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, struct cpumask *lowest_mask); 28 struct task_struct *p, struct cpumask *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri); 29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30int cpupri_init(struct cpupri *cp, bool bootmem); 30int cpupri_init(struct cpupri *cp);
31void cpupri_cleanup(struct cpupri *cp); 31void cpupri_cleanup(struct cpupri *cp);
32#else 32#else
33#define cpupri_set(cp, cpu, pri) do { } while (0) 33#define cpupri_set(cp, cpu, pri) do { } while (0)
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 35565395d00d..2e1b0d17dd9b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -332,7 +332,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
332 PN(sysctl_sched_latency); 332 PN(sysctl_sched_latency);
333 PN(sysctl_sched_min_granularity); 333 PN(sysctl_sched_min_granularity);
334 PN(sysctl_sched_wakeup_granularity); 334 PN(sysctl_sched_wakeup_granularity);
335 PN(sysctl_sched_child_runs_first); 335 P(sysctl_sched_child_runs_first);
336 P(sysctl_sched_features); 336 P(sysctl_sched_features);
337#undef PN 337#undef PN
338#undef P 338#undef P
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..806d1b227a21 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -2287,13 +2287,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2287 unsigned long power = SCHED_LOAD_SCALE; 2287 unsigned long power = SCHED_LOAD_SCALE;
2288 struct sched_group *sdg = sd->groups; 2288 struct sched_group *sdg = sd->groups;
2289 2289
2290 if (sched_feat(ARCH_POWER))
2291 power *= arch_scale_freq_power(sd, cpu);
2292 else
2293 power *= default_scale_freq_power(sd, cpu);
2294
2295 power >>= SCHED_LOAD_SHIFT;
2296
2297 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { 2290 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2298 if (sched_feat(ARCH_POWER)) 2291 if (sched_feat(ARCH_POWER))
2299 power *= arch_scale_smt_power(sd, cpu); 2292 power *= arch_scale_smt_power(sd, cpu);
@@ -2303,6 +2296,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2303 power >>= SCHED_LOAD_SHIFT; 2296 power >>= SCHED_LOAD_SHIFT;
2304 } 2297 }
2305 2298
2299 sdg->cpu_power_orig = power;
2300
2301 if (sched_feat(ARCH_POWER))
2302 power *= arch_scale_freq_power(sd, cpu);
2303 else
2304 power *= default_scale_freq_power(sd, cpu);
2305
2306 power >>= SCHED_LOAD_SHIFT;
2307
2306 power *= scale_rt_power(cpu); 2308 power *= scale_rt_power(cpu);
2307 power >>= SCHED_LOAD_SHIFT; 2309 power >>= SCHED_LOAD_SHIFT;
2308 2310
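The reorder above captures cpu_power_orig right after SMT scaling so that fix_small_capacity() (added below) has a baseline to compare the fully scaled power against. The chain is fixed-point arithmetic in SCHED_LOAD_SCALE units; a standalone sketch with invented scale factors, not actual kernel values:

#include <stdio.h>

#define SCHED_LOAD_SHIFT 10
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

int main(void)
{
        unsigned long power = SCHED_LOAD_SCALE;

        power = power * 589 >> SCHED_LOAD_SHIFT;   /* SMT scaling (~57%), made up */
        unsigned long power_orig = power;          /* saved as cpu_power_orig */

        power = power * 1024 >> SCHED_LOAD_SHIFT;  /* freq scaling (100%) */
        power = power * 921 >> SCHED_LOAD_SHIFT;   /* RT pressure (~90%), made up */

        printf("orig=%lu final=%lu\n", power_orig, power);  /* orig=589 final=529 */
        return 0;
}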
@@ -2335,6 +2337,31 @@ static void update_group_power(struct sched_domain *sd, int cpu)
2335 sdg->cpu_power = power; 2337 sdg->cpu_power = power;
2336} 2338}
2337 2339
2340/*
2341 * Try and fix up capacity for tiny siblings; this is needed when
2342 * things like SD_ASYM_PACKING need f_b_g to select another sibling
2343 * which on its own isn't powerful enough.
2344 *
2345 * See update_sd_pick_busiest() and check_asym_packing().
2346 */
2347static inline int
2348fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
2349{
2350 /*
2351 * Only siblings can have significantly less than SCHED_LOAD_SCALE
2352 */
2353 if (sd->level != SD_LV_SIBLING)
2354 return 0;
2355
2356 /*
2357 * If ~90% of the cpu_power is still there, we're good.
2358 */
2359 if (group->cpu_power * 32 > group->cpu_power_orig * 29)
2360 return 1;
2361
2362 return 0;
2363}
2364
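The 32/29 comparison above is a shift-friendly encoding of the ~90% threshold: the capacity fixup applies only while cpu_power is still above 29/32 (about 90.6%) of cpu_power_orig. Checked standalone with a few made-up power values:

#include <stdio.h>

int main(void)
{
        unsigned long power_orig = 1024;        /* SCHED_LOAD_SCALE */
        unsigned long powers[] = { 1024, 950, 929, 928, 512 };
        int i;

        /* prints yes, yes, yes, no, no: the cutoff sits at 929/1024 */
        for (i = 0; i < 5; i++)
                printf("power=%4lu -> fixup: %s\n", powers[i],
                       powers[i] * 32 > power_orig * 29 ? "yes" : "no");
        return 0;
}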
2338/** 2365/**
2339 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 2366 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2340 * @sd: The sched_domain whose statistics are to be updated. 2367 * @sd: The sched_domain whose statistics are to be updated.
@@ -2400,14 +2427,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2400 * domains. In the newly idle case, we will allow all the cpu's 2427 * domains. In the newly idle case, we will allow all the cpu's
2401 * to do the newly idle load balance. 2428 * to do the newly idle load balance.
2402 */ 2429 */
2403 if (idle != CPU_NEWLY_IDLE && local_group && 2430 if (idle != CPU_NEWLY_IDLE && local_group) {
2404 balance_cpu != this_cpu) { 2431 if (balance_cpu != this_cpu) {
2405 *balance = 0; 2432 *balance = 0;
2406 return; 2433 return;
2434 }
2435 update_group_power(sd, this_cpu);
2407 } 2436 }
2408 2437
2409 update_group_power(sd, this_cpu);
2410
2411 /* Adjust by relative CPU power of the group */ 2438 /* Adjust by relative CPU power of the group */
2412 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; 2439 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2413 2440
@@ -2428,6 +2455,51 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2428 2455
2429 sgs->group_capacity = 2456 sgs->group_capacity =
2430 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); 2457 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2458 if (!sgs->group_capacity)
2459 sgs->group_capacity = fix_small_capacity(sd, group);
2460}
2461
2462/**
2463 * update_sd_pick_busiest - return 1 on busiest group
2464 * @sd: sched_domain whose statistics are to be checked
2465 * @sds: sched_domain statistics
2466 * @sg: sched_group candidate to be checked for being the busiest
2467 * @sgs: sched_group statistics
2468 * @this_cpu: the current cpu
2469 *
2470 * Determine if @sg is a busier group than the previously selected
2471 * busiest group.
2472 */
2473static bool update_sd_pick_busiest(struct sched_domain *sd,
2474 struct sd_lb_stats *sds,
2475 struct sched_group *sg,
2476 struct sg_lb_stats *sgs,
2477 int this_cpu)
2478{
2479 if (sgs->avg_load <= sds->max_load)
2480 return false;
2481
2482 if (sgs->sum_nr_running > sgs->group_capacity)
2483 return true;
2484
2485 if (sgs->group_imb)
2486 return true;
2487
2488 /*
2489 * ASYM_PACKING needs to move all the work to the lowest
2490 * numbered CPUs in the group; therefore mark all groups
2491 * higher than ourselves as busy.
2492 */
2493 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
2494 this_cpu < group_first_cpu(sg)) {
2495 if (!sds->busiest)
2496 return true;
2497
2498 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
2499 return true;
2500 }
2501
2502 return false;
2431} 2503}
2432 2504
2433/** 2505/**
@@ -2435,7 +2507,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
2435 * @sd: sched_domain whose statistics are to be updated. 2507 * @sd: sched_domain whose statistics are to be updated.
2436 * @this_cpu: Cpu for which load balance is currently performed. 2508 * @this_cpu: Cpu for which load balance is currently performed.
2437 * @idle: Idle status of this_cpu 2509 * @idle: Idle status of this_cpu
2438 * @sd_idle: Idle status of the sched_domain containing group. 2510 * @sd_idle: Idle status of the sched_domain containing sg.
2439 * @cpus: Set of cpus considered for load balancing. 2511 * @cpus: Set of cpus considered for load balancing.
2440 * @balance: Should we balance. 2512 * @balance: Should we balance.
2441 * @sds: variable to hold the statistics for this sched_domain. 2513 * @sds: variable to hold the statistics for this sched_domain.
@@ -2446,7 +2518,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2446 struct sd_lb_stats *sds) 2518 struct sd_lb_stats *sds)
2447{ 2519{
2448 struct sched_domain *child = sd->child; 2520 struct sched_domain *child = sd->child;
2449 struct sched_group *group = sd->groups; 2521 struct sched_group *sg = sd->groups;
2450 struct sg_lb_stats sgs; 2522 struct sg_lb_stats sgs;
2451 int load_idx, prefer_sibling = 0; 2523 int load_idx, prefer_sibling = 0;
2452 2524
@@ -2459,21 +2531,20 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2459 do { 2531 do {
2460 int local_group; 2532 int local_group;
2461 2533
2462 local_group = cpumask_test_cpu(this_cpu, 2534 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
2463 sched_group_cpus(group));
2464 memset(&sgs, 0, sizeof(sgs)); 2535 memset(&sgs, 0, sizeof(sgs));
2465 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, 2536 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
2466 local_group, cpus, balance, &sgs); 2537 local_group, cpus, balance, &sgs);
2467 2538
2468 if (local_group && !(*balance)) 2539 if (local_group && !(*balance))
2469 return; 2540 return;
2470 2541
2471 sds->total_load += sgs.group_load; 2542 sds->total_load += sgs.group_load;
2472 sds->total_pwr += group->cpu_power; 2543 sds->total_pwr += sg->cpu_power;
2473 2544
2474 /* 2545 /*
2475 * In case the child domain prefers tasks go to siblings 2546 * In case the child domain prefers tasks go to siblings
2476 * first, lower the group capacity to one so that we'll try 2547 * first, lower the sg capacity to one so that we'll try
2477 * and move all the excess tasks away. 2548 * and move all the excess tasks away.
2478 */ 2549 */
2479 if (prefer_sibling) 2550 if (prefer_sibling)
@@ -2481,23 +2552,72 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2481 2552
2482 if (local_group) { 2553 if (local_group) {
2483 sds->this_load = sgs.avg_load; 2554 sds->this_load = sgs.avg_load;
2484 sds->this = group; 2555 sds->this = sg;
2485 sds->this_nr_running = sgs.sum_nr_running; 2556 sds->this_nr_running = sgs.sum_nr_running;
2486 sds->this_load_per_task = sgs.sum_weighted_load; 2557 sds->this_load_per_task = sgs.sum_weighted_load;
2487 } else if (sgs.avg_load > sds->max_load && 2558 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
2488 (sgs.sum_nr_running > sgs.group_capacity ||
2489 sgs.group_imb)) {
2490 sds->max_load = sgs.avg_load; 2559 sds->max_load = sgs.avg_load;
2491 sds->busiest = group; 2560 sds->busiest = sg;
2492 sds->busiest_nr_running = sgs.sum_nr_running; 2561 sds->busiest_nr_running = sgs.sum_nr_running;
2493 sds->busiest_group_capacity = sgs.group_capacity; 2562 sds->busiest_group_capacity = sgs.group_capacity;
2494 sds->busiest_load_per_task = sgs.sum_weighted_load; 2563 sds->busiest_load_per_task = sgs.sum_weighted_load;
2495 sds->group_imb = sgs.group_imb; 2564 sds->group_imb = sgs.group_imb;
2496 } 2565 }
2497 2566
2498 update_sd_power_savings_stats(group, sds, local_group, &sgs); 2567 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
2499 group = group->next; 2568 sg = sg->next;
2500 } while (group != sd->groups); 2569 } while (sg != sd->groups);
2570}
2571
2572int __weak arch_sd_sibling_asym_packing(void)
2573{
2574 return 0*SD_ASYM_PACKING;
2575}
2576
2577/**
2578 * check_asym_packing - Check to see if the group is packed into the
2579 * sched domain.
2580 *
2581 * This is primarily intended to be used at the sibling level. Some
2582 * cores like POWER7 prefer to use lower numbered SMT threads. In the
2583 * case of POWER7, it can move to lower SMT modes only when higher
2584 * threads are idle. When in lower SMT modes, the threads will
2585 * perform better since they share fewer core resources. Hence when we
2586 * have idle threads, we want them to be the higher ones.
2587 *
2588 * This packing function is run on idle threads. It checks to see if
2589 * the busiest CPU in this domain (core in the P7 case) has a higher
2590 * CPU number than the packing function is being run on. Here we are
2591 * assuming a lower CPU number will be equivalent to a lower SMT thread
2592 * number.
2593 *
2594 * Returns 1 when packing is required and a task should be moved to
2595 * this CPU. The amount of the imbalance is returned in *imbalance.
2596 *
2597 * @sd: The sched_domain whose packing is to be checked.
2598 * @sds: Statistics of the sched_domain which is to be packed
2599 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2600 * @imbalance: returns amount of imbalance due to packing.
2601 */
2602static int check_asym_packing(struct sched_domain *sd,
2603 struct sd_lb_stats *sds,
2604 int this_cpu, unsigned long *imbalance)
2605{
2606 int busiest_cpu;
2607
2608 if (!(sd->flags & SD_ASYM_PACKING))
2609 return 0;
2610
2611 if (!sds->busiest)
2612 return 0;
2613
2614 busiest_cpu = group_first_cpu(sds->busiest);
2615 if (this_cpu > busiest_cpu)
2616 return 0;
2617
2618 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
2619 SCHED_LOAD_SCALE);
2620 return 1;
2501} 2621}
2502 2622
2503/** 2623/**
@@ -2692,6 +2812,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2692 if (!(*balance)) 2812 if (!(*balance))
2693 goto ret; 2813 goto ret;
2694 2814
2815 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) &&
2816 check_asym_packing(sd, &sds, this_cpu, imbalance))
2817 return sds.busiest;
2818
2695 if (!sds.busiest || sds.busiest_nr_running == 0) 2819 if (!sds.busiest || sds.busiest_nr_running == 0)
2696 goto out_balanced; 2820 goto out_balanced;
2697 2821
@@ -2726,8 +2850,9 @@ ret:
2726 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2850 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2727 */ 2851 */
2728static struct rq * 2852static struct rq *
2729find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, 2853find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
2730 unsigned long imbalance, const struct cpumask *cpus) 2854 enum cpu_idle_type idle, unsigned long imbalance,
2855 const struct cpumask *cpus)
2731{ 2856{
2732 struct rq *busiest = NULL, *rq; 2857 struct rq *busiest = NULL, *rq;
2733 unsigned long max_load = 0; 2858 unsigned long max_load = 0;
@@ -2738,6 +2863,9 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2738 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); 2863 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2739 unsigned long wl; 2864 unsigned long wl;
2740 2865
2866 if (!capacity)
2867 capacity = fix_small_capacity(sd, group);
2868
2741 if (!cpumask_test_cpu(i, cpus)) 2869 if (!cpumask_test_cpu(i, cpus))
2742 continue; 2870 continue;
2743 2871
@@ -2777,9 +2905,19 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2777/* Working cpumask for load_balance and load_balance_newidle. */ 2905/* Working cpumask for load_balance and load_balance_newidle. */
2778static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 2906static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2779 2907
2780static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle) 2908static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
2909 int busiest_cpu, int this_cpu)
2781{ 2910{
2782 if (idle == CPU_NEWLY_IDLE) { 2911 if (idle == CPU_NEWLY_IDLE) {
2912
2913 /*
2914 * ASYM_PACKING needs to force migrate tasks from busy but
2915 * higher numbered CPUs in order to pack all tasks in the
2916 * lowest numbered CPUs.
2917 */
2918 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu)
2919 return 1;
2920
2783 /* 2921 /*
2784 * The only task running in a non-idle cpu can be moved to this 2922 * The only task running in a non-idle cpu can be moved to this
 2785 * cpu in an attempt to completely free up the other CPU 2923 * cpu in an attempt to completely free up the other CPU
@@ -2854,7 +2992,7 @@ redo:
2854 goto out_balanced; 2992 goto out_balanced;
2855 } 2993 }
2856 2994
2857 busiest = find_busiest_queue(group, idle, imbalance, cpus); 2995 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus);
2858 if (!busiest) { 2996 if (!busiest) {
2859 schedstat_inc(sd, lb_nobusyq[idle]); 2997 schedstat_inc(sd, lb_nobusyq[idle]);
2860 goto out_balanced; 2998 goto out_balanced;
@@ -2898,7 +3036,8 @@ redo:
2898 schedstat_inc(sd, lb_failed[idle]); 3036 schedstat_inc(sd, lb_failed[idle]);
2899 sd->nr_balance_failed++; 3037 sd->nr_balance_failed++;
2900 3038
2901 if (need_active_balance(sd, sd_idle, idle)) { 3039 if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
3040 this_cpu)) {
2902 raw_spin_lock_irqsave(&busiest->lock, flags); 3041 raw_spin_lock_irqsave(&busiest->lock, flags);
2903 3042
2904 /* don't kick the active_load_balance_cpu_stop, 3043 /* don't kick the active_load_balance_cpu_stop,
@@ -3093,13 +3232,40 @@ out_unlock:
3093} 3232}
3094 3233
3095#ifdef CONFIG_NO_HZ 3234#ifdef CONFIG_NO_HZ
3235
3236static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3237
3238static void trigger_sched_softirq(void *data)
3239{
3240 raise_softirq_irqoff(SCHED_SOFTIRQ);
3241}
3242
3243static inline void init_sched_softirq_csd(struct call_single_data *csd)
3244{
3245 csd->func = trigger_sched_softirq;
3246 csd->info = NULL;
3247 csd->flags = 0;
3248 csd->priv = 0;
3249}
3250
3251/*
3252 * idle load balancing details
3253 * - One of the idle CPUs nominates itself as idle load_balancer, while
3254 * entering idle.
3255 * - This idle load balancer CPU will also go into tickless mode when
3256 * it is idle, just like all other idle CPUs
 3257 * - When one of the busy CPUs notices that idle rebalancing may be
 3258 * needed, it will kick the idle load balancer, which then does idle
3259 * load balancing for all the idle CPUs.
3260 */
3096static struct { 3261static struct {
3097 atomic_t load_balancer; 3262 atomic_t load_balancer;
3098 cpumask_var_t cpu_mask; 3263 atomic_t first_pick_cpu;
3099 cpumask_var_t ilb_grp_nohz_mask; 3264 atomic_t second_pick_cpu;
3100} nohz ____cacheline_aligned = { 3265 cpumask_var_t idle_cpus_mask;
3101 .load_balancer = ATOMIC_INIT(-1), 3266 cpumask_var_t grp_idle_mask;
3102}; 3267 unsigned long next_balance; /* in jiffy units */
3268} nohz ____cacheline_aligned;
3103 3269
3104int get_nohz_load_balancer(void) 3270int get_nohz_load_balancer(void)
3105{ 3271{
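Throughout the new nohz code, nr_cpu_ids replaces -1 as the "no such CPU" sentinel, and the ilb role is claimed and released with cmpxchg so that exactly one idle CPU can own it at a time. A userspace C11 sketch of that claim/release pattern; NR_CPUS stands in for nr_cpu_ids and this is not the kernel code:

#include <stdio.h>
#include <stdatomic.h>

#define NR_CPUS 8                               /* stands in for nr_cpu_ids */

static atomic_int load_balancer = NR_CPUS;      /* NR_CPUS means "none" */

/* An idle CPU tries to become the ilb owner; returns 1 on success. */
static int claim_ilb(int cpu)
{
        int none = NR_CPUS;

        return atomic_compare_exchange_strong(&load_balancer, &none, cpu);
}

/* The owner gives the role back when it goes busy or offline. */
static void release_ilb(int cpu)
{
        int me = cpu;

        atomic_compare_exchange_strong(&load_balancer, &me, NR_CPUS);
}

int main(void)
{
        printf("cpu2 claims: %d\n", claim_ilb(2));      /* 1: slot was free */
        printf("cpu5 claims: %d\n", claim_ilb(5));      /* 0: cpu2 owns it  */
        release_ilb(2);
        printf("cpu5 claims: %d\n", claim_ilb(5));      /* 1: free again    */
        return 0;
}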
@@ -3153,17 +3319,17 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3153 */ 3319 */
3154static inline int is_semi_idle_group(struct sched_group *ilb_group) 3320static inline int is_semi_idle_group(struct sched_group *ilb_group)
3155{ 3321{
3156 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, 3322 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
3157 sched_group_cpus(ilb_group)); 3323 sched_group_cpus(ilb_group));
3158 3324
3159 /* 3325 /*
 3160 * A sched_group is semi-idle when it has at least one busy cpu 3326 * A sched_group is semi-idle when it has at least one busy cpu
 3161 * and at least one idle cpu. 3327 * and at least one idle cpu.
3162 */ 3328 */
3163 if (cpumask_empty(nohz.ilb_grp_nohz_mask)) 3329 if (cpumask_empty(nohz.grp_idle_mask))
3164 return 0; 3330 return 0;
3165 3331
3166 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) 3332 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
3167 return 0; 3333 return 0;
3168 3334
3169 return 1; 3335 return 1;
@@ -3196,7 +3362,7 @@ static int find_new_ilb(int cpu)
3196 * Optimize for the case when we have no idle CPUs or only one 3362 * Optimize for the case when we have no idle CPUs or only one
3197 * idle CPU. Don't walk the sched_domain hierarchy in such cases 3363 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3198 */ 3364 */
3199 if (cpumask_weight(nohz.cpu_mask) < 2) 3365 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
3200 goto out_done; 3366 goto out_done;
3201 3367
3202 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 3368 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
@@ -3204,7 +3370,7 @@ static int find_new_ilb(int cpu)
3204 3370
3205 do { 3371 do {
3206 if (is_semi_idle_group(ilb_group)) 3372 if (is_semi_idle_group(ilb_group))
3207 return cpumask_first(nohz.ilb_grp_nohz_mask); 3373 return cpumask_first(nohz.grp_idle_mask);
3208 3374
3209 ilb_group = ilb_group->next; 3375 ilb_group = ilb_group->next;
3210 3376
@@ -3212,98 +3378,116 @@ static int find_new_ilb(int cpu)
3212 } 3378 }
3213 3379
3214out_done: 3380out_done:
3215 return cpumask_first(nohz.cpu_mask); 3381 return nr_cpu_ids;
3216} 3382}
3217#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 3383#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3218static inline int find_new_ilb(int call_cpu) 3384static inline int find_new_ilb(int call_cpu)
3219{ 3385{
3220 return cpumask_first(nohz.cpu_mask); 3386 return nr_cpu_ids;
3221} 3387}
3222#endif 3388#endif
3223 3389
3224/* 3390/*
3391 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
 3392 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
3393 * CPU (if there is one).
3394 */
3395static void nohz_balancer_kick(int cpu)
3396{
3397 int ilb_cpu;
3398
3399 nohz.next_balance++;
3400
3401 ilb_cpu = get_nohz_load_balancer();
3402
3403 if (ilb_cpu >= nr_cpu_ids) {
3404 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
3405 if (ilb_cpu >= nr_cpu_ids)
3406 return;
3407 }
3408
3409 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3410 struct call_single_data *cp;
3411
3412 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3413 cp = &per_cpu(remote_sched_softirq_cb, cpu);
3414 __smp_call_function_single(ilb_cpu, cp, 0);
3415 }
3416 return;
3417}
3418
3419/*
3225 * This routine will try to nominate the ilb (idle load balancing) 3420 * This routine will try to nominate the ilb (idle load balancing)
3226 * owner among the cpus whose ticks are stopped. ilb owner will do the idle 3421 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3227 * load balancing on behalf of all those cpus. If all the cpus in the system 3422 * load balancing on behalf of all those cpus.
3228 * go into this tickless mode, then there will be no ilb owner (as there is
3229 * no need for one) and all the cpus will sleep till the next wakeup event
3230 * arrives...
3231 *
3232 * For the ilb owner, tick is not stopped. And this tick will be used
3233 * for idle load balancing. ilb owner will still be part of
3234 * nohz.cpu_mask..
3235 * 3423 *
 3236 * While stopping the tick, this cpu will become the ilb owner if there 3424 * When the ilb owner becomes busy, we will not have a new ilb owner until some
3237 * is no other owner. And will be the owner till that cpu becomes busy 3425 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
3238 * or if all cpus in the system stop their ticks at which point 3426 * idle load balancing by kicking one of the idle CPUs.
3239 * there is no need for ilb owner.
3240 * 3427 *
 3241 * When the ilb owner becomes busy, it nominates another owner, during the 3428 * Ticks are stopped for the ilb owner as well, with a busy CPU kicking this
 3242 * next busy scheduler_tick() 3429 * ilb owner CPU in the future (when there is a need for idle load balancing on
3430 * behalf of all idle CPUs).
3243 */ 3431 */
3244int select_nohz_load_balancer(int stop_tick) 3432void select_nohz_load_balancer(int stop_tick)
3245{ 3433{
3246 int cpu = smp_processor_id(); 3434 int cpu = smp_processor_id();
3247 3435
3248 if (stop_tick) { 3436 if (stop_tick) {
3249 cpu_rq(cpu)->in_nohz_recently = 1;
3250
3251 if (!cpu_active(cpu)) { 3437 if (!cpu_active(cpu)) {
3252 if (atomic_read(&nohz.load_balancer) != cpu) 3438 if (atomic_read(&nohz.load_balancer) != cpu)
3253 return 0; 3439 return;
3254 3440
3255 /* 3441 /*
3256 * If we are going offline and still the leader, 3442 * If we are going offline and still the leader,
3257 * give up! 3443 * give up!
3258 */ 3444 */
3259 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3445 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3446 nr_cpu_ids) != cpu)
3260 BUG(); 3447 BUG();
3261 3448
3262 return 0; 3449 return;
3263 } 3450 }
3264 3451
3265 cpumask_set_cpu(cpu, nohz.cpu_mask); 3452 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
3266 3453
3267 /* time for ilb owner also to sleep */ 3454 if (atomic_read(&nohz.first_pick_cpu) == cpu)
3268 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) { 3455 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids);
3269 if (atomic_read(&nohz.load_balancer) == cpu) 3456 if (atomic_read(&nohz.second_pick_cpu) == cpu)
3270 atomic_set(&nohz.load_balancer, -1); 3457 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3271 return 0;
3272 }
3273 3458
3274 if (atomic_read(&nohz.load_balancer) == -1) { 3459 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) {
3275 /* make me the ilb owner */
3276 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3277 return 1;
3278 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3279 int new_ilb; 3460 int new_ilb;
3280 3461
3281 if (!(sched_smt_power_savings || 3462 /* make me the ilb owner */
3282 sched_mc_power_savings)) 3463 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids,
3283 return 1; 3464 cpu) != nr_cpu_ids)
3465 return;
3466
3284 /* 3467 /*
3285 * Check to see if there is a more power-efficient 3468 * Check to see if there is a more power-efficient
3286 * ilb. 3469 * ilb.
3287 */ 3470 */
3288 new_ilb = find_new_ilb(cpu); 3471 new_ilb = find_new_ilb(cpu);
3289 if (new_ilb < nr_cpu_ids && new_ilb != cpu) { 3472 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3290 atomic_set(&nohz.load_balancer, -1); 3473 atomic_set(&nohz.load_balancer, nr_cpu_ids);
3291 resched_cpu(new_ilb); 3474 resched_cpu(new_ilb);
3292 return 0; 3475 return;
3293 } 3476 }
3294 return 1; 3477 return;
3295 } 3478 }
3296 } else { 3479 } else {
3297 if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) 3480 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
3298 return 0; 3481 return;
3299 3482
3300 cpumask_clear_cpu(cpu, nohz.cpu_mask); 3483 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
3301 3484
3302 if (atomic_read(&nohz.load_balancer) == cpu) 3485 if (atomic_read(&nohz.load_balancer) == cpu)
3303 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3486 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
3487 nr_cpu_ids) != cpu)
3304 BUG(); 3488 BUG();
3305 } 3489 }
3306 return 0; 3490 return;
3307} 3491}
3308#endif 3492#endif
3309 3493
@@ -3385,11 +3569,102 @@ out:
3385 rq->next_balance = next_balance; 3569 rq->next_balance = next_balance;
3386} 3570}
3387 3571
3572#ifdef CONFIG_NO_HZ
3388/* 3573/*
3389 * run_rebalance_domains is triggered when needed from the scheduler tick. 3574 * In CONFIG_NO_HZ case, the idle balance kickee will do the
3390 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3391 * rebalancing for all the cpus for whom scheduler ticks are stopped. 3575 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3392 */ 3576 */
3577static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
3578{
3579 struct rq *this_rq = cpu_rq(this_cpu);
3580 struct rq *rq;
3581 int balance_cpu;
3582
3583 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick)
3584 return;
3585
3586 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
3587 if (balance_cpu == this_cpu)
3588 continue;
3589
3590 /*
3591 * If this cpu gets work to do, stop the load balancing
3592 * work being done for other cpus. Next load
3593 * balancing owner will pick it up.
3594 */
3595 if (need_resched()) {
3596 this_rq->nohz_balance_kick = 0;
3597 break;
3598 }
3599
3600 raw_spin_lock_irq(&this_rq->lock);
3601 update_rq_clock(this_rq);
3602 update_cpu_load(this_rq);
3603 raw_spin_unlock_irq(&this_rq->lock);
3604
3605 rebalance_domains(balance_cpu, CPU_IDLE);
3606
3607 rq = cpu_rq(balance_cpu);
3608 if (time_after(this_rq->next_balance, rq->next_balance))
3609 this_rq->next_balance = rq->next_balance;
3610 }
3611 nohz.next_balance = this_rq->next_balance;
3612 this_rq->nohz_balance_kick = 0;
3613}
3614
3615/*
3616 * Current heuristic for kicking the idle load balancer
 3617 * - first_pick_cpu is one of the busy CPUs. It will kick the
 3618 * idle load balancer when it has more than one process active. This
3619 * eliminates the need for idle load balancing altogether when we have
3620 * only one running process in the system (common case).
 3621 * - If there is more than one busy CPU, the idle load balancer may have
3622 * to run for active_load_balance to happen (i.e., two busy CPUs are
3623 * SMT or core siblings and can run better if they move to different
3624 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
3625 * which will kick idle load balancer as soon as it has any load.
3626 */
3627static inline int nohz_kick_needed(struct rq *rq, int cpu)
3628{
3629 unsigned long now = jiffies;
3630 int ret;
3631 int first_pick_cpu, second_pick_cpu;
3632
3633 if (time_before(now, nohz.next_balance))
3634 return 0;
3635
3636 if (!rq->nr_running)
3637 return 0;
3638
3639 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
3640 second_pick_cpu = atomic_read(&nohz.second_pick_cpu);
3641
3642 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu &&
3643 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
3644 return 0;
3645
3646 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu);
3647 if (ret == nr_cpu_ids || ret == cpu) {
3648 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
3649 if (rq->nr_running > 1)
3650 return 1;
3651 } else {
3652 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu);
3653 if (ret == nr_cpu_ids || ret == cpu) {
3654 if (rq->nr_running)
3655 return 1;
3656 }
3657 }
3658 return 0;
3659}
3660#else
3661static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
3662#endif
3663
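The first_pick/second_pick heuristic above can be exercised standalone: the first busy CPU to claim a slot kicks only when it has more than one runnable task, the second kicks on any load, and further busy CPUs stay quiet. A simplified C11 sketch; the peer-slot reset and the nohz.next_balance time check of the real nohz_kick_needed() are omitted, and NR_CPUS stands in for nr_cpu_ids:

#include <stdio.h>
#include <stdatomic.h>

#define NR_CPUS 8                          /* stands in for nr_cpu_ids */

static atomic_int first_pick = NR_CPUS;    /* NR_CPUS means "unclaimed" */
static atomic_int second_pick = NR_CPUS;

static int kick_needed(int cpu, int nr_running)
{
        int expected = NR_CPUS;

        /* claim (or already own) the first-pick slot */
        if (atomic_compare_exchange_strong(&first_pick, &expected, cpu) ||
            expected == cpu)
                return nr_running > 1;     /* first busy CPU: need 2+ tasks */

        expected = NR_CPUS;
        /* claim (or already own) the second-pick slot */
        if (atomic_compare_exchange_strong(&second_pick, &expected, cpu) ||
            expected == cpu)
                return nr_running > 0;     /* second busy CPU: any load */

        return 0;                          /* everyone else stays quiet */
}

int main(void)
{
        printf("cpu0, 2 tasks: %d\n", kick_needed(0, 2));  /* 1 */
        printf("cpu1, 1 task:  %d\n", kick_needed(1, 1));  /* 1 */
        printf("cpu2, 1 task:  %d\n", kick_needed(2, 1));  /* 0 */
        return 0;
}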
3664/*
3665 * run_rebalance_domains is triggered when needed from the scheduler tick.
 3666 * Also triggered for nohz idle balancing (with nohz_balance_kick set).
3667 */
3393static void run_rebalance_domains(struct softirq_action *h) 3668static void run_rebalance_domains(struct softirq_action *h)
3394{ 3669{
3395 int this_cpu = smp_processor_id(); 3670 int this_cpu = smp_processor_id();
@@ -3399,37 +3674,12 @@ static void run_rebalance_domains(struct softirq_action *h)
3399 3674
3400 rebalance_domains(this_cpu, idle); 3675 rebalance_domains(this_cpu, idle);
3401 3676
3402#ifdef CONFIG_NO_HZ
3403 /* 3677 /*
3404 * If this cpu is the owner for idle load balancing, then do the 3678 * If this cpu has a pending nohz_balance_kick, then do the
3405 * balancing on behalf of the other idle cpus whose ticks are 3679 * balancing on behalf of the other idle cpus whose ticks are
3406 * stopped. 3680 * stopped.
3407 */ 3681 */
3408 if (this_rq->idle_at_tick && 3682 nohz_idle_balance(this_cpu, idle);
3409 atomic_read(&nohz.load_balancer) == this_cpu) {
3410 struct rq *rq;
3411 int balance_cpu;
3412
3413 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3414 if (balance_cpu == this_cpu)
3415 continue;
3416
3417 /*
3418 * If this cpu gets work to do, stop the load balancing
3419 * work being done for other cpus. Next load
3420 * balancing owner will pick it up.
3421 */
3422 if (need_resched())
3423 break;
3424
3425 rebalance_domains(balance_cpu, CPU_IDLE);
3426
3427 rq = cpu_rq(balance_cpu);
3428 if (time_after(this_rq->next_balance, rq->next_balance))
3429 this_rq->next_balance = rq->next_balance;
3430 }
3431 }
3432#endif
3433} 3683}
3434 3684
3435static inline int on_null_domain(int cpu) 3685static inline int on_null_domain(int cpu)
@@ -3439,57 +3689,17 @@ static inline int on_null_domain(int cpu)
3439 3689
3440/* 3690/*
3441 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 3691 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3442 *
3443 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3444 * idle load balancing owner or decide to stop the periodic load balancing,
3445 * if the whole system is idle.
3446 */ 3692 */
3447static inline void trigger_load_balance(struct rq *rq, int cpu) 3693static inline void trigger_load_balance(struct rq *rq, int cpu)
3448{ 3694{
3449#ifdef CONFIG_NO_HZ
3450 /*
3451 * If we were in the nohz mode recently and busy at the current
3452 * scheduler tick, then check if we need to nominate new idle
3453 * load balancer.
3454 */
3455 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3456 rq->in_nohz_recently = 0;
3457
3458 if (atomic_read(&nohz.load_balancer) == cpu) {
3459 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3460 atomic_set(&nohz.load_balancer, -1);
3461 }
3462
3463 if (atomic_read(&nohz.load_balancer) == -1) {
3464 int ilb = find_new_ilb(cpu);
3465
3466 if (ilb < nr_cpu_ids)
3467 resched_cpu(ilb);
3468 }
3469 }
3470
3471 /*
3472 * If this cpu is idle and doing idle load balancing for all the
3473 * cpus with ticks stopped, is it time for that to stop?
3474 */
3475 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3476 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3477 resched_cpu(cpu);
3478 return;
3479 }
3480
3481 /*
3482 * If this cpu is idle and the idle load balancing is done by
3483 * someone else, then no need raise the SCHED_SOFTIRQ
3484 */
3485 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3486 cpumask_test_cpu(cpu, nohz.cpu_mask))
3487 return;
3488#endif
3489 /* Don't need to rebalance while attached to NULL domain */ 3695 /* Don't need to rebalance while attached to NULL domain */
3490 if (time_after_eq(jiffies, rq->next_balance) && 3696 if (time_after_eq(jiffies, rq->next_balance) &&
3491 likely(!on_null_domain(cpu))) 3697 likely(!on_null_domain(cpu)))
3492 raise_softirq(SCHED_SOFTIRQ); 3698 raise_softirq(SCHED_SOFTIRQ);
3699#ifdef CONFIG_NO_HZ
3700 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
3701 nohz_balancer_kick(cpu);
3702#endif
3493} 3703}
3494 3704
3495static void rq_online_fair(struct rq *rq) 3705static void rq_online_fair(struct rq *rq)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8afb953e31c6..d10c80ebb67a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1663,9 +1663,6 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1663{ 1663{
1664 unsigned long soft, hard; 1664 unsigned long soft, hard;
1665 1665
1666 if (!p->signal)
1667 return;
1668
1669 /* max may change after cur was read, this will be fixed next tick */ 1666 /* max may change after cur was read, this will be fixed next tick */
1670 soft = task_rlimit(p, RLIMIT_RTTIME); 1667 soft = task_rlimit(p, RLIMIT_RTTIME);
1671 hard = task_rlimit_max(p, RLIMIT_RTTIME); 1668 hard = task_rlimit_max(p, RLIMIT_RTTIME);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 32d2bd4061b0..25c2f962f6fc 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -295,13 +295,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
295static inline void account_group_user_time(struct task_struct *tsk, 295static inline void account_group_user_time(struct task_struct *tsk,
296 cputime_t cputime) 296 cputime_t cputime)
297{ 297{
298 struct thread_group_cputimer *cputimer; 298 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
299
300 /* tsk == current, ensure it is safe to use ->signal */
301 if (unlikely(tsk->exit_state))
302 return;
303
304 cputimer = &tsk->signal->cputimer;
305 299
306 if (!cputimer->running) 300 if (!cputimer->running)
307 return; 301 return;
@@ -325,13 +319,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
325static inline void account_group_system_time(struct task_struct *tsk, 319static inline void account_group_system_time(struct task_struct *tsk,
326 cputime_t cputime) 320 cputime_t cputime)
327{ 321{
328 struct thread_group_cputimer *cputimer; 322 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
329
330 /* tsk == current, ensure it is safe to use ->signal */
331 if (unlikely(tsk->exit_state))
332 return;
333
334 cputimer = &tsk->signal->cputimer;
335 323
336 if (!cputimer->running) 324 if (!cputimer->running)
337 return; 325 return;
@@ -355,16 +343,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
355static inline void account_group_exec_runtime(struct task_struct *tsk, 343static inline void account_group_exec_runtime(struct task_struct *tsk,
356 unsigned long long ns) 344 unsigned long long ns)
357{ 345{
358 struct thread_group_cputimer *cputimer; 346 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
359 struct signal_struct *sig;
360
361 sig = tsk->signal;
362 /* see __exit_signal()->task_rq_unlock_wait() */
363 barrier();
364 if (unlikely(!sig))
365 return;
366
367 cputimer = &sig->cputimer;
368 347
369 if (!cputimer->running) 348 if (!cputimer->running)
370 return; 349 return;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..021d2f878f19 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
325 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
326 326
327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { 328 arch_needs_cpu(cpu)) {
329 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
330 delta_jiffies = 1; 330 delta_jiffies = 1;
331 } else { 331 } else {
@@ -405,13 +405,7 @@ void tick_nohz_stop_sched_tick(int inidle)
405 * the scheduler tick in nohz_restart_sched_tick. 405 * the scheduler tick in nohz_restart_sched_tick.
406 */ 406 */
407 if (!ts->tick_stopped) { 407 if (!ts->tick_stopped) {
408 if (select_nohz_load_balancer(1)) { 408 select_nohz_load_balancer(1);
409 /*
410 * sched tick not stopped!
411 */
412 cpumask_clear_cpu(cpu, nohz_cpu_mask);
413 goto out;
414 }
415 409
416 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 410 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
417 ts->tick_stopped = 1; 411 ts->tick_stopped = 1;
diff --git a/kernel/timer.c b/kernel/timer.c
index 6aa6f7e69ad5..d61d16da0b64 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -692,12 +692,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
692 cpu = smp_processor_id(); 692 cpu = smp_processor_id();
693 693
694#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 694#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
695 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { 695 if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu))
696 int preferred_cpu = get_nohz_load_balancer(); 696 cpu = get_nohz_timer_target();
697
698 if (preferred_cpu >= 0)
699 cpu = preferred_cpu;
700 }
701#endif 697#endif
702 new_base = per_cpu(tvec_bases, cpu); 698 new_base = per_cpu(tvec_bases, cpu);
703 699
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 52fda6c04ac3..685a67d55db0 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -55,7 +55,7 @@ u64 notrace trace_clock_local(void)
55 */ 55 */
56u64 notrace trace_clock(void) 56u64 notrace trace_clock(void)
57{ 57{
58 return cpu_clock(raw_smp_processor_id()); 58 return local_clock();
59} 59}
60 60
61 61
diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h
new file mode 100644
index 000000000000..af040babb742
--- /dev/null
+++ b/kernel/workqueue_sched.h
@@ -0,0 +1,16 @@
1/*
2 * kernel/workqueue_sched.h
3 *
4 * Scheduler hooks for concurrency managed workqueue. Only to be
5 * included from sched.c and workqueue.c.
6 */
7static inline void wq_worker_waking_up(struct task_struct *task,
8 unsigned int cpu)
9{
10}
11
12static inline struct task_struct *wq_worker_sleeping(struct task_struct *task,
13 unsigned int cpu)
14{
15 return NULL;
16}