Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 13
-rw-r--r--  kernel/bounds.c | 4
-rw-r--r--  kernel/context_tracking.c | 2
-rw-r--r--  kernel/cpu.c | 17
-rw-r--r--  kernel/cpu/idle.c | 16
-rw-r--r--  kernel/events/core.c | 173
-rw-r--r--  kernel/events/internal.h | 35
-rw-r--r--  kernel/events/ring_buffer.c | 126
-rw-r--r--  kernel/events/uprobes.c | 223
-rw-r--r--  kernel/fork.c | 7
-rw-r--r--  kernel/irq/manage.c | 2
-rw-r--r--  kernel/lockdep.c | 4
-rw-r--r--  kernel/mutex.c | 32
-rw-r--r--  kernel/power/hibernate.c | 2
-rw-r--r--  kernel/rcu/Makefile | 6
-rw-r--r--  kernel/rcu/rcu.h (renamed from kernel/rcu.h) | 7
-rw-r--r--  kernel/rcu/srcu.c (renamed from kernel/srcu.c) | 0
-rw-r--r--  kernel/rcu/tiny.c (renamed from kernel/rcutiny.c) | 37
-rw-r--r--  kernel/rcu/tiny_plugin.h (renamed from kernel/rcutiny_plugin.h) | 0
-rw-r--r--  kernel/rcu/torture.c (renamed from kernel/rcutorture.c) | 6
-rw-r--r--  kernel/rcu/tree.c (renamed from kernel/rcutree.c) | 200
-rw-r--r--  kernel/rcu/tree.h (renamed from kernel/rcutree.h) | 2
-rw-r--r--  kernel/rcu/tree_plugin.h (renamed from kernel/rcutree_plugin.h) | 84
-rw-r--r--  kernel/rcu/tree_trace.c (renamed from kernel/rcutree_trace.c) | 2
-rw-r--r--  kernel/rcu/update.c (renamed from kernel/rcupdate.c) | 10
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/completion.c | 299
-rw-r--r--  kernel/sched/core.c | 683
-rw-r--r--  kernel/sched/debug.c | 68
-rw-r--r--  kernel/sched/fair.c | 1397
-rw-r--r--  kernel/sched/features.h | 19
-rw-r--r--  kernel/sched/idle_task.c | 2
-rw-r--r--  kernel/sched/rt.c | 22
-rw-r--r--  kernel/sched/sched.h | 54
-rw-r--r--  kernel/sched/stats.h | 46
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/sched/wait.c (renamed from kernel/wait.c) | 127
-rw-r--r--  kernel/smp.c | 7
-rw-r--r--  kernel/softirq.c | 53
-rw-r--r--  kernel/stop_machine.c | 303
-rw-r--r--  kernel/sysctl.c | 24
-rw-r--r--  kernel/time/Kconfig | 2
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/clockevents.c | 67
-rw-r--r--  kernel/time/clocksource.c | 52
-rw-r--r--  kernel/time/ntp.c | 3
-rw-r--r--  kernel/time/sched_clock.c | 114
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-internal.h | 2
-rw-r--r--  kernel/time/timekeeping.c | 3
-rw-r--r--  kernel/time/timer_stats.c | 8
-rw-r--r--  kernel/timer.c | 8
-rw-r--r--  kernel/trace/trace.c | 3
-rw-r--r--  kernel/trace/trace.h | 1
-rw-r--r--  kernel/trace/trace_event_perf.c | 2
-rw-r--r--  kernel/trace/trace_output.c | 19
56 files changed, 3134 insertions, 1272 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 1ce47553fb02..a4d1aa8da9bc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -6,9 +6,9 @@ obj-y = fork.o exec_domain.o panic.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o task_work.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 extable.o params.o posix-timers.o \
10 kthread.o wait.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o semaphore.o \
12 notifier.o ksysfs.o cred.o reboot.o \ 12 notifier.o ksysfs.o cred.o reboot.o \
13 async.o range.o groups.o lglock.o smpboot.o 13 async.o range.o groups.o lglock.o smpboot.o
14 14
@@ -27,6 +27,7 @@ obj-y += power/
27obj-y += printk/ 27obj-y += printk/
28obj-y += cpu/ 28obj-y += cpu/
29obj-y += irq/ 29obj-y += irq/
30obj-y += rcu/
30 31
31obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o 32obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
32obj-$(CONFIG_FREEZER) += freezer.o 33obj-$(CONFIG_FREEZER) += freezer.o
@@ -81,12 +82,6 @@ obj-$(CONFIG_KGDB) += debug/
81obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 82obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
82obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o 83obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
83obj-$(CONFIG_SECCOMP) += seccomp.o 84obj-$(CONFIG_SECCOMP) += seccomp.o
84obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
85obj-$(CONFIG_TREE_RCU) += rcutree.o
86obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o
87obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
88obj-$(CONFIG_TINY_RCU) += rcutiny.o
89obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o
90obj-$(CONFIG_RELAY) += relay.o 85obj-$(CONFIG_RELAY) += relay.o
91obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 86obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
92obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 87obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..e8ca97b5c386 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,7 @@
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h> 12#include <linux/page_cgroup.h>
13#include <linux/log2.h>
13 14
14void foo(void) 15void foo(void)
15{ 16{
@@ -17,5 +18,8 @@ void foo(void)
17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 18 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 19 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); 20 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
21#ifdef CONFIG_SMP
22 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
23#endif
20 /* End of constants */ 24 /* End of constants */
21} 25}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 859c8dfd78a1..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
120 * instead of preempt_schedule() to exit user context if needed before 120 * instead of preempt_schedule() to exit user context if needed before
121 * calling the scheduler. 121 * calling the scheduler.
122 */ 122 */
123void __sched notrace preempt_schedule_context(void) 123asmlinkage void __sched notrace preempt_schedule_context(void)
124{ 124{
125 enum ctx_state prev_ctx; 125 enum ctx_state prev_ctx;
126 126
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d7f07a2da5a6..63aa50d7ce1e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
308 } 308 }
309 smpboot_park_threads(cpu); 309 smpboot_park_threads(cpu);
310 310
311 /*
312 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
313 * and RCU users of this state to go away such that all new such users
314 * will observe it.
315 *
316 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
317 * not imply sync_sched(), so explicitly call both.
318 */
319#ifdef CONFIG_PREEMPT
320 synchronize_sched();
321#endif
322 synchronize_rcu();
323
324 /*
325 * So now all preempt/rcu users must observe !cpu_active().
326 */
327
311 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 328 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
312 if (err) { 329 if (err) {
313 /* CPU didn't die: tell everyone. Can't complain. */ 330 /* CPU didn't die: tell everyone. Can't complain. */
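Aside: a minimal sketch (not part of the patch) of the reader-side pattern the added synchronize_sched()/synchronize_rcu() pair waits out. Any preempt-disabled or RCU read-side section that sampled cpu_active() before the mask was cleared has finished by the time __stop_machine() runs; the helper name below is invented for illustration.

	/* Illustration only, not from this patch. */
	static bool cpu_active_sample(int cpu)
	{
		bool active;

		preempt_disable();		/* waited on by synchronize_sched() */
		active = cpu_active(cpu);	/* stable for this critical section */
		preempt_enable();		/* or rcu_read_lock()/rcu_read_unlock() */

		return active;
	}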
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
44 rcu_idle_enter(); 44 rcu_idle_enter();
45 trace_cpu_idle_rcuidle(0, smp_processor_id()); 45 trace_cpu_idle_rcuidle(0, smp_processor_id());
46 local_irq_enable(); 46 local_irq_enable();
47 while (!need_resched()) 47 while (!tif_need_resched())
48 cpu_relax(); 48 cpu_relax();
49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
50 rcu_idle_exit(); 50 rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) { 92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
93 cpu_idle_poll(); 93 cpu_idle_poll();
94 } else { 94 } else {
95 current_clr_polling(); 95 if (!current_clr_polling_and_test()) {
96 if (!need_resched()) {
97 stop_critical_timings(); 96 stop_critical_timings();
98 rcu_idle_enter(); 97 rcu_idle_enter();
99 arch_cpu_idle(); 98 arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
103 } else { 102 } else {
104 local_irq_enable(); 103 local_irq_enable();
105 } 104 }
106 current_set_polling(); 105 __current_set_polling();
107 } 106 }
108 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
 110	 * bit here because we might not have sent the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
109 } 115 }
110 tick_nohz_idle_exit(); 116 tick_nohz_idle_exit();
111 schedule_preempt_disabled(); 117 schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
129 */ 135 */
130 boot_init_stack_canary(); 136 boot_init_stack_canary();
131#endif 137#endif
132 current_set_polling(); 138 __current_set_polling();
133 arch_cpu_idle_prepare(); 139 arch_cpu_idle_prepare();
134 cpu_idle_loop(); 140 cpu_idle_loop();
135} 141}
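Aside: the patch replaces the open-coded current_clr_polling(); if (!need_resched()) sequence with current_clr_polling_and_test(), which clears the polling flag and then re-checks NEED_RESCHED under a barrier so a wakeup racing with the clear cannot be missed. A rough sketch of its shape (hedged; see <linux/sched.h> for the real definition):

	/* Sketch only; the real helper lives in <linux/sched.h>. */
	static inline bool clr_polling_and_test_sketch(void)
	{
		__current_clr_polling();
		smp_mb();	/* make the polling clear visible before the re-check */
		return unlikely(tif_need_resched());
	}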
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d49a9d29334c..8c875ef6e120 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
175static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); 175static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
176static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; 176static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
177 177
178static atomic_t perf_sample_allowed_ns __read_mostly = 178static int perf_sample_allowed_ns __read_mostly =
179 ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); 179 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
180 180
181void update_perf_cpu_limits(void) 181void update_perf_cpu_limits(void)
182{ 182{
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
184 184
185 tmp *= sysctl_perf_cpu_time_max_percent; 185 tmp *= sysctl_perf_cpu_time_max_percent;
186 do_div(tmp, 100); 186 do_div(tmp, 100);
187 atomic_set(&perf_sample_allowed_ns, tmp); 187 ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
188} 188}
189 189
190static int perf_rotate_context(struct perf_cpu_context *cpuctx); 190static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
193 void __user *buffer, size_t *lenp, 193 void __user *buffer, size_t *lenp,
194 loff_t *ppos) 194 loff_t *ppos)
195{ 195{
196 int ret = proc_dointvec(table, write, buffer, lenp, ppos); 196 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
197 197
198 if (ret || !write) 198 if (ret || !write)
199 return ret; 199 return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
228 * we detect that events are taking too long. 228 * we detect that events are taking too long.
229 */ 229 */
230#define NR_ACCUMULATED_SAMPLES 128 230#define NR_ACCUMULATED_SAMPLES 128
231DEFINE_PER_CPU(u64, running_sample_length); 231static DEFINE_PER_CPU(u64, running_sample_length);
232 232
233void perf_sample_event_took(u64 sample_len_ns) 233void perf_sample_event_took(u64 sample_len_ns)
234{ 234{
235 u64 avg_local_sample_len; 235 u64 avg_local_sample_len;
236 u64 local_samples_len; 236 u64 local_samples_len;
237 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
237 238
238 if (atomic_read(&perf_sample_allowed_ns) == 0) 239 if (allowed_ns == 0)
239 return; 240 return;
240 241
241 /* decay the counter by 1 average sample */ 242 /* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
251 */ 252 */
252 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 253 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
253 254
254 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) 255 if (avg_local_sample_len <= allowed_ns)
255 return; 256 return;
256 257
257 if (max_samples_per_tick <= 1) 258 if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
262 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; 263 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
263 264
264 printk_ratelimited(KERN_WARNING 265 printk_ratelimited(KERN_WARNING
265 "perf samples too long (%lld > %d), lowering " 266 "perf samples too long (%lld > %lld), lowering "
266 "kernel.perf_event_max_sample_rate to %d\n", 267 "kernel.perf_event_max_sample_rate to %d\n",
267 avg_local_sample_len, 268 avg_local_sample_len, allowed_ns,
268 atomic_read(&perf_sample_allowed_ns),
269 sysctl_perf_event_sample_rate); 269 sysctl_perf_event_sample_rate);
270 270
271 update_perf_cpu_limits(); 271 update_perf_cpu_limits();
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
899 put_ctx(ctx->parent_ctx); 899 put_ctx(ctx->parent_ctx);
900 ctx->parent_ctx = NULL; 900 ctx->parent_ctx = NULL;
901 } 901 }
902 ctx->generation++;
902} 903}
903 904
904static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 905static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1136 ctx->nr_events++; 1137 ctx->nr_events++;
1137 if (event->attr.inherit_stat) 1138 if (event->attr.inherit_stat)
1138 ctx->nr_stat++; 1139 ctx->nr_stat++;
1140
1141 ctx->generation++;
1139} 1142}
1140 1143
1141/* 1144/*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
1201 if (sample_type & PERF_SAMPLE_DATA_SRC) 1204 if (sample_type & PERF_SAMPLE_DATA_SRC)
1202 size += sizeof(data->data_src.val); 1205 size += sizeof(data->data_src.val);
1203 1206
1207 if (sample_type & PERF_SAMPLE_TRANSACTION)
1208 size += sizeof(data->txn);
1209
1204 event->header_size = size; 1210 event->header_size = size;
1205} 1211}
1206 1212
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1310 */ 1316 */
1311 if (event->state > PERF_EVENT_STATE_OFF) 1317 if (event->state > PERF_EVENT_STATE_OFF)
1312 event->state = PERF_EVENT_STATE_OFF; 1318 event->state = PERF_EVENT_STATE_OFF;
1319
1320 ctx->generation++;
1313} 1321}
1314 1322
1315static void perf_group_detach(struct perf_event *event) 1323static void perf_group_detach(struct perf_event *event)
@@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2146} 2154}
2147 2155
2148/* 2156/*
2149 * Test whether two contexts are equivalent, i.e. whether they 2157 * Test whether two contexts are equivalent, i.e. whether they have both been
2150 * have both been cloned from the same version of the same context 2158 * cloned from the same version of the same context.
2151 * and they both have the same number of enabled events. 2159 *
2152 * If the number of enabled events is the same, then the set 2160 * Equivalence is measured using a generation number in the context that is
2153 * of enabled events should be the same, because these are both 2161 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2154 * inherited contexts, therefore we can't access individual events 2162 * and list_del_event().
2155 * in them directly with an fd; we can only enable/disable all
2156 * events via prctl, or enable/disable all events in a family
2157 * via ioctl, which will have the same effect on both contexts.
2158 */ 2163 */
2159static int context_equiv(struct perf_event_context *ctx1, 2164static int context_equiv(struct perf_event_context *ctx1,
2160 struct perf_event_context *ctx2) 2165 struct perf_event_context *ctx2)
2161{ 2166{
2162 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx 2167 /* Pinning disables the swap optimization */
2163 && ctx1->parent_gen == ctx2->parent_gen 2168 if (ctx1->pin_count || ctx2->pin_count)
2164 && !ctx1->pin_count && !ctx2->pin_count; 2169 return 0;
2170
2171 /* If ctx1 is the parent of ctx2 */
2172 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2173 return 1;
2174
2175 /* If ctx2 is the parent of ctx1 */
2176 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2177 return 1;
2178
2179 /*
2180 * If ctx1 and ctx2 have the same parent; we flatten the parent
2181 * hierarchy, see perf_event_init_context().
2182 */
2183 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2184 ctx1->parent_gen == ctx2->parent_gen)
2185 return 1;
2186
2187 /* Unmatched */
2188 return 0;
2165} 2189}
2166 2190
2167static void __perf_event_sync_stat(struct perf_event *event, 2191static void __perf_event_sync_stat(struct perf_event *event,
@@ -2244,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2244{ 2268{
2245 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; 2269 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2246 struct perf_event_context *next_ctx; 2270 struct perf_event_context *next_ctx;
2247 struct perf_event_context *parent; 2271 struct perf_event_context *parent, *next_parent;
2248 struct perf_cpu_context *cpuctx; 2272 struct perf_cpu_context *cpuctx;
2249 int do_switch = 1; 2273 int do_switch = 1;
2250 2274
@@ -2256,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2256 return; 2280 return;
2257 2281
2258 rcu_read_lock(); 2282 rcu_read_lock();
2259 parent = rcu_dereference(ctx->parent_ctx);
2260 next_ctx = next->perf_event_ctxp[ctxn]; 2283 next_ctx = next->perf_event_ctxp[ctxn];
2261 if (parent && next_ctx && 2284 if (!next_ctx)
2262 rcu_dereference(next_ctx->parent_ctx) == parent) { 2285 goto unlock;
2286
2287 parent = rcu_dereference(ctx->parent_ctx);
2288 next_parent = rcu_dereference(next_ctx->parent_ctx);
2289
 2290	 /* If neither context has a parent context, they cannot be clones. */
2291 if (!parent && !next_parent)
2292 goto unlock;
2293
2294 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2263 /* 2295 /*
2264 * Looks like the two contexts are clones, so we might be 2296 * Looks like the two contexts are clones, so we might be
2265 * able to optimize the context switch. We lock both 2297 * able to optimize the context switch. We lock both
@@ -2287,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2287 raw_spin_unlock(&next_ctx->lock); 2319 raw_spin_unlock(&next_ctx->lock);
2288 raw_spin_unlock(&ctx->lock); 2320 raw_spin_unlock(&ctx->lock);
2289 } 2321 }
2322unlock:
2290 rcu_read_unlock(); 2323 rcu_read_unlock();
2291 2324
2292 if (do_switch) { 2325 if (do_switch) {
@@ -4572,6 +4605,9 @@ void perf_output_sample(struct perf_output_handle *handle,
4572 if (sample_type & PERF_SAMPLE_DATA_SRC) 4605 if (sample_type & PERF_SAMPLE_DATA_SRC)
4573 perf_output_put(handle, data->data_src.val); 4606 perf_output_put(handle, data->data_src.val);
4574 4607
4608 if (sample_type & PERF_SAMPLE_TRANSACTION)
4609 perf_output_put(handle, data->txn);
4610
4575 if (!event->attr.watermark) { 4611 if (!event->attr.watermark) {
4576 int wakeup_events = event->attr.wakeup_events; 4612 int wakeup_events = event->attr.wakeup_events;
4577 4613
@@ -5100,27 +5136,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5100 unsigned int size; 5136 unsigned int size;
5101 char tmp[16]; 5137 char tmp[16];
5102 char *buf = NULL; 5138 char *buf = NULL;
5103 const char *name; 5139 char *name;
5104
5105 memset(tmp, 0, sizeof(tmp));
5106 5140
5107 if (file) { 5141 if (file) {
5108 struct inode *inode; 5142 struct inode *inode;
5109 dev_t dev; 5143 dev_t dev;
5144
5145 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5146 if (!buf) {
5147 name = "//enomem";
5148 goto cpy_name;
5149 }
5110 /* 5150 /*
5111 * d_path works from the end of the rb backwards, so we 5151 * d_path() works from the end of the rb backwards, so we
5112 * need to add enough zero bytes after the string to handle 5152 * need to add enough zero bytes after the string to handle
5113 * the 64bit alignment we do later. 5153 * the 64bit alignment we do later.
5114 */ 5154 */
5115 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); 5155 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5116 if (!buf) {
5117 name = strncpy(tmp, "//enomem", sizeof(tmp));
5118 goto got_name;
5119 }
5120 name = d_path(&file->f_path, buf, PATH_MAX);
5121 if (IS_ERR(name)) { 5156 if (IS_ERR(name)) {
5122 name = strncpy(tmp, "//toolong", sizeof(tmp)); 5157 name = "//toolong";
5123 goto got_name; 5158 goto cpy_name;
5124 } 5159 }
5125 inode = file_inode(vma->vm_file); 5160 inode = file_inode(vma->vm_file);
5126 dev = inode->i_sb->s_dev; 5161 dev = inode->i_sb->s_dev;
@@ -5128,34 +5163,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5128 gen = inode->i_generation; 5163 gen = inode->i_generation;
5129 maj = MAJOR(dev); 5164 maj = MAJOR(dev);
5130 min = MINOR(dev); 5165 min = MINOR(dev);
5131 5166 goto got_name;
5132 } else { 5167 } else {
5133 if (arch_vma_name(mmap_event->vma)) { 5168 name = (char *)arch_vma_name(vma);
5134 name = strncpy(tmp, arch_vma_name(mmap_event->vma), 5169 if (name)
5135 sizeof(tmp) - 1); 5170 goto cpy_name;
5136 tmp[sizeof(tmp) - 1] = '\0';
5137 goto got_name;
5138 }
5139 5171
5140 if (!vma->vm_mm) { 5172 if (vma->vm_start <= vma->vm_mm->start_brk &&
5141 name = strncpy(tmp, "[vdso]", sizeof(tmp));
5142 goto got_name;
5143 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
5144 vma->vm_end >= vma->vm_mm->brk) { 5173 vma->vm_end >= vma->vm_mm->brk) {
5145 name = strncpy(tmp, "[heap]", sizeof(tmp)); 5174 name = "[heap]";
5146 goto got_name; 5175 goto cpy_name;
5147 } else if (vma->vm_start <= vma->vm_mm->start_stack && 5176 }
5177 if (vma->vm_start <= vma->vm_mm->start_stack &&
5148 vma->vm_end >= vma->vm_mm->start_stack) { 5178 vma->vm_end >= vma->vm_mm->start_stack) {
5149 name = strncpy(tmp, "[stack]", sizeof(tmp)); 5179 name = "[stack]";
5150 goto got_name; 5180 goto cpy_name;
5151 } 5181 }
5152 5182
5153 name = strncpy(tmp, "//anon", sizeof(tmp)); 5183 name = "//anon";
5154 goto got_name; 5184 goto cpy_name;
5155 } 5185 }
5156 5186
5187cpy_name:
5188 strlcpy(tmp, name, sizeof(tmp));
5189 name = tmp;
5157got_name: 5190got_name:
5158 size = ALIGN(strlen(name)+1, sizeof(u64)); 5191 /*
5192 * Since our buffer works in 8 byte units we need to align our string
5193 * size to a multiple of 8. However, we must guarantee the tail end is
5194 * zero'd out to avoid leaking random bits to userspace.
5195 */
5196 size = strlen(name)+1;
5197 while (!IS_ALIGNED(size, sizeof(u64)))
5198 name[size++] = '\0';
5159 5199
5160 mmap_event->file_name = name; 5200 mmap_event->file_name = name;
5161 mmap_event->file_size = size; 5201 mmap_event->file_size = size;
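Aside: a worked example of the padding loop above (not part of the patch):

	/* name = "[stack]"    : strlen()+1 = 8,  already 8-byte aligned, size stays 8    */
	/* name = "/lib/ld.so" : strlen()+1 = 11, five NUL bytes appended, size becomes 16 */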
@@ -6292,6 +6332,7 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
6292 6332
6293 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); 6333 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
6294} 6334}
6335static DEVICE_ATTR_RO(type);
6295 6336
6296static ssize_t 6337static ssize_t
6297perf_event_mux_interval_ms_show(struct device *dev, 6338perf_event_mux_interval_ms_show(struct device *dev,
@@ -6336,17 +6377,19 @@ perf_event_mux_interval_ms_store(struct device *dev,
6336 6377
6337 return count; 6378 return count;
6338} 6379}
6380static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
6339 6381
6340static struct device_attribute pmu_dev_attrs[] = { 6382static struct attribute *pmu_dev_attrs[] = {
6341 __ATTR_RO(type), 6383 &dev_attr_type.attr,
6342 __ATTR_RW(perf_event_mux_interval_ms), 6384 &dev_attr_perf_event_mux_interval_ms.attr,
6343 __ATTR_NULL, 6385 NULL,
6344}; 6386};
6387ATTRIBUTE_GROUPS(pmu_dev);
6345 6388
6346static int pmu_bus_running; 6389static int pmu_bus_running;
6347static struct bus_type pmu_bus = { 6390static struct bus_type pmu_bus = {
6348 .name = "event_source", 6391 .name = "event_source",
6349 .dev_attrs = pmu_dev_attrs, 6392 .dev_groups = pmu_dev_groups,
6350}; 6393};
6351 6394
6352static void pmu_dev_release(struct device *dev) 6395static void pmu_dev_release(struct device *dev)
@@ -6767,6 +6810,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6767 if (ret) 6810 if (ret)
6768 return -EFAULT; 6811 return -EFAULT;
6769 6812
6813 /* disabled for now */
6814 if (attr->mmap2)
6815 return -EINVAL;
6816
6770 if (attr->__reserved_1) 6817 if (attr->__reserved_1)
6771 return -EINVAL; 6818 return -EINVAL;
6772 6819
@@ -7122,7 +7169,6 @@ SYSCALL_DEFINE5(perf_event_open,
7122 } 7169 }
7123 7170
7124 perf_install_in_context(ctx, event, event->cpu); 7171 perf_install_in_context(ctx, event, event->cpu);
7125 ++ctx->generation;
7126 perf_unpin_context(ctx); 7172 perf_unpin_context(ctx);
7127 mutex_unlock(&ctx->mutex); 7173 mutex_unlock(&ctx->mutex);
7128 7174
@@ -7205,7 +7251,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7205 WARN_ON_ONCE(ctx->parent_ctx); 7251 WARN_ON_ONCE(ctx->parent_ctx);
7206 mutex_lock(&ctx->mutex); 7252 mutex_lock(&ctx->mutex);
7207 perf_install_in_context(ctx, event, cpu); 7253 perf_install_in_context(ctx, event, cpu);
7208 ++ctx->generation;
7209 perf_unpin_context(ctx); 7254 perf_unpin_context(ctx);
7210 mutex_unlock(&ctx->mutex); 7255 mutex_unlock(&ctx->mutex);
7211 7256
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be5..569b218782ad 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
82} 82}
83 83
84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ 84#define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \
85static inline unsigned int \ 85static inline unsigned long \
86func_name(struct perf_output_handle *handle, \ 86func_name(struct perf_output_handle *handle, \
87 const void *buf, unsigned int len) \ 87 const void *buf, unsigned long len) \
88{ \ 88{ \
89 unsigned long size, written; \ 89 unsigned long size, written; \
90 \ 90 \
91 do { \ 91 do { \
92 size = min_t(unsigned long, handle->size, len); \ 92 size = min(handle->size, len); \
93 \
94 written = memcpy_func(handle->addr, buf, size); \ 93 written = memcpy_func(handle->addr, buf, size); \
94 written = size - written; \
95 \ 95 \
96 len -= written; \ 96 len -= written; \
97 handle->addr += written; \ 97 handle->addr += written; \
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
110 return len; \ 110 return len; \
111} 111}
112 112
113static inline int memcpy_common(void *dst, const void *src, size_t n) 113static inline unsigned long
114memcpy_common(void *dst, const void *src, unsigned long n)
114{ 115{
115 memcpy(dst, src, n); 116 memcpy(dst, src, n);
116 return n; 117 return 0;
117} 118}
118 119
119DEFINE_OUTPUT_COPY(__output_copy, memcpy_common) 120DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
120 121
121#define MEMCPY_SKIP(dst, src, n) (n) 122static inline unsigned long
123memcpy_skip(void *dst, const void *src, unsigned long n)
124{
125 return 0;
126}
122 127
123DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP) 128DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
124 129
125#ifndef arch_perf_out_copy_user 130#ifndef arch_perf_out_copy_user
126#define arch_perf_out_copy_user __copy_from_user_inatomic 131#define arch_perf_out_copy_user arch_perf_out_copy_user
132
133static inline unsigned long
134arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
135{
136 unsigned long ret;
137
138 pagefault_disable();
139 ret = __copy_from_user_inatomic(dst, src, n);
140 pagefault_enable();
141
142 return ret;
143}
127#endif 144#endif
128 145
129DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) 146DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
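Aside: the copy callbacks now follow the __copy_from_user_inatomic() convention of returning the number of bytes NOT copied (0 on complete success), which is why DEFINE_OUTPUT_COPY computes written = size - written. A hypothetical callback honouring that contract, purely for illustration:

	/* Illustration only; the name and the half-copy behaviour are made up. */
	static inline unsigned long
	copy_first_half_only(void *dst, const void *src, unsigned long n)
	{
		memcpy(dst, src, n / 2);
		return n - n / 2;	/* bytes left uncopied, per the new convention */
	}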
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index cd55144270b5..e8b168af135b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
12#include <linux/perf_event.h> 12#include <linux/perf_event.h>
13#include <linux/vmalloc.h> 13#include <linux/vmalloc.h>
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/circ_buf.h>
15 16
16#include "internal.h" 17#include "internal.h"
17 18
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long sz = perf_data_size(rb);
22 unsigned long mask = sz - 1;
23
24 /*
25 * check if user-writable
26 * overwrite : over-write its own tail
27 * !overwrite: buffer possibly drops events.
28 */
29 if (rb->overwrite)
30 return true;
31
32 /*
33 * verify that payload is not bigger than buffer
34 * otherwise masking logic may fail to detect
35 * the "not enough space" condition
36 */
37 if ((head - offset) > sz)
38 return false;
39
40 offset = (offset - tail) & mask;
41 head = (head - tail) & mask;
42
43 if ((int)(head - offset) < 0)
44 return false;
45
46 return true;
47}
48
49static void perf_output_wakeup(struct perf_output_handle *handle) 19static void perf_output_wakeup(struct perf_output_handle *handle)
50{ 20{
51 atomic_set(&handle->rb->poll, POLL_IN); 21 atomic_set(&handle->rb->poll, POLL_IN);
@@ -87,15 +57,36 @@ again:
87 goto out; 57 goto out;
88 58
89 /* 59 /*
90 * Publish the known good head. Rely on the full barrier implied 60 * Since the mmap() consumer (userspace) can run on a different CPU:
91 * by atomic_dec_and_test() order the rb->head read and this 61 *
92 * write. 62 * kernel user
63 *
64 * READ ->data_tail READ ->data_head
65 * smp_mb() (A) smp_rmb() (C)
66 * WRITE $data READ $data
67 * smp_wmb() (B) smp_mb() (D)
68 * STORE ->data_head WRITE ->data_tail
69 *
70 * Where A pairs with D, and B pairs with C.
71 *
72 * I don't think A needs to be a full barrier because we won't in fact
73 * write data until we see the store from userspace. So we simply don't
74 * issue the data WRITE until we observe it. Be conservative for now.
75 *
76 * OTOH, D needs to be a full barrier since it separates the data READ
77 * from the tail WRITE.
78 *
79 * For B a WMB is sufficient since it separates two WRITEs, and for C
80 * an RMB is sufficient since it separates two READs.
81 *
82 * See perf_output_begin().
93 */ 83 */
84 smp_wmb();
94 rb->user_page->data_head = head; 85 rb->user_page->data_head = head;
95 86
96 /* 87 /*
97 * Now check if we missed an update, rely on the (compiler) 88 * Now check if we missed an update -- rely on previous implied
98 * barrier in atomic_dec_and_test() to re-read rb->head. 89 * compiler barriers to force a re-read.
99 */ 90 */
100 if (unlikely(head != local_read(&rb->head))) { 91 if (unlikely(head != local_read(&rb->head))) {
101 local_inc(&rb->nest); 92 local_inc(&rb->nest);
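Aside: a sketch of the user-space half of the ordering table above, i.e. barriers (C) and (D). It assumes the struct perf_event_mmap_page layout from the UAPI headers; rmb()/mb() stand for whatever read/full barriers the consumer's architecture needs, and read_record() is an invented stand-in for parsing one event at the given offset.

	/* One consumer pass honouring (C) and (D); sketch only. */
	static void consume_ring(struct perf_event_mmap_page *pg)
	{
		u64 tail = pg->data_tail;
		u64 head = ACCESS_ONCE(pg->data_head);

		rmb();				/* (C) order the head read before data reads */

		while (tail != head)
			tail += read_record(pg, tail);	/* hypothetical helper */

		mb();				/* (D) finish data reads before the tail store */
		pg->data_tail = tail;
	}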
@@ -114,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,
114{ 105{
115 struct ring_buffer *rb; 106 struct ring_buffer *rb;
116 unsigned long tail, offset, head; 107 unsigned long tail, offset, head;
117 int have_lost; 108 int have_lost, page_shift;
118 struct perf_sample_data sample_data;
119 struct { 109 struct {
120 struct perf_event_header header; 110 struct perf_event_header header;
121 u64 id; 111 u64 id;
@@ -130,55 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,
130 event = event->parent; 120 event = event->parent;
131 121
132 rb = rcu_dereference(event->rb); 122 rb = rcu_dereference(event->rb);
133 if (!rb) 123 if (unlikely(!rb))
134 goto out; 124 goto out;
135 125
136 handle->rb = rb; 126 if (unlikely(!rb->nr_pages))
137 handle->event = event;
138
139 if (!rb->nr_pages)
140 goto out; 127 goto out;
141 128
129 handle->rb = rb;
130 handle->event = event;
131
142 have_lost = local_read(&rb->lost); 132 have_lost = local_read(&rb->lost);
143 if (have_lost) { 133 if (unlikely(have_lost)) {
144 lost_event.header.size = sizeof(lost_event); 134 size += sizeof(lost_event);
145 perf_event_header__init_id(&lost_event.header, &sample_data, 135 if (event->attr.sample_id_all)
146 event); 136 size += event->id_header_size;
147 size += lost_event.header.size;
148 } 137 }
149 138
150 perf_output_get_handle(handle); 139 perf_output_get_handle(handle);
151 140
152 do { 141 do {
153 /*
154 * Userspace could choose to issue a mb() before updating the
155 * tail pointer. So that all reads will be completed before the
156 * write is issued.
157 */
158 tail = ACCESS_ONCE(rb->user_page->data_tail); 142 tail = ACCESS_ONCE(rb->user_page->data_tail);
159 smp_rmb();
160 offset = head = local_read(&rb->head); 143 offset = head = local_read(&rb->head);
161 head += size; 144 if (!rb->overwrite &&
162 if (unlikely(!perf_output_space(rb, tail, offset, head))) 145 unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
163 goto fail; 146 goto fail;
147 head += size;
164 } while (local_cmpxchg(&rb->head, offset, head) != offset); 148 } while (local_cmpxchg(&rb->head, offset, head) != offset);
165 149
166 if (head - local_read(&rb->wakeup) > rb->watermark) 150 /*
151 * Separate the userpage->tail read from the data stores below.
152 * Matches the MB userspace SHOULD issue after reading the data
153 * and before storing the new tail position.
154 *
155 * See perf_output_put_handle().
156 */
157 smp_mb();
158
159 if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
167 local_add(rb->watermark, &rb->wakeup); 160 local_add(rb->watermark, &rb->wakeup);
168 161
169 handle->page = offset >> (PAGE_SHIFT + page_order(rb)); 162 page_shift = PAGE_SHIFT + page_order(rb);
170 handle->page &= rb->nr_pages - 1;
171 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
172 handle->addr = rb->data_pages[handle->page];
173 handle->addr += handle->size;
174 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
175 163
176 if (have_lost) { 164 handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
165 offset &= (1UL << page_shift) - 1;
166 handle->addr = rb->data_pages[handle->page] + offset;
167 handle->size = (1UL << page_shift) - offset;
168
169 if (unlikely(have_lost)) {
170 struct perf_sample_data sample_data;
171
172 lost_event.header.size = sizeof(lost_event);
177 lost_event.header.type = PERF_RECORD_LOST; 173 lost_event.header.type = PERF_RECORD_LOST;
178 lost_event.header.misc = 0; 174 lost_event.header.misc = 0;
179 lost_event.id = event->id; 175 lost_event.id = event->id;
180 lost_event.lost = local_xchg(&rb->lost, 0); 176 lost_event.lost = local_xchg(&rb->lost, 0);
181 177
178 perf_event_header__init_id(&lost_event.header,
179 &sample_data, event);
182 perf_output_put(handle, lost_event); 180 perf_output_put(handle, lost_event);
183 perf_event__output_id_sample(event, handle, &sample_data); 181 perf_event__output_id_sample(event, handle, &sample_data);
184 } 182 }
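Aside: the space check that the deleted perf_output_space() open-coded is now expressed with the circular-buffer helpers from <linux/circ_buf.h>, which (quoted from memory, valid for a power-of-two size) boil down to:

	#define CIRC_CNT(head, tail, size)	(((head) - (tail)) & ((size) - 1))
	#define CIRC_SPACE(head, tail, size)	CIRC_CNT((tail), ((head) + 1), (size))

So CIRC_SPACE(head, tail, perf_data_size(rb)) < size rejects a record that would overrun data user space has not yet consumed, unless the ring buffer is in overwrite mode.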
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ad8e1bdca70e..24b7d6ca871b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,7 @@
35#include <linux/kdebug.h> /* notifier mechanism */ 35#include <linux/kdebug.h> /* notifier mechanism */
36#include "../../mm/internal.h" /* munlock_vma_page */ 36#include "../../mm/internal.h" /* munlock_vma_page */
37#include <linux/percpu-rwsem.h> 37#include <linux/percpu-rwsem.h>
38#include <linux/task_work.h>
38 39
39#include <linux/uprobes.h> 40#include <linux/uprobes.h>
40 41
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
244 * the architecture. If an arch has variable length instruction and the 245 * the architecture. If an arch has variable length instruction and the
245 * breakpoint instruction is not of the smallest length instruction 246 * breakpoint instruction is not of the smallest length instruction
246 * supported by that architecture then we need to modify is_trap_at_addr and 247 * supported by that architecture then we need to modify is_trap_at_addr and
247 * write_opcode accordingly. This would never be a problem for archs that 248 * uprobe_write_opcode accordingly. This would never be a problem for archs
248 * have fixed length instructions. 249 * that have fixed length instructions.
249 */ 250 */
250 251
251/* 252/*
252 * write_opcode - write the opcode at a given virtual address. 253 * uprobe_write_opcode - write the opcode at a given virtual address.
253 * @mm: the probed process address space. 254 * @mm: the probed process address space.
254 * @vaddr: the virtual address to store the opcode. 255 * @vaddr: the virtual address to store the opcode.
255 * @opcode: opcode to be written at @vaddr. 256 * @opcode: opcode to be written at @vaddr.
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
260 * For mm @mm, write the opcode at @vaddr. 261 * For mm @mm, write the opcode at @vaddr.
261 * Return 0 (success) or a negative errno. 262 * Return 0 (success) or a negative errno.
262 */ 263 */
263static int write_opcode(struct mm_struct *mm, unsigned long vaddr, 264int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
264 uprobe_opcode_t opcode) 265 uprobe_opcode_t opcode)
265{ 266{
266 struct page *old_page, *new_page; 267 struct page *old_page, *new_page;
@@ -314,7 +315,7 @@ put_old:
314 */ 315 */
315int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 316int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
316{ 317{
317 return write_opcode(mm, vaddr, UPROBE_SWBP_INSN); 318 return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
318} 319}
319 320
320/** 321/**
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
329int __weak 330int __weak
330set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr) 331set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
331{ 332{
332 return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn); 333 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
333} 334}
334 335
335static int match_uprobe(struct uprobe *l, struct uprobe *r) 336static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
503 return ret; 504 return ret;
504} 505}
505 506
506static int 507static int __copy_insn(struct address_space *mapping, struct file *filp,
507__copy_insn(struct address_space *mapping, struct file *filp, char *insn, 508 void *insn, int nbytes, loff_t offset)
508 unsigned long nbytes, loff_t offset)
509{ 509{
510 struct page *page; 510 struct page *page;
511 511
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
527 527
528static int copy_insn(struct uprobe *uprobe, struct file *filp) 528static int copy_insn(struct uprobe *uprobe, struct file *filp)
529{ 529{
530 struct address_space *mapping; 530 struct address_space *mapping = uprobe->inode->i_mapping;
531 unsigned long nbytes; 531 loff_t offs = uprobe->offset;
532 int bytes; 532 void *insn = uprobe->arch.insn;
533 533 int size = MAX_UINSN_BYTES;
534 nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK); 534 int len, err = -EIO;
535 mapping = uprobe->inode->i_mapping;
536 535
537 /* Instruction at end of binary; copy only available bytes */ 536 /* Copy only available bytes, -EIO if nothing was read */
538 if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size) 537 do {
539 bytes = uprobe->inode->i_size - uprobe->offset; 538 if (offs >= i_size_read(uprobe->inode))
540 else 539 break;
541 bytes = MAX_UINSN_BYTES;
542 540
543 /* Instruction at the page-boundary; copy bytes in second page */ 541 len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
544 if (nbytes < bytes) { 542 err = __copy_insn(mapping, filp, insn, len, offs);
545 int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
546 bytes - nbytes, uprobe->offset + nbytes);
547 if (err) 543 if (err)
548 return err; 544 break;
549 bytes = nbytes; 545
550 } 546 insn += len;
551 return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset); 547 offs += len;
548 size -= len;
549 } while (size);
550
551 return err;
552} 552}
553 553
554static int prepare_uprobe(struct uprobe *uprobe, struct file *file, 554static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
576 if (ret) 576 if (ret)
577 goto out; 577 goto out;
578 578
579 /* write_opcode() assumes we don't cross page boundary */ 579 /* uprobe_write_opcode() assumes we don't cross page boundary */
580 BUG_ON((uprobe->offset & ~PAGE_MASK) + 580 BUG_ON((uprobe->offset & ~PAGE_MASK) +
581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE); 581 UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
582 582
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1096} 1096}
1097 1097
1098/* Slot allocation for XOL */ 1098/* Slot allocation for XOL */
1099static int xol_add_vma(struct xol_area *area) 1099static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1100{ 1100{
1101 struct mm_struct *mm = current->mm;
1102 int ret = -EALREADY; 1101 int ret = -EALREADY;
1103 1102
1104 down_write(&mm->mmap_sem); 1103 down_write(&mm->mmap_sem);
1105 if (mm->uprobes_state.xol_area) 1104 if (mm->uprobes_state.xol_area)
1106 goto fail; 1105 goto fail;
1107 1106
1108 ret = -ENOMEM; 1107 if (!area->vaddr) {
1109 /* Try to map as high as possible, this is only a hint. */ 1108 /* Try to map as high as possible, this is only a hint. */
1110 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1109 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
1111 if (area->vaddr & ~PAGE_MASK) { 1110 PAGE_SIZE, 0, 0);
1112 ret = area->vaddr; 1111 if (area->vaddr & ~PAGE_MASK) {
1113 goto fail; 1112 ret = area->vaddr;
1113 goto fail;
1114 }
1114 } 1115 }
1115 1116
1116 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, 1117 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)
1120 1121
1121 smp_wmb(); /* pairs with get_xol_area() */ 1122 smp_wmb(); /* pairs with get_xol_area() */
1122 mm->uprobes_state.xol_area = area; 1123 mm->uprobes_state.xol_area = area;
1123 ret = 0;
1124 fail: 1124 fail:
1125 up_write(&mm->mmap_sem); 1125 up_write(&mm->mmap_sem);
1126 1126
1127 return ret; 1127 return ret;
1128} 1128}
1129 1129
1130/* 1130static struct xol_area *__create_xol_area(unsigned long vaddr)
1131 * get_xol_area - Allocate process's xol_area if necessary.
1132 * This area will be used for storing instructions for execution out of line.
1133 *
1134 * Returns the allocated area or NULL.
1135 */
1136static struct xol_area *get_xol_area(void)
1137{ 1131{
1138 struct mm_struct *mm = current->mm; 1132 struct mm_struct *mm = current->mm;
1139 struct xol_area *area;
1140 uprobe_opcode_t insn = UPROBE_SWBP_INSN; 1133 uprobe_opcode_t insn = UPROBE_SWBP_INSN;
1134 struct xol_area *area;
1141 1135
1142 area = mm->uprobes_state.xol_area; 1136 area = kmalloc(sizeof(*area), GFP_KERNEL);
1143 if (area)
1144 goto ret;
1145
1146 area = kzalloc(sizeof(*area), GFP_KERNEL);
1147 if (unlikely(!area)) 1137 if (unlikely(!area))
1148 goto out; 1138 goto out;
1149 1139
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)
1155 if (!area->page) 1145 if (!area->page)
1156 goto free_bitmap; 1146 goto free_bitmap;
1157 1147
1158 /* allocate first slot of task's xol_area for the return probes */ 1148 area->vaddr = vaddr;
1149 init_waitqueue_head(&area->wq);
1150 /* Reserve the 1st slot for get_trampoline_vaddr() */
1159 set_bit(0, area->bitmap); 1151 set_bit(0, area->bitmap);
1160 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1161 atomic_set(&area->slot_count, 1); 1152 atomic_set(&area->slot_count, 1);
1162 init_waitqueue_head(&area->wq); 1153 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
1163 1154
1164 if (!xol_add_vma(area)) 1155 if (!xol_add_vma(mm, area))
1165 return area; 1156 return area;
1166 1157
1167 __free_page(area->page); 1158 __free_page(area->page);
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)
1170 free_area: 1161 free_area:
1171 kfree(area); 1162 kfree(area);
1172 out: 1163 out:
1164 return NULL;
1165}
1166
1167/*
1168 * get_xol_area - Allocate process's xol_area if necessary.
1169 * This area will be used for storing instructions for execution out of line.
1170 *
1171 * Returns the allocated area or NULL.
1172 */
1173static struct xol_area *get_xol_area(void)
1174{
1175 struct mm_struct *mm = current->mm;
1176 struct xol_area *area;
1177
1178 if (!mm->uprobes_state.xol_area)
1179 __create_xol_area(0);
1180
1173 area = mm->uprobes_state.xol_area; 1181 area = mm->uprobes_state.xol_area;
1174 ret: 1182 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1175 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1176 return area; 1183 return area;
1177} 1184}
1178 1185
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1256 return 0; 1263 return 0;
1257 1264
1258 /* Initialize the slot */ 1265 /* Initialize the slot */
1259 copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES); 1266 copy_to_page(area->page, xol_vaddr,
1267 uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1260 /* 1268 /*
1261 * We probably need flush_icache_user_range() but it needs vma. 1269 * We probably need flush_icache_user_range() but it needs vma.
1262 * This should work on supported architectures too. 1270 * This should work on supported architectures too.
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)
1345} 1353}
1346 1354
1347/* 1355/*
1348 * Called in context of a new clone/fork from copy_process.
1349 */
1350void uprobe_copy_process(struct task_struct *t)
1351{
1352 t->utask = NULL;
1353}
1354
1355/*
 1356	 * Allocate a uprobe_task object for the task if necessary.	 1356	 * Allocate a uprobe_task object for the task if necessary.
1357 * Called when the thread hits a breakpoint. 1357 * Called when the thread hits a breakpoint.
1358 * 1358 *
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)
1367 return current->utask; 1367 return current->utask;
1368} 1368}
1369 1369
1370static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1371{
1372 struct uprobe_task *n_utask;
1373 struct return_instance **p, *o, *n;
1374
1375 n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1376 if (!n_utask)
1377 return -ENOMEM;
1378 t->utask = n_utask;
1379
1380 p = &n_utask->return_instances;
1381 for (o = o_utask->return_instances; o; o = o->next) {
1382 n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1383 if (!n)
1384 return -ENOMEM;
1385
1386 *n = *o;
1387 atomic_inc(&n->uprobe->ref);
1388 n->next = NULL;
1389
1390 *p = n;
1391 p = &n->next;
1392 n_utask->depth++;
1393 }
1394
1395 return 0;
1396}
1397
1398static void uprobe_warn(struct task_struct *t, const char *msg)
1399{
1400 pr_warn("uprobe: %s:%d failed to %s\n",
1401 current->comm, current->pid, msg);
1402}
1403
1404static void dup_xol_work(struct callback_head *work)
1405{
1406 kfree(work);
1407
1408 if (current->flags & PF_EXITING)
1409 return;
1410
1411 if (!__create_xol_area(current->utask->vaddr))
1412 uprobe_warn(current, "dup xol area");
1413}
1414
1415/*
1416 * Called in context of a new clone/fork from copy_process.
1417 */
1418void uprobe_copy_process(struct task_struct *t, unsigned long flags)
1419{
1420 struct uprobe_task *utask = current->utask;
1421 struct mm_struct *mm = current->mm;
1422 struct callback_head *work;
1423 struct xol_area *area;
1424
1425 t->utask = NULL;
1426
1427 if (!utask || !utask->return_instances)
1428 return;
1429
1430 if (mm == t->mm && !(flags & CLONE_VFORK))
1431 return;
1432
1433 if (dup_utask(t, utask))
1434 return uprobe_warn(t, "dup ret instances");
1435
1436 /* The task can fork() after dup_xol_work() fails */
1437 area = mm->uprobes_state.xol_area;
1438 if (!area)
1439 return uprobe_warn(t, "dup xol area");
1440
1441 if (mm == t->mm)
1442 return;
1443
1444 /* TODO: move it into the union in uprobe_task */
1445 work = kmalloc(sizeof(*work), GFP_KERNEL);
1446 if (!work)
1447 return uprobe_warn(t, "dup xol area");
1448
1449 t->utask->vaddr = area->vaddr;
1450 init_task_work(work, dup_xol_work);
1451 task_work_add(t, work, true);
1452}
1453
1370/* 1454/*
 1371	 * Current area->vaddr notion assumes the trampoline address is always	 1455	 * Current area->vaddr notion assumes the trampoline address is always
1372 * equal area->vaddr. 1456 * equal area->vaddr.
@@ -1857,9 +1941,4 @@ static int __init init_uprobes(void)
1857 1941
1858 return register_die_notifier(&uprobe_exception_nb); 1942 return register_die_notifier(&uprobe_exception_nb);
1859} 1943}
1860module_init(init_uprobes); 1944__initcall(init_uprobes);
1861
1862static void __exit exit_uprobes(void)
1863{
1864}
1865module_exit(exit_uprobes);
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..f6d11fc67f72 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
817#ifdef CONFIG_TRANSPARENT_HUGEPAGE 817#ifdef CONFIG_TRANSPARENT_HUGEPAGE
818 mm->pmd_huge_pte = NULL; 818 mm->pmd_huge_pte = NULL;
819#endif 819#endif
820#ifdef CONFIG_NUMA_BALANCING
821 mm->first_nid = NUMA_PTE_SCAN_INIT;
822#endif
823 if (!mm_init(mm, tsk)) 820 if (!mm_init(mm, tsk))
824 goto fail_nomem; 821 goto fail_nomem;
825 822
@@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1313#endif 1310#endif
1314 1311
1315 /* Perform scheduler related setup. Assign this task to a CPU. */ 1312 /* Perform scheduler related setup. Assign this task to a CPU. */
1316 sched_fork(p); 1313 sched_fork(clone_flags, p);
1317 1314
1318 retval = perf_event_init_task(p); 1315 retval = perf_event_init_task(p);
1319 if (retval) 1316 if (retval)
@@ -1373,7 +1370,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1373 INIT_LIST_HEAD(&p->pi_state_list); 1370 INIT_LIST_HEAD(&p->pi_state_list);
1374 p->pi_state_cache = NULL; 1371 p->pi_state_cache = NULL;
1375#endif 1372#endif
1376 uprobe_copy_process(p);
1377 /* 1373 /*
1378 * sigaltstack should be cleared when sharing the same VM 1374 * sigaltstack should be cleared when sharing the same VM
1379 */ 1375 */
@@ -1490,6 +1486,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1490 perf_event_fork(p); 1486 perf_event_fork(p);
1491 1487
1492 trace_task_newtask(p, clone_flags); 1488 trace_task_newtask(p, clone_flags);
1489 uprobe_copy_process(p, clone_flags);
1493 1490
1494 return p; 1491 return p;
1495 1492
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 514bcfd855a8..3e59f951d42f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -956,7 +956,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
956 goto out_mput; 956 goto out_mput;
957 } 957 }
958 958
959 sched_setscheduler(t, SCHED_FIFO, &param); 959 sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
960 960
961 /* 961 /*
962 * We keep the reference to the task struct even if 962 * We keep the reference to the task struct even if
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index e16c45b9ee77..4e8e14c34e42 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -4224,7 +4224,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", 4224 printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n",
4225 !rcu_lockdep_current_cpu_online() 4225 !rcu_lockdep_current_cpu_online()
4226 ? "RCU used illegally from offline CPU!\n" 4226 ? "RCU used illegally from offline CPU!\n"
4227 : rcu_is_cpu_idle() 4227 : !rcu_is_watching()
4228 ? "RCU used illegally from idle CPU!\n" 4228 ? "RCU used illegally from idle CPU!\n"
4229 : "", 4229 : "",
4230 rcu_scheduler_active, debug_locks); 4230 rcu_scheduler_active, debug_locks);
@@ -4247,7 +4247,7 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4247 * So complain bitterly if someone does call rcu_read_lock(), 4247 * So complain bitterly if someone does call rcu_read_lock(),
4248 * rcu_read_lock_bh() and so on from extended quiescent states. 4248 * rcu_read_lock_bh() and so on from extended quiescent states.
4249 */ 4249 */
4250 if (rcu_is_cpu_idle()) 4250 if (!rcu_is_watching())
4251 printk("RCU used illegally from extended quiescent state!\n"); 4251 printk("RCU used illegally from extended quiescent state!\n");
4252 4252
4253 lockdep_print_held_locks(curr); 4253 lockdep_print_held_locks(curr);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 6d647aedffea..d24105b1b794 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -410,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock,
410static __always_inline int __sched 410static __always_inline int __sched
411__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, 411__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
412 struct lockdep_map *nest_lock, unsigned long ip, 412 struct lockdep_map *nest_lock, unsigned long ip,
413 struct ww_acquire_ctx *ww_ctx) 413 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
414{ 414{
415 struct task_struct *task = current; 415 struct task_struct *task = current;
416 struct mutex_waiter waiter; 416 struct mutex_waiter waiter;
@@ -450,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
450 struct task_struct *owner; 450 struct task_struct *owner;
451 struct mspin_node node; 451 struct mspin_node node;
452 452
453 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { 453 if (use_ww_ctx && ww_ctx->acquired > 0) {
454 struct ww_mutex *ww; 454 struct ww_mutex *ww;
455 455
456 ww = container_of(lock, struct ww_mutex, base); 456 ww = container_of(lock, struct ww_mutex, base);
@@ -480,7 +480,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
480 if ((atomic_read(&lock->count) == 1) && 480 if ((atomic_read(&lock->count) == 1) &&
481 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { 481 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
482 lock_acquired(&lock->dep_map, ip); 482 lock_acquired(&lock->dep_map, ip);
483 if (!__builtin_constant_p(ww_ctx == NULL)) { 483 if (use_ww_ctx) {
484 struct ww_mutex *ww; 484 struct ww_mutex *ww;
485 ww = container_of(lock, struct ww_mutex, base); 485 ww = container_of(lock, struct ww_mutex, base);
486 486
@@ -551,7 +551,7 @@ slowpath:
551 goto err; 551 goto err;
552 } 552 }
553 553
554 if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { 554 if (use_ww_ctx && ww_ctx->acquired > 0) {
555 ret = __mutex_lock_check_stamp(lock, ww_ctx); 555 ret = __mutex_lock_check_stamp(lock, ww_ctx);
556 if (ret) 556 if (ret)
557 goto err; 557 goto err;
@@ -575,7 +575,7 @@ skip_wait:
575 lock_acquired(&lock->dep_map, ip); 575 lock_acquired(&lock->dep_map, ip);
576 mutex_set_owner(lock); 576 mutex_set_owner(lock);
577 577
578 if (!__builtin_constant_p(ww_ctx == NULL)) { 578 if (use_ww_ctx) {
579 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); 579 struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
580 struct mutex_waiter *cur; 580 struct mutex_waiter *cur;
581 581
@@ -615,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
615{ 615{
616 might_sleep(); 616 might_sleep();
617 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 617 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
618 subclass, NULL, _RET_IP_, NULL); 618 subclass, NULL, _RET_IP_, NULL, 0);
619} 619}
620 620
621EXPORT_SYMBOL_GPL(mutex_lock_nested); 621EXPORT_SYMBOL_GPL(mutex_lock_nested);
@@ -625,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
625{ 625{
626 might_sleep(); 626 might_sleep();
627 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 627 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE,
628 0, nest, _RET_IP_, NULL); 628 0, nest, _RET_IP_, NULL, 0);
629} 629}
630 630
631EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); 631EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
@@ -635,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
635{ 635{
636 might_sleep(); 636 might_sleep();
637 return __mutex_lock_common(lock, TASK_KILLABLE, 637 return __mutex_lock_common(lock, TASK_KILLABLE,
638 subclass, NULL, _RET_IP_, NULL); 638 subclass, NULL, _RET_IP_, NULL, 0);
639} 639}
640EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); 640EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
641 641
@@ -644,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
644{ 644{
645 might_sleep(); 645 might_sleep();
646 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 646 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
647 subclass, NULL, _RET_IP_, NULL); 647 subclass, NULL, _RET_IP_, NULL, 0);
648} 648}
649 649
650EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); 650EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -682,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
682 682
683 might_sleep(); 683 might_sleep();
684 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 684 ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE,
685 0, &ctx->dep_map, _RET_IP_, ctx); 685 0, &ctx->dep_map, _RET_IP_, ctx, 1);
686 if (!ret && ctx->acquired > 1) 686 if (!ret && ctx->acquired > 1)
687 return ww_mutex_deadlock_injection(lock, ctx); 687 return ww_mutex_deadlock_injection(lock, ctx);
688 688
@@ -697,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
697 697
698 might_sleep(); 698 might_sleep();
699 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 699 ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE,
700 0, &ctx->dep_map, _RET_IP_, ctx); 700 0, &ctx->dep_map, _RET_IP_, ctx, 1);
701 701
702 if (!ret && ctx->acquired > 1) 702 if (!ret && ctx->acquired > 1)
703 return ww_mutex_deadlock_injection(lock, ctx); 703 return ww_mutex_deadlock_injection(lock, ctx);
@@ -809,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count)
809 struct mutex *lock = container_of(lock_count, struct mutex, count); 809 struct mutex *lock = container_of(lock_count, struct mutex, count);
810 810
811 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, 811 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0,
812 NULL, _RET_IP_, NULL); 812 NULL, _RET_IP_, NULL, 0);
813} 813}
814 814
815static noinline int __sched 815static noinline int __sched
816__mutex_lock_killable_slowpath(struct mutex *lock) 816__mutex_lock_killable_slowpath(struct mutex *lock)
817{ 817{
818 return __mutex_lock_common(lock, TASK_KILLABLE, 0, 818 return __mutex_lock_common(lock, TASK_KILLABLE, 0,
819 NULL, _RET_IP_, NULL); 819 NULL, _RET_IP_, NULL, 0);
820} 820}
821 821
822static noinline int __sched 822static noinline int __sched
823__mutex_lock_interruptible_slowpath(struct mutex *lock) 823__mutex_lock_interruptible_slowpath(struct mutex *lock)
824{ 824{
825 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, 825 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0,
826 NULL, _RET_IP_, NULL); 826 NULL, _RET_IP_, NULL, 0);
827} 827}
828 828
829static noinline int __sched 829static noinline int __sched
830__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) 830__ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx)
831{ 831{
832 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, 832 return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0,
833 NULL, _RET_IP_, ctx); 833 NULL, _RET_IP_, ctx, 1);
834} 834}
835 835
836static noinline int __sched 836static noinline int __sched
@@ -838,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
838 struct ww_acquire_ctx *ctx) 838 struct ww_acquire_ctx *ctx)
839{ 839{
840 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, 840 return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0,
841 NULL, _RET_IP_, ctx); 841 NULL, _RET_IP_, ctx, 1);
842} 842}
843 843
844#endif 844#endif
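The mutex.c hunks above replace the __builtin_constant_p(ww_ctx == NULL) test with an explicit use_ww_ctx const-bool argument that every caller passes as a literal 0 or 1. A minimal stand-alone sketch of why this stays free of runtime cost (all names below are made up for illustration, none are taken from the patch): with the flag a compile-time constant at each inlined call site, the compiler can prove the wait/wound branch dead and drop it from the plain-mutex paths.

#include <stdbool.h>

struct ww_ctx_like { int acquired; };	/* stand-in for struct ww_acquire_ctx */

static inline int lock_common(struct ww_ctx_like *ctx, const bool use_ctx)
{
	/* use_ctx is a literal at every call site, so after inlining the
	 * compiler folds this test away whenever it is false. */
	if (use_ctx && ctx->acquired > 0)
		return -1;	/* wound/wait handling would live here */
	return 0;		/* ordinary fast path */
}

int lock_plain(void)			{ return lock_common(0, false); }
int lock_ww(struct ww_ctx_like *ctx)	{ return lock_common(ctx, true); }

Both shapes should compile to the same code for the non-ww callers; the explicit parameter simply makes the intent visible in the prototype instead of hiding it behind __builtin_constant_p().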
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index c9c759d5a15c..0121dab83f43 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -846,7 +846,7 @@ static int software_resume(void)
846 goto Finish; 846 goto Finish;
847} 847}
848 848
849late_initcall(software_resume); 849late_initcall_sync(software_resume);
850 850
851 851
852static const char * const hibernation_modes[] = { 852static const char * const hibernation_modes[] = {
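The hibernate.c change moves software_resume() from late_initcall() to late_initcall_sync(). As far as I understand the initcall levels, the *_sync variant of a level runs after all plain initcalls of the same level have completed, so resume is deferred until the other late initcalls are done. A hedged sketch of the registration pattern (my_probe and my_resume_check are hypothetical examples, not functions from this patch):

#include <linux/init.h>

static int __init my_probe(void)
{
	return 0;	/* runs together with the other late_initcall()s */
}
late_initcall(my_probe);

static int __init my_resume_check(void)
{
	return 0;	/* runs only after every plain late_initcall() has run */
}
late_initcall_sync(my_resume_check);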
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
new file mode 100644
index 000000000000..01e9ec37a3e3
--- /dev/null
+++ b/kernel/rcu/Makefile
@@ -0,0 +1,6 @@
1obj-y += update.o srcu.o
2obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o
3obj-$(CONFIG_TREE_RCU) += tree.o
4obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o
5obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o
6obj-$(CONFIG_TINY_RCU) += tiny.o
diff --git a/kernel/rcu.h b/kernel/rcu/rcu.h
index 77131966c4ad..7859a0a3951e 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -122,4 +122,11 @@ int rcu_jiffies_till_stall_check(void);
122 122
123#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 123#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
124 124
125/*
126 * Strings used in tracepoints need to be exported via the
127 * tracing system such that tools like perf and trace-cmd can
128 * translate the string address pointers to actual text.
129 */
130#define TPS(x) tracepoint_string(x)
131
125#endif /* __LINUX_RCU_H */ 132#endif /* __LINUX_RCU_H */
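The TPS() wrapper added to rcu.h maps a string literal through tracepoint_string(), which records the literal so that tools such as perf and trace-cmd can translate the pointer stored in the trace buffer back into text. A sketch of the call pattern adopted throughout the RCU hunks below; this is an illustrative fragment rather than a stand-alone unit, and example_dyntick_event() is a made-up name:

#define TPS(x) tracepoint_string(x)

static void example_dyntick_event(long long oldval, long long newval)
{
	/* only the string's address is written into the ring buffer */
	trace_rcu_dyntick(TPS("Start"), oldval, newval);
}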
diff --git a/kernel/srcu.c b/kernel/rcu/srcu.c
index 01d5ccb8bfe3..01d5ccb8bfe3 100644
--- a/kernel/srcu.c
+++ b/kernel/rcu/srcu.c
diff --git a/kernel/rcutiny.c b/kernel/rcu/tiny.c
index 9ed6075dc562..0c9a934cfec1 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcu/tiny.c
@@ -35,6 +35,7 @@
35#include <linux/time.h> 35#include <linux/time.h>
36#include <linux/cpu.h> 36#include <linux/cpu.h>
37#include <linux/prefetch.h> 37#include <linux/prefetch.h>
38#include <linux/ftrace_event.h>
38 39
39#ifdef CONFIG_RCU_TRACE 40#ifdef CONFIG_RCU_TRACE
40#include <trace/events/rcu.h> 41#include <trace/events/rcu.h>
@@ -42,7 +43,7 @@
42 43
43#include "rcu.h" 44#include "rcu.h"
44 45
45/* Forward declarations for rcutiny_plugin.h. */ 46/* Forward declarations for tiny_plugin.h. */
46struct rcu_ctrlblk; 47struct rcu_ctrlblk;
47static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
48static void rcu_process_callbacks(struct softirq_action *unused); 49static void rcu_process_callbacks(struct softirq_action *unused);
@@ -52,22 +53,23 @@ static void __call_rcu(struct rcu_head *head,
52 53
53static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 54static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
54 55
55#include "rcutiny_plugin.h" 56#include "tiny_plugin.h"
56 57
57/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
58static void rcu_idle_enter_common(long long newval) 59static void rcu_idle_enter_common(long long newval)
59{ 60{
60 if (newval) { 61 if (newval) {
61 RCU_TRACE(trace_rcu_dyntick("--=", 62 RCU_TRACE(trace_rcu_dyntick(TPS("--="),
62 rcu_dynticks_nesting, newval)); 63 rcu_dynticks_nesting, newval));
63 rcu_dynticks_nesting = newval; 64 rcu_dynticks_nesting = newval;
64 return; 65 return;
65 } 66 }
66 RCU_TRACE(trace_rcu_dyntick("Start", rcu_dynticks_nesting, newval)); 67 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
68 rcu_dynticks_nesting, newval));
67 if (!is_idle_task(current)) { 69 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id()); 70 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
69 71
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task", 72 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
71 rcu_dynticks_nesting, newval)); 73 rcu_dynticks_nesting, newval));
72 ftrace_dump(DUMP_ALL); 74 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 75 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -120,15 +122,15 @@ EXPORT_SYMBOL_GPL(rcu_irq_exit);
120static void rcu_idle_exit_common(long long oldval) 122static void rcu_idle_exit_common(long long oldval)
121{ 123{
122 if (oldval) { 124 if (oldval) {
123 RCU_TRACE(trace_rcu_dyntick("++=", 125 RCU_TRACE(trace_rcu_dyntick(TPS("++="),
124 oldval, rcu_dynticks_nesting)); 126 oldval, rcu_dynticks_nesting));
125 return; 127 return;
126 } 128 }
127 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting)); 129 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
128 if (!is_idle_task(current)) { 130 if (!is_idle_task(current)) {
129 struct task_struct *idle = idle_task(smp_processor_id()); 131 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
130 132
131 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task", 133 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
132 oldval, rcu_dynticks_nesting)); 134 oldval, rcu_dynticks_nesting));
133 ftrace_dump(DUMP_ALL); 135 ftrace_dump(DUMP_ALL);
134 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 136 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
@@ -174,18 +176,18 @@ void rcu_irq_enter(void)
174} 176}
175EXPORT_SYMBOL_GPL(rcu_irq_enter); 177EXPORT_SYMBOL_GPL(rcu_irq_enter);
176 178
177#ifdef CONFIG_DEBUG_LOCK_ALLOC 179#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE)
178 180
179/* 181/*
180 * Test whether RCU thinks that the current CPU is idle. 182 * Test whether RCU thinks that the current CPU is idle.
181 */ 183 */
182int rcu_is_cpu_idle(void) 184bool __rcu_is_watching(void)
183{ 185{
184 return !rcu_dynticks_nesting; 186 return rcu_dynticks_nesting;
185} 187}
186EXPORT_SYMBOL(rcu_is_cpu_idle); 188EXPORT_SYMBOL(__rcu_is_watching);
187 189
188#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 190#endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */
189 191
190/* 192/*
191 * Test whether the current CPU was interrupted from idle. Nested 193 * Test whether the current CPU was interrupted from idle. Nested
@@ -273,7 +275,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
273 if (&rcp->rcucblist == rcp->donetail) { 275 if (&rcp->rcucblist == rcp->donetail) {
274 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); 276 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
275 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, 277 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
276 ACCESS_ONCE(rcp->rcucblist), 278 !!ACCESS_ONCE(rcp->rcucblist),
277 need_resched(), 279 need_resched(),
278 is_idle_task(current), 280 is_idle_task(current),
279 false)); 281 false));
@@ -304,7 +306,8 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
304 RCU_TRACE(cb_count++); 306 RCU_TRACE(cb_count++);
305 } 307 }
306 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 308 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
307 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), 309 RCU_TRACE(trace_rcu_batch_end(rcp->name,
310 cb_count, 0, need_resched(),
308 is_idle_task(current), 311 is_idle_task(current),
309 false)); 312 false));
310} 313}
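Note the polarity flip in the tiny.c hunk above: the old rcu_is_cpu_idle() returned true when rcu_dynticks_nesting was zero, while the new __rcu_is_watching() returns true when it is nonzero, so the two helpers answer opposite questions. A tiny stand-alone illustration (the variable here is a local stand-in, not the kernel's):

#include <stdbool.h>

static long long rcu_dynticks_nesting;	/* stand-in for the kernel variable */

static int old_rcu_is_cpu_idle(void)	{ return !rcu_dynticks_nesting; }
static bool new_rcu_is_watching(void)	{ return rcu_dynticks_nesting; }

Callers are converted accordingly; for example, the tree.c hunk further down turns "if (rcu_is_cpu_idle() && ...)" into "if (!rcu_is_watching() && ...)".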
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcu/tiny_plugin.h
index 280d06cae352..280d06cae352 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcu/tiny_plugin.h
diff --git a/kernel/rcutorture.c b/kernel/rcu/torture.c
index be63101c6175..3929cd451511 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcu/torture.c
@@ -52,6 +52,12 @@
52MODULE_LICENSE("GPL"); 52MODULE_LICENSE("GPL");
53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); 53MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>");
54 54
55MODULE_ALIAS("rcutorture");
56#ifdef MODULE_PARAM_PREFIX
57#undef MODULE_PARAM_PREFIX
58#endif
59#define MODULE_PARAM_PREFIX "rcutorture."
60
55static int fqs_duration; 61static int fqs_duration;
56module_param(fqs_duration, int, 0444); 62module_param(fqs_duration, int, 0444);
57MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); 63MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable");
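The MODULE_PARAM_PREFIX block added here (and, below, to tree.c and update.c) preserves the historical user-visible names now that the source moved under kernel/rcu/ and the object was renamed. My understanding of the mechanism, sketched with the parameter that appears in this hunk: built-in module_param() names are normally namespaced by the object name (KBUILD_MODNAME), so pinning the prefix keeps the old spelling on the boot command line.

#include <linux/module.h>
#include <linux/moduleparam.h>

MODULE_ALIAS("rcutorture");		/* "modprobe rcutorture" still resolves */
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
#endif
#define MODULE_PARAM_PREFIX "rcutorture."	/* parameters stay rcutorture.* */

static int fqs_duration;
module_param(fqs_duration, int, 0444);
/* kernel boot line (built in):  rcutorture.fqs_duration=30 */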
diff --git a/kernel/rcutree.c b/kernel/rcu/tree.c
index 32618b3fe4e6..4c06ddfea7cd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcu/tree.c
@@ -41,6 +41,7 @@
41#include <linux/export.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/module.h>
44#include <linux/percpu.h> 45#include <linux/percpu.h>
45#include <linux/notifier.h> 46#include <linux/notifier.h>
46#include <linux/cpu.h> 47#include <linux/cpu.h>
@@ -56,17 +57,16 @@
56#include <linux/ftrace_event.h> 57#include <linux/ftrace_event.h>
57#include <linux/suspend.h> 58#include <linux/suspend.h>
58 59
59#include "rcutree.h" 60#include "tree.h"
60#include <trace/events/rcu.h> 61#include <trace/events/rcu.h>
61 62
62#include "rcu.h" 63#include "rcu.h"
63 64
64/* 65MODULE_ALIAS("rcutree");
65 * Strings used in tracepoints need to be exported via the 66#ifdef MODULE_PARAM_PREFIX
66 * tracing system such that tools like perf and trace-cmd can 67#undef MODULE_PARAM_PREFIX
67 * translate the string address pointers to actual text. 68#endif
68 */ 69#define MODULE_PARAM_PREFIX "rcutree."
69#define TPS(x) tracepoint_string(x)
70 70
71/* Data structures. */ 71/* Data structures. */
72 72
@@ -222,7 +222,7 @@ void rcu_note_context_switch(int cpu)
222} 222}
223EXPORT_SYMBOL_GPL(rcu_note_context_switch); 223EXPORT_SYMBOL_GPL(rcu_note_context_switch);
224 224
225DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 225static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, 226 .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
227 .dynticks = ATOMIC_INIT(1), 227 .dynticks = ATOMIC_INIT(1),
228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 228#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
@@ -371,7 +371,8 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
371{ 371{
372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); 372 trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting);
373 if (!user && !is_idle_task(current)) { 373 if (!user && !is_idle_task(current)) {
374 struct task_struct *idle = idle_task(smp_processor_id()); 374 struct task_struct *idle __maybe_unused =
375 idle_task(smp_processor_id());
375 376
376 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); 377 trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0);
377 ftrace_dump(DUMP_ORIG); 378 ftrace_dump(DUMP_ORIG);
@@ -407,7 +408,7 @@ static void rcu_eqs_enter(bool user)
407 long long oldval; 408 long long oldval;
408 struct rcu_dynticks *rdtp; 409 struct rcu_dynticks *rdtp;
409 410
410 rdtp = &__get_cpu_var(rcu_dynticks); 411 rdtp = this_cpu_ptr(&rcu_dynticks);
411 oldval = rdtp->dynticks_nesting; 412 oldval = rdtp->dynticks_nesting;
412 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); 413 WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0);
413 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 414 if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE)
@@ -435,7 +436,7 @@ void rcu_idle_enter(void)
435 436
436 local_irq_save(flags); 437 local_irq_save(flags);
437 rcu_eqs_enter(false); 438 rcu_eqs_enter(false);
438 rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); 439 rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0);
439 local_irq_restore(flags); 440 local_irq_restore(flags);
440} 441}
441EXPORT_SYMBOL_GPL(rcu_idle_enter); 442EXPORT_SYMBOL_GPL(rcu_idle_enter);
@@ -478,7 +479,7 @@ void rcu_irq_exit(void)
478 struct rcu_dynticks *rdtp; 479 struct rcu_dynticks *rdtp;
479 480
480 local_irq_save(flags); 481 local_irq_save(flags);
481 rdtp = &__get_cpu_var(rcu_dynticks); 482 rdtp = this_cpu_ptr(&rcu_dynticks);
482 oldval = rdtp->dynticks_nesting; 483 oldval = rdtp->dynticks_nesting;
483 rdtp->dynticks_nesting--; 484 rdtp->dynticks_nesting--;
484 WARN_ON_ONCE(rdtp->dynticks_nesting < 0); 485 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
@@ -508,7 +509,8 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
508 rcu_cleanup_after_idle(smp_processor_id()); 509 rcu_cleanup_after_idle(smp_processor_id());
509 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 510 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting);
510 if (!user && !is_idle_task(current)) { 511 if (!user && !is_idle_task(current)) {
511 struct task_struct *idle = idle_task(smp_processor_id()); 512 struct task_struct *idle __maybe_unused =
513 idle_task(smp_processor_id());
512 514
513 trace_rcu_dyntick(TPS("Error on exit: not idle task"), 515 trace_rcu_dyntick(TPS("Error on exit: not idle task"),
514 oldval, rdtp->dynticks_nesting); 516 oldval, rdtp->dynticks_nesting);
@@ -528,7 +530,7 @@ static void rcu_eqs_exit(bool user)
528 struct rcu_dynticks *rdtp; 530 struct rcu_dynticks *rdtp;
529 long long oldval; 531 long long oldval;
530 532
531 rdtp = &__get_cpu_var(rcu_dynticks); 533 rdtp = this_cpu_ptr(&rcu_dynticks);
532 oldval = rdtp->dynticks_nesting; 534 oldval = rdtp->dynticks_nesting;
533 WARN_ON_ONCE(oldval < 0); 535 WARN_ON_ONCE(oldval < 0);
534 if (oldval & DYNTICK_TASK_NEST_MASK) 536 if (oldval & DYNTICK_TASK_NEST_MASK)
@@ -555,7 +557,7 @@ void rcu_idle_exit(void)
555 557
556 local_irq_save(flags); 558 local_irq_save(flags);
557 rcu_eqs_exit(false); 559 rcu_eqs_exit(false);
558 rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); 560 rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0);
559 local_irq_restore(flags); 561 local_irq_restore(flags);
560} 562}
561EXPORT_SYMBOL_GPL(rcu_idle_exit); 563EXPORT_SYMBOL_GPL(rcu_idle_exit);
@@ -599,7 +601,7 @@ void rcu_irq_enter(void)
599 long long oldval; 601 long long oldval;
600 602
601 local_irq_save(flags); 603 local_irq_save(flags);
602 rdtp = &__get_cpu_var(rcu_dynticks); 604 rdtp = this_cpu_ptr(&rcu_dynticks);
603 oldval = rdtp->dynticks_nesting; 605 oldval = rdtp->dynticks_nesting;
604 rdtp->dynticks_nesting++; 606 rdtp->dynticks_nesting++;
605 WARN_ON_ONCE(rdtp->dynticks_nesting == 0); 607 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
@@ -620,7 +622,7 @@ void rcu_irq_enter(void)
620 */ 622 */
621void rcu_nmi_enter(void) 623void rcu_nmi_enter(void)
622{ 624{
623 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 625 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
624 626
625 if (rdtp->dynticks_nmi_nesting == 0 && 627 if (rdtp->dynticks_nmi_nesting == 0 &&
626 (atomic_read(&rdtp->dynticks) & 0x1)) 628 (atomic_read(&rdtp->dynticks) & 0x1))
@@ -642,7 +644,7 @@ void rcu_nmi_enter(void)
642 */ 644 */
643void rcu_nmi_exit(void) 645void rcu_nmi_exit(void)
644{ 646{
645 struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); 647 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
646 648
647 if (rdtp->dynticks_nmi_nesting == 0 || 649 if (rdtp->dynticks_nmi_nesting == 0 ||
648 --rdtp->dynticks_nmi_nesting != 0) 650 --rdtp->dynticks_nmi_nesting != 0)
@@ -655,21 +657,34 @@ void rcu_nmi_exit(void)
655} 657}
656 658
657/** 659/**
658 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle 660 * __rcu_is_watching - are RCU read-side critical sections safe?
661 *
662 * Return true if RCU is watching the running CPU, which means that
663 * this CPU can safely enter RCU read-side critical sections. Unlike
664 * rcu_is_watching(), the caller of __rcu_is_watching() must have at
665 * least disabled preemption.
666 */
667bool __rcu_is_watching(void)
668{
669 return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1;
670}
671
672/**
673 * rcu_is_watching - see if RCU thinks that the current CPU is idle
659 * 674 *
660 * If the current CPU is in its idle loop and is neither in an interrupt 675 * If the current CPU is in its idle loop and is neither in an interrupt
661 * or NMI handler, return true. 676 * or NMI handler, return true.
662 */ 677 */
663int rcu_is_cpu_idle(void) 678bool rcu_is_watching(void)
664{ 679{
665 int ret; 680 int ret;
666 681
667 preempt_disable(); 682 preempt_disable();
668 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0; 683 ret = __rcu_is_watching();
669 preempt_enable(); 684 preempt_enable();
670 return ret; 685 return ret;
671} 686}
672EXPORT_SYMBOL(rcu_is_cpu_idle); 687EXPORT_SYMBOL_GPL(rcu_is_watching);
673 688
674#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 689#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
675 690
@@ -703,7 +718,7 @@ bool rcu_lockdep_current_cpu_online(void)
703 if (in_nmi()) 718 if (in_nmi())
704 return 1; 719 return 1;
705 preempt_disable(); 720 preempt_disable();
706 rdp = &__get_cpu_var(rcu_sched_data); 721 rdp = this_cpu_ptr(&rcu_sched_data);
707 rnp = rdp->mynode; 722 rnp = rdp->mynode;
708 ret = (rdp->grpmask & rnp->qsmaskinit) || 723 ret = (rdp->grpmask & rnp->qsmaskinit) ||
709 !rcu_scheduler_fully_active; 724 !rcu_scheduler_fully_active;
@@ -723,7 +738,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
723 */ 738 */
724static int rcu_is_cpu_rrupt_from_idle(void) 739static int rcu_is_cpu_rrupt_from_idle(void)
725{ 740{
726 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1; 741 return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
727} 742}
728 743
729/* 744/*
@@ -802,8 +817,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
802 817
803static void record_gp_stall_check_time(struct rcu_state *rsp) 818static void record_gp_stall_check_time(struct rcu_state *rsp)
804{ 819{
805 rsp->gp_start = jiffies; 820 unsigned long j = ACCESS_ONCE(jiffies);
806 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 821
822 rsp->gp_start = j;
823 smp_wmb(); /* Record start time before stall time. */
824 rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
807} 825}
808 826
809/* 827/*
@@ -898,6 +916,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
898 force_quiescent_state(rsp); /* Kick them all. */ 916 force_quiescent_state(rsp); /* Kick them all. */
899} 917}
900 918
919/*
920 * This function really isn't for public consumption, but RCU is special in
921 * that context switches can allow the state machine to make progress.
922 */
923extern void resched_cpu(int cpu);
924
901static void print_cpu_stall(struct rcu_state *rsp) 925static void print_cpu_stall(struct rcu_state *rsp)
902{ 926{
903 int cpu; 927 int cpu;
@@ -927,22 +951,60 @@ static void print_cpu_stall(struct rcu_state *rsp)
927 3 * rcu_jiffies_till_stall_check() + 3; 951 3 * rcu_jiffies_till_stall_check() + 3;
928 raw_spin_unlock_irqrestore(&rnp->lock, flags); 952 raw_spin_unlock_irqrestore(&rnp->lock, flags);
929 953
930 set_need_resched(); /* kick ourselves to get things going. */ 954 /*
955 * Attempt to revive the RCU machinery by forcing a context switch.
956 *
957 * A context switch would normally allow the RCU state machine to make
958 * progress and it could be we're stuck in kernel space without context
959 * switches for an entirely unreasonable amount of time.
960 */
961 resched_cpu(smp_processor_id());
931} 962}
932 963
933static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 964static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
934{ 965{
966 unsigned long completed;
967 unsigned long gpnum;
968 unsigned long gps;
935 unsigned long j; 969 unsigned long j;
936 unsigned long js; 970 unsigned long js;
937 struct rcu_node *rnp; 971 struct rcu_node *rnp;
938 972
939 if (rcu_cpu_stall_suppress) 973 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
940 return; 974 return;
941 j = ACCESS_ONCE(jiffies); 975 j = ACCESS_ONCE(jiffies);
976
977 /*
978 * Lots of memory barriers to reject false positives.
979 *
980 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
981 * then rsp->gp_start, and finally rsp->completed. These values
982 * are updated in the opposite order with memory barriers (or
983 * equivalent) during grace-period initialization and cleanup.
 984 * Now, a false positive can occur if we get a new value of
 985 * rsp->gp_start and an old value of rsp->jiffies_stall. But given
986 * the memory barriers, the only way that this can happen is if one
987 * grace period ends and another starts between these two fetches.
988 * Detect this by comparing rsp->completed with the previous fetch
989 * from rsp->gpnum.
990 *
991 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
992 * and rsp->gp_start suffice to forestall false positives.
993 */
994 gpnum = ACCESS_ONCE(rsp->gpnum);
995 smp_rmb(); /* Pick up ->gpnum first... */
942 js = ACCESS_ONCE(rsp->jiffies_stall); 996 js = ACCESS_ONCE(rsp->jiffies_stall);
997 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
998 gps = ACCESS_ONCE(rsp->gp_start);
999 smp_rmb(); /* ...and finally ->gp_start before ->completed. */
1000 completed = ACCESS_ONCE(rsp->completed);
1001 if (ULONG_CMP_GE(completed, gpnum) ||
1002 ULONG_CMP_LT(j, js) ||
1003 ULONG_CMP_GE(gps, js))
1004 return; /* No stall or GP completed since entering function. */
943 rnp = rdp->mynode; 1005 rnp = rdp->mynode;
944 if (rcu_gp_in_progress(rsp) && 1006 if (rcu_gp_in_progress(rsp) &&
945 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 1007 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
946 1008
947 /* We haven't checked in, so go dump stack. */ 1009 /* We haven't checked in, so go dump stack. */
948 print_cpu_stall(rsp); 1010 print_cpu_stall(rsp);
@@ -1297,7 +1359,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
1297} 1359}
1298 1360
1299/* 1361/*
1300 * Initialize a new grace period. 1362 * Initialize a new grace period. Return 0 if no grace period required.
1301 */ 1363 */
1302static int rcu_gp_init(struct rcu_state *rsp) 1364static int rcu_gp_init(struct rcu_state *rsp)
1303{ 1365{
@@ -1306,18 +1368,27 @@ static int rcu_gp_init(struct rcu_state *rsp)
1306 1368
1307 rcu_bind_gp_kthread(); 1369 rcu_bind_gp_kthread();
1308 raw_spin_lock_irq(&rnp->lock); 1370 raw_spin_lock_irq(&rnp->lock);
1371 if (rsp->gp_flags == 0) {
1372 /* Spurious wakeup, tell caller to go back to sleep. */
1373 raw_spin_unlock_irq(&rnp->lock);
1374 return 0;
1375 }
1309 rsp->gp_flags = 0; /* Clear all flags: New grace period. */ 1376 rsp->gp_flags = 0; /* Clear all flags: New grace period. */
1310 1377
1311 if (rcu_gp_in_progress(rsp)) { 1378 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) {
1312 /* Grace period already in progress, don't start another. */ 1379 /*
1380 * Grace period already in progress, don't start another.
1381 * Not supposed to be able to happen.
1382 */
1313 raw_spin_unlock_irq(&rnp->lock); 1383 raw_spin_unlock_irq(&rnp->lock);
1314 return 0; 1384 return 0;
1315 } 1385 }
1316 1386
1317 /* Advance to a new grace period and initialize state. */ 1387 /* Advance to a new grace period and initialize state. */
1388 record_gp_stall_check_time(rsp);
1389 smp_wmb(); /* Record GP times before starting GP. */
1318 rsp->gpnum++; 1390 rsp->gpnum++;
1319 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1391 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1320 record_gp_stall_check_time(rsp);
1321 raw_spin_unlock_irq(&rnp->lock); 1392 raw_spin_unlock_irq(&rnp->lock);
1322 1393
1323 /* Exclude any concurrent CPU-hotplug operations. */ 1394 /* Exclude any concurrent CPU-hotplug operations. */
@@ -1366,7 +1437,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1366/* 1437/*
1367 * Do one round of quiescent-state forcing. 1438 * Do one round of quiescent-state forcing.
1368 */ 1439 */
1369int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1440static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1370{ 1441{
1371 int fqs_state = fqs_state_in; 1442 int fqs_state = fqs_state_in;
1372 bool isidle = false; 1443 bool isidle = false;
@@ -1451,8 +1522,12 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1451 rsp->fqs_state = RCU_GP_IDLE; 1522 rsp->fqs_state = RCU_GP_IDLE;
1452 rdp = this_cpu_ptr(rsp->rda); 1523 rdp = this_cpu_ptr(rsp->rda);
1453 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ 1524 rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */
1454 if (cpu_needs_another_gp(rsp, rdp)) 1525 if (cpu_needs_another_gp(rsp, rdp)) {
1455 rsp->gp_flags = 1; 1526 rsp->gp_flags = RCU_GP_FLAG_INIT;
1527 trace_rcu_grace_period(rsp->name,
1528 ACCESS_ONCE(rsp->gpnum),
1529 TPS("newreq"));
1530 }
1456 raw_spin_unlock_irq(&rnp->lock); 1531 raw_spin_unlock_irq(&rnp->lock);
1457} 1532}
1458 1533
@@ -1462,6 +1537,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1462static int __noreturn rcu_gp_kthread(void *arg) 1537static int __noreturn rcu_gp_kthread(void *arg)
1463{ 1538{
1464 int fqs_state; 1539 int fqs_state;
1540 int gf;
1465 unsigned long j; 1541 unsigned long j;
1466 int ret; 1542 int ret;
1467 struct rcu_state *rsp = arg; 1543 struct rcu_state *rsp = arg;
@@ -1471,14 +1547,19 @@ static int __noreturn rcu_gp_kthread(void *arg)
1471 1547
1472 /* Handle grace-period start. */ 1548 /* Handle grace-period start. */
1473 for (;;) { 1549 for (;;) {
1550 trace_rcu_grace_period(rsp->name,
1551 ACCESS_ONCE(rsp->gpnum),
1552 TPS("reqwait"));
1474 wait_event_interruptible(rsp->gp_wq, 1553 wait_event_interruptible(rsp->gp_wq,
1475 rsp->gp_flags & 1554 ACCESS_ONCE(rsp->gp_flags) &
1476 RCU_GP_FLAG_INIT); 1555 RCU_GP_FLAG_INIT);
1477 if ((rsp->gp_flags & RCU_GP_FLAG_INIT) && 1556 if (rcu_gp_init(rsp))
1478 rcu_gp_init(rsp))
1479 break; 1557 break;
1480 cond_resched(); 1558 cond_resched();
1481 flush_signals(current); 1559 flush_signals(current);
1560 trace_rcu_grace_period(rsp->name,
1561 ACCESS_ONCE(rsp->gpnum),
1562 TPS("reqwaitsig"));
1482 } 1563 }
1483 1564
1484 /* Handle quiescent-state forcing. */ 1565 /* Handle quiescent-state forcing. */
@@ -1488,10 +1569,16 @@ static int __noreturn rcu_gp_kthread(void *arg)
1488 j = HZ; 1569 j = HZ;
1489 jiffies_till_first_fqs = HZ; 1570 jiffies_till_first_fqs = HZ;
1490 } 1571 }
1572 ret = 0;
1491 for (;;) { 1573 for (;;) {
1492 rsp->jiffies_force_qs = jiffies + j; 1574 if (!ret)
1575 rsp->jiffies_force_qs = jiffies + j;
1576 trace_rcu_grace_period(rsp->name,
1577 ACCESS_ONCE(rsp->gpnum),
1578 TPS("fqswait"));
1493 ret = wait_event_interruptible_timeout(rsp->gp_wq, 1579 ret = wait_event_interruptible_timeout(rsp->gp_wq,
1494 (rsp->gp_flags & RCU_GP_FLAG_FQS) || 1580 ((gf = ACCESS_ONCE(rsp->gp_flags)) &
1581 RCU_GP_FLAG_FQS) ||
1495 (!ACCESS_ONCE(rnp->qsmask) && 1582 (!ACCESS_ONCE(rnp->qsmask) &&
1496 !rcu_preempt_blocked_readers_cgp(rnp)), 1583 !rcu_preempt_blocked_readers_cgp(rnp)),
1497 j); 1584 j);
@@ -1500,13 +1587,23 @@ static int __noreturn rcu_gp_kthread(void *arg)
1500 !rcu_preempt_blocked_readers_cgp(rnp)) 1587 !rcu_preempt_blocked_readers_cgp(rnp))
1501 break; 1588 break;
1502 /* If time for quiescent-state forcing, do it. */ 1589 /* If time for quiescent-state forcing, do it. */
1503 if (ret == 0 || (rsp->gp_flags & RCU_GP_FLAG_FQS)) { 1590 if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) ||
1591 (gf & RCU_GP_FLAG_FQS)) {
1592 trace_rcu_grace_period(rsp->name,
1593 ACCESS_ONCE(rsp->gpnum),
1594 TPS("fqsstart"));
1504 fqs_state = rcu_gp_fqs(rsp, fqs_state); 1595 fqs_state = rcu_gp_fqs(rsp, fqs_state);
1596 trace_rcu_grace_period(rsp->name,
1597 ACCESS_ONCE(rsp->gpnum),
1598 TPS("fqsend"));
1505 cond_resched(); 1599 cond_resched();
1506 } else { 1600 } else {
1507 /* Deal with stray signal. */ 1601 /* Deal with stray signal. */
1508 cond_resched(); 1602 cond_resched();
1509 flush_signals(current); 1603 flush_signals(current);
1604 trace_rcu_grace_period(rsp->name,
1605 ACCESS_ONCE(rsp->gpnum),
1606 TPS("fqswaitsig"));
1510 } 1607 }
1511 j = jiffies_till_next_fqs; 1608 j = jiffies_till_next_fqs;
1512 if (j > HZ) { 1609 if (j > HZ) {
@@ -1554,6 +1651,8 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp,
1554 return; 1651 return;
1555 } 1652 }
1556 rsp->gp_flags = RCU_GP_FLAG_INIT; 1653 rsp->gp_flags = RCU_GP_FLAG_INIT;
1654 trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum),
1655 TPS("newreq"));
1557 1656
1558 /* 1657 /*
1559 * We can't do wakeups while holding the rnp->lock, as that 1658 * We can't do wakeups while holding the rnp->lock, as that
@@ -2255,7 +2354,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
2255 * If called from an extended quiescent state, invoke the RCU 2354 * If called from an extended quiescent state, invoke the RCU
2256 * core in order to force a re-evaluation of RCU's idleness. 2355 * core in order to force a re-evaluation of RCU's idleness.
2257 */ 2356 */
2258 if (rcu_is_cpu_idle() && cpu_online(smp_processor_id())) 2357 if (!rcu_is_watching() && cpu_online(smp_processor_id()))
2259 invoke_rcu_core(); 2358 invoke_rcu_core();
2260 2359
2261 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 2360 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */
@@ -2725,10 +2824,13 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy)
2725 2824
2726 for_each_rcu_flavor(rsp) { 2825 for_each_rcu_flavor(rsp) {
2727 rdp = per_cpu_ptr(rsp->rda, cpu); 2826 rdp = per_cpu_ptr(rsp->rda, cpu);
2728 if (rdp->qlen != rdp->qlen_lazy) 2827 if (!rdp->nxtlist)
2828 continue;
2829 hc = true;
2830 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) {
2729 al = false; 2831 al = false;
2730 if (rdp->nxtlist) 2832 break;
2731 hc = true; 2833 }
2732 } 2834 }
2733 if (all_lazy) 2835 if (all_lazy)
2734 *all_lazy = al; 2836 *all_lazy = al;
@@ -3216,7 +3318,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
3216 3318
3217/* 3319/*
3218 * Compute the rcu_node tree geometry from kernel parameters. This cannot 3320 * Compute the rcu_node tree geometry from kernel parameters. This cannot
3219 * replace the definitions in rcutree.h because those are needed to size 3321 * replace the definitions in tree.h because those are needed to size
3220 * the ->node array in the rcu_state structure. 3322 * the ->node array in the rcu_state structure.
3221 */ 3323 */
3222static void __init rcu_init_geometry(void) 3324static void __init rcu_init_geometry(void)
@@ -3295,8 +3397,8 @@ void __init rcu_init(void)
3295 3397
3296 rcu_bootup_announce(); 3398 rcu_bootup_announce();
3297 rcu_init_geometry(); 3399 rcu_init_geometry();
3298 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3299 rcu_init_one(&rcu_bh_state, &rcu_bh_data); 3400 rcu_init_one(&rcu_bh_state, &rcu_bh_data);
3401 rcu_init_one(&rcu_sched_state, &rcu_sched_data);
3300 __rcu_init_preempt(); 3402 __rcu_init_preempt();
3301 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 3403 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
3302 3404
@@ -3311,4 +3413,4 @@ void __init rcu_init(void)
3311 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 3413 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
3312} 3414}
3313 3415
3314#include "rcutree_plugin.h" 3416#include "tree_plugin.h"
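Besides the rename to kernel/rcu/tree.c, the hunks above repeatedly convert &__get_cpu_var(var) into this_cpu_ptr(&var) and reads of __get_cpu_var(x).field into __this_cpu_read(x.field). A hedged sketch of that per-CPU accessor pattern; the structure and field names are illustrative stand-ins, not the real rcu_dynticks, and as in the patch the callers are assumed to run with interrupts disabled so the per-CPU reference stays stable:

#include <linux/percpu.h>

struct demo_dynticks { long long nesting; };
static DEFINE_PER_CPU(struct demo_dynticks, demo_dynticks);

static void demo_enter(void)
{
	/* old style: struct demo_dynticks *d = &__get_cpu_var(demo_dynticks); */
	struct demo_dynticks *d = this_cpu_ptr(&demo_dynticks);

	d->nesting++;
}

static int demo_from_idle(void)
{
	/* old style: return __get_cpu_var(demo_dynticks).nesting <= 1; */
	return __this_cpu_read(demo_dynticks.nesting) <= 1;
}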
diff --git a/kernel/rcutree.h b/kernel/rcu/tree.h
index 5f97eab602cd..52be957c9fe2 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcu/tree.h
@@ -104,6 +104,8 @@ struct rcu_dynticks {
104 /* idle-period nonlazy_posted snapshot. */ 104 /* idle-period nonlazy_posted snapshot. */
105 unsigned long last_accelerate; 105 unsigned long last_accelerate;
106 /* Last jiffy CBs were accelerated. */ 106 /* Last jiffy CBs were accelerated. */
107 unsigned long last_advance_all;
108 /* Last jiffy CBs were all advanced. */
107 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ 109 int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */
108#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 110#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
109}; 111};
diff --git a/kernel/rcutree_plugin.h b/kernel/rcu/tree_plugin.h
index 130c97b027f2..3822ac0c4b27 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -28,7 +28,7 @@
28#include <linux/gfp.h> 28#include <linux/gfp.h>
29#include <linux/oom.h> 29#include <linux/oom.h>
30#include <linux/smpboot.h> 30#include <linux/smpboot.h>
31#include "time/tick-internal.h" 31#include "../time/tick-internal.h"
32 32
33#define RCU_KTHREAD_PRIO 1 33#define RCU_KTHREAD_PRIO 1
34 34
@@ -96,10 +96,15 @@ static void __init rcu_bootup_announce_oddness(void)
96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ 96#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
97#ifdef CONFIG_RCU_NOCB_CPU_ALL 97#ifdef CONFIG_RCU_NOCB_CPU_ALL
98 pr_info("\tOffload RCU callbacks from all CPUs\n"); 98 pr_info("\tOffload RCU callbacks from all CPUs\n");
99 cpumask_setall(rcu_nocb_mask); 99 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ 100#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ 101#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
102 if (have_rcu_nocb_mask) { 102 if (have_rcu_nocb_mask) {
103 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
104 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
105 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
106 rcu_nocb_mask);
107 }
103 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); 108 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
104 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); 109 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
105 if (rcu_nocb_poll) 110 if (rcu_nocb_poll)
@@ -660,7 +665,7 @@ static void rcu_preempt_check_callbacks(int cpu)
660 665
661static void rcu_preempt_do_callbacks(void) 666static void rcu_preempt_do_callbacks(void)
662{ 667{
663 rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); 668 rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
664} 669}
665 670
666#endif /* #ifdef CONFIG_RCU_BOOST */ 671#endif /* #ifdef CONFIG_RCU_BOOST */
@@ -1128,7 +1133,7 @@ void exit_rcu(void)
1128 1133
1129#ifdef CONFIG_RCU_BOOST 1134#ifdef CONFIG_RCU_BOOST
1130 1135
1131#include "rtmutex_common.h" 1136#include "../rtmutex_common.h"
1132 1137
1133#ifdef CONFIG_RCU_TRACE 1138#ifdef CONFIG_RCU_TRACE
1134 1139
@@ -1332,7 +1337,7 @@ static void invoke_rcu_callbacks_kthread(void)
1332 */ 1337 */
1333static bool rcu_is_callbacks_kthread(void) 1338static bool rcu_is_callbacks_kthread(void)
1334{ 1339{
1335 return __get_cpu_var(rcu_cpu_kthread_task) == current; 1340 return __this_cpu_read(rcu_cpu_kthread_task) == current;
1336} 1341}
1337 1342
1338#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) 1343#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
@@ -1382,8 +1387,8 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1382 1387
1383static void rcu_kthread_do_work(void) 1388static void rcu_kthread_do_work(void)
1384{ 1389{
1385 rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); 1390 rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
1386 rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1391 rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
1387 rcu_preempt_do_callbacks(); 1392 rcu_preempt_do_callbacks();
1388} 1393}
1389 1394
@@ -1402,7 +1407,7 @@ static void rcu_cpu_kthread_park(unsigned int cpu)
1402 1407
1403static int rcu_cpu_kthread_should_run(unsigned int cpu) 1408static int rcu_cpu_kthread_should_run(unsigned int cpu)
1404{ 1409{
1405 return __get_cpu_var(rcu_cpu_has_work); 1410 return __this_cpu_read(rcu_cpu_has_work);
1406} 1411}
1407 1412
1408/* 1413/*
@@ -1412,8 +1417,8 @@ static int rcu_cpu_kthread_should_run(unsigned int cpu)
1412 */ 1417 */
1413static void rcu_cpu_kthread(unsigned int cpu) 1418static void rcu_cpu_kthread(unsigned int cpu)
1414{ 1419{
1415 unsigned int *statusp = &__get_cpu_var(rcu_cpu_kthread_status); 1420 unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
1416 char work, *workp = &__get_cpu_var(rcu_cpu_has_work); 1421 char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
1417 int spincnt; 1422 int spincnt;
1418 1423
1419 for (spincnt = 0; spincnt < 10; spincnt++) { 1424 for (spincnt = 0; spincnt < 10; spincnt++) {
@@ -1630,17 +1635,23 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644);
1630extern int tick_nohz_enabled; 1635extern int tick_nohz_enabled;
1631 1636
1632/* 1637/*
1633 * Try to advance callbacks for all flavors of RCU on the current CPU. 1638 * Try to advance callbacks for all flavors of RCU on the current CPU, but
1634 * Afterwards, if there are any callbacks ready for immediate invocation, 1639 * only if it has been awhile since the last time we did so. Afterwards,
1635 * return true. 1640 * if there are any callbacks ready for immediate invocation, return true.
1636 */ 1641 */
1637static bool rcu_try_advance_all_cbs(void) 1642static bool rcu_try_advance_all_cbs(void)
1638{ 1643{
1639 bool cbs_ready = false; 1644 bool cbs_ready = false;
1640 struct rcu_data *rdp; 1645 struct rcu_data *rdp;
1646 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1641 struct rcu_node *rnp; 1647 struct rcu_node *rnp;
1642 struct rcu_state *rsp; 1648 struct rcu_state *rsp;
1643 1649
1650 /* Exit early if we advanced recently. */
1651 if (jiffies == rdtp->last_advance_all)
1652 return 0;
1653 rdtp->last_advance_all = jiffies;
1654
1644 for_each_rcu_flavor(rsp) { 1655 for_each_rcu_flavor(rsp) {
1645 rdp = this_cpu_ptr(rsp->rda); 1656 rdp = this_cpu_ptr(rsp->rda);
1646 rnp = rdp->mynode; 1657 rnp = rdp->mynode;
@@ -1739,6 +1750,8 @@ static void rcu_prepare_for_idle(int cpu)
1739 */ 1750 */
1740 if (rdtp->all_lazy && 1751 if (rdtp->all_lazy &&
1741 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { 1752 rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1753 rdtp->all_lazy = false;
1754 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1742 invoke_rcu_core(); 1755 invoke_rcu_core();
1743 return; 1756 return;
1744 } 1757 }
@@ -1768,17 +1781,11 @@ static void rcu_prepare_for_idle(int cpu)
1768 */ 1781 */
1769static void rcu_cleanup_after_idle(int cpu) 1782static void rcu_cleanup_after_idle(int cpu)
1770{ 1783{
1771 struct rcu_data *rdp;
1772 struct rcu_state *rsp;
1773 1784
1774 if (rcu_is_nocb_cpu(cpu)) 1785 if (rcu_is_nocb_cpu(cpu))
1775 return; 1786 return;
1776 rcu_try_advance_all_cbs(); 1787 if (rcu_try_advance_all_cbs())
1777 for_each_rcu_flavor(rsp) { 1788 invoke_rcu_core();
1778 rdp = per_cpu_ptr(rsp->rda, cpu);
1779 if (cpu_has_callbacks_ready_to_invoke(rdp))
1780 invoke_rcu_core();
1781 }
1782} 1789}
1783 1790
1784/* 1791/*
@@ -2108,15 +2115,22 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2108 2115
2109 /* If we are not being polled and there is a kthread, awaken it ... */ 2116 /* If we are not being polled and there is a kthread, awaken it ... */
2110 t = ACCESS_ONCE(rdp->nocb_kthread); 2117 t = ACCESS_ONCE(rdp->nocb_kthread);
2111 if (rcu_nocb_poll | !t) 2118 if (rcu_nocb_poll || !t) {
2119 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2120 TPS("WakeNotPoll"));
2112 return; 2121 return;
2122 }
2113 len = atomic_long_read(&rdp->nocb_q_count); 2123 len = atomic_long_read(&rdp->nocb_q_count);
2114 if (old_rhpp == &rdp->nocb_head) { 2124 if (old_rhpp == &rdp->nocb_head) {
2115 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ 2125 wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */
2116 rdp->qlen_last_fqs_check = 0; 2126 rdp->qlen_last_fqs_check = 0;
2127 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty"));
2117 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2118 wake_up_process(t); /* ... or if many callbacks queued. */ 2129 wake_up_process(t); /* ... or if many callbacks queued. */
2119 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2130 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2131 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2132 } else {
2133 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2120 } 2134 }
2121 return; 2135 return;
2122} 2136}
@@ -2140,10 +2154,12 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2140 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2154 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2141 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2155 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2142 (unsigned long)rhp->func, 2156 (unsigned long)rhp->func,
2143 rdp->qlen_lazy, rdp->qlen); 2157 -atomic_long_read(&rdp->nocb_q_count_lazy),
2158 -atomic_long_read(&rdp->nocb_q_count));
2144 else 2159 else
2145 trace_rcu_callback(rdp->rsp->name, rhp, 2160 trace_rcu_callback(rdp->rsp->name, rhp,
2146 rdp->qlen_lazy, rdp->qlen); 2161 -atomic_long_read(&rdp->nocb_q_count_lazy),
2162 -atomic_long_read(&rdp->nocb_q_count));
2147 return 1; 2163 return 1;
2148} 2164}
2149 2165
@@ -2221,6 +2237,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2221static int rcu_nocb_kthread(void *arg) 2237static int rcu_nocb_kthread(void *arg)
2222{ 2238{
2223 int c, cl; 2239 int c, cl;
2240 bool firsttime = 1;
2224 struct rcu_head *list; 2241 struct rcu_head *list;
2225 struct rcu_head *next; 2242 struct rcu_head *next;
2226 struct rcu_head **tail; 2243 struct rcu_head **tail;
@@ -2229,14 +2246,27 @@ static int rcu_nocb_kthread(void *arg)
2229 /* Each pass through this loop invokes one batch of callbacks */ 2246 /* Each pass through this loop invokes one batch of callbacks */
2230 for (;;) { 2247 for (;;) {
2231 /* If not polling, wait for next batch of callbacks. */ 2248 /* If not polling, wait for next batch of callbacks. */
2232 if (!rcu_nocb_poll) 2249 if (!rcu_nocb_poll) {
2250 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2251 TPS("Sleep"));
2233 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); 2252 wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head);
2253 } else if (firsttime) {
2254 firsttime = 0;
2255 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2256 TPS("Poll"));
2257 }
2234 list = ACCESS_ONCE(rdp->nocb_head); 2258 list = ACCESS_ONCE(rdp->nocb_head);
2235 if (!list) { 2259 if (!list) {
2260 if (!rcu_nocb_poll)
2261 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2262 TPS("WokeEmpty"));
2236 schedule_timeout_interruptible(1); 2263 schedule_timeout_interruptible(1);
2237 flush_signals(current); 2264 flush_signals(current);
2238 continue; 2265 continue;
2239 } 2266 }
2267 firsttime = 1;
2268 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2269 TPS("WokeNonEmpty"));
2240 2270
2241 /* 2271 /*
2242 * Extract queued callbacks, update counts, and wait 2272 * Extract queued callbacks, update counts, and wait
@@ -2257,7 +2287,11 @@ static int rcu_nocb_kthread(void *arg)
2257 next = list->next; 2287 next = list->next;
2258 /* Wait for enqueuing to complete, if needed. */ 2288 /* Wait for enqueuing to complete, if needed. */
2259 while (next == NULL && &list->next != tail) { 2289 while (next == NULL && &list->next != tail) {
2290 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2291 TPS("WaitQueue"));
2260 schedule_timeout_interruptible(1); 2292 schedule_timeout_interruptible(1);
2293 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2294 TPS("WokeQueue"));
2261 next = list->next; 2295 next = list->next;
2262 } 2296 }
2263 debug_rcu_head_unqueue(list); 2297 debug_rcu_head_unqueue(list);
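One user-visible detail in the tree_plugin.h hunk above: an rcu_nocbs= boot parameter naming CPUs that cannot exist is now reported and trimmed to cpu_possible_mask, and CONFIG_RCU_NOCB_CPU_ALL copies cpu_possible_mask instead of setting every bit. A sketch of that validation pattern with a made-up mask name:

#include <linux/cpumask.h>
#include <linux/printk.h>

static struct cpumask demo_nocb_mask;	/* illustrative stand-in */

static void demo_validate_mask(void)
{
	if (!cpumask_subset(&demo_nocb_mask, cpu_possible_mask)) {
		pr_info("\tNote: mask contains nonexistent CPUs.\n");
		/* keep only the CPUs that can actually exist */
		cpumask_and(&demo_nocb_mask, cpu_possible_mask,
			    &demo_nocb_mask);
	}
}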
diff --git a/kernel/rcutree_trace.c b/kernel/rcu/tree_trace.c
index cf6c17412932..3596797b7e46 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -44,7 +44,7 @@
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#define RCU_TREE_NONCORE 46#define RCU_TREE_NONCORE
47#include "rcutree.h" 47#include "tree.h"
48 48
49static int r_open(struct inode *inode, struct file *file, 49static int r_open(struct inode *inode, struct file *file,
50 const struct seq_operations *op) 50 const struct seq_operations *op)
diff --git a/kernel/rcupdate.c b/kernel/rcu/update.c
index b02a339836b4..6cb3dff89e2b 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcu/update.c
@@ -53,6 +53,12 @@
53 53
54#include "rcu.h" 54#include "rcu.h"
55 55
56MODULE_ALIAS("rcupdate");
57#ifdef MODULE_PARAM_PREFIX
58#undef MODULE_PARAM_PREFIX
59#endif
60#define MODULE_PARAM_PREFIX "rcupdate."
61
56module_param(rcu_expedited, int, 0); 62module_param(rcu_expedited, int, 0);
57 63
58#ifdef CONFIG_PREEMPT_RCU 64#ifdef CONFIG_PREEMPT_RCU
@@ -148,7 +154,7 @@ int rcu_read_lock_bh_held(void)
148{ 154{
149 if (!debug_lockdep_rcu_enabled()) 155 if (!debug_lockdep_rcu_enabled())
150 return 1; 156 return 1;
151 if (rcu_is_cpu_idle()) 157 if (!rcu_is_watching())
152 return 0; 158 return 0;
153 if (!rcu_lockdep_current_cpu_online()) 159 if (!rcu_lockdep_current_cpu_online())
154 return 0; 160 return 0;
@@ -298,7 +304,7 @@ EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
298#endif 304#endif
299 305
300int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ 306int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */
301int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; 307static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT;
302 308
303module_param(rcu_cpu_stall_suppress, int, 0644); 309module_param(rcu_cpu_stall_suppress, int, 0644);
304module_param(rcu_cpu_stall_timeout, int, 0644); 310module_param(rcu_cpu_stall_timeout, int, 0644);
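With rcu_is_cpu_idle() retired, the debug helpers in update.c now phrase the check positively: RCU read-side critical sections are only safe while RCU is watching the current CPU. A hedged sketch of such a check written against the new helper (demo_assert_rcu_safe() is a made-up name, not part of the patch):

#include <linux/rcupdate.h>
#include <linux/bug.h>

static void demo_assert_rcu_safe(void)
{
	/* rcu_is_watching() is false in the dyntick-idle state, where a
	 * rcu_read_lock() section would be invisible to the grace-period
	 * machinery. */
	WARN_ON_ONCE(!rcu_is_watching());
}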
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 54adcf35f495..7b621409cf15 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -12,6 +12,7 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif 12endif
13 13
14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o 14obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
15obj-y += wait.o completion.o
15obj-$(CONFIG_SMP) += cpupri.o 16obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 17obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 18obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
new file mode 100644
index 000000000000..a63f4dc27909
--- /dev/null
+++ b/kernel/sched/completion.c
@@ -0,0 +1,299 @@
1/*
2 * Generic wait-for-completion handler;
3 *
 4 * It differs from semaphores in that the default case is the opposite:
 5 * wait_for_completion() blocks by default, whereas a semaphore does not. The
6 * interface also makes it easy to 'complete' multiple waiting threads,
7 * something which isn't entirely natural for semaphores.
8 *
9 * But more importantly, the primitive documents the usage. Semaphores would
10 * typically be used for exclusion which gives rise to priority inversion.
 11 * Waiting for completion is typically a sync point, but not an exclusion point.
12 */
13
14#include <linux/sched.h>
15#include <linux/completion.h>
16
17/**
18 * complete: - signals a single thread waiting on this completion
19 * @x: holds the state of this particular completion
20 *
21 * This will wake up a single thread waiting on this completion. Threads will be
22 * awakened in the same order in which they were queued.
23 *
24 * See also complete_all(), wait_for_completion() and related routines.
25 *
26 * It may be assumed that this function implies a write memory barrier before
27 * changing the task state if and only if any tasks are woken up.
28 */
29void complete(struct completion *x)
30{
31 unsigned long flags;
32
33 spin_lock_irqsave(&x->wait.lock, flags);
34 x->done++;
35 __wake_up_locked(&x->wait, TASK_NORMAL, 1);
36 spin_unlock_irqrestore(&x->wait.lock, flags);
37}
38EXPORT_SYMBOL(complete);
39
40/**
41 * complete_all: - signals all threads waiting on this completion
42 * @x: holds the state of this particular completion
43 *
44 * This will wake up all threads waiting on this particular completion event.
45 *
46 * It may be assumed that this function implies a write memory barrier before
47 * changing the task state if and only if any tasks are woken up.
48 */
49void complete_all(struct completion *x)
50{
51 unsigned long flags;
52
53 spin_lock_irqsave(&x->wait.lock, flags);
54 x->done += UINT_MAX/2;
55 __wake_up_locked(&x->wait, TASK_NORMAL, 0);
56 spin_unlock_irqrestore(&x->wait.lock, flags);
57}
58EXPORT_SYMBOL(complete_all);
59
60static inline long __sched
61do_wait_for_common(struct completion *x,
62 long (*action)(long), long timeout, int state)
63{
64 if (!x->done) {
65 DECLARE_WAITQUEUE(wait, current);
66
67 __add_wait_queue_tail_exclusive(&x->wait, &wait);
68 do {
69 if (signal_pending_state(state, current)) {
70 timeout = -ERESTARTSYS;
71 break;
72 }
73 __set_current_state(state);
74 spin_unlock_irq(&x->wait.lock);
75 timeout = action(timeout);
76 spin_lock_irq(&x->wait.lock);
77 } while (!x->done && timeout);
78 __remove_wait_queue(&x->wait, &wait);
79 if (!x->done)
80 return timeout;
81 }
82 x->done--;
83 return timeout ?: 1;
84}
85
86static inline long __sched
87__wait_for_common(struct completion *x,
88 long (*action)(long), long timeout, int state)
89{
90 might_sleep();
91
92 spin_lock_irq(&x->wait.lock);
93 timeout = do_wait_for_common(x, action, timeout, state);
94 spin_unlock_irq(&x->wait.lock);
95 return timeout;
96}
97
98static long __sched
99wait_for_common(struct completion *x, long timeout, int state)
100{
101 return __wait_for_common(x, schedule_timeout, timeout, state);
102}
103
104static long __sched
105wait_for_common_io(struct completion *x, long timeout, int state)
106{
107 return __wait_for_common(x, io_schedule_timeout, timeout, state);
108}
109
110/**
111 * wait_for_completion: - waits for completion of a task
112 * @x: holds the state of this particular completion
113 *
114 * This waits to be signaled for completion of a specific task. It is NOT
115 * interruptible and there is no timeout.
116 *
117 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
118 * and interrupt capability. Also see complete().
119 */
120void __sched wait_for_completion(struct completion *x)
121{
122 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
123}
124EXPORT_SYMBOL(wait_for_completion);
125
126/**
127 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
128 * @x: holds the state of this particular completion
129 * @timeout: timeout value in jiffies
130 *
131 * This waits for either a completion of a specific task to be signaled or for a
132 * specified timeout to expire. The timeout is in jiffies. It is not
133 * interruptible.
134 *
135 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
136 * till timeout) if completed.
137 */
138unsigned long __sched
139wait_for_completion_timeout(struct completion *x, unsigned long timeout)
140{
141 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
142}
143EXPORT_SYMBOL(wait_for_completion_timeout);
144
145/**
146 * wait_for_completion_io: - waits for completion of a task
147 * @x: holds the state of this particular completion
148 *
149 * This waits to be signaled for completion of a specific task. It is NOT
150 * interruptible and there is no timeout. The caller is accounted as waiting
151 * for IO.
152 */
153void __sched wait_for_completion_io(struct completion *x)
154{
155 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
156}
157EXPORT_SYMBOL(wait_for_completion_io);
158
159/**
160 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
161 * @x: holds the state of this particular completion
162 * @timeout: timeout value in jiffies
163 *
164 * This waits for either a completion of a specific task to be signaled or for a
165 * specified timeout to expire. The timeout is in jiffies. It is not
166 * interruptible. The caller is accounted as waiting for IO.
167 *
168 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
169 * till timeout) if completed.
170 */
171unsigned long __sched
172wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
173{
174 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
175}
176EXPORT_SYMBOL(wait_for_completion_io_timeout);
177
178/**
179 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
180 * @x: holds the state of this particular completion
181 *
182 * This waits for completion of a specific task to be signaled. It is
183 * interruptible.
184 *
185 * Return: -ERESTARTSYS if interrupted, 0 if completed.
186 */
187int __sched wait_for_completion_interruptible(struct completion *x)
188{
189 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
190 if (t == -ERESTARTSYS)
191 return t;
192 return 0;
193}
194EXPORT_SYMBOL(wait_for_completion_interruptible);
195
196/**
197 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
198 * @x: holds the state of this particular completion
199 * @timeout: timeout value in jiffies
200 *
201 * This waits for either a completion of a specific task to be signaled or for a
202 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
203 *
204 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
205 * or number of jiffies left till timeout) if completed.
206 */
207long __sched
208wait_for_completion_interruptible_timeout(struct completion *x,
209 unsigned long timeout)
210{
211 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
212}
213EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
214
215/**
216 * wait_for_completion_killable: - waits for completion of a task (killable)
217 * @x: holds the state of this particular completion
218 *
219 * This waits to be signaled for completion of a specific task. It can be
220 * interrupted by a kill signal.
221 *
222 * Return: -ERESTARTSYS if interrupted, 0 if completed.
223 */
224int __sched wait_for_completion_killable(struct completion *x)
225{
226 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
227 if (t == -ERESTARTSYS)
228 return t;
229 return 0;
230}
231EXPORT_SYMBOL(wait_for_completion_killable);
232
233/**
234 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
235 * @x: holds the state of this particular completion
236 * @timeout: timeout value in jiffies
237 *
238 * This waits for either a completion of a specific task to be
239 * signaled or for a specified timeout to expire. It can be
240 * interrupted by a kill signal. The timeout is in jiffies.
241 *
242 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
243 * or number of jiffies left till timeout) if completed.
244 */
245long __sched
246wait_for_completion_killable_timeout(struct completion *x,
247 unsigned long timeout)
248{
249 return wait_for_common(x, timeout, TASK_KILLABLE);
250}
251EXPORT_SYMBOL(wait_for_completion_killable_timeout);
252
253/**
254 * try_wait_for_completion - try to decrement a completion without blocking
255 * @x: completion structure
256 *
257 * Return: 0 if a decrement cannot be done without blocking
258 * 1 if a decrement succeeded.
259 *
260 * If a completion is being used as a counting completion,
261 * attempt to decrement the counter without blocking. This
262 * enables us to avoid waiting if the resource the completion
263 * is protecting is not available.
264 */
265bool try_wait_for_completion(struct completion *x)
266{
267 unsigned long flags;
268 int ret = 1;
269
270 spin_lock_irqsave(&x->wait.lock, flags);
271 if (!x->done)
272 ret = 0;
273 else
274 x->done--;
275 spin_unlock_irqrestore(&x->wait.lock, flags);
276 return ret;
277}
278EXPORT_SYMBOL(try_wait_for_completion);
279
280/**
281 * completion_done - Test to see if a completion has any waiters
282 * @x: completion structure
283 *
284 * Return: 0 if there are waiters (wait_for_completion() in progress)
285 * 1 if there are no waiters.
286 *
287 */
288bool completion_done(struct completion *x)
289{
290 unsigned long flags;
291 int ret = 1;
292
293 spin_lock_irqsave(&x->wait.lock, flags);
294 if (!x->done)
295 ret = 0;
296 spin_unlock_irqrestore(&x->wait.lock, flags);
297 return ret;
298}
299EXPORT_SYMBOL(completion_done);
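
The kerneldoc above now lives in kernel/sched/completion.c together with the rest of the completion API. For orientation only (not part of this diff), a minimal kernel-module-style sketch of the usual producer/waiter pattern follows; the module name, the msleep(100) stand-in for real work and the 5-second timeout are invented for the example.

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/err.h>

static DECLARE_COMPLETION(work_done);

static int worker_fn(void *data)
{
	msleep(100);			/* stand-in for real work */
	complete(&work_done);		/* wakes exactly one waiter */
	return 0;
}

static int __init completion_example_init(void)
{
	struct task_struct *tsk;
	unsigned long left;

	tsk = kthread_run(worker_fn, NULL, "completion-example");
	if (IS_ERR(tsk))
		return PTR_ERR(tsk);

	/* 0 on timeout, otherwise at least 1 (jiffies left until timeout). */
	left = wait_for_completion_timeout(&work_done, 5 * HZ);
	if (!left)
		pr_warn("completion-example: worker did not finish in 5s\n");

	/* Non-blocking probe: succeeds only if an unconsumed complete() is pending. */
	if (!try_wait_for_completion(&work_done))
		pr_info("completion-example: nothing left to consume\n");

	return 0;
}

static void __exit completion_example_exit(void)
{
}

module_init(completion_example_init);
module_exit(completion_example_exit);
MODULE_LICENSE("GPL");

The fact that wait_for_completion_timeout() returns 0 only on timeout, and at least 1 otherwise, is exactly the Return: behaviour documented above.
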
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ac63c9a995a..1deccd78be98 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
513 * might also involve a cross-CPU call to trigger the scheduler on 513 * might also involve a cross-CPU call to trigger the scheduler on
514 * the target CPU. 514 * the target CPU.
515 */ 515 */
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p) 516void resched_task(struct task_struct *p)
518{ 517{
519 int cpu; 518 int cpu;
520 519
521 assert_raw_spin_locked(&task_rq(p)->lock); 520 lockdep_assert_held(&task_rq(p)->lock);
522 521
523 if (test_tsk_need_resched(p)) 522 if (test_tsk_need_resched(p))
524 return; 523 return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
526 set_tsk_need_resched(p); 525 set_tsk_need_resched(p);
527 526
528 cpu = task_cpu(p); 527 cpu = task_cpu(p);
529 if (cpu == smp_processor_id()) 528 if (cpu == smp_processor_id()) {
529 set_preempt_need_resched();
530 return; 530 return;
531 }
531 532
532 /* NEED_RESCHED must be visible before we test polling */ 533 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 534 smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 547 raw_spin_unlock_irqrestore(&rq->lock, flags);
547} 548}
548 549
550#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON 551#ifdef CONFIG_NO_HZ_COMMON
550/* 552/*
551 * In the semi idle case, use the nearest busy cpu for migrating timers 553 * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
693 } 695 }
694} 696}
695 697
696#else /* !CONFIG_SMP */
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif /* CONFIG_SMP */ 698#endif /* CONFIG_SMP */
703 699
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 700#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 763static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{ 764{
769 update_rq_clock(rq); 765 update_rq_clock(rq);
770 sched_info_queued(p); 766 sched_info_queued(rq, p);
771 p->sched_class->enqueue_task(rq, p, flags); 767 p->sched_class->enqueue_task(rq, p, flags);
772} 768}
773 769
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 770static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{ 771{
776 update_rq_clock(rq); 772 update_rq_clock(rq);
777 sched_info_dequeued(p); 773 sched_info_dequeued(rq, p);
778 p->sched_class->dequeue_task(rq, p, flags); 774 p->sched_class->dequeue_task(rq, p, flags);
779} 775}
780 776
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
987 * ttwu() will sort out the placement. 983 * ttwu() will sort out the placement.
988 */ 984 */
989 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
990 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 986 !(task_preempt_count(p) & PREEMPT_ACTIVE));
991 987
992#ifdef CONFIG_LOCKDEP 988#ifdef CONFIG_LOCKDEP
993 /* 989 /*
@@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1017 __set_task_cpu(p, new_cpu); 1013 __set_task_cpu(p, new_cpu);
1018} 1014}
1019 1015
1016static void __migrate_swap_task(struct task_struct *p, int cpu)
1017{
1018 if (p->on_rq) {
1019 struct rq *src_rq, *dst_rq;
1020
1021 src_rq = task_rq(p);
1022 dst_rq = cpu_rq(cpu);
1023
1024 deactivate_task(src_rq, p, 0);
1025 set_task_cpu(p, cpu);
1026 activate_task(dst_rq, p, 0);
1027 check_preempt_curr(dst_rq, p, 0);
1028 } else {
1029 /*
1030 * Task isn't running anymore; make it appear like we migrated
1031 * it before it went to sleep. This means on wakeup we make the
1032 * previous cpu our target instead of where it really is.
1033 */
1034 p->wake_cpu = cpu;
1035 }
1036}
1037
1038struct migration_swap_arg {
1039 struct task_struct *src_task, *dst_task;
1040 int src_cpu, dst_cpu;
1041};
1042
1043static int migrate_swap_stop(void *data)
1044{
1045 struct migration_swap_arg *arg = data;
1046 struct rq *src_rq, *dst_rq;
1047 int ret = -EAGAIN;
1048
1049 src_rq = cpu_rq(arg->src_cpu);
1050 dst_rq = cpu_rq(arg->dst_cpu);
1051
1052 double_raw_lock(&arg->src_task->pi_lock,
1053 &arg->dst_task->pi_lock);
1054 double_rq_lock(src_rq, dst_rq);
1055 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1056 goto unlock;
1057
1058 if (task_cpu(arg->src_task) != arg->src_cpu)
1059 goto unlock;
1060
1061 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1062 goto unlock;
1063
1064 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1065 goto unlock;
1066
1067 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1068 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1069
1070 ret = 0;
1071
1072unlock:
1073 double_rq_unlock(src_rq, dst_rq);
1074 raw_spin_unlock(&arg->dst_task->pi_lock);
1075 raw_spin_unlock(&arg->src_task->pi_lock);
1076
1077 return ret;
1078}
1079
1080/*
1081 * Cross migrate two tasks
1082 */
1083int migrate_swap(struct task_struct *cur, struct task_struct *p)
1084{
1085 struct migration_swap_arg arg;
1086 int ret = -EINVAL;
1087
1088 arg = (struct migration_swap_arg){
1089 .src_task = cur,
1090 .src_cpu = task_cpu(cur),
1091 .dst_task = p,
1092 .dst_cpu = task_cpu(p),
1093 };
1094
1095 if (arg.src_cpu == arg.dst_cpu)
1096 goto out;
1097
1098 /*
1099 * These three tests are all lockless; this is OK since all of them
1100 * will be re-checked with proper locks held further down the line.
1101 */
1102 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1103 goto out;
1104
1105 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1106 goto out;
1107
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out;
1110
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112
1113out:
1114 return ret;
1115}
1116
1020struct migration_arg { 1117struct migration_arg {
1021 struct task_struct *task; 1118 struct task_struct *task;
1022 int dest_cpu; 1119 int dest_cpu;
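
The migrate_swap() path added above does its feasibility tests twice: once locklessly as a cheap early-out, and again authoritatively in migrate_swap_stop() with both pi_locks and both runqueue locks held. The userspace sketch below illustrates only that check-twice pattern; the cpu_slot structure, its fields and the pthread mutexes are stand-ins invented for the example, not kernel interfaces.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct cpu_slot {
	pthread_mutex_t lock;		/* stands in for the runqueue lock */
	bool active;			/* stands in for cpu_active() */
};

static int try_swap(struct cpu_slot *src, struct cpu_slot *dst)
{
	int ret = -1;

	/* 1. Lockless checks: cheap, may race, only good for bailing out early. */
	if (!src->active || !dst->active)
		return ret;

	/* 2. Take both locks in a fixed (address) order, like double_rq_lock(). */
	struct cpu_slot *first = src < dst ? src : dst;
	struct cpu_slot *second = src < dst ? dst : src;

	pthread_mutex_lock(&first->lock);
	pthread_mutex_lock(&second->lock);

	/* 3. Re-check the same conditions now that they cannot change under us. */
	if (src->active && dst->active) {
		/* ... perform the actual swap here ... */
		ret = 0;
	}

	pthread_mutex_unlock(&second->lock);
	pthread_mutex_unlock(&first->lock);
	return ret;
}

int main(void)
{
	struct cpu_slot a = { PTHREAD_MUTEX_INITIALIZER, true };
	struct cpu_slot b = { PTHREAD_MUTEX_INITIALIZER, true };

	printf("swap %s\n", try_swap(&a, &b) ? "rejected" : "performed");
	return 0;
}

Taking the two locks in a fixed order is what keeps two concurrent swaps in opposite directions from deadlocking, which is the same reason double_rq_lock() orders the runqueues.
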
@@ -1236,9 +1333,9 @@ out:
1236 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1333 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1237 */ 1334 */
1238static inline 1335static inline
1239int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1336int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1240{ 1337{
1241 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1338 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1242 1339
1243 /* 1340 /*
1244 * In order not to call set_task_cpu() on a blocking task we need 1341 * In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1330 1427
1331 if (rq->idle_stamp) { 1428 if (rq->idle_stamp) {
1332 u64 delta = rq_clock(rq) - rq->idle_stamp; 1429 u64 delta = rq_clock(rq) - rq->idle_stamp;
1333 u64 max = 2*sysctl_sched_migration_cost; 1430 u64 max = 2*rq->max_idle_balance_cost;
1431
1432 update_avg(&rq->avg_idle, delta);
1334 1433
1335 if (delta > max) 1434 if (rq->avg_idle > max)
1336 rq->avg_idle = max; 1435 rq->avg_idle = max;
1337 else 1436
1338 update_avg(&rq->avg_idle, delta);
1339 rq->idle_stamp = 0; 1437 rq->idle_stamp = 0;
1340 } 1438 }
1341#endif 1439#endif
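
The hunk above reorders the avg_idle bookkeeping: the new sample is always averaged in and the running average is then clamped to 2*rq->max_idle_balance_cost, where previously a large sample would overwrite the average with the cap outright. A standalone sketch of that clamped running average follows; the 1/8 weighting is believed to match update_avg() elsewhere in this file, and the starting values and samples are arbitrary.

#include <stdio.h>
#include <stdint.h>

/* Believed to mirror update_avg() in kernel/sched/core.c: new = old + (sample - old)/8. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);

	*avg += diff >> 3;
}

int main(void)
{
	uint64_t avg_idle = 500000;			/* ns, arbitrary */
	const uint64_t max_idle_balance_cost = 500000;	/* ns, arbitrary */
	const uint64_t samples[] = { 200000, 8000000, 300000, 12000000 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		update_avg(&avg_idle, samples[i]);
		/* New behaviour: average first, then clamp the average itself. */
		if (avg_idle > 2 * max_idle_balance_cost)
			avg_idle = 2 * max_idle_balance_cost;
		printf("sample %8llu ns -> avg_idle %7llu ns\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg_idle);
	}
	return 0;
}

A single huge idle period therefore no longer replaces the average; it is averaged in and the result saturates at twice max_idle_balance_cost.
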
@@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void)
1396 1494
1397void scheduler_ipi(void) 1495void scheduler_ipi(void)
1398{ 1496{
1497 /*
1498 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI.
1501 */
1502 if (tif_need_resched())
1503 set_preempt_need_resched();
1504
1399 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1400 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
1401 && !got_nohz_idle_kick()) 1507 && !got_nohz_idle_kick())
@@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1513 if (p->sched_class->task_waking) 1619 if (p->sched_class->task_waking)
1514 p->sched_class->task_waking(p); 1620 p->sched_class->task_waking(p);
1515 1621
1516 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1622 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1517 if (task_cpu(p) != cpu) { 1623 if (task_cpu(p) != cpu) {
1518 wake_flags |= WF_MIGRATED; 1624 wake_flags |= WF_MIGRATED;
1519 set_task_cpu(p, cpu); 1625 set_task_cpu(p, cpu);
@@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1595 * 1701 *
1596 * __sched_fork() is basic setup used by init_idle() too: 1702 * __sched_fork() is basic setup used by init_idle() too:
1597 */ 1703 */
1598static void __sched_fork(struct task_struct *p) 1704static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1599{ 1705{
1600 p->on_rq = 0; 1706 p->on_rq = 0;
1601 1707
@@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
1619 1725
1620#ifdef CONFIG_NUMA_BALANCING 1726#ifdef CONFIG_NUMA_BALANCING
1621 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1727 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1622 p->mm->numa_next_scan = jiffies; 1728 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1623 p->mm->numa_next_reset = jiffies;
1624 p->mm->numa_scan_seq = 0; 1729 p->mm->numa_scan_seq = 0;
1625 } 1730 }
1626 1731
1732 if (clone_flags & CLONE_VM)
1733 p->numa_preferred_nid = current->numa_preferred_nid;
1734 else
1735 p->numa_preferred_nid = -1;
1736
1627 p->node_stamp = 0ULL; 1737 p->node_stamp = 0ULL;
1628 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1738 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1629 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1630 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1739 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1631 p->numa_work.next = &p->numa_work; 1740 p->numa_work.next = &p->numa_work;
1741 p->numa_faults = NULL;
1742 p->numa_faults_buffer = NULL;
1743
1744 INIT_LIST_HEAD(&p->numa_entry);
1745 p->numa_group = NULL;
1632#endif /* CONFIG_NUMA_BALANCING */ 1746#endif /* CONFIG_NUMA_BALANCING */
1633} 1747}
1634 1748
@@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
1654/* 1768/*
1655 * fork()/clone()-time setup: 1769 * fork()/clone()-time setup:
1656 */ 1770 */
1657void sched_fork(struct task_struct *p) 1771void sched_fork(unsigned long clone_flags, struct task_struct *p)
1658{ 1772{
1659 unsigned long flags; 1773 unsigned long flags;
1660 int cpu = get_cpu(); 1774 int cpu = get_cpu();
1661 1775
1662 __sched_fork(p); 1776 __sched_fork(clone_flags, p);
1663 /* 1777 /*
1664 * We mark the process as running here. This guarantees that 1778 * We mark the process as running here. This guarantees that
1665 * nobody will actually run it, and a signal or other external 1779 * nobody will actually run it, and a signal or other external
@@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p)
1717#if defined(CONFIG_SMP) 1831#if defined(CONFIG_SMP)
1718 p->on_cpu = 0; 1832 p->on_cpu = 0;
1719#endif 1833#endif
1720#ifdef CONFIG_PREEMPT_COUNT 1834 init_task_preempt_count(p);
1721 /* Want to start with kernel preemption disabled. */
1722 task_thread_info(p)->preempt_count = 1;
1723#endif
1724#ifdef CONFIG_SMP 1835#ifdef CONFIG_SMP
1725 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1836 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1726#endif 1837#endif
@@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
1747 * - cpus_allowed can change in the fork path 1858 * - cpus_allowed can change in the fork path
1748 * - any previously selected cpu might disappear through hotplug 1859 * - any previously selected cpu might disappear through hotplug
1749 */ 1860 */
1750 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1861 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
1751#endif 1862#endif
1752 1863
1753 /* Initialize new task's runnable average */ 1864 /* Initialize new task's runnable average */
@@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1838 struct task_struct *next) 1949 struct task_struct *next)
1839{ 1950{
1840 trace_sched_switch(prev, next); 1951 trace_sched_switch(prev, next);
1841 sched_info_switch(prev, next); 1952 sched_info_switch(rq, prev, next);
1842 perf_event_task_sched_out(prev, next); 1953 perf_event_task_sched_out(prev, next);
1843 fire_sched_out_preempt_notifiers(prev, next); 1954 fire_sched_out_preempt_notifiers(prev, next);
1844 prepare_lock_switch(rq, next); 1955 prepare_lock_switch(rq, next);
@@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1890 if (mm) 2001 if (mm)
1891 mmdrop(mm); 2002 mmdrop(mm);
1892 if (unlikely(prev_state == TASK_DEAD)) { 2003 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev);
2005
1893 /* 2006 /*
1894 * Remove function-return probe instances associated with this 2007 * Remove function-return probe instances associated with this
1895 * task and put them back on the free list. 2008 * task and put them back on the free list.
@@ -2073,7 +2186,7 @@ void sched_exec(void)
2073 int dest_cpu; 2186 int dest_cpu;
2074 2187
2075 raw_spin_lock_irqsave(&p->pi_lock, flags); 2188 raw_spin_lock_irqsave(&p->pi_lock, flags);
2076 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2189 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2077 if (dest_cpu == smp_processor_id()) 2190 if (dest_cpu == smp_processor_id())
2078 goto unlock; 2191 goto unlock;
2079 2192
@@ -2215,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2215#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2328#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2216 defined(CONFIG_PREEMPT_TRACER)) 2329 defined(CONFIG_PREEMPT_TRACER))
2217 2330
2218void __kprobes add_preempt_count(int val) 2331void __kprobes preempt_count_add(int val)
2219{ 2332{
2220#ifdef CONFIG_DEBUG_PREEMPT 2333#ifdef CONFIG_DEBUG_PREEMPT
2221 /* 2334 /*
@@ -2224,7 +2337,7 @@ void __kprobes add_preempt_count(int val)
2224 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2337 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2225 return; 2338 return;
2226#endif 2339#endif
2227 preempt_count() += val; 2340 __preempt_count_add(val);
2228#ifdef CONFIG_DEBUG_PREEMPT 2341#ifdef CONFIG_DEBUG_PREEMPT
2229 /* 2342 /*
2230 * Spinlock count overflowing soon? 2343 * Spinlock count overflowing soon?
@@ -2235,9 +2348,9 @@ void __kprobes add_preempt_count(int val)
2235 if (preempt_count() == val) 2348 if (preempt_count() == val)
2236 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2349 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2237} 2350}
2238EXPORT_SYMBOL(add_preempt_count); 2351EXPORT_SYMBOL(preempt_count_add);
2239 2352
2240void __kprobes sub_preempt_count(int val) 2353void __kprobes preempt_count_sub(int val)
2241{ 2354{
2242#ifdef CONFIG_DEBUG_PREEMPT 2355#ifdef CONFIG_DEBUG_PREEMPT
2243 /* 2356 /*
@@ -2255,9 +2368,9 @@ void __kprobes sub_preempt_count(int val)
2255 2368
2256 if (preempt_count() == val) 2369 if (preempt_count() == val)
2257 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2370 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2258 preempt_count() -= val; 2371 __preempt_count_sub(val);
2259} 2372}
2260EXPORT_SYMBOL(sub_preempt_count); 2373EXPORT_SYMBOL(preempt_count_sub);
2261 2374
2262#endif 2375#endif
2263 2376
@@ -2430,6 +2543,7 @@ need_resched:
2430 put_prev_task(rq, prev); 2543 put_prev_task(rq, prev);
2431 next = pick_next_task(rq); 2544 next = pick_next_task(rq);
2432 clear_tsk_need_resched(prev); 2545 clear_tsk_need_resched(prev);
2546 clear_preempt_need_resched();
2433 rq->skip_clock_update = 0; 2547 rq->skip_clock_update = 0;
2434 2548
2435 if (likely(prev != next)) { 2549 if (likely(prev != next)) {
@@ -2520,9 +2634,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
2520 return; 2634 return;
2521 2635
2522 do { 2636 do {
2523 add_preempt_count_notrace(PREEMPT_ACTIVE); 2637 __preempt_count_add(PREEMPT_ACTIVE);
2524 __schedule(); 2638 __schedule();
2525 sub_preempt_count_notrace(PREEMPT_ACTIVE); 2639 __preempt_count_sub(PREEMPT_ACTIVE);
2526 2640
2527 /* 2641 /*
2528 * Check again in case we missed a preemption opportunity 2642 * Check again in case we missed a preemption opportunity
@@ -2541,20 +2655,19 @@ EXPORT_SYMBOL(preempt_schedule);
2541 */ 2655 */
2542asmlinkage void __sched preempt_schedule_irq(void) 2656asmlinkage void __sched preempt_schedule_irq(void)
2543{ 2657{
2544 struct thread_info *ti = current_thread_info();
2545 enum ctx_state prev_state; 2658 enum ctx_state prev_state;
2546 2659
2547 /* Catch callers which need to be fixed */ 2660 /* Catch callers which need to be fixed */
2548 BUG_ON(ti->preempt_count || !irqs_disabled()); 2661 BUG_ON(preempt_count() || !irqs_disabled());
2549 2662
2550 prev_state = exception_enter(); 2663 prev_state = exception_enter();
2551 2664
2552 do { 2665 do {
2553 add_preempt_count(PREEMPT_ACTIVE); 2666 __preempt_count_add(PREEMPT_ACTIVE);
2554 local_irq_enable(); 2667 local_irq_enable();
2555 __schedule(); 2668 __schedule();
2556 local_irq_disable(); 2669 local_irq_disable();
2557 sub_preempt_count(PREEMPT_ACTIVE); 2670 __preempt_count_sub(PREEMPT_ACTIVE);
2558 2671
2559 /* 2672 /*
2560 * Check again in case we missed a preemption opportunity 2673 * Check again in case we missed a preemption opportunity
@@ -2575,393 +2688,6 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
2575} 2688}
2576EXPORT_SYMBOL(default_wake_function); 2689EXPORT_SYMBOL(default_wake_function);
2577 2690
2578/*
2579 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
2580 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
2581 * number) then we wake all the non-exclusive tasks and one exclusive task.
2582 *
2583 * There are circumstances in which we can try to wake a task which has already
2584 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
2585 * zero in this (rare) case, and we handle it by continuing to scan the queue.
2586 */
2587static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
2588 int nr_exclusive, int wake_flags, void *key)
2589{
2590 wait_queue_t *curr, *next;
2591
2592 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
2593 unsigned flags = curr->flags;
2594
2595 if (curr->func(curr, mode, wake_flags, key) &&
2596 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
2597 break;
2598 }
2599}
2600
2601/**
2602 * __wake_up - wake up threads blocked on a waitqueue.
2603 * @q: the waitqueue
2604 * @mode: which threads
2605 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2606 * @key: is directly passed to the wakeup function
2607 *
2608 * It may be assumed that this function implies a write memory barrier before
2609 * changing the task state if and only if any tasks are woken up.
2610 */
2611void __wake_up(wait_queue_head_t *q, unsigned int mode,
2612 int nr_exclusive, void *key)
2613{
2614 unsigned long flags;
2615
2616 spin_lock_irqsave(&q->lock, flags);
2617 __wake_up_common(q, mode, nr_exclusive, 0, key);
2618 spin_unlock_irqrestore(&q->lock, flags);
2619}
2620EXPORT_SYMBOL(__wake_up);
2621
2622/*
2623 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
2624 */
2625void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
2626{
2627 __wake_up_common(q, mode, nr, 0, NULL);
2628}
2629EXPORT_SYMBOL_GPL(__wake_up_locked);
2630
2631void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
2632{
2633 __wake_up_common(q, mode, 1, 0, key);
2634}
2635EXPORT_SYMBOL_GPL(__wake_up_locked_key);
2636
2637/**
2638 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
2639 * @q: the waitqueue
2640 * @mode: which threads
2641 * @nr_exclusive: how many wake-one or wake-many threads to wake up
2642 * @key: opaque value to be passed to wakeup targets
2643 *
2644 * The sync wakeup differs that the waker knows that it will schedule
2645 * away soon, so while the target thread will be woken up, it will not
2646 * be migrated to another CPU - ie. the two threads are 'synchronized'
2647 * with each other. This can prevent needless bouncing between CPUs.
2648 *
2649 * On UP it can prevent extra preemption.
2650 *
2651 * It may be assumed that this function implies a write memory barrier before
2652 * changing the task state if and only if any tasks are woken up.
2653 */
2654void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
2655 int nr_exclusive, void *key)
2656{
2657 unsigned long flags;
2658 int wake_flags = WF_SYNC;
2659
2660 if (unlikely(!q))
2661 return;
2662
2663 if (unlikely(nr_exclusive != 1))
2664 wake_flags = 0;
2665
2666 spin_lock_irqsave(&q->lock, flags);
2667 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
2668 spin_unlock_irqrestore(&q->lock, flags);
2669}
2670EXPORT_SYMBOL_GPL(__wake_up_sync_key);
2671
2672/*
2673 * __wake_up_sync - see __wake_up_sync_key()
2674 */
2675void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
2676{
2677 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
2678}
2679EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
2680
2681/**
2682 * complete: - signals a single thread waiting on this completion
2683 * @x: holds the state of this particular completion
2684 *
2685 * This will wake up a single thread waiting on this completion. Threads will be
2686 * awakened in the same order in which they were queued.
2687 *
2688 * See also complete_all(), wait_for_completion() and related routines.
2689 *
2690 * It may be assumed that this function implies a write memory barrier before
2691 * changing the task state if and only if any tasks are woken up.
2692 */
2693void complete(struct completion *x)
2694{
2695 unsigned long flags;
2696
2697 spin_lock_irqsave(&x->wait.lock, flags);
2698 x->done++;
2699 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
2700 spin_unlock_irqrestore(&x->wait.lock, flags);
2701}
2702EXPORT_SYMBOL(complete);
2703
2704/**
2705 * complete_all: - signals all threads waiting on this completion
2706 * @x: holds the state of this particular completion
2707 *
2708 * This will wake up all threads waiting on this particular completion event.
2709 *
2710 * It may be assumed that this function implies a write memory barrier before
2711 * changing the task state if and only if any tasks are woken up.
2712 */
2713void complete_all(struct completion *x)
2714{
2715 unsigned long flags;
2716
2717 spin_lock_irqsave(&x->wait.lock, flags);
2718 x->done += UINT_MAX/2;
2719 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
2720 spin_unlock_irqrestore(&x->wait.lock, flags);
2721}
2722EXPORT_SYMBOL(complete_all);
2723
2724static inline long __sched
2725do_wait_for_common(struct completion *x,
2726 long (*action)(long), long timeout, int state)
2727{
2728 if (!x->done) {
2729 DECLARE_WAITQUEUE(wait, current);
2730
2731 __add_wait_queue_tail_exclusive(&x->wait, &wait);
2732 do {
2733 if (signal_pending_state(state, current)) {
2734 timeout = -ERESTARTSYS;
2735 break;
2736 }
2737 __set_current_state(state);
2738 spin_unlock_irq(&x->wait.lock);
2739 timeout = action(timeout);
2740 spin_lock_irq(&x->wait.lock);
2741 } while (!x->done && timeout);
2742 __remove_wait_queue(&x->wait, &wait);
2743 if (!x->done)
2744 return timeout;
2745 }
2746 x->done--;
2747 return timeout ?: 1;
2748}
2749
2750static inline long __sched
2751__wait_for_common(struct completion *x,
2752 long (*action)(long), long timeout, int state)
2753{
2754 might_sleep();
2755
2756 spin_lock_irq(&x->wait.lock);
2757 timeout = do_wait_for_common(x, action, timeout, state);
2758 spin_unlock_irq(&x->wait.lock);
2759 return timeout;
2760}
2761
2762static long __sched
2763wait_for_common(struct completion *x, long timeout, int state)
2764{
2765 return __wait_for_common(x, schedule_timeout, timeout, state);
2766}
2767
2768static long __sched
2769wait_for_common_io(struct completion *x, long timeout, int state)
2770{
2771 return __wait_for_common(x, io_schedule_timeout, timeout, state);
2772}
2773
2774/**
2775 * wait_for_completion: - waits for completion of a task
2776 * @x: holds the state of this particular completion
2777 *
2778 * This waits to be signaled for completion of a specific task. It is NOT
2779 * interruptible and there is no timeout.
2780 *
2781 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
2782 * and interrupt capability. Also see complete().
2783 */
2784void __sched wait_for_completion(struct completion *x)
2785{
2786 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2787}
2788EXPORT_SYMBOL(wait_for_completion);
2789
2790/**
2791 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
2792 * @x: holds the state of this particular completion
2793 * @timeout: timeout value in jiffies
2794 *
2795 * This waits for either a completion of a specific task to be signaled or for a
2796 * specified timeout to expire. The timeout is in jiffies. It is not
2797 * interruptible.
2798 *
2799 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
2800 * till timeout) if completed.
2801 */
2802unsigned long __sched
2803wait_for_completion_timeout(struct completion *x, unsigned long timeout)
2804{
2805 return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
2806}
2807EXPORT_SYMBOL(wait_for_completion_timeout);
2808
2809/**
2810 * wait_for_completion_io: - waits for completion of a task
2811 * @x: holds the state of this particular completion
2812 *
2813 * This waits to be signaled for completion of a specific task. It is NOT
2814 * interruptible and there is no timeout. The caller is accounted as waiting
2815 * for IO.
2816 */
2817void __sched wait_for_completion_io(struct completion *x)
2818{
2819 wait_for_common_io(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
2820}
2821EXPORT_SYMBOL(wait_for_completion_io);
2822
2823/**
2824 * wait_for_completion_io_timeout: - waits for completion of a task (w/timeout)
2825 * @x: holds the state of this particular completion
2826 * @timeout: timeout value in jiffies
2827 *
2828 * This waits for either a completion of a specific task to be signaled or for a
2829 * specified timeout to expire. The timeout is in jiffies. It is not
2830 * interruptible. The caller is accounted as waiting for IO.
2831 *
2832 * Return: 0 if timed out, and positive (at least 1, or number of jiffies left
2833 * till timeout) if completed.
2834 */
2835unsigned long __sched
2836wait_for_completion_io_timeout(struct completion *x, unsigned long timeout)
2837{
2838 return wait_for_common_io(x, timeout, TASK_UNINTERRUPTIBLE);
2839}
2840EXPORT_SYMBOL(wait_for_completion_io_timeout);
2841
2842/**
2843 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
2844 * @x: holds the state of this particular completion
2845 *
2846 * This waits for completion of a specific task to be signaled. It is
2847 * interruptible.
2848 *
2849 * Return: -ERESTARTSYS if interrupted, 0 if completed.
2850 */
2851int __sched wait_for_completion_interruptible(struct completion *x)
2852{
2853 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
2854 if (t == -ERESTARTSYS)
2855 return t;
2856 return 0;
2857}
2858EXPORT_SYMBOL(wait_for_completion_interruptible);
2859
2860/**
2861 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
2862 * @x: holds the state of this particular completion
2863 * @timeout: timeout value in jiffies
2864 *
2865 * This waits for either a completion of a specific task to be signaled or for a
2866 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
2867 *
2868 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
2869 * or number of jiffies left till timeout) if completed.
2870 */
2871long __sched
2872wait_for_completion_interruptible_timeout(struct completion *x,
2873 unsigned long timeout)
2874{
2875 return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
2876}
2877EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
2878
2879/**
2880 * wait_for_completion_killable: - waits for completion of a task (killable)
2881 * @x: holds the state of this particular completion
2882 *
2883 * This waits to be signaled for completion of a specific task. It can be
2884 * interrupted by a kill signal.
2885 *
2886 * Return: -ERESTARTSYS if interrupted, 0 if completed.
2887 */
2888int __sched wait_for_completion_killable(struct completion *x)
2889{
2890 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
2891 if (t == -ERESTARTSYS)
2892 return t;
2893 return 0;
2894}
2895EXPORT_SYMBOL(wait_for_completion_killable);
2896
2897/**
2898 * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable))
2899 * @x: holds the state of this particular completion
2900 * @timeout: timeout value in jiffies
2901 *
2902 * This waits for either a completion of a specific task to be
2903 * signaled or for a specified timeout to expire. It can be
2904 * interrupted by a kill signal. The timeout is in jiffies.
2905 *
2906 * Return: -ERESTARTSYS if interrupted, 0 if timed out, positive (at least 1,
2907 * or number of jiffies left till timeout) if completed.
2908 */
2909long __sched
2910wait_for_completion_killable_timeout(struct completion *x,
2911 unsigned long timeout)
2912{
2913 return wait_for_common(x, timeout, TASK_KILLABLE);
2914}
2915EXPORT_SYMBOL(wait_for_completion_killable_timeout);
2916
2917/**
2918 * try_wait_for_completion - try to decrement a completion without blocking
2919 * @x: completion structure
2920 *
2921 * Return: 0 if a decrement cannot be done without blocking
2922 * 1 if a decrement succeeded.
2923 *
2924 * If a completion is being used as a counting completion,
2925 * attempt to decrement the counter without blocking. This
2926 * enables us to avoid waiting if the resource the completion
2927 * is protecting is not available.
2928 */
2929bool try_wait_for_completion(struct completion *x)
2930{
2931 unsigned long flags;
2932 int ret = 1;
2933
2934 spin_lock_irqsave(&x->wait.lock, flags);
2935 if (!x->done)
2936 ret = 0;
2937 else
2938 x->done--;
2939 spin_unlock_irqrestore(&x->wait.lock, flags);
2940 return ret;
2941}
2942EXPORT_SYMBOL(try_wait_for_completion);
2943
2944/**
2945 * completion_done - Test to see if a completion has any waiters
2946 * @x: completion structure
2947 *
2948 * Return: 0 if there are waiters (wait_for_completion() in progress)
2949 * 1 if there are no waiters.
2950 *
2951 */
2952bool completion_done(struct completion *x)
2953{
2954 unsigned long flags;
2955 int ret = 1;
2956
2957 spin_lock_irqsave(&x->wait.lock, flags);
2958 if (!x->done)
2959 ret = 0;
2960 spin_unlock_irqrestore(&x->wait.lock, flags);
2961 return ret;
2962}
2963EXPORT_SYMBOL(completion_done);
2964
2965static long __sched 2691static long __sched
2966sleep_on_common(wait_queue_head_t *q, int state, long timeout) 2692sleep_on_common(wait_queue_head_t *q, int state, long timeout)
2967{ 2693{
@@ -3598,13 +3324,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3598 struct task_struct *p; 3324 struct task_struct *p;
3599 int retval; 3325 int retval;
3600 3326
3601 get_online_cpus();
3602 rcu_read_lock(); 3327 rcu_read_lock();
3603 3328
3604 p = find_process_by_pid(pid); 3329 p = find_process_by_pid(pid);
3605 if (!p) { 3330 if (!p) {
3606 rcu_read_unlock(); 3331 rcu_read_unlock();
3607 put_online_cpus();
3608 return -ESRCH; 3332 return -ESRCH;
3609 } 3333 }
3610 3334
@@ -3661,7 +3385,6 @@ out_free_cpus_allowed:
3661 free_cpumask_var(cpus_allowed); 3385 free_cpumask_var(cpus_allowed);
3662out_put_task: 3386out_put_task:
3663 put_task_struct(p); 3387 put_task_struct(p);
3664 put_online_cpus();
3665 return retval; 3388 return retval;
3666} 3389}
3667 3390
@@ -3706,7 +3429,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3706 unsigned long flags; 3429 unsigned long flags;
3707 int retval; 3430 int retval;
3708 3431
3709 get_online_cpus();
3710 rcu_read_lock(); 3432 rcu_read_lock();
3711 3433
3712 retval = -ESRCH; 3434 retval = -ESRCH;
@@ -3719,12 +3441,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3719 goto out_unlock; 3441 goto out_unlock;
3720 3442
3721 raw_spin_lock_irqsave(&p->pi_lock, flags); 3443 raw_spin_lock_irqsave(&p->pi_lock, flags);
3722 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 3444 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3723 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3445 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3724 3446
3725out_unlock: 3447out_unlock:
3726 rcu_read_unlock(); 3448 rcu_read_unlock();
3727 put_online_cpus();
3728 3449
3729 return retval; 3450 return retval;
3730} 3451}
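
Since the mask built here now intersects p->cpus_allowed with cpu_active_mask (rather than cpu_online_mask) and no longer pins hotplug with get_online_cpus(), the result visible to userspace through sched_getaffinity(2) covers only active CPUs. For reference, a minimal userspace caller (not part of the patch):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set)) {	/* pid 0 == calling thread */
		perror("sched_getaffinity");
		return 1;
	}

	for (int cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &set))
			printf("cpu %d allowed\n", cpu);

	return 0;
}

CPUs that are online but not yet active (mid-hotplug) are filtered out, which is the user-visible effect of the cpu_active_mask change.
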
@@ -3794,16 +3515,11 @@ SYSCALL_DEFINE0(sched_yield)
3794 return 0; 3515 return 0;
3795} 3516}
3796 3517
3797static inline int should_resched(void)
3798{
3799 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3800}
3801
3802static void __cond_resched(void) 3518static void __cond_resched(void)
3803{ 3519{
3804 add_preempt_count(PREEMPT_ACTIVE); 3520 __preempt_count_add(PREEMPT_ACTIVE);
3805 __schedule(); 3521 __schedule();
3806 sub_preempt_count(PREEMPT_ACTIVE); 3522 __preempt_count_sub(PREEMPT_ACTIVE);
3807} 3523}
3808 3524
3809int __sched _cond_resched(void) 3525int __sched _cond_resched(void)
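
__cond_resched() above now uses the raw __preempt_count_add()/__preempt_count_sub() helpers around __schedule(), and should_resched() is dropped from this file. The idiom this serves is sprinkling cond_resched() into long kernel-side loops; the closest userspace analogue, shown purely as an illustration (loop bound and yield interval are arbitrary), is an occasional sched_yield() in a long-running computation.

#include <sched.h>
#include <stdio.h>

int main(void)
{
	unsigned long sum = 0;

	for (unsigned long i = 0; i < 100000000UL; i++) {
		sum += i;			/* stand-in for real work */
		if ((i & 0xfffff) == 0)
			sched_yield();		/* give other runnable tasks a turn */
	}

	printf("sum = %lu\n", sum);
	return 0;
}
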
@@ -4186,7 +3902,7 @@ void init_idle(struct task_struct *idle, int cpu)
4186 3902
4187 raw_spin_lock_irqsave(&rq->lock, flags); 3903 raw_spin_lock_irqsave(&rq->lock, flags);
4188 3904
4189 __sched_fork(idle); 3905 __sched_fork(0, idle);
4190 idle->state = TASK_RUNNING; 3906 idle->state = TASK_RUNNING;
4191 idle->se.exec_start = sched_clock(); 3907 idle->se.exec_start = sched_clock();
4192 3908
@@ -4212,7 +3928,7 @@ void init_idle(struct task_struct *idle, int cpu)
4212 raw_spin_unlock_irqrestore(&rq->lock, flags); 3928 raw_spin_unlock_irqrestore(&rq->lock, flags);
4213 3929
4214 /* Set the preempt count _outside_ the spinlocks! */ 3930 /* Set the preempt count _outside_ the spinlocks! */
4215 task_thread_info(idle)->preempt_count = 0; 3931 init_idle_preempt_count(idle, cpu);
4216 3932
4217 /* 3933 /*
4218 * The idle tasks have their own, simple scheduling class: 3934 * The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4062,53 @@ fail:
4346 return ret; 4062 return ret;
4347} 4063}
4348 4064
4065#ifdef CONFIG_NUMA_BALANCING
4066/* Migrate current task p to target_cpu */
4067int migrate_task_to(struct task_struct *p, int target_cpu)
4068{
4069 struct migration_arg arg = { p, target_cpu };
4070 int curr_cpu = task_cpu(p);
4071
4072 if (curr_cpu == target_cpu)
4073 return 0;
4074
4075 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4076 return -EINVAL;
4077
4078 /* TODO: This is not properly updating schedstats */
4079
4080 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4081}
4082
4083/*
4084 * Requeue a task on a given node and accurately track the number of NUMA
4085 * tasks on the runqueues
4086 */
4087void sched_setnuma(struct task_struct *p, int nid)
4088{
4089 struct rq *rq;
4090 unsigned long flags;
4091 bool on_rq, running;
4092
4093 rq = task_rq_lock(p, &flags);
4094 on_rq = p->on_rq;
4095 running = task_current(rq, p);
4096
4097 if (on_rq)
4098 dequeue_task(rq, p, 0);
4099 if (running)
4100 p->sched_class->put_prev_task(rq, p);
4101
4102 p->numa_preferred_nid = nid;
4103
4104 if (running)
4105 p->sched_class->set_curr_task(rq);
4106 if (on_rq)
4107 enqueue_task(rq, p, 0);
4108 task_rq_unlock(rq, p, &flags);
4109}
4110#endif
4111
4349/* 4112/*
4350 * migration_cpu_stop - this will be executed by a highprio stopper thread 4113 * migration_cpu_stop - this will be executed by a highprio stopper thread
4351 * and performs thread migration by bumping thread off CPU then 4114 * and performs thread migration by bumping thread off CPU then
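
sched_setnuma() above changes a task attribute with the usual dequeue / put_prev_task / modify / set_curr_task / enqueue sequence, so a possibly-running task is taken off the runqueue structures, updated, and reinstated under task_rq_lock(). The toy model below captures only that shape; every name in it (toy_task, rq_lock, the helper functions) is invented for the example.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_task {
	int preferred_nid;
	bool on_rq;		/* queued on the runqueue? */
	bool running;		/* currently the CPU's current task? */
};

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

static void dequeue(struct toy_task *p)  { p->on_rq = false; }
static void enqueue(struct toy_task *p)  { p->on_rq = true; }
static void put_prev(struct toy_task *p) { p->running = false; }
static void set_curr(struct toy_task *p) { p->running = true; }

/* Mirrors the shape of sched_setnuma(): detach, change, reattach. */
static void toy_setnuma(struct toy_task *p, int nid)
{
	pthread_mutex_lock(&rq_lock);

	bool was_queued  = p->on_rq;
	bool was_running = p->running;

	if (was_queued)
		dequeue(p);
	if (was_running)
		put_prev(p);

	p->preferred_nid = nid;	/* the attribute being changed */

	if (was_running)
		set_curr(p);
	if (was_queued)
		enqueue(p);

	pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
	struct toy_task t = { .preferred_nid = -1, .on_rq = true, .running = true };

	toy_setnuma(&t, 1);
	printf("preferred nid now %d\n", t.preferred_nid);
	return 0;
}

Recording on_rq and running up front means only the states the task actually held are restored afterwards.
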
@@ -5119,6 +4882,9 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5119DEFINE_PER_CPU(struct sched_domain *, sd_llc); 4882DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5120DEFINE_PER_CPU(int, sd_llc_size); 4883DEFINE_PER_CPU(int, sd_llc_size);
5121DEFINE_PER_CPU(int, sd_llc_id); 4884DEFINE_PER_CPU(int, sd_llc_id);
4885DEFINE_PER_CPU(struct sched_domain *, sd_numa);
4886DEFINE_PER_CPU(struct sched_domain *, sd_busy);
4887DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5122 4888
5123static void update_top_cache_domain(int cpu) 4889static void update_top_cache_domain(int cpu)
5124{ 4890{
@@ -5130,11 +4896,18 @@ static void update_top_cache_domain(int cpu)
5130 if (sd) { 4896 if (sd) {
5131 id = cpumask_first(sched_domain_span(sd)); 4897 id = cpumask_first(sched_domain_span(sd));
5132 size = cpumask_weight(sched_domain_span(sd)); 4898 size = cpumask_weight(sched_domain_span(sd));
4899 rcu_assign_pointer(per_cpu(sd_busy, cpu), sd->parent);
5133 } 4900 }
5134 4901
5135 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 4902 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5136 per_cpu(sd_llc_size, cpu) = size; 4903 per_cpu(sd_llc_size, cpu) = size;
5137 per_cpu(sd_llc_id, cpu) = id; 4904 per_cpu(sd_llc_id, cpu) = id;
4905
4906 sd = lowest_flag_domain(cpu, SD_NUMA);
4907 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
4908
4909 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
4910 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
5138} 4911}
5139 4912
5140/* 4913/*
@@ -5654,6 +5427,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5654 | 0*SD_SHARE_PKG_RESOURCES 5427 | 0*SD_SHARE_PKG_RESOURCES
5655 | 1*SD_SERIALIZE 5428 | 1*SD_SERIALIZE
5656 | 0*SD_PREFER_SIBLING 5429 | 0*SD_PREFER_SIBLING
5430 | 1*SD_NUMA
5657 | sd_local_flags(level) 5431 | sd_local_flags(level)
5658 , 5432 ,
5659 .last_balance = jiffies, 5433 .last_balance = jiffies,
@@ -6335,14 +6109,17 @@ void __init sched_init_smp(void)
6335 6109
6336 sched_init_numa(); 6110 sched_init_numa();
6337 6111
6338 get_online_cpus(); 6112 /*
6113 * There's no userspace yet to cause hotplug operations; hence all the
6114 * cpu masks are stable and all blatant races in the below code cannot
6115 * happen.
6116 */
6339 mutex_lock(&sched_domains_mutex); 6117 mutex_lock(&sched_domains_mutex);
6340 init_sched_domains(cpu_active_mask); 6118 init_sched_domains(cpu_active_mask);
6341 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6119 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6342 if (cpumask_empty(non_isolated_cpus)) 6120 if (cpumask_empty(non_isolated_cpus))
6343 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6121 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6344 mutex_unlock(&sched_domains_mutex); 6122 mutex_unlock(&sched_domains_mutex);
6345 put_online_cpus();
6346 6123
6347 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6124 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6348 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6125 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6505,6 +6282,7 @@ void __init sched_init(void)
6505 rq->online = 0; 6282 rq->online = 0;
6506 rq->idle_stamp = 0; 6283 rq->idle_stamp = 0;
6507 rq->avg_idle = 2*sysctl_sched_migration_cost; 6284 rq->avg_idle = 2*sysctl_sched_migration_cost;
6285 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6508 6286
6509 INIT_LIST_HEAD(&rq->cfs_tasks); 6287 INIT_LIST_HEAD(&rq->cfs_tasks);
6510 6288
@@ -7277,7 +7055,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7277 7055
7278 runtime_enabled = quota != RUNTIME_INF; 7056 runtime_enabled = quota != RUNTIME_INF;
7279 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7057 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7280 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7058 /*
7059 * If we need to toggle cfs_bandwidth_used, off->on must occur
7060 * before making related changes, and on->off must occur afterwards
7061 */
7062 if (runtime_enabled && !runtime_was_enabled)
7063 cfs_bandwidth_usage_inc();
7281 raw_spin_lock_irq(&cfs_b->lock); 7064 raw_spin_lock_irq(&cfs_b->lock);
7282 cfs_b->period = ns_to_ktime(period); 7065 cfs_b->period = ns_to_ktime(period);
7283 cfs_b->quota = quota; 7066 cfs_b->quota = quota;
@@ -7303,6 +7086,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7303 unthrottle_cfs_rq(cfs_rq); 7086 unthrottle_cfs_rq(cfs_rq);
7304 raw_spin_unlock_irq(&rq->lock); 7087 raw_spin_unlock_irq(&rq->lock);
7305 } 7088 }
7089 if (runtime_was_enabled && !runtime_enabled)
7090 cfs_bandwidth_usage_dec();
7306out_unlock: 7091out_unlock:
7307 mutex_unlock(&cfs_constraints_mutex); 7092 mutex_unlock(&cfs_constraints_mutex);
7308 7093
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 196559994f7c..5c34d1817e8f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18#include <linux/mempolicy.h>
18 19
19#include "sched.h" 20#include "sched.h"
20 21
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
137 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 138 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
138 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
139#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
143#endif
140#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
141 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
142#endif 146#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 read_lock_irqsave(&tasklist_lock, flags); 163 read_lock_irqsave(&tasklist_lock, flags);
160 164
161 do_each_thread(g, p) { 165 do_each_thread(g, p) {
162 if (!p->on_rq || task_cpu(p) != rq_cpu) 166 if (task_cpu(p) != rq_cpu)
163 continue; 167 continue;
164 168
165 print_task(m, rq, p); 169 print_task(m, rq, p);
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
225 atomic_read(&cfs_rq->tg->runnable_avg)); 229 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 230#endif
227#endif 231#endif
232#ifdef CONFIG_CFS_BANDWIDTH
233 SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
234 cfs_rq->tg->cfs_bandwidth.timer_active);
235 SEQ_printf(m, " .%-30s: %d\n", "throttled",
236 cfs_rq->throttled);
237 SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
238 cfs_rq->throttle_count);
239#endif
228 240
229#ifdef CONFIG_FAIR_GROUP_SCHED 241#ifdef CONFIG_FAIR_GROUP_SCHED
230 print_cfs_group_stats(m, cpu, cfs_rq->tg); 242 print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
345 cpu_clk = local_clock(); 357 cpu_clk = local_clock();
346 local_irq_restore(flags); 358 local_irq_restore(flags);
347 359
348 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", 360 SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
349 init_utsname()->release, 361 init_utsname()->release,
350 (int)strcspn(init_utsname()->version, " "), 362 (int)strcspn(init_utsname()->version, " "),
351 init_utsname()->version); 363 init_utsname()->version);
@@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void)
488 500
489__initcall(init_sched_debug_procfs); 501__initcall(init_sched_debug_procfs);
490 502
503#define __P(F) \
504 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
505#define P(F) \
506 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
507#define __PN(F) \
508 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
509#define PN(F) \
510 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
511
512
513static void sched_show_numa(struct task_struct *p, struct seq_file *m)
514{
515#ifdef CONFIG_NUMA_BALANCING
516 struct mempolicy *pol;
517 int node, i;
518
519 if (p->mm)
520 P(mm->numa_scan_seq);
521
522 task_lock(p);
523 pol = p->mempolicy;
524 if (pol && !(pol->flags & MPOL_F_MORON))
525 pol = NULL;
526 mpol_get(pol);
527 task_unlock(p);
528
529 SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
530
531 for_each_online_node(node) {
532 for (i = 0; i < 2; i++) {
533 unsigned long nr_faults = -1;
534 int cpu_current, home_node;
535
536 if (p->numa_faults)
537 nr_faults = p->numa_faults[2*node + i];
538
539 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes));
541
542 home_node = (p->numa_preferred_nid == node);
543
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults);
546 }
547 }
548
549 mpol_put(pol);
550#endif
551}
552
491void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 553void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
492{ 554{
493 unsigned long nr_switches; 555 unsigned long nr_switches;
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
591 SEQ_printf(m, "%-45s:%21Ld\n", 653 SEQ_printf(m, "%-45s:%21Ld\n",
592 "clock-delta", (long long)(t1-t0)); 654 "clock-delta", (long long)(t1-t0));
593 } 655 }
656
657 sched_show_numa(p, m);
594} 658}
595 659
596void proc_sched_set_task(struct task_struct *p) 660void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c70201fbc61..df77c605c7a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
681} 681}
682 682
683#ifdef CONFIG_SMP 683#ifdef CONFIG_SMP
684static unsigned long task_h_load(struct task_struct *p);
685
684static inline void __update_task_entity_contrib(struct sched_entity *se); 686static inline void __update_task_entity_contrib(struct sched_entity *se);
685 687
686/* Give new task start runnable values to heavy its load in infant time */ 688/* Give new task start runnable values to heavy its load in infant time */
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
818 820
819#ifdef CONFIG_NUMA_BALANCING 821#ifdef CONFIG_NUMA_BALANCING
820/* 822/*
821 * numa task sample period in ms 823 * Approximate time to scan a full NUMA task in ms. The task scan period is
824 * calculated based on the task's virtual memory size and
825 * numa_balancing_scan_size.
822 */ 826 */
823unsigned int sysctl_numa_balancing_scan_period_min = 100; 827unsigned int sysctl_numa_balancing_scan_period_min = 1000;
824unsigned int sysctl_numa_balancing_scan_period_max = 100*50; 828unsigned int sysctl_numa_balancing_scan_period_max = 60000;
825unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 829
827/* Portion of address space to scan in MB */ 830/* Portion of address space to scan in MB */
828unsigned int sysctl_numa_balancing_scan_size = 256; 831unsigned int sysctl_numa_balancing_scan_size = 256;
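
The scan period bounds above (now 1000ms..60000ms, with the _reset knob gone) apply per scan window, where one window is sysctl_numa_balancing_scan_size MB of resident memory; task_nr_scan_windows() and task_scan_min(), added a little further down, turn a task's RSS into an effective per-task period. The standalone recreation below uses the defaults from this hunk and assumes 4KiB pages.

#include <stdio.h>

#define PAGE_SHIFT 12					/* assumes 4KiB pages */

/* Defaults from this hunk and the helpers added below it. */
static const unsigned int scan_period_min = 1000;	/* ms */
static const unsigned int scan_size = 256;		/* MB per scan window */
static const unsigned int max_scan_window = 2560;	/* MB/sec cap */

static unsigned int nr_scan_windows(unsigned long rss_pages)
{
	unsigned long nr_scan_pages = (unsigned long)scan_size << (20 - PAGE_SHIFT);

	if (!rss_pages)
		rss_pages = nr_scan_pages;
	/* round up to a whole number of windows */
	return (rss_pages + nr_scan_pages - 1) / nr_scan_pages;
}

static unsigned int scan_min_ms(unsigned int windows)
{
	unsigned int floor = 1000 / (max_scan_window / scan_size);
	unsigned int scan = scan_period_min / windows;

	return scan > floor ? scan : floor;
}

int main(void)
{
	const unsigned long rss_mb[] = { 64, 256, 4096, 65536 };

	for (unsigned int i = 0; i < sizeof(rss_mb) / sizeof(rss_mb[0]); i++) {
		unsigned int windows = nr_scan_windows(rss_mb[i] << (20 - PAGE_SHIFT));

		printf("rss %6luMB -> %4u scan windows, min scan period %u ms\n",
		       rss_mb[i], windows, scan_min_ms(windows));
	}
	return 0;
}

A small task keeps the full 1000ms minimum, while a task with tens of gigabytes resident bottoms out at the 100ms floor implied by the 2560MB/sec cap.
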
@@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
830/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 833/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831unsigned int sysctl_numa_balancing_scan_delay = 1000; 834unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 835
833static void task_numa_placement(struct task_struct *p) 836/*
837 * After skipping a page migration on a shared page, skip N more numa page
838 * migrations unconditionally. This reduces the number of NUMA migrations
839 * in shared memory workloads, and has the effect of pulling tasks towards
840 * where their memory lives, over pulling the memory towards the task.
841 */
842unsigned int sysctl_numa_balancing_migrate_deferred = 16;
843
844static unsigned int task_nr_scan_windows(struct task_struct *p)
845{
846 unsigned long rss = 0;
847 unsigned long nr_scan_pages;
848
849 /*
850 * Calculations based on RSS as non-present and empty pages are skipped
851 * by the PTE scanner and NUMA hinting faults should be trapped based
852 * on resident pages
853 */
854 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
855 rss = get_mm_rss(p->mm);
856 if (!rss)
857 rss = nr_scan_pages;
858
859 rss = round_up(rss, nr_scan_pages);
860 return rss / nr_scan_pages;
861}
862
863/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
864#define MAX_SCAN_WINDOW 2560
865
866static unsigned int task_scan_min(struct task_struct *p)
867{
868 unsigned int scan, floor;
869 unsigned int windows = 1;
870
871 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
872 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
873 floor = 1000 / windows;
874
875 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
876 return max_t(unsigned int, floor, scan);
877}
878
879static unsigned int task_scan_max(struct task_struct *p)
880{
881 unsigned int smin = task_scan_min(p);
882 unsigned int smax;
883
884 /* Watch for min being lower than max due to floor calculations */
885 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
886 return max(smin, smax);
887}
888
889/*
890 * Once a preferred node is selected the scheduler balancer will prefer moving
891 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
892 * scans. This will give the process the chance to accumulate more faults on
893 * the preferred node but still allow the scheduler to move the task again if
894 * the node's CPUs are overloaded.
895 */
896unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
897
898static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
899{
900 rq->nr_numa_running += (p->numa_preferred_nid != -1);
901 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
902}
903
904static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
905{
906 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
907 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
908}
909
910struct numa_group {
911 atomic_t refcount;
912
913 spinlock_t lock; /* nr_tasks, tasks */
914 int nr_tasks;
915 pid_t gid;
916 struct list_head task_list;
917
918 struct rcu_head rcu;
919 unsigned long total_faults;
920 unsigned long faults[0];
921};
922
923pid_t task_numa_group_id(struct task_struct *p)
924{
925 return p->numa_group ? p->numa_group->gid : 0;
926}
927
928static inline int task_faults_idx(int nid, int priv)
929{
930 return 2 * nid + priv;
931}
932
933static inline unsigned long task_faults(struct task_struct *p, int nid)
934{
935 if (!p->numa_faults)
936 return 0;
937
938 return p->numa_faults[task_faults_idx(nid, 0)] +
939 p->numa_faults[task_faults_idx(nid, 1)];
940}
941
942static inline unsigned long group_faults(struct task_struct *p, int nid)
943{
944 if (!p->numa_group)
945 return 0;
946
947 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
948}
949
950/*
951 * These return the fraction of accesses done by a particular task, or
952 * task group, on a particular numa node. The group weight is given a
953 * larger multiplier, in order to group tasks together that are almost
954 * evenly spread out between numa nodes.
955 */
956static inline unsigned long task_weight(struct task_struct *p, int nid)
957{
958 unsigned long total_faults;
959
960 if (!p->numa_faults)
961 return 0;
962
963 total_faults = p->total_numa_faults;
964
965 if (!total_faults)
966 return 0;
967
968 return 1000 * task_faults(p, nid) / total_faults;
969}
970
971static inline unsigned long group_weight(struct task_struct *p, int nid)
834{ 972{
835 int seq; 973 if (!p->numa_group || !p->numa_group->total_faults)
974 return 0;
836 975
837 if (!p->mm) /* for example, ksmd faulting in a user's mm */ 976 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
977}
978
979static unsigned long weighted_cpuload(const int cpu);
980static unsigned long source_load(int cpu, int type);
981static unsigned long target_load(int cpu, int type);
982static unsigned long power_of(int cpu);
983static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
984
985/* Cached statistics for all CPUs within a node */
986struct numa_stats {
987 unsigned long nr_running;
988 unsigned long load;
989
990 /* Total compute capacity of CPUs on a node */
991 unsigned long power;
992
993 /* Approximate capacity in terms of runnable tasks on a node */
994 unsigned long capacity;
995 int has_capacity;
996};
997
998/*
999 * XXX borrowed from update_sg_lb_stats
1000 */
1001static void update_numa_stats(struct numa_stats *ns, int nid)
1002{
1003 int cpu;
1004
1005 memset(ns, 0, sizeof(*ns));
1006 for_each_cpu(cpu, cpumask_of_node(nid)) {
1007 struct rq *rq = cpu_rq(cpu);
1008
1009 ns->nr_running += rq->nr_running;
1010 ns->load += weighted_cpuload(cpu);
1011 ns->power += power_of(cpu);
1012 }
1013
1014 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1015 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1016 ns->has_capacity = (ns->nr_running < ns->capacity);
1017}
1018
1019struct task_numa_env {
1020 struct task_struct *p;
1021
1022 int src_cpu, src_nid;
1023 int dst_cpu, dst_nid;
1024
1025 struct numa_stats src_stats, dst_stats;
1026
1027 int imbalance_pct, idx;
1028
1029 struct task_struct *best_task;
1030 long best_imp;
1031 int best_cpu;
1032};
1033
1034static void task_numa_assign(struct task_numa_env *env,
1035 struct task_struct *p, long imp)
1036{
1037 if (env->best_task)
1038 put_task_struct(env->best_task);
1039 if (p)
1040 get_task_struct(p);
1041
1042 env->best_task = p;
1043 env->best_imp = imp;
1044 env->best_cpu = env->dst_cpu;
1045}
1046
1047/*
1048 * This checks if the overall compute and NUMA accesses of the system would
 1049 * be improved if the source task were migrated to the target dst_cpu, taking
 1050 * into account that it might be best if the task running on the dst_cpu should
 1051 * be exchanged with the source task.
1052 */
1053static void task_numa_compare(struct task_numa_env *env,
1054 long taskimp, long groupimp)
1055{
1056 struct rq *src_rq = cpu_rq(env->src_cpu);
1057 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1058 struct task_struct *cur;
1059 long dst_load, src_load;
1060 long load;
1061 long imp = (groupimp > 0) ? groupimp : taskimp;
1062
1063 rcu_read_lock();
1064 cur = ACCESS_ONCE(dst_rq->curr);
1065 if (cur->pid == 0) /* idle */
1066 cur = NULL;
1067
1068 /*
1069 * "imp" is the fault differential for the source task between the
1070 * source and destination node. Calculate the total differential for
1071 * the source task and potential destination task. The more negative
 1072 * the value is, the more remote accesses would be expected to
1073 * be incurred if the tasks were swapped.
1074 */
1075 if (cur) {
1076 /* Skip this swap candidate if cannot move to the source cpu */
1077 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1078 goto unlock;
1079
1080 /*
1081 * If dst and source tasks are in the same NUMA group, or not
1082 * in any group then look only at task weights.
1083 */
1084 if (cur->numa_group == env->p->numa_group) {
1085 imp = taskimp + task_weight(cur, env->src_nid) -
1086 task_weight(cur, env->dst_nid);
1087 /*
1088 * Add some hysteresis to prevent swapping the
1089 * tasks within a group over tiny differences.
1090 */
1091 if (cur->numa_group)
1092 imp -= imp/16;
1093 } else {
1094 /*
1095 * Compare the group weights. If a task is all by
1096 * itself (not part of a group), use the task weight
1097 * instead.
1098 */
1099 if (env->p->numa_group)
1100 imp = groupimp;
1101 else
1102 imp = taskimp;
1103
1104 if (cur->numa_group)
1105 imp += group_weight(cur, env->src_nid) -
1106 group_weight(cur, env->dst_nid);
1107 else
1108 imp += task_weight(cur, env->src_nid) -
1109 task_weight(cur, env->dst_nid);
1110 }
1111 }
1112
1113 if (imp < env->best_imp)
1114 goto unlock;
1115
1116 if (!cur) {
1117 /* Is there capacity at our destination? */
1118 if (env->src_stats.has_capacity &&
1119 !env->dst_stats.has_capacity)
1120 goto unlock;
1121
1122 goto balance;
1123 }
1124
1125 /* Balance doesn't matter much if we're running a task per cpu */
1126 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1127 goto assign;
1128
1129 /*
1130 * In the overloaded case, try and keep the load balanced.
1131 */
1132balance:
1133 dst_load = env->dst_stats.load;
1134 src_load = env->src_stats.load;
1135
1136 /* XXX missing power terms */
1137 load = task_h_load(env->p);
1138 dst_load += load;
1139 src_load -= load;
1140
1141 if (cur) {
1142 load = task_h_load(cur);
1143 dst_load -= load;
1144 src_load += load;
1145 }
1146
1147 /* make src_load the smaller */
1148 if (dst_load < src_load)
1149 swap(dst_load, src_load);
1150
1151 if (src_load * env->imbalance_pct < dst_load * 100)
1152 goto unlock;
1153
1154assign:
1155 task_numa_assign(env, cur, imp);
1156unlock:
1157 rcu_read_unlock();
1158}
1159
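The final check in task_numa_compare() is the usual imbalance_pct style test; this small standalone sketch (invented loads, with 112 standing in for the halved NUMA imbalance_pct computed in task_numa_migrate() below) shows when a move or swap is rejected:

#include <stdbool.h>
#include <stdio.h>

/*
 * Mirrors the tail of task_numa_compare(): after adding/removing the task
 * loads, reject the move if the smaller side would lag the larger side by
 * more than imbalance_pct percent.
 */
static bool load_balanced_enough(long src_load, long dst_load, int imbalance_pct)
{
	if (dst_load < src_load) {		/* make src_load the smaller */
		long tmp = src_load;
		src_load = dst_load;
		dst_load = tmp;
	}
	return src_load * imbalance_pct >= dst_load * 100;
}

int main(void)
{
	printf("%d\n", load_balanced_enough(1000, 1100, 112));	/* 1: within 12% */
	printf("%d\n", load_balanced_enough(1000, 1200, 112));	/* 0: too skewed */
	return 0;
}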
1160static void task_numa_find_cpu(struct task_numa_env *env,
1161 long taskimp, long groupimp)
1162{
1163 int cpu;
1164
1165 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1166 /* Skip this CPU if the source task cannot migrate */
1167 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1168 continue;
1169
1170 env->dst_cpu = cpu;
1171 task_numa_compare(env, taskimp, groupimp);
1172 }
1173}
1174
1175static int task_numa_migrate(struct task_struct *p)
1176{
1177 struct task_numa_env env = {
1178 .p = p,
1179
1180 .src_cpu = task_cpu(p),
1181 .src_nid = task_node(p),
1182
1183 .imbalance_pct = 112,
1184
1185 .best_task = NULL,
1186 .best_imp = 0,
1187 .best_cpu = -1
1188 };
1189 struct sched_domain *sd;
1190 unsigned long taskweight, groupweight;
1191 int nid, ret;
1192 long taskimp, groupimp;
1193
1194 /*
1195 * Pick the lowest SD_NUMA domain, as that would have the smallest
1196 * imbalance and would be the first to start moving tasks about.
1197 *
1198 * And we want to avoid any moving of tasks about, as that would create
1199 * random movement of tasks -- counter the numa conditions we're trying
1200 * to satisfy here.
1201 */
1202 rcu_read_lock();
1203 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1204 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1205 rcu_read_unlock();
1206
1207 taskweight = task_weight(p, env.src_nid);
1208 groupweight = group_weight(p, env.src_nid);
1209 update_numa_stats(&env.src_stats, env.src_nid);
1210 env.dst_nid = p->numa_preferred_nid;
1211 taskimp = task_weight(p, env.dst_nid) - taskweight;
1212 groupimp = group_weight(p, env.dst_nid) - groupweight;
1213 update_numa_stats(&env.dst_stats, env.dst_nid);
1214
1215 /* If the preferred nid has capacity, try to use it. */
1216 if (env.dst_stats.has_capacity)
1217 task_numa_find_cpu(&env, taskimp, groupimp);
1218
1219 /* No space available on the preferred nid. Look elsewhere. */
1220 if (env.best_cpu == -1) {
1221 for_each_online_node(nid) {
1222 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1223 continue;
1224
 1226 /* Only consider nodes where both the task and its group benefit */
1226 taskimp = task_weight(p, nid) - taskweight;
1227 groupimp = group_weight(p, nid) - groupweight;
1228 if (taskimp < 0 && groupimp < 0)
1229 continue;
1230
1231 env.dst_nid = nid;
1232 update_numa_stats(&env.dst_stats, env.dst_nid);
1233 task_numa_find_cpu(&env, taskimp, groupimp);
1234 }
1235 }
1236
1237 /* No better CPU than the current one was found. */
1238 if (env.best_cpu == -1)
1239 return -EAGAIN;
1240
1241 sched_setnuma(p, env.dst_nid);
1242
1243 /*
1244 * Reset the scan period if the task is being rescheduled on an
 1245 * alternative node to recheck if the task is now properly placed.
1246 */
1247 p->numa_scan_period = task_scan_min(p);
1248
1249 if (env.best_task == NULL) {
1250 int ret = migrate_task_to(p, env.best_cpu);
1251 return ret;
1252 }
1253
1254 ret = migrate_swap(p, env.best_task);
1255 put_task_struct(env.best_task);
1256 return ret;
1257}
1258
1259/* Attempt to migrate a task to a CPU on the preferred node. */
1260static void numa_migrate_preferred(struct task_struct *p)
1261{
1262 /* This task has no NUMA fault statistics yet */
1263 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1264 return;
1265
1266 /* Periodically retry migrating the task to the preferred node */
1267 p->numa_migrate_retry = jiffies + HZ;
1268
1269 /* Success if task is already running on preferred CPU */
1270 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
838 return; 1271 return;
1272
1273 /* Otherwise, try migrate to a CPU on the preferred node */
1274 task_numa_migrate(p);
1275}
1276
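The retry gate above relies on the kernel's wrap-safe jiffies comparison; a rough userspace sketch with a simplified time_after() (the real macro also type-checks its arguments, which this sketch does not):

#include <stdio.h>

/* Simplified, wrap-safe "a is after b", in the spirit of time_after(). */
#define time_after(a, b)	((long)((b) - (a)) < 0)

int main(void)
{
	unsigned long hz = 1000;		/* pretend HZ */
	unsigned long jiffies = ~0UL - 16;	/* counter close to wrapping */
	unsigned long retry = jiffies + hz;	/* numa_migrate_retry window */

	printf("%d\n", time_after(jiffies, retry));		/* 0: too early   */
	printf("%d\n", time_after(jiffies + 2 * hz, retry));	/* 1: window over */
	return 0;
}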
1277/*
1278 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1279 * increments. The more local the fault statistics are, the higher the scan
 1280 * period will be for the next scan window. If the local/remote ratio is below
 1281 * NUMA_PERIOD_THRESHOLD (where the range of the ratio is 1..NUMA_PERIOD_SLOTS),
 1282 * the scan period will decrease.
1283 */
1284#define NUMA_PERIOD_SLOTS 10
1285#define NUMA_PERIOD_THRESHOLD 3
1286
1287/*
1288 * Increase the scan period (slow down scanning) if the majority of
1289 * our memory is already on our local node, or if the majority of
1290 * the page accesses are shared with other processes.
1291 * Otherwise, decrease the scan period.
1292 */
1293static void update_task_scan_period(struct task_struct *p,
1294 unsigned long shared, unsigned long private)
1295{
1296 unsigned int period_slot;
1297 int ratio;
1298 int diff;
1299
1300 unsigned long remote = p->numa_faults_locality[0];
1301 unsigned long local = p->numa_faults_locality[1];
1302
1303 /*
 1304 * If there were no recorded hinting faults then either the task is
 1305 * completely idle or all activity is in areas that are not of interest
 1306 * to automatic numa balancing. Scan slower.
1307 */
1308 if (local + shared == 0) {
1309 p->numa_scan_period = min(p->numa_scan_period_max,
1310 p->numa_scan_period << 1);
1311
1312 p->mm->numa_next_scan = jiffies +
1313 msecs_to_jiffies(p->numa_scan_period);
1314
1315 return;
1316 }
1317
1318 /*
1319 * Prepare to scale scan period relative to the current period.
1320 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1321 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1322 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1323 */
1324 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1325 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1326 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1327 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1328 if (!slot)
1329 slot = 1;
1330 diff = slot * period_slot;
1331 } else {
1332 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1333
1334 /*
1335 * Scale scan rate increases based on sharing. There is an
1336 * inverse relationship between the degree of sharing and
1337 * the adjustment made to the scanning period. Broadly
 1338 * speaking, the intent is that there is little point
 1339 * scanning faster if shared accesses dominate, as doing so may
 1340 * simply bounce migrations uselessly.
1341 */
1342 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1343 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1344 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1345 }
1346
1347 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1348 task_scan_min(p), task_scan_max(p));
1349 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1350}
1351
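To make the slot arithmetic concrete, here is a standalone sketch of the adjustment computed above (the fault counts are invented; the clamp to task_scan_min()/task_scan_max() and the kernel's intermediate period_slot recomputation are left out):

#include <stdio.h>

#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	3
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Returns the signed change (in ms) applied to the scan period. */
static int scan_period_diff(unsigned int period, unsigned long local,
			    unsigned long remote, unsigned long priv,
			    unsigned long shared)
{
	unsigned int period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
	int ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	int diff;

	if (ratio >= NUMA_PERIOD_THRESHOLD) {
		int slot = ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;		/* slow scanning down */
	} else {
		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
		/* Scale the speed-up down by the private share of faults. */
		ratio = DIV_ROUND_UP(priv * NUMA_PERIOD_SLOTS, priv + shared);
		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
	}
	return diff;
}

int main(void)
{
	/* 80% local faults: a 1000ms period grows by 5 slots (+500ms). */
	printf("%d\n", scan_period_diff(1000, 800, 200, 500, 500));
	/* 10% local and mostly shared: the -200ms speed-up is damped to -20ms. */
	printf("%d\n", scan_period_diff(1000, 100, 900, 100, 900));
	return 0;
}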
1352static void task_numa_placement(struct task_struct *p)
1353{
1354 int seq, nid, max_nid = -1, max_group_nid = -1;
1355 unsigned long max_faults = 0, max_group_faults = 0;
1356 unsigned long fault_types[2] = { 0, 0 };
1357 spinlock_t *group_lock = NULL;
1358
839 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1359 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 if (p->numa_scan_seq == seq) 1360 if (p->numa_scan_seq == seq)
841 return; 1361 return;
842 p->numa_scan_seq = seq; 1362 p->numa_scan_seq = seq;
1363 p->numa_scan_period_max = task_scan_max(p);
1364
 1365 /* If the task is part of a group, prevent parallel updates to group stats */
1366 if (p->numa_group) {
1367 group_lock = &p->numa_group->lock;
1368 spin_lock(group_lock);
1369 }
1370
1371 /* Find the node with the highest number of faults */
1372 for_each_online_node(nid) {
1373 unsigned long faults = 0, group_faults = 0;
1374 int priv, i;
1375
1376 for (priv = 0; priv < 2; priv++) {
1377 long diff;
1378
1379 i = task_faults_idx(nid, priv);
1380 diff = -p->numa_faults[i];
1381
1382 /* Decay existing window, copy faults since last scan */
1383 p->numa_faults[i] >>= 1;
1384 p->numa_faults[i] += p->numa_faults_buffer[i];
1385 fault_types[priv] += p->numa_faults_buffer[i];
1386 p->numa_faults_buffer[i] = 0;
1387
1388 faults += p->numa_faults[i];
1389 diff += p->numa_faults[i];
1390 p->total_numa_faults += diff;
1391 if (p->numa_group) {
1392 /* safe because we can only change our own group */
1393 p->numa_group->faults[i] += diff;
1394 p->numa_group->total_faults += diff;
1395 group_faults += p->numa_group->faults[i];
1396 }
1397 }
1398
1399 if (faults > max_faults) {
1400 max_faults = faults;
1401 max_nid = nid;
1402 }
1403
1404 if (group_faults > max_group_faults) {
1405 max_group_faults = group_faults;
1406 max_group_nid = nid;
1407 }
1408 }
1409
1410 update_task_scan_period(p, fault_types[0], fault_types[1]);
1411
1412 if (p->numa_group) {
1413 /*
1414 * If the preferred task and group nids are different,
1415 * iterate over the nodes again to find the best place.
1416 */
1417 if (max_nid != max_group_nid) {
1418 unsigned long weight, max_weight = 0;
1419
1420 for_each_online_node(nid) {
1421 weight = task_weight(p, nid) + group_weight(p, nid);
1422 if (weight > max_weight) {
1423 max_weight = weight;
1424 max_nid = nid;
1425 }
1426 }
1427 }
1428
1429 spin_unlock(group_lock);
1430 }
1431
1432 /* Preferred node as the node with the most faults */
1433 if (max_faults && max_nid != p->numa_preferred_nid) {
1434 /* Update the preferred nid and migrate task if possible */
1435 sched_setnuma(p, max_nid);
1436 numa_migrate_preferred(p);
1437 }
1438}
1439
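The decay step in the loop above halves the previous window before adding the freshly collected buffer, so old fault history fades geometrically; a standalone sketch with invented per-scan samples:

#include <stdio.h>

int main(void)
{
	/* Hypothetical faults recorded on one node in successive scan windows. */
	unsigned long buffer[] = { 400, 400, 400, 0, 0, 0 };
	unsigned long faults = 0;
	unsigned int i;

	for (i = 0; i < sizeof(buffer) / sizeof(buffer[0]); i++) {
		faults >>= 1;			/* decay the existing window  */
		faults += buffer[i];		/* add faults since last scan */
		printf("scan %u: faults=%lu\n", i, faults);
	}
	return 0;
}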
1440static inline int get_numa_group(struct numa_group *grp)
1441{
1442 return atomic_inc_not_zero(&grp->refcount);
1443}
1444
1445static inline void put_numa_group(struct numa_group *grp)
1446{
1447 if (atomic_dec_and_test(&grp->refcount))
1448 kfree_rcu(grp, rcu);
1449}
1450
1451static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1452 int *priv)
1453{
1454 struct numa_group *grp, *my_grp;
1455 struct task_struct *tsk;
1456 bool join = false;
1457 int cpu = cpupid_to_cpu(cpupid);
1458 int i;
1459
1460 if (unlikely(!p->numa_group)) {
1461 unsigned int size = sizeof(struct numa_group) +
1462 2*nr_node_ids*sizeof(unsigned long);
1463
1464 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1465 if (!grp)
1466 return;
1467
1468 atomic_set(&grp->refcount, 1);
1469 spin_lock_init(&grp->lock);
1470 INIT_LIST_HEAD(&grp->task_list);
1471 grp->gid = p->pid;
1472
1473 for (i = 0; i < 2*nr_node_ids; i++)
1474 grp->faults[i] = p->numa_faults[i];
1475
1476 grp->total_faults = p->total_numa_faults;
1477
1478 list_add(&p->numa_entry, &grp->task_list);
1479 grp->nr_tasks++;
1480 rcu_assign_pointer(p->numa_group, grp);
1481 }
1482
1483 rcu_read_lock();
1484 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1485
1486 if (!cpupid_match_pid(tsk, cpupid))
1487 goto no_join;
1488
1489 grp = rcu_dereference(tsk->numa_group);
1490 if (!grp)
1491 goto no_join;
1492
1493 my_grp = p->numa_group;
1494 if (grp == my_grp)
1495 goto no_join;
1496
1497 /*
 1498 * Only join the other group if it's bigger; if we're the bigger group,
1499 * the other task will join us.
1500 */
1501 if (my_grp->nr_tasks > grp->nr_tasks)
1502 goto no_join;
1503
1504 /*
1505 * Tie-break on the grp address.
1506 */
1507 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1508 goto no_join;
1509
1510 /* Always join threads in the same process. */
1511 if (tsk->mm == current->mm)
1512 join = true;
1513
1514 /* Simple filter to avoid false positives due to PID collisions */
1515 if (flags & TNF_SHARED)
1516 join = true;
1517
1518 /* Update priv based on whether false sharing was detected */
1519 *priv = !join;
1520
1521 if (join && !get_numa_group(grp))
1522 goto no_join;
843 1523
844 /* FIXME: Scheduling placement policy hints go here */ 1524 rcu_read_unlock();
1525
1526 if (!join)
1527 return;
1528
1529 double_lock(&my_grp->lock, &grp->lock);
1530
1531 for (i = 0; i < 2*nr_node_ids; i++) {
1532 my_grp->faults[i] -= p->numa_faults[i];
1533 grp->faults[i] += p->numa_faults[i];
1534 }
1535 my_grp->total_faults -= p->total_numa_faults;
1536 grp->total_faults += p->total_numa_faults;
1537
1538 list_move(&p->numa_entry, &grp->task_list);
1539 my_grp->nr_tasks--;
1540 grp->nr_tasks++;
1541
1542 spin_unlock(&my_grp->lock);
1543 spin_unlock(&grp->lock);
1544
1545 rcu_assign_pointer(p->numa_group, grp);
1546
1547 put_numa_group(my_grp);
1548 return;
1549
1550no_join:
1551 rcu_read_unlock();
1552 return;
1553}
1554
1555void task_numa_free(struct task_struct *p)
1556{
1557 struct numa_group *grp = p->numa_group;
1558 int i;
1559 void *numa_faults = p->numa_faults;
1560
1561 if (grp) {
1562 spin_lock(&grp->lock);
1563 for (i = 0; i < 2*nr_node_ids; i++)
1564 grp->faults[i] -= p->numa_faults[i];
1565 grp->total_faults -= p->total_numa_faults;
1566
1567 list_del(&p->numa_entry);
1568 grp->nr_tasks--;
1569 spin_unlock(&grp->lock);
1570 rcu_assign_pointer(p->numa_group, NULL);
1571 put_numa_group(grp);
1572 }
1573
1574 p->numa_faults = NULL;
1575 p->numa_faults_buffer = NULL;
1576 kfree(numa_faults);
845} 1577}
846 1578
847/* 1579/*
848 * Got a PROT_NONE fault for a page on @node. 1580 * Got a PROT_NONE fault for a page on @node.
849 */ 1581 */
850void task_numa_fault(int node, int pages, bool migrated) 1582void task_numa_fault(int last_cpupid, int node, int pages, int flags)
851{ 1583{
852 struct task_struct *p = current; 1584 struct task_struct *p = current;
1585 bool migrated = flags & TNF_MIGRATED;
1586 int priv;
853 1587
854 if (!numabalancing_enabled) 1588 if (!numabalancing_enabled)
855 return; 1589 return;
856 1590
857 /* FIXME: Allocate task-specific structure for placement policy here */ 1591 /* for example, ksmd faulting in a user's mm */
1592 if (!p->mm)
1593 return;
1594
1595 /* Do not worry about placement if exiting */
1596 if (p->state == TASK_DEAD)
1597 return;
1598
1599 /* Allocate buffer to track faults on a per-node basis */
1600 if (unlikely(!p->numa_faults)) {
1601 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1602
1603 /* numa_faults and numa_faults_buffer share the allocation */
1604 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1605 if (!p->numa_faults)
1606 return;
1607
1608 BUG_ON(p->numa_faults_buffer);
1609 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1610 p->total_numa_faults = 0;
1611 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1612 }
858 1613
859 /* 1614 /*
860 * If pages are properly placed (did not migrate) then scan slower. 1615 * First accesses are treated as private, otherwise consider accesses
861 * This is reset periodically in case of phase changes 1616 * to be private if the accessing pid has not changed
862 */ 1617 */
863 if (!migrated) 1618 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
864 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, 1619 priv = 1;
865 p->numa_scan_period + jiffies_to_msecs(10)); 1620 } else {
1621 priv = cpupid_match_pid(p, last_cpupid);
1622 if (!priv && !(flags & TNF_NO_GROUP))
1623 task_numa_group(p, last_cpupid, flags, &priv);
1624 }
866 1625
867 task_numa_placement(p); 1626 task_numa_placement(p);
1627
1628 /*
 1629 * Retry task-to-preferred-node migration periodically, in case it
 1630 * previously failed, or the scheduler moved us.
1631 */
1632 if (time_after(jiffies, p->numa_migrate_retry))
1633 numa_migrate_preferred(p);
1634
1635 if (migrated)
1636 p->numa_pages_migrated += pages;
1637
1638 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1639 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
868} 1640}
869 1641
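The private/shared decision above compares the truncated PID stored in the page's cpupid with the faulting task; a rough userspace sketch (the 8-bit width is illustrative only and is not the kernel's LAST_CPUPID layout):

#include <stdbool.h>
#include <stdio.h>

#define PID_BITS	8			/* illustrative width only */
#define PID_MASK	((1 << PID_BITS) - 1)

/*
 * Treat the access as private when the previous accessor recorded in the
 * page (modulo PID truncation) is the task that is faulting now.
 */
static bool fault_is_private(int last_pid_bits, int current_pid)
{
	return last_pid_bits == (current_pid & PID_MASK);
}

int main(void)
{
	printf("%d\n", fault_is_private(0x2a, 0x12a));	/* 1: same low bits  */
	printf("%d\n", fault_is_private(0x2a, 0x131));	/* 0: different task */
	return 0;
}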
870static void reset_ptenuma_scan(struct task_struct *p) 1642static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work)
884 struct mm_struct *mm = p->mm; 1656 struct mm_struct *mm = p->mm;
885 struct vm_area_struct *vma; 1657 struct vm_area_struct *vma;
886 unsigned long start, end; 1658 unsigned long start, end;
1659 unsigned long nr_pte_updates = 0;
887 long pages; 1660 long pages;
888 1661
889 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 1662 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work)
900 if (p->flags & PF_EXITING) 1673 if (p->flags & PF_EXITING)
901 return; 1674 return;
902 1675
903 /* 1676 if (!mm->numa_next_scan) {
904 * We do not care about task placement until a task runs on a node 1677 mm->numa_next_scan = now +
905 * other than the first one used by the address space. This is 1678 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
906 * largely because migrations are driven by what CPU the task
907 * is running on. If it's never scheduled on another node, it'll
908 * not migrate so why bother trapping the fault.
909 */
910 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 mm->first_nid = numa_node_id();
912 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 /* Are we running on a new node yet? */
914 if (numa_node_id() == mm->first_nid &&
915 !sched_feat_numa(NUMA_FORCE))
916 return;
917
918 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 }
920
921 /*
922 * Reset the scan period if enough time has gone by. Objective is that
923 * scanning will be reduced if pages are properly placed. As tasks
924 * can enter different phases this needs to be re-examined. Lacking
925 * proper tracking of reference behaviour, this blunt hammer is used.
926 */
927 migrate = mm->numa_next_reset;
928 if (time_after(now, migrate)) {
929 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 xchg(&mm->numa_next_reset, next_scan);
932 } 1679 }
933 1680
934 /* 1681 /*
@@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work)
938 if (time_before(now, migrate)) 1685 if (time_before(now, migrate))
939 return; 1686 return;
940 1687
941 if (p->numa_scan_period == 0) 1688 if (p->numa_scan_period == 0) {
942 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1689 p->numa_scan_period_max = task_scan_max(p);
1690 p->numa_scan_period = task_scan_min(p);
1691 }
943 1692
944 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 1693 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 1694 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 return; 1695 return;
947 1696
948 /* 1697 /*
949 * Do not set pte_numa if the current running node is rate-limited. 1698 * Delay this task enough that another task of this mm will likely win
950 * This loses statistics on the fault but if we are unwilling to 1699 * the next time around.
951 * migrate to this node, it is less likely we can do useful work
952 */ 1700 */
953 if (migrate_ratelimited(numa_node_id())) 1701 p->node_stamp += 2 * TICK_NSEC;
954 return;
955 1702
956 start = mm->numa_scan_offset; 1703 start = mm->numa_scan_offset;
957 pages = sysctl_numa_balancing_scan_size; 1704 pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work)
967 vma = mm->mmap; 1714 vma = mm->mmap;
968 } 1715 }
969 for (; vma; vma = vma->vm_next) { 1716 for (; vma; vma = vma->vm_next) {
970 if (!vma_migratable(vma)) 1717 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
971 continue; 1718 continue;
972 1719
973 /* Skip small VMAs. They are not likely to be of relevance */ 1720 /*
974 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 1721 * Shared library pages mapped by multiple processes are not
1722 * migrated as it is expected they are cache replicated. Avoid
1723 * hinting faults in read-only file-backed mappings or the vdso
1724 * as migrating the pages will be of marginal benefit.
1725 */
1726 if (!vma->vm_mm ||
1727 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
975 continue; 1728 continue;
976 1729
977 do { 1730 do {
978 start = max(start, vma->vm_start); 1731 start = max(start, vma->vm_start);
979 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1732 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 end = min(end, vma->vm_end); 1733 end = min(end, vma->vm_end);
981 pages -= change_prot_numa(vma, start, end); 1734 nr_pte_updates += change_prot_numa(vma, start, end);
1735
1736 /*
1737 * Scan sysctl_numa_balancing_scan_size but ensure that
1738 * at least one PTE is updated so that unused virtual
1739 * address space is quickly skipped.
1740 */
1741 if (nr_pte_updates)
1742 pages -= (end - start) >> PAGE_SHIFT;
982 1743
983 start = end; 1744 start = end;
984 if (pages <= 0) 1745 if (pages <= 0)
@@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work)
988 1749
989out: 1750out:
990 /* 1751 /*
991 * It is possible to reach the end of the VMA list but the last few VMAs are 1752 * It is possible to reach the end of the VMA list but the last few
992 * not guaranteed to the vma_migratable. If they are not, we would find the 1753 * VMAs are not guaranteed to be vma_migratable. If they are not, we
993 * !migratable VMA on the next scan but not reset the scanner to the start 1754 * would find the !migratable VMA on the next scan but not reset the
994 * so check it now. 1755 * scanner to the start so check it now.
995 */ 1756 */
996 if (vma) 1757 if (vma)
997 mm->numa_scan_offset = start; 1758 mm->numa_scan_offset = start;
@@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1025 1786
1026 if (now - curr->node_stamp > period) { 1787 if (now - curr->node_stamp > period) {
1027 if (!curr->node_stamp) 1788 if (!curr->node_stamp)
1028 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1789 curr->numa_scan_period = task_scan_min(curr);
1029 curr->node_stamp = now; 1790 curr->node_stamp += period;
1030 1791
1031 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 1792 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 1793 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1038static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1799static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039{ 1800{
1040} 1801}
1802
1803static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1804{
1805}
1806
1807static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1808{
1809}
1041#endif /* CONFIG_NUMA_BALANCING */ 1810#endif /* CONFIG_NUMA_BALANCING */
1042 1811
1043static void 1812static void
@@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1047 if (!parent_entity(se)) 1816 if (!parent_entity(se))
1048 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1817 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049#ifdef CONFIG_SMP 1818#ifdef CONFIG_SMP
1050 if (entity_is_task(se)) 1819 if (entity_is_task(se)) {
1051 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1820 struct rq *rq = rq_of(cfs_rq);
1821
1822 account_numa_enqueue(rq, task_of(se));
1823 list_add(&se->group_node, &rq->cfs_tasks);
1824 }
1052#endif 1825#endif
1053 cfs_rq->nr_running++; 1826 cfs_rq->nr_running++;
1054} 1827}
@@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1059 update_load_sub(&cfs_rq->load, se->load.weight); 1832 update_load_sub(&cfs_rq->load, se->load.weight);
1060 if (!parent_entity(se)) 1833 if (!parent_entity(se))
1061 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1834 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 if (entity_is_task(se)) 1835 if (entity_is_task(se)) {
1836 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1063 list_del_init(&se->group_node); 1837 list_del_init(&se->group_node);
1838 }
1064 cfs_rq->nr_running--; 1839 cfs_rq->nr_running--;
1065} 1840}
1066 1841
@@ -2070,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void)
2070 return static_key_false(&__cfs_bandwidth_used); 2845 return static_key_false(&__cfs_bandwidth_used);
2071} 2846}
2072 2847
2073void account_cfs_bandwidth_used(int enabled, int was_enabled) 2848void cfs_bandwidth_usage_inc(void)
2849{
2850 static_key_slow_inc(&__cfs_bandwidth_used);
2851}
2852
2853void cfs_bandwidth_usage_dec(void)
2074{ 2854{
2075 /* only need to count groups transitioning between enabled/!enabled */ 2855 static_key_slow_dec(&__cfs_bandwidth_used);
2076 if (enabled && !was_enabled)
2077 static_key_slow_inc(&__cfs_bandwidth_used);
2078 else if (!enabled && was_enabled)
2079 static_key_slow_dec(&__cfs_bandwidth_used);
2080} 2856}
2081#else /* HAVE_JUMP_LABEL */ 2857#else /* HAVE_JUMP_LABEL */
2082static bool cfs_bandwidth_used(void) 2858static bool cfs_bandwidth_used(void)
@@ -2084,7 +2860,8 @@ static bool cfs_bandwidth_used(void)
2084 return true; 2860 return true;
2085} 2861}
2086 2862
2087void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 2863void cfs_bandwidth_usage_inc(void) {}
2864void cfs_bandwidth_usage_dec(void) {}
2088#endif /* HAVE_JUMP_LABEL */ 2865#endif /* HAVE_JUMP_LABEL */
2089 2866
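The new inc/dec pair pushes the transition counting to the callers; a userspace stand-in using a plain counter (the real code uses jump-label static keys, which this sketch does not model) shows the intended reference-count semantics:

#include <stdbool.h>
#include <stdio.h>

static int cfs_bandwidth_users;		/* stand-in for the static key */

static void usage_inc(void) { cfs_bandwidth_users++; }
static void usage_dec(void) { cfs_bandwidth_users--; }
static bool bandwidth_used(void) { return cfs_bandwidth_users > 0; }

int main(void)
{
	usage_inc();				/* first group enables bandwidth */
	usage_inc();				/* second group enables it too   */
	usage_dec();				/* one group drops out           */
	printf("used=%d\n", bandwidth_used());	/* still in use: prints 1 */
	return 0;
}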
2090/* 2867/*
@@ -2335,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2335 cfs_rq->throttled_clock = rq_clock(rq); 3112 cfs_rq->throttled_clock = rq_clock(rq);
2336 raw_spin_lock(&cfs_b->lock); 3113 raw_spin_lock(&cfs_b->lock);
2337 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3114 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3115 if (!cfs_b->timer_active)
3116 __start_cfs_bandwidth(cfs_b);
2338 raw_spin_unlock(&cfs_b->lock); 3117 raw_spin_unlock(&cfs_b->lock);
2339} 3118}
2340 3119
@@ -2448,6 +3227,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2448 if (idle) 3227 if (idle)
2449 goto out_unlock; 3228 goto out_unlock;
2450 3229
3230 /*
3231 * if we have relooped after returning idle once, we need to update our
3232 * status as actually running, so that other cpus doing
3233 * __start_cfs_bandwidth will stop trying to cancel us.
3234 */
3235 cfs_b->timer_active = 1;
3236
2451 __refill_cfs_bandwidth_runtime(cfs_b); 3237 __refill_cfs_bandwidth_runtime(cfs_b);
2452 3238
2453 if (!throttled) { 3239 if (!throttled) {
@@ -2508,7 +3294,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2508/* how long we wait to gather additional slack before distributing */ 3294/* how long we wait to gather additional slack before distributing */
2509static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 3295static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2510 3296
2511/* are we near the end of the current quota period? */ 3297/*
3298 * Are we near the end of the current quota period?
3299 *
3300 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3301 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3302 * migrate_hrtimers, base is never cleared, so we are fine.
3303 */
2512static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 3304static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2513{ 3305{
2514 struct hrtimer *refresh_timer = &cfs_b->period_timer; 3306 struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2584,10 +3376,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2584 u64 expires; 3376 u64 expires;
2585 3377
2586 /* confirm we're still not at a refresh boundary */ 3378 /* confirm we're still not at a refresh boundary */
2587 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) 3379 raw_spin_lock(&cfs_b->lock);
3380 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3381 raw_spin_unlock(&cfs_b->lock);
2588 return; 3382 return;
3383 }
2589 3384
2590 raw_spin_lock(&cfs_b->lock);
2591 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3385 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2592 runtime = cfs_b->runtime; 3386 runtime = cfs_b->runtime;
2593 cfs_b->runtime = 0; 3387 cfs_b->runtime = 0;
@@ -2708,11 +3502,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2708 * (timer_active==0 becomes visible before the hrtimer call-back 3502 * (timer_active==0 becomes visible before the hrtimer call-back
2709 * terminates). In either case we ensure that it's re-programmed 3503 * terminates). In either case we ensure that it's re-programmed
2710 */ 3504 */
2711 while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 3505 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3506 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3507 /* bounce the lock to allow do_sched_cfs_period_timer to run */
2712 raw_spin_unlock(&cfs_b->lock); 3508 raw_spin_unlock(&cfs_b->lock);
2713 /* ensure cfs_b->lock is available while we wait */ 3509 cpu_relax();
2714 hrtimer_cancel(&cfs_b->period_timer);
2715
2716 raw_spin_lock(&cfs_b->lock); 3510 raw_spin_lock(&cfs_b->lock);
2717 /* if someone else restarted the timer then we're done */ 3511 /* if someone else restarted the timer then we're done */
2718 if (cfs_b->timer_active) 3512 if (cfs_b->timer_active)
@@ -3113,7 +3907,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3113{ 3907{
3114 struct sched_entity *se = tg->se[cpu]; 3908 struct sched_entity *se = tg->se[cpu];
3115 3909
3116 if (!tg->parent) /* the trivial, non-cgroup case */ 3910 if (!tg->parent || !wl) /* the trivial, non-cgroup case */
3117 return wl; 3911 return wl;
3118 3912
3119 for_each_sched_entity(se) { 3913 for_each_sched_entity(se) {
@@ -3166,8 +3960,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3166} 3960}
3167#else 3961#else
3168 3962
3169static inline unsigned long effective_load(struct task_group *tg, int cpu, 3963static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3170 unsigned long wl, unsigned long wg)
3171{ 3964{
3172 return wl; 3965 return wl;
3173} 3966}
@@ -3420,11 +4213,10 @@ done:
3420 * preempt must be disabled. 4213 * preempt must be disabled.
3421 */ 4214 */
3422static int 4215static int
3423select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 4216select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
3424{ 4217{
3425 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4218 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3426 int cpu = smp_processor_id(); 4219 int cpu = smp_processor_id();
3427 int prev_cpu = task_cpu(p);
3428 int new_cpu = cpu; 4220 int new_cpu = cpu;
3429 int want_affine = 0; 4221 int want_affine = 0;
3430 int sync = wake_flags & WF_SYNC; 4222 int sync = wake_flags & WF_SYNC;
@@ -3904,9 +4696,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3904 4696
3905static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4697static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3906 4698
4699enum fbq_type { regular, remote, all };
4700
3907#define LBF_ALL_PINNED 0x01 4701#define LBF_ALL_PINNED 0x01
3908#define LBF_NEED_BREAK 0x02 4702#define LBF_NEED_BREAK 0x02
3909#define LBF_SOME_PINNED 0x04 4703#define LBF_DST_PINNED 0x04
4704#define LBF_SOME_PINNED 0x08
3910 4705
3911struct lb_env { 4706struct lb_env {
3912 struct sched_domain *sd; 4707 struct sched_domain *sd;
@@ -3929,6 +4724,8 @@ struct lb_env {
3929 unsigned int loop; 4724 unsigned int loop;
3930 unsigned int loop_break; 4725 unsigned int loop_break;
3931 unsigned int loop_max; 4726 unsigned int loop_max;
4727
4728 enum fbq_type fbq_type;
3932}; 4729};
3933 4730
3934/* 4731/*
@@ -3975,6 +4772,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3975 return delta < (s64)sysctl_sched_migration_cost; 4772 return delta < (s64)sysctl_sched_migration_cost;
3976} 4773}
3977 4774
4775#ifdef CONFIG_NUMA_BALANCING
4776/* Returns true if the destination node has incurred more faults */
4777static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4778{
4779 int src_nid, dst_nid;
4780
4781 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4782 !(env->sd->flags & SD_NUMA)) {
4783 return false;
4784 }
4785
4786 src_nid = cpu_to_node(env->src_cpu);
4787 dst_nid = cpu_to_node(env->dst_cpu);
4788
4789 if (src_nid == dst_nid)
4790 return false;
4791
4792 /* Always encourage migration to the preferred node. */
4793 if (dst_nid == p->numa_preferred_nid)
4794 return true;
4795
4796 /* If both task and group weight improve, this move is a winner. */
4797 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4798 group_weight(p, dst_nid) > group_weight(p, src_nid))
4799 return true;
4800
4801 return false;
4802}
4803
4804
4805static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4806{
4807 int src_nid, dst_nid;
4808
4809 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4810 return false;
4811
4812 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4813 return false;
4814
4815 src_nid = cpu_to_node(env->src_cpu);
4816 dst_nid = cpu_to_node(env->dst_cpu);
4817
4818 if (src_nid == dst_nid)
4819 return false;
4820
4821 /* Migrating away from the preferred node is always bad. */
4822 if (src_nid == p->numa_preferred_nid)
4823 return true;
4824
4825 /* If either task or group weight get worse, don't do it. */
4826 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4827 group_weight(p, dst_nid) < group_weight(p, src_nid))
4828 return true;
4829
4830 return false;
4831}
4832
4833#else
4834static inline bool migrate_improves_locality(struct task_struct *p,
4835 struct lb_env *env)
4836{
4837 return false;
4838}
4839
4840static inline bool migrate_degrades_locality(struct task_struct *p,
4841 struct lb_env *env)
4842{
4843 return false;
4844}
4845#endif
4846
3978/* 4847/*
3979 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 4848 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3980 */ 4849 */
@@ -3997,6 +4866,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3997 4866
3998 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 4867 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3999 4868
4869 env->flags |= LBF_SOME_PINNED;
4870
4000 /* 4871 /*
4001 * Remember if this task can be migrated to any other cpu in 4872 * Remember if this task can be migrated to any other cpu in
4002 * our sched_group. We may want to revisit it if we couldn't 4873 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4876,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4005 * Also avoid computing new_dst_cpu if we have already computed 4876 * Also avoid computing new_dst_cpu if we have already computed
4006 * one in current iteration. 4877 * one in current iteration.
4007 */ 4878 */
4008 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 4879 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
4009 return 0; 4880 return 0;
4010 4881
4011 /* Prevent to re-select dst_cpu via env's cpus */ 4882 /* Prevent to re-select dst_cpu via env's cpus */
4012 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 4883 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4013 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 4884 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4014 env->flags |= LBF_SOME_PINNED; 4885 env->flags |= LBF_DST_PINNED;
4015 env->new_dst_cpu = cpu; 4886 env->new_dst_cpu = cpu;
4016 break; 4887 break;
4017 } 4888 }
@@ -4030,11 +4901,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4030 4901
4031 /* 4902 /*
4032 * Aggressive migration if: 4903 * Aggressive migration if:
4033 * 1) task is cache cold, or 4904 * 1) destination numa is preferred
4034 * 2) too many balance attempts have failed. 4905 * 2) task is cache cold, or
4906 * 3) too many balance attempts have failed.
4035 */ 4907 */
4036
4037 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 4908 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4909 if (!tsk_cache_hot)
4910 tsk_cache_hot = migrate_degrades_locality(p, env);
4911
4912 if (migrate_improves_locality(p, env)) {
4913#ifdef CONFIG_SCHEDSTATS
4914 if (tsk_cache_hot) {
4915 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4916 schedstat_inc(p, se.statistics.nr_forced_migrations);
4917 }
4918#endif
4919 return 1;
4920 }
4921
4038 if (!tsk_cache_hot || 4922 if (!tsk_cache_hot ||
4039 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 4923 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4040 4924
@@ -4077,8 +4961,6 @@ static int move_one_task(struct lb_env *env)
4077 return 0; 4961 return 0;
4078} 4962}
4079 4963
4080static unsigned long task_h_load(struct task_struct *p);
4081
4082static const unsigned int sched_nr_migrate_break = 32; 4964static const unsigned int sched_nr_migrate_break = 32;
4083 4965
4084/* 4966/*
@@ -4291,6 +5173,10 @@ struct sg_lb_stats {
4291 unsigned int group_weight; 5173 unsigned int group_weight;
4292 int group_imb; /* Is there an imbalance in the group ? */ 5174 int group_imb; /* Is there an imbalance in the group ? */
4293 int group_has_capacity; /* Is there extra capacity in the group? */ 5175 int group_has_capacity; /* Is there extra capacity in the group? */
5176#ifdef CONFIG_NUMA_BALANCING
5177 unsigned int nr_numa_running;
5178 unsigned int nr_preferred_running;
5179#endif
4294}; 5180};
4295 5181
4296/* 5182/*
@@ -4330,7 +5216,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4330/** 5216/**
4331 * get_sd_load_idx - Obtain the load index for a given sched domain. 5217 * get_sd_load_idx - Obtain the load index for a given sched domain.
4332 * @sd: The sched_domain whose load_idx is to be obtained. 5218 * @sd: The sched_domain whose load_idx is to be obtained.
4333 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 5219 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
4334 * 5220 *
4335 * Return: The load index. 5221 * Return: The load index.
4336 */ 5222 */
@@ -4447,7 +5333,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4447{ 5333{
4448 struct sched_domain *child = sd->child; 5334 struct sched_domain *child = sd->child;
4449 struct sched_group *group, *sdg = sd->groups; 5335 struct sched_group *group, *sdg = sd->groups;
4450 unsigned long power; 5336 unsigned long power, power_orig;
4451 unsigned long interval; 5337 unsigned long interval;
4452 5338
4453 interval = msecs_to_jiffies(sd->balance_interval); 5339 interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +5345,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4459 return; 5345 return;
4460 } 5346 }
4461 5347
4462 power = 0; 5348 power_orig = power = 0;
4463 5349
4464 if (child->flags & SD_OVERLAP) { 5350 if (child->flags & SD_OVERLAP) {
4465 /* 5351 /*
@@ -4467,8 +5353,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
4467 * span the current group. 5353 * span the current group.
4468 */ 5354 */
4469 5355
4470 for_each_cpu(cpu, sched_group_cpus(sdg)) 5356 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4471 power += power_of(cpu); 5357 struct sched_group *sg = cpu_rq(cpu)->sd->groups;
5358
5359 power_orig += sg->sgp->power_orig;
5360 power += sg->sgp->power;
5361 }
4472 } else { 5362 } else {
4473 /* 5363 /*
4474 * !SD_OVERLAP domains can assume that child groups 5364 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +5367,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
4477 5367
4478 group = child->groups; 5368 group = child->groups;
4479 do { 5369 do {
5370 power_orig += group->sgp->power_orig;
4480 power += group->sgp->power; 5371 power += group->sgp->power;
4481 group = group->next; 5372 group = group->next;
4482 } while (group != child->groups); 5373 } while (group != child->groups);
4483 } 5374 }
4484 5375
4485 sdg->sgp->power_orig = sdg->sgp->power = power; 5376 sdg->sgp->power_orig = power_orig;
5377 sdg->sgp->power = power;
4486} 5378}
4487 5379
4488/* 5380/*
@@ -4526,13 +5418,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4526 * cpu 3 and leave one of the cpus in the second group unused. 5418 * cpu 3 and leave one of the cpus in the second group unused.
4527 * 5419 *
4528 * The current solution to this issue is detecting the skew in the first group 5420 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are 5421 * by noticing the lower domain failed to reach balance and had difficulty
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see 5422 * moving tasks due to affinity constraints.
4531 * sg_imbalanced().
4532 * 5423 *
4533 * When this is so detected; this group becomes a candidate for busiest; see 5424 * When this is so detected; this group becomes a candidate for busiest; see
4534 * update_sd_pick_busiest(). And calculcate_imbalance() and 5425 * update_sd_pick_busiest(). And calculate_imbalance() and
4535 * find_busiest_group() avoid some of the usual balance conditional to allow it 5426 * find_busiest_group() avoid some of the usual balance conditions to allow it
4536 * to create an effective group imbalance. 5427 * to create an effective group imbalance.
4537 * 5428 *
4538 * This is a somewhat tricky proposition since the next run might not find the 5429 * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +5431,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4540 * subtle and fragile situation. 5431 * subtle and fragile situation.
4541 */ 5432 */
4542 5433
4543struct sg_imb_stats { 5434static inline int sg_imbalanced(struct sched_group *group)
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{ 5435{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL; 5436 return group->sgp->imbalance;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552} 5437}
4553 5438
4554static inline void 5439/*
4555update_sg_imb_stats(struct sg_imb_stats *sgi, 5440 * Compute the group capacity.
4556 unsigned long load, unsigned long nr_running) 5441 *
5442 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5443 * first dividing out the smt factor and computing the actual number of cores
5444 * and limit power unit capacity with that.
5445 */
5446static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
4557{ 5447{
4558 if (load > sgi->max_cpu_load) 5448 unsigned int capacity, smt, cpus;
4559 sgi->max_cpu_load = load; 5449 unsigned int power, power_orig;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562 5450
4563 if (nr_running > sgi->max_nr_running) 5451 power = group->sgp->power;
4564 sgi->max_nr_running = nr_running; 5452 power_orig = group->sgp->power_orig;
4565 if (sgi->min_nr_running > nr_running) 5453 cpus = group->group_weight;
4566 sgi->min_nr_running = nr_running;
4567}
4568 5454
4569static inline int 5455 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) 5456 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
4571{ 5457 capacity = cpus / smt; /* cores */
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584 5458
4585 return 0; 5459 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5460 if (!capacity)
5461 capacity = fix_small_capacity(env->sd, group);
5462
5463 return capacity;
4586} 5464}
4587 5465
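A standalone sketch of the sg_capacity() arithmetic above, using made-up power figures for a group of 7 cores with 2 SMT threads each (SCHED_POWER_SCALE is 1024); it also prints what the old round-to-closest computation yields for the same group, which is where the 'phantom' core came from:

#include <stdio.h>

#define SCHED_POWER_SCALE	1024U
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

int main(void)
{
	/* 14 SMT siblings, ~589 power each: invented but plausible figures. */
	unsigned int cpus = 14;
	unsigned int power_orig = 14 * 589;	/* 8246 */
	unsigned int power = power_orig;	/* assume no RT pressure */
	unsigned int smt, capacity;

	/* smt := ceil(cpus * SCALE / power_orig), assumes 1 < smt_power < 2 */
	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
	capacity = cpus / smt;			/* whole cores: 7 */

	if (capacity > DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE))
		capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);

	printf("old=%u new=%u\n",
	       DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE), capacity);
	return 0;
}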
4588/** 5466/**
@@ -4597,12 +5475,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4597 struct sched_group *group, int load_idx, 5475 struct sched_group *group, int load_idx,
4598 int local_group, struct sg_lb_stats *sgs) 5476 int local_group, struct sg_lb_stats *sgs)
4599{ 5477{
4600 struct sg_imb_stats sgi;
4601 unsigned long nr_running; 5478 unsigned long nr_running;
4602 unsigned long load; 5479 unsigned long load;
4603 int i; 5480 int i;
4604 5481
4605 init_sg_imb_stats(&sgi); 5482 memset(sgs, 0, sizeof(*sgs));
4606 5483
4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5484 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4608 struct rq *rq = cpu_rq(i); 5485 struct rq *rq = cpu_rq(i);
@@ -4610,24 +5487,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4610 nr_running = rq->nr_running; 5487 nr_running = rq->nr_running;
4611 5488
4612 /* Bias balancing toward cpus of our domain */ 5489 /* Bias balancing toward cpus of our domain */
4613 if (local_group) { 5490 if (local_group)
4614 load = target_load(i, load_idx); 5491 load = target_load(i, load_idx);
4615 } else { 5492 else
4616 load = source_load(i, load_idx); 5493 load = source_load(i, load_idx);
4617 update_sg_imb_stats(&sgi, load, nr_running);
4618 }
4619 5494
4620 sgs->group_load += load; 5495 sgs->group_load += load;
4621 sgs->sum_nr_running += nr_running; 5496 sgs->sum_nr_running += nr_running;
5497#ifdef CONFIG_NUMA_BALANCING
5498 sgs->nr_numa_running += rq->nr_numa_running;
5499 sgs->nr_preferred_running += rq->nr_preferred_running;
5500#endif
4622 sgs->sum_weighted_load += weighted_cpuload(i); 5501 sgs->sum_weighted_load += weighted_cpuload(i);
4623 if (idle_cpu(i)) 5502 if (idle_cpu(i))
4624 sgs->idle_cpus++; 5503 sgs->idle_cpus++;
4625 } 5504 }
4626 5505
4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4628 time_after_eq(jiffies, group->sgp->next_update)))
4629 update_group_power(env->sd, env->dst_cpu);
4630
4631 /* Adjust by relative CPU power of the group */ 5506 /* Adjust by relative CPU power of the group */
4632 sgs->group_power = group->sgp->power; 5507 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5508 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +5510,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4635 if (sgs->sum_nr_running) 5510 if (sgs->sum_nr_running)
4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5511 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637 5512
4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4639
4640 sgs->group_capacity =
4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4642
4643 if (!sgs->group_capacity)
4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4646 sgs->group_weight = group->group_weight; 5513 sgs->group_weight = group->group_weight;
4647 5514
5515 sgs->group_imb = sg_imbalanced(group);
5516 sgs->group_capacity = sg_capacity(env, group);
5517
4648 if (sgs->group_capacity > sgs->sum_nr_running) 5518 if (sgs->group_capacity > sgs->sum_nr_running)
4649 sgs->group_has_capacity = 1; 5519 sgs->group_has_capacity = 1;
4650} 5520}
@@ -4693,14 +5563,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4693 return false; 5563 return false;
4694} 5564}
4695 5565
5566#ifdef CONFIG_NUMA_BALANCING
5567static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5568{
5569 if (sgs->sum_nr_running > sgs->nr_numa_running)
5570 return regular;
5571 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5572 return remote;
5573 return all;
5574}
5575
5576static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5577{
5578 if (rq->nr_running > rq->nr_numa_running)
5579 return regular;
5580 if (rq->nr_running > rq->nr_preferred_running)
5581 return remote;
5582 return all;
5583}
5584#else
5585static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5586{
5587 return all;
5588}
5589
5590static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5591{
5592 return regular;
5593}
5594#endif /* CONFIG_NUMA_BALANCING */
5595
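A minimal sketch of the classification used by find_busiest_queue() further down: a runqueue is skipped when its class is stricter than what the busiest group's statistics call for (the counts below are invented):

#include <stdio.h>

enum fbq_type { regular, remote, all };

/* Mirrors fbq_classify_rq(): what kind of tasks does this runqueue hold? */
static enum fbq_type classify_rq(unsigned int nr_running,
				 unsigned int nr_numa_running,
				 unsigned int nr_preferred_running)
{
	if (nr_running > nr_numa_running)
		return regular;			/* has !numa tasks */
	if (nr_running > nr_preferred_running)
		return remote;			/* numa tasks on the wrong node */
	return all;				/* everything is well placed */
}

int main(void)
{
	enum fbq_type wanted = remote;		/* from fbq_classify_group() */

	/* All three tasks already sit on their preferred node: skipped. */
	printf("skip=%d\n", classify_rq(3, 3, 3) > wanted);
	/* One plain !numa task present: this queue is considered. */
	printf("skip=%d\n", classify_rq(3, 2, 2) > wanted);
	return 0;
}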
4696/** 5596/**
4697 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5597 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4698 * @env: The load balancing environment. 5598 * @env: The load balancing environment.
4699 * @balance: Should we balance.
4700 * @sds: variable to hold the statistics for this sched_domain. 5599 * @sds: variable to hold the statistics for this sched_domain.
4701 */ 5600 */
4702static inline void update_sd_lb_stats(struct lb_env *env, 5601static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
4703 struct sd_lb_stats *sds)
4704{ 5602{
4705 struct sched_domain *child = env->sd->child; 5603 struct sched_domain *child = env->sd->child;
4706 struct sched_group *sg = env->sd->groups; 5604 struct sched_group *sg = env->sd->groups;
@@ -4720,11 +5618,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4720 if (local_group) { 5618 if (local_group) {
4721 sds->local = sg; 5619 sds->local = sg;
4722 sgs = &sds->local_stat; 5620 sgs = &sds->local_stat;
5621
5622 if (env->idle != CPU_NEWLY_IDLE ||
5623 time_after_eq(jiffies, sg->sgp->next_update))
5624 update_group_power(env->sd, env->dst_cpu);
4723 } 5625 }
4724 5626
4725 memset(sgs, 0, sizeof(*sgs));
4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 5627 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4727 5628
5629 if (local_group)
5630 goto next_group;
5631
4728 /* 5632 /*
4729 * In case the child domain prefers tasks go to siblings 5633 * In case the child domain prefers tasks go to siblings
4730 * first, lower the sg capacity to one so that we'll try 5634 * first, lower the sg capacity to one so that we'll try
@@ -4735,21 +5639,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4735 * heaviest group when it is already under-utilized (possible 5639 * heaviest group when it is already under-utilized (possible
4736 * with a large weight task outweighs the tasks on the system). 5640 * with a large weight task outweighs the tasks on the system).
4737 */ 5641 */
4738 if (prefer_sibling && !local_group && 5642 if (prefer_sibling && sds->local &&
4739 sds->local && sds->local_stat.group_has_capacity) 5643 sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U); 5644 sgs->group_capacity = min(sgs->group_capacity, 1U);
4741 5645
4742 /* Now, start updating sd_lb_stats */ 5646 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
4743 sds->total_load += sgs->group_load;
4744 sds->total_pwr += sgs->group_power;
4745
4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4747 sds->busiest = sg; 5647 sds->busiest = sg;
4748 sds->busiest_stat = *sgs; 5648 sds->busiest_stat = *sgs;
4749 } 5649 }
4750 5650
5651next_group:
5652 /* Now, start updating sd_lb_stats */
5653 sds->total_load += sgs->group_load;
5654 sds->total_pwr += sgs->group_power;
5655
4751 sg = sg->next; 5656 sg = sg->next;
4752 } while (sg != env->sd->groups); 5657 } while (sg != env->sd->groups);
5658
5659 if (env->sd->flags & SD_NUMA)
5660 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4753} 5661}
4754 5662
4755/** 5663/**
@@ -5053,15 +5961,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5053 int i; 5961 int i;
5054 5962
5055 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5963 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5056 unsigned long power = power_of(i); 5964 unsigned long power, capacity, wl;
5057 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5965 enum fbq_type rt;
5058 SCHED_POWER_SCALE); 5966
5059 unsigned long wl; 5967 rq = cpu_rq(i);
5968 rt = fbq_classify_rq(rq);
5969
5970 /*
5971 * We classify groups/runqueues into three groups:
5972 * - regular: there are !numa tasks
5973 * - remote: there are numa tasks that run on the 'wrong' node
5974 * - all: there is no distinction
5975 *
5976 * In order to avoid migrating ideally placed numa tasks,
 5977 * ignore those when there are better options.
5978 *
5979 * If we ignore the actual busiest queue to migrate another
5980 * task, the next balance pass can still reduce the busiest
5981 * queue by moving tasks around inside the node.
5982 *
5983 * If we cannot move enough load due to this classification
5984 * the next pass will adjust the group classification and
5985 * allow migration of more tasks.
5986 *
5987 * Both cases only affect the total convergence complexity.
5988 */
5989 if (rt > env->fbq_type)
5990 continue;
5060 5991
5992 power = power_of(i);
5993 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5061 if (!capacity) 5994 if (!capacity)
5062 capacity = fix_small_capacity(env->sd, group); 5995 capacity = fix_small_capacity(env->sd, group);
5063 5996
5064 rq = cpu_rq(i);
5065 wl = weighted_cpuload(i); 5997 wl = weighted_cpuload(i);
5066 5998
5067 /* 5999 /*
@@ -5164,6 +6096,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5164 int *continue_balancing) 6096 int *continue_balancing)
5165{ 6097{
5166 int ld_moved, cur_ld_moved, active_balance = 0; 6098 int ld_moved, cur_ld_moved, active_balance = 0;
6099 struct sched_domain *sd_parent = sd->parent;
5167 struct sched_group *group; 6100 struct sched_group *group;
5168 struct rq *busiest; 6101 struct rq *busiest;
5169 unsigned long flags; 6102 unsigned long flags;
@@ -5177,6 +6110,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5177 .idle = idle, 6110 .idle = idle,
5178 .loop_break = sched_nr_migrate_break, 6111 .loop_break = sched_nr_migrate_break,
5179 .cpus = cpus, 6112 .cpus = cpus,
6113 .fbq_type = all,
5180 }; 6114 };
5181 6115
5182 /* 6116 /*
@@ -5268,17 +6202,17 @@ more_balance:
5268 * moreover subsequent load balance cycles should correct the 6202 * moreover subsequent load balance cycles should correct the
5269 * excess load moved. 6203 * excess load moved.
5270 */ 6204 */
5271 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6205 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6206
6207 /* Prevent to re-select dst_cpu via env's cpus */
6208 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5272 6209
5273 env.dst_rq = cpu_rq(env.new_dst_cpu); 6210 env.dst_rq = cpu_rq(env.new_dst_cpu);
5274 env.dst_cpu = env.new_dst_cpu; 6211 env.dst_cpu = env.new_dst_cpu;
5275 env.flags &= ~LBF_SOME_PINNED; 6212 env.flags &= ~LBF_DST_PINNED;
5276 env.loop = 0; 6213 env.loop = 0;
5277 env.loop_break = sched_nr_migrate_break; 6214 env.loop_break = sched_nr_migrate_break;
5278 6215
5279 /* Prevent to re-select dst_cpu via env's cpus */
5280 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5281
5282 /* 6216 /*
5283 * Go back to "more_balance" rather than "redo" since we 6217 * Go back to "more_balance" rather than "redo" since we
5284 * need to continue with same src_cpu. 6218 * need to continue with same src_cpu.
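LBF_DST_PINNED is new in this series: it marks the narrower case where a task cannot run on the chosen dst_cpu but could run on another CPU of the destination group, while LBF_SOME_PINNED keeps the broader meaning that affinity interfered at all. A condensed, paraphrased sketch of the affinity branch in can_migrate_task() that sets both flags (not the literal hunk):

	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
		int cpu;

		env->flags |= LBF_SOME_PINNED;	/* affinity got in the way */

		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
			return 0;

		/* Can the task run elsewhere in the destination group? */
		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
				env->flags |= LBF_DST_PINNED;
				env->new_dst_cpu = cpu;
				break;
			}
		}
		return 0;
	}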
@@ -5286,6 +6220,18 @@ more_balance:
5286 goto more_balance; 6220 goto more_balance;
5287 } 6221 }
5288 6222
6223 /*
6224 * We failed to reach balance because of affinity.
6225 */
6226 if (sd_parent) {
6227 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6228
6229 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6230 *group_imbalance = 1;
6231 } else if (*group_imbalance)
6232 *group_imbalance = 0;
6233 }
6234
5289 /* All tasks on this runqueue were pinned by CPU affinity */ 6235 /* All tasks on this runqueue were pinned by CPU affinity */
5290 if (unlikely(env.flags & LBF_ALL_PINNED)) { 6236 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5291 cpumask_clear_cpu(cpu_of(busiest), cpus); 6237 cpumask_clear_cpu(cpu_of(busiest), cpus);
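The sgp->imbalance hint written here is consumed on the parent domain's next balance pass: a child level that could not meet its goal because of affinity marks its parent-level group as imbalanced, which biases busiest-group selection one level up. A sketch of the consumer side (approximate; the helper sits elsewhere in fair.c):

static inline int sg_imbalanced(struct sched_group *group)
{
	return group->sgp->imbalance;
}

	/* ... later, while computing group statistics ... */
	sgs->group_imb = sg_imbalanced(group);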
@@ -5393,6 +6339,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5393 struct sched_domain *sd; 6339 struct sched_domain *sd;
5394 int pulled_task = 0; 6340 int pulled_task = 0;
5395 unsigned long next_balance = jiffies + HZ; 6341 unsigned long next_balance = jiffies + HZ;
6342 u64 curr_cost = 0;
5396 6343
5397 this_rq->idle_stamp = rq_clock(this_rq); 6344 this_rq->idle_stamp = rq_clock(this_rq);
5398 6345
@@ -5409,15 +6356,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5409 for_each_domain(this_cpu, sd) { 6356 for_each_domain(this_cpu, sd) {
5410 unsigned long interval; 6357 unsigned long interval;
5411 int continue_balancing = 1; 6358 int continue_balancing = 1;
6359 u64 t0, domain_cost;
5412 6360
5413 if (!(sd->flags & SD_LOAD_BALANCE)) 6361 if (!(sd->flags & SD_LOAD_BALANCE))
5414 continue; 6362 continue;
5415 6363
6364 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6365 break;
6366
5416 if (sd->flags & SD_BALANCE_NEWIDLE) { 6367 if (sd->flags & SD_BALANCE_NEWIDLE) {
6368 t0 = sched_clock_cpu(this_cpu);
6369
5417 /* If we've pulled tasks over stop searching: */ 6370 /* If we've pulled tasks over stop searching: */
5418 pulled_task = load_balance(this_cpu, this_rq, 6371 pulled_task = load_balance(this_cpu, this_rq,
5419 sd, CPU_NEWLY_IDLE, 6372 sd, CPU_NEWLY_IDLE,
5420 &continue_balancing); 6373 &continue_balancing);
6374
6375 domain_cost = sched_clock_cpu(this_cpu) - t0;
6376 if (domain_cost > sd->max_newidle_lb_cost)
6377 sd->max_newidle_lb_cost = domain_cost;
6378
6379 curr_cost += domain_cost;
5421 } 6380 }
5422 6381
5423 interval = msecs_to_jiffies(sd->balance_interval); 6382 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5439,6 +6398,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5439 */ 6398 */
5440 this_rq->next_balance = next_balance; 6399 this_rq->next_balance = next_balance;
5441 } 6400 }
6401
6402 if (curr_cost > this_rq->max_idle_balance_cost)
6403 this_rq->max_idle_balance_cost = curr_cost;
5442} 6404}
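The two additions work as a pair: a newly idle CPU gives up on domains whose worst recorded newidle balance would cost more time than it expects to stay idle (avg_idle), and the most expensive full pass is remembered in rq->max_idle_balance_cost. A standalone arithmetic sketch of the cut-off, with made-up numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t avg_idle = 400000;	/* rq->avg_idle: ~400us expected idle */
	/* hypothetical sd->max_newidle_lb_cost per domain level, in ns */
	uint64_t domain_cost[] = { 50000, 150000, 300000 };
	uint64_t curr_cost = 0;
	int i;

	for (i = 0; i < 3; i++) {
		if (avg_idle < curr_cost + domain_cost[i]) {
			printf("stop before level %d: balancing would outlast the idle period\n", i);
			break;
		}
		curr_cost += domain_cost[i];
		printf("balanced level %d, cumulative cost %llu ns\n",
		       i, (unsigned long long)curr_cost);
	}
	return 0;
}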
5443 6405
5444/* 6406/*
@@ -5572,16 +6534,16 @@ static inline void nohz_balance_exit_idle(int cpu)
5572static inline void set_cpu_sd_state_busy(void) 6534static inline void set_cpu_sd_state_busy(void)
5573{ 6535{
5574 struct sched_domain *sd; 6536 struct sched_domain *sd;
6537 int cpu = smp_processor_id();
5575 6538
5576 rcu_read_lock(); 6539 rcu_read_lock();
5577 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6540 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5578 6541
5579 if (!sd || !sd->nohz_idle) 6542 if (!sd || !sd->nohz_idle)
5580 goto unlock; 6543 goto unlock;
5581 sd->nohz_idle = 0; 6544 sd->nohz_idle = 0;
5582 6545
5583 for (; sd; sd = sd->parent) 6546 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5584 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
5585unlock: 6547unlock:
5586 rcu_read_unlock(); 6548 rcu_read_unlock();
5587} 6549}
@@ -5589,16 +6551,16 @@ unlock:
5589void set_cpu_sd_state_idle(void) 6551void set_cpu_sd_state_idle(void)
5590{ 6552{
5591 struct sched_domain *sd; 6553 struct sched_domain *sd;
6554 int cpu = smp_processor_id();
5592 6555
5593 rcu_read_lock(); 6556 rcu_read_lock();
5594 sd = rcu_dereference_check_sched_domain(this_rq()->sd); 6557 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5595 6558
5596 if (!sd || sd->nohz_idle) 6559 if (!sd || sd->nohz_idle)
5597 goto unlock; 6560 goto unlock;
5598 sd->nohz_idle = 1; 6561 sd->nohz_idle = 1;
5599 6562
5600 for (; sd; sd = sd->parent) 6563 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5601 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
5602unlock: 6564unlock:
5603 rcu_read_unlock(); 6565 rcu_read_unlock();
5604} 6566}
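sd_busy and sd_asym are new per-CPU sched_domain pointers (declared in the sched.h hunk below); caching them lets these paths and nohz_kick_needed() stop walking every domain level. The pointers have to be filled in when the domain topology is built; a sketch of what that setup plausibly looks like on the core.c side (assumed shape, not shown in this section):

	struct sched_domain *sd, *busy_sd = NULL;

	sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
	if (sd)
		busy_sd = sd->parent;	/* sd_busy: one level above the LLC */
	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);

	sd = lowest_flag_domain(cpu, SD_ASYM_PACKING);
	rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);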
@@ -5662,15 +6624,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5662 /* Earliest time when we have to do rebalance again */ 6624 /* Earliest time when we have to do rebalance again */
5663 unsigned long next_balance = jiffies + 60*HZ; 6625 unsigned long next_balance = jiffies + 60*HZ;
5664 int update_next_balance = 0; 6626 int update_next_balance = 0;
5665 int need_serialize; 6627 int need_serialize, need_decay = 0;
6628 u64 max_cost = 0;
5666 6629
5667 update_blocked_averages(cpu); 6630 update_blocked_averages(cpu);
5668 6631
5669 rcu_read_lock(); 6632 rcu_read_lock();
5670 for_each_domain(cpu, sd) { 6633 for_each_domain(cpu, sd) {
6634 /*
6635 * Decay the newidle max times here because this is a regular
6636 * visit to all the domains. Decay ~1% per second.
6637 */
6638 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6639 sd->max_newidle_lb_cost =
6640 (sd->max_newidle_lb_cost * 253) / 256;
6641 sd->next_decay_max_lb_cost = jiffies + HZ;
6642 need_decay = 1;
6643 }
6644 max_cost += sd->max_newidle_lb_cost;
6645
5671 if (!(sd->flags & SD_LOAD_BALANCE)) 6646 if (!(sd->flags & SD_LOAD_BALANCE))
5672 continue; 6647 continue;
5673 6648
6649 /*
6650 * Stop the load balance at this level. There is another
6651 * CPU in our sched group which is doing load balancing more
6652 * actively.
6653 */
6654 if (!continue_balancing) {
6655 if (need_decay)
6656 continue;
6657 break;
6658 }
6659
5674 interval = sd->balance_interval; 6660 interval = sd->balance_interval;
5675 if (idle != CPU_IDLE) 6661 if (idle != CPU_IDLE)
5676 interval *= sd->busy_factor; 6662 interval *= sd->busy_factor;
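The decay factor 253/256 applied once per second removes a little over 1% per step, so a stale max_newidle_lb_cost roughly halves every minute instead of suppressing newidle balancing forever. A standalone check of the rate (plain C, illustrative only):

#include <stdio.h>

int main(void)
{
	unsigned long long cost = 1000000;	/* 1 ms recorded worst case */
	int sec;

	for (sec = 1; sec <= 120; sec++) {
		cost = cost * 253 / 256;	/* same integer math as the patch */
		if (sec % 30 == 0)
			printf("after %3d s: %llu ns\n", sec, cost);
	}
	return 0;
}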
@@ -5689,7 +6675,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5689 if (time_after_eq(jiffies, sd->last_balance + interval)) { 6675 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5690 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { 6676 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5691 /* 6677 /*
5692 * The LBF_SOME_PINNED logic could have changed 6678 * The LBF_DST_PINNED logic could have changed
5693 * env->dst_cpu, so we can't know our idle 6679 * env->dst_cpu, so we can't know our idle
5694 * state even if we migrated tasks. Update it. 6680 * state even if we migrated tasks. Update it.
5695 */ 6681 */
@@ -5704,14 +6690,14 @@ out:
5704 next_balance = sd->last_balance + interval; 6690 next_balance = sd->last_balance + interval;
5705 update_next_balance = 1; 6691 update_next_balance = 1;
5706 } 6692 }
5707 6693 }
6694 if (need_decay) {
5708 /* 6695 /*
5709 * Stop the load balance at this level. There is another 6696 * Ensure the rq-wide value also decays but keep it at a
5710 * CPU in our sched group which is doing load balancing more 6697 * reasonable floor to avoid funnies with rq->avg_idle.
5711 * actively.
5712 */ 6698 */
5713 if (!continue_balancing) 6699 rq->max_idle_balance_cost =
5714 break; 6700 max((u64)sysctl_sched_migration_cost, max_cost);
5715 } 6701 }
5716 rcu_read_unlock(); 6702 rcu_read_unlock();
5717 6703
@@ -5781,6 +6767,8 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5781{ 6767{
5782 unsigned long now = jiffies; 6768 unsigned long now = jiffies;
5783 struct sched_domain *sd; 6769 struct sched_domain *sd;
6770 struct sched_group_power *sgp;
6771 int nr_busy;
5784 6772
5785 if (unlikely(idle_cpu(cpu))) 6773 if (unlikely(idle_cpu(cpu)))
5786 return 0; 6774 return 0;
@@ -5806,22 +6794,22 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
5806 goto need_kick; 6794 goto need_kick;
5807 6795
5808 rcu_read_lock(); 6796 rcu_read_lock();
5809 for_each_domain(cpu, sd) { 6797 sd = rcu_dereference(per_cpu(sd_busy, cpu));
5810 struct sched_group *sg = sd->groups;
5811 struct sched_group_power *sgp = sg->sgp;
5812 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
5813 6798
5814 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1) 6799 if (sd) {
5815 goto need_kick_unlock; 6800 sgp = sd->groups->sgp;
6801 nr_busy = atomic_read(&sgp->nr_busy_cpus);
5816 6802
5817 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight 6803 if (nr_busy > 1)
5818 && (cpumask_first_and(nohz.idle_cpus_mask,
5819 sched_domain_span(sd)) < cpu))
5820 goto need_kick_unlock; 6804 goto need_kick_unlock;
5821
5822 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5823 break;
5824 } 6805 }
6806
6807 sd = rcu_dereference(per_cpu(sd_asym, cpu));
6808
6809 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
6810 sched_domain_span(sd)) < cpu))
6811 goto need_kick_unlock;
6812
5825 rcu_read_unlock(); 6813 rcu_read_unlock();
5826 return 0; 6814 return 0;
5827 6815
@@ -6214,7 +7202,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6214 se->cfs_rq = parent->my_q; 7202 se->cfs_rq = parent->my_q;
6215 7203
6216 se->my_q = cfs_rq; 7204 se->my_q = cfs_rq;
6217 update_load_set(&se->load, 0); 7205 /* guarantee group entities always have weight */
7206 update_load_set(&se->load, NICE_0_LOAD);
6218 se->parent = parent; 7207 se->parent = parent;
6219} 7208}
6220 7209
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8e4799..5716929a2e3a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
63/* 63/*
64 * Apply the automatic NUMA scheduling policy. Enabled automatically 64 * Apply the automatic NUMA scheduling policy. Enabled automatically
65 * at runtime if running on a NUMA machine. Can be controlled via 65 * at runtime if running on a NUMA machine. Can be controlled via
66 * numa_balancing=. Allow PTE scanning to be forced on UMA machines 66 * numa_balancing=
67 * for debugging the core machinery.
68 */ 67 */
69#ifdef CONFIG_NUMA_BALANCING 68#ifdef CONFIG_NUMA_BALANCING
70SCHED_FEAT(NUMA, false) 69SCHED_FEAT(NUMA, false)
71SCHED_FEAT(NUMA_FORCE, false) 70
71/*
72 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
73 * higher number of hinting faults are recorded during active load
74 * balancing.
75 */
76SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
77
78/*
79 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
80 * lower number of hinting faults have been recorded. As this has
81 * the potential to prevent a task ever migrating to a new node
82 * due to CPU overload it is disabled by default.
83 */
84SCHED_FEAT(NUMA_RESIST_LOWER, false)
72#endif 85#endif
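Both bits are meant to be tested with the usual sched_feat() accessor by the NUMA-aware migration filters in fair.c. A hypothetical sketch of such a consumer; faults_on_node() is a placeholder for whatever per-node hinting-fault accessor the NUMA code provides, not a real kernel symbol:

static bool migration_improves_locality(struct task_struct *p,
					int src_nid, int dst_nid)
{
	return sched_feat(NUMA_FAVOUR_HIGHER) &&
	       faults_on_node(p, dst_nid) > faults_on_node(p, src_nid);
}

static bool migration_degrades_locality(struct task_struct *p,
					int src_nid, int dst_nid)
{
	return sched_feat(NUMA_RESIST_LOWER) &&
	       faults_on_node(p, dst_nid) < faults_on_node(p, src_nid);
}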
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..516c3d9ceea1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 12select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks as never migrated */ 14 return task_cpu(p); /* IDLE tasks as never migrated */
15} 15}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 01970c8e64df..7d57275fc396 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
246 * if we should look at the mask. It would be a shame 246 * if we should look at the mask. It would be a shame
247 * if we looked at the mask, but the mask was not 247 * if we looked at the mask, but the mask was not
248 * updated yet. 248 * updated yet.
249 *
250 * Matched by the barrier in pull_rt_task().
249 */ 251 */
250 wmb(); 252 smp_wmb();
251 atomic_inc(&rq->rd->rto_count); 253 atomic_inc(&rq->rd->rto_count);
252} 254}
253 255
@@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)
1169static int find_lowest_rq(struct task_struct *task); 1171static int find_lowest_rq(struct task_struct *task);
1170 1172
1171static int 1173static int
1172select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 1174select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1173{ 1175{
1174 struct task_struct *curr; 1176 struct task_struct *curr;
1175 struct rq *rq; 1177 struct rq *rq;
1176 int cpu;
1177
1178 cpu = task_cpu(p);
1179 1178
1180 if (p->nr_cpus_allowed == 1) 1179 if (p->nr_cpus_allowed == 1)
1181 goto out; 1180 goto out;
@@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1213 */ 1212 */
1214 if (curr && unlikely(rt_task(curr)) && 1213 if (curr && unlikely(rt_task(curr)) &&
1215 (curr->nr_cpus_allowed < 2 || 1214 (curr->nr_cpus_allowed < 2 ||
1216 curr->prio <= p->prio) && 1215 curr->prio <= p->prio)) {
1217 (p->nr_cpus_allowed > 1)) {
1218 int target = find_lowest_rq(p); 1216 int target = find_lowest_rq(p);
1219 1217
1220 if (target != -1) 1218 if (target != -1)
@@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)
1630 if (likely(!rt_overloaded(this_rq))) 1628 if (likely(!rt_overloaded(this_rq)))
1631 return 0; 1629 return 0;
1632 1630
1631 /*
1632 * Match the barrier from rt_set_overloaded; this guarantees that if we
1633 * see overloaded we must also see the rto_mask bit.
1634 */
1635 smp_rmb();
1636
1633 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1637 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1634 if (this_cpu == cpu) 1638 if (this_cpu == cpu)
1635 continue; 1639 continue;
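The smp_wmb()/smp_rmb() pair added in this file is the standard publish/consume pattern: the writer sets the rto_mask bit before incrementing rto_count, the reader checks rto_count before scanning rto_mask, and the barriers stop either side from being reordered. Schematically:

/*
 * Writer (rt_set_overload)             Reader (pull_rt_task)
 *
 * cpumask_set_cpu(cpu, rto_mask);      if (!atomic_read(&rto_count))
 * smp_wmb();                                   return 0;
 * atomic_inc(&rto_count);              smp_rmb();
 *                                      for_each_cpu(cpu, rto_mask) ...
 *
 * If the reader observes the incremented count, the paired barriers
 * guarantee it also observes the mask bit that was set before it.
 */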
@@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1931 p->rt.time_slice = sched_rr_timeslice; 1935 p->rt.time_slice = sched_rr_timeslice;
1932 1936
1933 /* 1937 /*
1934 * Requeue to the end of queue if we (and all of our ancestors) are the 1938 * Requeue to the end of queue if we (and all of our ancestors) are not
1935 * only element on the queue 1939 * the only element on the queue
1936 */ 1940 */
1937 for_each_sched_rt_entity(rt_se) { 1941 for_each_sched_rt_entity(rt_se) {
1938 if (rt_se->run_list.prev != rt_se->run_list.next) { 1942 if (rt_se->run_list.prev != rt_se->run_list.next) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3c5653e1dca..88c85b21d633 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h> 8#include <linux/tick.h>
9#include <linux/slab.h>
9 10
10#include "cpupri.h" 11#include "cpupri.h"
11#include "cpuacct.h" 12#include "cpuacct.h"
@@ -408,6 +409,10 @@ struct rq {
408 * remote CPUs use both these fields when doing load calculation. 409 * remote CPUs use both these fields when doing load calculation.
409 */ 410 */
410 unsigned int nr_running; 411 unsigned int nr_running;
412#ifdef CONFIG_NUMA_BALANCING
413 unsigned int nr_numa_running;
414 unsigned int nr_preferred_running;
415#endif
411 #define CPU_LOAD_IDX_MAX 5 416 #define CPU_LOAD_IDX_MAX 5
412 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 417 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
413 unsigned long last_load_update_tick; 418 unsigned long last_load_update_tick;
@@ -476,6 +481,9 @@ struct rq {
476 u64 age_stamp; 481 u64 age_stamp;
477 u64 idle_stamp; 482 u64 idle_stamp;
478 u64 avg_idle; 483 u64 avg_idle;
484
485 /* This is used to determine avg_idle's max value */
486 u64 max_idle_balance_cost;
479#endif 487#endif
480 488
481#ifdef CONFIG_IRQ_TIME_ACCOUNTING 489#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
552 return rq->clock_task; 560 return rq->clock_task;
553} 561}
554 562
563#ifdef CONFIG_NUMA_BALANCING
564extern void sched_setnuma(struct task_struct *p, int node);
565extern int migrate_task_to(struct task_struct *p, int cpu);
566extern int migrate_swap(struct task_struct *, struct task_struct *);
567#endif /* CONFIG_NUMA_BALANCING */
568
555#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
556 570
557#define rcu_dereference_check_sched_domain(p) \ 571#define rcu_dereference_check_sched_domain(p) \
@@ -593,9 +607,24 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
593 return hsd; 607 return hsd;
594} 608}
595 609
610static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
611{
612 struct sched_domain *sd;
613
614 for_each_domain(cpu, sd) {
615 if (sd->flags & flag)
616 break;
617 }
618
619 return sd;
620}
621
596DECLARE_PER_CPU(struct sched_domain *, sd_llc); 622DECLARE_PER_CPU(struct sched_domain *, sd_llc);
597DECLARE_PER_CPU(int, sd_llc_size); 623DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 624DECLARE_PER_CPU(int, sd_llc_id);
625DECLARE_PER_CPU(struct sched_domain *, sd_numa);
626DECLARE_PER_CPU(struct sched_domain *, sd_busy);
627DECLARE_PER_CPU(struct sched_domain *, sd_asym);
599 628
600struct sched_group_power { 629struct sched_group_power {
601 atomic_t ref; 630 atomic_t ref;
@@ -605,6 +634,7 @@ struct sched_group_power {
605 */ 634 */
606 unsigned int power, power_orig; 635 unsigned int power, power_orig;
607 unsigned long next_update; 636 unsigned long next_update;
637 int imbalance; /* XXX unrelated to power but shared group state */
608 /* 638 /*
609 * Number of busy cpus in this group. 639 * Number of busy cpus in this group.
610 */ 640 */
@@ -719,6 +749,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
719 */ 749 */
720 smp_wmb(); 750 smp_wmb();
721 task_thread_info(p)->cpu = cpu; 751 task_thread_info(p)->cpu = cpu;
752 p->wake_cpu = cpu;
722#endif 753#endif
723} 754}
724 755
@@ -974,7 +1005,7 @@ struct sched_class {
974 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1005 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
975 1006
976#ifdef CONFIG_SMP 1007#ifdef CONFIG_SMP
977 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1008 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
978 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1009 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
979 1010
980 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1011 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1220,6 +1251,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1220 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1251 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1221} 1252}
1222 1253
1254static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1255{
1256 if (l1 > l2)
1257 swap(l1, l2);
1258
1259 spin_lock(l1);
1260 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1261}
1262
1263static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1264{
1265 if (l1 > l2)
1266 swap(l1, l2);
1267
1268 raw_spin_lock(l1);
1269 raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1270}
1271
1223/* 1272/*
1224 * double_rq_lock - safely lock two runqueues 1273 * double_rq_lock - safely lock two runqueues
1225 * 1274 *
@@ -1305,7 +1354,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1305extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1354extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1306extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1307 1356
1308extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1357extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void);
1309 1359
1310#ifdef CONFIG_NO_HZ_COMMON 1360#ifdef CONFIG_NO_HZ_COMMON
1311enum rq_nohz_flag_bits { 1361enum rq_nohz_flag_bits {
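double_lock()/double_raw_lock() above always take the two locks in ascending address order, which is what rules out ABBA deadlock when two CPUs lock the same pair concurrently; the _nested variant merely tells lockdep that nesting two locks of one class is intentional here. A minimal usage sketch with hypothetical runqueue locks:

	raw_spinlock_t *l1 = &cpu_rq(cpu1)->lock;
	raw_spinlock_t *l2 = &cpu_rq(cpu2)->lock;

	double_raw_lock(l1, l2);	/* lower address is locked first */
	/* ... operate on both runqueues ... */
	raw_spin_unlock(l2);
	raw_spin_unlock(l1);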
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c7edee71bce8..4ab704339656 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The 59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew. 60 * delta taken on each cpu would annul the skew.
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
63{ 63{
64 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 64 unsigned long long now = rq_clock(rq), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
69 sched_info_reset_dequeued(t); 69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta; 70 t->sched_info.run_delay += delta;
71 71
72 rq_sched_info_dequeued(task_rq(t), delta); 72 rq_sched_info_dequeued(rq, delta);
73} 73}
74 74
75/* 75/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
77 * long it was waiting to run. We also note when it began so that we 77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is. 78 * can keep stats on how long its timeslice is.
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct rq *rq, struct task_struct *t)
81{ 81{
82 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 82 unsigned long long now = rq_clock(rq), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
88 t->sched_info.last_arrival = now; 88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++; 89 t->sched_info.pcount++;
90 90
91 rq_sched_info_arrive(task_rq(t), delta); 91 rq_sched_info_arrive(rq, delta);
92} 92}
93 93
94/* 94/*
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)
96 * the timestamp if it is already not set. It's assumed that 96 * the timestamp if it is already not set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate. 97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */ 98 */
99static inline void sched_info_queued(struct task_struct *t) 99static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = rq_clock(task_rq(t)); 103 t->sched_info.last_queued = rq_clock(rq);
104} 104}
105 105
106/* 106/*
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)
111 * sched_info_queued() to mark that it has now again started waiting on 111 * sched_info_queued() to mark that it has now again started waiting on
112 * the runqueue. 112 * the runqueue.
113 */ 113 */
114static inline void sched_info_depart(struct task_struct *t) 114static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
115{ 115{
116 unsigned long long delta = rq_clock(task_rq(t)) - 116 unsigned long long delta = rq_clock(rq) -
117 t->sched_info.last_arrival; 117 t->sched_info.last_arrival;
118 118
119 rq_sched_info_depart(task_rq(t), delta); 119 rq_sched_info_depart(rq, delta);
120 120
121 if (t->state == TASK_RUNNING) 121 if (t->state == TASK_RUNNING)
122 sched_info_queued(t); 122 sched_info_queued(rq, t);
123} 123}
124 124
125/* 125/*
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
128 * the idle task.) We are only called when prev != next. 128 * the idle task.) We are only called when prev != next.
129 */ 129 */
130static inline void 130static inline void
131__sched_info_switch(struct task_struct *prev, struct task_struct *next) 131__sched_info_switch(struct rq *rq,
132 struct task_struct *prev, struct task_struct *next)
132{ 133{
133 struct rq *rq = task_rq(prev);
134
135 /* 134 /*
136 * prev now departs the cpu. It's not interesting to record 135 * prev now departs the cpu. It's not interesting to record
137 * stats about how efficient we were at scheduling the idle 136 * stats about how efficient we were at scheduling the idle
138 * process, however. 137 * process, however.
139 */ 138 */
140 if (prev != rq->idle) 139 if (prev != rq->idle)
141 sched_info_depart(prev); 140 sched_info_depart(rq, prev);
142 141
143 if (next != rq->idle) 142 if (next != rq->idle)
144 sched_info_arrive(next); 143 sched_info_arrive(rq, next);
145} 144}
146static inline void 145static inline void
147sched_info_switch(struct task_struct *prev, struct task_struct *next) 146sched_info_switch(struct rq *rq,
147 struct task_struct *prev, struct task_struct *next)
148{ 148{
149 if (unlikely(sched_info_on())) 149 if (unlikely(sched_info_on()))
150 __sched_info_switch(prev, next); 150 __sched_info_switch(rq, prev, next);
151} 151}
152#else 152#else
153#define sched_info_queued(t) do { } while (0) 153#define sched_info_queued(rq, t) do { } while (0)
154#define sched_info_reset_dequeued(t) do { } while (0) 154#define sched_info_reset_dequeued(t) do { } while (0)
155#define sched_info_dequeued(t) do { } while (0) 155#define sched_info_dequeued(rq, t) do { } while (0)
156#define sched_info_switch(t, next) do { } while (0) 156#define sched_info_depart(rq, t) do { } while (0)
157#define sched_info_arrive(rq, next) do { } while (0)
158#define sched_info_switch(rq, t, next) do { } while (0)
157#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 159#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
158 160
159/* 161/*
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index e08fbeeb54b9..47197de8abd9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
11 11
12#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
13static int 13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) 14select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
15{ 15{
16 return task_cpu(p); /* stop tasks as never migrate */ 16 return task_cpu(p); /* stop tasks as never migrate */
17} 17}
diff --git a/kernel/wait.c b/kernel/sched/wait.c
index d550920e040c..7d50f794e248 100644
--- a/kernel/wait.c
+++ b/kernel/sched/wait.c
@@ -53,6 +53,109 @@ EXPORT_SYMBOL(remove_wait_queue);
53 53
54 54
55/* 55/*
56 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
57 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
58 * number) then we wake all the non-exclusive tasks and one exclusive task.
59 *
60 * There are circumstances in which we can try to wake a task which has already
61 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
62 * zero in this (rare) case, and we handle it by continuing to scan the queue.
63 */
64static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
65 int nr_exclusive, int wake_flags, void *key)
66{
67 wait_queue_t *curr, *next;
68
69 list_for_each_entry_safe(curr, next, &q->task_list, task_list) {
70 unsigned flags = curr->flags;
71
72 if (curr->func(curr, mode, wake_flags, key) &&
73 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
74 break;
75 }
76}
77
78/**
79 * __wake_up - wake up threads blocked on a waitqueue.
80 * @q: the waitqueue
81 * @mode: which threads
82 * @nr_exclusive: how many wake-one or wake-many threads to wake up
83 * @key: is directly passed to the wakeup function
84 *
85 * It may be assumed that this function implies a write memory barrier before
86 * changing the task state if and only if any tasks are woken up.
87 */
88void __wake_up(wait_queue_head_t *q, unsigned int mode,
89 int nr_exclusive, void *key)
90{
91 unsigned long flags;
92
93 spin_lock_irqsave(&q->lock, flags);
94 __wake_up_common(q, mode, nr_exclusive, 0, key);
95 spin_unlock_irqrestore(&q->lock, flags);
96}
97EXPORT_SYMBOL(__wake_up);
98
99/*
100 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
101 */
102void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
103{
104 __wake_up_common(q, mode, nr, 0, NULL);
105}
106EXPORT_SYMBOL_GPL(__wake_up_locked);
107
108void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
109{
110 __wake_up_common(q, mode, 1, 0, key);
111}
112EXPORT_SYMBOL_GPL(__wake_up_locked_key);
113
114/**
115 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
116 * @q: the waitqueue
117 * @mode: which threads
118 * @nr_exclusive: how many wake-one or wake-many threads to wake up
119 * @key: opaque value to be passed to wakeup targets
120 *
121 * The sync wakeup differs that the waker knows that it will schedule
122 * away soon, so while the target thread will be woken up, it will not
123 * be migrated to another CPU - ie. the two threads are 'synchronized'
124 * with each other. This can prevent needless bouncing between CPUs.
125 *
126 * On UP it can prevent extra preemption.
127 *
128 * It may be assumed that this function implies a write memory barrier before
129 * changing the task state if and only if any tasks are woken up.
130 */
131void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
132 int nr_exclusive, void *key)
133{
134 unsigned long flags;
135 int wake_flags = 1; /* XXX WF_SYNC */
136
137 if (unlikely(!q))
138 return;
139
140 if (unlikely(nr_exclusive != 1))
141 wake_flags = 0;
142
143 spin_lock_irqsave(&q->lock, flags);
144 __wake_up_common(q, mode, nr_exclusive, wake_flags, key);
145 spin_unlock_irqrestore(&q->lock, flags);
146}
147EXPORT_SYMBOL_GPL(__wake_up_sync_key);
148
149/*
150 * __wake_up_sync - see __wake_up_sync_key()
151 */
152void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
153{
154 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
155}
156EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
157
158/*
56 * Note: we use "set_current_state()" _after_ the wait-queue add, 159 * Note: we use "set_current_state()" _after_ the wait-queue add,
57 * because we need a memory barrier there on SMP, so that any 160 * because we need a memory barrier there on SMP, so that any
58 * wake-function that tests for the wait-queue being active 161 * wake-function that tests for the wait-queue being active
@@ -92,6 +195,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 195}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 196EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 197
198long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
199{
200 unsigned long flags;
201
202 if (signal_pending_state(state, current))
203 return -ERESTARTSYS;
204
205 wait->private = current;
206 wait->func = autoremove_wake_function;
207
208 spin_lock_irqsave(&q->lock, flags);
209 if (list_empty(&wait->task_list)) {
210 if (wait->flags & WQ_FLAG_EXCLUSIVE)
211 __add_wait_queue_tail(q, wait);
212 else
213 __add_wait_queue(q, wait);
214 }
215 set_current_state(state);
216 spin_unlock_irqrestore(&q->lock, flags);
217
218 return 0;
219}
220EXPORT_SYMBOL(prepare_to_wait_event);
221
95/** 222/**
96 * finish_wait - clean up after waiting in a queue 223 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 224 * @q: waitqueue waited on
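prepare_to_wait_event() folds the signal check, the conditional re-queueing and the state change into one call so the wait_event*() macros can be slimmed down; the caller simply loops on it until the condition holds or a signal breaks the wait. A hand-expanded approximation of the expected calling pattern (wq and condition are placeholders; the real consumer is the wait_event() machinery in wait.h):

	DEFINE_WAIT(__wait);
	long __ret = 0;

	for (;;) {
		long __int = prepare_to_wait_event(&wq, &__wait,
						   TASK_INTERRUPTIBLE);
		if (condition)
			break;
		if (__int) {		/* -ERESTARTSYS: a signal arrived */
			__ret = __int;
			break;
		}
		schedule();
	}
	finish_wait(&wq, &__wait);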
diff --git a/kernel/smp.c b/kernel/smp.c
index 0564571dcdf7..f5768b0c816a 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -524,6 +524,11 @@ void __init setup_nr_cpu_ids(void)
524 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; 524 nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
525} 525}
526 526
527void __weak smp_announce(void)
528{
529 printk(KERN_INFO "Brought up %d CPUs\n", num_online_cpus());
530}
531
527/* Called by boot processor to activate the rest. */ 532/* Called by boot processor to activate the rest. */
528void __init smp_init(void) 533void __init smp_init(void)
529{ 534{
@@ -540,7 +545,7 @@ void __init smp_init(void)
540 } 545 }
541 546
542 /* Any cleanup work */ 547 /* Any cleanup work */
543 printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); 548 smp_announce();
544 smp_cpus_done(setup_max_cpus); 549 smp_cpus_done(setup_max_cpus);
545} 550}
546 551
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7d498d8cc4f..b24988353458 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -29,7 +29,6 @@
29#define CREATE_TRACE_POINTS 29#define CREATE_TRACE_POINTS
30#include <trace/events/irq.h> 30#include <trace/events/irq.h>
31 31
32#include <asm/irq.h>
33/* 32/*
34 - No shared variables, all the data are CPU local. 33 - No shared variables, all the data are CPU local.
35 - If a softirq needs serialization, let it serialize itself 34 - If a softirq needs serialization, let it serialize itself
@@ -100,13 +99,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
100 99
101 raw_local_irq_save(flags); 100 raw_local_irq_save(flags);
102 /* 101 /*
103 * The preempt tracer hooks into add_preempt_count and will break 102 * The preempt tracer hooks into preempt_count_add and will break
104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 103 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
105 * is set and before current->softirq_enabled is cleared. 104 * is set and before current->softirq_enabled is cleared.
106 * We must manually increment preempt_count here and manually 105 * We must manually increment preempt_count here and manually
107 * call the trace_preempt_off later. 106 * call the trace_preempt_off later.
108 */ 107 */
109 preempt_count() += cnt; 108 __preempt_count_add(cnt);
110 /* 109 /*
111 * Were softirqs turned off above: 110 * Were softirqs turned off above:
112 */ 111 */
@@ -120,7 +119,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
120#else /* !CONFIG_TRACE_IRQFLAGS */ 119#else /* !CONFIG_TRACE_IRQFLAGS */
121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 120static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
122{ 121{
123 add_preempt_count(cnt); 122 preempt_count_add(cnt);
124 barrier(); 123 barrier();
125} 124}
126#endif /* CONFIG_TRACE_IRQFLAGS */ 125#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -134,12 +133,11 @@ EXPORT_SYMBOL(local_bh_disable);
134 133
135static void __local_bh_enable(unsigned int cnt) 134static void __local_bh_enable(unsigned int cnt)
136{ 135{
137 WARN_ON_ONCE(in_irq());
138 WARN_ON_ONCE(!irqs_disabled()); 136 WARN_ON_ONCE(!irqs_disabled());
139 137
140 if (softirq_count() == cnt) 138 if (softirq_count() == cnt)
141 trace_softirqs_on(_RET_IP_); 139 trace_softirqs_on(_RET_IP_);
142 sub_preempt_count(cnt); 140 preempt_count_sub(cnt);
143} 141}
144 142
145/* 143/*
@@ -149,6 +147,7 @@ static void __local_bh_enable(unsigned int cnt)
149 */ 147 */
150void _local_bh_enable(void) 148void _local_bh_enable(void)
151{ 149{
150 WARN_ON_ONCE(in_irq());
152 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); 151 __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
153} 152}
154 153
@@ -169,12 +168,17 @@ static inline void _local_bh_enable_ip(unsigned long ip)
169 * Keep preemption disabled until we are done with 168 * Keep preemption disabled until we are done with
170 * softirq processing: 169 * softirq processing:
171 */ 170 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 171 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
173 172
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 173 if (unlikely(!in_interrupt() && local_softirq_pending())) {
174 /*
175 * Run softirq if any pending. And do it in its own stack
176 * as we may be calling this deep in a task call stack already.
177 */
175 do_softirq(); 178 do_softirq();
179 }
176 180
177 dec_preempt_count(); 181 preempt_count_dec();
178#ifdef CONFIG_TRACE_IRQFLAGS 182#ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 183 local_irq_enable();
180#endif 184#endif
@@ -256,7 +260,7 @@ restart:
256 " exited with %08x?\n", vec_nr, 260 " exited with %08x?\n", vec_nr,
257 softirq_to_name[vec_nr], h->action, 261 softirq_to_name[vec_nr], h->action,
258 prev_count, preempt_count()); 262 prev_count, preempt_count());
259 preempt_count() = prev_count; 263 preempt_count_set(prev_count);
260 } 264 }
261 265
262 rcu_bh_qs(cpu); 266 rcu_bh_qs(cpu);
@@ -280,10 +284,11 @@ restart:
280 284
281 account_irq_exit_time(current); 285 account_irq_exit_time(current);
282 __local_bh_enable(SOFTIRQ_OFFSET); 286 __local_bh_enable(SOFTIRQ_OFFSET);
287 WARN_ON_ONCE(in_interrupt());
283 tsk_restore_flags(current, old_flags, PF_MEMALLOC); 288 tsk_restore_flags(current, old_flags, PF_MEMALLOC);
284} 289}
285 290
286#ifndef __ARCH_HAS_DO_SOFTIRQ 291
287 292
288asmlinkage void do_softirq(void) 293asmlinkage void do_softirq(void)
289{ 294{
@@ -298,13 +303,11 @@ asmlinkage void do_softirq(void)
298 pending = local_softirq_pending(); 303 pending = local_softirq_pending();
299 304
300 if (pending) 305 if (pending)
301 __do_softirq(); 306 do_softirq_own_stack();
302 307
303 local_irq_restore(flags); 308 local_irq_restore(flags);
304} 309}
305 310
306#endif
307
308/* 311/*
309 * Enter an interrupt context. 312 * Enter an interrupt context.
310 */ 313 */
@@ -329,15 +332,21 @@ void irq_enter(void)
329static inline void invoke_softirq(void) 332static inline void invoke_softirq(void)
330{ 333{
331 if (!force_irqthreads) { 334 if (!force_irqthreads) {
335#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK
332 /* 336 /*
333 * We can safely execute softirq on the current stack if 337 * We can safely execute softirq on the current stack if
334 * it is the irq stack, because it should be near empty 338 * it is the irq stack, because it should be near empty
335 * at this stage. But we have no way to know if the arch 339 * at this stage.
336 * calls irq_exit() on the irq stack. So call softirq
337 * in its own stack to prevent from any overrun on top
338 * of a potentially deep task stack.
339 */ 340 */
340 do_softirq(); 341 __do_softirq();
342#else
343 /*
344 * Otherwise, irq_exit() is called on the task stack that can
345 * be potentially deep already. So call softirq in its own stack
346 * to prevent from any overrun.
347 */
348 do_softirq_own_stack();
349#endif
341 } else { 350 } else {
342 wakeup_softirqd(); 351 wakeup_softirqd();
343 } 352 }
@@ -369,7 +378,7 @@ void irq_exit(void)
369 378
370 account_irq_exit_time(current); 379 account_irq_exit_time(current);
371 trace_hardirq_exit(); 380 trace_hardirq_exit();
372 sub_preempt_count(HARDIRQ_OFFSET); 381 preempt_count_sub(HARDIRQ_OFFSET);
373 if (!in_interrupt() && local_softirq_pending()) 382 if (!in_interrupt() && local_softirq_pending())
374 invoke_softirq(); 383 invoke_softirq();
375 384
@@ -771,6 +780,10 @@ static void run_ksoftirqd(unsigned int cpu)
771{ 780{
772 local_irq_disable(); 781 local_irq_disable();
773 if (local_softirq_pending()) { 782 if (local_softirq_pending()) {
783 /*
784 * We can safely run softirq on inline stack, as we are not deep
785 * in the task stack here.
786 */
774 __do_softirq(); 787 __do_softirq();
775 rcu_note_context_switch(cpu); 788 rcu_note_context_switch(cpu);
776 local_irq_enable(); 789 local_irq_enable();
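do_softirq_own_stack() is the new arch hook: architectures with a dedicated softirq/irq stack switch stacks inside it, everyone else is expected to fall back to running the pending work inline. A sketch of the generic fallback one would expect outside this kernel/ diff (assumed, roughly what include/linux/interrupt.h would provide):

#ifndef __ARCH_HAS_DO_SOFTIRQ
/* No separate softirq stack: just run the pending softirqs in place */
static inline void do_softirq_own_stack(void)
{
	__do_softirq();
}
#endif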
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index c09f2955ae30..84571e09c907 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -20,6 +20,7 @@
20#include <linux/kallsyms.h> 20#include <linux/kallsyms.h>
21#include <linux/smpboot.h> 21#include <linux/smpboot.h>
22#include <linux/atomic.h> 22#include <linux/atomic.h>
23#include <linux/lglock.h>
23 24
24/* 25/*
25 * Structure to determine completion condition and record errors. May 26 * Structure to determine completion condition and record errors. May
@@ -43,6 +44,14 @@ static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
43static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task); 44static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
44static bool stop_machine_initialized = false; 45static bool stop_machine_initialized = false;
45 46
47/*
48 * Avoids a race between stop_two_cpus and global stop_cpus, where
49 * the stoppers could get queued up in reverse order, leading to
50 * system deadlock. Using an lglock means stop_two_cpus remains
51 * relatively cheap.
52 */
53DEFINE_STATIC_LGLOCK(stop_cpus_lock);
54
46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 55static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
47{ 56{
48 memset(done, 0, sizeof(*done)); 57 memset(done, 0, sizeof(*done));
@@ -115,6 +124,184 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
115 return done.executed ? done.ret : -ENOENT; 124 return done.executed ? done.ret : -ENOENT;
116} 125}
117 126
127/* This controls the threads on each CPU. */
128enum multi_stop_state {
129 /* Dummy starting state for thread. */
130 MULTI_STOP_NONE,
131 /* Awaiting everyone to be scheduled. */
132 MULTI_STOP_PREPARE,
133 /* Disable interrupts. */
134 MULTI_STOP_DISABLE_IRQ,
135 /* Run the function */
136 MULTI_STOP_RUN,
137 /* Exit */
138 MULTI_STOP_EXIT,
139};
140
141struct multi_stop_data {
142 int (*fn)(void *);
143 void *data;
144 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
145 unsigned int num_threads;
146 const struct cpumask *active_cpus;
147
148 enum multi_stop_state state;
149 atomic_t thread_ack;
150};
151
152static void set_state(struct multi_stop_data *msdata,
153 enum multi_stop_state newstate)
154{
155 /* Reset ack counter. */
156 atomic_set(&msdata->thread_ack, msdata->num_threads);
157 smp_wmb();
158 msdata->state = newstate;
159}
160
161/* Last one to ack a state moves to the next state. */
162static void ack_state(struct multi_stop_data *msdata)
163{
164 if (atomic_dec_and_test(&msdata->thread_ack))
165 set_state(msdata, msdata->state + 1);
166}
167
168/* This is the cpu_stop function which stops the CPU. */
169static int multi_cpu_stop(void *data)
170{
171 struct multi_stop_data *msdata = data;
172 enum multi_stop_state curstate = MULTI_STOP_NONE;
173 int cpu = smp_processor_id(), err = 0;
174 unsigned long flags;
175 bool is_active;
176
177 /*
178 * When called from stop_machine_from_inactive_cpu(), irq might
179 * already be disabled. Save the state and restore it on exit.
180 */
181 local_save_flags(flags);
182
183 if (!msdata->active_cpus)
184 is_active = cpu == cpumask_first(cpu_online_mask);
185 else
186 is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
187
188 /* Simple state machine */
189 do {
190 /* Chill out and ensure we re-read multi_stop_state. */
191 cpu_relax();
192 if (msdata->state != curstate) {
193 curstate = msdata->state;
194 switch (curstate) {
195 case MULTI_STOP_DISABLE_IRQ:
196 local_irq_disable();
197 hard_irq_disable();
198 break;
199 case MULTI_STOP_RUN:
200 if (is_active)
201 err = msdata->fn(msdata->data);
202 break;
203 default:
204 break;
205 }
206 ack_state(msdata);
207 }
208 } while (curstate != MULTI_STOP_EXIT);
209
210 local_irq_restore(flags);
211 return err;
212}
213
214struct irq_cpu_stop_queue_work_info {
215 int cpu1;
216 int cpu2;
217 struct cpu_stop_work *work1;
218 struct cpu_stop_work *work2;
219};
220
221/*
222 * This function is always run with irqs and preemption disabled.
223 * This guarantees that both work1 and work2 get queued, before
224 * our local migrate thread gets the chance to preempt us.
225 */
226static void irq_cpu_stop_queue_work(void *arg)
227{
228 struct irq_cpu_stop_queue_work_info *info = arg;
229 cpu_stop_queue_work(info->cpu1, info->work1);
230 cpu_stop_queue_work(info->cpu2, info->work2);
231}
232
233/**
234 * stop_two_cpus - stops two cpus
235 * @cpu1: the cpu to stop
236 * @cpu2: the other cpu to stop
237 * @fn: function to execute
238 * @arg: argument to @fn
239 *
240 * Stops both the current and specified CPU and runs @fn on one of them.
241 *
242 * returns when both are completed.
243 */
244int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
245{
246 struct cpu_stop_done done;
247 struct cpu_stop_work work1, work2;
248 struct irq_cpu_stop_queue_work_info call_args;
249 struct multi_stop_data msdata;
250
251 preempt_disable();
252 msdata = (struct multi_stop_data){
253 .fn = fn,
254 .data = arg,
255 .num_threads = 2,
256 .active_cpus = cpumask_of(cpu1),
257 };
258
259 work1 = work2 = (struct cpu_stop_work){
260 .fn = multi_cpu_stop,
261 .arg = &msdata,
262 .done = &done
263 };
264
265 call_args = (struct irq_cpu_stop_queue_work_info){
266 .cpu1 = cpu1,
267 .cpu2 = cpu2,
268 .work1 = &work1,
269 .work2 = &work2,
270 };
271
272 cpu_stop_init_done(&done, 2);
273 set_state(&msdata, MULTI_STOP_PREPARE);
274
275 /*
276 * If we observe both CPUs active we know _cpu_down() cannot yet have
277 * queued its stop_machine works and therefore ours will get executed
278 * first. Or its not either one of our CPUs that's getting unplugged,
279 * in which case we don't care.
280 *
281 * This relies on the stopper workqueues to be FIFO.
282 */
283 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
284 preempt_enable();
285 return -ENOENT;
286 }
287
288 lg_local_lock(&stop_cpus_lock);
289 /*
290 * Queuing needs to be done by the lowest numbered CPU, to ensure
291 * that works are always queued in the same order on every CPU.
292 * This prevents deadlocks.
293 */
294 smp_call_function_single(min(cpu1, cpu2),
295 &irq_cpu_stop_queue_work,
296 &call_args, 0);
297 lg_local_unlock(&stop_cpus_lock);
298 preempt_enable();
299
300 wait_for_completion(&done.completion);
301
302 return done.executed ? done.ret : -ENOENT;
303}
304
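stop_two_cpus() exists so that later patches in this series can atomically swap two tasks between CPUs for NUMA placement: only the two CPUs involved are stopped, and the same multi_cpu_stop() state machine coordinates them. A hedged usage sketch; the argument struct and migrate_swap_stop() callback are paraphrased from the scheduler side of the series and their exact names may differ:

	struct migration_swap_arg arg = {
		.src_task = p, .src_cpu = task_cpu(p),
		.dst_task = t, .dst_cpu = task_cpu(t),
	};

	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);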
118/** 305/**
119 * stop_one_cpu_nowait - stop a cpu but don't wait for completion 306 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
120 * @cpu: cpu to stop 307 * @cpu: cpu to stop
@@ -159,10 +346,10 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
159 * preempted by a stopper which might wait for other stoppers 346 * preempted by a stopper which might wait for other stoppers
160 * to enter @fn which can lead to deadlock. 347 * to enter @fn which can lead to deadlock.
161 */ 348 */
162 preempt_disable(); 349 lg_global_lock(&stop_cpus_lock);
163 for_each_cpu(cpu, cpumask) 350 for_each_cpu(cpu, cpumask)
164 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); 351 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu));
165 preempt_enable(); 352 lg_global_unlock(&stop_cpus_lock);
166} 353}
167 354
168static int __stop_cpus(const struct cpumask *cpumask, 355static int __stop_cpus(const struct cpumask *cpumask,
@@ -359,98 +546,14 @@ early_initcall(cpu_stop_init);
359 546
360#ifdef CONFIG_STOP_MACHINE 547#ifdef CONFIG_STOP_MACHINE
361 548
362/* This controls the threads on each CPU. */
363enum stopmachine_state {
364 /* Dummy starting state for thread. */
365 STOPMACHINE_NONE,
366 /* Awaiting everyone to be scheduled. */
367 STOPMACHINE_PREPARE,
368 /* Disable interrupts. */
369 STOPMACHINE_DISABLE_IRQ,
370 /* Run the function */
371 STOPMACHINE_RUN,
372 /* Exit */
373 STOPMACHINE_EXIT,
374};
375
376struct stop_machine_data {
377 int (*fn)(void *);
378 void *data;
379 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
380 unsigned int num_threads;
381 const struct cpumask *active_cpus;
382
383 enum stopmachine_state state;
384 atomic_t thread_ack;
385};
386
387static void set_state(struct stop_machine_data *smdata,
388 enum stopmachine_state newstate)
389{
390 /* Reset ack counter. */
391 atomic_set(&smdata->thread_ack, smdata->num_threads);
392 smp_wmb();
393 smdata->state = newstate;
394}
395
396/* Last one to ack a state moves to the next state. */
397static void ack_state(struct stop_machine_data *smdata)
398{
399 if (atomic_dec_and_test(&smdata->thread_ack))
400 set_state(smdata, smdata->state + 1);
401}
402
403/* This is the cpu_stop function which stops the CPU. */
404static int stop_machine_cpu_stop(void *data)
405{
406 struct stop_machine_data *smdata = data;
407 enum stopmachine_state curstate = STOPMACHINE_NONE;
408 int cpu = smp_processor_id(), err = 0;
409 unsigned long flags;
410 bool is_active;
411
412 /*
413 * When called from stop_machine_from_inactive_cpu(), irq might
414 * already be disabled. Save the state and restore it on exit.
415 */
416 local_save_flags(flags);
417
418 if (!smdata->active_cpus)
419 is_active = cpu == cpumask_first(cpu_online_mask);
420 else
421 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
422
423 /* Simple state machine */
424 do {
425 /* Chill out and ensure we re-read stopmachine_state. */
426 cpu_relax();
427 if (smdata->state != curstate) {
428 curstate = smdata->state;
429 switch (curstate) {
430 case STOPMACHINE_DISABLE_IRQ:
431 local_irq_disable();
432 hard_irq_disable();
433 break;
434 case STOPMACHINE_RUN:
435 if (is_active)
436 err = smdata->fn(smdata->data);
437 break;
438 default:
439 break;
440 }
441 ack_state(smdata);
442 }
443 } while (curstate != STOPMACHINE_EXIT);
444
445 local_irq_restore(flags);
446 return err;
447}
448
449int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 549int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
450{ 550{
451 struct stop_machine_data smdata = { .fn = fn, .data = data, 551 struct multi_stop_data msdata = {
452 .num_threads = num_online_cpus(), 552 .fn = fn,
453 .active_cpus = cpus }; 553 .data = data,
554 .num_threads = num_online_cpus(),
555 .active_cpus = cpus,
556 };
454 557
455 if (!stop_machine_initialized) { 558 if (!stop_machine_initialized) {
456 /* 559 /*
@@ -461,7 +564,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
461 unsigned long flags; 564 unsigned long flags;
462 int ret; 565 int ret;
463 566
464 WARN_ON_ONCE(smdata.num_threads != 1); 567 WARN_ON_ONCE(msdata.num_threads != 1);
465 568
466 local_irq_save(flags); 569 local_irq_save(flags);
467 hard_irq_disable(); 570 hard_irq_disable();
@@ -472,8 +575,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
472 } 575 }
473 576
474 /* Set the initial state and stop all online cpus. */ 577 /* Set the initial state and stop all online cpus. */
475 set_state(&smdata, STOPMACHINE_PREPARE); 578 set_state(&msdata, MULTI_STOP_PREPARE);
476 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 579 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
477} 580}
478 581
479int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 582int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +616,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
513int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 616int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
514 const struct cpumask *cpus) 617 const struct cpumask *cpus)
515{ 618{
516 struct stop_machine_data smdata = { .fn = fn, .data = data, 619 struct multi_stop_data msdata = { .fn = fn, .data = data,
517 .active_cpus = cpus }; 620 .active_cpus = cpus };
518 struct cpu_stop_done done; 621 struct cpu_stop_done done;
519 int ret; 622 int ret;
520 623
521 /* Local CPU must be inactive and CPU hotplug in progress. */ 624 /* Local CPU must be inactive and CPU hotplug in progress. */
522 BUG_ON(cpu_active(raw_smp_processor_id())); 625 BUG_ON(cpu_active(raw_smp_processor_id()));
523 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ 626 msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
524 627
525 /* No proper task established and can't sleep - busy wait for lock. */ 628 /* No proper task established and can't sleep - busy wait for lock. */
526 while (!mutex_trylock(&stop_cpus_mutex)) 629 while (!mutex_trylock(&stop_cpus_mutex))
527 cpu_relax(); 630 cpu_relax();
528 631
529 /* Schedule work on other CPUs and execute directly for local CPU */ 632 /* Schedule work on other CPUs and execute directly for local CPU */
530 set_state(&smdata, STOPMACHINE_PREPARE); 633 set_state(&msdata, MULTI_STOP_PREPARE);
531 cpu_stop_init_done(&done, num_active_cpus()); 634 cpu_stop_init_done(&done, num_active_cpus());
532 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, 635 queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
533 &done); 636 &done);
534 ret = stop_machine_cpu_stop(&smdata); 637 ret = multi_cpu_stop(&msdata);
535 638
536 /* Busy wait for completion. */ 639 /* Busy wait for completion. */
537 while (!completion_done(&done.completion)) 640 while (!completion_done(&done.completion))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2f06f3c6a3f..36547dddcdb8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -190,7 +190,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write,
190 190
191#ifdef CONFIG_MAGIC_SYSRQ 191#ifdef CONFIG_MAGIC_SYSRQ
192/* Note: sysrq code uses it's own private copy */ 192/* Note: sysrq code uses it's own private copy */
193static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; 193static int __sysrq_enabled = CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE;
194 194
195static int sysrq_sysctl_handler(ctl_table *table, int write, 195static int sysrq_sysctl_handler(ctl_table *table, int write,
196 void __user *buffer, size_t *lenp, 196 void __user *buffer, size_t *lenp,
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = {
371 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
372 }, 372 },
373 { 373 {
374 .procname = "numa_balancing_scan_period_reset",
375 .data = &sysctl_numa_balancing_scan_period_reset,
376 .maxlen = sizeof(unsigned int),
377 .mode = 0644,
378 .proc_handler = proc_dointvec,
379 },
380 {
381 .procname = "numa_balancing_scan_period_max_ms", 374 .procname = "numa_balancing_scan_period_max_ms",
382 .data = &sysctl_numa_balancing_scan_period_max, 375 .data = &sysctl_numa_balancing_scan_period_max,
383 .maxlen = sizeof(unsigned int), 376 .maxlen = sizeof(unsigned int),
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
391 .mode = 0644, 384 .mode = 0644,
392 .proc_handler = proc_dointvec, 385 .proc_handler = proc_dointvec,
393 }, 386 },
387 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int),
398 .mode = 0644,
399 .proc_handler = proc_dointvec,
400 },
394#endif /* CONFIG_NUMA_BALANCING */ 401#endif /* CONFIG_NUMA_BALANCING */
395#endif /* CONFIG_SCHED_DEBUG */ 402#endif /* CONFIG_SCHED_DEBUG */
396 { 403 {
@@ -1049,6 +1056,7 @@ static struct ctl_table kern_table[] = {
1049 .maxlen = sizeof(sysctl_perf_event_sample_rate), 1056 .maxlen = sizeof(sysctl_perf_event_sample_rate),
1050 .mode = 0644, 1057 .mode = 0644,
1051 .proc_handler = perf_proc_update_handler, 1058 .proc_handler = perf_proc_update_handler,
1059 .extra1 = &one,
1052 }, 1060 },
1053 { 1061 {
1054 .procname = "perf_cpu_time_max_percent", 1062 .procname = "perf_cpu_time_max_percent",
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 2b62fe86f9ec..3ce6e8c5f3fc 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -100,7 +100,7 @@ config NO_HZ_FULL
100 # RCU_USER_QS dependency 100 # RCU_USER_QS dependency
101 depends on HAVE_CONTEXT_TRACKING 101 depends on HAVE_CONTEXT_TRACKING
102 # VIRT_CPU_ACCOUNTING_GEN dependency 102 # VIRT_CPU_ACCOUNTING_GEN dependency
103 depends on 64BIT 103 depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
104 select NO_HZ_COMMON 104 select NO_HZ_COMMON
105 select RCU_USER_QS 105 select RCU_USER_QS
106 select RCU_NOCB_CPU 106 select RCU_NOCB_CPU
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index eec50fcef9e4..88c9c65a430d 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -490,7 +490,7 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; 490 clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
491 491
492 if (!alarmtimer_get_rtcdev()) 492 if (!alarmtimer_get_rtcdev())
493 return -ENOTSUPP; 493 return -EINVAL;
494 494
495 return hrtimer_get_res(baseid, tp); 495 return hrtimer_get_res(baseid, tp);
496} 496}
@@ -507,7 +507,7 @@ static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; 507 struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
508 508
509 if (!alarmtimer_get_rtcdev()) 509 if (!alarmtimer_get_rtcdev())
510 return -ENOTSUPP; 510 return -EINVAL;
511 511
512 *tp = ktime_to_timespec(base->gettime()); 512 *tp = ktime_to_timespec(base->gettime());
513 return 0; 513 return 0;
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 38959c866789..086ad6043bcb 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -33,29 +33,64 @@ struct ce_unbind {
33 int res; 33 int res;
34}; 34};
35 35
36/** 36static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
37 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 37 bool ismax)
38 * @latch: value to convert
39 * @evt: pointer to clock event device descriptor
40 *
41 * Math helper, returns latch value converted to nanoseconds (bound checked)
42 */
43u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
44{ 38{
45 u64 clc = (u64) latch << evt->shift; 39 u64 clc = (u64) latch << evt->shift;
40 u64 rnd;
46 41
47 if (unlikely(!evt->mult)) { 42 if (unlikely(!evt->mult)) {
48 evt->mult = 1; 43 evt->mult = 1;
49 WARN_ON(1); 44 WARN_ON(1);
50 } 45 }
46 rnd = (u64) evt->mult - 1;
47
48 /*
49 * Upper bound sanity check. If the backwards conversion is
50 * not equal latch, we know that the above shift overflowed.
51 */
52 if ((clc >> evt->shift) != (u64)latch)
53 clc = ~0ULL;
54
55 /*
56 * Scaled math oddities:
57 *
58 * For mult <= (1 << shift) we can safely add mult - 1 to
59 * prevent integer rounding loss. So the backwards conversion
60 * from nsec to device ticks will be correct.
61 *
62 * For mult > (1 << shift), i.e. device frequency is > 1GHz we
63 * need to be careful. Adding mult - 1 will result in a value
64 * which when converted back to device ticks can be larger
65 * than latch by up to (mult - 1) >> shift. For the min_delta
66 * calculation we still want to apply this in order to stay
67 * above the minimum device ticks limit. For the upper limit
68 * we would end up with a latch value larger than the upper
69 * limit of the device, so we omit the add to stay below the
70 * device upper boundary.
71 *
72 * Also omit the add if it would overflow the u64 boundary.
73 */
74 if ((~0ULL - clc > rnd) &&
75 (!ismax || evt->mult <= (1U << evt->shift)))
76 clc += rnd;
51 77
52 do_div(clc, evt->mult); 78 do_div(clc, evt->mult);
53 if (clc < 1000)
54 clc = 1000;
55 if (clc > KTIME_MAX)
56 clc = KTIME_MAX;
57 79
58 return clc; 80 /* Deltas less than 1usec are pointless noise */
81 return clc > 1000 ? clc : 1000;
82}
83
84/**
85 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
86 * @latch: value to convert
87 * @evt: pointer to clock event device descriptor
88 *
89 * Math helper, returns latch value converted to nanoseconds (bound checked)
90 */
91u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
92{
93 return cev_delta2ns(latch, evt, false);
59} 94}
60EXPORT_SYMBOL_GPL(clockevent_delta2ns); 95EXPORT_SYMBOL_GPL(clockevent_delta2ns);
61 96
@@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq)
380 sec = 600; 415 sec = 600;
381 416
382 clockevents_calc_mult_shift(dev, freq, sec); 417 clockevents_calc_mult_shift(dev, freq, sec);
383 dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); 418 dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false);
384 dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); 419 dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true);
385} 420}
386 421
387/** 422/**
@@ -584,7 +619,7 @@ static ssize_t sysfs_unbind_tick_dev(struct device *dev,
584 const char *buf, size_t count) 619 const char *buf, size_t count)
585{ 620{
586 char name[CS_NAME_LEN]; 621 char name[CS_NAME_LEN];
587 size_t ret = sysfs_get_uname(buf, name, count); 622 ssize_t ret = sysfs_get_uname(buf, name, count);
588 struct clock_event_device *ce; 623 struct clock_event_device *ce;
589 624
590 if (ret < 0) 625 if (ret < 0)
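
The cev_delta2ns() hunk above explains the overflow clamp and the asymmetric rounding for min vs. max deltas. A standalone userspace sketch of that conversion, with a hypothetical 500MHz device (mult = 1, shift = 1, so ns = ticks * 2) chosen purely for illustration:

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

static uint64_t delta2ns(uint64_t latch, uint32_t mult, uint32_t shift, bool ismax)
{
	uint64_t clc = latch << shift;
	uint64_t rnd = (uint64_t)mult - 1;

	/* upper bound check: if the shift overflowed 64 bits, clamp to the maximum */
	if ((clc >> shift) != latch)
		clc = ~0ULL;

	/*
	 * Round up so the ns value converts back to at least 'latch' ticks,
	 * except for the max limit of a >1GHz device (mult > 1 << shift),
	 * or when the add itself would overflow.
	 */
	if ((~0ULL - clc > rnd) && (!ismax || mult <= (1U << shift)))
		clc += rnd;

	clc /= mult;			/* do_div(clc, mult) in the kernel */

	return clc > 1000 ? clc : 1000;	/* deltas under 1usec are noise */
}

int main(void)
{
	printf("min_delta: %llu ns\n", (unsigned long long)delta2ns(3, 1, 1, false));
	printf("max_delta: %llu ns\n", (unsigned long long)delta2ns(~0ULL, 1, 1, true));
	return 0;
}

The first call lands below 1000 and is clamped to the 1usec floor; the second overflows the shift and comes back as the ~0ULL maximum, which is what the new upper bound check is for.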
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 50a8736757f3..ba3e502c955a 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -479,6 +479,7 @@ static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
479static inline void clocksource_resume_watchdog(void) { } 479static inline void clocksource_resume_watchdog(void) { }
480static inline int __clocksource_watchdog_kthread(void) { return 0; } 480static inline int __clocksource_watchdog_kthread(void) { return 0; }
481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; } 481static bool clocksource_is_watchdog(struct clocksource *cs) { return false; }
482void clocksource_mark_unstable(struct clocksource *cs) { }
482 483
483#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 484#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
484 485
@@ -537,40 +538,55 @@ static u32 clocksource_max_adjustment(struct clocksource *cs)
537} 538}
538 539
539/** 540/**
540 * clocksource_max_deferment - Returns max time the clocksource can be deferred 541 * clocks_calc_max_nsecs - Returns maximum nanoseconds that can be converted
541 * @cs: Pointer to clocksource 542 * @mult: cycle to nanosecond multiplier
542 * 543 * @shift: cycle to nanosecond divisor (power of two)
544 * @maxadj: maximum adjustment value to mult (~11%)
545 * @mask: bitmask for two's complement subtraction of non 64 bit counters
543 */ 546 */
544static u64 clocksource_max_deferment(struct clocksource *cs) 547u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask)
545{ 548{
546 u64 max_nsecs, max_cycles; 549 u64 max_nsecs, max_cycles;
547 550
548 /* 551 /*
549 * Calculate the maximum number of cycles that we can pass to the 552 * Calculate the maximum number of cycles that we can pass to the
550 * cyc2ns function without overflowing a 64-bit signed result. The 553 * cyc2ns function without overflowing a 64-bit signed result. The
551 * maximum number of cycles is equal to ULLONG_MAX/(cs->mult+cs->maxadj) 554 * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj)
552 * which is equivalent to the below. 555 * which is equivalent to the below.
553 * max_cycles < (2^63)/(cs->mult + cs->maxadj) 556 * max_cycles < (2^63)/(mult + maxadj)
554 * max_cycles < 2^(log2((2^63)/(cs->mult + cs->maxadj))) 557 * max_cycles < 2^(log2((2^63)/(mult + maxadj)))
555 * max_cycles < 2^(log2(2^63) - log2(cs->mult + cs->maxadj)) 558 * max_cycles < 2^(log2(2^63) - log2(mult + maxadj))
556 * max_cycles < 2^(63 - log2(cs->mult + cs->maxadj)) 559 * max_cycles < 2^(63 - log2(mult + maxadj))
557 * max_cycles < 1 << (63 - log2(cs->mult + cs->maxadj)) 560 * max_cycles < 1 << (63 - log2(mult + maxadj))
558 * Please note that we add 1 to the result of the log2 to account for 561 * Please note that we add 1 to the result of the log2 to account for
559 * any rounding errors, ensure the above inequality is satisfied and 562 * any rounding errors, ensure the above inequality is satisfied and
560 * no overflow will occur. 563 * no overflow will occur.
561 */ 564 */
562 max_cycles = 1ULL << (63 - (ilog2(cs->mult + cs->maxadj) + 1)); 565 max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
563 566
564 /* 567 /*
565 * The actual maximum number of cycles we can defer the clocksource is 568 * The actual maximum number of cycles we can defer the clocksource is
566 * determined by the minimum of max_cycles and cs->mask. 569 * determined by the minimum of max_cycles and mask.
567 * Note: Here we subtract the maxadj to make sure we don't sleep for 570 * Note: Here we subtract the maxadj to make sure we don't sleep for
568 * too long if there's a large negative adjustment. 571 * too long if there's a large negative adjustment.
569 */ 572 */
570 max_cycles = min_t(u64, max_cycles, (u64) cs->mask); 573 max_cycles = min(max_cycles, mask);
571 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult - cs->maxadj, 574 max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift);
572 cs->shift); 575
576 return max_nsecs;
577}
578
579/**
580 * clocksource_max_deferment - Returns max time the clocksource can be deferred
581 * @cs: Pointer to clocksource
582 *
583 */
584static u64 clocksource_max_deferment(struct clocksource *cs)
585{
586 u64 max_nsecs;
573 587
588 max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj,
589 cs->mask);
574 /* 590 /*
575 * To ensure that the clocksource does not wrap whilst we are idle, 591 * To ensure that the clocksource does not wrap whilst we are idle,
576 * limit the time the clocksource can be deferred by 12.5%. Please 592 * limit the time the clocksource can be deferred by 12.5%. Please
@@ -893,7 +909,7 @@ sysfs_show_current_clocksources(struct device *dev,
893 return count; 909 return count;
894} 910}
895 911
896size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt) 912ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt)
897{ 913{
898 size_t ret = cnt; 914 size_t ret = cnt;
899 915
@@ -924,7 +940,7 @@ static ssize_t sysfs_override_clocksource(struct device *dev,
924 struct device_attribute *attr, 940 struct device_attribute *attr,
925 const char *buf, size_t count) 941 const char *buf, size_t count)
926{ 942{
927 size_t ret; 943 ssize_t ret;
928 944
929 mutex_lock(&clocksource_mutex); 945 mutex_lock(&clocksource_mutex);
930 946
@@ -952,7 +968,7 @@ static ssize_t sysfs_unbind_clocksource(struct device *dev,
952{ 968{
953 struct clocksource *cs; 969 struct clocksource *cs;
954 char name[CS_NAME_LEN]; 970 char name[CS_NAME_LEN];
955 size_t ret; 971 ssize_t ret;
956 972
957 ret = sysfs_get_uname(buf, name, count); 973 ret = sysfs_get_uname(buf, name, count);
958 if (ret < 0) 974 if (ret < 0)
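
The clocks_calc_max_nsecs() hunk above factors the "how long can this counter run before cyc2ns overflows" math out of clocksource_max_deferment(). A standalone sketch of that calculation, using a made-up 1GHz / 1ns-per-cycle source (mult = 2^24, shift = 24, maxadj = 0) as the example:

#include <stdint.h>
#include <stdio.h>

static int ilog2_u32(uint32_t v)		/* floor(log2(v)), v > 0 */
{
	int r = -1;
	while (v) { v >>= 1; r++; }
	return r;
}

static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

static uint64_t calc_max_nsecs(uint32_t mult, uint32_t shift, uint32_t maxadj, uint64_t mask)
{
	/* largest cycle count whose cyc2ns result still fits a signed 64-bit value */
	uint64_t max_cycles = 1ULL << (63 - (ilog2_u32(mult + maxadj) + 1));

	if (max_cycles > mask)			/* can't defer past the counter width */
		max_cycles = mask;

	/* use mult - maxadj so a large negative adjustment can't make us sleep too long */
	return cyc2ns(max_cycles, mult - maxadj, shift);
}

int main(void)
{
	/* roughly 275 seconds for this hypothetical source */
	printf("%llu ns\n", (unsigned long long)calc_max_nsecs(1u << 24, 24, 0, ~0ULL));
	return 0;
}

The caller in the patch (clocksource_max_deferment) then trims a further 12.5% off this value as an idle safety margin; sched_clock_register reuses the same helper to size its wraparound timer.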
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index bb2215174f05..af8d1d4f3d55 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -475,6 +475,7 @@ static void sync_cmos_clock(struct work_struct *work)
475 * called as close as possible to 500 ms before the new second starts. 475 * called as close as possible to 500 ms before the new second starts.
476 * This code is run on a timer. If the clock is set, that timer 476 * This code is run on a timer. If the clock is set, that timer
477 * may not expire at the correct time. Thus, we adjust... 477 * may not expire at the correct time. Thus, we adjust...
478 * We want the clock to be within a couple of ticks from the target.
478 */ 479 */
479 if (!ntp_synced()) { 480 if (!ntp_synced()) {
480 /* 481 /*
@@ -485,7 +486,7 @@ static void sync_cmos_clock(struct work_struct *work)
485 } 486 }
486 487
487 getnstimeofday(&now); 488 getnstimeofday(&now);
488 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) { 489 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec * 5) {
489 struct timespec adjust = now; 490 struct timespec adjust = now;
490 491
491 fail = -ENODEV; 492 fail = -ENODEV;
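
The ntp.c hunk above widens the acceptance window around the half-second target from half a tick to five ticks. A quick arithmetic check of what that means, assuming HZ=100 (tick_nsec of roughly 10,000,000 ns); the numbers are illustrative, not from the patch:

#include <stdio.h>

int main(void)
{
	long tick_nsec = 10 * 1000 * 1000;	/* one tick at an assumed HZ=100 */

	printf("old window: +/- %ld ns (half a tick)\n", tick_nsec / 2);
	printf("new window: +/- %ld ns (five ticks)\n", tick_nsec * 5);
	return 0;
}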
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 0b479a6a22bb..68b799375981 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -8,25 +8,28 @@
8#include <linux/clocksource.h> 8#include <linux/clocksource.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/jiffies.h> 10#include <linux/jiffies.h>
11#include <linux/ktime.h>
11#include <linux/kernel.h> 12#include <linux/kernel.h>
12#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/syscore_ops.h> 15#include <linux/syscore_ops.h>
15#include <linux/timer.h> 16#include <linux/hrtimer.h>
16#include <linux/sched_clock.h> 17#include <linux/sched_clock.h>
18#include <linux/seqlock.h>
19#include <linux/bitops.h>
17 20
18struct clock_data { 21struct clock_data {
22 ktime_t wrap_kt;
19 u64 epoch_ns; 23 u64 epoch_ns;
20 u32 epoch_cyc; 24 u64 epoch_cyc;
21 u32 epoch_cyc_copy; 25 seqcount_t seq;
22 unsigned long rate; 26 unsigned long rate;
23 u32 mult; 27 u32 mult;
24 u32 shift; 28 u32 shift;
25 bool suspended; 29 bool suspended;
26}; 30};
27 31
28static void sched_clock_poll(unsigned long wrap_ticks); 32static struct hrtimer sched_clock_timer;
29static DEFINE_TIMER(sched_clock_timer, sched_clock_poll, 0, 0);
30static int irqtime = -1; 33static int irqtime = -1;
31 34
32core_param(irqtime, irqtime, int, 0400); 35core_param(irqtime, irqtime, int, 0400);
@@ -35,42 +38,46 @@ static struct clock_data cd = {
35 .mult = NSEC_PER_SEC / HZ, 38 .mult = NSEC_PER_SEC / HZ,
36}; 39};
37 40
38static u32 __read_mostly sched_clock_mask = 0xffffffff; 41static u64 __read_mostly sched_clock_mask;
39 42
40static u32 notrace jiffy_sched_clock_read(void) 43static u64 notrace jiffy_sched_clock_read(void)
41{ 44{
42 return (u32)(jiffies - INITIAL_JIFFIES); 45 /*
46 * We don't need to use get_jiffies_64 on 32-bit arches here
47 * because we register with BITS_PER_LONG
48 */
49 return (u64)(jiffies - INITIAL_JIFFIES);
43} 50}
44 51
45static u32 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; 52static u32 __read_mostly (*read_sched_clock_32)(void);
53
54static u64 notrace read_sched_clock_32_wrapper(void)
55{
56 return read_sched_clock_32();
57}
58
59static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read;
46 60
47static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) 61static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift)
48{ 62{
49 return (cyc * mult) >> shift; 63 return (cyc * mult) >> shift;
50} 64}
51 65
52static unsigned long long notrace sched_clock_32(void) 66unsigned long long notrace sched_clock(void)
53{ 67{
54 u64 epoch_ns; 68 u64 epoch_ns;
55 u32 epoch_cyc; 69 u64 epoch_cyc;
56 u32 cyc; 70 u64 cyc;
71 unsigned long seq;
57 72
58 if (cd.suspended) 73 if (cd.suspended)
59 return cd.epoch_ns; 74 return cd.epoch_ns;
60 75
61 /*
62 * Load the epoch_cyc and epoch_ns atomically. We do this by
63 * ensuring that we always write epoch_cyc, epoch_ns and
64 * epoch_cyc_copy in strict order, and read them in strict order.
65 * If epoch_cyc and epoch_cyc_copy are not equal, then we're in
66 * the middle of an update, and we should repeat the load.
67 */
68 do { 76 do {
77 seq = read_seqcount_begin(&cd.seq);
69 epoch_cyc = cd.epoch_cyc; 78 epoch_cyc = cd.epoch_cyc;
70 smp_rmb();
71 epoch_ns = cd.epoch_ns; 79 epoch_ns = cd.epoch_ns;
72 smp_rmb(); 80 } while (read_seqcount_retry(&cd.seq, seq));
73 } while (epoch_cyc != cd.epoch_cyc_copy);
74 81
75 cyc = read_sched_clock(); 82 cyc = read_sched_clock();
76 cyc = (cyc - epoch_cyc) & sched_clock_mask; 83 cyc = (cyc - epoch_cyc) & sched_clock_mask;
@@ -83,49 +90,46 @@ static unsigned long long notrace sched_clock_32(void)
83static void notrace update_sched_clock(void) 90static void notrace update_sched_clock(void)
84{ 91{
85 unsigned long flags; 92 unsigned long flags;
86 u32 cyc; 93 u64 cyc;
87 u64 ns; 94 u64 ns;
88 95
89 cyc = read_sched_clock(); 96 cyc = read_sched_clock();
90 ns = cd.epoch_ns + 97 ns = cd.epoch_ns +
91 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, 98 cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask,
92 cd.mult, cd.shift); 99 cd.mult, cd.shift);
93 /* 100
94 * Write epoch_cyc and epoch_ns in a way that the update is
95 * detectable in cyc_to_fixed_sched_clock().
96 */
97 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
98 cd.epoch_cyc_copy = cyc; 102 write_seqcount_begin(&cd.seq);
99 smp_wmb();
100 cd.epoch_ns = ns; 103 cd.epoch_ns = ns;
101 smp_wmb();
102 cd.epoch_cyc = cyc; 104 cd.epoch_cyc = cyc;
105 write_seqcount_end(&cd.seq);
103 raw_local_irq_restore(flags); 106 raw_local_irq_restore(flags);
104} 107}
105 108
106static void sched_clock_poll(unsigned long wrap_ticks) 109static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt)
107{ 110{
108 mod_timer(&sched_clock_timer, round_jiffies(jiffies + wrap_ticks));
109 update_sched_clock(); 111 update_sched_clock();
112 hrtimer_forward_now(hrt, cd.wrap_kt);
113 return HRTIMER_RESTART;
110} 114}
111 115
112void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate) 116void __init sched_clock_register(u64 (*read)(void), int bits,
117 unsigned long rate)
113{ 118{
114 unsigned long r, w; 119 unsigned long r;
115 u64 res, wrap; 120 u64 res, wrap;
116 char r_unit; 121 char r_unit;
117 122
118 if (cd.rate > rate) 123 if (cd.rate > rate)
119 return; 124 return;
120 125
121 BUG_ON(bits > 32);
122 WARN_ON(!irqs_disabled()); 126 WARN_ON(!irqs_disabled());
123 read_sched_clock = read; 127 read_sched_clock = read;
124 sched_clock_mask = (1ULL << bits) - 1; 128 sched_clock_mask = CLOCKSOURCE_MASK(bits);
125 cd.rate = rate; 129 cd.rate = rate;
126 130
127 /* calculate the mult/shift to convert counter ticks to ns. */ 131 /* calculate the mult/shift to convert counter ticks to ns. */
128 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 0); 132 clocks_calc_mult_shift(&cd.mult, &cd.shift, rate, NSEC_PER_SEC, 3600);
129 133
130 r = rate; 134 r = rate;
131 if (r >= 4000000) { 135 if (r >= 4000000) {
@@ -138,20 +142,14 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
138 r_unit = ' '; 142 r_unit = ' ';
139 143
140 /* calculate how many ns until we wrap */ 144 /* calculate how many ns until we wrap */
141 wrap = cyc_to_ns((1ULL << bits) - 1, cd.mult, cd.shift); 145 wrap = clocks_calc_max_nsecs(cd.mult, cd.shift, 0, sched_clock_mask);
142 do_div(wrap, NSEC_PER_MSEC); 146 cd.wrap_kt = ns_to_ktime(wrap - (wrap >> 3));
143 w = wrap;
144 147
145 /* calculate the ns resolution of this counter */ 148 /* calculate the ns resolution of this counter */
146 res = cyc_to_ns(1ULL, cd.mult, cd.shift); 149 res = cyc_to_ns(1ULL, cd.mult, cd.shift);
147 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lums\n", 150 pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n",
148 bits, r, r_unit, res, w); 151 bits, r, r_unit, res, wrap);
149 152
150 /*
151 * Start the timer to keep sched_clock() properly updated and
152 * sets the initial epoch.
153 */
154 sched_clock_timer.data = msecs_to_jiffies(w - (w / 10));
155 update_sched_clock(); 153 update_sched_clock();
156 154
157 /* 155 /*
@@ -166,11 +164,10 @@ void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
166 pr_debug("Registered %pF as sched_clock source\n", read); 164 pr_debug("Registered %pF as sched_clock source\n", read);
167} 165}
168 166
169unsigned long long __read_mostly (*sched_clock_func)(void) = sched_clock_32; 167void __init setup_sched_clock(u32 (*read)(void), int bits, unsigned long rate)
170
171unsigned long long notrace sched_clock(void)
172{ 168{
173 return sched_clock_func(); 169 read_sched_clock_32 = read;
170 sched_clock_register(read_sched_clock_32_wrapper, bits, rate);
174} 171}
175 172
176void __init sched_clock_postinit(void) 173void __init sched_clock_postinit(void)
@@ -180,14 +177,22 @@ void __init sched_clock_postinit(void)
180 * make it the final one one. 177 * make it the final one one.
181 */ 178 */
182 if (read_sched_clock == jiffy_sched_clock_read) 179 if (read_sched_clock == jiffy_sched_clock_read)
183 setup_sched_clock(jiffy_sched_clock_read, 32, HZ); 180 sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ);
184 181
185 sched_clock_poll(sched_clock_timer.data); 182 update_sched_clock();
183
184 /*
185 * Start the timer to keep sched_clock() properly updated and
186 * sets the initial epoch.
187 */
188 hrtimer_init(&sched_clock_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
189 sched_clock_timer.function = sched_clock_poll;
190 hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL);
186} 191}
187 192
188static int sched_clock_suspend(void) 193static int sched_clock_suspend(void)
189{ 194{
190 sched_clock_poll(sched_clock_timer.data); 195 sched_clock_poll(&sched_clock_timer);
191 cd.suspended = true; 196 cd.suspended = true;
192 return 0; 197 return 0;
193} 198}
@@ -195,7 +200,6 @@ static int sched_clock_suspend(void)
195static void sched_clock_resume(void) 200static void sched_clock_resume(void)
196{ 201{
197 cd.epoch_cyc = read_sched_clock(); 202 cd.epoch_cyc = read_sched_clock();
198 cd.epoch_cyc_copy = cd.epoch_cyc;
199 cd.suspended = false; 203 cd.suspended = false;
200} 204}
201 205
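
The sched_clock.c changes above replace the hand-rolled epoch_cyc/epoch_cyc_copy handshake with a seqcount. A minimal standalone sketch of the same reader-retry idea, using C11 atomics with (stricter) sequential consistency in place of the kernel's write_seqcount_begin/end and read_seqcount_begin/retry primitives:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic unsigned int seq;
static _Atomic uint64_t epoch_cyc, epoch_ns;

static void update_epoch(uint64_t cyc, uint64_t ns)
{
	seq++;			/* odd: update in progress (write_seqcount_begin) */
	epoch_cyc = cyc;
	epoch_ns = ns;
	seq++;			/* even again: update complete (write_seqcount_end) */
}

static void read_epoch(uint64_t *cyc, uint64_t *ns)
{
	unsigned int start;

	do {
		start = seq;			/* read_seqcount_begin */
		*cyc = epoch_cyc;
		*ns = epoch_ns;
	} while ((start & 1) || seq != start);	/* read_seqcount_retry */
}

int main(void)
{
	uint64_t cyc, ns;

	update_epoch(12345, 678);
	read_epoch(&cyc, &ns);
	printf("epoch_cyc=%llu epoch_ns=%llu\n",
	       (unsigned long long)cyc, (unsigned long long)ns);
	return 0;
}

A reader that races with an update either sees an odd sequence or sees it change across the two reads, and simply retries; that is what lets the patch drop the smp_rmb/smp_wmb pairs and the epoch_cyc_copy field.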
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 218bcb565fed..9532690daaa9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -70,6 +70,7 @@ static bool tick_check_broadcast_device(struct clock_event_device *curdev,
70 struct clock_event_device *newdev) 70 struct clock_event_device *newdev)
71{ 71{
72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) || 72 if ((newdev->features & CLOCK_EVT_FEAT_DUMMY) ||
73 (newdev->features & CLOCK_EVT_FEAT_PERCPU) ||
73 (newdev->features & CLOCK_EVT_FEAT_C3STOP)) 74 (newdev->features & CLOCK_EVT_FEAT_C3STOP))
74 return false; 75 return false;
75 76
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bc906cad709b..18e71f7fbc2a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -31,7 +31,7 @@ extern void tick_install_replacement(struct clock_event_device *dev);
31 31
32extern void clockevents_shutdown(struct clock_event_device *dev); 32extern void clockevents_shutdown(struct clock_event_device *dev);
33 33
34extern size_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); 34extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
35 35
36/* 36/*
37 * NO_HZ / high resolution timer shared code 37 * NO_HZ / high resolution timer shared code
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 947ba25a95a0..3abf53418b67 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1613,9 +1613,10 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
1613 * ktime_get_update_offsets - hrtimer helper 1613 * ktime_get_update_offsets - hrtimer helper
1614 * @offs_real: pointer to storage for monotonic -> realtime offset 1614 * @offs_real: pointer to storage for monotonic -> realtime offset
1615 * @offs_boot: pointer to storage for monotonic -> boottime offset 1615 * @offs_boot: pointer to storage for monotonic -> boottime offset
1616 * @offs_tai: pointer to storage for monotonic -> clock tai offset
1616 * 1617 *
1617 * Returns current monotonic time and updates the offsets 1618 * Returns current monotonic time and updates the offsets
1618 * Called from hrtimer_interupt() or retrigger_next_event() 1619 * Called from hrtimer_interrupt() or retrigger_next_event()
1619 */ 1620 */
1620ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, 1621ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot,
1621 ktime_t *offs_tai) 1622 ktime_t *offs_tai)
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 0b537f27b559..1fb08f21302e 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -298,15 +298,15 @@ static int tstats_show(struct seq_file *m, void *v)
298 period = ktime_to_timespec(time); 298 period = ktime_to_timespec(time);
299 ms = period.tv_nsec / 1000000; 299 ms = period.tv_nsec / 1000000;
300 300
301 seq_puts(m, "Timer Stats Version: v0.2\n"); 301 seq_puts(m, "Timer Stats Version: v0.3\n");
302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); 302 seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms);
303 if (atomic_read(&overflow_count)) 303 if (atomic_read(&overflow_count))
304 seq_printf(m, "Overflow: %d entries\n", 304 seq_printf(m, "Overflow: %d entries\n", atomic_read(&overflow_count));
305 atomic_read(&overflow_count)); 305 seq_printf(m, "Collection: %s\n", timer_stats_active ? "active" : "inactive");
306 306
307 for (i = 0; i < nr_entries; i++) { 307 for (i = 0; i < nr_entries; i++) {
308 entry = entries + i; 308 entry = entries + i;
309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { 309 if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
310 seq_printf(m, "%4luD, %5d %-16s ", 310 seq_printf(m, "%4luD, %5d %-16s ",
311 entry->count, entry->pid, entry->comm); 311 entry->count, entry->pid, entry->comm);
312 } else { 312 } else {
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13db3d1..6582b82fa966 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1093 unsigned long data) 1093 unsigned long data)
1094{ 1094{
1095 int preempt_count = preempt_count(); 1095 int count = preempt_count();
1096 1096
1097#ifdef CONFIG_LOCKDEP 1097#ifdef CONFIG_LOCKDEP
1098 /* 1098 /*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1119 1119
1120 lock_map_release(&lockdep_map); 1120 lock_map_release(&lockdep_map);
1121 1121
1122 if (preempt_count != preempt_count()) { 1122 if (count != preempt_count()) {
1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1124 fn, preempt_count, preempt_count()); 1124 fn, count, preempt_count());
1125 /* 1125 /*
1126 * Restore the preempt count. That gives us a decent 1126 * Restore the preempt count. That gives us a decent
1127 * chance to survive and extract information. If the 1127 * chance to survive and extract information. If the
1128 * callback kept a lock held, bad luck, but not worse 1128 * callback kept a lock held, bad luck, but not worse
1129 * than the BUG() we had. 1129 * than the BUG() we had.
1130 */ 1130 */
1131 preempt_count() = preempt_count; 1131 preempt_count_set(count);
1132 } 1132 }
1133} 1133}
1134 1134
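
The timer.c hunk above renames the local that shadowed preempt_count() and replaces the direct write with preempt_count_set(). A standalone sketch of the underlying pattern (snapshot a nesting counter around a callback, warn and restore it on imbalance); fake_preempt_count and buggy_timer_fn are made-up stand-ins for the kernel state:

#include <stdio.h>

static int fake_preempt_count;		/* stand-in for the kernel's preempt_count() */

static void buggy_timer_fn(unsigned long data)
{
	fake_preempt_count++;		/* e.g. a "disable" without a matching "enable" */
}

static void call_timer_fn(void (*fn)(unsigned long), unsigned long data)
{
	int count = fake_preempt_count;

	fn(data);

	if (count != fake_preempt_count) {
		fprintf(stderr, "timer callback leaked preempt count: %d -> %d\n",
			count, fake_preempt_count);
		fake_preempt_count = count;	/* preempt_count_set(count) in the patch */
	}
}

int main(void)
{
	call_timer_fn(buggy_timer_fn, 0);
	return 0;
}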
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7974ba20557d..d9fea7dfd5d3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1509,7 +1509,8 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1509#endif 1509#endif
1510 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | 1510 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
1511 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 1511 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
1512 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 1512 (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
1513 (test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
1513} 1514}
1514EXPORT_SYMBOL_GPL(tracing_generic_entry_update); 1515EXPORT_SYMBOL_GPL(tracing_generic_entry_update);
1515 1516
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 10c86fb7a2b4..73d08aa25b55 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -124,6 +124,7 @@ enum trace_flag_type {
124 TRACE_FLAG_NEED_RESCHED = 0x04, 124 TRACE_FLAG_NEED_RESCHED = 0x04,
125 TRACE_FLAG_HARDIRQ = 0x08, 125 TRACE_FLAG_HARDIRQ = 0x08,
126 TRACE_FLAG_SOFTIRQ = 0x10, 126 TRACE_FLAG_SOFTIRQ = 0x10,
127 TRACE_FLAG_PREEMPT_RESCHED = 0x20,
127}; 128};
128 129
129#define TRACE_BUF_SIZE 1024 130#define TRACE_BUF_SIZE 1024
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 80c36bcf66e8..78e27e3b52ac 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -26,7 +26,7 @@ static int perf_trace_event_perm(struct ftrace_event_call *tp_event,
26{ 26{
27 /* The ftrace function trace is allowed only for root. */ 27 /* The ftrace function trace is allowed only for root. */
28 if (ftrace_event_is_function(tp_event) && 28 if (ftrace_event_is_function(tp_event) &&
29 perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) 29 perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
30 return -EPERM; 30 return -EPERM;
31 31
32 /* No tracing, just counting, so no obvious leak */ 32 /* No tracing, just counting, so no obvious leak */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 34e7cbac0c9c..ed32284fbe32 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -618,8 +618,23 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : 618 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : 619 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
620 '.'; 620 '.';
621 need_resched = 621
622 (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'; 622 switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
623 TRACE_FLAG_PREEMPT_RESCHED)) {
624 case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
625 need_resched = 'N';
626 break;
627 case TRACE_FLAG_NEED_RESCHED:
628 need_resched = 'n';
629 break;
630 case TRACE_FLAG_PREEMPT_RESCHED:
631 need_resched = 'p';
632 break;
633 default:
634 need_resched = '.';
635 break;
636 }
637
623 hardsoft_irq = 638 hardsoft_irq =
624 (hardirq && softirq) ? 'H' : 639 (hardirq && softirq) ? 'H' :
625 hardirq ? 'h' : 640 hardirq ? 'h' :
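
The switch added above folds TRACE_FLAG_NEED_RESCHED and the new TRACE_FLAG_PREEMPT_RESCHED (defined in the trace.h hunk earlier) into a single latency-format character. A standalone sketch of that mapping:

#include <stdio.h>

#define TRACE_FLAG_NEED_RESCHED		0x04
#define TRACE_FLAG_PREEMPT_RESCHED	0x20

static char resched_char(unsigned int flags)
{
	switch (flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED)) {
	case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
		return 'N';	/* both the TIF flag and the folded preempt-count flag */
	case TRACE_FLAG_NEED_RESCHED:
		return 'n';	/* only the thread-info flag */
	case TRACE_FLAG_PREEMPT_RESCHED:
		return 'p';	/* only the preempt-count folded flag */
	default:
		return '.';
	}
}

int main(void)
{
	printf("%c %c %c %c\n",
	       resched_char(0x04 | 0x20), resched_char(0x04),
	       resched_char(0x20), resched_char(0));
	return 0;
}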