aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c3
-rw-r--r--kernel/audit_tree.c13
-rw-r--r--kernel/auditsc.c1
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/cgroup.c8
-rw-r--r--kernel/cpu.c41
-rw-r--r--kernel/cpuset.c18
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/exit.c40
-rw-r--r--kernel/fork.c30
-rw-r--r--kernel/futex.c117
-rw-r--r--kernel/hrtimer.c170
-rw-r--r--kernel/hw_breakpoint.c212
-rw-r--r--kernel/irq/autoprobe.c20
-rw-r--r--kernel/irq/chip.c86
-rw-r--r--kernel/irq/handle.c22
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/manage.c52
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/numa_migrate.c8
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/irq/spurious.c16
-rw-r--r--kernel/itimer.c7
-rw-r--r--kernel/kexec.c65
-rw-r--r--kernel/kfifo.c410
-rw-r--r--kernel/kgdb.c65
-rw-r--r--kernel/kmod.c12
-rw-r--r--kernel/kprobes.c2
-rw-r--r--kernel/ksysfs.c21
-rw-r--r--kernel/kthread.c23
-rw-r--r--kernel/lockdep.c49
-rw-r--r--kernel/module.c208
-rw-r--r--kernel/mutex-debug.h12
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/params.c8
-rw-r--r--kernel/perf_event.c246
-rw-r--r--kernel/pid.c12
-rw-r--r--kernel/pm_qos_params.c20
-rw-r--r--kernel/posix-cpu-timers.c5
-rw-r--r--kernel/power/console.c7
-rw-r--r--kernel/printk.c120
-rw-r--r--kernel/rcutorture.c8
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/resource.c30
-rw-r--r--kernel/rtmutex-debug.c4
-rw-r--r--kernel/rtmutex.c106
-rw-r--r--kernel/sched.c766
-rw-r--r--kernel/sched_clock.c23
-rw-r--r--kernel/sched_cpupri.c10
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_debug.c17
-rw-r--r--kernel/sched_fair.c210
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_idletask.c6
-rw-r--r--kernel/sched_rt.c66
-rw-r--r--kernel/signal.c66
-rw-r--r--kernel/smp.c37
-rw-r--r--kernel/softirq.c19
-rw-r--r--kernel/softlockup.c69
-rw-r--r--kernel/spinlock.c306
-rw-r--r--kernel/sys.c24
-rw-r--r--kernel/sysctl.c50
-rw-r--r--kernel/sysctl_binary.c38
-rw-r--r--kernel/time/clockevents.c46
-rw-r--r--kernel/time/clocksource.c117
-rw-r--r--kernel/time/tick-broadcast.c42
-rw-r--r--kernel/time/tick-common.c20
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/tick-oneshot.c4
-rw-r--r--kernel/time/tick-sched.c141
-rw-r--r--kernel/time/timecompare.c2
-rw-r--r--kernel/time/timekeeping.c104
-rw-r--r--kernel/time/timer_list.c25
-rw-r--r--kernel/time/timer_stats.c18
-rw-r--r--kernel/timer.c5
-rw-r--r--kernel/trace/Kconfig116
-rw-r--r--kernel/trace/ftrace.c36
-rw-r--r--kernel/trace/power-traces.c2
-rw-r--r--kernel/trace/ring_buffer.c73
-rw-r--r--kernel/trace/trace.c312
-rw-r--r--kernel/trace/trace.h27
-rw-r--r--kernel/trace/trace_clock.c8
-rw-r--r--kernel/trace/trace_event_profile.c6
-rw-r--r--kernel/trace/trace_events.c41
-rw-r--r--kernel/trace/trace_events_filter.c29
-rw-r--r--kernel/trace/trace_export.c11
-rw-r--r--kernel/trace/trace_functions_graph.c169
-rw-r--r--kernel/trace/trace_hw_branches.c51
-rw-r--r--kernel/trace/trace_irqsoff.c2
-rw-r--r--kernel/trace/trace_kprobe.c86
-rw-r--r--kernel/trace/trace_ksym.c193
-rw-r--r--kernel/trace/trace_output.c75
-rw-r--r--kernel/trace/trace_sched_wakeup.c16
-rw-r--r--kernel/trace/trace_selftest.c4
-rw-r--r--kernel/trace/trace_stack.c40
-rw-r--r--kernel/trace/trace_syscalls.c18
-rw-r--r--kernel/trace/trace_sysprof.c1
-rw-r--r--kernel/user-return-notifier.c6
-rw-r--r--kernel/workqueue.c131
100 files changed, 3642 insertions, 2373 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index 9a4715a2f6b..a6605ca921b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -536,7 +536,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
536 do_div(elapsed, AHZ); 536 do_div(elapsed, AHZ);
537 ac.ac_btime = get_seconds() - elapsed; 537 ac.ac_btime = get_seconds() - elapsed;
538 /* we really need to bite the bullet and change layout */ 538 /* we really need to bite the bullet and change layout */
539 current_uid_gid(&ac.ac_uid, &ac.ac_gid); 539 ac.ac_uid = orig_cred->uid;
540 ac.ac_gid = orig_cred->gid;
540#if ACCT_VERSION==2 541#if ACCT_VERSION==2
541 ac.ac_ahz = AHZ; 542 ac.ac_ahz = AHZ;
542#endif 543#endif
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2451dc6f328..4b05bd9479d 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -277,7 +277,7 @@ static void untag_chunk(struct node *p)
277 owner->root = NULL; 277 owner->root = NULL;
278 } 278 }
279 279
280 for (i = j = 0; i < size; i++, j++) { 280 for (i = j = 0; j <= size; i++, j++) {
281 struct audit_tree *s; 281 struct audit_tree *s;
282 if (&chunk->owners[j] == p) { 282 if (&chunk->owners[j] == p) {
283 list_del_init(&p->list); 283 list_del_init(&p->list);
@@ -290,7 +290,7 @@ static void untag_chunk(struct node *p)
290 if (!s) /* result of earlier fallback */ 290 if (!s) /* result of earlier fallback */
291 continue; 291 continue;
292 get_tree(s); 292 get_tree(s);
293 list_replace_init(&chunk->owners[i].list, &new->owners[j].list); 293 list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
294 } 294 }
295 295
296 list_replace_rcu(&chunk->hash, &new->hash); 296 list_replace_rcu(&chunk->hash, &new->hash);
@@ -373,15 +373,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
373 for (n = 0; n < old->count; n++) { 373 for (n = 0; n < old->count; n++) {
374 if (old->owners[n].owner == tree) { 374 if (old->owners[n].owner == tree) {
375 spin_unlock(&hash_lock); 375 spin_unlock(&hash_lock);
376 put_inotify_watch(watch); 376 put_inotify_watch(&old->watch);
377 return 0; 377 return 0;
378 } 378 }
379 } 379 }
380 spin_unlock(&hash_lock); 380 spin_unlock(&hash_lock);
381 381
382 chunk = alloc_chunk(old->count + 1); 382 chunk = alloc_chunk(old->count + 1);
383 if (!chunk) 383 if (!chunk) {
384 put_inotify_watch(&old->watch);
384 return -ENOMEM; 385 return -ENOMEM;
386 }
385 387
386 mutex_lock(&inode->inotify_mutex); 388 mutex_lock(&inode->inotify_mutex);
387 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 389 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
@@ -425,7 +427,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
425 spin_unlock(&hash_lock); 427 spin_unlock(&hash_lock);
426 inotify_evict_watch(&old->watch); 428 inotify_evict_watch(&old->watch);
427 mutex_unlock(&inode->inotify_mutex); 429 mutex_unlock(&inode->inotify_mutex);
428 put_inotify_watch(&old->watch); 430 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
431 put_inotify_watch(&old->watch); /* and kill it */
429 return 0; 432 return 0;
430} 433}
431 434
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 267e484f019..fc0f928167e 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -250,7 +250,6 @@ struct audit_context {
250#endif 250#endif
251}; 251};
252 252
253#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
254static inline int open_arg(int flags, int mask) 253static inline int open_arg(int flags, int mask)
255{ 254{
256 int n = ACC_MODE(flags); 255 int n = ACC_MODE(flags);
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 3c530138183..98a51f26c13 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -12,7 +12,7 @@
12 12
13void foo(void) 13void foo(void)
14{ 14{
15 /* The enum constants to put into include/linux/bounds.h */ 15 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
18 /* End of constants */ 18 /* End of constants */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0249f4be9b5..aa3bee56644 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2468,7 +2468,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2468 /* make sure l doesn't vanish out from under us */ 2468 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2469 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2470 mutex_unlock(&cgrp->pidlist_mutex);
2471 l->use_count++;
2472 return l; 2471 return l;
2473 } 2472 }
2474 } 2473 }
@@ -2937,14 +2936,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2937 2936
2938 for_each_subsys(root, ss) { 2937 for_each_subsys(root, ss) {
2939 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 2938 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2939
2940 if (IS_ERR(css)) { 2940 if (IS_ERR(css)) {
2941 err = PTR_ERR(css); 2941 err = PTR_ERR(css);
2942 goto err_destroy; 2942 goto err_destroy;
2943 } 2943 }
2944 init_cgroup_css(css, ss, cgrp); 2944 init_cgroup_css(css, ss, cgrp);
2945 if (ss->use_id) 2945 if (ss->use_id) {
2946 if (alloc_css_id(ss, parent, cgrp)) 2946 err = alloc_css_id(ss, parent, cgrp);
2947 if (err)
2947 goto err_destroy; 2948 goto err_destroy;
2949 }
2948 /* At error, ->destroy() callback has to free assigned ID. */ 2950 /* At error, ->destroy() callback has to free assigned ID. */
2949 } 2951 }
2950 2952
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6ba0f1ecb21..677f25376a3 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu)
151 151
152 write_lock_irq(&tasklist_lock); 152 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 153 for_each_process(p) {
154 if (task_cpu(p) == cpu && 154 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 155 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 156 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 158 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 159 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 160 p->state, p->flags);
161 } 161 }
162 write_unlock_irq(&tasklist_lock); 162 write_unlock_irq(&tasklist_lock);
163} 163}
@@ -209,9 +209,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
209 return -ENOMEM; 209 return -ENOMEM;
210 210
211 cpu_hotplug_begin(); 211 cpu_hotplug_begin();
212 set_cpu_active(cpu, false);
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 213 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 214 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 215 if (err == NOTIFY_BAD) {
216 set_cpu_active(cpu, true);
217
215 nr_calls--; 218 nr_calls--;
216 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 219 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
217 hcpu, nr_calls, NULL); 220 hcpu, nr_calls, NULL);
@@ -223,11 +226,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 226
224 /* Ensure that we are not runnable on dying cpu */ 227 /* Ensure that we are not runnable on dying cpu */
225 cpumask_copy(old_allowed, &current->cpus_allowed); 228 cpumask_copy(old_allowed, &current->cpus_allowed);
226 set_cpus_allowed_ptr(current, 229 set_cpus_allowed_ptr(current, cpu_active_mask);
227 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
228 230
229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 231 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
230 if (err) { 232 if (err) {
233 set_cpu_active(cpu, true);
231 /* CPU didn't die: tell everyone. Can't complain. */ 234 /* CPU didn't die: tell everyone. Can't complain. */
232 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
233 hcpu) == NOTIFY_BAD) 236 hcpu) == NOTIFY_BAD)
@@ -278,23 +281,8 @@ int __ref cpu_down(unsigned int cpu)
278 goto out; 281 goto out;
279 } 282 }
280 283
281 set_cpu_active(cpu, false);
282
283 /*
284 * Make sure the all cpus did the reschedule and are not
285 * using stale version of the cpu_active_mask.
286 * This is not strictly necessary becuase stop_machine()
287 * that we run down the line already provides the required
288 * synchronization. But it's really a side effect and we do not
289 * want to depend on the innards of the stop_machine here.
290 */
291 synchronize_sched();
292
293 err = _cpu_down(cpu, 0); 284 err = _cpu_down(cpu, 0);
294 285
295 if (cpu_online(cpu))
296 set_cpu_active(cpu, true);
297
298out: 286out:
299 cpu_maps_update_done(); 287 cpu_maps_update_done();
300 stop_machine_destroy(); 288 stop_machine_destroy();
@@ -383,19 +371,20 @@ int disable_nonboot_cpus(void)
383 return error; 371 return error;
384 cpu_maps_update_begin(); 372 cpu_maps_update_begin();
385 first_cpu = cpumask_first(cpu_online_mask); 373 first_cpu = cpumask_first(cpu_online_mask);
386 /* We take down all of the non-boot CPUs in one shot to avoid races 374 /*
375 * We take down all of the non-boot CPUs in one shot to avoid races
387 * with the userspace trying to use the CPU hotplug at the same time 376 * with the userspace trying to use the CPU hotplug at the same time
388 */ 377 */
389 cpumask_clear(frozen_cpus); 378 cpumask_clear(frozen_cpus);
379
390 printk("Disabling non-boot CPUs ...\n"); 380 printk("Disabling non-boot CPUs ...\n");
391 for_each_online_cpu(cpu) { 381 for_each_online_cpu(cpu) {
392 if (cpu == first_cpu) 382 if (cpu == first_cpu)
393 continue; 383 continue;
394 error = _cpu_down(cpu, 1); 384 error = _cpu_down(cpu, 1);
395 if (!error) { 385 if (!error)
396 cpumask_set_cpu(cpu, frozen_cpus); 386 cpumask_set_cpu(cpu, frozen_cpus);
397 printk("CPU%d is down\n", cpu); 387 else {
398 } else {
399 printk(KERN_ERR "Error taking CPU%d down: %d\n", 388 printk(KERN_ERR "Error taking CPU%d down: %d\n",
400 cpu, error); 389 cpu, error);
401 break; 390 break;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3cf2183b472..ba401fab459 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -737,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
737{ 737{
738} 738}
739 739
740static int generate_sched_domains(struct cpumask **domains, 740static int generate_sched_domains(cpumask_var_t **domains,
741 struct sched_domain_attr **attributes) 741 struct sched_domain_attr **attributes)
742{ 742{
743 *domains = NULL; 743 *domains = NULL;
@@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
872 if (retval < 0) 872 if (retval < 0)
873 return retval; 873 return retval;
874 874
875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) 875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
876 return -EINVAL; 876 return -EINVAL;
877 } 877 }
878 retval = validate_change(cs, trialcs); 878 retval = validate_change(cs, trialcs);
@@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2010 } 2010 }
2011 2011
2012 /* Continue past cpusets with all cpus, mems online */ 2012 /* Continue past cpusets with all cpus, mems online */
2013 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && 2013 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2015 continue;
2016 2016
@@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2019 /* Remove offline cpus and mems from this cpuset. */ 2019 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2020 mutex_lock(&callback_mutex);
2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2022 cpu_online_mask); 2022 cpu_active_mask);
2023 nodes_and(cp->mems_allowed, cp->mems_allowed, 2023 nodes_and(cp->mems_allowed, cp->mems_allowed,
2024 node_states[N_HIGH_MEMORY]); 2024 node_states[N_HIGH_MEMORY]);
2025 mutex_unlock(&callback_mutex); 2025 mutex_unlock(&callback_mutex);
@@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2057 switch (phase) { 2057 switch (phase) {
2058 case CPU_ONLINE: 2058 case CPU_ONLINE:
2059 case CPU_ONLINE_FROZEN: 2059 case CPU_ONLINE_FROZEN:
2060 case CPU_DEAD: 2060 case CPU_DOWN_PREPARE:
2061 case CPU_DEAD_FROZEN: 2061 case CPU_DOWN_PREPARE_FROZEN:
2062 case CPU_DOWN_FAILED:
2063 case CPU_DOWN_FAILED_FROZEN:
2062 break; 2064 break;
2063 2065
2064 default: 2066 default:
@@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2067 2069
2068 cgroup_lock(); 2070 cgroup_lock();
2069 mutex_lock(&callback_mutex); 2071 mutex_lock(&callback_mutex);
2070 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2071 mutex_unlock(&callback_mutex); 2073 mutex_unlock(&callback_mutex);
2072 scan_for_empty_cpusets(&top_cpuset); 2074 scan_for_empty_cpusets(&top_cpuset);
2073 ndoms = generate_sched_domains(&doms, &attr); 2075 ndoms = generate_sched_domains(&doms, &attr);
@@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2114 2116
2115void __init cpuset_init_smp(void) 2117void __init cpuset_init_smp(void)
2116{ 2118{
2117 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2119 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2118 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2120 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2119 2121
2120 hotcpu_notifier(cpuset_track_online_cpus, 0); 2122 hotcpu_notifier(cpuset_track_online_cpus, 0);
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b..1ed8ca18790 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 226 if (!new->tgcred) {
227 kfree(new); 227 kmem_cache_free(cred_jar, new);
228 return NULL; 228 return NULL;
229 } 229 }
230 atomic_set(&new->tgcred->usage, 1); 230 atomic_set(&new->tgcred->usage, 1);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1143012951e..546774a31a6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,10 +68,10 @@ static void __unhash_process(struct task_struct *p)
68 detach_pid(p, PIDTYPE_SID); 68 detach_pid(p, PIDTYPE_SID);
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling);
71 __get_cpu_var(process_counts)--; 72 __get_cpu_var(process_counts)--;
72 } 73 }
73 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
74 list_del_init(&p->sibling);
75} 75}
76 76
77/* 77/*
@@ -736,12 +736,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
736/* 736/*
737* Any that need to be release_task'd are put on the @dead list. 737* Any that need to be release_task'd are put on the @dead list.
738 */ 738 */
739static void reparent_thread(struct task_struct *father, struct task_struct *p, 739static void reparent_leader(struct task_struct *father, struct task_struct *p,
740 struct list_head *dead) 740 struct list_head *dead)
741{ 741{
742 if (p->pdeath_signal)
743 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
744
745 list_move_tail(&p->sibling, &p->real_parent->children); 742 list_move_tail(&p->sibling, &p->real_parent->children);
746 743
747 if (task_detached(p)) 744 if (task_detached(p))
@@ -780,12 +777,18 @@ static void forget_original_parent(struct task_struct *father)
780 reaper = find_new_reaper(father); 777 reaper = find_new_reaper(father);
781 778
782 list_for_each_entry_safe(p, n, &father->children, sibling) { 779 list_for_each_entry_safe(p, n, &father->children, sibling) {
783 p->real_parent = reaper; 780 struct task_struct *t = p;
784 if (p->parent == father) { 781 do {
785 BUG_ON(task_ptrace(p)); 782 t->real_parent = reaper;
786 p->parent = p->real_parent; 783 if (t->parent == father) {
787 } 784 BUG_ON(task_ptrace(t));
788 reparent_thread(father, p, &dead_children); 785 t->parent = t->real_parent;
786 }
787 if (t->pdeath_signal)
788 group_send_sig_info(t->pdeath_signal,
789 SEND_SIG_NOINFO, t);
790 } while_each_thread(p, t);
791 reparent_leader(father, p, &dead_children);
789 } 792 }
790 write_unlock_irq(&tasklist_lock); 793 write_unlock_irq(&tasklist_lock);
791 794
@@ -933,7 +936,7 @@ NORET_TYPE void do_exit(long code)
933 * an exiting task cleaning up the robust pi futexes. 936 * an exiting task cleaning up the robust pi futexes.
934 */ 937 */
935 smp_mb(); 938 smp_mb();
936 spin_unlock_wait(&tsk->pi_lock); 939 raw_spin_unlock_wait(&tsk->pi_lock);
937 940
938 if (unlikely(in_atomic())) 941 if (unlikely(in_atomic()))
939 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 942 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -971,7 +974,7 @@ NORET_TYPE void do_exit(long code)
971 exit_thread(); 974 exit_thread();
972 cgroup_exit(tsk, 1); 975 cgroup_exit(tsk, 1);
973 976
974 if (group_dead && tsk->signal->leader) 977 if (group_dead)
975 disassociate_ctty(1); 978 disassociate_ctty(1);
976 979
977 module_put(task_thread_info(tsk)->exec_domain->module); 980 module_put(task_thread_info(tsk)->exec_domain->module);
@@ -1551,14 +1554,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1551 struct task_struct *p; 1554 struct task_struct *p;
1552 1555
1553 list_for_each_entry(p, &tsk->children, sibling) { 1556 list_for_each_entry(p, &tsk->children, sibling) {
1554 /* 1557 int ret = wait_consider_task(wo, 0, p);
1555 * Do not consider detached threads. 1558 if (ret)
1556 */ 1559 return ret;
1557 if (!task_detached(p)) {
1558 int ret = wait_consider_task(wo, 0, p);
1559 if (ret)
1560 return ret;
1561 }
1562 } 1560 }
1563 1561
1564 return 0; 1562 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1415dc4598a..f88bd984df3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -939,9 +939,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
939 939
940static void rt_mutex_init_task(struct task_struct *p) 940static void rt_mutex_init_task(struct task_struct *p)
941{ 941{
942 spin_lock_init(&p->pi_lock); 942 raw_spin_lock_init(&p->pi_lock);
943#ifdef CONFIG_RT_MUTEXES 943#ifdef CONFIG_RT_MUTEXES
944 plist_head_init(&p->pi_waiters, &p->pi_lock); 944 plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
945 p->pi_blocked_on = NULL; 945 p->pi_blocked_on = NULL;
946#endif 946#endif
947} 947}
@@ -1127,6 +1127,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1127#ifdef CONFIG_DEBUG_MUTEXES 1127#ifdef CONFIG_DEBUG_MUTEXES
1128 p->blocked_on = NULL; /* not blocked yet */ 1128 p->blocked_on = NULL; /* not blocked yet */
1129#endif 1129#endif
1130#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1131 p->memcg_batch.do_batch = 0;
1132 p->memcg_batch.memcg = NULL;
1133#endif
1130 1134
1131 p->bts = NULL; 1135 p->bts = NULL;
1132 1136
@@ -1206,9 +1210,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1206 p->sas_ss_sp = p->sas_ss_size = 0; 1210 p->sas_ss_sp = p->sas_ss_size = 0;
1207 1211
1208 /* 1212 /*
1209 * Syscall tracing should be turned off in the child regardless 1213 * Syscall tracing and stepping should be turned off in the
1210 * of CLONE_PTRACE. 1214 * child regardless of CLONE_PTRACE.
1211 */ 1215 */
1216 user_disable_single_step(p);
1212 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 1217 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1213#ifdef TIF_SYSCALL_EMU 1218#ifdef TIF_SYSCALL_EMU
1214 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1219 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
@@ -1236,21 +1241,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1236 /* Need tasklist lock for parent etc handling! */ 1241 /* Need tasklist lock for parent etc handling! */
1237 write_lock_irq(&tasklist_lock); 1242 write_lock_irq(&tasklist_lock);
1238 1243
1239 /*
1240 * The task hasn't been attached yet, so its cpus_allowed mask will
1241 * not be changed, nor will its assigned CPU.
1242 *
1243 * The cpus_allowed mask of the parent may have changed after it was
1244 * copied first time - so re-copy it here, then check the child's CPU
1245 * to ensure it is on a valid CPU (and if not, just force it back to
1246 * parent's CPU). This avoids alot of nasty races.
1247 */
1248 p->cpus_allowed = current->cpus_allowed;
1249 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1250 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1251 !cpu_online(task_cpu(p))))
1252 set_task_cpu(p, smp_processor_id());
1253
1254 /* CLONE_PARENT re-uses the old parent */ 1244 /* CLONE_PARENT re-uses the old parent */
1255 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1245 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1256 p->real_parent = current->real_parent; 1246 p->real_parent = current->real_parent;
@@ -1286,7 +1276,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1286 } 1276 }
1287 1277
1288 if (likely(p->pid)) { 1278 if (likely(p->pid)) {
1289 list_add_tail(&p->sibling, &p->real_parent->children);
1290 tracehook_finish_clone(p, clone_flags, trace); 1279 tracehook_finish_clone(p, clone_flags, trace);
1291 1280
1292 if (thread_group_leader(p)) { 1281 if (thread_group_leader(p)) {
@@ -1298,6 +1287,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1298 p->signal->tty = tty_kref_get(current->signal->tty); 1287 p->signal->tty = tty_kref_get(current->signal->tty);
1299 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1288 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1300 attach_pid(p, PIDTYPE_SID, task_session(current)); 1289 attach_pid(p, PIDTYPE_SID, task_session(current));
1290 list_add_tail(&p->sibling, &p->real_parent->children);
1301 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1291 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1302 __get_cpu_var(process_counts)++; 1292 __get_cpu_var(process_counts)++;
1303 } 1293 }
diff --git a/kernel/futex.c b/kernel/futex.c
index fb65e822fc4..e7a35f1039e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
203 * @uaddr: virtual address of the futex 203 * @uaddr: virtual address of the futex
204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
205 * @key: address where result is stored. 205 * @key: address where result is stored.
206 * @rw: mapping needs to be read/write (values: VERIFY_READ,
207 * VERIFY_WRITE)
208 * 206 *
209 * Returns a negative error code or 0 207 * Returns a negative error code or 0
210 * The key words are stored in *key on success. 208 * The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
216 * lock_page() might sleep, the caller should not hold a spinlock. 214 * lock_page() might sleep, the caller should not hold a spinlock.
217 */ 215 */
218static int 216static int
219get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 217get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
220{ 218{
221 unsigned long address = (unsigned long)uaddr; 219 unsigned long address = (unsigned long)uaddr;
222 struct mm_struct *mm = current->mm; 220 struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
239 * but access_ok() should be faster than find_vma() 237 * but access_ok() should be faster than find_vma()
240 */ 238 */
241 if (!fshared) { 239 if (!fshared) {
242 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 240 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
243 return -EFAULT; 241 return -EFAULT;
244 key->private.mm = mm; 242 key->private.mm = mm;
245 key->private.address = address; 243 key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
248 } 246 }
249 247
250again: 248again:
251 err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); 249 err = get_user_pages_fast(address, 1, 1, &page);
252 if (err < 0) 250 if (err < 0)
253 return err; 251 return err;
254 252
@@ -304,8 +302,14 @@ void put_futex_key(int fshared, union futex_key *key)
304 */ 302 */
305static int fault_in_user_writeable(u32 __user *uaddr) 303static int fault_in_user_writeable(u32 __user *uaddr)
306{ 304{
307 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, 305 struct mm_struct *mm = current->mm;
308 1, 1, 0, NULL, NULL); 306 int ret;
307
308 down_read(&mm->mmap_sem);
309 ret = get_user_pages(current, mm, (unsigned long)uaddr,
310 1, 1, 0, NULL, NULL);
311 up_read(&mm->mmap_sem);
312
309 return ret < 0 ? ret : 0; 313 return ret < 0 ? ret : 0;
310} 314}
311 315
@@ -397,9 +401,9 @@ static void free_pi_state(struct futex_pi_state *pi_state)
397 * and has cleaned up the pi_state already 401 * and has cleaned up the pi_state already
398 */ 402 */
399 if (pi_state->owner) { 403 if (pi_state->owner) {
400 spin_lock_irq(&pi_state->owner->pi_lock); 404 raw_spin_lock_irq(&pi_state->owner->pi_lock);
401 list_del_init(&pi_state->list); 405 list_del_init(&pi_state->list);
402 spin_unlock_irq(&pi_state->owner->pi_lock); 406 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
403 407
404 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 408 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
405 } 409 }
@@ -464,18 +468,18 @@ void exit_pi_state_list(struct task_struct *curr)
464 * pi_state_list anymore, but we have to be careful 468 * pi_state_list anymore, but we have to be careful
465 * versus waiters unqueueing themselves: 469 * versus waiters unqueueing themselves:
466 */ 470 */
467 spin_lock_irq(&curr->pi_lock); 471 raw_spin_lock_irq(&curr->pi_lock);
468 while (!list_empty(head)) { 472 while (!list_empty(head)) {
469 473
470 next = head->next; 474 next = head->next;
471 pi_state = list_entry(next, struct futex_pi_state, list); 475 pi_state = list_entry(next, struct futex_pi_state, list);
472 key = pi_state->key; 476 key = pi_state->key;
473 hb = hash_futex(&key); 477 hb = hash_futex(&key);
474 spin_unlock_irq(&curr->pi_lock); 478 raw_spin_unlock_irq(&curr->pi_lock);
475 479
476 spin_lock(&hb->lock); 480 spin_lock(&hb->lock);
477 481
478 spin_lock_irq(&curr->pi_lock); 482 raw_spin_lock_irq(&curr->pi_lock);
479 /* 483 /*
480 * We dropped the pi-lock, so re-check whether this 484 * We dropped the pi-lock, so re-check whether this
481 * task still owns the PI-state: 485 * task still owns the PI-state:
@@ -489,15 +493,15 @@ void exit_pi_state_list(struct task_struct *curr)
489 WARN_ON(list_empty(&pi_state->list)); 493 WARN_ON(list_empty(&pi_state->list));
490 list_del_init(&pi_state->list); 494 list_del_init(&pi_state->list);
491 pi_state->owner = NULL; 495 pi_state->owner = NULL;
492 spin_unlock_irq(&curr->pi_lock); 496 raw_spin_unlock_irq(&curr->pi_lock);
493 497
494 rt_mutex_unlock(&pi_state->pi_mutex); 498 rt_mutex_unlock(&pi_state->pi_mutex);
495 499
496 spin_unlock(&hb->lock); 500 spin_unlock(&hb->lock);
497 501
498 spin_lock_irq(&curr->pi_lock); 502 raw_spin_lock_irq(&curr->pi_lock);
499 } 503 }
500 spin_unlock_irq(&curr->pi_lock); 504 raw_spin_unlock_irq(&curr->pi_lock);
501} 505}
502 506
503static int 507static int
@@ -526,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
526 return -EINVAL; 530 return -EINVAL;
527 531
528 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
529 WARN_ON(pid && pi_state->owner && 533
530 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
531 552
532 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
533 *ps = pi_state; 554 *ps = pi_state;
@@ -552,7 +573,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
552 * change of the task flags, we do this protected by 573 * change of the task flags, we do this protected by
553 * p->pi_lock: 574 * p->pi_lock:
554 */ 575 */
555 spin_lock_irq(&p->pi_lock); 576 raw_spin_lock_irq(&p->pi_lock);
556 if (unlikely(p->flags & PF_EXITING)) { 577 if (unlikely(p->flags & PF_EXITING)) {
557 /* 578 /*
558 * The task is on the way out. When PF_EXITPIDONE is 579 * The task is on the way out. When PF_EXITPIDONE is
@@ -561,7 +582,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
561 */ 582 */
562 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 583 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
563 584
564 spin_unlock_irq(&p->pi_lock); 585 raw_spin_unlock_irq(&p->pi_lock);
565 put_task_struct(p); 586 put_task_struct(p);
566 return ret; 587 return ret;
567 } 588 }
@@ -580,7 +601,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
580 WARN_ON(!list_empty(&pi_state->list)); 601 WARN_ON(!list_empty(&pi_state->list));
581 list_add(&pi_state->list, &p->pi_state_list); 602 list_add(&pi_state->list, &p->pi_state_list);
582 pi_state->owner = p; 603 pi_state->owner = p;
583 spin_unlock_irq(&p->pi_lock); 604 raw_spin_unlock_irq(&p->pi_lock);
584 605
585 put_task_struct(p); 606 put_task_struct(p);
586 607
@@ -754,7 +775,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
754 if (!pi_state) 775 if (!pi_state)
755 return -EINVAL; 776 return -EINVAL;
756 777
757 spin_lock(&pi_state->pi_mutex.wait_lock); 778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
758 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
759 787
760 /* 788 /*
@@ -783,23 +811,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
783 else if (curval != uval) 811 else if (curval != uval)
784 ret = -EINVAL; 812 ret = -EINVAL;
785 if (ret) { 813 if (ret) {
786 spin_unlock(&pi_state->pi_mutex.wait_lock); 814 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
787 return ret; 815 return ret;
788 } 816 }
789 } 817 }
790 818
791 spin_lock_irq(&pi_state->owner->pi_lock); 819 raw_spin_lock_irq(&pi_state->owner->pi_lock);
792 WARN_ON(list_empty(&pi_state->list)); 820 WARN_ON(list_empty(&pi_state->list));
793 list_del_init(&pi_state->list); 821 list_del_init(&pi_state->list);
794 spin_unlock_irq(&pi_state->owner->pi_lock); 822 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
795 823
796 spin_lock_irq(&new_owner->pi_lock); 824 raw_spin_lock_irq(&new_owner->pi_lock);
797 WARN_ON(!list_empty(&pi_state->list)); 825 WARN_ON(!list_empty(&pi_state->list));
798 list_add(&pi_state->list, &new_owner->pi_state_list); 826 list_add(&pi_state->list, &new_owner->pi_state_list);
799 pi_state->owner = new_owner; 827 pi_state->owner = new_owner;
800 spin_unlock_irq(&new_owner->pi_lock); 828 raw_spin_unlock_irq(&new_owner->pi_lock);
801 829
802 spin_unlock(&pi_state->pi_mutex.wait_lock); 830 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
803 rt_mutex_unlock(&pi_state->pi_mutex); 831 rt_mutex_unlock(&pi_state->pi_mutex);
804 832
805 return 0; 833 return 0;
@@ -861,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
861 if (!bitset) 889 if (!bitset)
862 return -EINVAL; 890 return -EINVAL;
863 891
864 ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); 892 ret = get_futex_key(uaddr, fshared, &key);
865 if (unlikely(ret != 0)) 893 if (unlikely(ret != 0))
866 goto out; 894 goto out;
867 895
@@ -907,10 +935,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
907 int ret, op_ret; 935 int ret, op_ret;
908 936
909retry: 937retry:
910 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 938 ret = get_futex_key(uaddr1, fshared, &key1);
911 if (unlikely(ret != 0)) 939 if (unlikely(ret != 0))
912 goto out; 940 goto out;
913 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 941 ret = get_futex_key(uaddr2, fshared, &key2);
914 if (unlikely(ret != 0)) 942 if (unlikely(ret != 0))
915 goto out_put_key1; 943 goto out_put_key1;
916 944
@@ -1004,7 +1032,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1004 plist_add(&q->list, &hb2->chain); 1032 plist_add(&q->list, &hb2->chain);
1005 q->lock_ptr = &hb2->lock; 1033 q->lock_ptr = &hb2->lock;
1006#ifdef CONFIG_DEBUG_PI_LIST 1034#ifdef CONFIG_DEBUG_PI_LIST
1007 q->list.plist.lock = &hb2->lock; 1035 q->list.plist.spinlock = &hb2->lock;
1008#endif 1036#endif
1009 } 1037 }
1010 get_futex_key_refs(key2); 1038 get_futex_key_refs(key2);
@@ -1040,7 +1068,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1040 1068
1041 q->lock_ptr = &hb->lock; 1069 q->lock_ptr = &hb->lock;
1042#ifdef CONFIG_DEBUG_PI_LIST 1070#ifdef CONFIG_DEBUG_PI_LIST
1043 q->list.plist.lock = &hb->lock; 1071 q->list.plist.spinlock = &hb->lock;
1044#endif 1072#endif
1045 1073
1046 wake_up_state(q->task, TASK_NORMAL); 1074 wake_up_state(q->task, TASK_NORMAL);
@@ -1169,11 +1197,10 @@ retry:
1169 pi_state = NULL; 1197 pi_state = NULL;
1170 } 1198 }
1171 1199
1172 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1200 ret = get_futex_key(uaddr1, fshared, &key1);
1173 if (unlikely(ret != 0)) 1201 if (unlikely(ret != 0))
1174 goto out; 1202 goto out;
1175 ret = get_futex_key(uaddr2, fshared, &key2, 1203 ret = get_futex_key(uaddr2, fshared, &key2);
1176 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1177 if (unlikely(ret != 0)) 1204 if (unlikely(ret != 0))
1178 goto out_put_key1; 1205 goto out_put_key1;
1179 1206
@@ -1388,7 +1415,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1388 1415
1389 plist_node_init(&q->list, prio); 1416 plist_node_init(&q->list, prio);
1390#ifdef CONFIG_DEBUG_PI_LIST 1417#ifdef CONFIG_DEBUG_PI_LIST
1391 q->list.plist.lock = &hb->lock; 1418 q->list.plist.spinlock = &hb->lock;
1392#endif 1419#endif
1393 plist_add(&q->list, &hb->chain); 1420 plist_add(&q->list, &hb->chain);
1394 q->task = current; 1421 q->task = current;
@@ -1523,18 +1550,18 @@ retry:
1523 * itself. 1550 * itself.
1524 */ 1551 */
1525 if (pi_state->owner != NULL) { 1552 if (pi_state->owner != NULL) {
1526 spin_lock_irq(&pi_state->owner->pi_lock); 1553 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1527 WARN_ON(list_empty(&pi_state->list)); 1554 WARN_ON(list_empty(&pi_state->list));
1528 list_del_init(&pi_state->list); 1555 list_del_init(&pi_state->list);
1529 spin_unlock_irq(&pi_state->owner->pi_lock); 1556 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1530 } 1557 }
1531 1558
1532 pi_state->owner = newowner; 1559 pi_state->owner = newowner;
1533 1560
1534 spin_lock_irq(&newowner->pi_lock); 1561 raw_spin_lock_irq(&newowner->pi_lock);
1535 WARN_ON(!list_empty(&pi_state->list)); 1562 WARN_ON(!list_empty(&pi_state->list));
1536 list_add(&pi_state->list, &newowner->pi_state_list); 1563 list_add(&pi_state->list, &newowner->pi_state_list);
1537 spin_unlock_irq(&newowner->pi_lock); 1564 raw_spin_unlock_irq(&newowner->pi_lock);
1538 return 0; 1565 return 0;
1539 1566
1540 /* 1567 /*
@@ -1732,7 +1759,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1732 */ 1759 */
1733retry: 1760retry:
1734 q->key = FUTEX_KEY_INIT; 1761 q->key = FUTEX_KEY_INIT;
1735 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); 1762 ret = get_futex_key(uaddr, fshared, &q->key);
1736 if (unlikely(ret != 0)) 1763 if (unlikely(ret != 0))
1737 return ret; 1764 return ret;
1738 1765
@@ -1898,7 +1925,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1898 q.requeue_pi_key = NULL; 1925 q.requeue_pi_key = NULL;
1899retry: 1926retry:
1900 q.key = FUTEX_KEY_INIT; 1927 q.key = FUTEX_KEY_INIT;
1901 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1928 ret = get_futex_key(uaddr, fshared, &q.key);
1902 if (unlikely(ret != 0)) 1929 if (unlikely(ret != 0))
1903 goto out; 1930 goto out;
1904 1931
@@ -1968,7 +1995,7 @@ retry_private:
1968 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1969 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1970 1997
1971 goto out; 1998 goto out_put_key;
1972 1999
1973out_unlock_put_key: 2000out_unlock_put_key:
1974 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
@@ -2017,7 +2044,7 @@ retry:
2017 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2044 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2018 return -EPERM; 2045 return -EPERM;
2019 2046
2020 ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); 2047 ret = get_futex_key(uaddr, fshared, &key);
2021 if (unlikely(ret != 0)) 2048 if (unlikely(ret != 0))
2022 goto out; 2049 goto out;
2023 2050
@@ -2209,7 +2236,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2209 rt_waiter.task = NULL; 2236 rt_waiter.task = NULL;
2210 2237
2211 key2 = FUTEX_KEY_INIT; 2238 key2 = FUTEX_KEY_INIT;
2212 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2239 ret = get_futex_key(uaddr2, fshared, &key2);
2213 if (unlikely(ret != 0)) 2240 if (unlikely(ret != 0))
2214 goto out; 2241 goto out;
2215 2242
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 3e1c36e7998..0086628b6e9 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -127,11 +127,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
127 for (;;) { 127 for (;;) {
128 base = timer->base; 128 base = timer->base;
129 if (likely(base != NULL)) { 129 if (likely(base != NULL)) {
130 spin_lock_irqsave(&base->cpu_base->lock, *flags); 130 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
131 if (likely(base == timer->base)) 131 if (likely(base == timer->base))
132 return base; 132 return base;
133 /* The timer has migrated to another CPU: */ 133 /* The timer has migrated to another CPU: */
134 spin_unlock_irqrestore(&base->cpu_base->lock, *flags); 134 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
135 } 135 }
136 cpu_relax(); 136 cpu_relax();
137 } 137 }
@@ -208,13 +208,13 @@ again:
208 208
209 /* See the comment in lock_timer_base() */ 209 /* See the comment in lock_timer_base() */
210 timer->base = NULL; 210 timer->base = NULL;
211 spin_unlock(&base->cpu_base->lock); 211 raw_spin_unlock(&base->cpu_base->lock);
212 spin_lock(&new_base->cpu_base->lock); 212 raw_spin_lock(&new_base->cpu_base->lock);
213 213
214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { 214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
215 cpu = this_cpu; 215 cpu = this_cpu;
216 spin_unlock(&new_base->cpu_base->lock); 216 raw_spin_unlock(&new_base->cpu_base->lock);
217 spin_lock(&base->cpu_base->lock); 217 raw_spin_lock(&base->cpu_base->lock);
218 timer->base = base; 218 timer->base = base;
219 goto again; 219 goto again;
220 } 220 }
@@ -230,7 +230,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
230{ 230{
231 struct hrtimer_clock_base *base = timer->base; 231 struct hrtimer_clock_base *base = timer->base;
232 232
233 spin_lock_irqsave(&base->cpu_base->lock, *flags); 233 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
234 234
235 return base; 235 return base;
236} 236}
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
557static int hrtimer_reprogram(struct hrtimer *timer, 557static int hrtimer_reprogram(struct hrtimer *timer,
558 struct hrtimer_clock_base *base) 558 struct hrtimer_clock_base *base)
559{ 559{
560 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 560 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
562 int res; 562 int res;
563 563
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
582 if (expires.tv64 < 0) 582 if (expires.tv64 < 0)
583 return -ETIME; 583 return -ETIME;
584 584
585 if (expires.tv64 >= expires_next->tv64) 585 if (expires.tv64 >= cpu_base->expires_next.tv64)
586 return 0;
587
588 /*
589 * If a hang was detected in the last timer interrupt then we
590 * do not schedule a timer which is earlier than the expiry
591 * which we enforced in the hang detection. We want the system
592 * to make progress.
593 */
594 if (cpu_base->hang_detected)
586 return 0; 595 return 0;
587 596
588 /* 597 /*
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
590 */ 599 */
591 res = tick_program_event(expires, 0); 600 res = tick_program_event(expires, 0);
592 if (!IS_ERR_VALUE(res)) 601 if (!IS_ERR_VALUE(res))
593 *expires_next = expires; 602 cpu_base->expires_next = expires;
594 return res; 603 return res;
595} 604}
596 605
@@ -619,12 +628,12 @@ static void retrigger_next_event(void *arg)
619 base = &__get_cpu_var(hrtimer_bases); 628 base = &__get_cpu_var(hrtimer_bases);
620 629
621 /* Adjust CLOCK_REALTIME offset */ 630 /* Adjust CLOCK_REALTIME offset */
622 spin_lock(&base->lock); 631 raw_spin_lock(&base->lock);
623 base->clock_base[CLOCK_REALTIME].offset = 632 base->clock_base[CLOCK_REALTIME].offset =
624 timespec_to_ktime(realtime_offset); 633 timespec_to_ktime(realtime_offset);
625 634
626 hrtimer_force_reprogram(base, 0); 635 hrtimer_force_reprogram(base, 0);
627 spin_unlock(&base->lock); 636 raw_spin_unlock(&base->lock);
628} 637}
629 638
630/* 639/*
@@ -685,9 +694,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
685{ 694{
686 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 695 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
687 if (wakeup) { 696 if (wakeup) {
688 spin_unlock(&base->cpu_base->lock); 697 raw_spin_unlock(&base->cpu_base->lock);
689 raise_softirq_irqoff(HRTIMER_SOFTIRQ); 698 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
690 spin_lock(&base->cpu_base->lock); 699 raw_spin_lock(&base->cpu_base->lock);
691 } else 700 } else
692 __raise_softirq_irqoff(HRTIMER_SOFTIRQ); 701 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
693 702
@@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
747 756
748#endif /* CONFIG_HIGH_RES_TIMERS */ 757#endif /* CONFIG_HIGH_RES_TIMERS */
749 758
750#ifdef CONFIG_TIMER_STATS 759static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
751void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
752{ 760{
761#ifdef CONFIG_TIMER_STATS
753 if (timer->start_site) 762 if (timer->start_site)
754 return; 763 return;
755 764 timer->start_site = __builtin_return_address(0);
756 timer->start_site = addr;
757 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 765 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
758 timer->start_pid = current->pid; 766 timer->start_pid = current->pid;
767#endif
759} 768}
769
770static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
771{
772#ifdef CONFIG_TIMER_STATS
773 timer->start_site = NULL;
774#endif
775}
776
777static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
778{
779#ifdef CONFIG_TIMER_STATS
780 if (likely(!timer_stats_active))
781 return;
782 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
783 timer->function, timer->start_comm, 0);
760#endif 784#endif
785}
761 786
762/* 787/*
763 * Counterpart to lock_hrtimer_base above: 788 * Counterpart to lock_hrtimer_base above:
@@ -765,7 +790,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
765static inline 790static inline
766void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 791void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
767{ 792{
768 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); 793 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
769} 794}
770 795
771/** 796/**
@@ -1098,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void)
1098 unsigned long flags; 1123 unsigned long flags;
1099 int i; 1124 int i;
1100 1125
1101 spin_lock_irqsave(&cpu_base->lock, flags); 1126 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1102 1127
1103 if (!hrtimer_hres_active()) { 1128 if (!hrtimer_hres_active()) {
1104 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
@@ -1115,7 +1140,7 @@ ktime_t hrtimer_get_next_event(void)
1115 } 1140 }
1116 } 1141 }
1117 1142
1118 spin_unlock_irqrestore(&cpu_base->lock, flags); 1143 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1119 1144
1120 if (mindelta.tv64 < 0) 1145 if (mindelta.tv64 < 0)
1121 mindelta.tv64 = 0; 1146 mindelta.tv64 = 0;
@@ -1197,11 +1222,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1197 * they get migrated to another cpu, therefore its safe to unlock 1222 * they get migrated to another cpu, therefore its safe to unlock
1198 * the timer base. 1223 * the timer base.
1199 */ 1224 */
1200 spin_unlock(&cpu_base->lock); 1225 raw_spin_unlock(&cpu_base->lock);
1201 trace_hrtimer_expire_entry(timer, now); 1226 trace_hrtimer_expire_entry(timer, now);
1202 restart = fn(timer); 1227 restart = fn(timer);
1203 trace_hrtimer_expire_exit(timer); 1228 trace_hrtimer_expire_exit(timer);
1204 spin_lock(&cpu_base->lock); 1229 raw_spin_lock(&cpu_base->lock);
1205 1230
1206 /* 1231 /*
1207 * Note: We clear the CALLBACK bit after enqueue_hrtimer and 1232 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
@@ -1217,29 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1217 1242
1218#ifdef CONFIG_HIGH_RES_TIMERS 1243#ifdef CONFIG_HIGH_RES_TIMERS
1219 1244
1220static int force_clock_reprogram;
1221
1222/*
1223 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1224 * is hanging, which could happen with something that slows the interrupt
1225 * such as the tracing. Then we force the clock reprogramming for each future
1226 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1227 * threshold that we will overwrite.
1228 * The next tick event will be scheduled to 3 times we currently spend on
1229 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1230 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1231 * let it running without serious starvation.
1232 */
1233
1234static inline void
1235hrtimer_interrupt_hanging(struct clock_event_device *dev,
1236 ktime_t try_time)
1237{
1238 force_clock_reprogram = 1;
1239 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1240 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1241 "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
1242}
1243/* 1245/*
1244 * High resolution timer interrupt 1246 * High resolution timer interrupt
1245 * Called with interrupts disabled 1247 * Called with interrupts disabled
@@ -1248,24 +1250,18 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1248{ 1250{
1249 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1251 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1250 struct hrtimer_clock_base *base; 1252 struct hrtimer_clock_base *base;
1251 ktime_t expires_next, now; 1253 ktime_t expires_next, now, entry_time, delta;
1252 int nr_retries = 0; 1254 int i, retries = 0;
1253 int i;
1254 1255
1255 BUG_ON(!cpu_base->hres_active); 1256 BUG_ON(!cpu_base->hres_active);
1256 cpu_base->nr_events++; 1257 cpu_base->nr_events++;
1257 dev->next_event.tv64 = KTIME_MAX; 1258 dev->next_event.tv64 = KTIME_MAX;
1258 1259
1259 retry: 1260 entry_time = now = ktime_get();
1260 /* 5 retries is enough to notice a hang */ 1261retry:
1261 if (!(++nr_retries % 5))
1262 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1263
1264 now = ktime_get();
1265
1266 expires_next.tv64 = KTIME_MAX; 1262 expires_next.tv64 = KTIME_MAX;
1267 1263
1268 spin_lock(&cpu_base->lock); 1264 raw_spin_lock(&cpu_base->lock);
1269 /* 1265 /*
1270 * We set expires_next to KTIME_MAX here with cpu_base->lock 1266 * We set expires_next to KTIME_MAX here with cpu_base->lock
1271 * held to prevent that a timer is enqueued in our queue via 1267 * held to prevent that a timer is enqueued in our queue via
@@ -1321,13 +1317,51 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1321 * against it. 1317 * against it.
1322 */ 1318 */
1323 cpu_base->expires_next = expires_next; 1319 cpu_base->expires_next = expires_next;
1324 spin_unlock(&cpu_base->lock); 1320 raw_spin_unlock(&cpu_base->lock);
1325 1321
1326 /* Reprogramming necessary ? */ 1322 /* Reprogramming necessary ? */
1327 if (expires_next.tv64 != KTIME_MAX) { 1323 if (expires_next.tv64 == KTIME_MAX ||
1328 if (tick_program_event(expires_next, force_clock_reprogram)) 1324 !tick_program_event(expires_next, 0)) {
1329 goto retry; 1325 cpu_base->hang_detected = 0;
1326 return;
1330 } 1327 }
1328
1329 /*
1330 * The next timer was already expired due to:
1331 * - tracing
1332 * - long lasting callbacks
1333 * - being scheduled away when running in a VM
1334 *
1335 * We need to prevent that we loop forever in the hrtimer
1336 * interrupt routine. We give it 3 attempts to avoid
1337 * overreacting on some spurious event.
1338 */
1339 now = ktime_get();
1340 cpu_base->nr_retries++;
1341 if (++retries < 3)
1342 goto retry;
1343 /*
1344 * Give the system a chance to do something else than looping
1345 * here. We stored the entry time, so we know exactly how long
1346 * we spent here. We schedule the next event this amount of
1347 * time away.
1348 */
1349 cpu_base->nr_hangs++;
1350 cpu_base->hang_detected = 1;
1351 delta = ktime_sub(now, entry_time);
1352 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1353 cpu_base->max_hang_time = delta;
1354 /*
1355 * Limit it to a sensible value as we enforce a longer
1356 * delay. Give the CPU at least 100ms to catch up.
1357 */
1358 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1359 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1360 else
1361 expires_next = ktime_add(now, delta);
1362 tick_program_event(expires_next, 1);
1363 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1364 ktime_to_ns(delta));
1331} 1365}
1332 1366
1333/* 1367/*
@@ -1423,7 +1457,7 @@ void hrtimer_run_queues(void)
1423 gettime = 0; 1457 gettime = 0;
1424 } 1458 }
1425 1459
1426 spin_lock(&cpu_base->lock); 1460 raw_spin_lock(&cpu_base->lock);
1427 1461
1428 while ((node = base->first)) { 1462 while ((node = base->first)) {
1429 struct hrtimer *timer; 1463 struct hrtimer *timer;
@@ -1435,7 +1469,7 @@ void hrtimer_run_queues(void)
1435 1469
1436 __run_hrtimer(timer, &base->softirq_time); 1470 __run_hrtimer(timer, &base->softirq_time);
1437 } 1471 }
1438 spin_unlock(&cpu_base->lock); 1472 raw_spin_unlock(&cpu_base->lock);
1439 } 1473 }
1440} 1474}
1441 1475
@@ -1591,7 +1625,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1591 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1625 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1592 int i; 1626 int i;
1593 1627
1594 spin_lock_init(&cpu_base->lock); 1628 raw_spin_lock_init(&cpu_base->lock);
1595 1629
1596 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1630 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1597 cpu_base->clock_base[i].cpu_base = cpu_base; 1631 cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1649,16 +1683,16 @@ static void migrate_hrtimers(int scpu)
1649 * The caller is globally serialized and nobody else 1683 * The caller is globally serialized and nobody else
1650 * takes two locks at once, deadlock is not possible. 1684 * takes two locks at once, deadlock is not possible.
1651 */ 1685 */
1652 spin_lock(&new_base->lock); 1686 raw_spin_lock(&new_base->lock);
1653 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1687 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1654 1688
1655 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1689 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1656 migrate_hrtimer_list(&old_base->clock_base[i], 1690 migrate_hrtimer_list(&old_base->clock_base[i],
1657 &new_base->clock_base[i]); 1691 &new_base->clock_base[i]);
1658 } 1692 }
1659 1693
1660 spin_unlock(&old_base->lock); 1694 raw_spin_unlock(&old_base->lock);
1661 spin_unlock(&new_base->lock); 1695 raw_spin_unlock(&new_base->lock);
1662 1696
1663 /* Check, if we got expired work to do */ 1697 /* Check, if we got expired work to do */
1664 __hrtimer_peek_ahead_timers(); 1698 __hrtimer_peek_ahead_timers();
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index cf5ee162841..967e66143e1 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,6 +40,7 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/cpu.h>
43#include <linux/smp.h> 44#include <linux/smp.h>
44 45
45#include <linux/hw_breakpoint.h> 46#include <linux/hw_breakpoint.h>
@@ -52,7 +53,7 @@
52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
53 54
54/* Number of pinned task breakpoints in a cpu */ 55/* Number of pinned task breakpoints in a cpu */
55static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); 56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
56 57
57/* Number of non-pinned cpu/task breakpoints in a cpu */ 58/* Number of non-pinned cpu/task breakpoints in a cpu */
58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
@@ -73,7 +74,7 @@ static DEFINE_MUTEX(nr_bp_mutex);
73static unsigned int max_task_bp_pinned(int cpu) 74static unsigned int max_task_bp_pinned(int cpu)
74{ 75{
75 int i; 76 int i;
76 unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); 77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
77 78
78 for (i = HBP_NUM -1; i >= 0; i--) { 79 for (i = HBP_NUM -1; i >= 0; i--) {
79 if (tsk_pinned[i] > 0) 80 if (tsk_pinned[i] > 0)
@@ -83,15 +84,51 @@ static unsigned int max_task_bp_pinned(int cpu)
83 return 0; 84 return 0;
84} 85}
85 86
87static int task_bp_pinned(struct task_struct *tsk)
88{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list;
91 struct perf_event *bp;
92 unsigned long flags;
93 int count = 0;
94
95 if (WARN_ONCE(!ctx, "No perf context for this task"))
96 return 0;
97
98 list = &ctx->event_list;
99
100 raw_spin_lock_irqsave(&ctx->lock, flags);
101
102 /*
103 * The current breakpoint counter is not included in the list
104 * at the open() callback time
105 */
106 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++;
109 }
110
111 raw_spin_unlock_irqrestore(&ctx->lock, flags);
112
113 return count;
114}
115
86/* 116/*
87 * Report the number of pinned/un-pinned breakpoints we have in 117 * Report the number of pinned/un-pinned breakpoints we have in
88 * a given cpu (cpu > -1) or in all of them (cpu = -1). 118 * a given cpu (cpu > -1) or in all of them (cpu = -1).
89 */ 119 */
90static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) 120static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
91{ 122{
123 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task;
125
92 if (cpu >= 0) { 126 if (cpu >= 0) {
93 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
94 slots->pinned += max_task_bp_pinned(cpu); 128 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu);
130 else
131 slots->pinned += task_bp_pinned(tsk);
95 slots->flexible = per_cpu(nr_bp_flexible, cpu); 132 slots->flexible = per_cpu(nr_bp_flexible, cpu);
96 133
97 return; 134 return;
@@ -101,7 +138,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
101 unsigned int nr; 138 unsigned int nr;
102 139
103 nr = per_cpu(nr_cpu_bp_pinned, cpu); 140 nr = per_cpu(nr_cpu_bp_pinned, cpu);
104 nr += max_task_bp_pinned(cpu); 141 if (!tsk)
142 nr += max_task_bp_pinned(cpu);
143 else
144 nr += task_bp_pinned(tsk);
105 145
106 if (nr > slots->pinned) 146 if (nr > slots->pinned)
107 slots->pinned = nr; 147 slots->pinned = nr;
@@ -118,35 +158,12 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
118 */ 158 */
119static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
120{ 160{
121 int count = 0;
122 struct perf_event *bp;
123 struct perf_event_context *ctx = tsk->perf_event_ctxp;
124 unsigned int *tsk_pinned; 161 unsigned int *tsk_pinned;
125 struct list_head *list; 162 int count = 0;
126 unsigned long flags;
127
128 if (WARN_ONCE(!ctx, "No perf context for this task"))
129 return;
130
131 list = &ctx->event_list;
132
133 spin_lock_irqsave(&ctx->lock, flags);
134
135 /*
136 * The current breakpoint counter is not included in the list
137 * at the open() callback time
138 */
139 list_for_each_entry(bp, list, event_entry) {
140 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
141 count++;
142 }
143 163
144 spin_unlock_irqrestore(&ctx->lock, flags); 164 count = task_bp_pinned(tsk);
145 165
146 if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) 166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
147 return;
148
149 tsk_pinned = per_cpu(task_bp_pinned, cpu);
150 if (enable) { 167 if (enable) {
151 tsk_pinned[count]++; 168 tsk_pinned[count]++;
152 if (count > 0) 169 if (count > 0)
@@ -193,7 +210,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
193 * - If attached to a single cpu, check: 210 * - If attached to a single cpu, check:
194 * 211 *
195 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) 212 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
196 * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM 213 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
197 * 214 *
198 * -> If there are already non-pinned counters in this cpu, it means 215 * -> If there are already non-pinned counters in this cpu, it means
199 * there is already a free slot for them. 216 * there is already a free slot for them.
@@ -204,7 +221,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
204 * - If attached to every cpus, check: 221 * - If attached to every cpus, check:
205 * 222 *
206 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) 223 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
207 * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM 224 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
208 * 225 *
209 * -> This is roughly the same, except we check the number of per cpu 226 * -> This is roughly the same, except we check the number of per cpu
210 * bp for every cpu and we keep the max one. Same for the per tasks 227 * bp for every cpu and we keep the max one. Same for the per tasks
@@ -216,7 +233,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
216 * - If attached to a single cpu, check: 233 * - If attached to a single cpu, check:
217 * 234 *
218 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) 235 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
219 * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM 236 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
220 * 237 *
221 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep 238 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
222 * one register at least (or they will never be fed). 239 * one register at least (or they will never be fed).
@@ -224,42 +241,74 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
224 * - If attached to every cpus, check: 241 * - If attached to every cpus, check:
225 * 242 *
226 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
227 * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM 244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
228 */ 245 */
229int reserve_bp_slot(struct perf_event *bp) 246static int __reserve_bp_slot(struct perf_event *bp)
230{ 247{
231 struct bp_busy_slots slots = {0}; 248 struct bp_busy_slots slots = {0};
232 int ret = 0;
233
234 mutex_lock(&nr_bp_mutex);
235 249
236 fetch_bp_busy_slots(&slots, bp->cpu); 250 fetch_bp_busy_slots(&slots, bp);
237 251
238 /* Flexible counters need to keep at least one slot */ 252 /* Flexible counters need to keep at least one slot */
239 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 253 if (slots.pinned + (!!slots.flexible) == HBP_NUM)
240 ret = -ENOSPC; 254 return -ENOSPC;
241 goto end;
242 }
243 255
244 toggle_bp_slot(bp, true); 256 toggle_bp_slot(bp, true);
245 257
246end: 258 return 0;
259}
260
261int reserve_bp_slot(struct perf_event *bp)
262{
263 int ret;
264
265 mutex_lock(&nr_bp_mutex);
266
267 ret = __reserve_bp_slot(bp);
268
247 mutex_unlock(&nr_bp_mutex); 269 mutex_unlock(&nr_bp_mutex);
248 270
249 return ret; 271 return ret;
250} 272}
251 273
274static void __release_bp_slot(struct perf_event *bp)
275{
276 toggle_bp_slot(bp, false);
277}
278
252void release_bp_slot(struct perf_event *bp) 279void release_bp_slot(struct perf_event *bp)
253{ 280{
254 mutex_lock(&nr_bp_mutex); 281 mutex_lock(&nr_bp_mutex);
255 282
256 toggle_bp_slot(bp, false); 283 __release_bp_slot(bp);
257 284
258 mutex_unlock(&nr_bp_mutex); 285 mutex_unlock(&nr_bp_mutex);
259} 286}
260 287
288/*
289 * Allow the kernel debugger to reserve breakpoint slots without
290 * taking a lock using the dbg_* variant of for the reserve and
291 * release breakpoint slots.
292 */
293int dbg_reserve_bp_slot(struct perf_event *bp)
294{
295 if (mutex_is_locked(&nr_bp_mutex))
296 return -1;
297
298 return __reserve_bp_slot(bp);
299}
300
301int dbg_release_bp_slot(struct perf_event *bp)
302{
303 if (mutex_is_locked(&nr_bp_mutex))
304 return -1;
305
306 __release_bp_slot(bp);
261 307
262int __register_perf_hw_breakpoint(struct perf_event *bp) 308 return 0;
309}
310
311int register_perf_hw_breakpoint(struct perf_event *bp)
263{ 312{
264 int ret; 313 int ret;
265 314
@@ -276,17 +325,14 @@ int __register_perf_hw_breakpoint(struct perf_event *bp)
276 * This is a quick hack that will be removed soon, once we remove 325 * This is a quick hack that will be removed soon, once we remove
277 * the tmp breakpoints from ptrace 326 * the tmp breakpoints from ptrace
278 */ 327 */
279 if (!bp->attr.disabled || bp->callback == perf_bp_event) 328 if (!bp->attr.disabled || !bp->overflow_handler)
280 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
281 330
282 return ret; 331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
283} 332 if (ret)
284 333 release_bp_slot(bp);
285int register_perf_hw_breakpoint(struct perf_event *bp)
286{
287 bp->callback = perf_bp_event;
288 334
289 return __register_perf_hw_breakpoint(bp); 335 return ret;
290} 336}
291 337
292/** 338/**
@@ -297,7 +343,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
297 */ 343 */
298struct perf_event * 344struct perf_event *
299register_user_hw_breakpoint(struct perf_event_attr *attr, 345register_user_hw_breakpoint(struct perf_event_attr *attr,
300 perf_callback_t triggered, 346 perf_overflow_handler_t triggered,
301 struct task_struct *tsk) 347 struct task_struct *tsk)
302{ 348{
303 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 349 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
@@ -311,19 +357,40 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
311 * @triggered: callback to trigger when we hit the breakpoint 357 * @triggered: callback to trigger when we hit the breakpoint
312 * @tsk: pointer to 'task_struct' of the process to which the address belongs 358 * @tsk: pointer to 'task_struct' of the process to which the address belongs
313 */ 359 */
314struct perf_event * 360int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
315modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
316 perf_callback_t triggered,
317 struct task_struct *tsk)
318{ 361{
319 /* 362 u64 old_addr = bp->attr.bp_addr;
320 * FIXME: do it without unregistering 363 u64 old_len = bp->attr.bp_len;
321 * - We don't want to lose our slot 364 int old_type = bp->attr.bp_type;
322 * - If the new bp is incorrect, don't lose the older one 365 int err = 0;
323 */
324 unregister_hw_breakpoint(bp);
325 366
326 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 367 perf_event_disable(bp);
368
369 bp->attr.bp_addr = attr->bp_addr;
370 bp->attr.bp_type = attr->bp_type;
371 bp->attr.bp_len = attr->bp_len;
372
373 if (attr->disabled)
374 goto end;
375
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
377 if (!err)
378 perf_event_enable(bp);
379
380 if (err) {
381 bp->attr.bp_addr = old_addr;
382 bp->attr.bp_type = old_type;
383 bp->attr.bp_len = old_len;
384 if (!bp->attr.disabled)
385 perf_event_enable(bp);
386
387 return err;
388 }
389
390end:
391 bp->attr.disabled = attr->disabled;
392
393 return 0;
327} 394}
328EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); 395EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
329 396
@@ -348,7 +415,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
348 */ 415 */
349struct perf_event ** 416struct perf_event **
350register_wide_hw_breakpoint(struct perf_event_attr *attr, 417register_wide_hw_breakpoint(struct perf_event_attr *attr,
351 perf_callback_t triggered) 418 perf_overflow_handler_t triggered)
352{ 419{
353 struct perf_event **cpu_events, **pevent, *bp; 420 struct perf_event **cpu_events, **pevent, *bp;
354 long err; 421 long err;
@@ -358,7 +425,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
358 if (!cpu_events) 425 if (!cpu_events)
359 return ERR_PTR(-ENOMEM); 426 return ERR_PTR(-ENOMEM);
360 427
361 for_each_possible_cpu(cpu) { 428 get_online_cpus();
429 for_each_online_cpu(cpu) {
362 pevent = per_cpu_ptr(cpu_events, cpu); 430 pevent = per_cpu_ptr(cpu_events, cpu);
363 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 431 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
364 432
@@ -369,18 +437,20 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
369 goto fail; 437 goto fail;
370 } 438 }
371 } 439 }
440 put_online_cpus();
372 441
373 return cpu_events; 442 return cpu_events;
374 443
375fail: 444fail:
376 for_each_possible_cpu(cpu) { 445 for_each_online_cpu(cpu) {
377 pevent = per_cpu_ptr(cpu_events, cpu); 446 pevent = per_cpu_ptr(cpu_events, cpu);
378 if (IS_ERR(*pevent)) 447 if (IS_ERR(*pevent))
379 break; 448 break;
380 unregister_hw_breakpoint(*pevent); 449 unregister_hw_breakpoint(*pevent);
381 } 450 }
451 put_online_cpus();
452
382 free_percpu(cpu_events); 453 free_percpu(cpu_events);
383 /* return the error if any */
384 return ERR_PTR(err); 454 return ERR_PTR(err);
385} 455}
386EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 1de9700f416..2295a31ef11 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -45,7 +45,7 @@ unsigned long probe_irq_on(void)
45 * flush such a longstanding irq before considering it as spurious. 45 * flush such a longstanding irq before considering it as spurious.
46 */ 46 */
47 for_each_irq_desc_reverse(i, desc) { 47 for_each_irq_desc_reverse(i, desc) {
48 spin_lock_irq(&desc->lock); 48 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
50 /* 50 /*
51 * An old-style architecture might still have 51 * An old-style architecture might still have
@@ -61,7 +61,7 @@ unsigned long probe_irq_on(void)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->chip->set_type(i, IRQ_TYPE_PROBE);
62 desc->chip->startup(i); 62 desc->chip->startup(i);
63 } 63 }
64 spin_unlock_irq(&desc->lock); 64 raw_spin_unlock_irq(&desc->lock);
65 } 65 }
66 66
67 /* Wait for longstanding interrupts to trigger. */ 67 /* Wait for longstanding interrupts to trigger. */
@@ -73,13 +73,13 @@ unsigned long probe_irq_on(void)
73 * happened in the previous stage, it may have masked itself) 73 * happened in the previous stage, it may have masked itself)
74 */ 74 */
75 for_each_irq_desc_reverse(i, desc) { 75 for_each_irq_desc_reverse(i, desc) {
76 spin_lock_irq(&desc->lock); 76 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 79 if (desc->chip->startup(i))
80 desc->status |= IRQ_PENDING; 80 desc->status |= IRQ_PENDING;
81 } 81 }
82 spin_unlock_irq(&desc->lock); 82 raw_spin_unlock_irq(&desc->lock);
83 } 83 }
84 84
85 /* 85 /*
@@ -91,7 +91,7 @@ unsigned long probe_irq_on(void)
91 * Now filter out any obviously spurious interrupts 91 * Now filter out any obviously spurious interrupts
92 */ 92 */
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 spin_lock_irq(&desc->lock); 94 raw_spin_lock_irq(&desc->lock);
95 status = desc->status; 95 status = desc->status;
96 96
97 if (status & IRQ_AUTODETECT) { 97 if (status & IRQ_AUTODETECT) {
@@ -103,7 +103,7 @@ unsigned long probe_irq_on(void)
103 if (i < 32) 103 if (i < 32)
104 mask |= 1 << i; 104 mask |= 1 << i;
105 } 105 }
106 spin_unlock_irq(&desc->lock); 106 raw_spin_unlock_irq(&desc->lock);
107 } 107 }
108 108
109 return mask; 109 return mask;
@@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val)
129 int i; 129 int i;
130 130
131 for_each_irq_desc(i, desc) { 131 for_each_irq_desc(i, desc) {
132 spin_lock_irq(&desc->lock); 132 raw_spin_lock_irq(&desc->lock);
133 status = desc->status; 133 status = desc->status;
134 134
135 if (status & IRQ_AUTODETECT) { 135 if (status & IRQ_AUTODETECT) {
@@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val)
139 desc->status = status & ~IRQ_AUTODETECT; 139 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 140 desc->chip->shutdown(i);
141 } 141 }
142 spin_unlock_irq(&desc->lock); 142 raw_spin_unlock_irq(&desc->lock);
143 } 143 }
144 mutex_unlock(&probing_active); 144 mutex_unlock(&probing_active);
145 145
@@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val)
171 unsigned int status; 171 unsigned int status;
172 172
173 for_each_irq_desc(i, desc) { 173 for_each_irq_desc(i, desc) {
174 spin_lock_irq(&desc->lock); 174 raw_spin_lock_irq(&desc->lock);
175 status = desc->status; 175 status = desc->status;
176 176
177 if (status & IRQ_AUTODETECT) { 177 if (status & IRQ_AUTODETECT) {
@@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val)
183 desc->status = status & ~IRQ_AUTODETECT; 183 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 184 desc->chip->shutdown(i);
185 } 185 }
186 spin_unlock_irq(&desc->lock); 186 raw_spin_unlock_irq(&desc->lock);
187 } 187 }
188 mutex_unlock(&probing_active); 188 mutex_unlock(&probing_active);
189 189
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ba566c261ad..ecc3fa28f66 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -34,7 +34,7 @@ void dynamic_irq_init(unsigned int irq)
34 } 34 }
35 35
36 /* Ensure we don't have left over values from a previous use of this irq */ 36 /* Ensure we don't have left over values from a previous use of this irq */
37 spin_lock_irqsave(&desc->lock, flags); 37 raw_spin_lock_irqsave(&desc->lock, flags);
38 desc->status = IRQ_DISABLED; 38 desc->status = IRQ_DISABLED;
39 desc->chip = &no_irq_chip; 39 desc->chip = &no_irq_chip;
40 desc->handle_irq = handle_bad_irq; 40 desc->handle_irq = handle_bad_irq;
@@ -51,7 +51,7 @@ void dynamic_irq_init(unsigned int irq)
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
52#endif 52#endif
53#endif 53#endif
54 spin_unlock_irqrestore(&desc->lock, flags); 54 raw_spin_unlock_irqrestore(&desc->lock, flags);
55} 55}
56 56
57/** 57/**
@@ -68,9 +68,9 @@ void dynamic_irq_cleanup(unsigned int irq)
68 return; 68 return;
69 } 69 }
70 70
71 spin_lock_irqsave(&desc->lock, flags); 71 raw_spin_lock_irqsave(&desc->lock, flags);
72 if (desc->action) { 72 if (desc->action) {
73 spin_unlock_irqrestore(&desc->lock, flags); 73 raw_spin_unlock_irqrestore(&desc->lock, flags);
74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", 74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
75 irq); 75 irq);
76 return; 76 return;
@@ -82,7 +82,7 @@ void dynamic_irq_cleanup(unsigned int irq)
82 desc->chip = &no_irq_chip; 82 desc->chip = &no_irq_chip;
83 desc->name = NULL; 83 desc->name = NULL;
84 clear_kstat_irqs(desc); 84 clear_kstat_irqs(desc);
85 spin_unlock_irqrestore(&desc->lock, flags); 85 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 86}
87 87
88 88
@@ -104,10 +104,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
104 if (!chip) 104 if (!chip)
105 chip = &no_irq_chip; 105 chip = &no_irq_chip;
106 106
107 spin_lock_irqsave(&desc->lock, flags); 107 raw_spin_lock_irqsave(&desc->lock, flags);
108 irq_chip_set_defaults(chip); 108 irq_chip_set_defaults(chip);
109 desc->chip = chip; 109 desc->chip = chip;
110 spin_unlock_irqrestore(&desc->lock, flags); 110 raw_spin_unlock_irqrestore(&desc->lock, flags);
111 111
112 return 0; 112 return 0;
113} 113}
@@ -133,9 +133,9 @@ int set_irq_type(unsigned int irq, unsigned int type)
133 if (type == IRQ_TYPE_NONE) 133 if (type == IRQ_TYPE_NONE)
134 return 0; 134 return 0;
135 135
136 spin_lock_irqsave(&desc->lock, flags); 136 raw_spin_lock_irqsave(&desc->lock, flags);
137 ret = __irq_set_trigger(desc, irq, type); 137 ret = __irq_set_trigger(desc, irq, type);
138 spin_unlock_irqrestore(&desc->lock, flags); 138 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return ret; 139 return ret;
140} 140}
141EXPORT_SYMBOL(set_irq_type); 141EXPORT_SYMBOL(set_irq_type);
@@ -158,9 +158,9 @@ int set_irq_data(unsigned int irq, void *data)
158 return -EINVAL; 158 return -EINVAL;
159 } 159 }
160 160
161 spin_lock_irqsave(&desc->lock, flags); 161 raw_spin_lock_irqsave(&desc->lock, flags);
162 desc->handler_data = data; 162 desc->handler_data = data;
163 spin_unlock_irqrestore(&desc->lock, flags); 163 raw_spin_unlock_irqrestore(&desc->lock, flags);
164 return 0; 164 return 0;
165} 165}
166EXPORT_SYMBOL(set_irq_data); 166EXPORT_SYMBOL(set_irq_data);
@@ -183,11 +183,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
183 return -EINVAL; 183 return -EINVAL;
184 } 184 }
185 185
186 spin_lock_irqsave(&desc->lock, flags); 186 raw_spin_lock_irqsave(&desc->lock, flags);
187 desc->msi_desc = entry; 187 desc->msi_desc = entry;
188 if (entry) 188 if (entry)
189 entry->irq = irq; 189 entry->irq = irq;
190 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
191 return 0; 191 return 0;
192} 192}
193 193
@@ -214,9 +214,9 @@ int set_irq_chip_data(unsigned int irq, void *data)
214 return -EINVAL; 214 return -EINVAL;
215 } 215 }
216 216
217 spin_lock_irqsave(&desc->lock, flags); 217 raw_spin_lock_irqsave(&desc->lock, flags);
218 desc->chip_data = data; 218 desc->chip_data = data;
219 spin_unlock_irqrestore(&desc->lock, flags); 219 raw_spin_unlock_irqrestore(&desc->lock, flags);
220 220
221 return 0; 221 return 0;
222} 222}
@@ -241,12 +241,12 @@ void set_irq_nested_thread(unsigned int irq, int nest)
241 if (!desc) 241 if (!desc)
242 return; 242 return;
243 243
244 spin_lock_irqsave(&desc->lock, flags); 244 raw_spin_lock_irqsave(&desc->lock, flags);
245 if (nest) 245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD; 246 desc->status |= IRQ_NESTED_THREAD;
247 else 247 else
248 desc->status &= ~IRQ_NESTED_THREAD; 248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags); 249 raw_spin_unlock_irqrestore(&desc->lock, flags);
250} 250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread); 251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252 252
@@ -343,7 +343,7 @@ void handle_nested_irq(unsigned int irq)
343 343
344 might_sleep(); 344 might_sleep();
345 345
346 spin_lock_irq(&desc->lock); 346 raw_spin_lock_irq(&desc->lock);
347 347
348 kstat_incr_irqs_this_cpu(irq, desc); 348 kstat_incr_irqs_this_cpu(irq, desc);
349 349
@@ -352,17 +352,17 @@ void handle_nested_irq(unsigned int irq)
352 goto out_unlock; 352 goto out_unlock;
353 353
354 desc->status |= IRQ_INPROGRESS; 354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock); 355 raw_spin_unlock_irq(&desc->lock);
356 356
357 action_ret = action->thread_fn(action->irq, action->dev_id); 357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug) 358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret); 359 note_interrupt(irq, desc, action_ret);
360 360
361 spin_lock_irq(&desc->lock); 361 raw_spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS; 362 desc->status &= ~IRQ_INPROGRESS;
363 363
364out_unlock: 364out_unlock:
365 spin_unlock_irq(&desc->lock); 365 raw_spin_unlock_irq(&desc->lock);
366} 366}
367EXPORT_SYMBOL_GPL(handle_nested_irq); 367EXPORT_SYMBOL_GPL(handle_nested_irq);
368 368
@@ -384,7 +384,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
384 struct irqaction *action; 384 struct irqaction *action;
385 irqreturn_t action_ret; 385 irqreturn_t action_ret;
386 386
387 spin_lock(&desc->lock); 387 raw_spin_lock(&desc->lock);
388 388
389 if (unlikely(desc->status & IRQ_INPROGRESS)) 389 if (unlikely(desc->status & IRQ_INPROGRESS))
390 goto out_unlock; 390 goto out_unlock;
@@ -396,16 +396,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
396 goto out_unlock; 396 goto out_unlock;
397 397
398 desc->status |= IRQ_INPROGRESS; 398 desc->status |= IRQ_INPROGRESS;
399 spin_unlock(&desc->lock); 399 raw_spin_unlock(&desc->lock);
400 400
401 action_ret = handle_IRQ_event(irq, action); 401 action_ret = handle_IRQ_event(irq, action);
402 if (!noirqdebug) 402 if (!noirqdebug)
403 note_interrupt(irq, desc, action_ret); 403 note_interrupt(irq, desc, action_ret);
404 404
405 spin_lock(&desc->lock); 405 raw_spin_lock(&desc->lock);
406 desc->status &= ~IRQ_INPROGRESS; 406 desc->status &= ~IRQ_INPROGRESS;
407out_unlock: 407out_unlock:
408 spin_unlock(&desc->lock); 408 raw_spin_unlock(&desc->lock);
409} 409}
410 410
411/** 411/**
@@ -424,7 +424,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
424 struct irqaction *action; 424 struct irqaction *action;
425 irqreturn_t action_ret; 425 irqreturn_t action_ret;
426 426
427 spin_lock(&desc->lock); 427 raw_spin_lock(&desc->lock);
428 mask_ack_irq(desc, irq); 428 mask_ack_irq(desc, irq);
429 429
430 if (unlikely(desc->status & IRQ_INPROGRESS)) 430 if (unlikely(desc->status & IRQ_INPROGRESS))
@@ -441,13 +441,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
441 goto out_unlock; 441 goto out_unlock;
442 442
443 desc->status |= IRQ_INPROGRESS; 443 desc->status |= IRQ_INPROGRESS;
444 spin_unlock(&desc->lock); 444 raw_spin_unlock(&desc->lock);
445 445
446 action_ret = handle_IRQ_event(irq, action); 446 action_ret = handle_IRQ_event(irq, action);
447 if (!noirqdebug) 447 if (!noirqdebug)
448 note_interrupt(irq, desc, action_ret); 448 note_interrupt(irq, desc, action_ret);
449 449
450 spin_lock(&desc->lock); 450 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
452 452
453 if (unlikely(desc->status & IRQ_ONESHOT)) 453 if (unlikely(desc->status & IRQ_ONESHOT))
@@ -455,7 +455,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
457out_unlock: 457out_unlock:
458 spin_unlock(&desc->lock); 458 raw_spin_unlock(&desc->lock);
459} 459}
460EXPORT_SYMBOL_GPL(handle_level_irq); 460EXPORT_SYMBOL_GPL(handle_level_irq);
461 461
@@ -475,7 +475,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
475 struct irqaction *action; 475 struct irqaction *action;
476 irqreturn_t action_ret; 476 irqreturn_t action_ret;
477 477
478 spin_lock(&desc->lock); 478 raw_spin_lock(&desc->lock);
479 479
480 if (unlikely(desc->status & IRQ_INPROGRESS)) 480 if (unlikely(desc->status & IRQ_INPROGRESS))
481 goto out; 481 goto out;
@@ -497,18 +497,18 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
497 497
498 desc->status |= IRQ_INPROGRESS; 498 desc->status |= IRQ_INPROGRESS;
499 desc->status &= ~IRQ_PENDING; 499 desc->status &= ~IRQ_PENDING;
500 spin_unlock(&desc->lock); 500 raw_spin_unlock(&desc->lock);
501 501
502 action_ret = handle_IRQ_event(irq, action); 502 action_ret = handle_IRQ_event(irq, action);
503 if (!noirqdebug) 503 if (!noirqdebug)
504 note_interrupt(irq, desc, action_ret); 504 note_interrupt(irq, desc, action_ret);
505 505
506 spin_lock(&desc->lock); 506 raw_spin_lock(&desc->lock);
507 desc->status &= ~IRQ_INPROGRESS; 507 desc->status &= ~IRQ_INPROGRESS;
508out: 508out:
509 desc->chip->eoi(irq); 509 desc->chip->eoi(irq);
510 510
511 spin_unlock(&desc->lock); 511 raw_spin_unlock(&desc->lock);
512} 512}
513 513
514/** 514/**
@@ -530,7 +530,7 @@ out:
530void 530void
531handle_edge_irq(unsigned int irq, struct irq_desc *desc) 531handle_edge_irq(unsigned int irq, struct irq_desc *desc)
532{ 532{
533 spin_lock(&desc->lock); 533 raw_spin_lock(&desc->lock);
534 534
535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
536 536
@@ -576,17 +576,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
576 } 576 }
577 577
578 desc->status &= ~IRQ_PENDING; 578 desc->status &= ~IRQ_PENDING;
579 spin_unlock(&desc->lock); 579 raw_spin_unlock(&desc->lock);
580 action_ret = handle_IRQ_event(irq, action); 580 action_ret = handle_IRQ_event(irq, action);
581 if (!noirqdebug) 581 if (!noirqdebug)
582 note_interrupt(irq, desc, action_ret); 582 note_interrupt(irq, desc, action_ret);
583 spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
584 584
585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
586 586
587 desc->status &= ~IRQ_INPROGRESS; 587 desc->status &= ~IRQ_INPROGRESS;
588out_unlock: 588out_unlock:
589 spin_unlock(&desc->lock); 589 raw_spin_unlock(&desc->lock);
590} 590}
591 591
592/** 592/**
@@ -643,7 +643,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
643 } 643 }
644 644
645 chip_bus_lock(irq, desc); 645 chip_bus_lock(irq, desc);
646 spin_lock_irqsave(&desc->lock, flags); 646 raw_spin_lock_irqsave(&desc->lock, flags);
647 647
648 /* Uninstall? */ 648 /* Uninstall? */
649 if (handle == handle_bad_irq) { 649 if (handle == handle_bad_irq) {
@@ -661,7 +661,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
661 desc->depth = 0; 661 desc->depth = 0;
662 desc->chip->startup(irq); 662 desc->chip->startup(irq);
663 } 663 }
664 spin_unlock_irqrestore(&desc->lock, flags); 664 raw_spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc); 665 chip_bus_sync_unlock(irq, desc);
666} 666}
667EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
@@ -692,9 +692,9 @@ void __init set_irq_noprobe(unsigned int irq)
692 return; 692 return;
693 } 693 }
694 694
695 spin_lock_irqsave(&desc->lock, flags); 695 raw_spin_lock_irqsave(&desc->lock, flags);
696 desc->status |= IRQ_NOPROBE; 696 desc->status |= IRQ_NOPROBE;
697 spin_unlock_irqrestore(&desc->lock, flags); 697 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 698}
699 699
700void __init set_irq_probe(unsigned int irq) 700void __init set_irq_probe(unsigned int irq)
@@ -707,7 +707,7 @@ void __init set_irq_probe(unsigned int irq)
707 return; 707 return;
708 } 708 }
709 709
710 spin_lock_irqsave(&desc->lock, flags); 710 raw_spin_lock_irqsave(&desc->lock, flags);
711 desc->status &= ~IRQ_NOPROBE; 711 desc->status &= ~IRQ_NOPROBE;
712 spin_unlock_irqrestore(&desc->lock, flags); 712 raw_spin_unlock_irqrestore(&desc->lock, flags);
713} 713}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 17c71bb565c..814940e7f48 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -80,7 +80,7 @@ static struct irq_desc irq_desc_init = {
80 .chip = &no_irq_chip, 80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq, 81 .handle_irq = handle_bad_irq,
82 .depth = 1, 82 .depth = 1,
83 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84}; 84};
85 85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) 86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
@@ -108,7 +108,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
108{ 108{
109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
110 110
111 spin_lock_init(&desc->lock); 111 raw_spin_lock_init(&desc->lock);
112 desc->irq = irq; 112 desc->irq = irq;
113#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
114 desc->node = node; 114 desc->node = node;
@@ -130,7 +130,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
130/* 130/*
131 * Protect the sparse_irqs: 131 * Protect the sparse_irqs:
132 */ 132 */
133DEFINE_SPINLOCK(sparse_irq_lock); 133DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 134
135struct irq_desc **irq_desc_ptrs __read_mostly; 135struct irq_desc **irq_desc_ptrs __read_mostly;
136 136
@@ -141,7 +141,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
141 .chip = &no_irq_chip, 141 .chip = &no_irq_chip,
142 .handle_irq = handle_bad_irq, 142 .handle_irq = handle_bad_irq,
143 .depth = 1, 143 .depth = 1,
144 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 144 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
145 } 145 }
146}; 146};
147 147
@@ -212,7 +212,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
212 if (desc) 212 if (desc)
213 return desc; 213 return desc;
214 214
215 spin_lock_irqsave(&sparse_irq_lock, flags); 215 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 216
217 /* We have to check it to avoid races with another CPU */ 217 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 218 desc = irq_desc_ptrs[irq];
@@ -234,7 +234,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
234 irq_desc_ptrs[irq] = desc; 234 irq_desc_ptrs[irq] = desc;
235 235
236out_unlock: 236out_unlock:
237 spin_unlock_irqrestore(&sparse_irq_lock, flags); 237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
238 238
239 return desc; 239 return desc;
240} 240}
@@ -247,7 +247,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
247 .chip = &no_irq_chip, 247 .chip = &no_irq_chip,
248 .handle_irq = handle_bad_irq, 248 .handle_irq = handle_bad_irq,
249 .depth = 1, 249 .depth = 1,
250 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 250 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
251 } 251 }
252}; 252};
253 253
@@ -473,7 +473,7 @@ unsigned int __do_IRQ(unsigned int irq)
473 return 1; 473 return 1;
474 } 474 }
475 475
476 spin_lock(&desc->lock); 476 raw_spin_lock(&desc->lock);
477 if (desc->chip->ack) 477 if (desc->chip->ack)
478 desc->chip->ack(irq); 478 desc->chip->ack(irq);
479 /* 479 /*
@@ -517,13 +517,13 @@ unsigned int __do_IRQ(unsigned int irq)
517 for (;;) { 517 for (;;) {
518 irqreturn_t action_ret; 518 irqreturn_t action_ret;
519 519
520 spin_unlock(&desc->lock); 520 raw_spin_unlock(&desc->lock);
521 521
522 action_ret = handle_IRQ_event(irq, action); 522 action_ret = handle_IRQ_event(irq, action);
523 if (!noirqdebug) 523 if (!noirqdebug)
524 note_interrupt(irq, desc, action_ret); 524 note_interrupt(irq, desc, action_ret);
525 525
526 spin_lock(&desc->lock); 526 raw_spin_lock(&desc->lock);
527 if (likely(!(desc->status & IRQ_PENDING))) 527 if (likely(!(desc->status & IRQ_PENDING)))
528 break; 528 break;
529 desc->status &= ~IRQ_PENDING; 529 desc->status &= ~IRQ_PENDING;
@@ -536,7 +536,7 @@ out:
536 * disabled while the handler was running. 536 * disabled while the handler was running.
537 */ 537 */
538 desc->chip->end(irq); 538 desc->chip->end(irq);
539 spin_unlock(&desc->lock); 539 raw_spin_unlock(&desc->lock);
540 540
541 return 1; 541 return 1;
542} 542}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 1b5d742c6a7..b2821f070a3 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,7 +18,7 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24/* irq_desc_ptrs allocated at boot time */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index bde4c667d24..eb6078ca60c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq)
46 cpu_relax(); 46 cpu_relax();
47 47
48 /* Ok, that indicated we're done: double-check carefully. */ 48 /* Ok, that indicated we're done: double-check carefully. */
49 spin_lock_irqsave(&desc->lock, flags); 49 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 50 status = desc->status;
51 spin_unlock_irqrestore(&desc->lock, flags); 51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 52
53 /* Oops, that failed? */ 53 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 54 } while (status & IRQ_INPROGRESS);
@@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
114 if (!desc->chip->set_affinity) 114 if (!desc->chip->set_affinity)
115 return -EINVAL; 115 return -EINVAL;
116 116
117 spin_lock_irqsave(&desc->lock, flags); 117 raw_spin_lock_irqsave(&desc->lock, flags);
118 118
119#ifdef CONFIG_GENERIC_PENDING_IRQ 119#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
@@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
134 } 134 }
135#endif 135#endif
136 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
137 spin_unlock_irqrestore(&desc->lock, flags); 137 raw_spin_unlock_irqrestore(&desc->lock, flags);
138 return 0; 138 return 0;
139} 139}
140 140
@@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq)
181 unsigned long flags; 181 unsigned long flags;
182 int ret; 182 int ret;
183 183
184 spin_lock_irqsave(&desc->lock, flags); 184 raw_spin_lock_irqsave(&desc->lock, flags);
185 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
186 if (!ret) 186 if (!ret)
187 irq_set_thread_affinity(desc); 187 irq_set_thread_affinity(desc);
188 spin_unlock_irqrestore(&desc->lock, flags); 188 raw_spin_unlock_irqrestore(&desc->lock, flags);
189 189
190 return ret; 190 return ret;
191} 191}
@@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc); 233 chip_bus_lock(irq, desc);
234 spin_lock_irqsave(&desc->lock, flags); 234 raw_spin_lock_irqsave(&desc->lock, flags);
235 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
236 spin_unlock_irqrestore(&desc->lock, flags); 236 raw_spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc); 237 chip_bus_sync_unlock(irq, desc);
238} 238}
239EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
@@ -308,9 +308,9 @@ void enable_irq(unsigned int irq)
308 return; 308 return;
309 309
310 chip_bus_lock(irq, desc); 310 chip_bus_lock(irq, desc);
311 spin_lock_irqsave(&desc->lock, flags); 311 raw_spin_lock_irqsave(&desc->lock, flags);
312 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
313 spin_unlock_irqrestore(&desc->lock, flags); 313 raw_spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc); 314 chip_bus_sync_unlock(irq, desc);
315} 315}
316EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
@@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
347 /* wakeup-capable irqs can be shared between drivers that 347 /* wakeup-capable irqs can be shared between drivers that
348 * don't need to have the same sleep mode behaviors. 348 * don't need to have the same sleep mode behaviors.
349 */ 349 */
350 spin_lock_irqsave(&desc->lock, flags); 350 raw_spin_lock_irqsave(&desc->lock, flags);
351 if (on) { 351 if (on) {
352 if (desc->wake_depth++ == 0) { 352 if (desc->wake_depth++ == 0) {
353 ret = set_irq_wake_real(irq, on); 353 ret = set_irq_wake_real(irq, on);
@@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
368 } 368 }
369 } 369 }
370 370
371 spin_unlock_irqrestore(&desc->lock, flags); 371 raw_spin_unlock_irqrestore(&desc->lock, flags);
372 return ret; 372 return ret;
373} 373}
374EXPORT_SYMBOL(set_irq_wake); 374EXPORT_SYMBOL(set_irq_wake);
@@ -484,12 +484,12 @@ static int irq_wait_for_interrupt(struct irqaction *action)
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 485{
486 chip_bus_lock(irq, desc); 486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock); 487 raw_spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 490 desc->chip->unmask(irq);
491 } 491 }
492 spin_unlock_irq(&desc->lock); 492 raw_spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc); 493 chip_bus_sync_unlock(irq, desc);
494} 494}
495 495
@@ -514,9 +514,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
514 return; 514 return;
515 } 515 }
516 516
517 spin_lock_irq(&desc->lock); 517 raw_spin_lock_irq(&desc->lock);
518 cpumask_copy(mask, desc->affinity); 518 cpumask_copy(mask, desc->affinity);
519 spin_unlock_irq(&desc->lock); 519 raw_spin_unlock_irq(&desc->lock);
520 520
521 set_cpus_allowed_ptr(current, mask); 521 set_cpus_allowed_ptr(current, mask);
522 free_cpumask_var(mask); 522 free_cpumask_var(mask);
@@ -545,7 +545,7 @@ static int irq_thread(void *data)
545 545
546 atomic_inc(&desc->threads_active); 546 atomic_inc(&desc->threads_active);
547 547
548 spin_lock_irq(&desc->lock); 548 raw_spin_lock_irq(&desc->lock);
549 if (unlikely(desc->status & IRQ_DISABLED)) { 549 if (unlikely(desc->status & IRQ_DISABLED)) {
550 /* 550 /*
551 * CHECKME: We might need a dedicated 551 * CHECKME: We might need a dedicated
@@ -555,9 +555,9 @@ static int irq_thread(void *data)
555 * retriggers the interrupt itself --- tglx 555 * retriggers the interrupt itself --- tglx
556 */ 556 */
557 desc->status |= IRQ_PENDING; 557 desc->status |= IRQ_PENDING;
558 spin_unlock_irq(&desc->lock); 558 raw_spin_unlock_irq(&desc->lock);
559 } else { 559 } else {
560 spin_unlock_irq(&desc->lock); 560 raw_spin_unlock_irq(&desc->lock);
561 561
562 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563 563
@@ -679,7 +679,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
679 /* 679 /*
680 * The following block of code has to be executed atomically 680 * The following block of code has to be executed atomically
681 */ 681 */
682 spin_lock_irqsave(&desc->lock, flags); 682 raw_spin_lock_irqsave(&desc->lock, flags);
683 old_ptr = &desc->action; 683 old_ptr = &desc->action;
684 old = *old_ptr; 684 old = *old_ptr;
685 if (old) { 685 if (old) {
@@ -775,7 +775,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
775 __enable_irq(desc, irq, false); 775 __enable_irq(desc, irq, false);
776 } 776 }
777 777
778 spin_unlock_irqrestore(&desc->lock, flags); 778 raw_spin_unlock_irqrestore(&desc->lock, flags);
779 779
780 /* 780 /*
781 * Strictly no need to wake it up, but hung_task complains 781 * Strictly no need to wake it up, but hung_task complains
@@ -802,7 +802,7 @@ mismatch:
802 ret = -EBUSY; 802 ret = -EBUSY;
803 803
804out_thread: 804out_thread:
805 spin_unlock_irqrestore(&desc->lock, flags); 805 raw_spin_unlock_irqrestore(&desc->lock, flags);
806 if (new->thread) { 806 if (new->thread) {
807 struct task_struct *t = new->thread; 807 struct task_struct *t = new->thread;
808 808
@@ -844,7 +844,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
844 if (!desc) 844 if (!desc)
845 return NULL; 845 return NULL;
846 846
847 spin_lock_irqsave(&desc->lock, flags); 847 raw_spin_lock_irqsave(&desc->lock, flags);
848 848
849 /* 849 /*
850 * There can be multiple actions per IRQ descriptor, find the right 850 * There can be multiple actions per IRQ descriptor, find the right
@@ -856,7 +856,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
856 856
857 if (!action) { 857 if (!action) {
858 WARN(1, "Trying to free already-free IRQ %d\n", irq); 858 WARN(1, "Trying to free already-free IRQ %d\n", irq);
859 spin_unlock_irqrestore(&desc->lock, flags); 859 raw_spin_unlock_irqrestore(&desc->lock, flags);
860 860
861 return NULL; 861 return NULL;
862 } 862 }
@@ -884,7 +884,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
884 desc->chip->disable(irq); 884 desc->chip->disable(irq);
885 } 885 }
886 886
887 spin_unlock_irqrestore(&desc->lock, flags); 887 raw_spin_unlock_irqrestore(&desc->lock, flags);
888 888
889 unregister_handler_proc(irq, action); 889 unregister_handler_proc(irq, action);
890 890
@@ -1067,7 +1067,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1067 kfree(action); 1067 kfree(action);
1068 1068
1069#ifdef CONFIG_DEBUG_SHIRQ 1069#ifdef CONFIG_DEBUG_SHIRQ
1070 if (irqflags & IRQF_SHARED) { 1070 if (!retval && (irqflags & IRQF_SHARED)) {
1071 /* 1071 /*
1072 * It's a shared IRQ -- the driver ought to be prepared for it 1072 * It's a shared IRQ -- the driver ought to be prepared for it
1073 * to happen immediately, so let's make sure.... 1073 * to happen immediately, so let's make sure....
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index fcb6c96f262..24196228083 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -27,7 +27,7 @@ void move_masked_irq(int irq)
27 if (!desc->chip->set_affinity) 27 if (!desc->chip->set_affinity)
28 return; 28 return;
29 29
30 assert_spin_locked(&desc->lock); 30 assert_raw_spin_locked(&desc->lock);
31 31
32 /* 32 /*
33 * If there was a valid mask to work with, please 33 * If there was a valid mask to work with, please
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 3fd30197da2..26bac9d8f86 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -42,7 +42,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 raw_spin_lock_init(&desc->lock);
46 desc->node = node; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
@@ -67,7 +67,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
67 67
68 irq = old_desc->irq; 68 irq = old_desc->irq;
69 69
70 spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_desc_ptrs[irq];
@@ -91,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 irq_desc_ptrs[irq] = desc;
94 spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
97 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
@@ -100,7 +100,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
100 return desc; 100 return desc;
101 101
102out_unlock: 102out_unlock:
103 spin_unlock_irqrestore(&sparse_irq_lock, flags); 103 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
104 104
105 return desc; 105 return desc;
106} 106}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index a0bb09e7986..0d4005d85b0 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -28,9 +28,9 @@ void suspend_device_irqs(void)
28 for_each_irq_desc(irq, desc) { 28 for_each_irq_desc(irq, desc) {
29 unsigned long flags; 29 unsigned long flags;
30 30
31 spin_lock_irqsave(&desc->lock, flags); 31 raw_spin_lock_irqsave(&desc->lock, flags);
32 __disable_irq(desc, irq, true); 32 __disable_irq(desc, irq, true);
33 spin_unlock_irqrestore(&desc->lock, flags); 33 raw_spin_unlock_irqrestore(&desc->lock, flags);
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
@@ -56,9 +56,9 @@ void resume_device_irqs(void)
56 if (!(desc->status & IRQ_SUSPENDED)) 56 if (!(desc->status & IRQ_SUSPENDED))
57 continue; 57 continue;
58 58
59 spin_lock_irqsave(&desc->lock, flags); 59 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 60 __enable_irq(desc, irq, true);
61 spin_unlock_irqrestore(&desc->lock, flags); 61 raw_spin_unlock_irqrestore(&desc->lock, flags);
62 } 62 }
63} 63}
64EXPORT_SYMBOL_GPL(resume_device_irqs); 64EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0832145fea9..6f50eccc79c 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -179,7 +179,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
179 unsigned long flags; 179 unsigned long flags;
180 int ret = 1; 180 int ret = 1;
181 181
182 spin_lock_irqsave(&desc->lock, flags); 182 raw_spin_lock_irqsave(&desc->lock, flags);
183 for (action = desc->action ; action; action = action->next) { 183 for (action = desc->action ; action; action = action->next) {
184 if ((action != new_action) && action->name && 184 if ((action != new_action) && action->name &&
185 !strcmp(new_action->name, action->name)) { 185 !strcmp(new_action->name, action->name)) {
@@ -187,7 +187,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
187 break; 187 break;
188 } 188 }
189 } 189 }
190 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
191 return ret; 191 return ret;
192} 192}
193 193
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 22b0a6eedf2..89fb90ae534 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
28 struct irqaction *action; 28 struct irqaction *action;
29 int ok = 0, work = 0; 29 int ok = 0, work = 0;
30 30
31 spin_lock(&desc->lock); 31 raw_spin_lock(&desc->lock);
32 /* Already running on another processor */ 32 /* Already running on another processor */
33 if (desc->status & IRQ_INPROGRESS) { 33 if (desc->status & IRQ_INPROGRESS) {
34 /* 34 /*
@@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc)
37 */ 37 */
38 if (desc->action && (desc->action->flags & IRQF_SHARED)) 38 if (desc->action && (desc->action->flags & IRQF_SHARED))
39 desc->status |= IRQ_PENDING; 39 desc->status |= IRQ_PENDING;
40 spin_unlock(&desc->lock); 40 raw_spin_unlock(&desc->lock);
41 return ok; 41 return ok;
42 } 42 }
43 /* Honour the normal IRQ locking */ 43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS; 44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action; 45 action = desc->action;
46 spin_unlock(&desc->lock); 46 raw_spin_unlock(&desc->lock);
47 47
48 while (action) { 48 while (action) {
49 /* Only shared IRQ handlers are safe to call */ 49 /* Only shared IRQ handlers are safe to call */
@@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
56 } 56 }
57 local_irq_disable(); 57 local_irq_disable();
58 /* Now clean up the flags */ 58 /* Now clean up the flags */
59 spin_lock(&desc->lock); 59 raw_spin_lock(&desc->lock);
60 action = desc->action; 60 action = desc->action;
61 61
62 /* 62 /*
@@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc)
68 * Perform real IRQ processing for the IRQ we deferred 68 * Perform real IRQ processing for the IRQ we deferred
69 */ 69 */
70 work = 1; 70 work = 1;
71 spin_unlock(&desc->lock); 71 raw_spin_unlock(&desc->lock);
72 handle_IRQ_event(irq, action); 72 handle_IRQ_event(irq, action);
73 spin_lock(&desc->lock); 73 raw_spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING; 74 desc->status &= ~IRQ_PENDING;
75 } 75 }
76 desc->status &= ~IRQ_INPROGRESS; 76 desc->status &= ~IRQ_INPROGRESS;
@@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
80 */ 80 */
81 if (work && desc->chip && desc->chip->end) 81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq); 82 desc->chip->end(irq);
83 spin_unlock(&desc->lock); 83 raw_spin_unlock(&desc->lock);
84 84
85 return ok; 85 return ok;
86} 86}
@@ -220,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
220 /* 220 /*
221 * If we are seeing only the odd spurious IRQ caused by 221 * If we are seeing only the odd spurious IRQ caused by
222 * bus asynchronicity then don't eventually trigger an error, 222 * bus asynchronicity then don't eventually trigger an error,
223 * otherwise the couter becomes a doomsday timer for otherwise 223 * otherwise the counter becomes a doomsday timer for otherwise
224 * working systems 224 * working systems
225 */ 225 */
226 if (time_after(jiffies, desc->last_unhandled + HZ/10)) 226 if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/kernel/itimer.c b/kernel/itimer.c
index b03451ede52..d802883153d 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -146,6 +146,7 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
146{ 146{
147 cputime_t cval, nval, cinterval, ninterval; 147 cputime_t cval, nval, cinterval, ninterval;
148 s64 ns_ninterval, ns_nval; 148 s64 ns_ninterval, ns_nval;
149 u32 error, incr_error;
149 struct cpu_itimer *it = &tsk->signal->it[clock_id]; 150 struct cpu_itimer *it = &tsk->signal->it[clock_id];
150 151
151 nval = timeval_to_cputime(&value->it_value); 152 nval = timeval_to_cputime(&value->it_value);
@@ -153,8 +154,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
153 ninterval = timeval_to_cputime(&value->it_interval); 154 ninterval = timeval_to_cputime(&value->it_interval);
154 ns_ninterval = timeval_to_ns(&value->it_interval); 155 ns_ninterval = timeval_to_ns(&value->it_interval);
155 156
156 it->incr_error = cputime_sub_ns(ninterval, ns_ninterval); 157 error = cputime_sub_ns(nval, ns_nval);
157 it->error = cputime_sub_ns(nval, ns_nval); 158 incr_error = cputime_sub_ns(ninterval, ns_ninterval);
158 159
159 spin_lock_irq(&tsk->sighand->siglock); 160 spin_lock_irq(&tsk->sighand->siglock);
160 161
@@ -168,6 +169,8 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
168 } 169 }
169 it->expires = nval; 170 it->expires = nval;
170 it->incr = ninterval; 171 it->incr = ninterval;
172 it->error = error;
173 it->incr_error = incr_error;
171 trace_itimer_state(clock_id == CPUCLOCK_VIRT ? 174 trace_itimer_state(clock_id == CPUCLOCK_VIRT ?
172 ITIMER_VIRTUAL : ITIMER_PROF, value, nval); 175 ITIMER_VIRTUAL : ITIMER_PROF, value, nval);
173 176
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f336e2107f9..ef077fb7315 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,7 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <linux/utsrelease.h> 24#include <generated/utsrelease.h>
25#include <linux/utsname.h> 25#include <linux/utsname.h>
26#include <linux/numa.h> 26#include <linux/numa.h>
27#include <linux/suspend.h> 27#include <linux/suspend.h>
@@ -31,6 +31,8 @@
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
34 36
35#include <asm/page.h> 37#include <asm/page.h>
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
@@ -1073,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs)
1073 if (mutex_trylock(&kexec_mutex)) { 1075 if (mutex_trylock(&kexec_mutex)) {
1074 if (kexec_crash_image) { 1076 if (kexec_crash_image) {
1075 struct pt_regs fixed_regs; 1077 struct pt_regs fixed_regs;
1078
1079 kmsg_dump(KMSG_DUMP_KEXEC);
1080
1076 crash_setup_regs(&fixed_regs, regs); 1081 crash_setup_regs(&fixed_regs, regs);
1077 crash_save_vmcoreinfo(); 1082 crash_save_vmcoreinfo();
1078 machine_crash_shutdown(&fixed_regs); 1083 machine_crash_shutdown(&fixed_regs);
@@ -1082,6 +1087,64 @@ void crash_kexec(struct pt_regs *regs)
1082 } 1087 }
1083} 1088}
1084 1089
1090size_t crash_get_memory_size(void)
1091{
1092 size_t size;
1093 mutex_lock(&kexec_mutex);
1094 size = crashk_res.end - crashk_res.start + 1;
1095 mutex_unlock(&kexec_mutex);
1096 return size;
1097}
1098
1099static void free_reserved_phys_range(unsigned long begin, unsigned long end)
1100{
1101 unsigned long addr;
1102
1103 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1104 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
1105 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1106 free_page((unsigned long)__va(addr));
1107 totalram_pages++;
1108 }
1109}
1110
1111int crash_shrink_memory(unsigned long new_size)
1112{
1113 int ret = 0;
1114 unsigned long start, end;
1115
1116 mutex_lock(&kexec_mutex);
1117
1118 if (kexec_crash_image) {
1119 ret = -ENOENT;
1120 goto unlock;
1121 }
1122 start = crashk_res.start;
1123 end = crashk_res.end;
1124
1125 if (new_size >= end - start + 1) {
1126 ret = -EINVAL;
1127 if (new_size == end - start + 1)
1128 ret = 0;
1129 goto unlock;
1130 }
1131
1132 start = roundup(start, PAGE_SIZE);
1133 end = roundup(start + new_size, PAGE_SIZE);
1134
1135 free_reserved_phys_range(end, crashk_res.end);
1136
1137 if (start == end) {
1138 crashk_res.end = end;
1139 release_resource(&crashk_res);
1140 } else
1141 crashk_res.end = end - 1;
1142
1143unlock:
1144 mutex_unlock(&kexec_mutex);
1145 return ret;
1146}
1147
1085static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1148static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1086 size_t data_len) 1149 size_t data_len)
1087{ 1150{
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 3765ff3c1bb..35edbe22e9a 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * A simple kernel FIFO implementation. 2 * A generic kernel FIFO implementation.
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
4 * Copyright (C) 2004 Stelian Pop <stelian@popies.net> 5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
5 * 6 *
6 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -25,50 +26,48 @@
25#include <linux/err.h> 26#include <linux/err.h>
26#include <linux/kfifo.h> 27#include <linux/kfifo.h>
27#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/uaccess.h>
30
31static void _kfifo_init(struct kfifo *fifo, void *buffer,
32 unsigned int size)
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
28 39
29/** 40/**
30 * kfifo_init - allocates a new FIFO using a preallocated buffer 41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
31 * @buffer: the preallocated buffer to be used. 43 * @buffer: the preallocated buffer to be used.
32 * @size: the size of the internal buffer, this have to be a power of 2. 44 * @size: the size of the internal buffer, this has to be a power of 2.
33 * @gfp_mask: get_free_pages mask, passed to kmalloc()
34 * @lock: the lock to be used to protect the fifo buffer
35 * 45 *
36 * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
37 * &struct kfifo with kfree().
38 */ 46 */
39struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
40 gfp_t gfp_mask, spinlock_t *lock)
41{ 48{
42 struct kfifo *fifo;
43
44 /* size must be a power of 2 */ 49 /* size must be a power of 2 */
45 BUG_ON(!is_power_of_2(size)); 50 BUG_ON(!is_power_of_2(size));
46 51
47 fifo = kmalloc(sizeof(struct kfifo), gfp_mask); 52 _kfifo_init(fifo, buffer, size);
48 if (!fifo)
49 return ERR_PTR(-ENOMEM);
50
51 fifo->buffer = buffer;
52 fifo->size = size;
53 fifo->in = fifo->out = 0;
54 fifo->lock = lock;
55
56 return fifo;
57} 53}
58EXPORT_SYMBOL(kfifo_init); 54EXPORT_SYMBOL(kfifo_init);
59 55
60/** 56/**
61 * kfifo_alloc - allocates a new FIFO and its internal buffer 57 * kfifo_alloc - allocates a new FIFO internal buffer
 62 * @size: the size of the internal buffer to be allocated. 58 * @fifo: the fifo to assign the new buffer
 59 * @size: the size of the buffer to be allocated, this has to be a power of 2.
63 * @gfp_mask: get_free_pages mask, passed to kmalloc() 60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
64 * @lock: the lock to be used to protect the fifo buffer 61 *
62 * This function dynamically allocates a new fifo internal buffer
65 * 63 *
66 * The size will be rounded-up to a power of 2. 64 * The size will be rounded-up to a power of 2.
 65 * The buffer will be released with kfifo_free().
 66 * Return 0 if no error, otherwise an error code
67 */ 67 */
68struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) 68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 69{
70 unsigned char *buffer; 70 unsigned char *buffer;
71 struct kfifo *ret;
72 71
73 /* 72 /*
74 * round up to the next power of 2, since our 'let the indices 73 * round up to the next power of 2, since our 'let the indices
@@ -80,48 +79,93 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
80 } 79 }
81 80
82 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
83 if (!buffer) 82 if (!buffer) {
84 return ERR_PTR(-ENOMEM); 83 _kfifo_init(fifo, NULL, 0);
85 84 return -ENOMEM;
86 ret = kfifo_init(buffer, size, gfp_mask, lock); 85 }
87 86
88 if (IS_ERR(ret)) 87 _kfifo_init(fifo, buffer, size);
89 kfree(buffer);
90 88
91 return ret; 89 return 0;
92} 90}
93EXPORT_SYMBOL(kfifo_alloc); 91EXPORT_SYMBOL(kfifo_alloc);
94 92
95/** 93/**
96 * kfifo_free - frees the FIFO 94 * kfifo_free - frees the FIFO internal buffer
97 * @fifo: the fifo to be freed. 95 * @fifo: the fifo to be freed.
98 */ 96 */
99void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
100{ 98{
101 kfree(fifo->buffer); 99 kfree(fifo->buffer);
102 kfree(fifo); 100 _kfifo_init(fifo, NULL, 0);
103} 101}
104EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
105 103
106/** 104/**
107 * __kfifo_put - puts some data into the FIFO, no locking version 105 * kfifo_skip - skip output data
108 * @fifo: the fifo to be used. 106 * @fifo: the fifo to be used.
109 * @buffer: the data to be added. 107 * @len: number of bytes to skip
110 * @len: the length of the data to be added.
111 *
112 * This function copies at most @len bytes from the @buffer into
113 * the FIFO depending on the free space, and returns the number of
114 * bytes copied.
115 *
116 * Note that with only one concurrent reader and one concurrent
117 * writer, you don't need extra locking to use these functions.
118 */ 108 */
119unsigned int __kfifo_put(struct kfifo *fifo, 109void kfifo_skip(struct kfifo *fifo, unsigned int len)
120 const unsigned char *buffer, unsigned int len) 110{
111 if (len < kfifo_len(fifo)) {
112 __kfifo_add_out(fifo, len);
113 return;
114 }
115 kfifo_reset_out(fifo);
116}
117EXPORT_SYMBOL(kfifo_skip);
118
119static inline void __kfifo_in_data(struct kfifo *fifo,
120 const void *from, unsigned int len, unsigned int off)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
124 len = min(len, fifo->size - fifo->in + fifo->out); 124 /*
125 * Ensure that we sample the fifo->out index -before- we
126 * start putting bytes into the kfifo.
127 */
128
129 smp_mb();
130
131 off = __kfifo_off(fifo, fifo->in + off);
132
133 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + off, from, l);
136
137 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, from + l, len - l);
139}
140
141static inline void __kfifo_out_data(struct kfifo *fifo,
142 void *to, unsigned int len, unsigned int off)
143{
144 unsigned int l;
145
146 /*
147 * Ensure that we sample the fifo->in index -before- we
148 * start removing bytes from the kfifo.
149 */
150
151 smp_rmb();
152
153 off = __kfifo_off(fifo, fifo->out + off);
154
155 /* first get the data from fifo->out until the end of the buffer */
156 l = min(len, fifo->size - off);
157 memcpy(to, fifo->buffer + off, l);
158
159 /* then get the rest (if any) from the beginning of the buffer */
160 memcpy(to + l, fifo->buffer, len - l);
161}
162
163static inline int __kfifo_from_user_data(struct kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout)
166{
167 unsigned int l;
168 int ret;
125 169
126 /* 170 /*
127 * Ensure that we sample the fifo->out index -before- we 171 * Ensure that we sample the fifo->out index -before- we
@@ -130,68 +174,272 @@ unsigned int __kfifo_put(struct kfifo *fifo,
130 174
131 smp_mb(); 175 smp_mb();
132 176
177 off = __kfifo_off(fifo, fifo->in + off);
178
133 /* first put the data starting from fifo->in to buffer end */ 179 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); 180 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); 181 ret = copy_from_user(fifo->buffer + off, from, l);
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
136 187
137 /* then put the rest (if any) at the beginning of the buffer */ 188 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, buffer + l, len - l); 189 ret = copy_from_user(fifo->buffer, from + l, len - l);
190 *lenout += ret ? ret : len - l;
191 return ret ? -EFAULT : 0;
192}
193
194static inline int __kfifo_to_user_data(struct kfifo *fifo,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout)
196{
197 unsigned int l;
198 int ret;
139 199
140 /* 200 /*
141 * Ensure that we add the bytes to the kfifo -before- 201 * Ensure that we sample the fifo->in index -before- we
142 * we update the fifo->in index. 202 * start removing bytes from the kfifo.
143 */ 203 */
144 204
145 smp_wmb(); 205 smp_rmb();
206
207 off = __kfifo_off(fifo, fifo->out + off);
208
209 /* first get the data from fifo->out until the end of the buffer */
210 l = min(len, fifo->size - off);
211 ret = copy_to_user(to, fifo->buffer + off, l);
212 *lenout = l;
213 if (unlikely(ret)) {
214 *lenout -= ret;
215 return -EFAULT;
216 }
217
218 /* then get the rest (if any) from the beginning of the buffer */
219 len -= l;
220 ret = copy_to_user(to + l, fifo->buffer, len);
221 if (unlikely(ret)) {
222 *lenout += len - ret;
223 return -EFAULT;
224 }
225 *lenout += len;
226 return 0;
227}
228
229unsigned int __kfifo_in_n(struct kfifo *fifo,
230 const void *from, unsigned int len, unsigned int recsize)
231{
232 if (kfifo_avail(fifo) < len + recsize)
233 return len + 1;
234
235 __kfifo_in_data(fifo, from, len, recsize);
236 return 0;
237}
238EXPORT_SYMBOL(__kfifo_in_n);
146 239
147 fifo->in += len; 240/**
241 * kfifo_in - puts some data into the FIFO
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{
256 len = min(kfifo_avail(fifo), len);
148 257
258 __kfifo_in_data(fifo, from, len, 0);
259 __kfifo_add_in(fifo, len);
149 return len; 260 return len;
150} 261}
151EXPORT_SYMBOL(__kfifo_put); 262EXPORT_SYMBOL(kfifo_in);
263
264unsigned int __kfifo_in_generic(struct kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize)
266{
267 return __kfifo_in_rec(fifo, from, len, recsize);
268}
269EXPORT_SYMBOL(__kfifo_in_generic);
270
271unsigned int __kfifo_out_n(struct kfifo *fifo,
272 void *to, unsigned int len, unsigned int recsize)
273{
274 if (kfifo_len(fifo) < len + recsize)
275 return len;
276
277 __kfifo_out_data(fifo, to, len, recsize);
278 __kfifo_add_out(fifo, len + recsize);
279 return 0;
280}
281EXPORT_SYMBOL(__kfifo_out_n);
152 282
153/** 283/**
154 * __kfifo_get - gets some data from the FIFO, no locking version 284 * kfifo_out - gets some data from the FIFO
155 * @fifo: the fifo to be used. 285 * @fifo: the fifo to be used.
156 * @buffer: where the data must be copied. 286 * @to: where the data must be copied.
157 * @len: the size of the destination buffer. 287 * @len: the size of the destination buffer.
158 * 288 *
159 * This function copies at most @len bytes from the FIFO into the 289 * This function copies at most @len bytes from the FIFO into the
160 * @buffer and returns the number of copied bytes. 290 * @to buffer and returns the number of copied bytes.
161 * 291 *
162 * Note that with only one concurrent reader and one concurrent 292 * Note that with only one concurrent reader and one concurrent
163 * writer, you don't need extra locking to use these functions. 293 * writer, you don't need extra locking to use these functions.
164 */ 294 */
165unsigned int __kfifo_get(struct kfifo *fifo, 295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len)
166 unsigned char *buffer, unsigned int len)
167{ 296{
168 unsigned int l; 297 len = min(kfifo_len(fifo), len);
169 298
170 len = min(len, fifo->in - fifo->out); 299 __kfifo_out_data(fifo, to, len, 0);
300 __kfifo_add_out(fifo, len);
171 301
172 /* 302 return len;
173 * Ensure that we sample the fifo->in index -before- we 303}
174 * start removing bytes from the kfifo. 304EXPORT_SYMBOL(kfifo_out);
175 */
176 305
177 smp_rmb(); 306/**
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it
308 * @fifo: the fifo to be used.
309 * @to: where the data must be copied.
310 * @len: the size of the destination buffer.
311 * @offset: offset into the fifo
312 *
313 * This function copies at most @len bytes at @offset from the FIFO
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len,
318 unsigned offset)
319{
320 len = min(kfifo_len(fifo), len + offset);
178 321
179 /* first get the data from fifo->out until the end of the buffer */ 322 __kfifo_out_data(fifo, to, len, offset);
180 l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); 323 return len;
181 memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); 324}
325EXPORT_SYMBOL(kfifo_out_peek);
182 326
183 /* then get the rest (if any) from the beginning of the buffer */ 327unsigned int __kfifo_out_generic(struct kfifo *fifo,
184 memcpy(buffer + l, fifo->buffer, len - l); 328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{
331 return __kfifo_out_rec(fifo, to, len, recsize, total);
332}
333EXPORT_SYMBOL(__kfifo_out_generic);
185 334
186 /* 335unsigned int __kfifo_from_user_n(struct kfifo *fifo,
187 * Ensure that we remove the bytes from the kfifo -before- 336 const void __user *from, unsigned int len, unsigned int recsize)
188 * we update the fifo->out index. 337{
189 */ 338 unsigned total;
190 339
191 smp_mb(); 340 if (kfifo_avail(fifo) < len + recsize)
341 return len + 1;
192 342
193 fifo->out += len; 343 __kfifo_from_user_data(fifo, from, len, recsize, &total);
344 return total;
345}
346EXPORT_SYMBOL(__kfifo_from_user_n);
194 347
195 return len; 348/**
349 * kfifo_from_user - puts some data from user space into the FIFO
350 * @fifo: the fifo to be used.
351 * @from: pointer to the data to be added.
352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
354 *
355 * This function copies at most @len bytes from the @from into the
356 * FIFO depending and returns -EFAULT/0.
357 *
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
196} 371}
197EXPORT_SYMBOL(__kfifo_get); 372EXPORT_SYMBOL(kfifo_from_user);
373
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
375 const void __user *from, unsigned int len, unsigned int recsize)
376{
377 return __kfifo_from_user_rec(fifo, from, len, recsize);
378}
379EXPORT_SYMBOL(__kfifo_from_user_generic);
380
381unsigned int __kfifo_to_user_n(struct kfifo *fifo,
382 void __user *to, unsigned int len, unsigned int reclen,
383 unsigned int recsize)
384{
385 unsigned int ret, total;
386
387 if (kfifo_len(fifo) < reclen + recsize)
388 return len;
389
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total);
391
392 if (likely(ret == 0))
393 __kfifo_add_out(fifo, reclen + recsize);
394
395 return total;
396}
397EXPORT_SYMBOL(__kfifo_to_user_n);
398
399/**
400 * kfifo_to_user - gets data from the FIFO and write it to user space
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{
415 int ret;
416 len = min(kfifo_len(fifo), len);
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout);
418 __kfifo_add_out(fifo, *lenout);
419 return ret;
420}
421EXPORT_SYMBOL(kfifo_to_user);
422
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
424 void __user *to, unsigned int len, unsigned int recsize,
425 unsigned int *total)
426{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total);
428}
429EXPORT_SYMBOL(__kfifo_to_user_generic);
430
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize)
432{
433 if (recsize == 0)
434 return kfifo_avail(fifo);
435
436 return __kfifo_peek_n(fifo, recsize);
437}
438EXPORT_SYMBOL(__kfifo_peek_generic);
439
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize)
441{
442 __kfifo_skip_rec(fifo, recsize);
443}
444EXPORT_SYMBOL(__kfifo_skip_generic);
445
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 7d701463402..761fdd2b303 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread; 129struct task_struct *kgdb_contthread;
130 130
131int kgdb_single_step; 131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
132 133
133/* Our I/O buffers. */ 134/* Our I/O buffers. */
134static char remcom_in_buffer[BUFMAX]; 135static char remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
541 */ 542 */
542 if (tid == 0 || tid == -1) 543 if (tid == 0 || tid == -1)
543 tid = -atomic_read(&kgdb_active) - 2; 544 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) { 545 if (tid < -1 && tid > -NR_CPUS - 2) {
545 if (kgdb_info[-tid - 2].task) 546 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task; 547 return kgdb_info[-tid - 2].task;
547 else 548 else
548 return idle_task(-tid - 2); 549 return idle_task(-tid - 2);
549 } 550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
550 556
551 /* 557 /*
552 * find_task_by_pid_ns() does not take the tasklist lock anymore 558 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -577,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs)
577 smp_wmb(); 583 smp_wmb();
578 atomic_set(&cpu_in_kgdb[cpu], 1); 584 atomic_set(&cpu_in_kgdb[cpu], 1);
579 585
586 /* Disable any cpu specific hw breakpoints */
587 kgdb_disable_hw_debug(regs);
588
580 /* Wait till primary CPU is done with debugging */ 589 /* Wait till primary CPU is done with debugging */
581 while (atomic_read(&passive_cpu_wait[cpu])) 590 while (atomic_read(&passive_cpu_wait[cpu]))
582 cpu_relax(); 591 cpu_relax();
@@ -590,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs)
590 599
591 /* Signal the primary CPU that we are done: */ 600 /* Signal the primary CPU that we are done: */
592 atomic_set(&cpu_in_kgdb[cpu], 0); 601 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog(); 602 touch_softlockup_watchdog_sync();
594 clocksource_touch_watchdog(); 603 clocksource_touch_watchdog();
595 local_irq_restore(flags); 604 local_irq_restore(flags);
596} 605}
@@ -619,7 +628,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
619static int kgdb_activate_sw_breakpoints(void) 628static int kgdb_activate_sw_breakpoints(void)
620{ 629{
621 unsigned long addr; 630 unsigned long addr;
622 int error = 0; 631 int error;
632 int ret = 0;
623 int i; 633 int i;
624 634
625 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 635 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +639,16 @@ static int kgdb_activate_sw_breakpoints(void)
629 addr = kgdb_break[i].bpt_addr; 639 addr = kgdb_break[i].bpt_addr;
630 error = kgdb_arch_set_breakpoint(addr, 640 error = kgdb_arch_set_breakpoint(addr,
631 kgdb_break[i].saved_instr); 641 kgdb_break[i].saved_instr);
632 if (error) 642 if (error) {
633 return error; 643 ret = error;
644 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
645 continue;
646 }
634 647
635 kgdb_flush_swbreak_addr(addr); 648 kgdb_flush_swbreak_addr(addr);
636 kgdb_break[i].state = BP_ACTIVE; 649 kgdb_break[i].state = BP_ACTIVE;
637 } 650 }
638 return 0; 651 return ret;
639} 652}
640 653
641static int kgdb_set_sw_break(unsigned long addr) 654static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +695,8 @@ static int kgdb_set_sw_break(unsigned long addr)
682static int kgdb_deactivate_sw_breakpoints(void) 695static int kgdb_deactivate_sw_breakpoints(void)
683{ 696{
684 unsigned long addr; 697 unsigned long addr;
685 int error = 0; 698 int error;
699 int ret = 0;
686 int i; 700 int i;
687 701
688 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 702 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +705,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
691 addr = kgdb_break[i].bpt_addr; 705 addr = kgdb_break[i].bpt_addr;
692 error = kgdb_arch_remove_breakpoint(addr, 706 error = kgdb_arch_remove_breakpoint(addr,
693 kgdb_break[i].saved_instr); 707 kgdb_break[i].saved_instr);
694 if (error) 708 if (error) {
695 return error; 709 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
710 ret = error;
711 }
696 712
697 kgdb_flush_swbreak_addr(addr); 713 kgdb_flush_swbreak_addr(addr);
698 kgdb_break[i].state = BP_SET; 714 kgdb_break[i].state = BP_SET;
699 } 715 }
700 return 0; 716 return ret;
701} 717}
702 718
703static int kgdb_remove_sw_break(unsigned long addr) 719static int kgdb_remove_sw_break(unsigned long addr)
@@ -1204,8 +1220,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1204 return 1; 1220 return 1;
1205 1221
1206 } else { 1222 } else {
1207 error_packet(remcom_out_buffer, -EINVAL); 1223 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1208 return 0; 1224 " and 15 (pass and disconnect)\n"
1225 "Executing a continue without signal passing\n", 0);
1226 remcom_in_buffer[0] = 'c';
1209 } 1227 }
1210 1228
1211 /* Indicate fall through */ 1229 /* Indicate fall through */
@@ -1395,6 +1413,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1395 struct kgdb_state kgdb_var; 1413 struct kgdb_state kgdb_var;
1396 struct kgdb_state *ks = &kgdb_var; 1414 struct kgdb_state *ks = &kgdb_var;
1397 unsigned long flags; 1415 unsigned long flags;
1416 int sstep_tries = 100;
1398 int error = 0; 1417 int error = 0;
1399 int i, cpu; 1418 int i, cpu;
1400 1419
@@ -1425,15 +1444,16 @@ acquirelock:
1425 cpu_relax(); 1444 cpu_relax();
1426 1445
1427 /* 1446 /*
1428 * Do not start the debugger connection on this CPU if the last 1447 * For single stepping, try to only enter on the processor
1429 * instance of the exception handler wanted to come into the 1448 * that was single stepping. To gaurd against a deadlock, the
1430 * debugger on a different CPU via a single step 1449 * kernel will only try for the value of sstep_tries before
1450 * giving up and continuing on.
1431 */ 1451 */
1432 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && 1452 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1453 (kgdb_info[cpu].task &&
1434 1454 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1435 atomic_set(&kgdb_active, -1); 1455 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog(); 1456 touch_softlockup_watchdog_sync();
1437 clocksource_touch_watchdog(); 1457 clocksource_touch_watchdog();
1438 local_irq_restore(flags); 1458 local_irq_restore(flags);
1439 1459
@@ -1524,9 +1544,16 @@ acquirelock:
1524 } 1544 }
1525 1545
1526kgdb_restore: 1546kgdb_restore:
1547 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1548 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1549 if (kgdb_info[sstep_cpu].task)
1550 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1551 else
1552 kgdb_sstep_pid = 0;
1553 }
1527 /* Free kgdb_active */ 1554 /* Free kgdb_active */
1528 atomic_set(&kgdb_active, -1); 1555 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog(); 1556 touch_softlockup_watchdog_sync();
1530 clocksource_touch_watchdog(); 1557 clocksource_touch_watchdog();
1531 local_irq_restore(flags); 1558 local_irq_restore(flags);
1532 1559
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 25b10319036..bf0e231d970 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
520 return -ENOMEM; 520 return -ENOMEM;
521 521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp); 522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) 523 if (ret < 0) {
524 goto out; 524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
525 527
526 return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); 528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
527 531
528 out:
529 call_usermodehelper_freeinfo(sub_info);
530 return ret; 532 return ret;
531} 533}
532EXPORT_SYMBOL(call_usermodehelper_pipe); 534EXPORT_SYMBOL(call_usermodehelper_pipe);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5342a344c4..b7df302a020 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1035,7 +1035,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1035 /* Pre-allocate memory for max kretprobe instances */ 1035 /* Pre-allocate memory for max kretprobe instances */
1036 if (rp->maxactive <= 0) { 1036 if (rp->maxactive <= 0) {
1037#ifdef CONFIG_PREEMPT 1037#ifdef CONFIG_PREEMPT
1038 rp->maxactive = max(10, 2 * num_possible_cpus()); 1038 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
1039#else 1039#else
1040 rp->maxactive = num_possible_cpus(); 1040 rp->maxactive = num_possible_cpus();
1041#endif 1041#endif
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 528dd78e7e7..3feaf5a7451 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
100} 100}
101KERNEL_ATTR_RO(kexec_crash_loaded); 101KERNEL_ATTR_RO(kexec_crash_loaded);
102 102
103static ssize_t kexec_crash_size_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf)
105{
106 return sprintf(buf, "%zu\n", crash_get_memory_size());
107}
108static ssize_t kexec_crash_size_store(struct kobject *kobj,
109 struct kobj_attribute *attr,
110 const char *buf, size_t count)
111{
112 unsigned long cnt;
113 int ret;
114
115 if (strict_strtoul(buf, 0, &cnt))
116 return -EINVAL;
117
118 ret = crash_shrink_memory(cnt);
119 return ret < 0 ? ret : count;
120}
121KERNEL_ATTR_RW(kexec_crash_size);
122
103static ssize_t vmcoreinfo_show(struct kobject *kobj, 123static ssize_t vmcoreinfo_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf) 124 struct kobj_attribute *attr, char *buf)
105{ 125{
@@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = {
147#ifdef CONFIG_KEXEC 167#ifdef CONFIG_KEXEC
148 &kexec_loaded_attr.attr, 168 &kexec_loaded_attr.attr,
149 &kexec_crash_loaded_attr.attr, 169 &kexec_crash_loaded_attr.attr,
170 &kexec_crash_size_attr.attr,
150 &vmcoreinfo_attr.attr, 171 &vmcoreinfo_attr.attr,
151#endif 172#endif
152 NULL 173 NULL
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ab7ae57773e..fbb6222fe7e 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -150,6 +150,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
150EXPORT_SYMBOL(kthread_create); 150EXPORT_SYMBOL(kthread_create);
151 151
152/** 152/**
153 * kthread_bind - bind a just-created kthread to a cpu.
154 * @p: thread created by kthread_create().
155 * @cpu: cpu (might not be online, must be possible) for @k to run on.
156 *
157 * Description: This function is equivalent to set_cpus_allowed(),
158 * except that @cpu doesn't need to be online, and the thread must be
159 * stopped (i.e., just returned from kthread_create()).
160 */
161void kthread_bind(struct task_struct *p, unsigned int cpu)
162{
163 /* Must have done schedule() in kthread() before we set_task_cpu */
164 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
165 WARN_ON(1);
166 return;
167 }
168
169 p->cpus_allowed = cpumask_of_cpu(cpu);
170 p->rt.nr_cpus_allowed = 1;
171 p->flags |= PF_THREAD_BOUND;
172}
173EXPORT_SYMBOL(kthread_bind);
174
175/**
153 * kthread_stop - stop a thread created by kthread_create(). 176 * kthread_stop - stop a thread created by kthread_create().
154 * @k: thread created by kthread_create(). 177 * @k: thread created by kthread_create().
155 * 178 *
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f5dcd36d315..c62ec14609b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -73,11 +73,11 @@ module_param(lock_stat, int, 0644);
73 * to use a raw spinlock - we really dont want the spinlock 73 * to use a raw spinlock - we really dont want the spinlock
74 * code to recurse back into the lockdep code... 74 * code to recurse back into the lockdep code...
75 */ 75 */
76static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 76static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
77 77
78static int graph_lock(void) 78static int graph_lock(void)
79{ 79{
80 __raw_spin_lock(&lockdep_lock); 80 arch_spin_lock(&lockdep_lock);
81 /* 81 /*
82 * Make sure that if another CPU detected a bug while 82 * Make sure that if another CPU detected a bug while
83 * walking the graph we dont change it (while the other 83 * walking the graph we dont change it (while the other
@@ -85,7 +85,7 @@ static int graph_lock(void)
85 * dropped already) 85 * dropped already)
86 */ 86 */
87 if (!debug_locks) { 87 if (!debug_locks) {
88 __raw_spin_unlock(&lockdep_lock); 88 arch_spin_unlock(&lockdep_lock);
89 return 0; 89 return 0;
90 } 90 }
91 /* prevent any recursions within lockdep from causing deadlocks */ 91 /* prevent any recursions within lockdep from causing deadlocks */
@@ -95,11 +95,11 @@ static int graph_lock(void)
95 95
96static inline int graph_unlock(void) 96static inline int graph_unlock(void)
97{ 97{
98 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) 98 if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
99 return DEBUG_LOCKS_WARN_ON(1); 99 return DEBUG_LOCKS_WARN_ON(1);
100 100
101 current->lockdep_recursion--; 101 current->lockdep_recursion--;
102 __raw_spin_unlock(&lockdep_lock); 102 arch_spin_unlock(&lockdep_lock);
103 return 0; 103 return 0;
104} 104}
105 105
@@ -111,7 +111,7 @@ static inline int debug_locks_off_graph_unlock(void)
111{ 111{
112 int ret = debug_locks_off(); 112 int ret = debug_locks_off();
113 113
114 __raw_spin_unlock(&lockdep_lock); 114 arch_spin_unlock(&lockdep_lock);
115 115
116 return ret; 116 return ret;
117} 117}
@@ -140,7 +140,8 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
140} 140}
141 141
142#ifdef CONFIG_LOCK_STAT 142#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
144 cpu_lock_stats);
144 145
145static inline u64 lockstat_clock(void) 146static inline u64 lockstat_clock(void)
146{ 147{
@@ -168,7 +169,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
168 if (time > lt->max) 169 if (time > lt->max)
169 lt->max = time; 170 lt->max = time;
170 171
171 if (time < lt->min || !lt->min) 172 if (time < lt->min || !lt->nr)
172 lt->min = time; 173 lt->min = time;
173 174
174 lt->total += time; 175 lt->total += time;
@@ -177,8 +178,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
177 178
178static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) 179static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
179{ 180{
180 dst->min += src->min; 181 if (!src->nr)
181 dst->max += src->max; 182 return;
183
184 if (src->max > dst->max)
185 dst->max = src->max;
186
187 if (src->min < dst->min || !dst->nr)
188 dst->min = src->min;
189
182 dst->total += src->total; 190 dst->total += src->total;
183 dst->nr += src->nr; 191 dst->nr += src->nr;
184} 192}
@@ -191,7 +199,7 @@ struct lock_class_stats lock_stats(struct lock_class *class)
191 memset(&stats, 0, sizeof(struct lock_class_stats)); 199 memset(&stats, 0, sizeof(struct lock_class_stats));
192 for_each_possible_cpu(cpu) { 200 for_each_possible_cpu(cpu) {
193 struct lock_class_stats *pcs = 201 struct lock_class_stats *pcs =
194 &per_cpu(lock_stats, cpu)[class - lock_classes]; 202 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
195 203
196 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 204 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
197 stats.contention_point[i] += pcs->contention_point[i]; 205 stats.contention_point[i] += pcs->contention_point[i];
@@ -218,7 +226,7 @@ void clear_lock_stats(struct lock_class *class)
218 226
219 for_each_possible_cpu(cpu) { 227 for_each_possible_cpu(cpu) {
220 struct lock_class_stats *cpu_stats = 228 struct lock_class_stats *cpu_stats =
221 &per_cpu(lock_stats, cpu)[class - lock_classes]; 229 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
222 230
223 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 231 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
224 } 232 }
@@ -228,12 +236,12 @@ void clear_lock_stats(struct lock_class *class)
228 236
229static struct lock_class_stats *get_lock_stats(struct lock_class *class) 237static struct lock_class_stats *get_lock_stats(struct lock_class *class)
230{ 238{
231 return &get_cpu_var(lock_stats)[class - lock_classes]; 239 return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
232} 240}
233 241
234static void put_lock_stats(struct lock_class_stats *stats) 242static void put_lock_stats(struct lock_class_stats *stats)
235{ 243{
236 put_cpu_var(lock_stats); 244 put_cpu_var(cpu_lock_stats);
237} 245}
238 246
239static void lock_release_holdtime(struct held_lock *hlock) 247static void lock_release_holdtime(struct held_lock *hlock)
@@ -379,7 +387,8 @@ static int save_trace(struct stack_trace *trace)
379 * complete trace that maxes out the entries provided will be reported 387 * complete trace that maxes out the entries provided will be reported
380 * as incomplete, friggin useless </rant> 388 * as incomplete, friggin useless </rant>
381 */ 389 */
382 if (trace->entries[trace->nr_entries-1] == ULONG_MAX) 390 if (trace->nr_entries != 0 &&
391 trace->entries[trace->nr_entries-1] == ULONG_MAX)
383 trace->nr_entries--; 392 trace->nr_entries--;
384 393
385 trace->max_entries = trace->nr_entries; 394 trace->max_entries = trace->nr_entries;
@@ -1161,9 +1170,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
1161 this.class = class; 1170 this.class = class;
1162 1171
1163 local_irq_save(flags); 1172 local_irq_save(flags);
1164 __raw_spin_lock(&lockdep_lock); 1173 arch_spin_lock(&lockdep_lock);
1165 ret = __lockdep_count_forward_deps(&this); 1174 ret = __lockdep_count_forward_deps(&this);
1166 __raw_spin_unlock(&lockdep_lock); 1175 arch_spin_unlock(&lockdep_lock);
1167 local_irq_restore(flags); 1176 local_irq_restore(flags);
1168 1177
1169 return ret; 1178 return ret;
@@ -1188,9 +1197,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1188 this.class = class; 1197 this.class = class;
1189 1198
1190 local_irq_save(flags); 1199 local_irq_save(flags);
1191 __raw_spin_lock(&lockdep_lock); 1200 arch_spin_lock(&lockdep_lock);
1192 ret = __lockdep_count_backward_deps(&this); 1201 ret = __lockdep_count_backward_deps(&this);
1193 __raw_spin_unlock(&lockdep_lock); 1202 arch_spin_unlock(&lockdep_lock);
1194 local_irq_restore(flags); 1203 local_irq_restore(flags);
1195 1204
1196 return ret; 1205 return ret;
@@ -2138,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2138 return ret; 2147 return ret;
2139 2148
2140 return print_irq_inversion_bug(curr, &root, target_entry, 2149 return print_irq_inversion_bug(curr, &root, target_entry,
2141 this, 1, irqclass); 2150 this, 0, irqclass);
2142} 2151}
2143 2152
2144void print_irqtrace_events(struct task_struct *curr) 2153void print_irqtrace_events(struct task_struct *curr)
diff --git a/kernel/module.c b/kernel/module.c
index 5842a71cf05..f82386bd9ee 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,8 +370,6 @@ EXPORT_SYMBOL_GPL(find_module);
370 370
371#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
372 372
373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
374
375static void *percpu_modalloc(unsigned long size, unsigned long align, 373static void *percpu_modalloc(unsigned long size, unsigned long align,
376 const char *name) 374 const char *name)
377{ 375{
@@ -395,154 +393,6 @@ static void percpu_modfree(void *freeme)
395 free_percpu(freeme); 393 free_percpu(freeme);
396} 394}
397 395
398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
399
400/* Number of blocks used and allocated. */
401static unsigned int pcpu_num_used, pcpu_num_allocated;
402/* Size of each block. -ve means used. */
403static int *pcpu_size;
404
405static int split_block(unsigned int i, unsigned short size)
406{
407 /* Reallocation required? */
408 if (pcpu_num_used + 1 > pcpu_num_allocated) {
409 int *new;
410
411 new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
412 GFP_KERNEL);
413 if (!new)
414 return 0;
415
416 pcpu_num_allocated *= 2;
417 pcpu_size = new;
418 }
419
420 /* Insert a new subblock */
421 memmove(&pcpu_size[i+1], &pcpu_size[i],
422 sizeof(pcpu_size[0]) * (pcpu_num_used - i));
423 pcpu_num_used++;
424
425 pcpu_size[i+1] -= size;
426 pcpu_size[i] = size;
427 return 1;
428}
429
430static inline unsigned int block_size(int val)
431{
432 if (val < 0)
433 return -val;
434 return val;
435}
436
437static void *percpu_modalloc(unsigned long size, unsigned long align,
438 const char *name)
439{
440 unsigned long extra;
441 unsigned int i;
442 void *ptr;
443 int cpu;
444
445 if (align > PAGE_SIZE) {
446 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
447 name, align, PAGE_SIZE);
448 align = PAGE_SIZE;
449 }
450
451 ptr = __per_cpu_start;
452 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
453 /* Extra for alignment requirement. */
454 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
455 BUG_ON(i == 0 && extra != 0);
456
457 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
458 continue;
459
460 /* Transfer extra to previous block. */
461 if (pcpu_size[i-1] < 0)
462 pcpu_size[i-1] -= extra;
463 else
464 pcpu_size[i-1] += extra;
465 pcpu_size[i] -= extra;
466 ptr += extra;
467
468 /* Split block if warranted */
469 if (pcpu_size[i] - size > sizeof(unsigned long))
470 if (!split_block(i, size))
471 return NULL;
472
473 /* add the per-cpu scanning areas */
474 for_each_possible_cpu(cpu)
475 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
476 GFP_KERNEL);
477
478 /* Mark allocated */
479 pcpu_size[i] = -pcpu_size[i];
480 return ptr;
481 }
482
483 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
484 size);
485 return NULL;
486}
487
488static void percpu_modfree(void *freeme)
489{
490 unsigned int i;
491 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
492 int cpu;
493
494 /* First entry is core kernel percpu data. */
495 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
496 if (ptr == freeme) {
497 pcpu_size[i] = -pcpu_size[i];
498 goto free;
499 }
500 }
501 BUG();
502
503 free:
504 /* remove the per-cpu scanning areas */
505 for_each_possible_cpu(cpu)
506 kmemleak_free(freeme + per_cpu_offset(cpu));
507
508 /* Merge with previous? */
509 if (pcpu_size[i-1] >= 0) {
510 pcpu_size[i-1] += pcpu_size[i];
511 pcpu_num_used--;
512 memmove(&pcpu_size[i], &pcpu_size[i+1],
513 (pcpu_num_used - i) * sizeof(pcpu_size[0]));
514 i--;
515 }
516 /* Merge with next? */
517 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
518 pcpu_size[i] += pcpu_size[i+1];
519 pcpu_num_used--;
520 memmove(&pcpu_size[i+1], &pcpu_size[i+2],
521 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
522 }
523}
524
525static int percpu_modinit(void)
526{
527 pcpu_num_used = 2;
528 pcpu_num_allocated = 2;
529 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
530 GFP_KERNEL);
531 /* Static in-kernel percpu data (used). */
532 pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
533 /* Free room. */
534 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
535 if (pcpu_size[1] < 0) {
536 printk(KERN_ERR "No per-cpu room for modules.\n");
537 pcpu_num_used = 1;
538 }
539
540 return 0;
541}
542__initcall(percpu_modinit);
543
544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
545
546static unsigned int find_pcpusec(Elf_Ehdr *hdr, 396static unsigned int find_pcpusec(Elf_Ehdr *hdr,
547 Elf_Shdr *sechdrs, 397 Elf_Shdr *sechdrs,
548 const char *secstrings) 398 const char *secstrings)
@@ -1030,11 +880,23 @@ static int try_to_force_load(struct module *mod, const char *reason)
1030} 880}
1031 881
1032#ifdef CONFIG_MODVERSIONS 882#ifdef CONFIG_MODVERSIONS
883/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */
884static unsigned long maybe_relocated(unsigned long crc,
885 const struct module *crc_owner)
886{
887#ifdef ARCH_RELOCATES_KCRCTAB
888 if (crc_owner == NULL)
889 return crc - (unsigned long)reloc_start;
890#endif
891 return crc;
892}
893
1033static int check_version(Elf_Shdr *sechdrs, 894static int check_version(Elf_Shdr *sechdrs,
1034 unsigned int versindex, 895 unsigned int versindex,
1035 const char *symname, 896 const char *symname,
1036 struct module *mod, 897 struct module *mod,
1037 const unsigned long *crc) 898 const unsigned long *crc,
899 const struct module *crc_owner)
1038{ 900{
1039 unsigned int i, num_versions; 901 unsigned int i, num_versions;
1040 struct modversion_info *versions; 902 struct modversion_info *versions;
@@ -1055,10 +917,10 @@ static int check_version(Elf_Shdr *sechdrs,
1055 if (strcmp(versions[i].name, symname) != 0) 917 if (strcmp(versions[i].name, symname) != 0)
1056 continue; 918 continue;
1057 919
1058 if (versions[i].crc == *crc) 920 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 921 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 922 DEBUGP("Found checksum %lX vs module %lX\n",
1061 *crc, versions[i].crc); 923 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 924 goto bad_version;
1063 } 925 }
1064 926
@@ -1081,7 +943,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1081 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 943 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1082 &crc, true, false)) 944 &crc, true, false))
1083 BUG(); 945 BUG();
1084 return check_version(sechdrs, versindex, "module_layout", mod, crc); 946 return check_version(sechdrs, versindex, "module_layout", mod, crc,
947 NULL);
1085} 948}
1086 949
1087/* First part is kernel version, which we ignore if module has crcs. */ 950/* First part is kernel version, which we ignore if module has crcs. */
@@ -1099,7 +962,8 @@ static inline int check_version(Elf_Shdr *sechdrs,
1099 unsigned int versindex, 962 unsigned int versindex,
1100 const char *symname, 963 const char *symname,
1101 struct module *mod, 964 struct module *mod,
1102 const unsigned long *crc) 965 const unsigned long *crc,
966 const struct module *crc_owner)
1103{ 967{
1104 return 1; 968 return 1;
1105} 969}
@@ -1134,8 +998,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1134 /* use_module can fail due to OOM, 998 /* use_module can fail due to OOM,
1135 or module initialization or unloading */ 999 or module initialization or unloading */
1136 if (sym) { 1000 if (sym) {
1137 if (!check_version(sechdrs, versindex, name, mod, crc) || 1001 if (!check_version(sechdrs, versindex, name, mod, crc, owner)
1138 !use_module(mod, owner)) 1002 || !use_module(mod, owner))
1139 sym = NULL; 1003 sym = NULL;
1140 } 1004 }
1141 return sym; 1005 return sym;
@@ -1146,6 +1010,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1146 * J. Corbet <corbet@lwn.net> 1010 * J. Corbet <corbet@lwn.net>
1147 */ 1011 */
1148#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1012#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1013
1014static inline bool sect_empty(const Elf_Shdr *sect)
1015{
1016 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1017}
1018
1149struct module_sect_attr 1019struct module_sect_attr
1150{ 1020{
1151 struct module_attribute mattr; 1021 struct module_attribute mattr;
@@ -1187,8 +1057,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1187 1057
1188 /* Count loaded sections and allocate structures */ 1058 /* Count loaded sections and allocate structures */
1189 for (i = 0; i < nsect; i++) 1059 for (i = 0; i < nsect; i++)
1190 if (sechdrs[i].sh_flags & SHF_ALLOC 1060 if (!sect_empty(&sechdrs[i]))
1191 && sechdrs[i].sh_size)
1192 nloaded++; 1061 nloaded++;
1193 size[0] = ALIGN(sizeof(*sect_attrs) 1062 size[0] = ALIGN(sizeof(*sect_attrs)
1194 + nloaded * sizeof(sect_attrs->attrs[0]), 1063 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1206,9 +1075,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1206 sattr = &sect_attrs->attrs[0]; 1075 sattr = &sect_attrs->attrs[0];
1207 gattr = &sect_attrs->grp.attrs[0]; 1076 gattr = &sect_attrs->grp.attrs[0];
1208 for (i = 0; i < nsect; i++) { 1077 for (i = 0; i < nsect; i++) {
1209 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1078 if (sect_empty(&sechdrs[i]))
1210 continue;
1211 if (!sechdrs[i].sh_size)
1212 continue; 1079 continue;
1213 sattr->address = sechdrs[i].sh_addr; 1080 sattr->address = sechdrs[i].sh_addr;
1214 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1081 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
@@ -1292,7 +1159,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1292 /* Count notes sections and allocate structures. */ 1159 /* Count notes sections and allocate structures. */
1293 notes = 0; 1160 notes = 0;
1294 for (i = 0; i < nsect; i++) 1161 for (i = 0; i < nsect; i++)
1295 if ((sechdrs[i].sh_flags & SHF_ALLOC) && 1162 if (!sect_empty(&sechdrs[i]) &&
1296 (sechdrs[i].sh_type == SHT_NOTE)) 1163 (sechdrs[i].sh_type == SHT_NOTE))
1297 ++notes; 1164 ++notes;
1298 1165
@@ -1308,7 +1175,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1308 notes_attrs->notes = notes; 1175 notes_attrs->notes = notes;
1309 nattr = &notes_attrs->attrs[0]; 1176 nattr = &notes_attrs->attrs[0];
1310 for (loaded = i = 0; i < nsect; ++i) { 1177 for (loaded = i = 0; i < nsect; ++i) {
1311 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1178 if (sect_empty(&sechdrs[i]))
1312 continue; 1179 continue;
1313 if (sechdrs[i].sh_type == SHT_NOTE) { 1180 if (sechdrs[i].sh_type == SHT_NOTE) {
1314 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
@@ -2046,9 +1913,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
2046 unsigned int i; 1913 unsigned int i;
2047 1914
2048 /* only scan the sections containing data */ 1915 /* only scan the sections containing data */
2049 kmemleak_scan_area(mod->module_core, (unsigned long)mod - 1916 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2050 (unsigned long)mod->module_core,
2051 sizeof(struct module), GFP_KERNEL);
2052 1917
2053 for (i = 1; i < hdr->e_shnum; i++) { 1918 for (i = 1; i < hdr->e_shnum; i++) {
2054 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1919 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -2057,8 +1922,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
2057 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) 1922 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2058 continue; 1923 continue;
2059 1924
2060 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - 1925 kmemleak_scan_area((void *)sechdrs[i].sh_addr,
2061 (unsigned long)mod->module_core,
2062 sechdrs[i].sh_size, GFP_KERNEL); 1926 sechdrs[i].sh_size, GFP_KERNEL);
2063 } 1927 }
2064} 1928}
@@ -2386,6 +2250,12 @@ static noinline struct module *load_module(void __user *umod,
2386 "_ftrace_events", 2250 "_ftrace_events",
2387 sizeof(*mod->trace_events), 2251 sizeof(*mod->trace_events),
2388 &mod->num_trace_events); 2252 &mod->num_trace_events);
2253 /*
2254 * This section contains pointers to allocated objects in the trace
2255 * code and not scanning it leads to false positives.
2256 */
2257 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2258 mod->num_trace_events, GFP_KERNEL);
2389#endif 2259#endif
2390#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2260#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2391 /* sechdrs[0].sh_size is always zero */ 2261 /* sechdrs[0].sh_size is always zero */
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 6b2d735846a..57d527a16f9 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock)
43 \ 43 \
44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \ 44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \
45 local_irq_save(flags); \ 45 local_irq_save(flags); \
46 __raw_spin_lock(&(lock)->raw_lock); \ 46 arch_spin_lock(&(lock)->rlock.raw_lock);\
47 DEBUG_LOCKS_WARN_ON(l->magic != l); \ 47 DEBUG_LOCKS_WARN_ON(l->magic != l); \
48 } while (0) 48 } while (0)
49 49
50#define spin_unlock_mutex(lock, flags) \ 50#define spin_unlock_mutex(lock, flags) \
51 do { \ 51 do { \
52 __raw_spin_unlock(&(lock)->raw_lock); \ 52 arch_spin_unlock(&(lock)->rlock.raw_lock); \
53 local_irq_restore(flags); \ 53 local_irq_restore(flags); \
54 preempt_check_resched(); \ 54 preempt_check_resched(); \
55 } while (0) 55 } while (0)
diff --git a/kernel/panic.c b/kernel/panic.c
index 96b45d0b4ba..c787333282b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/debug_locks.h> 11#include <linux/debug_locks.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/kmsg_dump.h>
13#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
14#include <linux/notifier.h> 15#include <linux/notifier.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -81,6 +82,8 @@ NORET_TYPE void panic(const char * fmt, ...)
81 */ 82 */
82 crash_kexec(NULL); 83 crash_kexec(NULL);
83 84
85 kmsg_dump(KMSG_DUMP_PANIC);
86
84 /* 87 /*
85 * Note smp_send_stop is the usual smp shutdown function, which 88 * Note smp_send_stop is the usual smp shutdown function, which
86 * unfortunately means it may not be hardened to work in a panic 89 * unfortunately means it may not be hardened to work in a panic
@@ -339,6 +342,7 @@ void oops_exit(void)
339{ 342{
340 do_oops_enter_exit(); 343 do_oops_enter_exit();
341 print_oops_end_marker(); 344 print_oops_end_marker();
345 kmsg_dump(KMSG_DUMP_OOPS);
342} 346}
343 347
344#ifdef WANT_WARN_ON_SLOWPATH 348#ifdef WANT_WARN_ON_SLOWPATH
diff --git a/kernel/params.c b/kernel/params.c
index d656c276508..cf1b6918312 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,7 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
27 28
28#if 0 29#if 0
29#define DEBUGP printk 30#define DEBUGP printk
@@ -122,9 +123,7 @@ static char *next_arg(char *args, char **param, char **val)
122 next = args + i; 123 next = args + i;
123 124
124 /* Chew up trailing spaces. */ 125 /* Chew up trailing spaces. */
125 while (isspace(*next)) 126 return skip_spaces(next);
126 next++;
127 return next;
128} 127}
129 128
130/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 129/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
@@ -139,8 +138,7 @@ int parse_args(const char *name,
139 DEBUGP("Parsing ARGS: %s\n", args); 138 DEBUGP("Parsing ARGS: %s\n", args);
140 139
141 /* Chew leading spaces */ 140 /* Chew leading spaces */
142 while (isspace(*args)) 141 args = skip_spaces(args);
143 args++;
144 142
145 while (*args) { 143 while (*args) {
146 int ret; 144 int ret;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6b7ddba1dd6..2b19297742c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -36,7 +36,7 @@
36/* 36/*
37 * Each CPU has a list of per CPU events: 37 * Each CPU has a list of per CPU events:
38 */ 38 */
39DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 39static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
40 40
41int perf_max_events __read_mostly = 1; 41int perf_max_events __read_mostly = 1;
42static int perf_reserved_percpu __read_mostly; 42static int perf_reserved_percpu __read_mostly;
@@ -203,14 +203,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
203 * if so. If we locked the right context, then it 203 * if so. If we locked the right context, then it
204 * can't get swapped on us any more. 204 * can't get swapped on us any more.
205 */ 205 */
206 spin_lock_irqsave(&ctx->lock, *flags); 206 raw_spin_lock_irqsave(&ctx->lock, *flags);
207 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 207 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
208 spin_unlock_irqrestore(&ctx->lock, *flags); 208 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
209 goto retry; 209 goto retry;
210 } 210 }
211 211
212 if (!atomic_inc_not_zero(&ctx->refcount)) { 212 if (!atomic_inc_not_zero(&ctx->refcount)) {
213 spin_unlock_irqrestore(&ctx->lock, *flags); 213 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
214 ctx = NULL; 214 ctx = NULL;
215 } 215 }
216 } 216 }
@@ -231,7 +231,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task
231 ctx = perf_lock_task_context(task, &flags); 231 ctx = perf_lock_task_context(task, &flags);
232 if (ctx) { 232 if (ctx) {
233 ++ctx->pin_count; 233 ++ctx->pin_count;
234 spin_unlock_irqrestore(&ctx->lock, flags); 234 raw_spin_unlock_irqrestore(&ctx->lock, flags);
235 } 235 }
236 return ctx; 236 return ctx;
237} 237}
@@ -240,9 +240,9 @@ static void perf_unpin_context(struct perf_event_context *ctx)
240{ 240{
241 unsigned long flags; 241 unsigned long flags;
242 242
243 spin_lock_irqsave(&ctx->lock, flags); 243 raw_spin_lock_irqsave(&ctx->lock, flags);
244 --ctx->pin_count; 244 --ctx->pin_count;
245 spin_unlock_irqrestore(&ctx->lock, flags); 245 raw_spin_unlock_irqrestore(&ctx->lock, flags);
246 put_ctx(ctx); 246 put_ctx(ctx);
247} 247}
248 248
@@ -427,7 +427,7 @@ static void __perf_event_remove_from_context(void *info)
427 if (ctx->task && cpuctx->task_ctx != ctx) 427 if (ctx->task && cpuctx->task_ctx != ctx)
428 return; 428 return;
429 429
430 spin_lock(&ctx->lock); 430 raw_spin_lock(&ctx->lock);
431 /* 431 /*
432 * Protect the list operation against NMI by disabling the 432 * Protect the list operation against NMI by disabling the
433 * events on a global level. 433 * events on a global level.
@@ -449,7 +449,7 @@ static void __perf_event_remove_from_context(void *info)
449 } 449 }
450 450
451 perf_enable(); 451 perf_enable();
452 spin_unlock(&ctx->lock); 452 raw_spin_unlock(&ctx->lock);
453} 453}
454 454
455 455
@@ -476,7 +476,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
476 if (!task) { 476 if (!task) {
477 /* 477 /*
478 * Per cpu events are removed via an smp call and 478 * Per cpu events are removed via an smp call and
479 * the removal is always sucessful. 479 * the removal is always successful.
480 */ 480 */
481 smp_call_function_single(event->cpu, 481 smp_call_function_single(event->cpu,
482 __perf_event_remove_from_context, 482 __perf_event_remove_from_context,
@@ -488,12 +488,12 @@ retry:
488 task_oncpu_function_call(task, __perf_event_remove_from_context, 488 task_oncpu_function_call(task, __perf_event_remove_from_context,
489 event); 489 event);
490 490
491 spin_lock_irq(&ctx->lock); 491 raw_spin_lock_irq(&ctx->lock);
492 /* 492 /*
493 * If the context is active we need to retry the smp call. 493 * If the context is active we need to retry the smp call.
494 */ 494 */
495 if (ctx->nr_active && !list_empty(&event->group_entry)) { 495 if (ctx->nr_active && !list_empty(&event->group_entry)) {
496 spin_unlock_irq(&ctx->lock); 496 raw_spin_unlock_irq(&ctx->lock);
497 goto retry; 497 goto retry;
498 } 498 }
499 499
@@ -504,7 +504,7 @@ retry:
504 */ 504 */
505 if (!list_empty(&event->group_entry)) 505 if (!list_empty(&event->group_entry))
506 list_del_event(event, ctx); 506 list_del_event(event, ctx);
507 spin_unlock_irq(&ctx->lock); 507 raw_spin_unlock_irq(&ctx->lock);
508} 508}
509 509
510/* 510/*
@@ -535,7 +535,7 @@ static void __perf_event_disable(void *info)
535 if (ctx->task && cpuctx->task_ctx != ctx) 535 if (ctx->task && cpuctx->task_ctx != ctx)
536 return; 536 return;
537 537
538 spin_lock(&ctx->lock); 538 raw_spin_lock(&ctx->lock);
539 539
540 /* 540 /*
541 * If the event is on, turn it off. 541 * If the event is on, turn it off.
@@ -551,7 +551,7 @@ static void __perf_event_disable(void *info)
551 event->state = PERF_EVENT_STATE_OFF; 551 event->state = PERF_EVENT_STATE_OFF;
552 } 552 }
553 553
554 spin_unlock(&ctx->lock); 554 raw_spin_unlock(&ctx->lock);
555} 555}
556 556
557/* 557/*
@@ -567,7 +567,7 @@ static void __perf_event_disable(void *info)
567 * is the current context on this CPU and preemption is disabled, 567 * is the current context on this CPU and preemption is disabled,
568 * hence we can't get into perf_event_task_sched_out for this context. 568 * hence we can't get into perf_event_task_sched_out for this context.
569 */ 569 */
570static void perf_event_disable(struct perf_event *event) 570void perf_event_disable(struct perf_event *event)
571{ 571{
572 struct perf_event_context *ctx = event->ctx; 572 struct perf_event_context *ctx = event->ctx;
573 struct task_struct *task = ctx->task; 573 struct task_struct *task = ctx->task;
@@ -584,12 +584,12 @@ static void perf_event_disable(struct perf_event *event)
584 retry: 584 retry:
585 task_oncpu_function_call(task, __perf_event_disable, event); 585 task_oncpu_function_call(task, __perf_event_disable, event);
586 586
587 spin_lock_irq(&ctx->lock); 587 raw_spin_lock_irq(&ctx->lock);
588 /* 588 /*
589 * If the event is still active, we need to retry the cross-call. 589 * If the event is still active, we need to retry the cross-call.
590 */ 590 */
591 if (event->state == PERF_EVENT_STATE_ACTIVE) { 591 if (event->state == PERF_EVENT_STATE_ACTIVE) {
592 spin_unlock_irq(&ctx->lock); 592 raw_spin_unlock_irq(&ctx->lock);
593 goto retry; 593 goto retry;
594 } 594 }
595 595
@@ -602,7 +602,7 @@ static void perf_event_disable(struct perf_event *event)
602 event->state = PERF_EVENT_STATE_OFF; 602 event->state = PERF_EVENT_STATE_OFF;
603 } 603 }
604 604
605 spin_unlock_irq(&ctx->lock); 605 raw_spin_unlock_irq(&ctx->lock);
606} 606}
607 607
608static int 608static int
@@ -770,7 +770,7 @@ static void __perf_install_in_context(void *info)
770 cpuctx->task_ctx = ctx; 770 cpuctx->task_ctx = ctx;
771 } 771 }
772 772
773 spin_lock(&ctx->lock); 773 raw_spin_lock(&ctx->lock);
774 ctx->is_active = 1; 774 ctx->is_active = 1;
775 update_context_time(ctx); 775 update_context_time(ctx);
776 776
@@ -782,6 +782,9 @@ static void __perf_install_in_context(void *info)
782 782
783 add_event_to_ctx(event, ctx); 783 add_event_to_ctx(event, ctx);
784 784
785 if (event->cpu != -1 && event->cpu != smp_processor_id())
786 goto unlock;
787
785 /* 788 /*
786 * Don't put the event on if it is disabled or if 789 * Don't put the event on if it is disabled or if
787 * it is in a group and the group isn't on. 790 * it is in a group and the group isn't on.
@@ -820,7 +823,7 @@ static void __perf_install_in_context(void *info)
820 unlock: 823 unlock:
821 perf_enable(); 824 perf_enable();
822 825
823 spin_unlock(&ctx->lock); 826 raw_spin_unlock(&ctx->lock);
824} 827}
825 828
826/* 829/*
@@ -845,7 +848,7 @@ perf_install_in_context(struct perf_event_context *ctx,
845 if (!task) { 848 if (!task) {
846 /* 849 /*
847 * Per cpu events are installed via an smp call and 850 * Per cpu events are installed via an smp call and
848 * the install is always sucessful. 851 * the install is always successful.
849 */ 852 */
850 smp_call_function_single(cpu, __perf_install_in_context, 853 smp_call_function_single(cpu, __perf_install_in_context,
851 event, 1); 854 event, 1);
@@ -856,12 +859,12 @@ retry:
856 task_oncpu_function_call(task, __perf_install_in_context, 859 task_oncpu_function_call(task, __perf_install_in_context,
857 event); 860 event);
858 861
859 spin_lock_irq(&ctx->lock); 862 raw_spin_lock_irq(&ctx->lock);
860 /* 863 /*
861 * we need to retry the smp call. 864 * we need to retry the smp call.
862 */ 865 */
863 if (ctx->is_active && list_empty(&event->group_entry)) { 866 if (ctx->is_active && list_empty(&event->group_entry)) {
864 spin_unlock_irq(&ctx->lock); 867 raw_spin_unlock_irq(&ctx->lock);
865 goto retry; 868 goto retry;
866 } 869 }
867 870
@@ -872,7 +875,7 @@ retry:
872 */ 875 */
873 if (list_empty(&event->group_entry)) 876 if (list_empty(&event->group_entry))
874 add_event_to_ctx(event, ctx); 877 add_event_to_ctx(event, ctx);
875 spin_unlock_irq(&ctx->lock); 878 raw_spin_unlock_irq(&ctx->lock);
876} 879}
877 880
878/* 881/*
@@ -917,7 +920,7 @@ static void __perf_event_enable(void *info)
917 cpuctx->task_ctx = ctx; 920 cpuctx->task_ctx = ctx;
918 } 921 }
919 922
920 spin_lock(&ctx->lock); 923 raw_spin_lock(&ctx->lock);
921 ctx->is_active = 1; 924 ctx->is_active = 1;
922 update_context_time(ctx); 925 update_context_time(ctx);
923 926
@@ -925,6 +928,9 @@ static void __perf_event_enable(void *info)
925 goto unlock; 928 goto unlock;
926 __perf_event_mark_enabled(event, ctx); 929 __perf_event_mark_enabled(event, ctx);
927 930
931 if (event->cpu != -1 && event->cpu != smp_processor_id())
932 goto unlock;
933
928 /* 934 /*
929 * If the event is in a group and isn't the group leader, 935 * If the event is in a group and isn't the group leader,
930 * then don't put it on unless the group is on. 936 * then don't put it on unless the group is on.
@@ -959,7 +965,7 @@ static void __perf_event_enable(void *info)
959 } 965 }
960 966
961 unlock: 967 unlock:
962 spin_unlock(&ctx->lock); 968 raw_spin_unlock(&ctx->lock);
963} 969}
964 970
965/* 971/*
@@ -971,7 +977,7 @@ static void __perf_event_enable(void *info)
971 * perf_event_for_each_child or perf_event_for_each as described 977 * perf_event_for_each_child or perf_event_for_each as described
972 * for perf_event_disable. 978 * for perf_event_disable.
973 */ 979 */
974static void perf_event_enable(struct perf_event *event) 980void perf_event_enable(struct perf_event *event)
975{ 981{
976 struct perf_event_context *ctx = event->ctx; 982 struct perf_event_context *ctx = event->ctx;
977 struct task_struct *task = ctx->task; 983 struct task_struct *task = ctx->task;
@@ -985,7 +991,7 @@ static void perf_event_enable(struct perf_event *event)
985 return; 991 return;
986 } 992 }
987 993
988 spin_lock_irq(&ctx->lock); 994 raw_spin_lock_irq(&ctx->lock);
989 if (event->state >= PERF_EVENT_STATE_INACTIVE) 995 if (event->state >= PERF_EVENT_STATE_INACTIVE)
990 goto out; 996 goto out;
991 997
@@ -1000,10 +1006,10 @@ static void perf_event_enable(struct perf_event *event)
1000 event->state = PERF_EVENT_STATE_OFF; 1006 event->state = PERF_EVENT_STATE_OFF;
1001 1007
1002 retry: 1008 retry:
1003 spin_unlock_irq(&ctx->lock); 1009 raw_spin_unlock_irq(&ctx->lock);
1004 task_oncpu_function_call(task, __perf_event_enable, event); 1010 task_oncpu_function_call(task, __perf_event_enable, event);
1005 1011
1006 spin_lock_irq(&ctx->lock); 1012 raw_spin_lock_irq(&ctx->lock);
1007 1013
1008 /* 1014 /*
1009 * If the context is active and the event is still off, 1015 * If the context is active and the event is still off,
@@ -1020,7 +1026,7 @@ static void perf_event_enable(struct perf_event *event)
1020 __perf_event_mark_enabled(event, ctx); 1026 __perf_event_mark_enabled(event, ctx);
1021 1027
1022 out: 1028 out:
1023 spin_unlock_irq(&ctx->lock); 1029 raw_spin_unlock_irq(&ctx->lock);
1024} 1030}
1025 1031
1026static int perf_event_refresh(struct perf_event *event, int refresh) 1032static int perf_event_refresh(struct perf_event *event, int refresh)
@@ -1042,7 +1048,7 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1042{ 1048{
1043 struct perf_event *event; 1049 struct perf_event *event;
1044 1050
1045 spin_lock(&ctx->lock); 1051 raw_spin_lock(&ctx->lock);
1046 ctx->is_active = 0; 1052 ctx->is_active = 0;
1047 if (likely(!ctx->nr_events)) 1053 if (likely(!ctx->nr_events))
1048 goto out; 1054 goto out;
@@ -1055,7 +1061,7 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1055 } 1061 }
1056 perf_enable(); 1062 perf_enable();
1057 out: 1063 out:
1058 spin_unlock(&ctx->lock); 1064 raw_spin_unlock(&ctx->lock);
1059} 1065}
1060 1066
1061/* 1067/*
@@ -1193,8 +1199,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1193 * order we take the locks because no other cpu could 1199 * order we take the locks because no other cpu could
1194 * be trying to lock both of these tasks. 1200 * be trying to lock both of these tasks.
1195 */ 1201 */
1196 spin_lock(&ctx->lock); 1202 raw_spin_lock(&ctx->lock);
1197 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 1203 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1198 if (context_equiv(ctx, next_ctx)) { 1204 if (context_equiv(ctx, next_ctx)) {
1199 /* 1205 /*
1200 * XXX do we need a memory barrier of sorts 1206 * XXX do we need a memory barrier of sorts
@@ -1208,8 +1214,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1208 1214
1209 perf_event_sync_stat(ctx, next_ctx); 1215 perf_event_sync_stat(ctx, next_ctx);
1210 } 1216 }
1211 spin_unlock(&next_ctx->lock); 1217 raw_spin_unlock(&next_ctx->lock);
1212 spin_unlock(&ctx->lock); 1218 raw_spin_unlock(&ctx->lock);
1213 } 1219 }
1214 rcu_read_unlock(); 1220 rcu_read_unlock();
1215 1221
@@ -1251,7 +1257,7 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1251 struct perf_event *event; 1257 struct perf_event *event;
1252 int can_add_hw = 1; 1258 int can_add_hw = 1;
1253 1259
1254 spin_lock(&ctx->lock); 1260 raw_spin_lock(&ctx->lock);
1255 ctx->is_active = 1; 1261 ctx->is_active = 1;
1256 if (likely(!ctx->nr_events)) 1262 if (likely(!ctx->nr_events))
1257 goto out; 1263 goto out;
@@ -1306,7 +1312,7 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1306 } 1312 }
1307 perf_enable(); 1313 perf_enable();
1308 out: 1314 out:
1309 spin_unlock(&ctx->lock); 1315 raw_spin_unlock(&ctx->lock);
1310} 1316}
1311 1317
1312/* 1318/*
@@ -1370,11 +1376,14 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1370 struct hw_perf_event *hwc; 1376 struct hw_perf_event *hwc;
1371 u64 interrupts, freq; 1377 u64 interrupts, freq;
1372 1378
1373 spin_lock(&ctx->lock); 1379 raw_spin_lock(&ctx->lock);
1374 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1375 if (event->state != PERF_EVENT_STATE_ACTIVE) 1381 if (event->state != PERF_EVENT_STATE_ACTIVE)
1376 continue; 1382 continue;
1377 1383
1384 if (event->cpu != -1 && event->cpu != smp_processor_id())
1385 continue;
1386
1378 hwc = &event->hw; 1387 hwc = &event->hw;
1379 1388
1380 interrupts = hwc->interrupts; 1389 interrupts = hwc->interrupts;
@@ -1425,7 +1434,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1425 perf_enable(); 1434 perf_enable();
1426 } 1435 }
1427 } 1436 }
1428 spin_unlock(&ctx->lock); 1437 raw_spin_unlock(&ctx->lock);
1429} 1438}
1430 1439
1431/* 1440/*
@@ -1438,7 +1447,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
1438 if (!ctx->nr_events) 1447 if (!ctx->nr_events)
1439 return; 1448 return;
1440 1449
1441 spin_lock(&ctx->lock); 1450 raw_spin_lock(&ctx->lock);
1442 /* 1451 /*
1443 * Rotate the first entry last (works just fine for group events too): 1452 * Rotate the first entry last (works just fine for group events too):
1444 */ 1453 */
@@ -1449,7 +1458,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
1449 } 1458 }
1450 perf_enable(); 1459 perf_enable();
1451 1460
1452 spin_unlock(&ctx->lock); 1461 raw_spin_unlock(&ctx->lock);
1453} 1462}
1454 1463
1455void perf_event_task_tick(struct task_struct *curr, int cpu) 1464void perf_event_task_tick(struct task_struct *curr, int cpu)
@@ -1498,7 +1507,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1498 1507
1499 __perf_event_task_sched_out(ctx); 1508 __perf_event_task_sched_out(ctx);
1500 1509
1501 spin_lock(&ctx->lock); 1510 raw_spin_lock(&ctx->lock);
1502 1511
1503 list_for_each_entry(event, &ctx->group_list, group_entry) { 1512 list_for_each_entry(event, &ctx->group_list, group_entry) {
1504 if (!event->attr.enable_on_exec) 1513 if (!event->attr.enable_on_exec)
@@ -1516,7 +1525,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1516 if (enabled) 1525 if (enabled)
1517 unclone_ctx(ctx); 1526 unclone_ctx(ctx);
1518 1527
1519 spin_unlock(&ctx->lock); 1528 raw_spin_unlock(&ctx->lock);
1520 1529
1521 perf_event_task_sched_in(task, smp_processor_id()); 1530 perf_event_task_sched_in(task, smp_processor_id());
1522 out: 1531 out:
@@ -1542,10 +1551,10 @@ static void __perf_event_read(void *info)
1542 if (ctx->task && cpuctx->task_ctx != ctx) 1551 if (ctx->task && cpuctx->task_ctx != ctx)
1543 return; 1552 return;
1544 1553
1545 spin_lock(&ctx->lock); 1554 raw_spin_lock(&ctx->lock);
1546 update_context_time(ctx); 1555 update_context_time(ctx);
1547 update_event_times(event); 1556 update_event_times(event);
1548 spin_unlock(&ctx->lock); 1557 raw_spin_unlock(&ctx->lock);
1549 1558
1550 event->pmu->read(event); 1559 event->pmu->read(event);
1551} 1560}
@@ -1563,10 +1572,10 @@ static u64 perf_event_read(struct perf_event *event)
1563 struct perf_event_context *ctx = event->ctx; 1572 struct perf_event_context *ctx = event->ctx;
1564 unsigned long flags; 1573 unsigned long flags;
1565 1574
1566 spin_lock_irqsave(&ctx->lock, flags); 1575 raw_spin_lock_irqsave(&ctx->lock, flags);
1567 update_context_time(ctx); 1576 update_context_time(ctx);
1568 update_event_times(event); 1577 update_event_times(event);
1569 spin_unlock_irqrestore(&ctx->lock, flags); 1578 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1570 } 1579 }
1571 1580
1572 return atomic64_read(&event->count); 1581 return atomic64_read(&event->count);
@@ -1579,8 +1588,7 @@ static void
1579__perf_event_init_context(struct perf_event_context *ctx, 1588__perf_event_init_context(struct perf_event_context *ctx,
1580 struct task_struct *task) 1589 struct task_struct *task)
1581{ 1590{
1582 memset(ctx, 0, sizeof(*ctx)); 1591 raw_spin_lock_init(&ctx->lock);
1583 spin_lock_init(&ctx->lock);
1584 mutex_init(&ctx->mutex); 1592 mutex_init(&ctx->mutex);
1585 INIT_LIST_HEAD(&ctx->group_list); 1593 INIT_LIST_HEAD(&ctx->group_list);
1586 INIT_LIST_HEAD(&ctx->event_list); 1594 INIT_LIST_HEAD(&ctx->event_list);
@@ -1596,15 +1604,12 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1596 unsigned long flags; 1604 unsigned long flags;
1597 int err; 1605 int err;
1598 1606
1599 /* 1607 if (pid == -1 && cpu != -1) {
1600 * If cpu is not a wildcard then this is a percpu event:
1601 */
1602 if (cpu != -1) {
1603 /* Must be root to operate on a CPU event: */ 1608 /* Must be root to operate on a CPU event: */
1604 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 1609 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1605 return ERR_PTR(-EACCES); 1610 return ERR_PTR(-EACCES);
1606 1611
1607 if (cpu < 0 || cpu > num_possible_cpus()) 1612 if (cpu < 0 || cpu >= nr_cpumask_bits)
1608 return ERR_PTR(-EINVAL); 1613 return ERR_PTR(-EINVAL);
1609 1614
1610 /* 1615 /*
@@ -1612,7 +1617,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1612 * offline CPU and activate it when the CPU comes up, but 1617 * offline CPU and activate it when the CPU comes up, but
1613 * that's for later. 1618 * that's for later.
1614 */ 1619 */
1615 if (!cpu_isset(cpu, cpu_online_map)) 1620 if (!cpu_online(cpu))
1616 return ERR_PTR(-ENODEV); 1621 return ERR_PTR(-ENODEV);
1617 1622
1618 cpuctx = &per_cpu(perf_cpu_context, cpu); 1623 cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -1650,11 +1655,11 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1650 ctx = perf_lock_task_context(task, &flags); 1655 ctx = perf_lock_task_context(task, &flags);
1651 if (ctx) { 1656 if (ctx) {
1652 unclone_ctx(ctx); 1657 unclone_ctx(ctx);
1653 spin_unlock_irqrestore(&ctx->lock, flags); 1658 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1654 } 1659 }
1655 1660
1656 if (!ctx) { 1661 if (!ctx) {
1657 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); 1662 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1658 err = -ENOMEM; 1663 err = -ENOMEM;
1659 if (!ctx) 1664 if (!ctx)
1660 goto errout; 1665 goto errout;
@@ -1988,7 +1993,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1988 if (!value) 1993 if (!value)
1989 return -EINVAL; 1994 return -EINVAL;
1990 1995
1991 spin_lock_irq(&ctx->lock); 1996 raw_spin_lock_irq(&ctx->lock);
1992 if (event->attr.freq) { 1997 if (event->attr.freq) {
1993 if (value > sysctl_perf_event_sample_rate) { 1998 if (value > sysctl_perf_event_sample_rate) {
1994 ret = -EINVAL; 1999 ret = -EINVAL;
@@ -2001,7 +2006,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2001 event->hw.sample_period = value; 2006 event->hw.sample_period = value;
2002 } 2007 }
2003unlock: 2008unlock:
2004 spin_unlock_irq(&ctx->lock); 2009 raw_spin_unlock_irq(&ctx->lock);
2005 2010
2006 return ret; 2011 return ret;
2007} 2012}
@@ -3263,6 +3268,12 @@ static void perf_event_task_output(struct perf_event *event,
3263 3268
3264static int perf_event_task_match(struct perf_event *event) 3269static int perf_event_task_match(struct perf_event *event)
3265{ 3270{
3271 if (event->state != PERF_EVENT_STATE_ACTIVE)
3272 return 0;
3273
3274 if (event->cpu != -1 && event->cpu != smp_processor_id())
3275 return 0;
3276
3266 if (event->attr.comm || event->attr.mmap || event->attr.task) 3277 if (event->attr.comm || event->attr.mmap || event->attr.task)
3267 return 1; 3278 return 1;
3268 3279
@@ -3288,12 +3299,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3288 rcu_read_lock(); 3299 rcu_read_lock();
3289 cpuctx = &get_cpu_var(perf_cpu_context); 3300 cpuctx = &get_cpu_var(perf_cpu_context);
3290 perf_event_task_ctx(&cpuctx->ctx, task_event); 3301 perf_event_task_ctx(&cpuctx->ctx, task_event);
3291 put_cpu_var(perf_cpu_context);
3292
3293 if (!ctx) 3302 if (!ctx)
3294 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3303 ctx = rcu_dereference(task_event->task->perf_event_ctxp);
3295 if (ctx) 3304 if (ctx)
3296 perf_event_task_ctx(ctx, task_event); 3305 perf_event_task_ctx(ctx, task_event);
3306 put_cpu_var(perf_cpu_context);
3297 rcu_read_unlock(); 3307 rcu_read_unlock();
3298} 3308}
3299 3309
@@ -3370,6 +3380,12 @@ static void perf_event_comm_output(struct perf_event *event,
3370 3380
3371static int perf_event_comm_match(struct perf_event *event) 3381static int perf_event_comm_match(struct perf_event *event)
3372{ 3382{
3383 if (event->state != PERF_EVENT_STATE_ACTIVE)
3384 return 0;
3385
3386 if (event->cpu != -1 && event->cpu != smp_processor_id())
3387 return 0;
3388
3373 if (event->attr.comm) 3389 if (event->attr.comm)
3374 return 1; 3390 return 1;
3375 3391
@@ -3406,15 +3422,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3406 rcu_read_lock(); 3422 rcu_read_lock();
3407 cpuctx = &get_cpu_var(perf_cpu_context); 3423 cpuctx = &get_cpu_var(perf_cpu_context);
3408 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3424 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3409 put_cpu_var(perf_cpu_context);
3410
3411 /*
3412 * doesn't really matter which of the child contexts the
3413 * events ends up in.
3414 */
3415 ctx = rcu_dereference(current->perf_event_ctxp); 3425 ctx = rcu_dereference(current->perf_event_ctxp);
3416 if (ctx) 3426 if (ctx)
3417 perf_event_comm_ctx(ctx, comm_event); 3427 perf_event_comm_ctx(ctx, comm_event);
3428 put_cpu_var(perf_cpu_context);
3418 rcu_read_unlock(); 3429 rcu_read_unlock();
3419} 3430}
3420 3431
@@ -3489,6 +3500,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3489static int perf_event_mmap_match(struct perf_event *event, 3500static int perf_event_mmap_match(struct perf_event *event,
3490 struct perf_mmap_event *mmap_event) 3501 struct perf_mmap_event *mmap_event)
3491{ 3502{
3503 if (event->state != PERF_EVENT_STATE_ACTIVE)
3504 return 0;
3505
3506 if (event->cpu != -1 && event->cpu != smp_processor_id())
3507 return 0;
3508
3492 if (event->attr.mmap) 3509 if (event->attr.mmap)
3493 return 1; 3510 return 1;
3494 3511
@@ -3562,15 +3579,10 @@ got_name:
3562 rcu_read_lock(); 3579 rcu_read_lock();
3563 cpuctx = &get_cpu_var(perf_cpu_context); 3580 cpuctx = &get_cpu_var(perf_cpu_context);
3564 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3581 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3565 put_cpu_var(perf_cpu_context);
3566
3567 /*
3568 * doesn't really matter which of the child contexts the
3569 * events ends up in.
3570 */
3571 ctx = rcu_dereference(current->perf_event_ctxp); 3582 ctx = rcu_dereference(current->perf_event_ctxp);
3572 if (ctx) 3583 if (ctx)
3573 perf_event_mmap_ctx(ctx, mmap_event); 3584 perf_event_mmap_ctx(ctx, mmap_event);
3585 put_cpu_var(perf_cpu_context);
3574 rcu_read_unlock(); 3586 rcu_read_unlock();
3575 3587
3576 kfree(buf); 3588 kfree(buf);
@@ -3861,6 +3873,9 @@ static int perf_swevent_match(struct perf_event *event,
3861 struct perf_sample_data *data, 3873 struct perf_sample_data *data,
3862 struct pt_regs *regs) 3874 struct pt_regs *regs)
3863{ 3875{
3876 if (event->cpu != -1 && event->cpu != smp_processor_id())
3877 return 0;
3878
3864 if (!perf_swevent_is_counting(event)) 3879 if (!perf_swevent_is_counting(event))
3865 return 0; 3880 return 0;
3866 3881
@@ -4011,6 +4026,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4011 event->pmu->read(event); 4026 event->pmu->read(event);
4012 4027
4013 data.addr = 0; 4028 data.addr = 0;
4029 data.raw = NULL;
4014 data.period = event->hw.last_period; 4030 data.period = event->hw.last_period;
4015 regs = get_irq_regs(); 4031 regs = get_irq_regs();
4016 /* 4032 /*
@@ -4080,8 +4096,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4080 u64 now; 4096 u64 now;
4081 4097
4082 now = cpu_clock(cpu); 4098 now = cpu_clock(cpu);
4083 prev = atomic64_read(&event->hw.prev_count); 4099 prev = atomic64_xchg(&event->hw.prev_count, now);
4084 atomic64_set(&event->hw.prev_count, now);
4085 atomic64_add(now - prev, &event->count); 4100 atomic64_add(now - prev, &event->count);
4086} 4101}
4087 4102
@@ -4286,15 +4301,8 @@ static void bp_perf_event_destroy(struct perf_event *event)
4286static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4301static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4287{ 4302{
4288 int err; 4303 int err;
4289 /* 4304
4290 * The breakpoint is already filled if we haven't created the counter 4305 err = register_perf_hw_breakpoint(bp);
4291 * through perf syscall
4292 * FIXME: manage to get trigerred to NULL if it comes from syscalls
4293 */
4294 if (!bp->callback)
4295 err = register_perf_hw_breakpoint(bp);
4296 else
4297 err = __register_perf_hw_breakpoint(bp);
4298 if (err) 4306 if (err)
4299 return ERR_PTR(err); 4307 return ERR_PTR(err);
4300 4308
@@ -4308,6 +4316,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4308 struct perf_sample_data sample; 4316 struct perf_sample_data sample;
4309 struct pt_regs *regs = data; 4317 struct pt_regs *regs = data;
4310 4318
4319 sample.raw = NULL;
4311 sample.addr = bp->attr.bp_addr; 4320 sample.addr = bp->attr.bp_addr;
4312 4321
4313 if (!perf_exclude_event(bp, regs)) 4322 if (!perf_exclude_event(bp, regs))
@@ -4390,7 +4399,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4390 struct perf_event_context *ctx, 4399 struct perf_event_context *ctx,
4391 struct perf_event *group_leader, 4400 struct perf_event *group_leader,
4392 struct perf_event *parent_event, 4401 struct perf_event *parent_event,
4393 perf_callback_t callback, 4402 perf_overflow_handler_t overflow_handler,
4394 gfp_t gfpflags) 4403 gfp_t gfpflags)
4395{ 4404{
4396 const struct pmu *pmu; 4405 const struct pmu *pmu;
@@ -4433,10 +4442,10 @@ perf_event_alloc(struct perf_event_attr *attr,
4433 4442
4434 event->state = PERF_EVENT_STATE_INACTIVE; 4443 event->state = PERF_EVENT_STATE_INACTIVE;
4435 4444
4436 if (!callback && parent_event) 4445 if (!overflow_handler && parent_event)
4437 callback = parent_event->callback; 4446 overflow_handler = parent_event->overflow_handler;
4438 4447
4439 event->callback = callback; 4448 event->overflow_handler = overflow_handler;
4440 4449
4441 if (attr->disabled) 4450 if (attr->disabled)
4442 event->state = PERF_EVENT_STATE_OFF; 4451 event->state = PERF_EVENT_STATE_OFF;
@@ -4571,7 +4580,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4571 if (attr->type >= PERF_TYPE_MAX) 4580 if (attr->type >= PERF_TYPE_MAX)
4572 return -EINVAL; 4581 return -EINVAL;
4573 4582
4574 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) 4583 if (attr->__reserved_1)
4575 return -EINVAL; 4584 return -EINVAL;
4576 4585
4577 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4586 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4724,7 +4733,7 @@ SYSCALL_DEFINE5(perf_event_open,
4724 if (IS_ERR(event)) 4733 if (IS_ERR(event))
4725 goto err_put_context; 4734 goto err_put_context;
4726 4735
4727 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); 4736 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4728 if (err < 0) 4737 if (err < 0)
4729 goto err_free_put_context; 4738 goto err_free_put_context;
4730 4739
@@ -4776,7 +4785,8 @@ err_put_context:
4776 */ 4785 */
4777struct perf_event * 4786struct perf_event *
4778perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 4787perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4779 pid_t pid, perf_callback_t callback) 4788 pid_t pid,
4789 perf_overflow_handler_t overflow_handler)
4780{ 4790{
4781 struct perf_event *event; 4791 struct perf_event *event;
4782 struct perf_event_context *ctx; 4792 struct perf_event_context *ctx;
@@ -4793,7 +4803,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4793 } 4803 }
4794 4804
4795 event = perf_event_alloc(attr, cpu, ctx, NULL, 4805 event = perf_event_alloc(attr, cpu, ctx, NULL,
4796 NULL, callback, GFP_KERNEL); 4806 NULL, overflow_handler, GFP_KERNEL);
4797 if (IS_ERR(event)) { 4807 if (IS_ERR(event)) {
4798 err = PTR_ERR(event); 4808 err = PTR_ERR(event);
4799 goto err_put_context; 4809 goto err_put_context;
@@ -4998,7 +5008,7 @@ void perf_event_exit_task(struct task_struct *child)
4998 * reading child->perf_event_ctxp, we wait until it has 5008 * reading child->perf_event_ctxp, we wait until it has
4999 * incremented the context's refcount before we do put_ctx below. 5009 * incremented the context's refcount before we do put_ctx below.
5000 */ 5010 */
5001 spin_lock(&child_ctx->lock); 5011 raw_spin_lock(&child_ctx->lock);
5002 child->perf_event_ctxp = NULL; 5012 child->perf_event_ctxp = NULL;
5003 /* 5013 /*
5004 * If this context is a clone; unclone it so it can't get 5014 * If this context is a clone; unclone it so it can't get
@@ -5007,7 +5017,7 @@ void perf_event_exit_task(struct task_struct *child)
5007 */ 5017 */
5008 unclone_ctx(child_ctx); 5018 unclone_ctx(child_ctx);
5009 update_context_time(child_ctx); 5019 update_context_time(child_ctx);
5010 spin_unlock_irqrestore(&child_ctx->lock, flags); 5020 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
5011 5021
5012 /* 5022 /*
5013 * Report the task dead after unscheduling the events so that we 5023 * Report the task dead after unscheduling the events so that we
@@ -5090,7 +5100,7 @@ again:
5090 */ 5100 */
5091int perf_event_init_task(struct task_struct *child) 5101int perf_event_init_task(struct task_struct *child)
5092{ 5102{
5093 struct perf_event_context *child_ctx, *parent_ctx; 5103 struct perf_event_context *child_ctx = NULL, *parent_ctx;
5094 struct perf_event_context *cloned_ctx; 5104 struct perf_event_context *cloned_ctx;
5095 struct perf_event *event; 5105 struct perf_event *event;
5096 struct task_struct *parent = current; 5106 struct task_struct *parent = current;
@@ -5106,20 +5116,6 @@ int perf_event_init_task(struct task_struct *child)
5106 return 0; 5116 return 0;
5107 5117
5108 /* 5118 /*
5109 * This is executed from the parent task context, so inherit
5110 * events that have been marked for cloning.
5111 * First allocate and initialize a context for the child.
5112 */
5113
5114 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
5115 if (!child_ctx)
5116 return -ENOMEM;
5117
5118 __perf_event_init_context(child_ctx, child);
5119 child->perf_event_ctxp = child_ctx;
5120 get_task_struct(child);
5121
5122 /*
5123 * If the parent's context is a clone, pin it so it won't get 5119 * If the parent's context is a clone, pin it so it won't get
5124 * swapped under us. 5120 * swapped under us.
5125 */ 5121 */
@@ -5149,6 +5145,26 @@ int perf_event_init_task(struct task_struct *child)
5149 continue; 5145 continue;
5150 } 5146 }
5151 5147
5148 if (!child->perf_event_ctxp) {
5149 /*
5150 * This is executed from the parent task context, so
5151 * inherit events that have been marked for cloning.
5152 * First allocate and initialize a context for the
5153 * child.
5154 */
5155
5156 child_ctx = kzalloc(sizeof(struct perf_event_context),
5157 GFP_KERNEL);
5158 if (!child_ctx) {
5159 ret = -ENOMEM;
5160 break;
5161 }
5162
5163 __perf_event_init_context(child_ctx, child);
5164 child->perf_event_ctxp = child_ctx;
5165 get_task_struct(child);
5166 }
5167
5152 ret = inherit_group(event, parent, parent_ctx, 5168 ret = inherit_group(event, parent, parent_ctx,
5153 child, child_ctx); 5169 child, child_ctx);
5154 if (ret) { 5170 if (ret) {
@@ -5157,7 +5173,7 @@ int perf_event_init_task(struct task_struct *child)
5157 } 5173 }
5158 } 5174 }
5159 5175
5160 if (inherited_all) { 5176 if (child_ctx && inherited_all) {
5161 /* 5177 /*
5162 * Mark the child context as a clone of the parent 5178 * Mark the child context as a clone of the parent
5163 * context, or of whatever the parent is a clone of. 5179 * context, or of whatever the parent is a clone of.
@@ -5291,11 +5307,11 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5291 perf_reserved_percpu = val; 5307 perf_reserved_percpu = val;
5292 for_each_online_cpu(cpu) { 5308 for_each_online_cpu(cpu) {
5293 cpuctx = &per_cpu(perf_cpu_context, cpu); 5309 cpuctx = &per_cpu(perf_cpu_context, cpu);
5294 spin_lock_irq(&cpuctx->ctx.lock); 5310 raw_spin_lock_irq(&cpuctx->ctx.lock);
5295 mpt = min(perf_max_events - cpuctx->ctx.nr_events, 5311 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5296 perf_max_events - perf_reserved_percpu); 5312 perf_max_events - perf_reserved_percpu);
5297 cpuctx->max_pertask = mpt; 5313 cpuctx->max_pertask = mpt;
5298 spin_unlock_irq(&cpuctx->ctx.lock); 5314 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5299 } 5315 }
5300 spin_unlock(&perf_resource_lock); 5316 spin_unlock(&perf_resource_lock);
5301 5317
diff --git a/kernel/pid.c b/kernel/pid.c
index d3f722d20f9..2e17c9c92cb 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
141 * installing it: 141 * installing it:
142 */ 142 */
143 spin_lock_irq(&pidmap_lock); 143 spin_lock_irq(&pidmap_lock);
144 if (map->page) 144 if (!map->page) {
145 kfree(page);
146 else
147 map->page = page; 145 map->page = page;
146 page = NULL;
147 }
148 spin_unlock_irq(&pidmap_lock); 148 spin_unlock_irq(&pidmap_lock);
149 kfree(page);
149 if (unlikely(!map->page)) 150 if (unlikely(!map->page))
150 break; 151 break;
151 } 152 }
@@ -268,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
268 for (type = 0; type < PIDTYPE_MAX; ++type) 269 for (type = 0; type < PIDTYPE_MAX; ++type)
269 INIT_HLIST_HEAD(&pid->tasks[type]); 270 INIT_HLIST_HEAD(&pid->tasks[type]);
270 271
272 upid = pid->numbers + ns->level;
271 spin_lock_irq(&pidmap_lock); 273 spin_lock_irq(&pidmap_lock);
272 for (i = ns->level; i >= 0; i--) { 274 for ( ; upid >= pid->numbers; --upid)
273 upid = &pid->numbers[i];
274 hlist_add_head_rcu(&upid->pid_chain, 275 hlist_add_head_rcu(&upid->pid_chain,
275 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 276 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
276 }
277 spin_unlock_irq(&pidmap_lock); 277 spin_unlock_irq(&pidmap_lock);
278 278
279out: 279out:
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b..3db49b9ca37 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/time.h> 34#include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344} 343}
345EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
346 345
347#define PID_NAME_LEN sizeof("process_1234567890") 346#define PID_NAME_LEN 32
348static char name[PID_NAME_LEN];
349 347
350static int pm_qos_power_open(struct inode *inode, struct file *filp) 348static int pm_qos_power_open(struct inode *inode, struct file *filp)
351{ 349{
352 int ret; 350 int ret;
353 long pm_qos_class; 351 long pm_qos_class;
352 char name[PID_NAME_LEN];
354 353
355 lock_kernel();
356 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
357 if (pm_qos_class >= 0) { 355 if (pm_qos_class >= 0) {
358 filp->private_data = (void *)pm_qos_class; 356 filp->private_data = (void *)pm_qos_class;
359 sprintf(name, "process_%d", current->pid); 357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
360 ret = pm_qos_add_requirement(pm_qos_class, name, 358 ret = pm_qos_add_requirement(pm_qos_class, name,
361 PM_QOS_DEFAULT_VALUE); 359 PM_QOS_DEFAULT_VALUE);
362 if (ret >= 0) { 360 if (ret >= 0)
363 unlock_kernel();
364 return 0; 361 return 0;
365 }
366 } 362 }
367 unlock_kernel();
368
369 return -EPERM; 363 return -EPERM;
370} 364}
371 365
372static int pm_qos_power_release(struct inode *inode, struct file *filp) 366static int pm_qos_power_release(struct inode *inode, struct file *filp)
373{ 367{
374 int pm_qos_class; 368 int pm_qos_class;
369 char name[PID_NAME_LEN];
375 370
376 pm_qos_class = (long)filp->private_data; 371 pm_qos_class = (long)filp->private_data;
377 sprintf(name, "process_%d", current->pid); 372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
378 pm_qos_remove_requirement(pm_qos_class, name); 373 pm_qos_remove_requirement(pm_qos_class, name);
379 374
380 return 0; 375 return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
385{ 380{
386 s32 value; 381 s32 value;
387 int pm_qos_class; 382 int pm_qos_class;
383 char name[PID_NAME_LEN];
388 384
389 pm_qos_class = (long)filp->private_data; 385 pm_qos_class = (long)filp->private_data;
390 if (count != sizeof(s32)) 386 if (count != sizeof(s32))
391 return -EINVAL; 387 return -EINVAL;
392 if (copy_from_user(&value, buf, sizeof(s32))) 388 if (copy_from_user(&value, buf, sizeof(s32)))
393 return -EFAULT; 389 return -EFAULT;
394 sprintf(name, "process_%d", current->pid); 390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
395 pm_qos_update_requirement(pm_qos_class, name, value); 391 pm_qos_update_requirement(pm_qos_class, name, value);
396 392
397 return sizeof(s32); 393 return sizeof(s32);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 5c9dc228747..438ff452351 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -384,7 +384,8 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
384 384
385/* 385/*
386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. 386 * Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
387 * This is called from sys_timer_create with the new timer already locked. 387 * This is called from sys_timer_create() and do_cpu_nanosleep() with the
388 * new timer already all-zeros initialized.
388 */ 389 */
389int posix_cpu_timer_create(struct k_itimer *new_timer) 390int posix_cpu_timer_create(struct k_itimer *new_timer)
390{ 391{
@@ -396,8 +397,6 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
396 return -EINVAL; 397 return -EINVAL;
397 398
398 INIT_LIST_HEAD(&new_timer->it.cpu.entry); 399 INIT_LIST_HEAD(&new_timer->it.cpu.entry);
399 new_timer->it.cpu.incr.sched = 0;
400 new_timer->it.cpu.expires.sched = 0;
401 400
402 read_lock(&tasklist_lock); 401 read_lock(&tasklist_lock);
403 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { 402 if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) {
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 5187136fe1d..218e5af9015 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/vt_kern.h> 7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 8#include <linux/kbd_kern.h>
9#include <linux/console.h> 9#include <linux/vt.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
@@ -21,8 +21,7 @@ int pm_prepare_console(void)
21 if (orig_fgconsole < 0) 21 if (orig_fgconsole < 0)
22 return 1; 22 return 1;
23 23
24 orig_kmsg = kmsg_redirect; 24 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
25 kmsg_redirect = SUSPEND_CONSOLE;
26 return 0; 25 return 0;
27} 26}
28 27
@@ -30,7 +29,7 @@ void pm_restore_console(void)
30{ 29{
31 if (orig_fgconsole >= 0) { 30 if (orig_fgconsole >= 0) {
32 vt_move_to_console(orig_fgconsole, 0); 31 vt_move_to_console(orig_fgconsole, 0);
33 kmsg_redirect = orig_kmsg; 32 vt_kmsg_redirect(orig_kmsg);
34 } 33 }
35} 34}
36#endif 35#endif
diff --git a/kernel/printk.c b/kernel/printk.c
index b5ac4d99c66..1751c456b71 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h>
37 38
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39 40
@@ -1405,4 +1406,123 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1405 return false; 1406 return false;
1406} 1407}
1407EXPORT_SYMBOL(printk_timed_ratelimit); 1408EXPORT_SYMBOL(printk_timed_ratelimit);
1409
1410static DEFINE_SPINLOCK(dump_list_lock);
1411static LIST_HEAD(dump_list);
1412
1413/**
1414 * kmsg_dump_register - register a kernel log dumper.
1415 * @dumper: pointer to the kmsg_dumper structure
1416 *
1417 * Adds a kernel log dumper to the system. The dump callback in the
1418 * structure will be called when the kernel oopses or panics and must be
1419 * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
1420 */
1421int kmsg_dump_register(struct kmsg_dumper *dumper)
1422{
1423 unsigned long flags;
1424 int err = -EBUSY;
1425
1426 /* The dump callback needs to be set */
1427 if (!dumper->dump)
1428 return -EINVAL;
1429
1430 spin_lock_irqsave(&dump_list_lock, flags);
1431 /* Don't allow registering multiple times */
1432 if (!dumper->registered) {
1433 dumper->registered = 1;
1434 list_add_tail(&dumper->list, &dump_list);
1435 err = 0;
1436 }
1437 spin_unlock_irqrestore(&dump_list_lock, flags);
1438
1439 return err;
1440}
1441EXPORT_SYMBOL_GPL(kmsg_dump_register);
1442
1443/**
1444 * kmsg_dump_unregister - unregister a kmsg dumper.
1445 * @dumper: pointer to the kmsg_dumper structure
1446 *
1447 * Removes a dump device from the system. Returns zero on success and
1448 * %-EINVAL otherwise.
1449 */
1450int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1451{
1452 unsigned long flags;
1453 int err = -EINVAL;
1454
1455 spin_lock_irqsave(&dump_list_lock, flags);
1456 if (dumper->registered) {
1457 dumper->registered = 0;
1458 list_del(&dumper->list);
1459 err = 0;
1460 }
1461 spin_unlock_irqrestore(&dump_list_lock, flags);
1462
1463 return err;
1464}
1465EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1466
1467static const char const *kmsg_reasons[] = {
1468 [KMSG_DUMP_OOPS] = "oops",
1469 [KMSG_DUMP_PANIC] = "panic",
1470 [KMSG_DUMP_KEXEC] = "kexec",
1471};
1472
1473static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1474{
1475 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1476 return "unknown";
1477
1478 return kmsg_reasons[reason];
1479}
1480
1481/**
1482 * kmsg_dump - dump kernel log to kernel message dumpers.
1483 * @reason: the reason (oops, panic etc) for dumping
1484 *
1485 * Iterate through each of the dump devices and call the oops/panic
1486 * callbacks with the log buffer.
1487 */
1488void kmsg_dump(enum kmsg_dump_reason reason)
1489{
1490 unsigned long end;
1491 unsigned chars;
1492 struct kmsg_dumper *dumper;
1493 const char *s1, *s2;
1494 unsigned long l1, l2;
1495 unsigned long flags;
1496
1497 /* Theoretically, the log could move on after we do this, but
1498 there's not a lot we can do about that. The new messages
1499 will overwrite the start of what we dump. */
1500 spin_lock_irqsave(&logbuf_lock, flags);
1501 end = log_end & LOG_BUF_MASK;
1502 chars = logged_chars;
1503 spin_unlock_irqrestore(&logbuf_lock, flags);
1504
1505 if (logged_chars > end) {
1506 s1 = log_buf + log_buf_len - logged_chars + end;
1507 l1 = logged_chars - end;
1508
1509 s2 = log_buf;
1510 l2 = end;
1511 } else {
1512 s1 = "";
1513 l1 = 0;
1514
1515 s2 = log_buf + end - logged_chars;
1516 l2 = logged_chars;
1517 }
1518
1519 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
1520 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n",
1521 kmsg_to_str(reason));
1522 return;
1523 }
1524 list_for_each_entry(dumper, &dump_list, list)
1525 dumper->dump(dumper, reason, s1, l1, s2, l2);
1526 spin_unlock_irqrestore(&dump_list_lock, flags);
1527}
1408#endif 1528#endif
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a621a67ef4e..9bb52177af0 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -763,13 +763,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 763 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 764 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 765 }
766 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 766 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
767 completed = cur_ops->completed() - completed; 767 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 768 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 769 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 770 completed = RCU_TORTURE_PIPE_LEN;
771 } 771 }
772 ++__get_cpu_var(rcu_torture_batch)[completed]; 772 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
773 preempt_enable(); 773 preempt_enable();
774 cur_ops->readunlock(idx); 774 cur_ops->readunlock(idx);
775} 775}
@@ -818,13 +818,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 818 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 820 }
821 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
822 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 824 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
826 } 826 }
827 ++__get_cpu_var(rcu_torture_batch)[completed]; 827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
828 preempt_enable(); 828 preempt_enable();
829 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
830 schedule(); 830 schedule();
diff --git a/kernel/relay.c b/kernel/relay.c
index 760c26209a3..c705a41b4ba 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1198 relay_consume_bytes(rbuf, buf->private); 1198 relay_consume_bytes(rbuf, buf->private);
1199} 1199}
1200 1200
1201static struct pipe_buf_operations relay_pipe_buf_ops = { 1201static const struct pipe_buf_operations relay_pipe_buf_ops = {
1202 .can_merge = 0, 1202 .can_merge = 0,
1203 .map = generic_pipe_buf_map, 1203 .map = generic_pipe_buf_map,
1204 .unmap = generic_pipe_buf_unmap, 1204 .unmap = generic_pipe_buf_unmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b959..af96c1e4b54 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -308,35 +308,37 @@ static int find_resource(struct resource *root, struct resource *new,
308 void *alignf_data) 308 void *alignf_data)
309{ 309{
310 struct resource *this = root->child; 310 struct resource *this = root->child;
311 struct resource tmp = *new;
311 312
312 new->start = root->start; 313 tmp.start = root->start;
313 /* 314 /*
314 * Skip past an allocated resource that starts at 0, since the assignment 315 * Skip past an allocated resource that starts at 0, since the assignment
315 * of this->start - 1 to new->end below would cause an underflow. 316 * of this->start - 1 to tmp->end below would cause an underflow.
316 */ 317 */
317 if (this && this->start == 0) { 318 if (this && this->start == 0) {
318 new->start = this->end + 1; 319 tmp.start = this->end + 1;
319 this = this->sibling; 320 this = this->sibling;
320 } 321 }
321 for(;;) { 322 for(;;) {
322 if (this) 323 if (this)
323 new->end = this->start - 1; 324 tmp.end = this->start - 1;
324 else 325 else
325 new->end = root->end; 326 tmp.end = root->end;
326 if (new->start < min) 327 if (tmp.start < min)
327 new->start = min; 328 tmp.start = min;
328 if (new->end > max) 329 if (tmp.end > max)
329 new->end = max; 330 tmp.end = max;
330 new->start = ALIGN(new->start, align); 331 tmp.start = ALIGN(tmp.start, align);
331 if (alignf) 332 if (alignf)
332 alignf(alignf_data, new, size, align); 333 alignf(alignf_data, &tmp, size, align);
333 if (new->start < new->end && new->end - new->start >= size - 1) { 334 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
334 new->end = new->start + size - 1; 335 new->start = tmp.start;
336 new->end = tmp.start + size - 1;
335 return 0; 337 return 0;
336 } 338 }
337 if (!this) 339 if (!this)
338 break; 340 break;
339 new->start = this->end + 1; 341 tmp.start = this->end + 1;
340 this = this->sibling; 342 this = this->sibling;
341 } 343 }
342 return -EBUSY; 344 return -EBUSY;
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 5fcb4fe645e..ddabb54bb5c 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -37,8 +37,8 @@ do { \
37 if (rt_trace_on) { \ 37 if (rt_trace_on) { \
38 rt_trace_on = 0; \ 38 rt_trace_on = 0; \
39 console_verbose(); \ 39 console_verbose(); \
40 if (spin_is_locked(&current->pi_lock)) \ 40 if (raw_spin_is_locked(&current->pi_lock)) \
41 spin_unlock(&current->pi_lock); \ 41 raw_spin_unlock(&current->pi_lock); \
42 } \ 42 } \
43} while (0) 43} while (0)
44 44
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 29bd4baf9e7..a9604815786 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
138{ 138{
139 unsigned long flags; 139 unsigned long flags;
140 140
141 spin_lock_irqsave(&task->pi_lock, flags); 141 raw_spin_lock_irqsave(&task->pi_lock, flags);
142 __rt_mutex_adjust_prio(task); 142 __rt_mutex_adjust_prio(task);
143 spin_unlock_irqrestore(&task->pi_lock, flags); 143 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
144} 144}
145 145
146/* 146/*
@@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
195 /* 195 /*
196 * Task can not go away as we did a get_task() before ! 196 * Task can not go away as we did a get_task() before !
197 */ 197 */
198 spin_lock_irqsave(&task->pi_lock, flags); 198 raw_spin_lock_irqsave(&task->pi_lock, flags);
199 199
200 waiter = task->pi_blocked_on; 200 waiter = task->pi_blocked_on;
201 /* 201 /*
@@ -231,8 +231,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
231 goto out_unlock_pi; 231 goto out_unlock_pi;
232 232
233 lock = waiter->lock; 233 lock = waiter->lock;
234 if (!spin_trylock(&lock->wait_lock)) { 234 if (!raw_spin_trylock(&lock->wait_lock)) {
235 spin_unlock_irqrestore(&task->pi_lock, flags); 235 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
236 cpu_relax(); 236 cpu_relax();
237 goto retry; 237 goto retry;
238 } 238 }
@@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
240 /* Deadlock detection */ 240 /* Deadlock detection */
241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
243 spin_unlock(&lock->wait_lock); 243 raw_spin_unlock(&lock->wait_lock);
244 ret = deadlock_detect ? -EDEADLK : 0; 244 ret = deadlock_detect ? -EDEADLK : 0;
245 goto out_unlock_pi; 245 goto out_unlock_pi;
246 } 246 }
@@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
253 plist_add(&waiter->list_entry, &lock->wait_list); 253 plist_add(&waiter->list_entry, &lock->wait_list);
254 254
255 /* Release the task */ 255 /* Release the task */
256 spin_unlock_irqrestore(&task->pi_lock, flags); 256 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
257 put_task_struct(task); 257 put_task_struct(task);
258 258
259 /* Grab the next task */ 259 /* Grab the next task */
260 task = rt_mutex_owner(lock); 260 task = rt_mutex_owner(lock);
261 get_task_struct(task); 261 get_task_struct(task);
262 spin_lock_irqsave(&task->pi_lock, flags); 262 raw_spin_lock_irqsave(&task->pi_lock, flags);
263 263
264 if (waiter == rt_mutex_top_waiter(lock)) { 264 if (waiter == rt_mutex_top_waiter(lock)) {
265 /* Boost the owner */ 265 /* Boost the owner */
@@ -277,10 +277,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
277 __rt_mutex_adjust_prio(task); 277 __rt_mutex_adjust_prio(task);
278 } 278 }
279 279
280 spin_unlock_irqrestore(&task->pi_lock, flags); 280 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
281 281
282 top_waiter = rt_mutex_top_waiter(lock); 282 top_waiter = rt_mutex_top_waiter(lock);
283 spin_unlock(&lock->wait_lock); 283 raw_spin_unlock(&lock->wait_lock);
284 284
285 if (!detect_deadlock && waiter != top_waiter) 285 if (!detect_deadlock && waiter != top_waiter)
286 goto out_put_task; 286 goto out_put_task;
@@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
288 goto again; 288 goto again;
289 289
290 out_unlock_pi: 290 out_unlock_pi:
291 spin_unlock_irqrestore(&task->pi_lock, flags); 291 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
292 out_put_task: 292 out_put_task:
293 put_task_struct(task); 293 put_task_struct(task);
294 294
@@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
313 if (pendowner == task) 313 if (pendowner == task)
314 return 1; 314 return 1;
315 315
316 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0; 319 return 0;
320 } 320 }
321 321
@@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
325 * priority. 325 * priority.
326 */ 326 */
327 if (likely(!rt_mutex_has_waiters(lock))) { 327 if (likely(!rt_mutex_has_waiters(lock))) {
328 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1; 329 return 1;
330 } 330 }
331 331
@@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
333 next = rt_mutex_top_waiter(lock); 333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters); 334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner); 335 __rt_mutex_adjust_prio(pendowner);
336 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337 337
338 /* 338 /*
339 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
@@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
350 * might be task: 350 * might be task:
351 */ 351 */
352 if (likely(next->task != task)) { 352 if (likely(next->task != task)) {
353 spin_lock_irqsave(&task->pi_lock, flags); 353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task); 355 __rt_mutex_adjust_prio(task);
356 spin_unlock_irqrestore(&task->pi_lock, flags); 356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 } 357 }
358 return 1; 358 return 1;
359} 359}
@@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
420 unsigned long flags; 420 unsigned long flags;
421 int chain_walk = 0, res; 421 int chain_walk = 0, res;
422 422
423 spin_lock_irqsave(&task->pi_lock, flags); 423 raw_spin_lock_irqsave(&task->pi_lock, flags);
424 __rt_mutex_adjust_prio(task); 424 __rt_mutex_adjust_prio(task);
425 waiter->task = task; 425 waiter->task = task;
426 waiter->lock = lock; 426 waiter->lock = lock;
@@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
434 434
435 task->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
436 436
437 spin_unlock_irqrestore(&task->pi_lock, flags); 437 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 438
439 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
440 spin_lock_irqsave(&owner->pi_lock, flags); 440 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
443 443
444 __rt_mutex_adjust_prio(owner); 444 __rt_mutex_adjust_prio(owner);
445 if (owner->pi_blocked_on) 445 if (owner->pi_blocked_on)
446 chain_walk = 1; 446 chain_walk = 1;
447 spin_unlock_irqrestore(&owner->pi_lock, flags); 447 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
448 } 448 }
449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) 449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
450 chain_walk = 1; 450 chain_walk = 1;
@@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
459 */ 459 */
460 get_task_struct(owner); 460 get_task_struct(owner);
461 461
462 spin_unlock(&lock->wait_lock); 462 raw_spin_unlock(&lock->wait_lock);
463 463
464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
465 task); 465 task);
466 466
467 spin_lock(&lock->wait_lock); 467 raw_spin_lock(&lock->wait_lock);
468 468
469 return res; 469 return res;
470} 470}
@@ -483,7 +483,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
483 struct task_struct *pendowner; 483 struct task_struct *pendowner;
484 unsigned long flags; 484 unsigned long flags;
485 485
486 spin_lock_irqsave(&current->pi_lock, flags); 486 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 487
488 waiter = rt_mutex_top_waiter(lock); 488 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list); 489 plist_del(&waiter->list_entry, &lock->wait_list);
@@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
500 500
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
502 502
503 spin_unlock_irqrestore(&current->pi_lock, flags); 503 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 504
505 /* 505 /*
506 * Clear the pi_blocked_on variable and enqueue a possible 506 * Clear the pi_blocked_on variable and enqueue a possible
@@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
509 * waiter with higher priority than pending-owner->normal_prio 509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner. 510 * is blocked on the unboosted (pending) owner.
511 */ 511 */
512 spin_lock_irqsave(&pendowner->pi_lock, flags); 512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513 513
514 WARN_ON(!pendowner->pi_blocked_on); 514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter); 515 WARN_ON(pendowner->pi_blocked_on != waiter);
@@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
523 next = rt_mutex_top_waiter(lock); 523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters); 524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 } 525 }
526 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527 527
528 wake_up_process(pendowner); 528 wake_up_process(pendowner);
529} 529}
@@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock,
541 unsigned long flags; 541 unsigned long flags;
542 int chain_walk = 0; 542 int chain_walk = 0;
543 543
544 spin_lock_irqsave(&current->pi_lock, flags); 544 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 545 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL; 546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 547 current->pi_blocked_on = NULL;
548 spin_unlock_irqrestore(&current->pi_lock, flags); 548 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 549
550 if (first && owner != current) { 550 if (first && owner != current) {
551 551
552 spin_lock_irqsave(&owner->pi_lock, flags); 552 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 553
554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
555 555
@@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock,
564 if (owner->pi_blocked_on) 564 if (owner->pi_blocked_on)
565 chain_walk = 1; 565 chain_walk = 1;
566 566
567 spin_unlock_irqrestore(&owner->pi_lock, flags); 567 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
568 } 568 }
569 569
570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock,
575 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 575 /* gets dropped in rt_mutex_adjust_prio_chain()! */
576 get_task_struct(owner); 576 get_task_struct(owner);
577 577
578 spin_unlock(&lock->wait_lock); 578 raw_spin_unlock(&lock->wait_lock);
579 579
580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
581 581
582 spin_lock(&lock->wait_lock); 582 raw_spin_lock(&lock->wait_lock);
583} 583}
584 584
585/* 585/*
@@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task)
592 struct rt_mutex_waiter *waiter; 592 struct rt_mutex_waiter *waiter;
593 unsigned long flags; 593 unsigned long flags;
594 594
595 spin_lock_irqsave(&task->pi_lock, flags); 595 raw_spin_lock_irqsave(&task->pi_lock, flags);
596 596
597 waiter = task->pi_blocked_on; 597 waiter = task->pi_blocked_on;
598 if (!waiter || waiter->list_entry.prio == task->prio) { 598 if (!waiter || waiter->list_entry.prio == task->prio) {
599 spin_unlock_irqrestore(&task->pi_lock, flags); 599 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
600 return; 600 return;
601 } 601 }
602 602
603 spin_unlock_irqrestore(&task->pi_lock, flags); 603 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
604 604
605 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 605 /* gets dropped in rt_mutex_adjust_prio_chain()! */
606 get_task_struct(task); 606 get_task_struct(task);
@@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
672 break; 672 break;
673 } 673 }
674 674
675 spin_unlock(&lock->wait_lock); 675 raw_spin_unlock(&lock->wait_lock);
676 676
677 debug_rt_mutex_print_deadlock(waiter); 677 debug_rt_mutex_print_deadlock(waiter);
678 678
679 if (waiter->task) 679 if (waiter->task)
680 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
681 681
682 spin_lock(&lock->wait_lock); 682 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 683 set_current_state(state);
684 } 684 }
685 685
@@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
700 debug_rt_mutex_init_waiter(&waiter); 700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL; 701 waiter.task = NULL;
702 702
703 spin_lock(&lock->wait_lock); 703 raw_spin_lock(&lock->wait_lock);
704 704
705 /* Try to acquire the lock again: */ 705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock); 707 raw_spin_unlock(&lock->wait_lock);
708 return 0; 708 return 0;
709 } 709 }
710 710
@@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
731 */ 731 */
732 fixup_rt_mutex_waiters(lock); 732 fixup_rt_mutex_waiters(lock);
733 733
734 spin_unlock(&lock->wait_lock); 734 raw_spin_unlock(&lock->wait_lock);
735 735
736 /* Remove pending timer: */ 736 /* Remove pending timer: */
737 if (unlikely(timeout)) 737 if (unlikely(timeout))
@@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
758{ 758{
759 int ret = 0; 759 int ret = 0;
760 760
761 spin_lock(&lock->wait_lock); 761 raw_spin_lock(&lock->wait_lock);
762 762
763 if (likely(rt_mutex_owner(lock) != current)) { 763 if (likely(rt_mutex_owner(lock) != current)) {
764 764
@@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
770 fixup_rt_mutex_waiters(lock); 770 fixup_rt_mutex_waiters(lock);
771 } 771 }
772 772
773 spin_unlock(&lock->wait_lock); 773 raw_spin_unlock(&lock->wait_lock);
774 774
775 return ret; 775 return ret;
776} 776}
@@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
781static void __sched 781static void __sched
782rt_mutex_slowunlock(struct rt_mutex *lock) 782rt_mutex_slowunlock(struct rt_mutex *lock)
783{ 783{
784 spin_lock(&lock->wait_lock); 784 raw_spin_lock(&lock->wait_lock);
785 785
786 debug_rt_mutex_unlock(lock); 786 debug_rt_mutex_unlock(lock);
787 787
@@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
789 789
790 if (!rt_mutex_has_waiters(lock)) { 790 if (!rt_mutex_has_waiters(lock)) {
791 lock->owner = NULL; 791 lock->owner = NULL;
792 spin_unlock(&lock->wait_lock); 792 raw_spin_unlock(&lock->wait_lock);
793 return; 793 return;
794 } 794 }
795 795
796 wakeup_next_waiter(lock); 796 wakeup_next_waiter(lock);
797 797
798 spin_unlock(&lock->wait_lock); 798 raw_spin_unlock(&lock->wait_lock);
799 799
800 /* Undo pi boosting if necessary: */ 800 /* Undo pi boosting if necessary: */
801 rt_mutex_adjust_prio(current); 801 rt_mutex_adjust_prio(current);
@@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
970void __rt_mutex_init(struct rt_mutex *lock, const char *name) 970void __rt_mutex_init(struct rt_mutex *lock, const char *name)
971{ 971{
972 lock->owner = NULL; 972 lock->owner = NULL;
973 spin_lock_init(&lock->wait_lock); 973 raw_spin_lock_init(&lock->wait_lock);
974 plist_head_init(&lock->wait_list, &lock->wait_lock); 974 plist_head_init_raw(&lock->wait_list, &lock->wait_lock);
975 975
976 debug_rt_mutex_init(lock, name); 976 debug_rt_mutex_init(lock, name);
977} 977}
@@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1032{ 1032{
1033 int ret; 1033 int ret;
1034 1034
1035 spin_lock(&lock->wait_lock); 1035 raw_spin_lock(&lock->wait_lock);
1036 1036
1037 mark_rt_mutex_waiters(lock); 1037 mark_rt_mutex_waiters(lock);
1038 1038
@@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1043 spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 1045 return 1;
1046 } 1046 }
@@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 1056 */
1057 ret = 0; 1057 ret = 0;
1058 } 1058 }
1059 spin_unlock(&lock->wait_lock); 1059 raw_spin_unlock(&lock->wait_lock);
1060 1060
1061 debug_rt_mutex_print_deadlock(waiter); 1061 debug_rt_mutex_print_deadlock(waiter);
1062 1062
@@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1106{ 1106{
1107 int ret; 1107 int ret;
1108 1108
1109 spin_lock(&lock->wait_lock); 1109 raw_spin_lock(&lock->wait_lock);
1110 1110
1111 set_current_state(TASK_INTERRUPTIBLE); 1111 set_current_state(TASK_INTERRUPTIBLE);
1112 1112
@@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1124 */ 1124 */
1125 fixup_rt_mutex_waiters(lock); 1125 fixup_rt_mutex_waiters(lock);
1126 1126
1127 spin_unlock(&lock->wait_lock); 1127 raw_spin_unlock(&lock->wait_lock);
1128 1128
1129 /* 1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been 1130 * Readjust priority, when we did not get the lock. We might have been
diff --git a/kernel/sched.c b/kernel/sched.c
index e7f2cfa6a25..3a8fb30a91b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -141,7 +141,7 @@ struct rt_prio_array {
141 141
142struct rt_bandwidth { 142struct rt_bandwidth {
143 /* nests inside the rq lock: */ 143 /* nests inside the rq lock: */
144 spinlock_t rt_runtime_lock; 144 raw_spinlock_t rt_runtime_lock;
145 ktime_t rt_period; 145 ktime_t rt_period;
146 u64 rt_runtime; 146 u64 rt_runtime;
147 struct hrtimer rt_period_timer; 147 struct hrtimer rt_period_timer;
@@ -178,7 +178,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
178 rt_b->rt_period = ns_to_ktime(period); 178 rt_b->rt_period = ns_to_ktime(period);
179 rt_b->rt_runtime = runtime; 179 rt_b->rt_runtime = runtime;
180 180
181 spin_lock_init(&rt_b->rt_runtime_lock); 181 raw_spin_lock_init(&rt_b->rt_runtime_lock);
182 182
183 hrtimer_init(&rt_b->rt_period_timer, 183 hrtimer_init(&rt_b->rt_period_timer,
184 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 184 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -200,7 +200,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
200 if (hrtimer_active(&rt_b->rt_period_timer)) 200 if (hrtimer_active(&rt_b->rt_period_timer))
201 return; 201 return;
202 202
203 spin_lock(&rt_b->rt_runtime_lock); 203 raw_spin_lock(&rt_b->rt_runtime_lock);
204 for (;;) { 204 for (;;) {
205 unsigned long delta; 205 unsigned long delta;
206 ktime_t soft, hard; 206 ktime_t soft, hard;
@@ -217,7 +217,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
218 HRTIMER_MODE_ABS_PINNED, 0); 218 HRTIMER_MODE_ABS_PINNED, 0);
219 } 219 }
220 spin_unlock(&rt_b->rt_runtime_lock); 220 raw_spin_unlock(&rt_b->rt_runtime_lock);
221} 221}
222 222
223#ifdef CONFIG_RT_GROUP_SCHED 223#ifdef CONFIG_RT_GROUP_SCHED
@@ -298,7 +298,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
298 298
299#ifdef CONFIG_RT_GROUP_SCHED 299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); 301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */ 302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */ 303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 304#define root_task_group init_task_group
@@ -470,7 +470,7 @@ struct rt_rq {
470 u64 rt_time; 470 u64 rt_time;
471 u64 rt_runtime; 471 u64 rt_runtime;
472 /* Nests inside the rq lock: */ 472 /* Nests inside the rq lock: */
473 spinlock_t rt_runtime_lock; 473 raw_spinlock_t rt_runtime_lock;
474 474
475#ifdef CONFIG_RT_GROUP_SCHED 475#ifdef CONFIG_RT_GROUP_SCHED
476 unsigned long rt_nr_boosted; 476 unsigned long rt_nr_boosted;
@@ -525,7 +525,7 @@ static struct root_domain def_root_domain;
525 */ 525 */
526struct rq { 526struct rq {
527 /* runqueue lock: */ 527 /* runqueue lock: */
528 spinlock_t lock; 528 raw_spinlock_t lock;
529 529
530 /* 530 /*
531 * nr_running and cpu_load should be in the same cacheline because 531 * nr_running and cpu_load should be in the same cacheline because
@@ -685,7 +685,7 @@ inline void update_rq_clock(struct rq *rq)
685 */ 685 */
686int runqueue_is_locked(int cpu) 686int runqueue_is_locked(int cpu)
687{ 687{
688 return spin_is_locked(&cpu_rq(cpu)->lock); 688 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
689} 689}
690 690
691/* 691/*
@@ -814,6 +814,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
814 * default: 0.25ms 814 * default: 0.25ms
815 */ 815 */
816unsigned int sysctl_sched_shares_ratelimit = 250000; 816unsigned int sysctl_sched_shares_ratelimit = 250000;
817unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
817 818
818/* 819/*
819 * Inject some fuzzyness into changing the per-cpu group shares 820 * Inject some fuzzyness into changing the per-cpu group shares
@@ -892,7 +893,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
892 */ 893 */
893 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 894 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
894 895
895 spin_unlock_irq(&rq->lock); 896 raw_spin_unlock_irq(&rq->lock);
896} 897}
897 898
898#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 899#else /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -916,9 +917,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
916 next->oncpu = 1; 917 next->oncpu = 1;
917#endif 918#endif
918#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 919#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
919 spin_unlock_irq(&rq->lock); 920 raw_spin_unlock_irq(&rq->lock);
920#else 921#else
921 spin_unlock(&rq->lock); 922 raw_spin_unlock(&rq->lock);
922#endif 923#endif
923} 924}
924 925
@@ -948,10 +949,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
948{ 949{
949 for (;;) { 950 for (;;) {
950 struct rq *rq = task_rq(p); 951 struct rq *rq = task_rq(p);
951 spin_lock(&rq->lock); 952 raw_spin_lock(&rq->lock);
952 if (likely(rq == task_rq(p))) 953 if (likely(rq == task_rq(p)))
953 return rq; 954 return rq;
954 spin_unlock(&rq->lock); 955 raw_spin_unlock(&rq->lock);
955 } 956 }
956} 957}
957 958
@@ -968,10 +969,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
968 for (;;) { 969 for (;;) {
969 local_irq_save(*flags); 970 local_irq_save(*flags);
970 rq = task_rq(p); 971 rq = task_rq(p);
971 spin_lock(&rq->lock); 972 raw_spin_lock(&rq->lock);
972 if (likely(rq == task_rq(p))) 973 if (likely(rq == task_rq(p)))
973 return rq; 974 return rq;
974 spin_unlock_irqrestore(&rq->lock, *flags); 975 raw_spin_unlock_irqrestore(&rq->lock, *flags);
975 } 976 }
976} 977}
977 978
@@ -980,19 +981,19 @@ void task_rq_unlock_wait(struct task_struct *p)
980 struct rq *rq = task_rq(p); 981 struct rq *rq = task_rq(p);
981 982
982 smp_mb(); /* spin-unlock-wait is not a full memory barrier */ 983 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
983 spin_unlock_wait(&rq->lock); 984 raw_spin_unlock_wait(&rq->lock);
984} 985}
985 986
986static void __task_rq_unlock(struct rq *rq) 987static void __task_rq_unlock(struct rq *rq)
987 __releases(rq->lock) 988 __releases(rq->lock)
988{ 989{
989 spin_unlock(&rq->lock); 990 raw_spin_unlock(&rq->lock);
990} 991}
991 992
992static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 993static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
993 __releases(rq->lock) 994 __releases(rq->lock)
994{ 995{
995 spin_unlock_irqrestore(&rq->lock, *flags); 996 raw_spin_unlock_irqrestore(&rq->lock, *flags);
996} 997}
997 998
998/* 999/*
@@ -1005,7 +1006,7 @@ static struct rq *this_rq_lock(void)
1005 1006
1006 local_irq_disable(); 1007 local_irq_disable();
1007 rq = this_rq(); 1008 rq = this_rq();
1008 spin_lock(&rq->lock); 1009 raw_spin_lock(&rq->lock);
1009 1010
1010 return rq; 1011 return rq;
1011} 1012}
@@ -1052,10 +1053,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1052 1053
1053 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1054 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1054 1055
1055 spin_lock(&rq->lock); 1056 raw_spin_lock(&rq->lock);
1056 update_rq_clock(rq); 1057 update_rq_clock(rq);
1057 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1058 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1058 spin_unlock(&rq->lock); 1059 raw_spin_unlock(&rq->lock);
1059 1060
1060 return HRTIMER_NORESTART; 1061 return HRTIMER_NORESTART;
1061} 1062}
@@ -1068,10 +1069,10 @@ static void __hrtick_start(void *arg)
1068{ 1069{
1069 struct rq *rq = arg; 1070 struct rq *rq = arg;
1070 1071
1071 spin_lock(&rq->lock); 1072 raw_spin_lock(&rq->lock);
1072 hrtimer_restart(&rq->hrtick_timer); 1073 hrtimer_restart(&rq->hrtick_timer);
1073 rq->hrtick_csd_pending = 0; 1074 rq->hrtick_csd_pending = 0;
1074 spin_unlock(&rq->lock); 1075 raw_spin_unlock(&rq->lock);
1075} 1076}
1076 1077
1077/* 1078/*
@@ -1178,7 +1179,7 @@ static void resched_task(struct task_struct *p)
1178{ 1179{
1179 int cpu; 1180 int cpu;
1180 1181
1181 assert_spin_locked(&task_rq(p)->lock); 1182 assert_raw_spin_locked(&task_rq(p)->lock);
1182 1183
1183 if (test_tsk_need_resched(p)) 1184 if (test_tsk_need_resched(p))
1184 return; 1185 return;
@@ -1200,10 +1201,10 @@ static void resched_cpu(int cpu)
1200 struct rq *rq = cpu_rq(cpu); 1201 struct rq *rq = cpu_rq(cpu);
1201 unsigned long flags; 1202 unsigned long flags;
1202 1203
1203 if (!spin_trylock_irqsave(&rq->lock, flags)) 1204 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1204 return; 1205 return;
1205 resched_task(cpu_curr(cpu)); 1206 resched_task(cpu_curr(cpu));
1206 spin_unlock_irqrestore(&rq->lock, flags); 1207 raw_spin_unlock_irqrestore(&rq->lock, flags);
1207} 1208}
1208 1209
1209#ifdef CONFIG_NO_HZ 1210#ifdef CONFIG_NO_HZ
@@ -1272,7 +1273,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1272#else /* !CONFIG_SMP */ 1273#else /* !CONFIG_SMP */
1273static void resched_task(struct task_struct *p) 1274static void resched_task(struct task_struct *p)
1274{ 1275{
1275 assert_spin_locked(&task_rq(p)->lock); 1276 assert_raw_spin_locked(&task_rq(p)->lock);
1276 set_tsk_need_resched(p); 1277 set_tsk_need_resched(p);
1277} 1278}
1278 1279
@@ -1599,11 +1600,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1599 struct rq *rq = cpu_rq(cpu); 1600 struct rq *rq = cpu_rq(cpu);
1600 unsigned long flags; 1601 unsigned long flags;
1601 1602
1602 spin_lock_irqsave(&rq->lock, flags); 1603 raw_spin_lock_irqsave(&rq->lock, flags);
1603 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1604 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1604 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1605 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1605 __set_se_shares(tg->se[cpu], shares); 1606 __set_se_shares(tg->se[cpu], shares);
1606 spin_unlock_irqrestore(&rq->lock, flags); 1607 raw_spin_unlock_irqrestore(&rq->lock, flags);
1607 } 1608 }
1608} 1609}
1609 1610
@@ -1614,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1614 */ 1615 */
1615static int tg_shares_up(struct task_group *tg, void *data) 1616static int tg_shares_up(struct task_group *tg, void *data)
1616{ 1617{
1617 unsigned long weight, rq_weight = 0, shares = 0; 1618 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1618 unsigned long *usd_rq_weight; 1619 unsigned long *usd_rq_weight;
1619 struct sched_domain *sd = data; 1620 struct sched_domain *sd = data;
1620 unsigned long flags; 1621 unsigned long flags;
@@ -1630,6 +1631,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1630 weight = tg->cfs_rq[i]->load.weight; 1631 weight = tg->cfs_rq[i]->load.weight;
1631 usd_rq_weight[i] = weight; 1632 usd_rq_weight[i] = weight;
1632 1633
1634 rq_weight += weight;
1633 /* 1635 /*
1634 * If there are currently no tasks on the cpu pretend there 1636 * If there are currently no tasks on the cpu pretend there
1635 * is one of average load so that when a new task gets to 1637 * is one of average load so that when a new task gets to
@@ -1638,10 +1640,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1638 if (!weight) 1640 if (!weight)
1639 weight = NICE_0_LOAD; 1641 weight = NICE_0_LOAD;
1640 1642
1641 rq_weight += weight; 1643 sum_weight += weight;
1642 shares += tg->cfs_rq[i]->shares; 1644 shares += tg->cfs_rq[i]->shares;
1643 } 1645 }
1644 1646
1647 if (!rq_weight)
1648 rq_weight = sum_weight;
1649
1645 if ((!shares && rq_weight) || shares > tg->shares) 1650 if ((!shares && rq_weight) || shares > tg->shares)
1646 shares = tg->shares; 1651 shares = tg->shares;
1647 1652
@@ -1701,9 +1706,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1701 if (root_task_group_empty()) 1706 if (root_task_group_empty())
1702 return; 1707 return;
1703 1708
1704 spin_unlock(&rq->lock); 1709 raw_spin_unlock(&rq->lock);
1705 update_shares(sd); 1710 update_shares(sd);
1706 spin_lock(&rq->lock); 1711 raw_spin_lock(&rq->lock);
1707} 1712}
1708 1713
1709static void update_h_load(long cpu) 1714static void update_h_load(long cpu)
@@ -1743,7 +1748,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1743 __acquires(busiest->lock) 1748 __acquires(busiest->lock)
1744 __acquires(this_rq->lock) 1749 __acquires(this_rq->lock)
1745{ 1750{
1746 spin_unlock(&this_rq->lock); 1751 raw_spin_unlock(&this_rq->lock);
1747 double_rq_lock(this_rq, busiest); 1752 double_rq_lock(this_rq, busiest);
1748 1753
1749 return 1; 1754 return 1;
@@ -1764,14 +1769,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1764{ 1769{
1765 int ret = 0; 1770 int ret = 0;
1766 1771
1767 if (unlikely(!spin_trylock(&busiest->lock))) { 1772 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1768 if (busiest < this_rq) { 1773 if (busiest < this_rq) {
1769 spin_unlock(&this_rq->lock); 1774 raw_spin_unlock(&this_rq->lock);
1770 spin_lock(&busiest->lock); 1775 raw_spin_lock(&busiest->lock);
1771 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1776 raw_spin_lock_nested(&this_rq->lock,
1777 SINGLE_DEPTH_NESTING);
1772 ret = 1; 1778 ret = 1;
1773 } else 1779 } else
1774 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1780 raw_spin_lock_nested(&busiest->lock,
1781 SINGLE_DEPTH_NESTING);
1775 } 1782 }
1776 return ret; 1783 return ret;
1777} 1784}
@@ -1785,7 +1792,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1785{ 1792{
1786 if (unlikely(!irqs_disabled())) { 1793 if (unlikely(!irqs_disabled())) {
1787 /* printk() doesn't work good under rq->lock */ 1794 /* printk() doesn't work good under rq->lock */
1788 spin_unlock(&this_rq->lock); 1795 raw_spin_unlock(&this_rq->lock);
1789 BUG_ON(1); 1796 BUG_ON(1);
1790 } 1797 }
1791 1798
@@ -1795,7 +1802,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1795static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1802static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1796 __releases(busiest->lock) 1803 __releases(busiest->lock)
1797{ 1804{
1798 spin_unlock(&busiest->lock); 1805 raw_spin_unlock(&busiest->lock);
1799 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1800} 1807}
1801#endif 1808#endif
@@ -1810,6 +1817,22 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1810#endif 1817#endif
1811 1818
1812static void calc_load_account_active(struct rq *this_rq); 1819static void calc_load_account_active(struct rq *this_rq);
1820static void update_sysctl(void);
1821static int get_update_sysctl_factor(void);
1822
1823static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1824{
1825 set_task_rq(p, cpu);
1826#ifdef CONFIG_SMP
1827 /*
1828 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1829 * successfuly executed on another CPU. We must ensure that updates of
1830 * per-task data have been completed by this moment.
1831 */
1832 smp_wmb();
1833 task_thread_info(p)->cpu = cpu;
1834#endif
1835}
1813 1836
1814#include "sched_stats.h" 1837#include "sched_stats.h"
1815#include "sched_idletask.c" 1838#include "sched_idletask.c"
@@ -1967,20 +1990,6 @@ inline int task_curr(const struct task_struct *p)
1967 return cpu_curr(task_cpu(p)) == p; 1990 return cpu_curr(task_cpu(p)) == p;
1968} 1991}
1969 1992
1970static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1971{
1972 set_task_rq(p, cpu);
1973#ifdef CONFIG_SMP
1974 /*
1975 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1976 * successfuly executed on another CPU. We must ensure that updates of
1977 * per-task data have been completed by this moment.
1978 */
1979 smp_wmb();
1980 task_thread_info(p)->cpu = cpu;
1981#endif
1982}
1983
1984static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1993static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1985 const struct sched_class *prev_class, 1994 const struct sched_class *prev_class,
1986 int oldprio, int running) 1995 int oldprio, int running)
@@ -1993,39 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1993 p->sched_class->prio_changed(rq, p, oldprio, running); 2002 p->sched_class->prio_changed(rq, p, oldprio, running);
1994} 2003}
1995 2004
1996/**
1997 * kthread_bind - bind a just-created kthread to a cpu.
1998 * @p: thread created by kthread_create().
1999 * @cpu: cpu (might not be online, must be possible) for @k to run on.
2000 *
2001 * Description: This function is equivalent to set_cpus_allowed(),
2002 * except that @cpu doesn't need to be online, and the thread must be
2003 * stopped (i.e., just returned from kthread_create()).
2004 *
2005 * Function lives here instead of kthread.c because it messes with
2006 * scheduler internals which require locking.
2007 */
2008void kthread_bind(struct task_struct *p, unsigned int cpu)
2009{
2010 struct rq *rq = cpu_rq(cpu);
2011 unsigned long flags;
2012
2013 /* Must have done schedule() in kthread() before we set_task_cpu */
2014 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2015 WARN_ON(1);
2016 return;
2017 }
2018
2019 spin_lock_irqsave(&rq->lock, flags);
2020 update_rq_clock(rq);
2021 set_task_cpu(p, cpu);
2022 p->cpus_allowed = cpumask_of_cpu(cpu);
2023 p->rt.nr_cpus_allowed = 1;
2024 p->flags |= PF_THREAD_BOUND;
2025 spin_unlock_irqrestore(&rq->lock, flags);
2026}
2027EXPORT_SYMBOL(kthread_bind);
2028
2029#ifdef CONFIG_SMP 2005#ifdef CONFIG_SMP
2030/* 2006/*
2031 * Is this task likely cache-hot: 2007 * Is this task likely cache-hot:
@@ -2035,6 +2011,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2035{ 2011{
2036 s64 delta; 2012 s64 delta;
2037 2013
2014 if (p->sched_class != &fair_sched_class)
2015 return 0;
2016
2038 /* 2017 /*
2039 * Buddy candidates are cache hot: 2018 * Buddy candidates are cache hot:
2040 */ 2019 */
@@ -2043,9 +2022,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2043 &p->se == cfs_rq_of(&p->se)->last)) 2022 &p->se == cfs_rq_of(&p->se)->last))
2044 return 1; 2023 return 1;
2045 2024
2046 if (p->sched_class != &fair_sched_class)
2047 return 0;
2048
2049 if (sysctl_sched_migration_cost == -1) 2025 if (sysctl_sched_migration_cost == -1)
2050 return 1; 2026 return 1;
2051 if (sysctl_sched_migration_cost == 0) 2027 if (sysctl_sched_migration_cost == 0)
@@ -2056,38 +2032,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2056 return delta < (s64)sysctl_sched_migration_cost; 2032 return delta < (s64)sysctl_sched_migration_cost;
2057} 2033}
2058 2034
2059
2060void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2035void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2061{ 2036{
2062 int old_cpu = task_cpu(p); 2037#ifdef CONFIG_SCHED_DEBUG
2063 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 2038 /*
2064 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2039 * We should never call set_task_cpu() on a blocked task,
2065 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2040 * ttwu() will sort out the placement.
2066 u64 clock_offset; 2041 */
2067 2042 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2068 clock_offset = old_rq->clock - new_rq->clock; 2043 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2044#endif
2069 2045
2070 trace_sched_migrate_task(p, new_cpu); 2046 trace_sched_migrate_task(p, new_cpu);
2071 2047
2072#ifdef CONFIG_SCHEDSTATS 2048 if (task_cpu(p) != new_cpu) {
2073 if (p->se.wait_start)
2074 p->se.wait_start -= clock_offset;
2075 if (p->se.sleep_start)
2076 p->se.sleep_start -= clock_offset;
2077 if (p->se.block_start)
2078 p->se.block_start -= clock_offset;
2079#endif
2080 if (old_cpu != new_cpu) {
2081 p->se.nr_migrations++; 2049 p->se.nr_migrations++;
2082#ifdef CONFIG_SCHEDSTATS 2050 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2083 if (task_hot(p, old_rq->clock, NULL))
2084 schedstat_inc(p, se.nr_forced2_migrations);
2085#endif
2086 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2087 1, 1, NULL, 0);
2088 } 2051 }
2089 p->se.vruntime -= old_cfsrq->min_vruntime -
2090 new_cfsrq->min_vruntime;
2091 2052
2092 __set_task_cpu(p, new_cpu); 2053 __set_task_cpu(p, new_cpu);
2093} 2054}
@@ -2112,13 +2073,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2112 2073
2113 /* 2074 /*
2114 * If the task is not on a runqueue (and not running), then 2075 * If the task is not on a runqueue (and not running), then
2115 * it is sufficient to simply update the task's cpu field. 2076 * the next wake-up will properly place the task.
2116 */ 2077 */
2117 if (!p->se.on_rq && !task_running(rq, p)) { 2078 if (!p->se.on_rq && !task_running(rq, p))
2118 update_rq_clock(rq);
2119 set_task_cpu(p, dest_cpu);
2120 return 0; 2079 return 0;
2121 }
2122 2080
2123 init_completion(&req->done); 2081 init_completion(&req->done);
2124 req->task = p; 2082 req->task = p;
@@ -2323,6 +2281,75 @@ void task_oncpu_function_call(struct task_struct *p,
2323 preempt_enable(); 2281 preempt_enable();
2324} 2282}
2325 2283
2284#ifdef CONFIG_SMP
2285static int select_fallback_rq(int cpu, struct task_struct *p)
2286{
2287 int dest_cpu;
2288 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2289
2290 /* Look for allowed, online CPU in same node. */
2291 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2292 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2293 return dest_cpu;
2294
2295 /* Any allowed, online CPU? */
2296 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2297 if (dest_cpu < nr_cpu_ids)
2298 return dest_cpu;
2299
2300 /* No more Mr. Nice Guy. */
2301 if (dest_cpu >= nr_cpu_ids) {
2302 rcu_read_lock();
2303 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2304 rcu_read_unlock();
2305 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2306
2307 /*
2308 * Don't tell them about moving exiting tasks or
2309 * kernel threads (both mm NULL), since they never
2310 * leave kernel.
2311 */
2312 if (p->mm && printk_ratelimit()) {
2313 printk(KERN_INFO "process %d (%s) no "
2314 "longer affine to cpu%d\n",
2315 task_pid_nr(p), p->comm, cpu);
2316 }
2317 }
2318
2319 return dest_cpu;
2320}
2321
2322/*
2323 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2324 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2325 * by:
2326 *
2327 * exec: is unstable, retry loop
2328 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2329 */
2330static inline
2331int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2332{
2333 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2334
2335 /*
2336 * In order not to call set_task_cpu() on a blocking task we need
2337 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2338 * cpu.
2339 *
2340 * Since this is common to all placement strategies, this lives here.
2341 *
2342 * [ this allows ->select_task() to simply return task_cpu(p) and
2343 * not worry about this generic constraint ]
2344 */
2345 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2346 !cpu_online(cpu)))
2347 cpu = select_fallback_rq(task_cpu(p), p);
2348
2349 return cpu;
2350}
2351#endif
2352
2326/*** 2353/***
2327 * try_to_wake_up - wake up a thread 2354 * try_to_wake_up - wake up a thread
2328 * @p: the to-be-woken-up thread 2355 * @p: the to-be-woken-up thread
@@ -2374,17 +2401,18 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2374 if (task_contributes_to_load(p)) 2401 if (task_contributes_to_load(p))
2375 rq->nr_uninterruptible--; 2402 rq->nr_uninterruptible--;
2376 p->state = TASK_WAKING; 2403 p->state = TASK_WAKING;
2377 task_rq_unlock(rq, &flags);
2378 2404
2379 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2405 if (p->sched_class->task_waking)
2380 if (cpu != orig_cpu) { 2406 p->sched_class->task_waking(rq, p);
2381 local_irq_save(flags); 2407
2382 rq = cpu_rq(cpu); 2408 __task_rq_unlock(rq);
2383 update_rq_clock(rq); 2409
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2411 if (cpu != orig_cpu)
2384 set_task_cpu(p, cpu); 2412 set_task_cpu(p, cpu);
2385 local_irq_restore(flags); 2413
2386 } 2414 rq = __task_rq_lock(p);
2387 rq = task_rq_lock(p, &flags); 2415 update_rq_clock(rq);
2388 2416
2389 WARN_ON(p->state != TASK_WAKING); 2417 WARN_ON(p->state != TASK_WAKING);
2390 cpu = task_cpu(p); 2418 cpu = task_cpu(p);
@@ -2440,8 +2468,8 @@ out_running:
2440 2468
2441 p->state = TASK_RUNNING; 2469 p->state = TASK_RUNNING;
2442#ifdef CONFIG_SMP 2470#ifdef CONFIG_SMP
2443 if (p->sched_class->task_wake_up) 2471 if (p->sched_class->task_woken)
2444 p->sched_class->task_wake_up(rq, p); 2472 p->sched_class->task_woken(rq, p);
2445 2473
2446 if (unlikely(rq->idle_stamp)) { 2474 if (unlikely(rq->idle_stamp)) {
2447 u64 delta = rq->clock - rq->idle_stamp; 2475 u64 delta = rq->clock - rq->idle_stamp;
@@ -2499,7 +2527,6 @@ static void __sched_fork(struct task_struct *p)
2499 p->se.avg_overlap = 0; 2527 p->se.avg_overlap = 0;
2500 p->se.start_runtime = 0; 2528 p->se.start_runtime = 0;
2501 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2529 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2502 p->se.avg_running = 0;
2503 2530
2504#ifdef CONFIG_SCHEDSTATS 2531#ifdef CONFIG_SCHEDSTATS
2505 p->se.wait_start = 0; 2532 p->se.wait_start = 0;
@@ -2521,7 +2548,6 @@ static void __sched_fork(struct task_struct *p)
2521 p->se.nr_failed_migrations_running = 0; 2548 p->se.nr_failed_migrations_running = 0;
2522 p->se.nr_failed_migrations_hot = 0; 2549 p->se.nr_failed_migrations_hot = 0;
2523 p->se.nr_forced_migrations = 0; 2550 p->se.nr_forced_migrations = 0;
2524 p->se.nr_forced2_migrations = 0;
2525 2551
2526 p->se.nr_wakeups = 0; 2552 p->se.nr_wakeups = 0;
2527 p->se.nr_wakeups_sync = 0; 2553 p->se.nr_wakeups_sync = 0;
@@ -2542,14 +2568,6 @@ static void __sched_fork(struct task_struct *p)
2542#ifdef CONFIG_PREEMPT_NOTIFIERS 2568#ifdef CONFIG_PREEMPT_NOTIFIERS
2543 INIT_HLIST_HEAD(&p->preempt_notifiers); 2569 INIT_HLIST_HEAD(&p->preempt_notifiers);
2544#endif 2570#endif
2545
2546 /*
2547 * We mark the process as running here, but have not actually
2548 * inserted it onto the runqueue yet. This guarantees that
2549 * nobody will actually run it, and a signal or other external
2550 * event cannot wake it up and insert it on the runqueue either.
2551 */
2552 p->state = TASK_RUNNING;
2553} 2571}
2554 2572
2555/* 2573/*
@@ -2558,9 +2576,14 @@ static void __sched_fork(struct task_struct *p)
2558void sched_fork(struct task_struct *p, int clone_flags) 2576void sched_fork(struct task_struct *p, int clone_flags)
2559{ 2577{
2560 int cpu = get_cpu(); 2578 int cpu = get_cpu();
2561 unsigned long flags;
2562 2579
2563 __sched_fork(p); 2580 __sched_fork(p);
2581 /*
2582 * We mark the process as waking here. This guarantees that
2583 * nobody will actually run it, and a signal or other external
2584 * event cannot wake it up and insert it on the runqueue either.
2585 */
2586 p->state = TASK_WAKING;
2564 2587
2565 /* 2588 /*
2566 * Revert to default priority/policy on fork if requested. 2589 * Revert to default priority/policy on fork if requested.
@@ -2592,13 +2615,10 @@ void sched_fork(struct task_struct *p, int clone_flags)
2592 if (!rt_prio(p->prio)) 2615 if (!rt_prio(p->prio))
2593 p->sched_class = &fair_sched_class; 2616 p->sched_class = &fair_sched_class;
2594 2617
2595#ifdef CONFIG_SMP 2618 if (p->sched_class->task_fork)
2596 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2619 p->sched_class->task_fork(p);
2597#endif 2620
2598 local_irq_save(flags);
2599 update_rq_clock(cpu_rq(cpu));
2600 set_task_cpu(p, cpu); 2621 set_task_cpu(p, cpu);
2601 local_irq_restore(flags);
2602 2622
2603#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2623#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2604 if (likely(sched_info_on())) 2624 if (likely(sched_info_on()))
@@ -2627,28 +2647,35 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2627{ 2647{
2628 unsigned long flags; 2648 unsigned long flags;
2629 struct rq *rq; 2649 struct rq *rq;
2650 int cpu = get_cpu();
2651
2652#ifdef CONFIG_SMP
2653 /*
2654 * Fork balancing, do it here and not earlier because:
2655 * - cpus_allowed can change in the fork path
2656 * - any previously selected cpu might disappear through hotplug
2657 *
2658 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2659 * ->cpus_allowed is stable, we have preemption disabled, meaning
2660 * cpu_online_mask is stable.
2661 */
2662 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2663 set_task_cpu(p, cpu);
2664#endif
2630 2665
2631 rq = task_rq_lock(p, &flags); 2666 rq = task_rq_lock(p, &flags);
2632 BUG_ON(p->state != TASK_RUNNING); 2667 BUG_ON(p->state != TASK_WAKING);
2668 p->state = TASK_RUNNING;
2633 update_rq_clock(rq); 2669 update_rq_clock(rq);
2634 2670 activate_task(rq, p, 0);
2635 if (!p->sched_class->task_new || !current->se.on_rq) {
2636 activate_task(rq, p, 0);
2637 } else {
2638 /*
2639 * Let the scheduling class do new task startup
2640 * management (if any):
2641 */
2642 p->sched_class->task_new(rq, p);
2643 inc_nr_running(rq);
2644 }
2645 trace_sched_wakeup_new(rq, p, 1); 2671 trace_sched_wakeup_new(rq, p, 1);
2646 check_preempt_curr(rq, p, WF_FORK); 2672 check_preempt_curr(rq, p, WF_FORK);
2647#ifdef CONFIG_SMP 2673#ifdef CONFIG_SMP
2648 if (p->sched_class->task_wake_up) 2674 if (p->sched_class->task_woken)
2649 p->sched_class->task_wake_up(rq, p); 2675 p->sched_class->task_woken(rq, p);
2650#endif 2676#endif
2651 task_rq_unlock(rq, &flags); 2677 task_rq_unlock(rq, &flags);
2678 put_cpu();
2652} 2679}
2653 2680
2654#ifdef CONFIG_PREEMPT_NOTIFIERS 2681#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2798,10 +2825,10 @@ static inline void post_schedule(struct rq *rq)
2798 if (rq->post_schedule) { 2825 if (rq->post_schedule) {
2799 unsigned long flags; 2826 unsigned long flags;
2800 2827
2801 spin_lock_irqsave(&rq->lock, flags); 2828 raw_spin_lock_irqsave(&rq->lock, flags);
2802 if (rq->curr->sched_class->post_schedule) 2829 if (rq->curr->sched_class->post_schedule)
2803 rq->curr->sched_class->post_schedule(rq); 2830 rq->curr->sched_class->post_schedule(rq);
2804 spin_unlock_irqrestore(&rq->lock, flags); 2831 raw_spin_unlock_irqrestore(&rq->lock, flags);
2805 2832
2806 rq->post_schedule = 0; 2833 rq->post_schedule = 0;
2807 } 2834 }
@@ -3083,15 +3110,15 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3083{ 3110{
3084 BUG_ON(!irqs_disabled()); 3111 BUG_ON(!irqs_disabled());
3085 if (rq1 == rq2) { 3112 if (rq1 == rq2) {
3086 spin_lock(&rq1->lock); 3113 raw_spin_lock(&rq1->lock);
3087 __acquire(rq2->lock); /* Fake it out ;) */ 3114 __acquire(rq2->lock); /* Fake it out ;) */
3088 } else { 3115 } else {
3089 if (rq1 < rq2) { 3116 if (rq1 < rq2) {
3090 spin_lock(&rq1->lock); 3117 raw_spin_lock(&rq1->lock);
3091 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); 3118 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3092 } else { 3119 } else {
3093 spin_lock(&rq2->lock); 3120 raw_spin_lock(&rq2->lock);
3094 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 3121 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3095 } 3122 }
3096 } 3123 }
3097 update_rq_clock(rq1); 3124 update_rq_clock(rq1);
@@ -3108,29 +3135,44 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3108 __releases(rq1->lock) 3135 __releases(rq1->lock)
3109 __releases(rq2->lock) 3136 __releases(rq2->lock)
3110{ 3137{
3111 spin_unlock(&rq1->lock); 3138 raw_spin_unlock(&rq1->lock);
3112 if (rq1 != rq2) 3139 if (rq1 != rq2)
3113 spin_unlock(&rq2->lock); 3140 raw_spin_unlock(&rq2->lock);
3114 else 3141 else
3115 __release(rq2->lock); 3142 __release(rq2->lock);
3116} 3143}
3117 3144
3118/* 3145/*
3119 * If dest_cpu is allowed for this process, migrate the task to it. 3146 * sched_exec - execve() is a valuable balancing opportunity, because at
3120 * This is accomplished by forcing the cpu_allowed mask to only 3147 * this point the task has the smallest effective memory and cache footprint.
3121 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3122 * the cpu_allowed mask is restored.
3123 */ 3148 */
3124static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3149void sched_exec(void)
3125{ 3150{
3151 struct task_struct *p = current;
3126 struct migration_req req; 3152 struct migration_req req;
3153 int dest_cpu, this_cpu;
3127 unsigned long flags; 3154 unsigned long flags;
3128 struct rq *rq; 3155 struct rq *rq;
3129 3156
3157again:
3158 this_cpu = get_cpu();
3159 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3160 if (dest_cpu == this_cpu) {
3161 put_cpu();
3162 return;
3163 }
3164
3130 rq = task_rq_lock(p, &flags); 3165 rq = task_rq_lock(p, &flags);
3166 put_cpu();
3167
3168 /*
3169 * select_task_rq() can race against ->cpus_allowed
3170 */
3131 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3171 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3132 || unlikely(!cpu_active(dest_cpu))) 3172 || unlikely(!cpu_active(dest_cpu))) {
3133 goto out; 3173 task_rq_unlock(rq, &flags);
3174 goto again;
3175 }
3134 3176
3135 /* force the process onto the specified CPU */ 3177 /* force the process onto the specified CPU */
3136 if (migrate_task(p, dest_cpu, &req)) { 3178 if (migrate_task(p, dest_cpu, &req)) {
@@ -3145,24 +3187,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3145 3187
3146 return; 3188 return;
3147 } 3189 }
3148out:
3149 task_rq_unlock(rq, &flags); 3190 task_rq_unlock(rq, &flags);
3150} 3191}
3151 3192
3152/* 3193/*
3153 * sched_exec - execve() is a valuable balancing opportunity, because at
3154 * this point the task has the smallest effective memory and cache footprint.
3155 */
3156void sched_exec(void)
3157{
3158 int new_cpu, this_cpu = get_cpu();
3159 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3160 put_cpu();
3161 if (new_cpu != this_cpu)
3162 sched_migrate_task(current, new_cpu);
3163}
3164
3165/*
3166 * pull_task - move a task from a remote runqueue to the local runqueue. 3194 * pull_task - move a task from a remote runqueue to the local runqueue.
3167 * Both runqueues must be locked. 3195 * Both runqueues must be locked.
3168 */ 3196 */
@@ -3172,10 +3200,6 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
3172 deactivate_task(src_rq, p, 0); 3200 deactivate_task(src_rq, p, 0);
3173 set_task_cpu(p, this_cpu); 3201 set_task_cpu(p, this_cpu);
3174 activate_task(this_rq, p, 0); 3202 activate_task(this_rq, p, 0);
3175 /*
3176 * Note that idle threads have a prio of MAX_PRIO, for this test
3177 * to be always true for them.
3178 */
3179 check_preempt_curr(this_rq, p, 0); 3203 check_preempt_curr(this_rq, p, 0);
3180} 3204}
3181 3205
@@ -4134,7 +4158,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4134 unsigned long flags; 4158 unsigned long flags;
4135 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4159 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4136 4160
4137 cpumask_copy(cpus, cpu_online_mask); 4161 cpumask_copy(cpus, cpu_active_mask);
4138 4162
4139 /* 4163 /*
4140 * When power savings policy is enabled for the parent domain, idle 4164 * When power savings policy is enabled for the parent domain, idle
@@ -4207,14 +4231,15 @@ redo:
4207 4231
4208 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { 4232 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4209 4233
4210 spin_lock_irqsave(&busiest->lock, flags); 4234 raw_spin_lock_irqsave(&busiest->lock, flags);
4211 4235
4212 /* don't kick the migration_thread, if the curr 4236 /* don't kick the migration_thread, if the curr
4213 * task on busiest cpu can't be moved to this_cpu 4237 * task on busiest cpu can't be moved to this_cpu
4214 */ 4238 */
4215 if (!cpumask_test_cpu(this_cpu, 4239 if (!cpumask_test_cpu(this_cpu,
4216 &busiest->curr->cpus_allowed)) { 4240 &busiest->curr->cpus_allowed)) {
4217 spin_unlock_irqrestore(&busiest->lock, flags); 4241 raw_spin_unlock_irqrestore(&busiest->lock,
4242 flags);
4218 all_pinned = 1; 4243 all_pinned = 1;
4219 goto out_one_pinned; 4244 goto out_one_pinned;
4220 } 4245 }
@@ -4224,7 +4249,7 @@ redo:
4224 busiest->push_cpu = this_cpu; 4249 busiest->push_cpu = this_cpu;
4225 active_balance = 1; 4250 active_balance = 1;
4226 } 4251 }
4227 spin_unlock_irqrestore(&busiest->lock, flags); 4252 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4228 if (active_balance) 4253 if (active_balance)
4229 wake_up_process(busiest->migration_thread); 4254 wake_up_process(busiest->migration_thread);
4230 4255
@@ -4297,7 +4322,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4297 int all_pinned = 0; 4322 int all_pinned = 0;
4298 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4323 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4299 4324
4300 cpumask_copy(cpus, cpu_online_mask); 4325 cpumask_copy(cpus, cpu_active_mask);
4301 4326
4302 /* 4327 /*
4303 * When power savings policy is enabled for the parent domain, idle 4328 * When power savings policy is enabled for the parent domain, idle
@@ -4406,10 +4431,10 @@ redo:
4406 /* 4431 /*
4407 * Should not call ttwu while holding a rq->lock 4432 * Should not call ttwu while holding a rq->lock
4408 */ 4433 */
4409 spin_unlock(&this_rq->lock); 4434 raw_spin_unlock(&this_rq->lock);
4410 if (active_balance) 4435 if (active_balance)
4411 wake_up_process(busiest->migration_thread); 4436 wake_up_process(busiest->migration_thread);
4412 spin_lock(&this_rq->lock); 4437 raw_spin_lock(&this_rq->lock);
4413 4438
4414 } else 4439 } else
4415 sd->nr_balance_failed = 0; 4440 sd->nr_balance_failed = 0;
@@ -4694,7 +4719,7 @@ int select_nohz_load_balancer(int stop_tick)
4694 cpumask_set_cpu(cpu, nohz.cpu_mask); 4719 cpumask_set_cpu(cpu, nohz.cpu_mask);
4695 4720
4696 /* time for ilb owner also to sleep */ 4721 /* time for ilb owner also to sleep */
4697 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4722 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4698 if (atomic_read(&nohz.load_balancer) == cpu) 4723 if (atomic_read(&nohz.load_balancer) == cpu)
4699 atomic_set(&nohz.load_balancer, -1); 4724 atomic_set(&nohz.load_balancer, -1);
4700 return 0; 4725 return 0;
@@ -5278,11 +5303,11 @@ void scheduler_tick(void)
5278 5303
5279 sched_clock_tick(); 5304 sched_clock_tick();
5280 5305
5281 spin_lock(&rq->lock); 5306 raw_spin_lock(&rq->lock);
5282 update_rq_clock(rq); 5307 update_rq_clock(rq);
5283 update_cpu_load(rq); 5308 update_cpu_load(rq);
5284 curr->sched_class->task_tick(rq, curr, 0); 5309 curr->sched_class->task_tick(rq, curr, 0);
5285 spin_unlock(&rq->lock); 5310 raw_spin_unlock(&rq->lock);
5286 5311
5287 perf_event_task_tick(curr, cpu); 5312 perf_event_task_tick(curr, cpu);
5288 5313
@@ -5396,13 +5421,14 @@ static inline void schedule_debug(struct task_struct *prev)
5396#endif 5421#endif
5397} 5422}
5398 5423
5399static void put_prev_task(struct rq *rq, struct task_struct *p) 5424static void put_prev_task(struct rq *rq, struct task_struct *prev)
5400{ 5425{
5401 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 5426 if (prev->state == TASK_RUNNING) {
5427 u64 runtime = prev->se.sum_exec_runtime;
5402 5428
5403 update_avg(&p->se.avg_running, runtime); 5429 runtime -= prev->se.prev_sum_exec_runtime;
5430 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5404 5431
5405 if (p->state == TASK_RUNNING) {
5406 /* 5432 /*
5407 * In order to avoid avg_overlap growing stale when we are 5433 * In order to avoid avg_overlap growing stale when we are
5408 * indeed overlapping and hence not getting put to sleep, grow 5434 * indeed overlapping and hence not getting put to sleep, grow
@@ -5412,12 +5438,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5412 * correlates to the amount of cache footprint a task can 5438 * correlates to the amount of cache footprint a task can
5413 * build up. 5439 * build up.
5414 */ 5440 */
5415 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 5441 update_avg(&prev->se.avg_overlap, runtime);
5416 update_avg(&p->se.avg_overlap, runtime);
5417 } else {
5418 update_avg(&p->se.avg_running, 0);
5419 } 5442 }
5420 p->sched_class->put_prev_task(rq, p); 5443 prev->sched_class->put_prev_task(rq, prev);
5421} 5444}
5422 5445
5423/* 5446/*
@@ -5478,7 +5501,7 @@ need_resched_nonpreemptible:
5478 if (sched_feat(HRTICK)) 5501 if (sched_feat(HRTICK))
5479 hrtick_clear(rq); 5502 hrtick_clear(rq);
5480 5503
5481 spin_lock_irq(&rq->lock); 5504 raw_spin_lock_irq(&rq->lock);
5482 update_rq_clock(rq); 5505 update_rq_clock(rq);
5483 clear_tsk_need_resched(prev); 5506 clear_tsk_need_resched(prev);
5484 5507
@@ -5514,12 +5537,15 @@ need_resched_nonpreemptible:
5514 cpu = smp_processor_id(); 5537 cpu = smp_processor_id();
5515 rq = cpu_rq(cpu); 5538 rq = cpu_rq(cpu);
5516 } else 5539 } else
5517 spin_unlock_irq(&rq->lock); 5540 raw_spin_unlock_irq(&rq->lock);
5518 5541
5519 post_schedule(rq); 5542 post_schedule(rq);
5520 5543
5521 if (unlikely(reacquire_kernel_lock(current) < 0)) 5544 if (unlikely(reacquire_kernel_lock(current) < 0)) {
5545 prev = rq->curr;
5546 switch_count = &prev->nivcsw;
5522 goto need_resched_nonpreemptible; 5547 goto need_resched_nonpreemptible;
5548 }
5523 5549
5524 preempt_enable_no_resched(); 5550 preempt_enable_no_resched();
5525 if (need_resched()) 5551 if (need_resched())
@@ -5931,14 +5957,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
5931 */ 5957 */
5932bool try_wait_for_completion(struct completion *x) 5958bool try_wait_for_completion(struct completion *x)
5933{ 5959{
5960 unsigned long flags;
5934 int ret = 1; 5961 int ret = 1;
5935 5962
5936 spin_lock_irq(&x->wait.lock); 5963 spin_lock_irqsave(&x->wait.lock, flags);
5937 if (!x->done) 5964 if (!x->done)
5938 ret = 0; 5965 ret = 0;
5939 else 5966 else
5940 x->done--; 5967 x->done--;
5941 spin_unlock_irq(&x->wait.lock); 5968 spin_unlock_irqrestore(&x->wait.lock, flags);
5942 return ret; 5969 return ret;
5943} 5970}
5944EXPORT_SYMBOL(try_wait_for_completion); 5971EXPORT_SYMBOL(try_wait_for_completion);
@@ -5953,12 +5980,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
5953 */ 5980 */
5954bool completion_done(struct completion *x) 5981bool completion_done(struct completion *x)
5955{ 5982{
5983 unsigned long flags;
5956 int ret = 1; 5984 int ret = 1;
5957 5985
5958 spin_lock_irq(&x->wait.lock); 5986 spin_lock_irqsave(&x->wait.lock, flags);
5959 if (!x->done) 5987 if (!x->done)
5960 ret = 0; 5988 ret = 0;
5961 spin_unlock_irq(&x->wait.lock); 5989 spin_unlock_irqrestore(&x->wait.lock, flags);
5962 return ret; 5990 return ret;
5963} 5991}
5964EXPORT_SYMBOL(completion_done); 5992EXPORT_SYMBOL(completion_done);
@@ -6343,7 +6371,7 @@ recheck:
6343 * make sure no PI-waiters arrive (or leave) while we are 6371 * make sure no PI-waiters arrive (or leave) while we are
6344 * changing the priority of the task: 6372 * changing the priority of the task:
6345 */ 6373 */
6346 spin_lock_irqsave(&p->pi_lock, flags); 6374 raw_spin_lock_irqsave(&p->pi_lock, flags);
6347 /* 6375 /*
6348 * To be able to change p->policy safely, the apropriate 6376 * To be able to change p->policy safely, the apropriate
6349 * runqueue lock must be held. 6377 * runqueue lock must be held.
@@ -6353,7 +6381,7 @@ recheck:
6353 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 6381 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6354 policy = oldpolicy = -1; 6382 policy = oldpolicy = -1;
6355 __task_rq_unlock(rq); 6383 __task_rq_unlock(rq);
6356 spin_unlock_irqrestore(&p->pi_lock, flags); 6384 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6357 goto recheck; 6385 goto recheck;
6358 } 6386 }
6359 update_rq_clock(rq); 6387 update_rq_clock(rq);
@@ -6377,7 +6405,7 @@ recheck:
6377 check_class_changed(rq, p, prev_class, oldprio, running); 6405 check_class_changed(rq, p, prev_class, oldprio, running);
6378 } 6406 }
6379 __task_rq_unlock(rq); 6407 __task_rq_unlock(rq);
6380 spin_unlock_irqrestore(&p->pi_lock, flags); 6408 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6381 6409
6382 rt_mutex_adjust_pi(p); 6410 rt_mutex_adjust_pi(p);
6383 6411
@@ -6477,7 +6505,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6477 return -EINVAL; 6505 return -EINVAL;
6478 6506
6479 retval = -ESRCH; 6507 retval = -ESRCH;
6480 read_lock(&tasklist_lock); 6508 rcu_read_lock();
6481 p = find_process_by_pid(pid); 6509 p = find_process_by_pid(pid);
6482 if (p) { 6510 if (p) {
6483 retval = security_task_getscheduler(p); 6511 retval = security_task_getscheduler(p);
@@ -6485,7 +6513,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6485 retval = p->policy 6513 retval = p->policy
6486 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 6514 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6487 } 6515 }
6488 read_unlock(&tasklist_lock); 6516 rcu_read_unlock();
6489 return retval; 6517 return retval;
6490} 6518}
6491 6519
@@ -6503,7 +6531,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6503 if (!param || pid < 0) 6531 if (!param || pid < 0)
6504 return -EINVAL; 6532 return -EINVAL;
6505 6533
6506 read_lock(&tasklist_lock); 6534 rcu_read_lock();
6507 p = find_process_by_pid(pid); 6535 p = find_process_by_pid(pid);
6508 retval = -ESRCH; 6536 retval = -ESRCH;
6509 if (!p) 6537 if (!p)
@@ -6514,7 +6542,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6514 goto out_unlock; 6542 goto out_unlock;
6515 6543
6516 lp.sched_priority = p->rt_priority; 6544 lp.sched_priority = p->rt_priority;
6517 read_unlock(&tasklist_lock); 6545 rcu_read_unlock();
6518 6546
6519 /* 6547 /*
6520 * This one might sleep, we cannot do it with a spinlock held ... 6548 * This one might sleep, we cannot do it with a spinlock held ...
@@ -6524,7 +6552,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6524 return retval; 6552 return retval;
6525 6553
6526out_unlock: 6554out_unlock:
6527 read_unlock(&tasklist_lock); 6555 rcu_read_unlock();
6528 return retval; 6556 return retval;
6529} 6557}
6530 6558
@@ -6535,22 +6563,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6535 int retval; 6563 int retval;
6536 6564
6537 get_online_cpus(); 6565 get_online_cpus();
6538 read_lock(&tasklist_lock); 6566 rcu_read_lock();
6539 6567
6540 p = find_process_by_pid(pid); 6568 p = find_process_by_pid(pid);
6541 if (!p) { 6569 if (!p) {
6542 read_unlock(&tasklist_lock); 6570 rcu_read_unlock();
6543 put_online_cpus(); 6571 put_online_cpus();
6544 return -ESRCH; 6572 return -ESRCH;
6545 } 6573 }
6546 6574
6547 /* 6575 /* Prevent p going away */
6548 * It is not safe to call set_cpus_allowed with the
6549 * tasklist_lock held. We will bump the task_struct's
6550 * usage count and then drop tasklist_lock.
6551 */
6552 get_task_struct(p); 6576 get_task_struct(p);
6553 read_unlock(&tasklist_lock); 6577 rcu_read_unlock();
6554 6578
6555 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 6579 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6556 retval = -ENOMEM; 6580 retval = -ENOMEM;
@@ -6631,10 +6655,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6631long sched_getaffinity(pid_t pid, struct cpumask *mask) 6655long sched_getaffinity(pid_t pid, struct cpumask *mask)
6632{ 6656{
6633 struct task_struct *p; 6657 struct task_struct *p;
6658 unsigned long flags;
6659 struct rq *rq;
6634 int retval; 6660 int retval;
6635 6661
6636 get_online_cpus(); 6662 get_online_cpus();
6637 read_lock(&tasklist_lock); 6663 rcu_read_lock();
6638 6664
6639 retval = -ESRCH; 6665 retval = -ESRCH;
6640 p = find_process_by_pid(pid); 6666 p = find_process_by_pid(pid);
@@ -6645,10 +6671,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6645 if (retval) 6671 if (retval)
6646 goto out_unlock; 6672 goto out_unlock;
6647 6673
6674 rq = task_rq_lock(p, &flags);
6648 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 6675 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
6676 task_rq_unlock(rq, &flags);
6649 6677
6650out_unlock: 6678out_unlock:
6651 read_unlock(&tasklist_lock); 6679 rcu_read_unlock();
6652 put_online_cpus(); 6680 put_online_cpus();
6653 6681
6654 return retval; 6682 return retval;
@@ -6703,7 +6731,7 @@ SYSCALL_DEFINE0(sched_yield)
6703 */ 6731 */
6704 __release(rq->lock); 6732 __release(rq->lock);
6705 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 6733 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6706 _raw_spin_unlock(&rq->lock); 6734 do_raw_spin_unlock(&rq->lock);
6707 preempt_enable_no_resched(); 6735 preempt_enable_no_resched();
6708 6736
6709 schedule(); 6737 schedule();
@@ -6883,6 +6911,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6883{ 6911{
6884 struct task_struct *p; 6912 struct task_struct *p;
6885 unsigned int time_slice; 6913 unsigned int time_slice;
6914 unsigned long flags;
6915 struct rq *rq;
6886 int retval; 6916 int retval;
6887 struct timespec t; 6917 struct timespec t;
6888 6918
@@ -6890,7 +6920,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6890 return -EINVAL; 6920 return -EINVAL;
6891 6921
6892 retval = -ESRCH; 6922 retval = -ESRCH;
6893 read_lock(&tasklist_lock); 6923 rcu_read_lock();
6894 p = find_process_by_pid(pid); 6924 p = find_process_by_pid(pid);
6895 if (!p) 6925 if (!p)
6896 goto out_unlock; 6926 goto out_unlock;
@@ -6899,15 +6929,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6899 if (retval) 6929 if (retval)
6900 goto out_unlock; 6930 goto out_unlock;
6901 6931
6902 time_slice = p->sched_class->get_rr_interval(p); 6932 rq = task_rq_lock(p, &flags);
6933 time_slice = p->sched_class->get_rr_interval(rq, p);
6934 task_rq_unlock(rq, &flags);
6903 6935
6904 read_unlock(&tasklist_lock); 6936 rcu_read_unlock();
6905 jiffies_to_timespec(time_slice, &t); 6937 jiffies_to_timespec(time_slice, &t);
6906 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 6938 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6907 return retval; 6939 return retval;
6908 6940
6909out_unlock: 6941out_unlock:
6910 read_unlock(&tasklist_lock); 6942 rcu_read_unlock();
6911 return retval; 6943 return retval;
6912} 6944}
6913 6945
@@ -6995,12 +7027,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6995 struct rq *rq = cpu_rq(cpu); 7027 struct rq *rq = cpu_rq(cpu);
6996 unsigned long flags; 7028 unsigned long flags;
6997 7029
6998 spin_lock_irqsave(&rq->lock, flags); 7030 raw_spin_lock_irqsave(&rq->lock, flags);
6999 7031
7000 __sched_fork(idle); 7032 __sched_fork(idle);
7033 idle->state = TASK_RUNNING;
7001 idle->se.exec_start = sched_clock(); 7034 idle->se.exec_start = sched_clock();
7002 7035
7003 idle->prio = idle->normal_prio = MAX_PRIO;
7004 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 7036 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
7005 __set_task_cpu(idle, cpu); 7037 __set_task_cpu(idle, cpu);
7006 7038
@@ -7008,7 +7040,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
7008#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 7040#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
7009 idle->oncpu = 1; 7041 idle->oncpu = 1;
7010#endif 7042#endif
7011 spin_unlock_irqrestore(&rq->lock, flags); 7043 raw_spin_unlock_irqrestore(&rq->lock, flags);
7012 7044
7013 /* Set the preempt count _outside_ the spinlocks! */ 7045 /* Set the preempt count _outside_ the spinlocks! */
7014#if defined(CONFIG_PREEMPT) 7046#if defined(CONFIG_PREEMPT)
@@ -7041,22 +7073,43 @@ cpumask_var_t nohz_cpu_mask;
7041 * 7073 *
7042 * This idea comes from the SD scheduler of Con Kolivas: 7074 * This idea comes from the SD scheduler of Con Kolivas:
7043 */ 7075 */
7044static inline void sched_init_granularity(void) 7076static int get_update_sysctl_factor(void)
7045{ 7077{
7046 unsigned int factor = 1 + ilog2(num_online_cpus()); 7078 unsigned int cpus = min_t(int, num_online_cpus(), 8);
7047 const unsigned long limit = 200000000; 7079 unsigned int factor;
7048 7080
7049 sysctl_sched_min_granularity *= factor; 7081 switch (sysctl_sched_tunable_scaling) {
7050 if (sysctl_sched_min_granularity > limit) 7082 case SCHED_TUNABLESCALING_NONE:
7051 sysctl_sched_min_granularity = limit; 7083 factor = 1;
7084 break;
7085 case SCHED_TUNABLESCALING_LINEAR:
7086 factor = cpus;
7087 break;
7088 case SCHED_TUNABLESCALING_LOG:
7089 default:
7090 factor = 1 + ilog2(cpus);
7091 break;
7092 }
7052 7093
7053 sysctl_sched_latency *= factor; 7094 return factor;
7054 if (sysctl_sched_latency > limit) 7095}
7055 sysctl_sched_latency = limit;
7056 7096
7057 sysctl_sched_wakeup_granularity *= factor; 7097static void update_sysctl(void)
7098{
7099 unsigned int factor = get_update_sysctl_factor();
7058 7100
7059 sysctl_sched_shares_ratelimit *= factor; 7101#define SET_SYSCTL(name) \
7102 (sysctl_##name = (factor) * normalized_sysctl_##name)
7103 SET_SYSCTL(sched_min_granularity);
7104 SET_SYSCTL(sched_latency);
7105 SET_SYSCTL(sched_wakeup_granularity);
7106 SET_SYSCTL(sched_shares_ratelimit);
7107#undef SET_SYSCTL
7108}
7109
7110static inline void sched_init_granularity(void)
7111{
7112 update_sysctl();
7060} 7113}
7061 7114
7062#ifdef CONFIG_SMP 7115#ifdef CONFIG_SMP
@@ -7092,8 +7145,28 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7092 struct rq *rq; 7145 struct rq *rq;
7093 int ret = 0; 7146 int ret = 0;
7094 7147
7148 /*
7149 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7150 * the ->cpus_allowed mask from under waking tasks, which would be
7151 * possible when we change rq->lock in ttwu(), so synchronize against
7152 * TASK_WAKING to avoid that.
7153 *
7154 * Make an exception for freshly cloned tasks, since cpuset namespaces
7155 * might move the task about, we have to validate the target in
7156 * wake_up_new_task() anyway since the cpu might have gone away.
7157 */
7158again:
7159 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7160 cpu_relax();
7161
7095 rq = task_rq_lock(p, &flags); 7162 rq = task_rq_lock(p, &flags);
7096 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 7163
7164 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7165 task_rq_unlock(rq, &flags);
7166 goto again;
7167 }
7168
7169 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7097 ret = -EINVAL; 7170 ret = -EINVAL;
7098 goto out; 7171 goto out;
7099 } 7172 }
@@ -7115,7 +7188,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7115 if (cpumask_test_cpu(task_cpu(p), new_mask)) 7188 if (cpumask_test_cpu(task_cpu(p), new_mask))
7116 goto out; 7189 goto out;
7117 7190
7118 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 7191 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7119 /* Need help from migration thread: drop lock and wait. */ 7192 /* Need help from migration thread: drop lock and wait. */
7120 struct task_struct *mt = rq->migration_thread; 7193 struct task_struct *mt = rq->migration_thread;
7121 7194
@@ -7148,7 +7221,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7148static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 7221static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7149{ 7222{
7150 struct rq *rq_dest, *rq_src; 7223 struct rq *rq_dest, *rq_src;
7151 int ret = 0, on_rq; 7224 int ret = 0;
7152 7225
7153 if (unlikely(!cpu_active(dest_cpu))) 7226 if (unlikely(!cpu_active(dest_cpu)))
7154 return ret; 7227 return ret;
@@ -7164,12 +7237,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7164 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 7237 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7165 goto fail; 7238 goto fail;
7166 7239
7167 on_rq = p->se.on_rq; 7240 /*
7168 if (on_rq) 7241 * If we're not on a rq, the next wake-up will ensure we're
7242 * placed properly.
7243 */
7244 if (p->se.on_rq) {
7169 deactivate_task(rq_src, p, 0); 7245 deactivate_task(rq_src, p, 0);
7170 7246 set_task_cpu(p, dest_cpu);
7171 set_task_cpu(p, dest_cpu);
7172 if (on_rq) {
7173 activate_task(rq_dest, p, 0); 7247 activate_task(rq_dest, p, 0);
7174 check_preempt_curr(rq_dest, p, 0); 7248 check_preempt_curr(rq_dest, p, 0);
7175 } 7249 }
@@ -7204,10 +7278,10 @@ static int migration_thread(void *data)
7204 struct migration_req *req; 7278 struct migration_req *req;
7205 struct list_head *head; 7279 struct list_head *head;
7206 7280
7207 spin_lock_irq(&rq->lock); 7281 raw_spin_lock_irq(&rq->lock);
7208 7282
7209 if (cpu_is_offline(cpu)) { 7283 if (cpu_is_offline(cpu)) {
7210 spin_unlock_irq(&rq->lock); 7284 raw_spin_unlock_irq(&rq->lock);
7211 break; 7285 break;
7212 } 7286 }
7213 7287
@@ -7219,7 +7293,7 @@ static int migration_thread(void *data)
7219 head = &rq->migration_queue; 7293 head = &rq->migration_queue;
7220 7294
7221 if (list_empty(head)) { 7295 if (list_empty(head)) {
7222 spin_unlock_irq(&rq->lock); 7296 raw_spin_unlock_irq(&rq->lock);
7223 schedule(); 7297 schedule();
7224 set_current_state(TASK_INTERRUPTIBLE); 7298 set_current_state(TASK_INTERRUPTIBLE);
7225 continue; 7299 continue;
@@ -7228,14 +7302,14 @@ static int migration_thread(void *data)
7228 list_del_init(head->next); 7302 list_del_init(head->next);
7229 7303
7230 if (req->task != NULL) { 7304 if (req->task != NULL) {
7231 spin_unlock(&rq->lock); 7305 raw_spin_unlock(&rq->lock);
7232 __migrate_task(req->task, cpu, req->dest_cpu); 7306 __migrate_task(req->task, cpu, req->dest_cpu);
7233 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 7307 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7234 req->dest_cpu = RCU_MIGRATION_GOT_QS; 7308 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7235 spin_unlock(&rq->lock); 7309 raw_spin_unlock(&rq->lock);
7236 } else { 7310 } else {
7237 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 7311 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7238 spin_unlock(&rq->lock); 7312 raw_spin_unlock(&rq->lock);
7239 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 7313 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7240 } 7314 }
7241 local_irq_enable(); 7315 local_irq_enable();
@@ -7265,37 +7339,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7265static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 7339static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7266{ 7340{
7267 int dest_cpu; 7341 int dest_cpu;
7268 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7269 7342
7270again: 7343again:
7271 /* Look for allowed, online CPU in same node. */ 7344 dest_cpu = select_fallback_rq(dead_cpu, p);
7272 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
7273 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7274 goto move;
7275
7276 /* Any allowed, online CPU? */
7277 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
7278 if (dest_cpu < nr_cpu_ids)
7279 goto move;
7280
7281 /* No more Mr. Nice Guy. */
7282 if (dest_cpu >= nr_cpu_ids) {
7283 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7284 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
7285
7286 /*
7287 * Don't tell them about moving exiting tasks or
7288 * kernel threads (both mm NULL), since they never
7289 * leave kernel.
7290 */
7291 if (p->mm && printk_ratelimit()) {
7292 printk(KERN_INFO "process %d (%s) no "
7293 "longer affine to cpu%d\n",
7294 task_pid_nr(p), p->comm, dead_cpu);
7295 }
7296 }
7297 7345
7298move:
7299 /* It can have affinity changed while we were choosing. */ 7346 /* It can have affinity changed while we were choosing. */
7300 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 7347 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7301 goto again; 7348 goto again;
@@ -7310,7 +7357,7 @@ move:
7310 */ 7357 */
7311static void migrate_nr_uninterruptible(struct rq *rq_src) 7358static void migrate_nr_uninterruptible(struct rq *rq_src)
7312{ 7359{
7313 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 7360 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7314 unsigned long flags; 7361 unsigned long flags;
7315 7362
7316 local_irq_save(flags); 7363 local_irq_save(flags);
@@ -7358,14 +7405,14 @@ void sched_idle_next(void)
7358 * Strictly not necessary since rest of the CPUs are stopped by now 7405 * Strictly not necessary since rest of the CPUs are stopped by now
7359 * and interrupts disabled on the current cpu. 7406 * and interrupts disabled on the current cpu.
7360 */ 7407 */
7361 spin_lock_irqsave(&rq->lock, flags); 7408 raw_spin_lock_irqsave(&rq->lock, flags);
7362 7409
7363 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 7410 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7364 7411
7365 update_rq_clock(rq); 7412 update_rq_clock(rq);
7366 activate_task(rq, p, 0); 7413 activate_task(rq, p, 0);
7367 7414
7368 spin_unlock_irqrestore(&rq->lock, flags); 7415 raw_spin_unlock_irqrestore(&rq->lock, flags);
7369} 7416}
7370 7417
7371/* 7418/*
@@ -7401,9 +7448,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7401 * that's OK. No task can be added to this CPU, so iteration is 7448 * that's OK. No task can be added to this CPU, so iteration is
7402 * fine. 7449 * fine.
7403 */ 7450 */
7404 spin_unlock_irq(&rq->lock); 7451 raw_spin_unlock_irq(&rq->lock);
7405 move_task_off_dead_cpu(dead_cpu, p); 7452 move_task_off_dead_cpu(dead_cpu, p);
7406 spin_lock_irq(&rq->lock); 7453 raw_spin_lock_irq(&rq->lock);
7407 7454
7408 put_task_struct(p); 7455 put_task_struct(p);
7409} 7456}
@@ -7563,7 +7610,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7563static struct ctl_table_header *sd_sysctl_header; 7610static struct ctl_table_header *sd_sysctl_header;
7564static void register_sched_domain_sysctl(void) 7611static void register_sched_domain_sysctl(void)
7565{ 7612{
7566 int i, cpu_num = num_online_cpus(); 7613 int i, cpu_num = num_possible_cpus();
7567 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 7614 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7568 char buf[32]; 7615 char buf[32];
7569 7616
@@ -7573,7 +7620,7 @@ static void register_sched_domain_sysctl(void)
7573 if (entry == NULL) 7620 if (entry == NULL)
7574 return; 7621 return;
7575 7622
7576 for_each_online_cpu(i) { 7623 for_each_possible_cpu(i) {
7577 snprintf(buf, 32, "cpu%d", i); 7624 snprintf(buf, 32, "cpu%d", i);
7578 entry->procname = kstrdup(buf, GFP_KERNEL); 7625 entry->procname = kstrdup(buf, GFP_KERNEL);
7579 entry->mode = 0555; 7626 entry->mode = 0555;
@@ -7669,13 +7716,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7669 7716
7670 /* Update our root-domain */ 7717 /* Update our root-domain */
7671 rq = cpu_rq(cpu); 7718 rq = cpu_rq(cpu);
7672 spin_lock_irqsave(&rq->lock, flags); 7719 raw_spin_lock_irqsave(&rq->lock, flags);
7673 if (rq->rd) { 7720 if (rq->rd) {
7674 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7721 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7675 7722
7676 set_rq_online(rq); 7723 set_rq_online(rq);
7677 } 7724 }
7678 spin_unlock_irqrestore(&rq->lock, flags); 7725 raw_spin_unlock_irqrestore(&rq->lock, flags);
7679 break; 7726 break;
7680 7727
7681#ifdef CONFIG_HOTPLUG_CPU 7728#ifdef CONFIG_HOTPLUG_CPU
@@ -7700,14 +7747,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7700 put_task_struct(rq->migration_thread); 7747 put_task_struct(rq->migration_thread);
7701 rq->migration_thread = NULL; 7748 rq->migration_thread = NULL;
7702 /* Idle task back to normal (off runqueue, low prio) */ 7749 /* Idle task back to normal (off runqueue, low prio) */
7703 spin_lock_irq(&rq->lock); 7750 raw_spin_lock_irq(&rq->lock);
7704 update_rq_clock(rq); 7751 update_rq_clock(rq);
7705 deactivate_task(rq, rq->idle, 0); 7752 deactivate_task(rq, rq->idle, 0);
7706 rq->idle->static_prio = MAX_PRIO;
7707 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 7753 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7708 rq->idle->sched_class = &idle_sched_class; 7754 rq->idle->sched_class = &idle_sched_class;
7709 migrate_dead_tasks(cpu); 7755 migrate_dead_tasks(cpu);
7710 spin_unlock_irq(&rq->lock); 7756 raw_spin_unlock_irq(&rq->lock);
7711 cpuset_unlock(); 7757 cpuset_unlock();
7712 migrate_nr_uninterruptible(rq); 7758 migrate_nr_uninterruptible(rq);
7713 BUG_ON(rq->nr_running != 0); 7759 BUG_ON(rq->nr_running != 0);
@@ -7717,30 +7763,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7717 * they didn't take sched_hotcpu_mutex. Just wake up 7763 * they didn't take sched_hotcpu_mutex. Just wake up
7718 * the requestors. 7764 * the requestors.
7719 */ 7765 */
7720 spin_lock_irq(&rq->lock); 7766 raw_spin_lock_irq(&rq->lock);
7721 while (!list_empty(&rq->migration_queue)) { 7767 while (!list_empty(&rq->migration_queue)) {
7722 struct migration_req *req; 7768 struct migration_req *req;
7723 7769
7724 req = list_entry(rq->migration_queue.next, 7770 req = list_entry(rq->migration_queue.next,
7725 struct migration_req, list); 7771 struct migration_req, list);
7726 list_del_init(&req->list); 7772 list_del_init(&req->list);
7727 spin_unlock_irq(&rq->lock); 7773 raw_spin_unlock_irq(&rq->lock);
7728 complete(&req->done); 7774 complete(&req->done);
7729 spin_lock_irq(&rq->lock); 7775 raw_spin_lock_irq(&rq->lock);
7730 } 7776 }
7731 spin_unlock_irq(&rq->lock); 7777 raw_spin_unlock_irq(&rq->lock);
7732 break; 7778 break;
7733 7779
7734 case CPU_DYING: 7780 case CPU_DYING:
7735 case CPU_DYING_FROZEN: 7781 case CPU_DYING_FROZEN:
7736 /* Update our root-domain */ 7782 /* Update our root-domain */
7737 rq = cpu_rq(cpu); 7783 rq = cpu_rq(cpu);
7738 spin_lock_irqsave(&rq->lock, flags); 7784 raw_spin_lock_irqsave(&rq->lock, flags);
7739 if (rq->rd) { 7785 if (rq->rd) {
7740 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 7786 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7741 set_rq_offline(rq); 7787 set_rq_offline(rq);
7742 } 7788 }
7743 spin_unlock_irqrestore(&rq->lock, flags); 7789 raw_spin_unlock_irqrestore(&rq->lock, flags);
7744 break; 7790 break;
7745#endif 7791#endif
7746 } 7792 }
@@ -7970,7 +8016,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7970 struct root_domain *old_rd = NULL; 8016 struct root_domain *old_rd = NULL;
7971 unsigned long flags; 8017 unsigned long flags;
7972 8018
7973 spin_lock_irqsave(&rq->lock, flags); 8019 raw_spin_lock_irqsave(&rq->lock, flags);
7974 8020
7975 if (rq->rd) { 8021 if (rq->rd) {
7976 old_rd = rq->rd; 8022 old_rd = rq->rd;
@@ -7996,7 +8042,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7996 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 8042 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7997 set_rq_online(rq); 8043 set_rq_online(rq);
7998 8044
7999 spin_unlock_irqrestore(&rq->lock, flags); 8045 raw_spin_unlock_irqrestore(&rq->lock, flags);
8000 8046
8001 if (old_rd) 8047 if (old_rd)
8002 free_rootdomain(old_rd); 8048 free_rootdomain(old_rd);
@@ -8282,14 +8328,14 @@ enum s_alloc {
8282 */ 8328 */
8283#ifdef CONFIG_SCHED_SMT 8329#ifdef CONFIG_SCHED_SMT
8284static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 8330static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8285static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 8331static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
8286 8332
8287static int 8333static int
8288cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 8334cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8289 struct sched_group **sg, struct cpumask *unused) 8335 struct sched_group **sg, struct cpumask *unused)
8290{ 8336{
8291 if (sg) 8337 if (sg)
8292 *sg = &per_cpu(sched_group_cpus, cpu).sg; 8338 *sg = &per_cpu(sched_groups, cpu).sg;
8293 return cpu; 8339 return cpu;
8294} 8340}
8295#endif /* CONFIG_SCHED_SMT */ 8341#endif /* CONFIG_SCHED_SMT */
@@ -9099,7 +9145,7 @@ match1:
9099 if (doms_new == NULL) { 9145 if (doms_new == NULL) {
9100 ndoms_cur = 0; 9146 ndoms_cur = 0;
9101 doms_new = &fallback_doms; 9147 doms_new = &fallback_doms;
9102 cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); 9148 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
9103 WARN_ON_ONCE(dattr_new); 9149 WARN_ON_ONCE(dattr_new);
9104 } 9150 }
9105 9151
@@ -9230,8 +9276,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9230 switch (action) { 9276 switch (action) {
9231 case CPU_ONLINE: 9277 case CPU_ONLINE:
9232 case CPU_ONLINE_FROZEN: 9278 case CPU_ONLINE_FROZEN:
9233 case CPU_DEAD: 9279 case CPU_DOWN_PREPARE:
9234 case CPU_DEAD_FROZEN: 9280 case CPU_DOWN_PREPARE_FROZEN:
9281 case CPU_DOWN_FAILED:
9282 case CPU_DOWN_FAILED_FROZEN:
9235 partition_sched_domains(1, NULL, NULL); 9283 partition_sched_domains(1, NULL, NULL);
9236 return NOTIFY_OK; 9284 return NOTIFY_OK;
9237 9285
@@ -9278,7 +9326,7 @@ void __init sched_init_smp(void)
9278#endif 9326#endif
9279 get_online_cpus(); 9327 get_online_cpus();
9280 mutex_lock(&sched_domains_mutex); 9328 mutex_lock(&sched_domains_mutex);
9281 arch_init_sched_domains(cpu_online_mask); 9329 arch_init_sched_domains(cpu_active_mask);
9282 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 9330 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9283 if (cpumask_empty(non_isolated_cpus)) 9331 if (cpumask_empty(non_isolated_cpus))
9284 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 9332 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9351,13 +9399,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9351#ifdef CONFIG_SMP 9399#ifdef CONFIG_SMP
9352 rt_rq->rt_nr_migratory = 0; 9400 rt_rq->rt_nr_migratory = 0;
9353 rt_rq->overloaded = 0; 9401 rt_rq->overloaded = 0;
9354 plist_head_init(&rt_rq->pushable_tasks, &rq->lock); 9402 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
9355#endif 9403#endif
9356 9404
9357 rt_rq->rt_time = 0; 9405 rt_rq->rt_time = 0;
9358 rt_rq->rt_throttled = 0; 9406 rt_rq->rt_throttled = 0;
9359 rt_rq->rt_runtime = 0; 9407 rt_rq->rt_runtime = 0;
9360 spin_lock_init(&rt_rq->rt_runtime_lock); 9408 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
9361 9409
9362#ifdef CONFIG_RT_GROUP_SCHED 9410#ifdef CONFIG_RT_GROUP_SCHED
9363 rt_rq->rt_nr_boosted = 0; 9411 rt_rq->rt_nr_boosted = 0;
@@ -9517,7 +9565,7 @@ void __init sched_init(void)
9517 struct rq *rq; 9565 struct rq *rq;
9518 9566
9519 rq = cpu_rq(i); 9567 rq = cpu_rq(i);
9520 spin_lock_init(&rq->lock); 9568 raw_spin_lock_init(&rq->lock);
9521 rq->nr_running = 0; 9569 rq->nr_running = 0;
9522 rq->calc_load_active = 0; 9570 rq->calc_load_active = 0;
9523 rq->calc_load_update = jiffies + LOAD_FREQ; 9571 rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -9577,7 +9625,7 @@ void __init sched_init(void)
9577#elif defined CONFIG_USER_SCHED 9625#elif defined CONFIG_USER_SCHED
9578 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL); 9626 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9579 init_tg_rt_entry(&init_task_group, 9627 init_tg_rt_entry(&init_task_group,
9580 &per_cpu(init_rt_rq, i), 9628 &per_cpu(init_rt_rq_var, i),
9581 &per_cpu(init_sched_rt_entity, i), i, 1, 9629 &per_cpu(init_sched_rt_entity, i), i, 1,
9582 root_task_group.rt_se[i]); 9630 root_task_group.rt_se[i]);
9583#endif 9631#endif
@@ -9615,7 +9663,7 @@ void __init sched_init(void)
9615#endif 9663#endif
9616 9664
9617#ifdef CONFIG_RT_MUTEXES 9665#ifdef CONFIG_RT_MUTEXES
9618 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 9666 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
9619#endif 9667#endif
9620 9668
9621 /* 9669 /*
@@ -9659,7 +9707,7 @@ void __init sched_init(void)
9659#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 9707#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9660static inline int preempt_count_equals(int preempt_offset) 9708static inline int preempt_count_equals(int preempt_offset)
9661{ 9709{
9662 int nested = preempt_count() & ~PREEMPT_ACTIVE; 9710 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
9663 9711
9664 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 9712 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9665} 9713}
@@ -9740,13 +9788,13 @@ void normalize_rt_tasks(void)
9740 continue; 9788 continue;
9741 } 9789 }
9742 9790
9743 spin_lock(&p->pi_lock); 9791 raw_spin_lock(&p->pi_lock);
9744 rq = __task_rq_lock(p); 9792 rq = __task_rq_lock(p);
9745 9793
9746 normalize_task(rq, p); 9794 normalize_task(rq, p);
9747 9795
9748 __task_rq_unlock(rq); 9796 __task_rq_unlock(rq);
9749 spin_unlock(&p->pi_lock); 9797 raw_spin_unlock(&p->pi_lock);
9750 } while_each_thread(g, p); 9798 } while_each_thread(g, p);
9751 9799
9752 read_unlock_irqrestore(&tasklist_lock, flags); 9800 read_unlock_irqrestore(&tasklist_lock, flags);
@@ -9842,13 +9890,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9842 se = kzalloc_node(sizeof(struct sched_entity), 9890 se = kzalloc_node(sizeof(struct sched_entity),
9843 GFP_KERNEL, cpu_to_node(i)); 9891 GFP_KERNEL, cpu_to_node(i));
9844 if (!se) 9892 if (!se)
9845 goto err; 9893 goto err_free_rq;
9846 9894
9847 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 9895 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9848 } 9896 }
9849 9897
9850 return 1; 9898 return 1;
9851 9899
9900 err_free_rq:
9901 kfree(cfs_rq);
9852 err: 9902 err:
9853 return 0; 9903 return 0;
9854} 9904}
@@ -9930,13 +9980,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9930 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 9980 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9931 GFP_KERNEL, cpu_to_node(i)); 9981 GFP_KERNEL, cpu_to_node(i));
9932 if (!rt_se) 9982 if (!rt_se)
9933 goto err; 9983 goto err_free_rq;
9934 9984
9935 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 9985 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9936 } 9986 }
9937 9987
9938 return 1; 9988 return 1;
9939 9989
9990 err_free_rq:
9991 kfree(rt_rq);
9940 err: 9992 err:
9941 return 0; 9993 return 0;
9942} 9994}
@@ -10070,7 +10122,7 @@ void sched_move_task(struct task_struct *tsk)
10070 10122
10071#ifdef CONFIG_FAIR_GROUP_SCHED 10123#ifdef CONFIG_FAIR_GROUP_SCHED
10072 if (tsk->sched_class->moved_group) 10124 if (tsk->sched_class->moved_group)
10073 tsk->sched_class->moved_group(tsk); 10125 tsk->sched_class->moved_group(tsk, on_rq);
10074#endif 10126#endif
10075 10127
10076 if (unlikely(running)) 10128 if (unlikely(running))
@@ -10105,9 +10157,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
10105 struct rq *rq = cfs_rq->rq; 10157 struct rq *rq = cfs_rq->rq;
10106 unsigned long flags; 10158 unsigned long flags;
10107 10159
10108 spin_lock_irqsave(&rq->lock, flags); 10160 raw_spin_lock_irqsave(&rq->lock, flags);
10109 __set_se_shares(se, shares); 10161 __set_se_shares(se, shares);
10110 spin_unlock_irqrestore(&rq->lock, flags); 10162 raw_spin_unlock_irqrestore(&rq->lock, flags);
10111} 10163}
10112 10164
10113static DEFINE_MUTEX(shares_mutex); 10165static DEFINE_MUTEX(shares_mutex);
@@ -10292,18 +10344,18 @@ static int tg_set_bandwidth(struct task_group *tg,
10292 if (err) 10344 if (err)
10293 goto unlock; 10345 goto unlock;
10294 10346
10295 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10347 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10296 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 10348 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10297 tg->rt_bandwidth.rt_runtime = rt_runtime; 10349 tg->rt_bandwidth.rt_runtime = rt_runtime;
10298 10350
10299 for_each_possible_cpu(i) { 10351 for_each_possible_cpu(i) {
10300 struct rt_rq *rt_rq = tg->rt_rq[i]; 10352 struct rt_rq *rt_rq = tg->rt_rq[i];
10301 10353
10302 spin_lock(&rt_rq->rt_runtime_lock); 10354 raw_spin_lock(&rt_rq->rt_runtime_lock);
10303 rt_rq->rt_runtime = rt_runtime; 10355 rt_rq->rt_runtime = rt_runtime;
10304 spin_unlock(&rt_rq->rt_runtime_lock); 10356 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10305 } 10357 }
10306 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 10358 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10307 unlock: 10359 unlock:
10308 read_unlock(&tasklist_lock); 10360 read_unlock(&tasklist_lock);
10309 mutex_unlock(&rt_constraints_mutex); 10361 mutex_unlock(&rt_constraints_mutex);
@@ -10408,15 +10460,15 @@ static int sched_rt_global_constraints(void)
10408 if (sysctl_sched_rt_runtime == 0) 10460 if (sysctl_sched_rt_runtime == 0)
10409 return -EBUSY; 10461 return -EBUSY;
10410 10462
10411 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 10463 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10412 for_each_possible_cpu(i) { 10464 for_each_possible_cpu(i) {
10413 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 10465 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10414 10466
10415 spin_lock(&rt_rq->rt_runtime_lock); 10467 raw_spin_lock(&rt_rq->rt_runtime_lock);
10416 rt_rq->rt_runtime = global_rt_runtime(); 10468 rt_rq->rt_runtime = global_rt_runtime();
10417 spin_unlock(&rt_rq->rt_runtime_lock); 10469 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10418 } 10470 }
10419 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 10471 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10420 10472
10421 return 0; 10473 return 0;
10422} 10474}
@@ -10707,9 +10759,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10707 /* 10759 /*
10708 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 10760 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10709 */ 10761 */
10710 spin_lock_irq(&cpu_rq(cpu)->lock); 10762 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10711 data = *cpuusage; 10763 data = *cpuusage;
10712 spin_unlock_irq(&cpu_rq(cpu)->lock); 10764 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10713#else 10765#else
10714 data = *cpuusage; 10766 data = *cpuusage;
10715#endif 10767#endif
@@ -10725,9 +10777,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10725 /* 10777 /*
10726 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 10778 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10727 */ 10779 */
10728 spin_lock_irq(&cpu_rq(cpu)->lock); 10780 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10729 *cpuusage = val; 10781 *cpuusage = val;
10730 spin_unlock_irq(&cpu_rq(cpu)->lock); 10782 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10731#else 10783#else
10732 *cpuusage = val; 10784 *cpuusage = val;
10733#endif 10785#endif
@@ -10961,9 +11013,9 @@ void synchronize_sched_expedited(void)
10961 init_completion(&req->done); 11013 init_completion(&req->done);
10962 req->task = NULL; 11014 req->task = NULL;
10963 req->dest_cpu = RCU_MIGRATION_NEED_QS; 11015 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10964 spin_lock_irqsave(&rq->lock, flags); 11016 raw_spin_lock_irqsave(&rq->lock, flags);
10965 list_add(&req->list, &rq->migration_queue); 11017 list_add(&req->list, &rq->migration_queue);
10966 spin_unlock_irqrestore(&rq->lock, flags); 11018 raw_spin_unlock_irqrestore(&rq->lock, flags);
10967 wake_up_process(rq->migration_thread); 11019 wake_up_process(rq->migration_thread);
10968 } 11020 }
10969 for_each_online_cpu(cpu) { 11021 for_each_online_cpu(cpu) {
@@ -10971,11 +11023,11 @@ void synchronize_sched_expedited(void)
10971 req = &per_cpu(rcu_migration_req, cpu); 11023 req = &per_cpu(rcu_migration_req, cpu);
10972 rq = cpu_rq(cpu); 11024 rq = cpu_rq(cpu);
10973 wait_for_completion(&req->done); 11025 wait_for_completion(&req->done);
10974 spin_lock_irqsave(&rq->lock, flags); 11026 raw_spin_lock_irqsave(&rq->lock, flags);
10975 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 11027 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10976 need_full_sync = 1; 11028 need_full_sync = 1;
10977 req->dest_cpu = RCU_MIGRATION_IDLE; 11029 req->dest_cpu = RCU_MIGRATION_IDLE;
10978 spin_unlock_irqrestore(&rq->lock, flags); 11030 raw_spin_unlock_irqrestore(&rq->lock, flags);
10979 } 11031 }
10980 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 11032 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10981 synchronize_sched_expedited_count++; 11033 synchronize_sched_expedited_count++;
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 479ce5682d7..5b496132c28 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
236} 236}
237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
238 238
239unsigned long long cpu_clock(int cpu)
240{
241 unsigned long long clock;
242 unsigned long flags;
243
244 local_irq_save(flags);
245 clock = sched_clock_cpu(cpu);
246 local_irq_restore(flags);
247
248 return clock;
249}
250
239#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 251#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
240 252
241void sched_clock_init(void) 253void sched_clock_init(void)
@@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu)
251 return sched_clock(); 263 return sched_clock();
252} 264}
253 265
254#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
255 266
256unsigned long long cpu_clock(int cpu) 267unsigned long long cpu_clock(int cpu)
257{ 268{
258 unsigned long long clock; 269 return sched_clock_cpu(cpu);
259 unsigned long flags; 270}
260 271
261 local_irq_save(flags); 272#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
262 clock = sched_clock_cpu(cpu);
263 local_irq_restore(flags);
264 273
265 return clock;
266}
267EXPORT_SYMBOL_GPL(cpu_clock); 274EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 0f052fc674d..597b33099df 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -135,26 +135,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
135 if (likely(newpri != CPUPRI_INVALID)) { 135 if (likely(newpri != CPUPRI_INVALID)) {
136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
137 137
138 spin_lock_irqsave(&vec->lock, flags); 138 raw_spin_lock_irqsave(&vec->lock, flags);
139 139
140 cpumask_set_cpu(cpu, vec->mask); 140 cpumask_set_cpu(cpu, vec->mask);
141 vec->count++; 141 vec->count++;
142 if (vec->count == 1) 142 if (vec->count == 1)
143 set_bit(newpri, cp->pri_active); 143 set_bit(newpri, cp->pri_active);
144 144
145 spin_unlock_irqrestore(&vec->lock, flags); 145 raw_spin_unlock_irqrestore(&vec->lock, flags);
146 } 146 }
147 if (likely(oldpri != CPUPRI_INVALID)) { 147 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149 149
150 spin_lock_irqsave(&vec->lock, flags); 150 raw_spin_lock_irqsave(&vec->lock, flags);
151 151
152 vec->count--; 152 vec->count--;
153 if (!vec->count) 153 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active); 154 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask); 155 cpumask_clear_cpu(cpu, vec->mask);
156 156
157 spin_unlock_irqrestore(&vec->lock, flags); 157 raw_spin_unlock_irqrestore(&vec->lock, flags);
158 } 158 }
159 159
160 *currpri = newpri; 160 *currpri = newpri;
@@ -180,7 +180,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
181 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 181 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
182 182
183 spin_lock_init(&vec->lock); 183 raw_spin_lock_init(&vec->lock);
184 vec->count = 0; 184 vec->count = 0;
185 if (!zalloc_cpumask_var(&vec->mask, gfp)) 185 if (!zalloc_cpumask_var(&vec->mask, gfp))
186 goto cleanup; 186 goto cleanup;
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9a7e859b8fb..7cb5bb6b95b 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -12,7 +12,7 @@
12/* values 2-101 are RT priorities 0-99 */ 12/* values 2-101 are RT priorities 0-99 */
13 13
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 raw_spinlock_t lock;
16 int count; 16 int count;
17 cpumask_var_t mask; 17 cpumask_var_t mask;
18}; 18};
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6988cf08f70..67f95aada4b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -184,7 +184,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
185 SPLIT_NS(cfs_rq->exec_clock)); 185 SPLIT_NS(cfs_rq->exec_clock));
186 186
187 spin_lock_irqsave(&rq->lock, flags); 187 raw_spin_lock_irqsave(&rq->lock, flags);
188 if (cfs_rq->rb_leftmost) 188 if (cfs_rq->rb_leftmost)
189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
190 last = __pick_last_entity(cfs_rq); 190 last = __pick_last_entity(cfs_rq);
@@ -192,7 +192,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 raw_spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", 198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
309 print_rq(m, rq, cpu); 309 print_rq(m, rq, cpu);
310} 310}
311 311
312static const char *sched_tunable_scaling_names[] = {
313 "none",
314 "logaritmic",
315 "linear"
316};
317
312static int sched_debug_show(struct seq_file *m, void *v) 318static int sched_debug_show(struct seq_file *m, void *v)
313{ 319{
314 u64 now = ktime_to_ns(ktime_get()); 320 u64 now = ktime_to_ns(ktime_get());
@@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
334#undef PN 340#undef PN
335#undef P 341#undef P
336 342
343 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
344 sysctl_sched_tunable_scaling,
345 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
346
337 for_each_online_cpu(cpu) 347 for_each_online_cpu(cpu)
338 print_cpu(m, cpu); 348 print_cpu(m, cpu);
339 349
@@ -399,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
399 PN(se.sum_exec_runtime); 409 PN(se.sum_exec_runtime);
400 PN(se.avg_overlap); 410 PN(se.avg_overlap);
401 PN(se.avg_wakeup); 411 PN(se.avg_wakeup);
402 PN(se.avg_running);
403 412
404 nr_switches = p->nvcsw + p->nivcsw; 413 nr_switches = p->nvcsw + p->nivcsw;
405 414
@@ -423,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
423 P(se.nr_failed_migrations_running); 432 P(se.nr_failed_migrations_running);
424 P(se.nr_failed_migrations_hot); 433 P(se.nr_failed_migrations_hot);
425 P(se.nr_forced_migrations); 434 P(se.nr_forced_migrations);
426 P(se.nr_forced2_migrations);
427 P(se.nr_wakeups); 435 P(se.nr_wakeups);
428 P(se.nr_wakeups_sync); 436 P(se.nr_wakeups_sync);
429 P(se.nr_wakeups_migrate); 437 P(se.nr_wakeups_migrate);
@@ -499,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
499 p->se.nr_failed_migrations_running = 0; 507 p->se.nr_failed_migrations_running = 0;
500 p->se.nr_failed_migrations_hot = 0; 508 p->se.nr_failed_migrations_hot = 0;
501 p->se.nr_forced_migrations = 0; 509 p->se.nr_forced_migrations = 0;
502 p->se.nr_forced2_migrations = 0;
503 p->se.nr_wakeups = 0; 510 p->se.nr_wakeups = 0;
504 p->se.nr_wakeups_sync = 0; 511 p->se.nr_wakeups_sync = 0;
505 p->se.nr_wakeups_migrate = 0; 512 p->se.nr_wakeups_migrate = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f61837ad336..8fe7ee81c55 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h>
24 25
25/* 26/*
26 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
36 */ 37 */
37unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 5000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL;
40
41/*
42 * The initial- and re-scaling of tunables is configurable
43 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
44 *
45 * Options are:
46 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
48 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
49 */
50enum sched_tunable_scaling sysctl_sched_tunable_scaling
51 = SCHED_TUNABLESCALING_LOG;
38 52
39/* 53/*
40 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 56 */
43unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 1000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44 59
45/* 60/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70 * have immediate wakeup/sleep latencies. 85 * have immediate wakeup/sleep latencies.
71 */ 86 */
72unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73 89
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 91
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
383 */ 399 */
384 400
385#ifdef CONFIG_SCHED_DEBUG 401#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 402int sched_proc_update_handler(struct ctl_table *table, int write,
387 void __user *buffer, size_t *lenp, 403 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 404 loff_t *ppos)
389{ 405{
390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 406 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
407 int factor = get_update_sysctl_factor();
391 408
392 if (ret || !write) 409 if (ret || !write)
393 return ret; 410 return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, 412 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396 sysctl_sched_min_granularity); 413 sysctl_sched_min_granularity);
397 414
415#define WRT_SYSCTL(name) \
416 (normalized_sysctl_##name = sysctl_##name / (factor))
417 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL
422
398 return 0; 423 return 0;
399} 424}
400#endif 425#endif
@@ -485,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
485 curr->sum_exec_runtime += delta_exec; 510 curr->sum_exec_runtime += delta_exec;
486 schedstat_add(cfs_rq, exec_clock, delta_exec); 511 schedstat_add(cfs_rq, exec_clock, delta_exec);
487 delta_exec_weighted = calc_delta_fair(delta_exec, curr); 512 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
513
488 curr->vruntime += delta_exec_weighted; 514 curr->vruntime += delta_exec_weighted;
489 update_min_vruntime(cfs_rq); 515 update_min_vruntime(cfs_rq);
490} 516}
@@ -740,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
740 se->vruntime = vruntime; 766 se->vruntime = vruntime;
741} 767}
742 768
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
743static void 772static void
744enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) 773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
745{ 774{
746 /* 775 /*
776 * Update the normalized vruntime before updating min_vruntime
777 * through callig update_curr().
778 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
780 se->vruntime += cfs_rq->min_vruntime;
781
782 /*
747 * Update run-time statistics of the 'current'. 783 * Update run-time statistics of the 'current'.
748 */ 784 */
749 update_curr(cfs_rq); 785 update_curr(cfs_rq);
750 account_entity_enqueue(cfs_rq, se); 786 account_entity_enqueue(cfs_rq, se);
751 787
752 if (wakeup) { 788 if (flags & ENQUEUE_WAKEUP) {
753 place_entity(cfs_rq, se, 0); 789 place_entity(cfs_rq, se, 0);
754 enqueue_sleeper(cfs_rq, se); 790 enqueue_sleeper(cfs_rq, se);
755 } 791 }
@@ -803,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
803 __dequeue_entity(cfs_rq, se); 839 __dequeue_entity(cfs_rq, se);
804 account_entity_dequeue(cfs_rq, se); 840 account_entity_dequeue(cfs_rq, se);
805 update_min_vruntime(cfs_rq); 841 update_min_vruntime(cfs_rq);
842
843 /*
844 * Normalize the entity after updating the min_vruntime because the
845 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position.
847 */
848 if (!sleep)
849 se->vruntime -= cfs_rq->min_vruntime;
806} 850}
807 851
808/* 852/*
@@ -1013,13 +1057,19 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
1013{ 1057{
1014 struct cfs_rq *cfs_rq; 1058 struct cfs_rq *cfs_rq;
1015 struct sched_entity *se = &p->se; 1059 struct sched_entity *se = &p->se;
1060 int flags = 0;
1061
1062 if (wakeup)
1063 flags |= ENQUEUE_WAKEUP;
1064 if (p->state == TASK_WAKING)
1065 flags |= ENQUEUE_MIGRATE;
1016 1066
1017 for_each_sched_entity(se) { 1067 for_each_sched_entity(se) {
1018 if (se->on_rq) 1068 if (se->on_rq)
1019 break; 1069 break;
1020 cfs_rq = cfs_rq_of(se); 1070 cfs_rq = cfs_rq_of(se);
1021 enqueue_entity(cfs_rq, se, wakeup); 1071 enqueue_entity(cfs_rq, se, flags);
1022 wakeup = 1; 1072 flags = ENQUEUE_WAKEUP;
1023 } 1073 }
1024 1074
1025 hrtick_update(rq); 1075 hrtick_update(rq);
@@ -1095,6 +1145,14 @@ static void yield_task_fair(struct rq *rq)
1095 1145
1096#ifdef CONFIG_SMP 1146#ifdef CONFIG_SMP
1097 1147
1148static void task_waking_fair(struct rq *rq, struct task_struct *p)
1149{
1150 struct sched_entity *se = &p->se;
1151 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1152
1153 se->vruntime -= cfs_rq->min_vruntime;
1154}
1155
1098#ifdef CONFIG_FAIR_GROUP_SCHED 1156#ifdef CONFIG_FAIR_GROUP_SCHED
1099/* 1157/*
1100 * effective_load() calculates the load change as seen from the root_task_group 1158 * effective_load() calculates the load change as seen from the root_task_group
@@ -1403,8 +1461,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1403 new_cpu = prev_cpu; 1461 new_cpu = prev_cpu;
1404 } 1462 }
1405 1463
1406 rcu_read_lock();
1407 for_each_domain(cpu, tmp) { 1464 for_each_domain(cpu, tmp) {
1465 if (!(tmp->flags & SD_LOAD_BALANCE))
1466 continue;
1467
1408 /* 1468 /*
1409 * If power savings logic is enabled for a domain, see if we 1469 * If power savings logic is enabled for a domain, see if we
1410 * are not overloaded, if so, don't balance wider. 1470 * are not overloaded, if so, don't balance wider.
@@ -1448,7 +1508,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1448 * If there's an idle sibling in this domain, make that 1508 * If there's an idle sibling in this domain, make that
1449 * the wake_affine target instead of the current cpu. 1509 * the wake_affine target instead of the current cpu.
1450 */ 1510 */
1451 if (tmp->flags & SD_PREFER_SIBLING) 1511 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1452 target = select_idle_sibling(p, tmp, target); 1512 target = select_idle_sibling(p, tmp, target);
1453 1513
1454 if (target >= 0) { 1514 if (target >= 0) {
@@ -1484,10 +1544,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1484 update_shares(tmp); 1544 update_shares(tmp);
1485 } 1545 }
1486 1546
1487 if (affine_sd && wake_affine(affine_sd, p, sync)) { 1547 if (affine_sd && wake_affine(affine_sd, p, sync))
1488 new_cpu = cpu; 1548 return cpu;
1489 goto out;
1490 }
1491 1549
1492 while (sd) { 1550 while (sd) {
1493 int load_idx = sd->forkexec_idx; 1551 int load_idx = sd->forkexec_idx;
@@ -1528,8 +1586,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1528 /* while loop will break here if sd == NULL */ 1586 /* while loop will break here if sd == NULL */
1529 } 1587 }
1530 1588
1531out:
1532 rcu_read_unlock();
1533 return new_cpu; 1589 return new_cpu;
1534} 1590}
1535#endif /* CONFIG_SMP */ 1591#endif /* CONFIG_SMP */
@@ -1651,12 +1707,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1651 int sync = wake_flags & WF_SYNC; 1707 int sync = wake_flags & WF_SYNC;
1652 int scale = cfs_rq->nr_running >= sched_nr_latency; 1708 int scale = cfs_rq->nr_running >= sched_nr_latency;
1653 1709
1654 update_curr(cfs_rq); 1710 if (unlikely(rt_prio(p->prio)))
1655 1711 goto preempt;
1656 if (unlikely(rt_prio(p->prio))) {
1657 resched_task(curr);
1658 return;
1659 }
1660 1712
1661 if (unlikely(p->sched_class != &fair_sched_class)) 1713 if (unlikely(p->sched_class != &fair_sched_class))
1662 return; 1714 return;
@@ -1682,50 +1734,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1682 return; 1734 return;
1683 1735
1684 /* Idle tasks are by definition preempted by everybody. */ 1736 /* Idle tasks are by definition preempted by everybody. */
1685 if (unlikely(curr->policy == SCHED_IDLE)) { 1737 if (unlikely(curr->policy == SCHED_IDLE))
1686 resched_task(curr); 1738 goto preempt;
1687 return;
1688 }
1689 1739
1690 if ((sched_feat(WAKEUP_SYNC) && sync) || 1740 if (sched_feat(WAKEUP_SYNC) && sync)
1691 (sched_feat(WAKEUP_OVERLAP) && 1741 goto preempt;
1692 (se->avg_overlap < sysctl_sched_migration_cost &&
1693 pse->avg_overlap < sysctl_sched_migration_cost))) {
1694 resched_task(curr);
1695 return;
1696 }
1697 1742
1698 if (sched_feat(WAKEUP_RUNNING)) { 1743 if (sched_feat(WAKEUP_OVERLAP) &&
1699 if (pse->avg_running < se->avg_running) { 1744 se->avg_overlap < sysctl_sched_migration_cost &&
1700 set_next_buddy(pse); 1745 pse->avg_overlap < sysctl_sched_migration_cost)
1701 resched_task(curr); 1746 goto preempt;
1702 return;
1703 }
1704 }
1705 1747
1706 if (!sched_feat(WAKEUP_PREEMPT)) 1748 if (!sched_feat(WAKEUP_PREEMPT))
1707 return; 1749 return;
1708 1750
1751 update_curr(cfs_rq);
1709 find_matching_se(&se, &pse); 1752 find_matching_se(&se, &pse);
1710
1711 BUG_ON(!pse); 1753 BUG_ON(!pse);
1754 if (wakeup_preempt_entity(se, pse) == 1)
1755 goto preempt;
1712 1756
1713 if (wakeup_preempt_entity(se, pse) == 1) { 1757 return;
1714 resched_task(curr); 1758
1715 /* 1759preempt:
1716 * Only set the backward buddy when the current task is still 1760 resched_task(curr);
1717 * on the rq. This can happen when a wakeup gets interleaved 1761 /*
1718 * with schedule on the ->pre_schedule() or idle_balance() 1762 * Only set the backward buddy when the current task is still
1719 * point, either of which can * drop the rq lock. 1763 * on the rq. This can happen when a wakeup gets interleaved
1720 * 1764 * with schedule on the ->pre_schedule() or idle_balance()
1721 * Also, during early boot the idle thread is in the fair class, 1765 * point, either of which can * drop the rq lock.
1722 * for obvious reasons its a bad idea to schedule back to it. 1766 *
1723 */ 1767 * Also, during early boot the idle thread is in the fair class,
1724 if (unlikely(!se->on_rq || curr == rq->idle)) 1768 * for obvious reasons its a bad idea to schedule back to it.
1725 return; 1769 */
1726 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) 1770 if (unlikely(!se->on_rq || curr == rq->idle))
1727 set_last_buddy(se); 1771 return;
1728 } 1772
1773 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1774 set_last_buddy(se);
1729} 1775}
1730 1776
1731static struct task_struct *pick_next_task_fair(struct rq *rq) 1777static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1905,6 +1951,17 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1905 1951
1906 return 0; 1952 return 0;
1907} 1953}
1954
1955static void rq_online_fair(struct rq *rq)
1956{
1957 update_sysctl();
1958}
1959
1960static void rq_offline_fair(struct rq *rq)
1961{
1962 update_sysctl();
1963}
1964
1908#endif /* CONFIG_SMP */ 1965#endif /* CONFIG_SMP */
1909 1966
1910/* 1967/*
@@ -1922,28 +1979,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1922} 1979}
1923 1980
1924/* 1981/*
1925 * Share the fairness runtime between parent and child, thus the 1982 * called on fork with the child task as argument from the parent's context
1926 * total amount of pressure for CPU stays equal - new tasks 1983 * - child not yet on the tasklist
1927 * get a chance to run but frequent forkers are not allowed to 1984 * - preemption disabled
1928 * monopolize the CPU. Note: the parent runqueue is locked,
1929 * the child is not running yet.
1930 */ 1985 */
1931static void task_new_fair(struct rq *rq, struct task_struct *p) 1986static void task_fork_fair(struct task_struct *p)
1932{ 1987{
1933 struct cfs_rq *cfs_rq = task_cfs_rq(p); 1988 struct cfs_rq *cfs_rq = task_cfs_rq(current);
1934 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 1989 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1935 int this_cpu = smp_processor_id(); 1990 int this_cpu = smp_processor_id();
1991 struct rq *rq = this_rq();
1992 unsigned long flags;
1936 1993
1937 sched_info_queued(p); 1994 raw_spin_lock_irqsave(&rq->lock, flags);
1995
1996 if (unlikely(task_cpu(p) != this_cpu))
1997 __set_task_cpu(p, this_cpu);
1938 1998
1939 update_curr(cfs_rq); 1999 update_curr(cfs_rq);
2000
1940 if (curr) 2001 if (curr)
1941 se->vruntime = curr->vruntime; 2002 se->vruntime = curr->vruntime;
1942 place_entity(cfs_rq, se, 1); 2003 place_entity(cfs_rq, se, 1);
1943 2004
1944 /* 'curr' will be NULL if the child belongs to a different group */ 2005 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1945 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1946 curr && entity_before(curr, se)) {
1947 /* 2006 /*
1948 * Upon rescheduling, sched_class::put_prev_task() will place 2007 * Upon rescheduling, sched_class::put_prev_task() will place
1949 * 'current' within the tree based on its new key value. 2008 * 'current' within the tree based on its new key value.
@@ -1952,7 +2011,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1952 resched_task(rq->curr); 2011 resched_task(rq->curr);
1953 } 2012 }
1954 2013
1955 enqueue_task_fair(rq, p, 0); 2014 se->vruntime -= cfs_rq->min_vruntime;
2015
2016 raw_spin_unlock_irqrestore(&rq->lock, flags);
1956} 2017}
1957 2018
1958/* 2019/*
@@ -2005,30 +2066,27 @@ static void set_curr_task_fair(struct rq *rq)
2005} 2066}
2006 2067
2007#ifdef CONFIG_FAIR_GROUP_SCHED 2068#ifdef CONFIG_FAIR_GROUP_SCHED
2008static void moved_group_fair(struct task_struct *p) 2069static void moved_group_fair(struct task_struct *p, int on_rq)
2009{ 2070{
2010 struct cfs_rq *cfs_rq = task_cfs_rq(p); 2071 struct cfs_rq *cfs_rq = task_cfs_rq(p);
2011 2072
2012 update_curr(cfs_rq); 2073 update_curr(cfs_rq);
2013 place_entity(cfs_rq, &p->se, 1); 2074 if (!on_rq)
2075 place_entity(cfs_rq, &p->se, 1);
2014} 2076}
2015#endif 2077#endif
2016 2078
2017unsigned int get_rr_interval_fair(struct task_struct *task) 2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2018{ 2080{
2019 struct sched_entity *se = &task->se; 2081 struct sched_entity *se = &task->se;
2020 unsigned long flags;
2021 struct rq *rq;
2022 unsigned int rr_interval = 0; 2082 unsigned int rr_interval = 0;
2023 2083
2024 /* 2084 /*
2025 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 2085 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
2026 * idle runqueue: 2086 * idle runqueue:
2027 */ 2087 */
2028 rq = task_rq_lock(task, &flags);
2029 if (rq->cfs.load.weight) 2088 if (rq->cfs.load.weight)
2030 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 2089 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
2031 task_rq_unlock(rq, &flags);
2032 2090
2033 return rr_interval; 2091 return rr_interval;
2034} 2092}
@@ -2052,11 +2110,15 @@ static const struct sched_class fair_sched_class = {
2052 2110
2053 .load_balance = load_balance_fair, 2111 .load_balance = load_balance_fair,
2054 .move_one_task = move_one_task_fair, 2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair,
2115
2116 .task_waking = task_waking_fair,
2055#endif 2117#endif
2056 2118
2057 .set_curr_task = set_curr_task_fair, 2119 .set_curr_task = set_curr_task_fair,
2058 .task_tick = task_tick_fair, 2120 .task_tick = task_tick_fair,
2059 .task_new = task_new_fair, 2121 .task_fork = task_fork_fair,
2060 2122
2061 .prio_changed = prio_changed_fair, 2123 .prio_changed = prio_changed_fair,
2062 .switched_to = switched_to_fair, 2124 .switched_to = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c..d5059fd761d 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
54SCHED_FEAT(WAKEUP_OVERLAP, 0) 54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55 55
56/* 56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate 57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and 58 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see 59 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde..5f93b570d38 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -34,10 +34,10 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
36{ 36{
37 spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
39 dump_stack(); 39 dump_stack();
40 spin_lock_irq(&rq->lock); 40 raw_spin_lock_irq(&rq->lock);
41} 41}
42 42
43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
@@ -97,7 +97,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 97 check_preempt_curr(rq, p, 0);
98} 98}
99 99
100unsigned int get_rr_interval_idle(struct task_struct *task) 100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 101{
102 return 0; 102 return 0;
103} 103}
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5c5fef37841..f48328ac216 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -327,7 +327,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
327 327
328 weight = cpumask_weight(rd->span); 328 weight = cpumask_weight(rd->span);
329 329
330 spin_lock(&rt_b->rt_runtime_lock); 330 raw_spin_lock(&rt_b->rt_runtime_lock);
331 rt_period = ktime_to_ns(rt_b->rt_period); 331 rt_period = ktime_to_ns(rt_b->rt_period);
332 for_each_cpu(i, rd->span) { 332 for_each_cpu(i, rd->span) {
333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
@@ -336,7 +336,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
336 if (iter == rt_rq) 336 if (iter == rt_rq)
337 continue; 337 continue;
338 338
339 spin_lock(&iter->rt_runtime_lock); 339 raw_spin_lock(&iter->rt_runtime_lock);
340 /* 340 /*
341 * Either all rqs have inf runtime and there's nothing to steal 341 * Either all rqs have inf runtime and there's nothing to steal
342 * or __disable_runtime() below sets a specific rq to inf to 342 * or __disable_runtime() below sets a specific rq to inf to
@@ -358,14 +358,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
358 rt_rq->rt_runtime += diff; 358 rt_rq->rt_runtime += diff;
359 more = 1; 359 more = 1;
360 if (rt_rq->rt_runtime == rt_period) { 360 if (rt_rq->rt_runtime == rt_period) {
361 spin_unlock(&iter->rt_runtime_lock); 361 raw_spin_unlock(&iter->rt_runtime_lock);
362 break; 362 break;
363 } 363 }
364 } 364 }
365next: 365next:
366 spin_unlock(&iter->rt_runtime_lock); 366 raw_spin_unlock(&iter->rt_runtime_lock);
367 } 367 }
368 spin_unlock(&rt_b->rt_runtime_lock); 368 raw_spin_unlock(&rt_b->rt_runtime_lock);
369 369
370 return more; 370 return more;
371} 371}
@@ -386,8 +386,8 @@ static void __disable_runtime(struct rq *rq)
386 s64 want; 386 s64 want;
387 int i; 387 int i;
388 388
389 spin_lock(&rt_b->rt_runtime_lock); 389 raw_spin_lock(&rt_b->rt_runtime_lock);
390 spin_lock(&rt_rq->rt_runtime_lock); 390 raw_spin_lock(&rt_rq->rt_runtime_lock);
391 /* 391 /*
392 * Either we're all inf and nobody needs to borrow, or we're 392 * Either we're all inf and nobody needs to borrow, or we're
393 * already disabled and thus have nothing to do, or we have 393 * already disabled and thus have nothing to do, or we have
@@ -396,7 +396,7 @@ static void __disable_runtime(struct rq *rq)
396 if (rt_rq->rt_runtime == RUNTIME_INF || 396 if (rt_rq->rt_runtime == RUNTIME_INF ||
397 rt_rq->rt_runtime == rt_b->rt_runtime) 397 rt_rq->rt_runtime == rt_b->rt_runtime)
398 goto balanced; 398 goto balanced;
399 spin_unlock(&rt_rq->rt_runtime_lock); 399 raw_spin_unlock(&rt_rq->rt_runtime_lock);
400 400
401 /* 401 /*
402 * Calculate the difference between what we started out with 402 * Calculate the difference between what we started out with
@@ -418,7 +418,7 @@ static void __disable_runtime(struct rq *rq)
418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
419 continue; 419 continue;
420 420
421 spin_lock(&iter->rt_runtime_lock); 421 raw_spin_lock(&iter->rt_runtime_lock);
422 if (want > 0) { 422 if (want > 0) {
423 diff = min_t(s64, iter->rt_runtime, want); 423 diff = min_t(s64, iter->rt_runtime, want);
424 iter->rt_runtime -= diff; 424 iter->rt_runtime -= diff;
@@ -427,13 +427,13 @@ static void __disable_runtime(struct rq *rq)
427 iter->rt_runtime -= want; 427 iter->rt_runtime -= want;
428 want -= want; 428 want -= want;
429 } 429 }
430 spin_unlock(&iter->rt_runtime_lock); 430 raw_spin_unlock(&iter->rt_runtime_lock);
431 431
432 if (!want) 432 if (!want)
433 break; 433 break;
434 } 434 }
435 435
436 spin_lock(&rt_rq->rt_runtime_lock); 436 raw_spin_lock(&rt_rq->rt_runtime_lock);
437 /* 437 /*
438 * We cannot be left wanting - that would mean some runtime 438 * We cannot be left wanting - that would mean some runtime
439 * leaked out of the system. 439 * leaked out of the system.
@@ -445,8 +445,8 @@ balanced:
445 * runtime - in which case borrowing doesn't make sense. 445 * runtime - in which case borrowing doesn't make sense.
446 */ 446 */
447 rt_rq->rt_runtime = RUNTIME_INF; 447 rt_rq->rt_runtime = RUNTIME_INF;
448 spin_unlock(&rt_rq->rt_runtime_lock); 448 raw_spin_unlock(&rt_rq->rt_runtime_lock);
449 spin_unlock(&rt_b->rt_runtime_lock); 449 raw_spin_unlock(&rt_b->rt_runtime_lock);
450 } 450 }
451} 451}
452 452
@@ -454,9 +454,9 @@ static void disable_runtime(struct rq *rq)
454{ 454{
455 unsigned long flags; 455 unsigned long flags;
456 456
457 spin_lock_irqsave(&rq->lock, flags); 457 raw_spin_lock_irqsave(&rq->lock, flags);
458 __disable_runtime(rq); 458 __disable_runtime(rq);
459 spin_unlock_irqrestore(&rq->lock, flags); 459 raw_spin_unlock_irqrestore(&rq->lock, flags);
460} 460}
461 461
462static void __enable_runtime(struct rq *rq) 462static void __enable_runtime(struct rq *rq)
@@ -472,13 +472,13 @@ static void __enable_runtime(struct rq *rq)
472 for_each_leaf_rt_rq(rt_rq, rq) { 472 for_each_leaf_rt_rq(rt_rq, rq) {
473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
474 474
475 spin_lock(&rt_b->rt_runtime_lock); 475 raw_spin_lock(&rt_b->rt_runtime_lock);
476 spin_lock(&rt_rq->rt_runtime_lock); 476 raw_spin_lock(&rt_rq->rt_runtime_lock);
477 rt_rq->rt_runtime = rt_b->rt_runtime; 477 rt_rq->rt_runtime = rt_b->rt_runtime;
478 rt_rq->rt_time = 0; 478 rt_rq->rt_time = 0;
479 rt_rq->rt_throttled = 0; 479 rt_rq->rt_throttled = 0;
480 spin_unlock(&rt_rq->rt_runtime_lock); 480 raw_spin_unlock(&rt_rq->rt_runtime_lock);
481 spin_unlock(&rt_b->rt_runtime_lock); 481 raw_spin_unlock(&rt_b->rt_runtime_lock);
482 } 482 }
483} 483}
484 484
@@ -486,9 +486,9 @@ static void enable_runtime(struct rq *rq)
486{ 486{
487 unsigned long flags; 487 unsigned long flags;
488 488
489 spin_lock_irqsave(&rq->lock, flags); 489 raw_spin_lock_irqsave(&rq->lock, flags);
490 __enable_runtime(rq); 490 __enable_runtime(rq);
491 spin_unlock_irqrestore(&rq->lock, flags); 491 raw_spin_unlock_irqrestore(&rq->lock, flags);
492} 492}
493 493
494static int balance_runtime(struct rt_rq *rt_rq) 494static int balance_runtime(struct rt_rq *rt_rq)
@@ -496,9 +496,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
496 int more = 0; 496 int more = 0;
497 497
498 if (rt_rq->rt_time > rt_rq->rt_runtime) { 498 if (rt_rq->rt_time > rt_rq->rt_runtime) {
499 spin_unlock(&rt_rq->rt_runtime_lock); 499 raw_spin_unlock(&rt_rq->rt_runtime_lock);
500 more = do_balance_runtime(rt_rq); 500 more = do_balance_runtime(rt_rq);
501 spin_lock(&rt_rq->rt_runtime_lock); 501 raw_spin_lock(&rt_rq->rt_runtime_lock);
502 } 502 }
503 503
504 return more; 504 return more;
@@ -524,11 +524,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
525 struct rq *rq = rq_of_rt_rq(rt_rq); 525 struct rq *rq = rq_of_rt_rq(rt_rq);
526 526
527 spin_lock(&rq->lock); 527 raw_spin_lock(&rq->lock);
528 if (rt_rq->rt_time) { 528 if (rt_rq->rt_time) {
529 u64 runtime; 529 u64 runtime;
530 530
531 spin_lock(&rt_rq->rt_runtime_lock); 531 raw_spin_lock(&rt_rq->rt_runtime_lock);
532 if (rt_rq->rt_throttled) 532 if (rt_rq->rt_throttled)
533 balance_runtime(rt_rq); 533 balance_runtime(rt_rq);
534 runtime = rt_rq->rt_runtime; 534 runtime = rt_rq->rt_runtime;
@@ -539,13 +539,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
539 } 539 }
540 if (rt_rq->rt_time || rt_rq->rt_nr_running) 540 if (rt_rq->rt_time || rt_rq->rt_nr_running)
541 idle = 0; 541 idle = 0;
542 spin_unlock(&rt_rq->rt_runtime_lock); 542 raw_spin_unlock(&rt_rq->rt_runtime_lock);
543 } else if (rt_rq->rt_nr_running) 543 } else if (rt_rq->rt_nr_running)
544 idle = 0; 544 idle = 0;
545 545
546 if (enqueue) 546 if (enqueue)
547 sched_rt_rq_enqueue(rt_rq); 547 sched_rt_rq_enqueue(rt_rq);
548 spin_unlock(&rq->lock); 548 raw_spin_unlock(&rq->lock);
549 } 549 }
550 550
551 return idle; 551 return idle;
@@ -624,11 +624,11 @@ static void update_curr_rt(struct rq *rq)
624 rt_rq = rt_rq_of_se(rt_se); 624 rt_rq = rt_rq_of_se(rt_se);
625 625
626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
627 spin_lock(&rt_rq->rt_runtime_lock); 627 raw_spin_lock(&rt_rq->rt_runtime_lock);
628 rt_rq->rt_time += delta_exec; 628 rt_rq->rt_time += delta_exec;
629 if (sched_rt_runtime_exceeded(rt_rq)) 629 if (sched_rt_runtime_exceeded(rt_rq))
630 resched_task(curr); 630 resched_task(curr);
631 spin_unlock(&rt_rq->rt_runtime_lock); 631 raw_spin_unlock(&rt_rq->rt_runtime_lock);
632 } 632 }
633 } 633 }
634} 634}
@@ -1246,7 +1246,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1246 task_running(rq, task) || 1246 task_running(rq, task) ||
1247 !task->se.on_rq)) { 1247 !task->se.on_rq)) {
1248 1248
1249 spin_unlock(&lowest_rq->lock); 1249 raw_spin_unlock(&lowest_rq->lock);
1250 lowest_rq = NULL; 1250 lowest_rq = NULL;
1251 break; 1251 break;
1252 } 1252 }
@@ -1472,7 +1472,7 @@ static void post_schedule_rt(struct rq *rq)
1472 * If we are not running and we are not going to reschedule soon, we should 1472 * If we are not running and we are not going to reschedule soon, we should
1473 * try to push tasks away now 1473 * try to push tasks away now
1474 */ 1474 */
1475static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1475static void task_woken_rt(struct rq *rq, struct task_struct *p)
1476{ 1476{
1477 if (!task_running(rq, p) && 1477 if (!task_running(rq, p) &&
1478 !test_tsk_need_resched(rq->curr) && 1478 !test_tsk_need_resched(rq->curr) &&
@@ -1721,7 +1721,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1721 dequeue_pushable_task(rq, p);
1722} 1722}
1723 1723
1724unsigned int get_rr_interval_rt(struct task_struct *task) 1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1725{
1726 /* 1726 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1727 * Time slice is 0 for SCHED_FIFO tasks
@@ -1753,7 +1753,7 @@ static const struct sched_class rt_sched_class = {
1753 .rq_offline = rq_offline_rt, 1753 .rq_offline = rq_offline_rt,
1754 .pre_schedule = pre_schedule_rt, 1754 .pre_schedule = pre_schedule_rt,
1755 .post_schedule = post_schedule_rt, 1755 .post_schedule = post_schedule_rt,
1756 .task_wake_up = task_wake_up_rt, 1756 .task_woken = task_woken_rt,
1757 .switched_from = switched_from_rt, 1757 .switched_from = switched_from_rt,
1758#endif 1758#endif
1759 1759
diff --git a/kernel/signal.c b/kernel/signal.c
index 6b982f2cf52..934ae5e687b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -218,13 +218,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
218 struct user_struct *user; 218 struct user_struct *user;
219 219
220 /* 220 /*
221 * We won't get problems with the target's UID changing under us 221 * Protect access to @t credentials. This can go away when all
222 * because changing it requires RCU be used, and if t != current, the 222 * callers hold rcu read lock.
223 * caller must be holding the RCU readlock (by way of a spinlock) and
224 * we use RCU protection here
225 */ 223 */
224 rcu_read_lock();
226 user = get_uid(__task_cred(t)->user); 225 user = get_uid(__task_cred(t)->user);
227 atomic_inc(&user->sigpending); 226 atomic_inc(&user->sigpending);
227 rcu_read_unlock();
228 228
229 if (override_rlimit || 229 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 230 atomic_read(&user->sigpending) <=
@@ -423,7 +423,7 @@ still_pending:
423 */ 423 */
424 info->si_signo = sig; 424 info->si_signo = sig;
425 info->si_errno = 0; 425 info->si_errno = 0;
426 info->si_code = 0; 426 info->si_code = SI_USER;
427 info->si_pid = 0; 427 info->si_pid = 0;
428 info->si_uid = 0; 428 info->si_uid = 0;
429 } 429 }
@@ -607,6 +607,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
607 return 1; 607 return 1;
608} 608}
609 609
610static inline int is_si_special(const struct siginfo *info)
611{
612 return info <= SEND_SIG_FORCED;
613}
614
615static inline bool si_fromuser(const struct siginfo *info)
616{
617 return info == SEND_SIG_NOINFO ||
618 (!is_si_special(info) && SI_FROMUSER(info));
619}
620
610/* 621/*
611 * Bad permissions for sending the signal 622 * Bad permissions for sending the signal
612 * - the caller must hold at least the RCU read lock 623 * - the caller must hold at least the RCU read lock
@@ -621,7 +632,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
621 if (!valid_signal(sig)) 632 if (!valid_signal(sig))
622 return -EINVAL; 633 return -EINVAL;
623 634
624 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) 635 if (!si_fromuser(info))
625 return 0; 636 return 0;
626 637
627 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 638 error = audit_signal_info(sig, t); /* Let audit system see the signal */
@@ -949,9 +960,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
949 int from_ancestor_ns = 0; 960 int from_ancestor_ns = 0;
950 961
951#ifdef CONFIG_PID_NS 962#ifdef CONFIG_PID_NS
952 if (!is_si_special(info) && SI_FROMUSER(info) && 963 from_ancestor_ns = si_fromuser(info) &&
953 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) 964 !task_pid_nr_ns(current, task_active_pid_ns(t));
954 from_ancestor_ns = 1;
955#endif 965#endif
956 966
957 return __send_signal(sig, info, t, group, from_ancestor_ns); 967 return __send_signal(sig, info, t, group, from_ancestor_ns);
@@ -969,7 +979,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
969 for (i = 0; i < 16; i++) { 979 for (i = 0; i < 16; i++) {
970 unsigned char insn; 980 unsigned char insn;
971 981
972 __get_user(insn, (unsigned char *)(regs->ip + i)); 982 if (get_user(insn, (unsigned char *)(regs->ip + i)))
983 break;
973 printk("%02x ", insn); 984 printk("%02x ", insn);
974 } 985 }
975 } 986 }
@@ -1052,12 +1063,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1052 return ret; 1063 return ret;
1053} 1064}
1054 1065
1055void
1056force_sig_specific(int sig, struct task_struct *t)
1057{
1058 force_sig_info(sig, SEND_SIG_FORCED, t);
1059}
1060
1061/* 1066/*
1062 * Nuke all other threads in the group. 1067 * Nuke all other threads in the group.
1063 */ 1068 */
@@ -1175,19 +1180,19 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1175 int ret = -EINVAL; 1180 int ret = -EINVAL;
1176 struct task_struct *p; 1181 struct task_struct *p;
1177 const struct cred *pcred; 1182 const struct cred *pcred;
1183 unsigned long flags;
1178 1184
1179 if (!valid_signal(sig)) 1185 if (!valid_signal(sig))
1180 return ret; 1186 return ret;
1181 1187
1182 read_lock(&tasklist_lock); 1188 rcu_read_lock();
1183 p = pid_task(pid, PIDTYPE_PID); 1189 p = pid_task(pid, PIDTYPE_PID);
1184 if (!p) { 1190 if (!p) {
1185 ret = -ESRCH; 1191 ret = -ESRCH;
1186 goto out_unlock; 1192 goto out_unlock;
1187 } 1193 }
1188 pcred = __task_cred(p); 1194 pcred = __task_cred(p);
1189 if ((info == SEND_SIG_NOINFO || 1195 if (si_fromuser(info) &&
1190 (!is_si_special(info) && SI_FROMUSER(info))) &&
1191 euid != pcred->suid && euid != pcred->uid && 1196 euid != pcred->suid && euid != pcred->uid &&
1192 uid != pcred->suid && uid != pcred->uid) { 1197 uid != pcred->suid && uid != pcred->uid) {
1193 ret = -EPERM; 1198 ret = -EPERM;
@@ -1196,14 +1201,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1196 ret = security_task_kill(p, info, sig, secid); 1201 ret = security_task_kill(p, info, sig, secid);
1197 if (ret) 1202 if (ret)
1198 goto out_unlock; 1203 goto out_unlock;
1199 if (sig && p->sighand) { 1204
1200 unsigned long flags; 1205 if (sig) {
1201 spin_lock_irqsave(&p->sighand->siglock, flags); 1206 if (lock_task_sighand(p, &flags)) {
1202 ret = __send_signal(sig, info, p, 1, 0); 1207 ret = __send_signal(sig, info, p, 1, 0);
1203 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1208 unlock_task_sighand(p, &flags);
1209 } else
1210 ret = -ESRCH;
1204 } 1211 }
1205out_unlock: 1212out_unlock:
1206 read_unlock(&tasklist_lock); 1213 rcu_read_unlock();
1207 return ret; 1214 return ret;
1208} 1215}
1209EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1216EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
@@ -1837,11 +1844,6 @@ relock:
1837 1844
1838 for (;;) { 1845 for (;;) {
1839 struct k_sigaction *ka; 1846 struct k_sigaction *ka;
1840
1841 if (unlikely(signal->group_stop_count > 0) &&
1842 do_signal_stop(0))
1843 goto relock;
1844
1845 /* 1847 /*
1846 * Tracing can induce an artifical signal and choose sigaction. 1848 * Tracing can induce an artifical signal and choose sigaction.
1847 * The return value in @signr determines the default action, 1849 * The return value in @signr determines the default action,
@@ -1853,6 +1855,10 @@ relock:
1853 if (unlikely(signr != 0)) 1855 if (unlikely(signr != 0))
1854 ka = return_ka; 1856 ka = return_ka;
1855 else { 1857 else {
1858 if (unlikely(signal->group_stop_count > 0) &&
1859 do_signal_stop(0))
1860 goto relock;
1861
1856 signr = dequeue_signal(current, &current->blocked, 1862 signr = dequeue_signal(current, &current->blocked,
1857 info); 1863 info);
1858 1864
diff --git a/kernel/smp.c b/kernel/smp.c
index a8c76069cf5..f1040842244 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -16,11 +16,11 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16 16
17static struct { 17static struct {
18 struct list_head queue; 18 struct list_head queue;
19 spinlock_t lock; 19 raw_spinlock_t lock;
20} call_function __cacheline_aligned_in_smp = 20} call_function __cacheline_aligned_in_smp =
21 { 21 {
22 .queue = LIST_HEAD_INIT(call_function.queue), 22 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), 23 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
24 }; 24 };
25 25
26enum { 26enum {
@@ -35,7 +35,7 @@ struct call_function_data {
35 35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU(struct call_function_data, cfd_data);
@@ -80,7 +80,7 @@ static int __cpuinit init_call_single_data(void)
80 for_each_possible_cpu(i) { 80 for_each_possible_cpu(i) {
81 struct call_single_queue *q = &per_cpu(call_single_queue, i); 81 struct call_single_queue *q = &per_cpu(call_single_queue, i);
82 82
83 spin_lock_init(&q->lock); 83 raw_spin_lock_init(&q->lock);
84 INIT_LIST_HEAD(&q->list); 84 INIT_LIST_HEAD(&q->list);
85 } 85 }
86 86
@@ -141,10 +141,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
141 unsigned long flags; 141 unsigned long flags;
142 int ipi; 142 int ipi;
143 143
144 spin_lock_irqsave(&dst->lock, flags); 144 raw_spin_lock_irqsave(&dst->lock, flags);
145 ipi = list_empty(&dst->list); 145 ipi = list_empty(&dst->list);
146 list_add_tail(&data->list, &dst->list); 146 list_add_tail(&data->list, &dst->list);
147 spin_unlock_irqrestore(&dst->lock, flags); 147 raw_spin_unlock_irqrestore(&dst->lock, flags);
148 148
149 /* 149 /*
150 * The list addition should be visible before sending the IPI 150 * The list addition should be visible before sending the IPI
@@ -171,7 +171,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
171void generic_smp_call_function_interrupt(void) 171void generic_smp_call_function_interrupt(void)
172{ 172{
173 struct call_function_data *data; 173 struct call_function_data *data;
174 int cpu = get_cpu(); 174 int cpu = smp_processor_id();
175 175
176 /* 176 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online. 177 * Shouldn't receive this interrupt on a cpu that is not yet online.
@@ -201,9 +201,9 @@ void generic_smp_call_function_interrupt(void)
201 refs = atomic_dec_return(&data->refs); 201 refs = atomic_dec_return(&data->refs);
202 WARN_ON(refs < 0); 202 WARN_ON(refs < 0);
203 if (!refs) { 203 if (!refs) {
204 spin_lock(&call_function.lock); 204 raw_spin_lock(&call_function.lock);
205 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
206 spin_unlock(&call_function.lock); 206 raw_spin_unlock(&call_function.lock);
207 } 207 }
208 208
209 if (refs) 209 if (refs)
@@ -212,7 +212,6 @@ void generic_smp_call_function_interrupt(void)
212 csd_unlock(&data->csd); 212 csd_unlock(&data->csd);
213 } 213 }
214 214
215 put_cpu();
216} 215}
217 216
218/* 217/*
@@ -230,9 +229,9 @@ void generic_smp_call_function_single_interrupt(void)
230 */ 229 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id())); 230 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232 231
233 spin_lock(&q->lock); 232 raw_spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 233 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 234 raw_spin_unlock(&q->lock);
236 235
237 while (!list_empty(&list)) { 236 while (!list_empty(&list)) {
238 struct call_single_data *data; 237 struct call_single_data *data;
@@ -348,7 +347,7 @@ int smp_call_function_any(const struct cpumask *mask,
348 goto call; 347 goto call;
349 348
350 /* Try for same node. */ 349 /* Try for same node. */
351 nodemask = cpumask_of_node(cpu); 350 nodemask = cpumask_of_node(cpu_to_node(cpu));
352 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; 351 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
353 cpu = cpumask_next_and(cpu, nodemask, mask)) { 352 cpu = cpumask_next_and(cpu, nodemask, mask)) {
354 if (cpu_online(cpu)) 353 if (cpu_online(cpu))
@@ -449,14 +448,14 @@ void smp_call_function_many(const struct cpumask *mask,
449 cpumask_clear_cpu(this_cpu, data->cpumask); 448 cpumask_clear_cpu(this_cpu, data->cpumask);
450 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 449 atomic_set(&data->refs, cpumask_weight(data->cpumask));
451 450
452 spin_lock_irqsave(&call_function.lock, flags); 451 raw_spin_lock_irqsave(&call_function.lock, flags);
453 /* 452 /*
454 * Place entry at the _HEAD_ of the list, so that any cpu still 453 * Place entry at the _HEAD_ of the list, so that any cpu still
455 * observing the entry in generic_smp_call_function_interrupt() 454 * observing the entry in generic_smp_call_function_interrupt()
456 * will not miss any other list entries: 455 * will not miss any other list entries:
457 */ 456 */
458 list_add_rcu(&data->csd.list, &call_function.queue); 457 list_add_rcu(&data->csd.list, &call_function.queue);
459 spin_unlock_irqrestore(&call_function.lock, flags); 458 raw_spin_unlock_irqrestore(&call_function.lock, flags);
460 459
461 /* 460 /*
462 * Make the list addition visible before sending the ipi. 461 * Make the list addition visible before sending the ipi.
@@ -501,20 +500,20 @@ EXPORT_SYMBOL(smp_call_function);
501 500
502void ipi_call_lock(void) 501void ipi_call_lock(void)
503{ 502{
504 spin_lock(&call_function.lock); 503 raw_spin_lock(&call_function.lock);
505} 504}
506 505
507void ipi_call_unlock(void) 506void ipi_call_unlock(void)
508{ 507{
509 spin_unlock(&call_function.lock); 508 raw_spin_unlock(&call_function.lock);
510} 509}
511 510
512void ipi_call_lock_irq(void) 511void ipi_call_lock_irq(void)
513{ 512{
514 spin_lock_irq(&call_function.lock); 513 raw_spin_lock_irq(&call_function.lock);
515} 514}
516 515
517void ipi_call_unlock_irq(void) 516void ipi_call_unlock_irq(void)
518{ 517{
519 spin_unlock_irq(&call_function.lock); 518 raw_spin_unlock_irq(&call_function.lock);
520} 519}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 21939d9e830..7c1a67ef027 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
@@ -697,7 +692,7 @@ void __init softirq_init(void)
697 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 692 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
698} 693}
699 694
700static int ksoftirqd(void * __bind_cpu) 695static int run_ksoftirqd(void * __bind_cpu)
701{ 696{
702 set_current_state(TASK_INTERRUPTIBLE); 697 set_current_state(TASK_INTERRUPTIBLE);
703 698
@@ -810,7 +805,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
810 switch (action) { 805 switch (action) {
811 case CPU_UP_PREPARE: 806 case CPU_UP_PREPARE:
812 case CPU_UP_PREPARE_FROZEN: 807 case CPU_UP_PREPARE_FROZEN:
813 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
814 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
815 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
816 return NOTIFY_BAD; 811 return NOTIFY_BAD;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 81324d12eb3..0d4c7898ab8 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -22,9 +22,10 @@
22 22
23static DEFINE_SPINLOCK(print_lock); 23static DEFINE_SPINLOCK(print_lock);
24 24
25static DEFINE_PER_CPU(unsigned long, touch_timestamp); 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, print_timestamp); 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -70,22 +71,28 @@ static void __touch_softlockup_watchdog(void)
70{ 71{
71 int this_cpu = raw_smp_processor_id(); 72 int this_cpu = raw_smp_processor_id();
72 73
73 __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); 74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
74} 75}
75 76
76void touch_softlockup_watchdog(void) 77void touch_softlockup_watchdog(void)
77{ 78{
78 __raw_get_cpu_var(touch_timestamp) = 0; 79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
85 92
86 /* Cause each CPU to re-update its timestamp rather than complain */ 93 /* Cause each CPU to re-update its timestamp rather than complain */
87 for_each_online_cpu(cpu) 94 for_each_online_cpu(cpu)
88 per_cpu(touch_timestamp, cpu) = 0; 95 per_cpu(softlockup_touch_ts, cpu) = 0;
89} 96}
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 98
@@ -104,28 +111,36 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
104void softlockup_tick(void) 111void softlockup_tick(void)
105{ 112{
106 int this_cpu = smp_processor_id(); 113 int this_cpu = smp_processor_id();
107 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); 114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
108 unsigned long print_timestamp; 115 unsigned long print_ts;
109 struct pt_regs *regs = get_irq_regs(); 116 struct pt_regs *regs = get_irq_regs();
110 unsigned long now; 117 unsigned long now;
111 118
112 /* Is detection switched off? */ 119 /* Is detection switched off? */
113 if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { 120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
114 /* Be sure we don't false trigger if switched back on */ 121 /* Be sure we don't false trigger if switched back on */
115 if (touch_timestamp) 122 if (touch_ts)
116 per_cpu(touch_timestamp, this_cpu) = 0; 123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
117 return; 124 return;
118 } 125 }
119 126
120 if (touch_timestamp == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
124 139
125 print_timestamp = per_cpu(print_timestamp, this_cpu); 140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
126 141
127 /* report at most once a second */ 142 /* report at most once a second */
128 if (print_timestamp == touch_timestamp || did_panic) 143 if (print_ts == touch_ts || did_panic)
129 return; 144 return;
130 145
131 /* do not print during early bootup: */ 146 /* do not print during early bootup: */
@@ -140,18 +155,18 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 156 * threshold timespan.
142 */ 157 */
143 if (now > touch_timestamp + softlockup_thresh/2) 158 if (now > touch_ts + softlockup_thresh/2)
144 wake_up_process(per_cpu(watchdog_task, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 160
146 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
147 if (now <= (touch_timestamp + softlockup_thresh)) 162 if (now <= (touch_ts + softlockup_thresh))
148 return; 163 return;
149 164
150 per_cpu(print_timestamp, this_cpu) = touch_timestamp; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
151 166
152 spin_lock(&print_lock); 167 spin_lock(&print_lock);
153 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
154 this_cpu, now - touch_timestamp, 169 this_cpu, now - touch_ts,
155 current->comm, task_pid_nr(current)); 170 current->comm, task_pid_nr(current));
156 print_modules(); 171 print_modules();
157 print_irqtrace_events(current); 172 print_irqtrace_events(current);
@@ -209,32 +224,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
209 switch (action) { 224 switch (action) {
210 case CPU_UP_PREPARE: 225 case CPU_UP_PREPARE:
211 case CPU_UP_PREPARE_FROZEN: 226 case CPU_UP_PREPARE_FROZEN:
212 BUG_ON(per_cpu(watchdog_task, hotcpu)); 227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
213 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
214 if (IS_ERR(p)) { 229 if (IS_ERR(p)) {
215 printk(KERN_ERR "watchdog for %i failed\n", hotcpu); 230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
216 return NOTIFY_BAD; 231 return NOTIFY_BAD;
217 } 232 }
218 per_cpu(touch_timestamp, hotcpu) = 0; 233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
219 per_cpu(watchdog_task, hotcpu) = p; 234 per_cpu(softlockup_watchdog, hotcpu) = p;
220 kthread_bind(p, hotcpu); 235 kthread_bind(p, hotcpu);
221 break; 236 break;
222 case CPU_ONLINE: 237 case CPU_ONLINE:
223 case CPU_ONLINE_FROZEN: 238 case CPU_ONLINE_FROZEN:
224 wake_up_process(per_cpu(watchdog_task, hotcpu)); 239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
225 break; 240 break;
226#ifdef CONFIG_HOTPLUG_CPU 241#ifdef CONFIG_HOTPLUG_CPU
227 case CPU_UP_CANCELED: 242 case CPU_UP_CANCELED:
228 case CPU_UP_CANCELED_FROZEN: 243 case CPU_UP_CANCELED_FROZEN:
229 if (!per_cpu(watchdog_task, hotcpu)) 244 if (!per_cpu(softlockup_watchdog, hotcpu))
230 break; 245 break;
231 /* Unbind so it can run. Fall thru. */ 246 /* Unbind so it can run. Fall thru. */
232 kthread_bind(per_cpu(watchdog_task, hotcpu), 247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
233 cpumask_any(cpu_online_mask)); 248 cpumask_any(cpu_online_mask));
234 case CPU_DEAD: 249 case CPU_DEAD:
235 case CPU_DEAD_FROZEN: 250 case CPU_DEAD_FROZEN:
236 p = per_cpu(watchdog_task, hotcpu); 251 p = per_cpu(softlockup_watchdog, hotcpu);
237 per_cpu(watchdog_task, hotcpu) = NULL; 252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
238 kthread_stop(p); 253 kthread_stop(p);
239 break; 254 break;
240#endif /* CONFIG_HOTPLUG_CPU */ 255#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 41e042219ff..be6517fb9c1 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -32,6 +32,8 @@
32 * include/linux/spinlock_api_smp.h 32 * include/linux/spinlock_api_smp.h
33 */ 33 */
34#else 34#else
35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l)
35/* 37/*
36 * We build the __lock_function inlines here. They are too large for 38 * We build the __lock_function inlines here. They are too large for
37 * inlining all over the place, but here is only one user per function 39 * inlining all over the place, but here is only one user per function
@@ -42,49 +44,49 @@
42 * towards that other CPU that it should break the lock ASAP. 44 * towards that other CPU that it should break the lock ASAP.
43 */ 45 */
44#define BUILD_LOCK_OPS(op, locktype) \ 46#define BUILD_LOCK_OPS(op, locktype) \
45void __lockfunc __##op##_lock(locktype##_t *lock) \ 47void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
46{ \ 48{ \
47 for (;;) { \ 49 for (;;) { \
48 preempt_disable(); \ 50 preempt_disable(); \
49 if (likely(_raw_##op##_trylock(lock))) \ 51 if (likely(do_raw_##op##_trylock(lock))) \
50 break; \ 52 break; \
51 preempt_enable(); \ 53 preempt_enable(); \
52 \ 54 \
53 if (!(lock)->break_lock) \ 55 if (!(lock)->break_lock) \
54 (lock)->break_lock = 1; \ 56 (lock)->break_lock = 1; \
55 while (!op##_can_lock(lock) && (lock)->break_lock) \ 57 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
56 _raw_##op##_relax(&lock->raw_lock); \ 58 arch_##op##_relax(&lock->raw_lock); \
57 } \ 59 } \
58 (lock)->break_lock = 0; \ 60 (lock)->break_lock = 0; \
59} \ 61} \
60 \ 62 \
61unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ 63unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
62{ \ 64{ \
63 unsigned long flags; \ 65 unsigned long flags; \
64 \ 66 \
65 for (;;) { \ 67 for (;;) { \
66 preempt_disable(); \ 68 preempt_disable(); \
67 local_irq_save(flags); \ 69 local_irq_save(flags); \
68 if (likely(_raw_##op##_trylock(lock))) \ 70 if (likely(do_raw_##op##_trylock(lock))) \
69 break; \ 71 break; \
70 local_irq_restore(flags); \ 72 local_irq_restore(flags); \
71 preempt_enable(); \ 73 preempt_enable(); \
72 \ 74 \
73 if (!(lock)->break_lock) \ 75 if (!(lock)->break_lock) \
74 (lock)->break_lock = 1; \ 76 (lock)->break_lock = 1; \
75 while (!op##_can_lock(lock) && (lock)->break_lock) \ 77 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
76 _raw_##op##_relax(&lock->raw_lock); \ 78 arch_##op##_relax(&lock->raw_lock); \
77 } \ 79 } \
78 (lock)->break_lock = 0; \ 80 (lock)->break_lock = 0; \
79 return flags; \ 81 return flags; \
80} \ 82} \
81 \ 83 \
82void __lockfunc __##op##_lock_irq(locktype##_t *lock) \ 84void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
83{ \ 85{ \
84 _##op##_lock_irqsave(lock); \ 86 _raw_##op##_lock_irqsave(lock); \
85} \ 87} \
86 \ 88 \
87void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ 89void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
88{ \ 90{ \
89 unsigned long flags; \ 91 unsigned long flags; \
90 \ 92 \
@@ -93,7 +95,7 @@ void __lockfunc __##op##_lock_bh(locktype##_t *lock) \
93 /* irq-disabling. We use the generic preemption-aware */ \ 95 /* irq-disabling. We use the generic preemption-aware */ \
94 /* function: */ \ 96 /* function: */ \
95 /**/ \ 97 /**/ \
96 flags = _##op##_lock_irqsave(lock); \ 98 flags = _raw_##op##_lock_irqsave(lock); \
97 local_bh_disable(); \ 99 local_bh_disable(); \
98 local_irq_restore(flags); \ 100 local_irq_restore(flags); \
99} \ 101} \
@@ -107,269 +109,269 @@ void __lockfunc __##op##_lock_bh(locktype##_t *lock) \
107 * __[spin|read|write]_lock_irqsave() 109 * __[spin|read|write]_lock_irqsave()
108 * __[spin|read|write]_lock_bh() 110 * __[spin|read|write]_lock_bh()
109 */ 111 */
110BUILD_LOCK_OPS(spin, spinlock); 112BUILD_LOCK_OPS(spin, raw_spinlock);
111BUILD_LOCK_OPS(read, rwlock); 113BUILD_LOCK_OPS(read, rwlock);
112BUILD_LOCK_OPS(write, rwlock); 114BUILD_LOCK_OPS(write, rwlock);
113 115
114#endif 116#endif
115 117
116#ifdef CONFIG_DEBUG_LOCK_ALLOC 118#ifndef CONFIG_INLINE_SPIN_TRYLOCK
117 119int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
118void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
119{ 120{
120 preempt_disable(); 121 return __raw_spin_trylock(lock);
121 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
122 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
123} 122}
124EXPORT_SYMBOL(_spin_lock_nested); 123EXPORT_SYMBOL(_raw_spin_trylock);
124#endif
125 125
126unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, 126#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
127 int subclass) 127int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
128{ 128{
129 unsigned long flags; 129 return __raw_spin_trylock_bh(lock);
130
131 local_irq_save(flags);
132 preempt_disable();
133 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
134 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
135 _raw_spin_lock_flags, &flags);
136 return flags;
137} 130}
138EXPORT_SYMBOL(_spin_lock_irqsave_nested); 131EXPORT_SYMBOL(_raw_spin_trylock_bh);
132#endif
139 133
140void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, 134#ifndef CONFIG_INLINE_SPIN_LOCK
141 struct lockdep_map *nest_lock) 135void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
142{ 136{
143 preempt_disable(); 137 __raw_spin_lock(lock);
144 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
145 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
146} 138}
147EXPORT_SYMBOL(_spin_lock_nest_lock); 139EXPORT_SYMBOL(_raw_spin_lock);
148
149#endif 140#endif
150 141
151#ifndef CONFIG_INLINE_SPIN_TRYLOCK 142#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
152int __lockfunc _spin_trylock(spinlock_t *lock) 143unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
153{ 144{
154 return __spin_trylock(lock); 145 return __raw_spin_lock_irqsave(lock);
155} 146}
156EXPORT_SYMBOL(_spin_trylock); 147EXPORT_SYMBOL(_raw_spin_lock_irqsave);
157#endif 148#endif
158 149
159#ifndef CONFIG_INLINE_READ_TRYLOCK 150#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
160int __lockfunc _read_trylock(rwlock_t *lock) 151void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
161{ 152{
162 return __read_trylock(lock); 153 __raw_spin_lock_irq(lock);
163} 154}
164EXPORT_SYMBOL(_read_trylock); 155EXPORT_SYMBOL(_raw_spin_lock_irq);
165#endif 156#endif
166 157
167#ifndef CONFIG_INLINE_WRITE_TRYLOCK 158#ifndef CONFIG_INLINE_SPIN_LOCK_BH
168int __lockfunc _write_trylock(rwlock_t *lock) 159void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
169{ 160{
170 return __write_trylock(lock); 161 __raw_spin_lock_bh(lock);
171} 162}
172EXPORT_SYMBOL(_write_trylock); 163EXPORT_SYMBOL(_raw_spin_lock_bh);
173#endif 164#endif
174 165
175#ifndef CONFIG_INLINE_READ_LOCK 166#ifndef CONFIG_INLINE_SPIN_UNLOCK
176void __lockfunc _read_lock(rwlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
177{ 168{
178 __read_lock(lock); 169 __raw_spin_unlock(lock);
179} 170}
180EXPORT_SYMBOL(_read_lock); 171EXPORT_SYMBOL(_raw_spin_unlock);
181#endif 172#endif
182 173
183#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE 174#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
184unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 175void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
185{ 176{
186 return __spin_lock_irqsave(lock); 177 __raw_spin_unlock_irqrestore(lock, flags);
187} 178}
188EXPORT_SYMBOL(_spin_lock_irqsave); 179EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
189#endif 180#endif
190 181
191#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ 182#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
192void __lockfunc _spin_lock_irq(spinlock_t *lock) 183void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
193{ 184{
194 __spin_lock_irq(lock); 185 __raw_spin_unlock_irq(lock);
195} 186}
196EXPORT_SYMBOL(_spin_lock_irq); 187EXPORT_SYMBOL(_raw_spin_unlock_irq);
197#endif 188#endif
198 189
199#ifndef CONFIG_INLINE_SPIN_LOCK_BH 190#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
200void __lockfunc _spin_lock_bh(spinlock_t *lock) 191void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
201{ 192{
202 __spin_lock_bh(lock); 193 __raw_spin_unlock_bh(lock);
203} 194}
204EXPORT_SYMBOL(_spin_lock_bh); 195EXPORT_SYMBOL(_raw_spin_unlock_bh);
205#endif 196#endif
206 197
207#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE 198#ifndef CONFIG_INLINE_READ_TRYLOCK
208unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 199int __lockfunc _raw_read_trylock(rwlock_t *lock)
209{ 200{
210 return __read_lock_irqsave(lock); 201 return __raw_read_trylock(lock);
211} 202}
212EXPORT_SYMBOL(_read_lock_irqsave); 203EXPORT_SYMBOL(_raw_read_trylock);
213#endif 204#endif
214 205
215#ifndef CONFIG_INLINE_READ_LOCK_IRQ 206#ifndef CONFIG_INLINE_READ_LOCK
216void __lockfunc _read_lock_irq(rwlock_t *lock) 207void __lockfunc _raw_read_lock(rwlock_t *lock)
217{ 208{
218 __read_lock_irq(lock); 209 __raw_read_lock(lock);
219} 210}
220EXPORT_SYMBOL(_read_lock_irq); 211EXPORT_SYMBOL(_raw_read_lock);
221#endif 212#endif
222 213
223#ifndef CONFIG_INLINE_READ_LOCK_BH 214#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
224void __lockfunc _read_lock_bh(rwlock_t *lock) 215unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
225{ 216{
226 __read_lock_bh(lock); 217 return __raw_read_lock_irqsave(lock);
227} 218}
228EXPORT_SYMBOL(_read_lock_bh); 219EXPORT_SYMBOL(_raw_read_lock_irqsave);
229#endif 220#endif
230 221
231#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE 222#ifndef CONFIG_INLINE_READ_LOCK_IRQ
232unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 223void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
233{ 224{
234 return __write_lock_irqsave(lock); 225 __raw_read_lock_irq(lock);
235} 226}
236EXPORT_SYMBOL(_write_lock_irqsave); 227EXPORT_SYMBOL(_raw_read_lock_irq);
237#endif 228#endif
238 229
239#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ 230#ifndef CONFIG_INLINE_READ_LOCK_BH
240void __lockfunc _write_lock_irq(rwlock_t *lock) 231void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
241{ 232{
242 __write_lock_irq(lock); 233 __raw_read_lock_bh(lock);
243} 234}
244EXPORT_SYMBOL(_write_lock_irq); 235EXPORT_SYMBOL(_raw_read_lock_bh);
245#endif 236#endif
246 237
247#ifndef CONFIG_INLINE_WRITE_LOCK_BH 238#ifndef CONFIG_INLINE_READ_UNLOCK
248void __lockfunc _write_lock_bh(rwlock_t *lock) 239void __lockfunc _raw_read_unlock(rwlock_t *lock)
249{ 240{
250 __write_lock_bh(lock); 241 __raw_read_unlock(lock);
251} 242}
252EXPORT_SYMBOL(_write_lock_bh); 243EXPORT_SYMBOL(_raw_read_unlock);
253#endif 244#endif
254 245
255#ifndef CONFIG_INLINE_SPIN_LOCK 246#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
256void __lockfunc _spin_lock(spinlock_t *lock) 247void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
257{ 248{
258 __spin_lock(lock); 249 __raw_read_unlock_irqrestore(lock, flags);
259} 250}
260EXPORT_SYMBOL(_spin_lock); 251EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
261#endif 252#endif
262 253
263#ifndef CONFIG_INLINE_WRITE_LOCK 254#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
264void __lockfunc _write_lock(rwlock_t *lock) 255void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
265{ 256{
266 __write_lock(lock); 257 __raw_read_unlock_irq(lock);
267} 258}
268EXPORT_SYMBOL(_write_lock); 259EXPORT_SYMBOL(_raw_read_unlock_irq);
269#endif 260#endif
270 261
271#ifndef CONFIG_INLINE_SPIN_UNLOCK 262#ifndef CONFIG_INLINE_READ_UNLOCK_BH
272void __lockfunc _spin_unlock(spinlock_t *lock) 263void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
273{ 264{
274 __spin_unlock(lock); 265 __raw_read_unlock_bh(lock);
275} 266}
276EXPORT_SYMBOL(_spin_unlock); 267EXPORT_SYMBOL(_raw_read_unlock_bh);
277#endif 268#endif
278 269
279#ifndef CONFIG_INLINE_WRITE_UNLOCK 270#ifndef CONFIG_INLINE_WRITE_TRYLOCK
280void __lockfunc _write_unlock(rwlock_t *lock) 271int __lockfunc _raw_write_trylock(rwlock_t *lock)
281{ 272{
282 __write_unlock(lock); 273 return __raw_write_trylock(lock);
283} 274}
284EXPORT_SYMBOL(_write_unlock); 275EXPORT_SYMBOL(_raw_write_trylock);
285#endif 276#endif
286 277
287#ifndef CONFIG_INLINE_READ_UNLOCK 278#ifndef CONFIG_INLINE_WRITE_LOCK
288void __lockfunc _read_unlock(rwlock_t *lock) 279void __lockfunc _raw_write_lock(rwlock_t *lock)
289{ 280{
290 __read_unlock(lock); 281 __raw_write_lock(lock);
291} 282}
292EXPORT_SYMBOL(_read_unlock); 283EXPORT_SYMBOL(_raw_write_lock);
293#endif 284#endif
294 285
295#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE 286#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
296void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 287unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
297{ 288{
298 __spin_unlock_irqrestore(lock, flags); 289 return __raw_write_lock_irqsave(lock);
299} 290}
300EXPORT_SYMBOL(_spin_unlock_irqrestore); 291EXPORT_SYMBOL(_raw_write_lock_irqsave);
301#endif 292#endif
302 293
303#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ 294#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
304void __lockfunc _spin_unlock_irq(spinlock_t *lock) 295void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
305{ 296{
306 __spin_unlock_irq(lock); 297 __raw_write_lock_irq(lock);
307} 298}
308EXPORT_SYMBOL(_spin_unlock_irq); 299EXPORT_SYMBOL(_raw_write_lock_irq);
309#endif 300#endif
310 301
311#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH 302#ifndef CONFIG_INLINE_WRITE_LOCK_BH
312void __lockfunc _spin_unlock_bh(spinlock_t *lock) 303void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
313{ 304{
314 __spin_unlock_bh(lock); 305 __raw_write_lock_bh(lock);
315} 306}
316EXPORT_SYMBOL(_spin_unlock_bh); 307EXPORT_SYMBOL(_raw_write_lock_bh);
317#endif 308#endif
318 309
319#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE 310#ifndef CONFIG_INLINE_WRITE_UNLOCK
320void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 311void __lockfunc _raw_write_unlock(rwlock_t *lock)
321{ 312{
322 __read_unlock_irqrestore(lock, flags); 313 __raw_write_unlock(lock);
323} 314}
324EXPORT_SYMBOL(_read_unlock_irqrestore); 315EXPORT_SYMBOL(_raw_write_unlock);
325#endif 316#endif
326 317
327#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ 318#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
328void __lockfunc _read_unlock_irq(rwlock_t *lock) 319void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
329{ 320{
330 __read_unlock_irq(lock); 321 __raw_write_unlock_irqrestore(lock, flags);
331} 322}
332EXPORT_SYMBOL(_read_unlock_irq); 323EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
333#endif 324#endif
334 325
335#ifndef CONFIG_INLINE_READ_UNLOCK_BH 326#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
336void __lockfunc _read_unlock_bh(rwlock_t *lock) 327void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
337{ 328{
338 __read_unlock_bh(lock); 329 __raw_write_unlock_irq(lock);
339} 330}
340EXPORT_SYMBOL(_read_unlock_bh); 331EXPORT_SYMBOL(_raw_write_unlock_irq);
341#endif 332#endif
342 333
343#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE 334#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
344void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 335void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
345{ 336{
346 __write_unlock_irqrestore(lock, flags); 337 __raw_write_unlock_bh(lock);
347} 338}
348EXPORT_SYMBOL(_write_unlock_irqrestore); 339EXPORT_SYMBOL(_raw_write_unlock_bh);
349#endif 340#endif
350 341
351#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ 342#ifdef CONFIG_DEBUG_LOCK_ALLOC
352void __lockfunc _write_unlock_irq(rwlock_t *lock) 343
344void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
353{ 345{
354 __write_unlock_irq(lock); 346 preempt_disable();
347 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
348 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
355} 349}
356EXPORT_SYMBOL(_write_unlock_irq); 350EXPORT_SYMBOL(_raw_spin_lock_nested);
357#endif
358 351
359#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH 352unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
360void __lockfunc _write_unlock_bh(rwlock_t *lock) 353 int subclass)
361{ 354{
362 __write_unlock_bh(lock); 355 unsigned long flags;
356
357 local_irq_save(flags);
358 preempt_disable();
359 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
360 LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
361 do_raw_spin_lock_flags, &flags);
362 return flags;
363} 363}
364EXPORT_SYMBOL(_write_unlock_bh); 364EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
365#endif
366 365
367#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH 366void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock,
368int __lockfunc _spin_trylock_bh(spinlock_t *lock) 367 struct lockdep_map *nest_lock)
369{ 368{
370 return __spin_trylock_bh(lock); 369 preempt_disable();
370 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
371 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
371} 372}
372EXPORT_SYMBOL(_spin_trylock_bh); 373EXPORT_SYMBOL(_raw_spin_lock_nest_lock);
374
373#endif 375#endif
374 376
375notrace int in_lock_functions(unsigned long addr) 377notrace int in_lock_functions(unsigned long addr)
diff --git a/kernel/sys.c b/kernel/sys.c
index 9968c5fb55b..26a6b73a6b8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/smp_lock.h>
12#include <linux/notifier.h> 11#include <linux/notifier.h>
13#include <linux/reboot.h> 12#include <linux/reboot.h>
14#include <linux/prctl.h> 13#include <linux/prctl.h>
@@ -163,6 +162,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
163 if (niceval > 19) 162 if (niceval > 19)
164 niceval = 19; 163 niceval = 19;
165 164
165 rcu_read_lock();
166 read_lock(&tasklist_lock); 166 read_lock(&tasklist_lock);
167 switch (which) { 167 switch (which) {
168 case PRIO_PROCESS: 168 case PRIO_PROCESS:
@@ -190,16 +190,17 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
190 !(user = find_user(who))) 190 !(user = find_user(who)))
191 goto out_unlock; /* No processes for this user */ 191 goto out_unlock; /* No processes for this user */
192 192
193 do_each_thread(g, p) 193 do_each_thread(g, p) {
194 if (__task_cred(p)->uid == who) 194 if (__task_cred(p)->uid == who)
195 error = set_one_prio(p, niceval, error); 195 error = set_one_prio(p, niceval, error);
196 while_each_thread(g, p); 196 } while_each_thread(g, p);
197 if (who != cred->uid) 197 if (who != cred->uid)
198 free_uid(user); /* For find_user() */ 198 free_uid(user); /* For find_user() */
199 break; 199 break;
200 } 200 }
201out_unlock: 201out_unlock:
202 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
203 rcu_read_unlock();
203out: 204out:
204 return error; 205 return error;
205} 206}
@@ -253,13 +254,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
253 !(user = find_user(who))) 254 !(user = find_user(who)))
254 goto out_unlock; /* No processes for this user */ 255 goto out_unlock; /* No processes for this user */
255 256
256 do_each_thread(g, p) 257 do_each_thread(g, p) {
257 if (__task_cred(p)->uid == who) { 258 if (__task_cred(p)->uid == who) {
258 niceval = 20 - task_nice(p); 259 niceval = 20 - task_nice(p);
259 if (niceval > retval) 260 if (niceval > retval)
260 retval = niceval; 261 retval = niceval;
261 } 262 }
262 while_each_thread(g, p); 263 } while_each_thread(g, p);
263 if (who != cred->uid) 264 if (who != cred->uid)
264 free_uid(user); /* for find_user() */ 265 free_uid(user); /* for find_user() */
265 break; 266 break;
@@ -349,6 +350,9 @@ void kernel_power_off(void)
349 machine_power_off(); 350 machine_power_off();
350} 351}
351EXPORT_SYMBOL_GPL(kernel_power_off); 352EXPORT_SYMBOL_GPL(kernel_power_off);
353
354static DEFINE_MUTEX(reboot_mutex);
355
352/* 356/*
353 * Reboot system call: for obvious reasons only root may call it, 357 * Reboot system call: for obvious reasons only root may call it,
354 * and even root needs to set up some magic numbers in the registers 358 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +385,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
381 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) 385 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
382 cmd = LINUX_REBOOT_CMD_HALT; 386 cmd = LINUX_REBOOT_CMD_HALT;
383 387
384 lock_kernel(); 388 mutex_lock(&reboot_mutex);
385 switch (cmd) { 389 switch (cmd) {
386 case LINUX_REBOOT_CMD_RESTART: 390 case LINUX_REBOOT_CMD_RESTART:
387 kernel_restart(NULL); 391 kernel_restart(NULL);
@@ -397,20 +401,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 401
398 case LINUX_REBOOT_CMD_HALT: 402 case LINUX_REBOOT_CMD_HALT:
399 kernel_halt(); 403 kernel_halt();
400 unlock_kernel();
401 do_exit(0); 404 do_exit(0);
402 panic("cannot halt"); 405 panic("cannot halt");
403 406
404 case LINUX_REBOOT_CMD_POWER_OFF: 407 case LINUX_REBOOT_CMD_POWER_OFF:
405 kernel_power_off(); 408 kernel_power_off();
406 unlock_kernel();
407 do_exit(0); 409 do_exit(0);
408 break; 410 break;
409 411
410 case LINUX_REBOOT_CMD_RESTART2: 412 case LINUX_REBOOT_CMD_RESTART2:
411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 413 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
412 unlock_kernel(); 414 ret = -EFAULT;
413 return -EFAULT; 415 break;
414 } 416 }
415 buffer[sizeof(buffer) - 1] = '\0'; 417 buffer[sizeof(buffer) - 1] = '\0';
416 418
@@ -433,7 +435,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
433 ret = -EINVAL; 435 ret = -EINVAL;
434 break; 436 break;
435 } 437 }
436 unlock_kernel(); 438 mutex_unlock(&reboot_mutex);
437 return ret; 439 return ret;
438} 440}
439 441
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9327a26765c..8a68b244846 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -244,6 +244,10 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
245static int min_wakeup_granularity_ns; /* 0 usecs */ 245static int min_wakeup_granularity_ns; /* 0 usecs */
246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
247static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
248static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
249static int min_sched_shares_ratelimit = 100000; /* 100 usec */
250static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
247#endif 251#endif
248 252
249static struct ctl_table kern_table[] = { 253static struct ctl_table kern_table[] = {
@@ -260,7 +264,7 @@ static struct ctl_table kern_table[] = {
260 .data = &sysctl_sched_min_granularity, 264 .data = &sysctl_sched_min_granularity,
261 .maxlen = sizeof(unsigned int), 265 .maxlen = sizeof(unsigned int),
262 .mode = 0644, 266 .mode = 0644,
263 .proc_handler = sched_nr_latency_handler, 267 .proc_handler = sched_proc_update_handler,
264 .extra1 = &min_sched_granularity_ns, 268 .extra1 = &min_sched_granularity_ns,
265 .extra2 = &max_sched_granularity_ns, 269 .extra2 = &max_sched_granularity_ns,
266 }, 270 },
@@ -269,7 +273,7 @@ static struct ctl_table kern_table[] = {
269 .data = &sysctl_sched_latency, 273 .data = &sysctl_sched_latency,
270 .maxlen = sizeof(unsigned int), 274 .maxlen = sizeof(unsigned int),
271 .mode = 0644, 275 .mode = 0644,
272 .proc_handler = sched_nr_latency_handler, 276 .proc_handler = sched_proc_update_handler,
273 .extra1 = &min_sched_granularity_ns, 277 .extra1 = &min_sched_granularity_ns,
274 .extra2 = &max_sched_granularity_ns, 278 .extra2 = &max_sched_granularity_ns,
275 }, 279 },
@@ -278,7 +282,7 @@ static struct ctl_table kern_table[] = {
278 .data = &sysctl_sched_wakeup_granularity, 282 .data = &sysctl_sched_wakeup_granularity,
279 .maxlen = sizeof(unsigned int), 283 .maxlen = sizeof(unsigned int),
280 .mode = 0644, 284 .mode = 0644,
281 .proc_handler = proc_dointvec_minmax, 285 .proc_handler = sched_proc_update_handler,
282 .extra1 = &min_wakeup_granularity_ns, 286 .extra1 = &min_wakeup_granularity_ns,
283 .extra2 = &max_wakeup_granularity_ns, 287 .extra2 = &max_wakeup_granularity_ns,
284 }, 288 },
@@ -287,7 +291,18 @@ static struct ctl_table kern_table[] = {
287 .data = &sysctl_sched_shares_ratelimit, 291 .data = &sysctl_sched_shares_ratelimit,
288 .maxlen = sizeof(unsigned int), 292 .maxlen = sizeof(unsigned int),
289 .mode = 0644, 293 .mode = 0644,
290 .proc_handler = proc_dointvec, 294 .proc_handler = sched_proc_update_handler,
295 .extra1 = &min_sched_shares_ratelimit,
296 .extra2 = &max_sched_shares_ratelimit,
297 },
298 {
299 .procname = "sched_tunable_scaling",
300 .data = &sysctl_sched_tunable_scaling,
301 .maxlen = sizeof(enum sched_tunable_scaling),
302 .mode = 0644,
303 .proc_handler = sched_proc_update_handler,
304 .extra1 = &min_sched_tunable_scaling,
305 .extra2 = &max_sched_tunable_scaling,
291 }, 306 },
292 { 307 {
293 .procname = "sched_shares_thresh", 308 .procname = "sched_shares_thresh",
@@ -298,13 +313,6 @@ static struct ctl_table kern_table[] = {
298 .extra1 = &zero, 313 .extra1 = &zero,
299 }, 314 },
300 { 315 {
301 .procname = "sched_features",
302 .data = &sysctl_sched_features,
303 .maxlen = sizeof(unsigned int),
304 .mode = 0644,
305 .proc_handler = proc_dointvec,
306 },
307 {
308 .procname = "sched_migration_cost", 316 .procname = "sched_migration_cost",
309 .data = &sysctl_sched_migration_cost, 317 .data = &sysctl_sched_migration_cost,
310 .maxlen = sizeof(unsigned int), 318 .maxlen = sizeof(unsigned int),
@@ -1043,7 +1051,7 @@ static struct ctl_table vm_table[] = {
1043 .extra2 = &one_hundred, 1051 .extra2 = &one_hundred,
1044 }, 1052 },
1045#ifdef CONFIG_HUGETLB_PAGE 1053#ifdef CONFIG_HUGETLB_PAGE
1046 { 1054 {
1047 .procname = "nr_hugepages", 1055 .procname = "nr_hugepages",
1048 .data = NULL, 1056 .data = NULL,
1049 .maxlen = sizeof(unsigned long), 1057 .maxlen = sizeof(unsigned long),
@@ -1051,7 +1059,18 @@ static struct ctl_table vm_table[] = {
1051 .proc_handler = hugetlb_sysctl_handler, 1059 .proc_handler = hugetlb_sysctl_handler,
1052 .extra1 = (void *)&hugetlb_zero, 1060 .extra1 = (void *)&hugetlb_zero,
1053 .extra2 = (void *)&hugetlb_infinity, 1061 .extra2 = (void *)&hugetlb_infinity,
1054 }, 1062 },
1063#ifdef CONFIG_NUMA
1064 {
1065 .procname = "nr_hugepages_mempolicy",
1066 .data = NULL,
1067 .maxlen = sizeof(unsigned long),
1068 .mode = 0644,
1069 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1070 .extra1 = (void *)&hugetlb_zero,
1071 .extra2 = (void *)&hugetlb_infinity,
1072 },
1073#endif
1055 { 1074 {
1056 .procname = "hugetlb_shm_group", 1075 .procname = "hugetlb_shm_group",
1057 .data = &sysctl_hugetlb_shm_group, 1076 .data = &sysctl_hugetlb_shm_group,
@@ -1112,7 +1131,8 @@ static struct ctl_table vm_table[] = {
1112 .data = &sysctl_max_map_count, 1131 .data = &sysctl_max_map_count,
1113 .maxlen = sizeof(sysctl_max_map_count), 1132 .maxlen = sizeof(sysctl_max_map_count),
1114 .mode = 0644, 1133 .mode = 0644,
1115 .proc_handler = proc_dointvec 1134 .proc_handler = proc_dointvec_minmax,
1135 .extra1 = &zero,
1116 }, 1136 },
1117#else 1137#else
1118 { 1138 {
@@ -1194,6 +1214,7 @@ static struct ctl_table vm_table[] = {
1194 .proc_handler = proc_dointvec_jiffies, 1214 .proc_handler = proc_dointvec_jiffies,
1195 }, 1215 },
1196#endif 1216#endif
1217#ifdef CONFIG_MMU
1197 { 1218 {
1198 .procname = "mmap_min_addr", 1219 .procname = "mmap_min_addr",
1199 .data = &dac_mmap_min_addr, 1220 .data = &dac_mmap_min_addr,
@@ -1201,6 +1222,7 @@ static struct ctl_table vm_table[] = {
1201 .mode = 0644, 1222 .mode = 0644,
1202 .proc_handler = mmap_min_addr_handler, 1223 .proc_handler = mmap_min_addr_handler,
1203 }, 1224 },
1225#endif
1204#ifdef CONFIG_NUMA 1226#ifdef CONFIG_NUMA
1205 { 1227 {
1206 .procname = "numa_zonelist_order", 1228 .procname = "numa_zonelist_order",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b75dbf40f57..8f5d16e0707 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1399,6 +1399,13 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1399{ 1399{
1400 int i; 1400 int i;
1401 1401
1402 /*
1403 * CTL_KERN/KERN_VERSION is used by older glibc and cannot
1404 * ever go away.
1405 */
1406 if (name[0] == CTL_KERN && name[1] == KERN_VERSION)
1407 return;
1408
1402 if (printk_ratelimit()) { 1409 if (printk_ratelimit()) {
1403 printk(KERN_INFO 1410 printk(KERN_INFO
1404 "warning: process `%s' used the deprecated sysctl " 1411 "warning: process `%s' used the deprecated sysctl "
@@ -1410,6 +1417,35 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1410 return; 1417 return;
1411} 1418}
1412 1419
1420#define WARN_ONCE_HASH_BITS 8
1421#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
1422
1423static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
1424
1425#define FNV32_OFFSET 2166136261U
1426#define FNV32_PRIME 0x01000193
1427
1428/*
1429 * Print each legacy sysctl (approximately) only once.
1430 * To avoid making the tables non-const use a external
1431 * hash-table instead.
1432 * Worst case hash collision: 6, but very rarely.
1433 * NOTE! We don't use the SMP-safe bit tests. We simply
1434 * don't care enough.
1435 */
1436static void warn_on_bintable(const int *name, int nlen)
1437{
1438 int i;
1439 u32 hash = FNV32_OFFSET;
1440
1441 for (i = 0; i < nlen; i++)
1442 hash = (hash ^ name[i]) * FNV32_PRIME;
1443 hash %= WARN_ONCE_HASH_SIZE;
1444 if (__test_and_set_bit(hash, warn_once_bitmap))
1445 return;
1446 deprecated_sysctl_warning(name, nlen);
1447}
1448
1413static ssize_t do_sysctl(int __user *args_name, int nlen, 1449static ssize_t do_sysctl(int __user *args_name, int nlen,
1414 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1450 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1415{ 1451{
@@ -1424,7 +1460,7 @@ static ssize_t do_sysctl(int __user *args_name, int nlen,
1424 if (get_user(name[i], args_name + i)) 1460 if (get_user(name[i], args_name + i))
1425 return -EFAULT; 1461 return -EFAULT;
1426 1462
1427 deprecated_sysctl_warning(name, nlen); 1463 warn_on_bintable(name, nlen);
1428 1464
1429 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); 1465 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1430} 1466}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 620b58abdc3..d7395fdfb9f 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -20,6 +20,8 @@
20#include <linux/sysdev.h> 20#include <linux/sysdev.h>
21#include <linux/tick.h> 21#include <linux/tick.h>
22 22
23#include "tick-internal.h"
24
23/* The registered clock event devices */ 25/* The registered clock event devices */
24static LIST_HEAD(clockevent_devices); 26static LIST_HEAD(clockevent_devices);
25static LIST_HEAD(clockevents_released); 27static LIST_HEAD(clockevents_released);
@@ -28,7 +30,7 @@ static LIST_HEAD(clockevents_released);
28static RAW_NOTIFIER_HEAD(clockevents_chain); 30static RAW_NOTIFIER_HEAD(clockevents_chain);
29 31
30/* Protection for the above */ 32/* Protection for the above */
31static DEFINE_SPINLOCK(clockevents_lock); 33static DEFINE_RAW_SPINLOCK(clockevents_lock);
32 34
33/** 35/**
34 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 36 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -37,10 +39,9 @@ static DEFINE_SPINLOCK(clockevents_lock);
37 * 39 *
38 * Math helper, returns latch value converted to nanoseconds (bound checked) 40 * Math helper, returns latch value converted to nanoseconds (bound checked)
39 */ 41 */
40unsigned long clockevent_delta2ns(unsigned long latch, 42u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
41 struct clock_event_device *evt)
42{ 43{
43 u64 clc = ((u64) latch << evt->shift); 44 u64 clc = (u64) latch << evt->shift;
44 45
45 if (unlikely(!evt->mult)) { 46 if (unlikely(!evt->mult)) {
46 evt->mult = 1; 47 evt->mult = 1;
@@ -50,10 +51,10 @@ unsigned long clockevent_delta2ns(unsigned long latch,
50 do_div(clc, evt->mult); 51 do_div(clc, evt->mult);
51 if (clc < 1000) 52 if (clc < 1000)
52 clc = 1000; 53 clc = 1000;
53 if (clc > LONG_MAX) 54 if (clc > KTIME_MAX)
54 clc = LONG_MAX; 55 clc = KTIME_MAX;
55 56
56 return (unsigned long) clc; 57 return clc;
57} 58}
58EXPORT_SYMBOL_GPL(clockevent_delta2ns); 59EXPORT_SYMBOL_GPL(clockevent_delta2ns);
59 60
@@ -140,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb)
140 unsigned long flags; 141 unsigned long flags;
141 int ret; 142 int ret;
142 143
143 spin_lock_irqsave(&clockevents_lock, flags); 144 raw_spin_lock_irqsave(&clockevents_lock, flags);
144 ret = raw_notifier_chain_register(&clockevents_chain, nb); 145 ret = raw_notifier_chain_register(&clockevents_chain, nb);
145 spin_unlock_irqrestore(&clockevents_lock, flags); 146 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
146 147
147 return ret; 148 return ret;
148} 149}
@@ -184,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev)
184 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 185 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
185 BUG_ON(!dev->cpumask); 186 BUG_ON(!dev->cpumask);
186 187
187 spin_lock_irqsave(&clockevents_lock, flags); 188 raw_spin_lock_irqsave(&clockevents_lock, flags);
188 189
189 list_add(&dev->list, &clockevent_devices); 190 list_add(&dev->list, &clockevent_devices);
190 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 191 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
191 clockevents_notify_released(); 192 clockevents_notify_released();
192 193
193 spin_unlock_irqrestore(&clockevents_lock, flags); 194 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
194} 195}
195EXPORT_SYMBOL_GPL(clockevents_register_device); 196EXPORT_SYMBOL_GPL(clockevents_register_device);
196 197
@@ -237,10 +238,11 @@ void clockevents_exchange_device(struct clock_event_device *old,
237 */ 238 */
238void clockevents_notify(unsigned long reason, void *arg) 239void clockevents_notify(unsigned long reason, void *arg)
239{ 240{
240 struct list_head *node, *tmp; 241 struct clock_event_device *dev, *tmp;
241 unsigned long flags; 242 unsigned long flags;
243 int cpu;
242 244
243 spin_lock_irqsave(&clockevents_lock, flags); 245 raw_spin_lock_irqsave(&clockevents_lock, flags);
244 clockevents_do_notify(reason, arg); 246 clockevents_do_notify(reason, arg);
245 247
246 switch (reason) { 248 switch (reason) {
@@ -249,13 +251,25 @@ void clockevents_notify(unsigned long reason, void *arg)
249 * Unregister the clock event devices which were 251 * Unregister the clock event devices which were
250 * released from the users in the notify chain. 252 * released from the users in the notify chain.
251 */ 253 */
252 list_for_each_safe(node, tmp, &clockevents_released) 254 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
253 list_del(node); 255 list_del(&dev->list);
256 /*
257 * Now check whether the CPU has left unused per cpu devices
258 */
259 cpu = *((int *)arg);
260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
261 if (cpumask_test_cpu(cpu, dev->cpumask) &&
262 cpumask_weight(dev->cpumask) == 1 &&
263 !tick_is_broadcast_device(dev)) {
264 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
265 list_del(&dev->list);
266 }
267 }
254 break; 268 break;
255 default: 269 default:
256 break; 270 break;
257 } 271 }
258 spin_unlock_irqrestore(&clockevents_lock, flags); 272 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
259} 273}
260EXPORT_SYMBOL_GPL(clockevents_notify); 274EXPORT_SYMBOL_GPL(clockevents_notify);
261#endif 275#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 4a310906b3e..13700833c18 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -107,6 +107,59 @@ u64 timecounter_cyc2time(struct timecounter *tc,
107} 107}
108EXPORT_SYMBOL_GPL(timecounter_cyc2time); 108EXPORT_SYMBOL_GPL(timecounter_cyc2time);
109 109
110/**
111 * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
112 * @mult: pointer to mult variable
113 * @shift: pointer to shift variable
114 * @from: frequency to convert from
115 * @to: frequency to convert to
116 * @minsec: guaranteed runtime conversion range in seconds
117 *
118 * The function evaluates the shift/mult pair for the scaled math
119 * operations of clocksources and clockevents.
120 *
121 * @to and @from are frequency values in HZ. For clock sources @to is
122 * NSEC_PER_SEC == 1GHz and @from is the counter frequency. For clock
123 * event @to is the counter frequency and @from is NSEC_PER_SEC.
124 *
125 * The @minsec conversion range argument controls the time frame in
126 * seconds which must be covered by the runtime conversion with the
127 * calculated mult and shift factors. This guarantees that no 64bit
128 * overflow happens when the input value of the conversion is
129 * multiplied with the calculated mult factor. Larger ranges may
130 * reduce the conversion accuracy by chosing smaller mult and shift
131 * factors.
132 */
133void
134clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec)
135{
136 u64 tmp;
137 u32 sft, sftacc= 32;
138
139 /*
140 * Calculate the shift factor which is limiting the conversion
141 * range:
142 */
143 tmp = ((u64)minsec * from) >> 32;
144 while (tmp) {
145 tmp >>=1;
146 sftacc--;
147 }
148
149 /*
150 * Find the conversion shift/mult pair which has the best
151 * accuracy and fits the maxsec conversion range:
152 */
153 for (sft = 32; sft > 0; sft--) {
154 tmp = (u64) to << sft;
155 do_div(tmp, from);
156 if ((tmp >> sftacc) == 0)
157 break;
158 }
159 *mult = tmp;
160 *shift = sft;
161}
162
110/*[Clocksource internal variables]--------- 163/*[Clocksource internal variables]---------
111 * curr_clocksource: 164 * curr_clocksource:
112 * currently selected clocksource. 165 * currently selected clocksource.
@@ -290,7 +343,19 @@ static void clocksource_resume_watchdog(void)
290{ 343{
291 unsigned long flags; 344 unsigned long flags;
292 345
293 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential dead lock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept, that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
294 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
295 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
296} 361}
@@ -405,14 +470,55 @@ void clocksource_resume(void)
405 * clocksource_touch_watchdog - Update watchdog 470 * clocksource_touch_watchdog - Update watchdog
406 * 471 *
407 * Update the watchdog after exception contexts such as kgdb so as not 472 * Update the watchdog after exception contexts such as kgdb so as not
408 * to incorrectly trip the watchdog. 473 * to incorrectly trip the watchdog. This might fail when the kernel
409 * 474 * was stopped in code which holds watchdog_lock.
410 */ 475 */
411void clocksource_touch_watchdog(void) 476void clocksource_touch_watchdog(void)
412{ 477{
413 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
414} 479}
415 480
481/**
482 * clocksource_max_deferment - Returns max time the clocksource can be deferred
483 * @cs: Pointer to clocksource
484 *
485 */
486static u64 clocksource_max_deferment(struct clocksource *cs)
487{
488 u64 max_nsecs, max_cycles;
489
490 /*
491 * Calculate the maximum number of cycles that we can pass to the
492 * cyc2ns function without overflowing a 64-bit signed result. The
493 * maximum number of cycles is equal to ULLONG_MAX/cs->mult which
494 * is equivalent to the below.
495 * max_cycles < (2^63)/cs->mult
496 * max_cycles < 2^(log2((2^63)/cs->mult))
497 * max_cycles < 2^(log2(2^63) - log2(cs->mult))
498 * max_cycles < 2^(63 - log2(cs->mult))
499 * max_cycles < 1 << (63 - log2(cs->mult))
500 * Please note that we add 1 to the result of the log2 to account for
501 * any rounding errors, ensure the above inequality is satisfied and
502 * no overflow will occur.
503 */
504 max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1));
505
506 /*
507 * The actual maximum number of cycles we can defer the clocksource is
508 * determined by the minimum of max_cycles and cs->mask.
509 */
510 max_cycles = min_t(u64, max_cycles, (u64) cs->mask);
511 max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift);
512
513 /*
514 * To ensure that the clocksource does not wrap whilst we are idle,
515 * limit the time the clocksource can be deferred by 12.5%. Please
516 * note a margin of 12.5% is used because this can be computed with
517 * a shift, versus say 10% which would require division.
518 */
519 return max_nsecs - (max_nsecs >> 5);
520}
521
416#ifdef CONFIG_GENERIC_TIME 522#ifdef CONFIG_GENERIC_TIME
417 523
418/** 524/**
@@ -511,6 +617,9 @@ static void clocksource_enqueue(struct clocksource *cs)
511 */ 617 */
512int clocksource_register(struct clocksource *cs) 618int clocksource_register(struct clocksource *cs)
513{ 619{
620 /* calculate max idle time permitted for this clocksource */
621 cs->max_idle_ns = clocksource_max_deferment(cs);
622
514 mutex_lock(&clocksource_mutex); 623 mutex_lock(&clocksource_mutex);
515 clocksource_enqueue(cs); 624 clocksource_enqueue(cs);
516 clocksource_select(); 625 clocksource_select();
@@ -580,7 +689,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
580 * @count: length of buffer 689 * @count: length of buffer
581 * 690 *
582 * Takes input from sysfs interface for manually overriding the default 691 * Takes input from sysfs interface for manually overriding the default
583 * clocksource selction. 692 * clocksource selection.
584 */ 693 */
585static ssize_t sysfs_override_clocksource(struct sys_device *dev, 694static ssize_t sysfs_override_clocksource(struct sys_device *dev,
586 struct sysdev_attribute *attr, 695 struct sysdev_attribute *attr,
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c2ec25087a3..b3bafd5fc66 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 35static int tick_broadcast_force;
36 36
37#ifdef CONFIG_TICK_ONESHOT 37#ifdef CONFIG_TICK_ONESHOT
@@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
96 unsigned long flags; 96 unsigned long flags;
97 int ret = 0; 97 int ret = 0;
98 98
99 spin_lock_irqsave(&tick_broadcast_lock, flags); 99 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
100 100
101 /* 101 /*
102 * Devices might be registered with both periodic and oneshot 102 * Devices might be registered with both periodic and oneshot
@@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
122 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
123 } 123 }
124 } 124 }
125 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 125 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
126 return ret; 126 return ret;
127} 127}
128 128
@@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask)
161 */ 161 */
162static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
163{ 163{
164 spin_lock(&tick_broadcast_lock); 164 raw_spin_lock(&tick_broadcast_lock);
165 165
166 cpumask_and(to_cpumask(tmpmask), 166 cpumask_and(to_cpumask(tmpmask),
167 cpu_online_mask, tick_get_broadcast_mask()); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask)); 168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 raw_spin_unlock(&tick_broadcast_lock);
171} 171}
172 172
173/* 173/*
@@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
212 unsigned long flags; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
216 216
217 cpu = smp_processor_id(); 217 cpu = smp_processor_id();
218 td = &per_cpu(tick_cpu_device, cpu); 218 td = &per_cpu(tick_cpu_device, cpu);
@@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
263 tick_broadcast_setup_oneshot(bc); 263 tick_broadcast_setup_oneshot(bc);
264 } 264 }
265out: 265out:
266 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 266 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
267} 267}
268 268
269/* 269/*
@@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
299 unsigned long flags; 299 unsigned long flags;
300 unsigned int cpu = *cpup; 300 unsigned int cpu = *cpup;
301 301
302 spin_lock_irqsave(&tick_broadcast_lock, flags); 302 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
303 303
304 bc = tick_broadcast_device.evtdev; 304 bc = tick_broadcast_device.evtdev;
305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
@@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
309 clockevents_shutdown(bc); 309 clockevents_shutdown(bc);
310 } 310 }
311 311
312 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 312 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
313} 313}
314 314
315void tick_suspend_broadcast(void) 315void tick_suspend_broadcast(void)
@@ -317,13 +317,13 @@ void tick_suspend_broadcast(void)
317 struct clock_event_device *bc; 317 struct clock_event_device *bc;
318 unsigned long flags; 318 unsigned long flags;
319 319
320 spin_lock_irqsave(&tick_broadcast_lock, flags); 320 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
321 321
322 bc = tick_broadcast_device.evtdev; 322 bc = tick_broadcast_device.evtdev;
323 if (bc) 323 if (bc)
324 clockevents_shutdown(bc); 324 clockevents_shutdown(bc);
325 325
326 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 326 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
327} 327}
328 328
329int tick_resume_broadcast(void) 329int tick_resume_broadcast(void)
@@ -332,7 +332,7 @@ int tick_resume_broadcast(void)
332 unsigned long flags; 332 unsigned long flags;
333 int broadcast = 0; 333 int broadcast = 0;
334 334
335 spin_lock_irqsave(&tick_broadcast_lock, flags); 335 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
336 336
337 bc = tick_broadcast_device.evtdev; 337 bc = tick_broadcast_device.evtdev;
338 338
@@ -351,7 +351,7 @@ int tick_resume_broadcast(void)
351 break; 351 break;
352 } 352 }
353 } 353 }
354 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 354 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
355 355
356 return broadcast; 356 return broadcast;
357} 357}
@@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
405 ktime_t now, next_event; 405 ktime_t now, next_event;
406 int cpu; 406 int cpu;
407 407
408 spin_lock(&tick_broadcast_lock); 408 raw_spin_lock(&tick_broadcast_lock);
409again: 409again:
410 dev->next_event.tv64 = KTIME_MAX; 410 dev->next_event.tv64 = KTIME_MAX;
411 next_event.tv64 = KTIME_MAX; 411 next_event.tv64 = KTIME_MAX;
@@ -443,7 +443,7 @@ again:
443 if (tick_broadcast_set_event(next_event, 0)) 443 if (tick_broadcast_set_event(next_event, 0))
444 goto again; 444 goto again;
445 } 445 }
446 spin_unlock(&tick_broadcast_lock); 446 raw_spin_unlock(&tick_broadcast_lock);
447} 447}
448 448
449/* 449/*
@@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
457 unsigned long flags; 457 unsigned long flags;
458 int cpu; 458 int cpu;
459 459
460 spin_lock_irqsave(&tick_broadcast_lock, flags); 460 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
461 461
462 /* 462 /*
463 * Periodic mode does not care about the enter/exit of power 463 * Periodic mode does not care about the enter/exit of power
@@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
492 } 492 }
493 493
494out: 494out:
495 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 495 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
496} 496}
497 497
498/* 498/*
@@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void)
563 struct clock_event_device *bc; 563 struct clock_event_device *bc;
564 unsigned long flags; 564 unsigned long flags;
565 565
566 spin_lock_irqsave(&tick_broadcast_lock, flags); 566 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
567 567
568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
569 bc = tick_broadcast_device.evtdev; 569 bc = tick_broadcast_device.evtdev;
570 if (bc) 570 if (bc)
571 tick_broadcast_setup_oneshot(bc); 571 tick_broadcast_setup_oneshot(bc);
572 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 572 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
573} 573}
574 574
575 575
@@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
581 unsigned long flags; 581 unsigned long flags;
582 unsigned int cpu = *cpup; 582 unsigned int cpu = *cpup;
583 583
584 spin_lock_irqsave(&tick_broadcast_lock, flags); 584 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
585 585
586 /* 586 /*
587 * Clear the broadcast mask flag for the dead cpu, but do not 587 * Clear the broadcast mask flag for the dead cpu, but do not
@@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
589 */ 589 */
590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
591 591
592 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 592 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
593} 593}
594 594
595/* 595/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 83c4417b6a3..b6b898d2eee 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37static DEFINE_RAW_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
40 * Debugging: see timer_list.c 40 * Debugging: see timer_list.c
@@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
209 int cpu, ret = NOTIFY_OK; 209 int cpu, ret = NOTIFY_OK;
210 unsigned long flags; 210 unsigned long flags;
211 211
212 spin_lock_irqsave(&tick_device_lock, flags); 212 raw_spin_lock_irqsave(&tick_device_lock, flags);
213 213
214 cpu = smp_processor_id(); 214 cpu = smp_processor_id();
215 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 215 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
269 tick_oneshot_notify(); 269 tick_oneshot_notify();
270 270
271 spin_unlock_irqrestore(&tick_device_lock, flags); 271 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
272 return NOTIFY_STOP; 272 return NOTIFY_STOP;
273 273
274out_bc: 274out_bc:
@@ -278,7 +278,7 @@ out_bc:
278 if (tick_check_broadcast_device(newdev)) 278 if (tick_check_broadcast_device(newdev))
279 ret = NOTIFY_STOP; 279 ret = NOTIFY_STOP;
280 280
281 spin_unlock_irqrestore(&tick_device_lock, flags); 281 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
282 282
283 return ret; 283 return ret;
284} 284}
@@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup)
311 struct clock_event_device *dev = td->evtdev; 311 struct clock_event_device *dev = td->evtdev;
312 unsigned long flags; 312 unsigned long flags;
313 313
314 spin_lock_irqsave(&tick_device_lock, flags); 314 raw_spin_lock_irqsave(&tick_device_lock, flags);
315 td->mode = TICKDEV_MODE_PERIODIC; 315 td->mode = TICKDEV_MODE_PERIODIC;
316 if (dev) { 316 if (dev) {
317 /* 317 /*
@@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup)
322 clockevents_exchange_device(dev, NULL); 322 clockevents_exchange_device(dev, NULL);
323 td->evtdev = NULL; 323 td->evtdev = NULL;
324 } 324 }
325 spin_unlock_irqrestore(&tick_device_lock, flags); 325 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
326} 326}
327 327
328static void tick_suspend(void) 328static void tick_suspend(void)
@@ -330,9 +330,9 @@ static void tick_suspend(void)
330 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 330 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
331 unsigned long flags; 331 unsigned long flags;
332 332
333 spin_lock_irqsave(&tick_device_lock, flags); 333 raw_spin_lock_irqsave(&tick_device_lock, flags);
334 clockevents_shutdown(td->evtdev); 334 clockevents_shutdown(td->evtdev);
335 spin_unlock_irqrestore(&tick_device_lock, flags); 335 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
336} 336}
337 337
338static void tick_resume(void) 338static void tick_resume(void)
@@ -341,7 +341,7 @@ static void tick_resume(void)
341 unsigned long flags; 341 unsigned long flags;
342 int broadcast = tick_resume_broadcast(); 342 int broadcast = tick_resume_broadcast();
343 343
344 spin_lock_irqsave(&tick_device_lock, flags); 344 raw_spin_lock_irqsave(&tick_device_lock, flags);
345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
346 346
347 if (!broadcast) { 347 if (!broadcast) {
@@ -350,7 +350,7 @@ static void tick_resume(void)
350 else 350 else
351 tick_resume_oneshot(); 351 tick_resume_oneshot();
352 } 352 }
353 spin_unlock_irqrestore(&tick_device_lock, flags); 353 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
354} 354}
355 355
356/* 356/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b1c05bf75ee..290eefbc1f6 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,7 +6,6 @@
6#define TICK_DO_TIMER_BOOT -2 6#define TICK_DO_TIMER_BOOT -2
7 7
8DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
9extern spinlock_t tick_device_lock;
10extern ktime_t tick_next_period; 9extern ktime_t tick_next_period;
11extern ktime_t tick_period; 10extern ktime_t tick_period;
12extern int tick_do_timer_cpu __read_mostly; 11extern int tick_do_timer_cpu __read_mostly;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index a96c0e2b89c..0a8a213016f 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -50,9 +50,9 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51 51
52 printk(KERN_WARNING 52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n", 53 "CE: %s increasing min_delta_ns to %llu nsec\n",
54 dev->name ? dev->name : "?", 54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1); 55 (unsigned long long) dev->min_delta_ns << 1);
56 56
57 i = 0; 57 i = 0;
58 } 58 }
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 89aed5933ed..f992762d7f5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,18 +134,13 @@ __setup("nohz=", setup_tick_nohz);
134 * value. We do this unconditionally on any cpu, as we don't know whether the 134 * value. We do this unconditionally on any cpu, as we don't know whether the
135 * cpu, which has the update task assigned is in a long sleep. 135 * cpu, which has the update task assigned is in a long sleep.
136 */ 136 */
137static void tick_nohz_update_jiffies(void) 137static void tick_nohz_update_jiffies(ktime_t now)
138{ 138{
139 int cpu = smp_processor_id(); 139 int cpu = smp_processor_id();
140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 140 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
141 unsigned long flags; 141 unsigned long flags;
142 ktime_t now;
143
144 if (!ts->tick_stopped)
145 return;
146 142
147 cpumask_clear_cpu(cpu, nohz_cpu_mask); 143 cpumask_clear_cpu(cpu, nohz_cpu_mask);
148 now = ktime_get();
149 ts->idle_waketime = now; 144 ts->idle_waketime = now;
150 145
151 local_irq_save(flags); 146 local_irq_save(flags);
@@ -155,20 +150,17 @@ static void tick_nohz_update_jiffies(void)
155 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
156} 151}
157 152
158static void tick_nohz_stop_idle(int cpu) 153static void tick_nohz_stop_idle(int cpu, ktime_t now)
159{ 154{
160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
161 157
162 if (ts->idle_active) { 158 delta = ktime_sub(now, ts->idle_entrytime);
163 ktime_t now, delta; 159 ts->idle_lastupdate = now;
164 now = ktime_get(); 160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
165 delta = ktime_sub(now, ts->idle_entrytime); 161 ts->idle_active = 0;
166 ts->idle_lastupdate = now;
167 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
168 ts->idle_active = 0;
169 162
170 sched_clock_idle_wakeup_event(0); 163 sched_clock_idle_wakeup_event(0);
171 }
172} 164}
173 165
174static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 166static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
@@ -216,6 +208,7 @@ void tick_nohz_stop_sched_tick(int inidle)
216 struct tick_sched *ts; 208 struct tick_sched *ts;
217 ktime_t last_update, expires, now; 209 ktime_t last_update, expires, now;
218 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 210 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
211 u64 time_delta;
219 int cpu; 212 int cpu;
220 213
221 local_irq_save(flags); 214 local_irq_save(flags);
@@ -263,7 +256,7 @@ void tick_nohz_stop_sched_tick(int inidle)
263 256
264 if (ratelimit < 10) { 257 if (ratelimit < 10) {
265 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", 258 printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n",
266 local_softirq_pending()); 259 (unsigned int) local_softirq_pending());
267 ratelimit++; 260 ratelimit++;
268 } 261 }
269 goto end; 262 goto end;
@@ -275,14 +268,18 @@ void tick_nohz_stop_sched_tick(int inidle)
275 seq = read_seqbegin(&xtime_lock); 268 seq = read_seqbegin(&xtime_lock);
276 last_update = last_jiffies_update; 269 last_update = last_jiffies_update;
277 last_jiffies = jiffies; 270 last_jiffies = jiffies;
271 time_delta = timekeeping_max_deferment();
278 } while (read_seqretry(&xtime_lock, seq)); 272 } while (read_seqretry(&xtime_lock, seq));
279 273
280 /* Get the next timer wheel timer */ 274 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
281 next_jiffies = get_next_timer_interrupt(last_jiffies); 275 arch_needs_cpu(cpu)) {
282 delta_jiffies = next_jiffies - last_jiffies; 276 next_jiffies = last_jiffies + 1;
283
284 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
285 delta_jiffies = 1; 277 delta_jiffies = 1;
278 } else {
279 /* Get the next timer wheel timer */
280 next_jiffies = get_next_timer_interrupt(last_jiffies);
281 delta_jiffies = next_jiffies - last_jiffies;
282 }
286 /* 283 /*
287 * Do not stop the tick, if we are only one off 284 * Do not stop the tick, if we are only one off
288 * or if the cpu is required for rcu 285 * or if the cpu is required for rcu
@@ -294,22 +291,51 @@ void tick_nohz_stop_sched_tick(int inidle)
294 if ((long)delta_jiffies >= 1) { 291 if ((long)delta_jiffies >= 1) {
295 292
296 /* 293 /*
297 * calculate the expiry time for the next timer wheel
298 * timer
299 */
300 expires = ktime_add_ns(last_update, tick_period.tv64 *
301 delta_jiffies);
302
303 /*
304 * If this cpu is the one which updates jiffies, then 294 * If this cpu is the one which updates jiffies, then
305 * give up the assignment and let it be taken by the 295 * give up the assignment and let it be taken by the
306 * cpu which runs the tick timer next, which might be 296 * cpu which runs the tick timer next, which might be
307 * this cpu as well. If we don't drop this here the 297 * this cpu as well. If we don't drop this here the
308 * jiffies might be stale and do_timer() never 298 * jiffies might be stale and do_timer() never
309 * invoked. 299 * invoked. Keep track of the fact that it was the one
300 * which had the do_timer() duty last. If this cpu is
301 * the one which had the do_timer() duty last, we
302 * limit the sleep time to the timekeeping
303 * max_deferement value which we retrieved
304 * above. Otherwise we can sleep as long as we want.
310 */ 305 */
311 if (cpu == tick_do_timer_cpu) 306 if (cpu == tick_do_timer_cpu) {
312 tick_do_timer_cpu = TICK_DO_TIMER_NONE; 307 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
308 ts->do_timer_last = 1;
309 } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
310 time_delta = KTIME_MAX;
311 ts->do_timer_last = 0;
312 } else if (!ts->do_timer_last) {
313 time_delta = KTIME_MAX;
314 }
315
316 /*
317 * calculate the expiry time for the next timer wheel
318 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
319 * that there is no timer pending or at least extremely
320 * far into the future (12 days for HZ=1000). In this
321 * case we set the expiry to the end of time.
322 */
323 if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
324 /*
325 * Calculate the time delta for the next timer event.
326 * If the time delta exceeds the maximum time delta
327 * permitted by the current clocksource then adjust
328 * the time delta accordingly to ensure the
329 * clocksource does not wrap.
330 */
331 time_delta = min_t(u64, time_delta,
332 tick_period.tv64 * delta_jiffies);
333 }
334
335 if (time_delta < KTIME_MAX)
336 expires = ktime_add_ns(last_update, time_delta);
337 else
338 expires.tv64 = KTIME_MAX;
313 339
314 if (delta_jiffies > 1) 340 if (delta_jiffies > 1)
315 cpumask_set_cpu(cpu, nohz_cpu_mask); 341 cpumask_set_cpu(cpu, nohz_cpu_mask);
@@ -342,22 +368,19 @@ void tick_nohz_stop_sched_tick(int inidle)
342 368
343 ts->idle_sleeps++; 369 ts->idle_sleeps++;
344 370
371 /* Mark expires */
372 ts->idle_expires = expires;
373
345 /* 374 /*
346 * delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that 375 * If the expiration time == KTIME_MAX, then
347 * there is no timer pending or at least extremly far 376 * in this case we simply stop the tick timer.
348 * into the future (12 days for HZ=1000). In this case
349 * we simply stop the tick timer:
350 */ 377 */
351 if (unlikely(delta_jiffies >= NEXT_TIMER_MAX_DELTA)) { 378 if (unlikely(expires.tv64 == KTIME_MAX)) {
352 ts->idle_expires.tv64 = KTIME_MAX;
353 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) 379 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
354 hrtimer_cancel(&ts->sched_timer); 380 hrtimer_cancel(&ts->sched_timer);
355 goto out; 381 goto out;
356 } 382 }
357 383
358 /* Mark expiries */
359 ts->idle_expires = expires;
360
361 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { 384 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
362 hrtimer_start(&ts->sched_timer, expires, 385 hrtimer_start(&ts->sched_timer, expires,
363 HRTIMER_MODE_ABS_PINNED); 386 HRTIMER_MODE_ABS_PINNED);
@@ -436,7 +459,11 @@ void tick_nohz_restart_sched_tick(void)
436 ktime_t now; 459 ktime_t now;
437 460
438 local_irq_disable(); 461 local_irq_disable();
439 tick_nohz_stop_idle(cpu); 462 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
463 now = ktime_get();
464
465 if (ts->idle_active)
466 tick_nohz_stop_idle(cpu, now);
440 467
441 if (!ts->inidle || !ts->tick_stopped) { 468 if (!ts->inidle || !ts->tick_stopped) {
442 ts->inidle = 0; 469 ts->inidle = 0;
@@ -450,7 +477,6 @@ void tick_nohz_restart_sched_tick(void)
450 477
451 /* Update jiffies first */ 478 /* Update jiffies first */
452 select_nohz_load_balancer(0); 479 select_nohz_load_balancer(0);
453 now = ktime_get();
454 tick_do_update_jiffies64(now); 480 tick_do_update_jiffies64(now);
455 cpumask_clear_cpu(cpu, nohz_cpu_mask); 481 cpumask_clear_cpu(cpu, nohz_cpu_mask);
456 482
@@ -584,22 +610,18 @@ static void tick_nohz_switch_to_nohz(void)
584 * timer and do not touch the other magic bits which need to be done 610 * timer and do not touch the other magic bits which need to be done
585 * when idle is left. 611 * when idle is left.
586 */ 612 */
587static void tick_nohz_kick_tick(int cpu) 613static void tick_nohz_kick_tick(int cpu, ktime_t now)
588{ 614{
589#if 0 615#if 0
590 /* Switch back to 2.6.27 behaviour */ 616 /* Switch back to 2.6.27 behaviour */
591 617
592 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 618 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
593 ktime_t delta, now; 619 ktime_t delta;
594
595 if (!ts->tick_stopped)
596 return;
597 620
598 /* 621 /*
599 * Do not touch the tick device, when the next expiry is either 622 * Do not touch the tick device, when the next expiry is either
600 * already reached or less/equal than the tick period. 623 * already reached or less/equal than the tick period.
601 */ 624 */
602 now = ktime_get();
603 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); 625 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
604 if (delta.tv64 <= tick_period.tv64) 626 if (delta.tv64 <= tick_period.tv64)
605 return; 627 return;
@@ -608,9 +630,26 @@ static void tick_nohz_kick_tick(int cpu)
608#endif 630#endif
609} 631}
610 632
633static inline void tick_check_nohz(int cpu)
634{
635 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
636 ktime_t now;
637
638 if (!ts->idle_active && !ts->tick_stopped)
639 return;
640 now = ktime_get();
641 if (ts->idle_active)
642 tick_nohz_stop_idle(cpu, now);
643 if (ts->tick_stopped) {
644 tick_nohz_update_jiffies(now);
645 tick_nohz_kick_tick(cpu, now);
646 }
647}
648
611#else 649#else
612 650
613static inline void tick_nohz_switch_to_nohz(void) { } 651static inline void tick_nohz_switch_to_nohz(void) { }
652static inline void tick_check_nohz(int cpu) { }
614 653
615#endif /* NO_HZ */ 654#endif /* NO_HZ */
616 655
@@ -620,11 +659,7 @@ static inline void tick_nohz_switch_to_nohz(void) { }
620void tick_check_idle(int cpu) 659void tick_check_idle(int cpu)
621{ 660{
622 tick_check_oneshot_broadcast(cpu); 661 tick_check_oneshot_broadcast(cpu);
623#ifdef CONFIG_NO_HZ 662 tick_check_nohz(cpu);
624 tick_nohz_stop_idle(cpu);
625 tick_nohz_update_jiffies();
626 tick_nohz_kick_tick(cpu);
627#endif
628} 663}
629 664
630/* 665/*
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 96ff643a5a5..12f5c55090b 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -89,7 +89,7 @@ int timecompare_offset(struct timecompare *sync,
89 * source time 89 * source time
90 */ 90 */
91 sample.offset = 91 sample.offset =
92 ktime_to_ns(ktime_add(end, start)) / 2 - 92 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
93 ts; 93 ts;
94 94
95 /* simple insertion sort based on duration */ 95 /* simple insertion sort based on duration */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3a4e2907ea..e2ab064c6d4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -177,7 +177,7 @@ void timekeeping_leap_insert(int leapsecond)
177{ 177{
178 xtime.tv_sec += leapsecond; 178 xtime.tv_sec += leapsecond;
179 wall_to_monotonic.tv_sec -= leapsecond; 179 wall_to_monotonic.tv_sec -= leapsecond;
180 update_vsyscall(&xtime, timekeeper.clock); 180 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
181} 181}
182 182
183#ifdef CONFIG_GENERIC_TIME 183#ifdef CONFIG_GENERIC_TIME
@@ -337,7 +337,7 @@ int do_settimeofday(struct timespec *tv)
337 timekeeper.ntp_error = 0; 337 timekeeper.ntp_error = 0;
338 ntp_clear(); 338 ntp_clear();
339 339
340 update_vsyscall(&xtime, timekeeper.clock); 340 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
341 341
342 write_sequnlock_irqrestore(&xtime_lock, flags); 342 write_sequnlock_irqrestore(&xtime_lock, flags);
343 343
@@ -488,6 +488,17 @@ int timekeeping_valid_for_hres(void)
488} 488}
489 489
490/** 490/**
491 * timekeeping_max_deferment - Returns max time the clocksource can be deferred
492 *
493 * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
494 * ensure that the clocksource does not change!
495 */
496u64 timekeeping_max_deferment(void)
497{
498 return timekeeper.clock->max_idle_ns;
499}
500
501/**
491 * read_persistent_clock - Return time from the persistent clock. 502 * read_persistent_clock - Return time from the persistent clock.
492 * 503 *
493 * Weak dummy function for arches that do not yet support it. 504 * Weak dummy function for arches that do not yet support it.
@@ -722,6 +733,51 @@ static void timekeeping_adjust(s64 offset)
722 timekeeper.ntp_error_shift; 733 timekeeper.ntp_error_shift;
723} 734}
724 735
736
737/**
738 * logarithmic_accumulation - shifted accumulation of cycles
739 *
740 * This functions accumulates a shifted interval of cycles into
741 * into a shifted interval nanoseconds. Allows for O(log) accumulation
742 * loop.
743 *
744 * Returns the unconsumed cycles.
745 */
746static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
747{
748 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
749
750 /* If the offset is smaller then a shifted interval, do nothing */
751 if (offset < timekeeper.cycle_interval<<shift)
752 return offset;
753
754 /* Accumulate one shifted interval */
755 offset -= timekeeper.cycle_interval << shift;
756 timekeeper.clock->cycle_last += timekeeper.cycle_interval << shift;
757
758 timekeeper.xtime_nsec += timekeeper.xtime_interval << shift;
759 while (timekeeper.xtime_nsec >= nsecps) {
760 timekeeper.xtime_nsec -= nsecps;
761 xtime.tv_sec++;
762 second_overflow();
763 }
764
765 /* Accumulate into raw time */
766 raw_time.tv_nsec += timekeeper.raw_interval << shift;;
767 while (raw_time.tv_nsec >= NSEC_PER_SEC) {
768 raw_time.tv_nsec -= NSEC_PER_SEC;
769 raw_time.tv_sec++;
770 }
771
772 /* Accumulate error between NTP and clock interval */
773 timekeeper.ntp_error += tick_length << shift;
774 timekeeper.ntp_error -= timekeeper.xtime_interval <<
775 (timekeeper.ntp_error_shift + shift);
776
777 return offset;
778}
779
780
725/** 781/**
726 * update_wall_time - Uses the current clocksource to increment the wall time 782 * update_wall_time - Uses the current clocksource to increment the wall time
727 * 783 *
@@ -732,6 +788,7 @@ void update_wall_time(void)
732 struct clocksource *clock; 788 struct clocksource *clock;
733 cycle_t offset; 789 cycle_t offset;
734 u64 nsecs; 790 u64 nsecs;
791 int shift = 0, maxshift;
735 792
736 /* Make sure we're fully resumed: */ 793 /* Make sure we're fully resumed: */
737 if (unlikely(timekeeping_suspended)) 794 if (unlikely(timekeeping_suspended))
@@ -745,33 +802,22 @@ void update_wall_time(void)
745#endif 802#endif
746 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; 803 timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift;
747 804
748 /* normally this loop will run just once, however in the 805 /*
749 * case of lost or late ticks, it will accumulate correctly. 806 * With NO_HZ we may have to accumulate many cycle_intervals
807 * (think "ticks") worth of time at once. To do this efficiently,
808 * we calculate the largest doubling multiple of cycle_intervals
809 * that is smaller then the offset. We then accumulate that
810 * chunk in one go, and then try to consume the next smaller
811 * doubled multiple.
750 */ 812 */
813 shift = ilog2(offset) - ilog2(timekeeper.cycle_interval);
814 shift = max(0, shift);
815 /* Bound shift to one less then what overflows tick_length */
816 maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1;
817 shift = min(shift, maxshift);
751 while (offset >= timekeeper.cycle_interval) { 818 while (offset >= timekeeper.cycle_interval) {
752 u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; 819 offset = logarithmic_accumulation(offset, shift);
753 820 shift--;
754 /* accumulate one interval */
755 offset -= timekeeper.cycle_interval;
756 clock->cycle_last += timekeeper.cycle_interval;
757
758 timekeeper.xtime_nsec += timekeeper.xtime_interval;
759 if (timekeeper.xtime_nsec >= nsecps) {
760 timekeeper.xtime_nsec -= nsecps;
761 xtime.tv_sec++;
762 second_overflow();
763 }
764
765 raw_time.tv_nsec += timekeeper.raw_interval;
766 if (raw_time.tv_nsec >= NSEC_PER_SEC) {
767 raw_time.tv_nsec -= NSEC_PER_SEC;
768 raw_time.tv_sec++;
769 }
770
771 /* accumulate error between NTP and clock interval */
772 timekeeper.ntp_error += tick_length;
773 timekeeper.ntp_error -= timekeeper.xtime_interval <<
774 timekeeper.ntp_error_shift;
775 } 821 }
776 822
777 /* correct the clock when NTP error is too big */ 823 /* correct the clock when NTP error is too big */
@@ -811,7 +857,7 @@ void update_wall_time(void)
811 update_xtime_cache(nsecs); 857 update_xtime_cache(nsecs);
812 858
813 /* check to see if there is a new clocksource to use */ 859 /* check to see if there is a new clocksource to use */
814 update_vsyscall(&xtime, timekeeper.clock); 860 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
815} 861}
816 862
817/** 863/**
@@ -834,6 +880,7 @@ void getboottime(struct timespec *ts)
834 880
835 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
836} 882}
883EXPORT_SYMBOL_GPL(getboottime);
837 884
838/** 885/**
839 * monotonic_to_bootbased - Convert the monotonic time to boot based. 886 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -843,6 +890,7 @@ void monotonic_to_bootbased(struct timespec *ts)
843{ 890{
844 *ts = timespec_add_safe(*ts, total_sleep_time); 891 *ts = timespec_add_safe(*ts, total_sleep_time);
845} 892}
893EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
846 894
847unsigned long get_seconds(void) 895unsigned long get_seconds(void)
848{ 896{
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1b5b7aa2fdf..bdfb8dd1050 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = base->first;
90 /* 90 /*
@@ -100,13 +100,13 @@ next_one:
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = rb_entry(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
105 print_timer(m, timer, &tmp, i, now); 105 print_timer(m, timer, &tmp, i, now);
106 next++; 106 next++;
107 goto next_one; 107 goto next_one;
108 } 108 }
109 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 109 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
110} 110}
111 111
112static void 112static void
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 P_ns(expires_next); 150 P_ns(expires_next);
151 P(hres_active); 151 P(hres_active);
152 P(nr_events); 152 P(nr_events);
153 P(nr_retries);
154 P(nr_hangs);
155 P_ns(max_hang_time);
153#endif 156#endif
154#undef P 157#undef P
155#undef P_ns 158#undef P_ns
@@ -204,10 +207,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
204 return; 207 return;
205 } 208 }
206 SEQ_printf(m, "%s\n", dev->name); 209 SEQ_printf(m, "%s\n", dev->name);
207 SEQ_printf(m, " max_delta_ns: %lu\n", dev->max_delta_ns); 210 SEQ_printf(m, " max_delta_ns: %llu\n",
208 SEQ_printf(m, " min_delta_ns: %lu\n", dev->min_delta_ns); 211 (unsigned long long) dev->max_delta_ns);
209 SEQ_printf(m, " mult: %lu\n", dev->mult); 212 SEQ_printf(m, " min_delta_ns: %llu\n",
210 SEQ_printf(m, " shift: %d\n", dev->shift); 213 (unsigned long long) dev->min_delta_ns);
214 SEQ_printf(m, " mult: %u\n", dev->mult);
215 SEQ_printf(m, " shift: %u\n", dev->shift);
211 SEQ_printf(m, " mode: %d\n", dev->mode); 216 SEQ_printf(m, " mode: %d\n", dev->mode);
212 SEQ_printf(m, " next_event: %Ld nsecs\n", 217 SEQ_printf(m, " next_event: %Ld nsecs\n",
213 (unsigned long long) ktime_to_ns(dev->next_event)); 218 (unsigned long long) ktime_to_ns(dev->next_event));
@@ -232,10 +237,10 @@ static void timer_list_show_tickdevices(struct seq_file *m)
232#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 237#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
233 print_tickdevice(m, tick_get_broadcast_device(), -1); 238 print_tickdevice(m, tick_get_broadcast_device(), -1);
234 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 239 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
235 tick_get_broadcast_mask()->bits[0]); 240 cpumask_bits(tick_get_broadcast_mask())[0]);
236#ifdef CONFIG_TICK_ONESHOT 241#ifdef CONFIG_TICK_ONESHOT
237 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", 242 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
238 tick_get_broadcast_oneshot_mask()->bits[0]); 243 cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
239#endif 244#endif
240 SEQ_printf(m, "\n"); 245 SEQ_printf(m, "\n");
241#endif 246#endif
@@ -252,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
252 u64 now = ktime_to_ns(ktime_get()); 257 u64 now = ktime_to_ns(ktime_get());
253 int cpu; 258 int cpu;
254 259
255 SEQ_printf(m, "Timer List Version: v0.4\n"); 260 SEQ_printf(m, "Timer List Version: v0.5\n");
256 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
257 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
258 263
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index ee5681f8d7e..2f3b585b8d7 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock);
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
88 */ 88 */
89static DEFINE_PER_CPU(spinlock_t, lookup_lock); 89static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
90 90
91/* 91/*
92 * Mutex to serialize state changes with show-stats activities: 92 * Mutex to serialize state changes with show-stats activities:
@@ -238,14 +238,14 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesnt matter which lock we take:
240 */ 240 */
241 spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!timer_stats_active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
249 249
250 input.timer = timer; 250 input.timer = timer;
251 input.start_func = startf; 251 input.start_func = startf;
@@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
253 input.pid = pid; 253 input.pid = pid;
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 raw_spin_lock_irqsave(lock, flags);
257 if (!timer_stats_active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
@@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
264 atomic_inc(&overflow_count); 264 atomic_inc(&overflow_count);
265 265
266 out_unlock: 266 out_unlock:
267 spin_unlock_irqrestore(lock, flags); 267 raw_spin_unlock_irqrestore(lock, flags);
268} 268}
269 269
270static void print_name_offset(struct seq_file *m, unsigned long addr) 270static void print_name_offset(struct seq_file *m, unsigned long addr)
@@ -348,9 +348,11 @@ static void sync_access(void)
348 int cpu; 348 int cpu;
349 349
350 for_each_online_cpu(cpu) { 350 for_each_online_cpu(cpu) {
351 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); 351 raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
352
353 raw_spin_lock_irqsave(lock, flags);
352 /* nothing */ 354 /* nothing */
353 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); 355 raw_spin_unlock_irqrestore(lock, flags);
354 } 356 }
355} 357}
356 358
@@ -408,7 +410,7 @@ void __init init_timer_stats(void)
408 int cpu; 410 int cpu;
409 411
410 for_each_possible_cpu(cpu) 412 for_each_possible_cpu(cpu)
411 spin_lock_init(&per_cpu(lookup_lock, cpu)); 413 raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
412} 414}
413 415
414static int __init init_tstats_procfs(void) 416static int __init init_tstats_procfs(void)
diff --git a/kernel/timer.c b/kernel/timer.c
index 5db5a8d2681..c61a7949387 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -656,8 +656,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
656 656
657 debug_activate(timer, expires); 657 debug_activate(timer, expires);
658 658
659 new_base = __get_cpu_var(tvec_bases);
660
661 cpu = smp_processor_id(); 659 cpu = smp_processor_id();
662 660
663#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 661#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
@@ -1200,6 +1198,7 @@ void update_process_times(int user_tick)
1200 run_local_timers(); 1198 run_local_timers();
1201 rcu_check_callbacks(cpu, user_tick); 1199 rcu_check_callbacks(cpu, user_tick);
1202 printk_tick(); 1200 printk_tick();
1201 perf_event_do_pending();
1203 scheduler_tick(); 1202 scheduler_tick();
1204 run_posix_cpu_timers(p); 1203 run_posix_cpu_timers(p);
1205} 1204}
@@ -1211,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h)
1211{ 1210{
1212 struct tvec_base *base = __get_cpu_var(tvec_bases); 1211 struct tvec_base *base = __get_cpu_var(tvec_bases);
1213 1212
1214 perf_event_do_pending();
1215
1216 hrtimer_run_pending(); 1213 hrtimer_run_pending();
1217 1214
1218 if (time_after_eq(jiffies, base->timer_jiffies)) 1215 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d006554888d..60e2ce0181e 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,39 +12,37 @@ config NOP_TRACER
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help 14 help
15 See Documentation/trace/ftrace-implementation.txt 15 See Documentation/trace/ftrace-design.txt
16 16
17config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
18 bool 18 bool
19 help 19 help
20 See Documentation/trace/ftrace-implementation.txt 20 See Documentation/trace/ftrace-design.txt
21 21
22config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
23 bool 23 bool
24 help 24 help
25 See Documentation/trace/ftrace-implementation.txt 25 See Documentation/trace/ftrace-design.txt
26 26
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
36 help 34 help
37 See Documentation/trace/ftrace-implementation.txt 35 See Documentation/trace/ftrace-design.txt
38 36
39config HAVE_DYNAMIC_FTRACE 37config HAVE_DYNAMIC_FTRACE
40 bool 38 bool
41 help 39 help
42 See Documentation/trace/ftrace-implementation.txt 40 See Documentation/trace/ftrace-design.txt
43 41
44config HAVE_FTRACE_MCOUNT_RECORD 42config HAVE_FTRACE_MCOUNT_RECORD
45 bool 43 bool
46 help 44 help
47 See Documentation/trace/ftrace-implementation.txt 45 See Documentation/trace/ftrace-design.txt
48 46
49config HAVE_HW_BRANCH_TRACER 47config HAVE_HW_BRANCH_TRACER
50 bool 48 bool
@@ -52,7 +50,7 @@ config HAVE_HW_BRANCH_TRACER
52config HAVE_SYSCALL_TRACEPOINTS 50config HAVE_SYSCALL_TRACEPOINTS
53 bool 51 bool
54 help 52 help
55 See Documentation/trace/ftrace-implementation.txt 53 See Documentation/trace/ftrace-design.txt
56 54
57config TRACER_MAX_TRACE 55config TRACER_MAX_TRACE
58 bool 56 bool
@@ -83,7 +81,7 @@ config RING_BUFFER_ALLOW_SWAP
83# This allows those options to appear when no other tracer is selected. But the 81# This allows those options to appear when no other tracer is selected. But the
84# options do not appear when something else selects it. We need the two options 82# options do not appear when something else selects it. We need the two options
85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 83# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
86# hidding of the automatic options. 84# hiding of the automatic options.
87 85
88config TRACING 86config TRACING
89 bool 87 bool
@@ -119,7 +117,7 @@ menuconfig FTRACE
119 bool "Tracers" 117 bool "Tracers"
120 default y if DEBUG_KERNEL 118 default y if DEBUG_KERNEL
121 help 119 help
122 Enable the kernel tracing infrastructure. 120 Enable the kernel tracing infrastructure.
123 121
124if FTRACE 122if FTRACE
125 123
@@ -133,7 +131,7 @@ config FUNCTION_TRACER
133 help 131 help
134 Enable the kernel to trace every kernel function. This is done 132 Enable the kernel to trace every kernel function. This is done
135 by using a compiler feature to insert a small, 5-byte No-Operation 133 by using a compiler feature to insert a small, 5-byte No-Operation
136 instruction to the beginning of every kernel function, which NOP 134 instruction at the beginning of every kernel function, which NOP
137 sequence is then dynamically patched into a tracer call when 135 sequence is then dynamically patched into a tracer call when
138 tracing is enabled by the administrator. If it's runtime disabled 136 tracing is enabled by the administrator. If it's runtime disabled
139 (the bootup default), then the overhead of the instructions is very 137 (the bootup default), then the overhead of the instructions is very
@@ -150,7 +148,7 @@ config FUNCTION_GRAPH_TRACER
150 and its entry. 148 and its entry.
151 Its first purpose is to trace the duration of functions and 149 Its first purpose is to trace the duration of functions and
152 draw a call graph for each thread with some information like 150 draw a call graph for each thread with some information like
153 the return value. This is done by setting the current return 151 the return value. This is done by setting the current return
154 address on the current task structure into a stack of calls. 152 address on the current task structure into a stack of calls.
155 153
156 154
@@ -173,7 +171,7 @@ config IRQSOFF_TRACER
173 171
174 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 172 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
175 173
176 (Note that kernel size and overhead increases with this option 174 (Note that kernel size and overhead increase with this option
177 enabled. This option and the preempt-off timing option can be 175 enabled. This option and the preempt-off timing option can be
178 used together or separately.) 176 used together or separately.)
179 177
@@ -186,7 +184,7 @@ config PREEMPT_TRACER
186 select TRACER_MAX_TRACE 184 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP 185 select RING_BUFFER_ALLOW_SWAP
188 help 186 help
189 This option measures the time spent in preemption off critical 187 This option measures the time spent in preemption-off critical
190 sections, with microsecond accuracy. 188 sections, with microsecond accuracy.
191 189
192 The default measurement method is a maximum search, which is 190 The default measurement method is a maximum search, which is
@@ -195,7 +193,7 @@ config PREEMPT_TRACER
195 193
196 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 194 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
197 195
198 (Note that kernel size and overhead increases with this option 196 (Note that kernel size and overhead increase with this option
199 enabled. This option and the irqs-off timing option can be 197 enabled. This option and the irqs-off timing option can be
200 used together or separately.) 198 used together or separately.)
201 199
@@ -222,7 +220,7 @@ config ENABLE_DEFAULT_TRACERS
222 depends on !GENERIC_TRACER 220 depends on !GENERIC_TRACER
223 select TRACING 221 select TRACING
224 help 222 help
225 This tracer hooks to various trace points in the kernel 223 This tracer hooks to various trace points in the kernel,
226 allowing the user to pick and choose which trace point they 224 allowing the user to pick and choose which trace point they
227 want to trace. It also includes the sched_switch tracer plugin. 225 want to trace. It also includes the sched_switch tracer plugin.
228 226
@@ -265,19 +263,19 @@ choice
265 The likely/unlikely profiler only looks at the conditions that 263 The likely/unlikely profiler only looks at the conditions that
266 are annotated with a likely or unlikely macro. 264 are annotated with a likely or unlikely macro.
267 265
268 The "all branch" profiler will profile every if statement in the 266 The "all branch" profiler will profile every if-statement in the
269 kernel. This profiler will also enable the likely/unlikely 267 kernel. This profiler will also enable the likely/unlikely
270 profiler as well. 268 profiler.
271 269
272 Either of the above profilers add a bit of overhead to the system. 270 Either of the above profilers adds a bit of overhead to the system.
273 If unsure choose "No branch profiling". 271 If unsure, choose "No branch profiling".
274 272
275config BRANCH_PROFILE_NONE 273config BRANCH_PROFILE_NONE
276 bool "No branch profiling" 274 bool "No branch profiling"
277 help 275 help
278 No branch profiling. Branch profiling adds a bit of overhead. 276 No branch profiling. Branch profiling adds a bit of overhead.
279 Only enable it if you want to analyse the branching behavior. 277 Only enable it if you want to analyse the branching behavior.
280 Otherwise keep it disabled. 278 Otherwise keep it disabled.
281 279
282config PROFILE_ANNOTATED_BRANCHES 280config PROFILE_ANNOTATED_BRANCHES
283 bool "Trace likely/unlikely profiler" 281 bool "Trace likely/unlikely profiler"
@@ -288,7 +286,7 @@ config PROFILE_ANNOTATED_BRANCHES
288 286
289 /sys/kernel/debug/tracing/profile_annotated_branch 287 /sys/kernel/debug/tracing/profile_annotated_branch
290 288
291 Note: this will add a significant overhead, only turn this 289 Note: this will add a significant overhead; only turn this
292 on if you need to profile the system's use of these macros. 290 on if you need to profile the system's use of these macros.
293 291
294config PROFILE_ALL_BRANCHES 292config PROFILE_ALL_BRANCHES
@@ -305,7 +303,7 @@ config PROFILE_ALL_BRANCHES
305 303
306 This configuration, when enabled, will impose a great overhead 304 This configuration, when enabled, will impose a great overhead
307 on the system. This should only be enabled when the system 305 on the system. This should only be enabled when the system
308 is to be analyzed 306 is to be analyzed in much detail.
309endchoice 307endchoice
310 308
311config TRACING_BRANCHES 309config TRACING_BRANCHES
@@ -335,7 +333,7 @@ config POWER_TRACER
335 depends on X86 333 depends on X86
336 select GENERIC_TRACER 334 select GENERIC_TRACER
337 help 335 help
338 This tracer helps developers to analyze and optimize the kernels 336 This tracer helps developers to analyze and optimize the kernel's
339 power management decisions, specifically the C-state and P-state 337 power management decisions, specifically the C-state and P-state
340 behavior. 338 behavior.
341 339
@@ -391,14 +389,14 @@ config HW_BRANCH_TRACER
391 select GENERIC_TRACER 389 select GENERIC_TRACER
392 help 390 help
393 This tracer records all branches on the system in a circular 391 This tracer records all branches on the system in a circular
394 buffer giving access to the last N branches for each cpu. 392 buffer, giving access to the last N branches for each cpu.
395 393
396config KMEMTRACE 394config KMEMTRACE
397 bool "Trace SLAB allocations" 395 bool "Trace SLAB allocations"
398 select GENERIC_TRACER 396 select GENERIC_TRACER
399 help 397 help
400 kmemtrace provides tracing for slab allocator functions, such as 398 kmemtrace provides tracing for slab allocator functions, such as
401 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 399 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
402 data is then fed to the userspace application in order to analyse 400 data is then fed to the userspace application in order to analyse
403 allocation hotspots, internal fragmentation and so on, making it 401 allocation hotspots, internal fragmentation and so on, making it
404 possible to see how well an allocator performs, as well as debug 402 possible to see how well an allocator performs, as well as debug
@@ -417,15 +415,15 @@ config WORKQUEUE_TRACER
417 bool "Trace workqueues" 415 bool "Trace workqueues"
418 select GENERIC_TRACER 416 select GENERIC_TRACER
419 help 417 help
420 The workqueue tracer provides some statistical informations 418 The workqueue tracer provides some statistical information
421 about each cpu workqueue thread such as the number of the 419 about each cpu workqueue thread such as the number of the
422 works inserted and executed since their creation. It can help 420 works inserted and executed since their creation. It can help
423 to evaluate the amount of work each of them have to perform. 421 to evaluate the amount of work each of them has to perform.
424 For example it can help a developer to decide whether he should 422 For example it can help a developer to decide whether he should
425 choose a per cpu workqueue instead of a singlethreaded one. 423 choose a per-cpu workqueue instead of a singlethreaded one.
426 424
427config BLK_DEV_IO_TRACE 425config BLK_DEV_IO_TRACE
428 bool "Support for tracing block io actions" 426 bool "Support for tracing block IO actions"
429 depends on SYSFS 427 depends on SYSFS
430 depends on BLOCK 428 depends on BLOCK
431 select RELAY 429 select RELAY
@@ -456,15 +454,15 @@ config KPROBE_EVENT
456 select TRACING 454 select TRACING
457 default y 455 default y
458 help 456 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly 457 This allows the user to add tracing events (similar to tracepoints)
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt 458 on the fly via the ftrace interface. See
461 for more details. 459 Documentation/trace/kprobetrace.txt for more details.
462 460
463 Those events can be inserted wherever kprobes can probe, and record 461 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values. 462 various register and memory values.
465 463
466 This option is also required by perf-probe subcommand of perf tools. If 464 This option is also required by perf-probe subcommand of perf tools.
467 you want to use perf tools, this option is strongly recommended. 465 If you want to use perf tools, this option is strongly recommended.
468 466
469config DYNAMIC_FTRACE 467config DYNAMIC_FTRACE
470 bool "enable/disable ftrace tracepoints dynamically" 468 bool "enable/disable ftrace tracepoints dynamically"
@@ -472,32 +470,32 @@ config DYNAMIC_FTRACE
472 depends on HAVE_DYNAMIC_FTRACE 470 depends on HAVE_DYNAMIC_FTRACE
473 default y 471 default y
474 help 472 help
475 This option will modify all the calls to ftrace dynamically 473 This option will modify all the calls to ftrace dynamically
476 (will patch them out of the binary image and replaces them 474 (will patch them out of the binary image and replace them
477 with a No-Op instruction) as they are called. A table is 475 with a No-Op instruction) as they are called. A table is
478 created to dynamically enable them again. 476 created to dynamically enable them again.
479 477
480 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise 478 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
481 has native performance as long as no tracing is active. 479 otherwise has native performance as long as no tracing is active.
482 480
483 The changes to the code are done by a kernel thread that 481 The changes to the code are done by a kernel thread that
484 wakes up once a second and checks to see if any ftrace calls 482 wakes up once a second and checks to see if any ftrace calls
485 were made. If so, it runs stop_machine (stops all CPUS) 483 were made. If so, it runs stop_machine (stops all CPUS)
486 and modifies the code to jump over the call to ftrace. 484 and modifies the code to jump over the call to ftrace.
487 485
488config FUNCTION_PROFILER 486config FUNCTION_PROFILER
489 bool "Kernel function profiler" 487 bool "Kernel function profiler"
490 depends on FUNCTION_TRACER 488 depends on FUNCTION_TRACER
491 default n 489 default n
492 help 490 help
493 This option enables the kernel function profiler. A file is created 491 This option enables the kernel function profiler. A file is created
494 in debugfs called function_profile_enabled which defaults to zero. 492 in debugfs called function_profile_enabled which defaults to zero.
495 When a 1 is echoed into this file profiling begins, and when a 493 When a 1 is echoed into this file profiling begins, and when a
496 zero is entered, profiling stops. A file in the trace_stats 494 zero is entered, profiling stops. A "functions" file is created in
497 directory called functions, that show the list of functions that 495 the trace_stats directory; this file shows the list of functions that
498 have been hit and their counters. 496 have been hit and their counters.
499 497
500 If in doubt, say N 498 If in doubt, say N.
501 499
502config FTRACE_MCOUNT_RECORD 500config FTRACE_MCOUNT_RECORD
503 def_bool y 501 def_bool y
@@ -556,8 +554,8 @@ config RING_BUFFER_BENCHMARK
556 tristate "Ring buffer benchmark stress tester" 554 tristate "Ring buffer benchmark stress tester"
557 depends on RING_BUFFER 555 depends on RING_BUFFER
558 help 556 help
559 This option creates a test to stress the ring buffer and bench mark it. 557 This option creates a test to stress the ring buffer and benchmark it.
560 It creates its own ring buffer such that it will not interfer with 558 It creates its own ring buffer such that it will not interfere with
561 any other users of the ring buffer (such as ftrace). It then creates 559 any other users of the ring buffer (such as ftrace). It then creates
562 a producer and consumer that will run for 10 seconds and sleep for 560 a producer and consumer that will run for 10 seconds and sleep for
563 10 seconds. Each interval it will print out the number of events 561 10 seconds. Each interval it will print out the number of events
@@ -566,7 +564,7 @@ config RING_BUFFER_BENCHMARK
566 It does not disable interrupts or raise its priority, so it may be 564 It does not disable interrupts or raise its priority, so it may be
567 affected by processes that are running. 565 affected by processes that are running.
568 566
569 If unsure, say N 567 If unsure, say N.
570 568
571endif # FTRACE 569endif # FTRACE
572 570
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e51a1bcb7be..1e6640f8045 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1690,7 +1690,7 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1690static int ftrace_match(char *str, char *regex, int len, int type) 1690static int ftrace_match(char *str, char *regex, int len, int type)
1691{ 1691{
1692 int matched = 0; 1692 int matched = 0;
1693 char *ptr; 1693 int slen;
1694 1694
1695 switch (type) { 1695 switch (type) {
1696 case MATCH_FULL: 1696 case MATCH_FULL:
@@ -1706,8 +1706,8 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1706 matched = 1; 1706 matched = 1;
1707 break; 1707 break;
1708 case MATCH_END_ONLY: 1708 case MATCH_END_ONLY:
1709 ptr = strstr(str, regex); 1709 slen = strlen(str);
1710 if (ptr && (ptr[len] == 0)) 1710 if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
1711 matched = 1; 1711 matched = 1;
1712 break; 1712 break;
1713 } 1713 }
@@ -1724,7 +1724,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
1724 return ftrace_match(str, regex, len, type); 1724 return ftrace_match(str, regex, len, type);
1725} 1725}
1726 1726
1727static void ftrace_match_records(char *buff, int len, int enable) 1727static int ftrace_match_records(char *buff, int len, int enable)
1728{ 1728{
1729 unsigned int search_len; 1729 unsigned int search_len;
1730 struct ftrace_page *pg; 1730 struct ftrace_page *pg;
@@ -1733,6 +1733,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1733 char *search; 1733 char *search;
1734 int type; 1734 int type;
1735 int not; 1735 int not;
1736 int found = 0;
1736 1737
1737 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1738 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1738 type = filter_parse_regex(buff, len, &search, &not); 1739 type = filter_parse_regex(buff, len, &search, &not);
@@ -1750,6 +1751,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1750 rec->flags &= ~flag; 1751 rec->flags &= ~flag;
1751 else 1752 else
1752 rec->flags |= flag; 1753 rec->flags |= flag;
1754 found = 1;
1753 } 1755 }
1754 /* 1756 /*
1755 * Only enable filtering if we have a function that 1757 * Only enable filtering if we have a function that
@@ -1759,6 +1761,8 @@ static void ftrace_match_records(char *buff, int len, int enable)
1759 ftrace_filtered = 1; 1761 ftrace_filtered = 1;
1760 } while_for_each_ftrace_rec(); 1762 } while_for_each_ftrace_rec();
1761 mutex_unlock(&ftrace_lock); 1763 mutex_unlock(&ftrace_lock);
1764
1765 return found;
1762} 1766}
1763 1767
1764static int 1768static int
@@ -1780,7 +1784,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
1780 return 1; 1784 return 1;
1781} 1785}
1782 1786
1783static void ftrace_match_module_records(char *buff, char *mod, int enable) 1787static int ftrace_match_module_records(char *buff, char *mod, int enable)
1784{ 1788{
1785 unsigned search_len = 0; 1789 unsigned search_len = 0;
1786 struct ftrace_page *pg; 1790 struct ftrace_page *pg;
@@ -1789,6 +1793,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1789 char *search = buff; 1793 char *search = buff;
1790 unsigned long flag; 1794 unsigned long flag;
1791 int not = 0; 1795 int not = 0;
1796 int found = 0;
1792 1797
1793 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1798 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1794 1799
@@ -1819,12 +1824,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1819 rec->flags &= ~flag; 1824 rec->flags &= ~flag;
1820 else 1825 else
1821 rec->flags |= flag; 1826 rec->flags |= flag;
1827 found = 1;
1822 } 1828 }
1823 if (enable && (rec->flags & FTRACE_FL_FILTER)) 1829 if (enable && (rec->flags & FTRACE_FL_FILTER))
1824 ftrace_filtered = 1; 1830 ftrace_filtered = 1;
1825 1831
1826 } while_for_each_ftrace_rec(); 1832 } while_for_each_ftrace_rec();
1827 mutex_unlock(&ftrace_lock); 1833 mutex_unlock(&ftrace_lock);
1834
1835 return found;
1828} 1836}
1829 1837
1830/* 1838/*
@@ -1853,8 +1861,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1853 if (!strlen(mod)) 1861 if (!strlen(mod))
1854 return -EINVAL; 1862 return -EINVAL;
1855 1863
1856 ftrace_match_module_records(func, mod, enable); 1864 if (ftrace_match_module_records(func, mod, enable))
1857 return 0; 1865 return 0;
1866 return -EINVAL;
1858} 1867}
1859 1868
1860static struct ftrace_func_command ftrace_mod_cmd = { 1869static struct ftrace_func_command ftrace_mod_cmd = {
@@ -2151,8 +2160,9 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2151 func = strsep(&next, ":"); 2160 func = strsep(&next, ":");
2152 2161
2153 if (!next) { 2162 if (!next) {
2154 ftrace_match_records(func, len, enable); 2163 if (ftrace_match_records(func, len, enable))
2155 return 0; 2164 return 0;
2165 return ret;
2156 } 2166 }
2157 2167
2158 /* command found */ 2168 /* command found */
@@ -2198,10 +2208,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2198 !trace_parser_cont(parser)) { 2208 !trace_parser_cont(parser)) {
2199 ret = ftrace_process_regex(parser->buffer, 2209 ret = ftrace_process_regex(parser->buffer,
2200 parser->idx, enable); 2210 parser->idx, enable);
2211 trace_parser_clear(parser);
2201 if (ret) 2212 if (ret)
2202 goto out_unlock; 2213 goto out_unlock;
2203
2204 trace_parser_clear(parser);
2205 } 2214 }
2206 2215
2207 ret = read; 2216 ret = read;
@@ -2543,10 +2552,9 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2543 exists = true; 2552 exists = true;
2544 break; 2553 break;
2545 } 2554 }
2546 if (!exists) { 2555 if (!exists)
2547 array[(*idx)++] = rec->ip; 2556 array[(*idx)++] = rec->ip;
2548 found = 1; 2557 found = 1;
2549 }
2550 } 2558 }
2551 } while_for_each_ftrace_rec(); 2559 } while_for_each_ftrace_rec();
2552 2560
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index e06c6e3d56a..9f4f565b01e 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -14,7 +14,5 @@
14#define CREATE_TRACE_POINTS 14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16 16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 17EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20 18
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a1ca4956ab5..8c1b2d29071 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -423,7 +423,7 @@ struct ring_buffer_per_cpu {
423 int cpu; 423 int cpu;
424 struct ring_buffer *buffer; 424 struct ring_buffer *buffer;
425 spinlock_t reader_lock; /* serialize readers */ 425 spinlock_t reader_lock; /* serialize readers */
426 raw_spinlock_t lock; 426 arch_spinlock_t lock;
427 struct lock_class_key lock_key; 427 struct lock_class_key lock_key;
428 struct list_head *pages; 428 struct list_head *pages;
429 struct buffer_page *head_page; /* read from head */ 429 struct buffer_page *head_page; /* read from head */
@@ -464,6 +464,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 464 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 465 unsigned long head;
466 struct buffer_page *head_page; 466 struct buffer_page *head_page;
467 struct buffer_page *cache_reader_page;
468 unsigned long cache_read;
467 u64 read_stamp; 469 u64 read_stamp;
468}; 470};
469 471
@@ -998,7 +1000,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
998 cpu_buffer->buffer = buffer; 1000 cpu_buffer->buffer = buffer;
999 spin_lock_init(&cpu_buffer->reader_lock); 1001 spin_lock_init(&cpu_buffer->reader_lock);
1000 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1002 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1001 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1003 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1002 1004
1003 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1005 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1004 GFP_KERNEL, cpu_to_node(cpu)); 1006 GFP_KERNEL, cpu_to_node(cpu));
@@ -1193,9 +1195,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1193 struct list_head *p; 1195 struct list_head *p;
1194 unsigned i; 1196 unsigned i;
1195 1197
1196 atomic_inc(&cpu_buffer->record_disabled);
1197 synchronize_sched();
1198
1199 spin_lock_irq(&cpu_buffer->reader_lock); 1198 spin_lock_irq(&cpu_buffer->reader_lock);
1200 rb_head_page_deactivate(cpu_buffer); 1199 rb_head_page_deactivate(cpu_buffer);
1201 1200
@@ -1211,12 +1210,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1211 return; 1210 return;
1212 1211
1213 rb_reset_cpu(cpu_buffer); 1212 rb_reset_cpu(cpu_buffer);
1214 spin_unlock_irq(&cpu_buffer->reader_lock);
1215
1216 rb_check_pages(cpu_buffer); 1213 rb_check_pages(cpu_buffer);
1217 1214
1218 atomic_dec(&cpu_buffer->record_disabled); 1215 spin_unlock_irq(&cpu_buffer->reader_lock);
1219
1220} 1216}
1221 1217
1222static void 1218static void
@@ -1227,9 +1223,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1227 struct list_head *p; 1223 struct list_head *p;
1228 unsigned i; 1224 unsigned i;
1229 1225
1230 atomic_inc(&cpu_buffer->record_disabled);
1231 synchronize_sched();
1232
1233 spin_lock_irq(&cpu_buffer->reader_lock); 1226 spin_lock_irq(&cpu_buffer->reader_lock);
1234 rb_head_page_deactivate(cpu_buffer); 1227 rb_head_page_deactivate(cpu_buffer);
1235 1228
@@ -1242,11 +1235,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1242 list_add_tail(&bpage->list, cpu_buffer->pages); 1235 list_add_tail(&bpage->list, cpu_buffer->pages);
1243 } 1236 }
1244 rb_reset_cpu(cpu_buffer); 1237 rb_reset_cpu(cpu_buffer);
1245 spin_unlock_irq(&cpu_buffer->reader_lock);
1246
1247 rb_check_pages(cpu_buffer); 1238 rb_check_pages(cpu_buffer);
1248 1239
1249 atomic_dec(&cpu_buffer->record_disabled); 1240 spin_unlock_irq(&cpu_buffer->reader_lock);
1250} 1241}
1251 1242
1252/** 1243/**
@@ -1254,11 +1245,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1254 * @buffer: the buffer to resize. 1245 * @buffer: the buffer to resize.
1255 * @size: the new size. 1246 * @size: the new size.
1256 * 1247 *
1257 * The tracer is responsible for making sure that the buffer is
1258 * not being used while changing the size.
1259 * Note: We may be able to change the above requirement by using
1260 * RCU synchronizations.
1261 *
1262 * Minimum size is 2 * BUF_PAGE_SIZE. 1248 * Minimum size is 2 * BUF_PAGE_SIZE.
1263 * 1249 *
1264 * Returns -1 on failure. 1250 * Returns -1 on failure.
@@ -1290,6 +1276,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1290 if (size == buffer_size) 1276 if (size == buffer_size)
1291 return size; 1277 return size;
1292 1278
1279 atomic_inc(&buffer->record_disabled);
1280
1281 /* Make sure all writers are done with this buffer. */
1282 synchronize_sched();
1283
1293 mutex_lock(&buffer->mutex); 1284 mutex_lock(&buffer->mutex);
1294 get_online_cpus(); 1285 get_online_cpus();
1295 1286
@@ -1352,6 +1343,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1352 put_online_cpus(); 1343 put_online_cpus();
1353 mutex_unlock(&buffer->mutex); 1344 mutex_unlock(&buffer->mutex);
1354 1345
1346 atomic_dec(&buffer->record_disabled);
1347
1355 return size; 1348 return size;
1356 1349
1357 free_pages: 1350 free_pages:
@@ -1361,6 +1354,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1361 } 1354 }
1362 put_online_cpus(); 1355 put_online_cpus();
1363 mutex_unlock(&buffer->mutex); 1356 mutex_unlock(&buffer->mutex);
1357 atomic_dec(&buffer->record_disabled);
1364 return -ENOMEM; 1358 return -ENOMEM;
1365 1359
1366 /* 1360 /*
@@ -1370,6 +1364,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1370 out_fail: 1364 out_fail:
1371 put_online_cpus(); 1365 put_online_cpus();
1372 mutex_unlock(&buffer->mutex); 1366 mutex_unlock(&buffer->mutex);
1367 atomic_dec(&buffer->record_disabled);
1373 return -1; 1368 return -1;
1374} 1369}
1375EXPORT_SYMBOL_GPL(ring_buffer_resize); 1370EXPORT_SYMBOL_GPL(ring_buffer_resize);
@@ -2723,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2723 iter->read_stamp = cpu_buffer->read_stamp; 2718 iter->read_stamp = cpu_buffer->read_stamp;
2724 else 2719 else
2725 iter->read_stamp = iter->head_page->page->time_stamp; 2720 iter->read_stamp = iter->head_page->page->time_stamp;
2721 iter->cache_reader_page = cpu_buffer->reader_page;
2722 iter->cache_read = cpu_buffer->read;
2726} 2723}
2727 2724
2728/** 2725/**
@@ -2834,7 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2834 int ret; 2831 int ret;
2835 2832
2836 local_irq_save(flags); 2833 local_irq_save(flags);
2837 __raw_spin_lock(&cpu_buffer->lock); 2834 arch_spin_lock(&cpu_buffer->lock);
2838 2835
2839 again: 2836 again:
2840 /* 2837 /*
@@ -2876,7 +2873,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2876 * Splice the empty reader page into the list around the head. 2873 * Splice the empty reader page into the list around the head.
2877 */ 2874 */
2878 reader = rb_set_head_page(cpu_buffer); 2875 reader = rb_set_head_page(cpu_buffer);
2879 cpu_buffer->reader_page->list.next = reader->list.next; 2876 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
2880 cpu_buffer->reader_page->list.prev = reader->list.prev; 2877 cpu_buffer->reader_page->list.prev = reader->list.prev;
2881 2878
2882 /* 2879 /*
@@ -2913,7 +2910,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2913 * 2910 *
2914 * Now make the new head point back to the reader page. 2911 * Now make the new head point back to the reader page.
2915 */ 2912 */
2916 reader->list.next->prev = &cpu_buffer->reader_page->list; 2913 rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
2917 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2914 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2918 2915
2919 /* Finally update the reader page to the new head */ 2916 /* Finally update the reader page to the new head */
@@ -2923,7 +2920,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2923 goto again; 2920 goto again;
2924 2921
2925 out: 2922 out:
2926 __raw_spin_unlock(&cpu_buffer->lock); 2923 arch_spin_unlock(&cpu_buffer->lock);
2927 local_irq_restore(flags); 2924 local_irq_restore(flags);
2928 2925
2929 return reader; 2926 return reader;
@@ -3067,13 +3064,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3067 struct ring_buffer_event *event; 3064 struct ring_buffer_event *event;
3068 int nr_loops = 0; 3065 int nr_loops = 0;
3069 3066
3070 if (ring_buffer_iter_empty(iter))
3071 return NULL;
3072
3073 cpu_buffer = iter->cpu_buffer; 3067 cpu_buffer = iter->cpu_buffer;
3074 buffer = cpu_buffer->buffer; 3068 buffer = cpu_buffer->buffer;
3075 3069
3070 /*
3071 * Check if someone performed a consuming read to
3072 * the buffer. A consuming read invalidates the iterator
3073 * and we need to reset the iterator in this case.
3074 */
3075 if (unlikely(iter->cache_read != cpu_buffer->read ||
3076 iter->cache_reader_page != cpu_buffer->reader_page))
3077 rb_iter_reset(iter);
3078
3076 again: 3079 again:
3080 if (ring_buffer_iter_empty(iter))
3081 return NULL;
3082
3077 /* 3083 /*
3078 * We repeat when a timestamp is encountered. 3084 * We repeat when a timestamp is encountered.
3079 * We can get multiple timestamps by nested interrupts or also 3085 * We can get multiple timestamps by nested interrupts or also
@@ -3088,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3088 if (rb_per_cpu_empty(cpu_buffer)) 3094 if (rb_per_cpu_empty(cpu_buffer))
3089 return NULL; 3095 return NULL;
3090 3096
3097 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3098 rb_inc_iter(iter);
3099 goto again;
3100 }
3101
3091 event = rb_iter_head_event(iter); 3102 event = rb_iter_head_event(iter);
3092 3103
3093 switch (event->type_len) { 3104 switch (event->type_len) {
@@ -3286,9 +3297,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3286 synchronize_sched(); 3297 synchronize_sched();
3287 3298
3288 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3299 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3289 __raw_spin_lock(&cpu_buffer->lock); 3300 arch_spin_lock(&cpu_buffer->lock);
3290 rb_iter_reset(iter); 3301 rb_iter_reset(iter);
3291 __raw_spin_unlock(&cpu_buffer->lock); 3302 arch_spin_unlock(&cpu_buffer->lock);
3292 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3303 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3293 3304
3294 return iter; 3305 return iter;
@@ -3408,11 +3419,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3408 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3419 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3409 goto out; 3420 goto out;
3410 3421
3411 __raw_spin_lock(&cpu_buffer->lock); 3422 arch_spin_lock(&cpu_buffer->lock);
3412 3423
3413 rb_reset_cpu(cpu_buffer); 3424 rb_reset_cpu(cpu_buffer);
3414 3425
3415 __raw_spin_unlock(&cpu_buffer->lock); 3426 arch_spin_unlock(&cpu_buffer->lock);
3416 3427
3417 out: 3428 out:
3418 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3429 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 874f2893cff..eac6875cb99 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -12,7 +12,7 @@
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <linux/utsrelease.h> 15#include <generated/utsrelease.h>
16#include <linux/stacktrace.h> 16#include <linux/stacktrace.h>
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
@@ -86,17 +86,17 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
86 */ 86 */
87static int tracing_disabled = 1; 87static int tracing_disabled = 1;
88 88
89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 89DEFINE_PER_CPU(int, ftrace_cpu_disabled);
90 90
91static inline void ftrace_disable_cpu(void) 91static inline void ftrace_disable_cpu(void)
92{ 92{
93 preempt_disable(); 93 preempt_disable();
94 local_inc(&__get_cpu_var(ftrace_cpu_disabled)); 94 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled));
95} 95}
96 96
97static inline void ftrace_enable_cpu(void) 97static inline void ftrace_enable_cpu(void)
98{ 98{
99 local_dec(&__get_cpu_var(ftrace_cpu_disabled)); 99 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled));
100 preempt_enable(); 100 preempt_enable();
101} 101}
102 102
@@ -203,7 +203,7 @@ cycle_t ftrace_now(int cpu)
203 */ 203 */
204static struct trace_array max_tr; 204static struct trace_array max_tr;
205 205
206static DEFINE_PER_CPU(struct trace_array_cpu, max_data); 206static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
207 207
208/* tracer_enabled is used to toggle activation of a tracer */ 208/* tracer_enabled is used to toggle activation of a tracer */
209static int tracer_enabled = 1; 209static int tracer_enabled = 1;
@@ -313,7 +313,6 @@ static const char *trace_options[] = {
313 "bin", 313 "bin",
314 "block", 314 "block",
315 "stacktrace", 315 "stacktrace",
316 "sched-tree",
317 "trace_printk", 316 "trace_printk",
318 "ftrace_preempt", 317 "ftrace_preempt",
319 "branch", 318 "branch",
@@ -493,15 +492,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
493 * protected by per_cpu spinlocks. But the action of the swap 492 * protected by per_cpu spinlocks. But the action of the swap
494 * needs its own lock. 493 * needs its own lock.
495 * 494 *
496 * This is defined as a raw_spinlock_t in order to help 495 * This is defined as a arch_spinlock_t in order to help
497 * with performance when lockdep debugging is enabled. 496 * with performance when lockdep debugging is enabled.
498 * 497 *
499 * It is also used in other places outside the update_max_tr 498 * It is also used in other places outside the update_max_tr
500 * so it needs to be defined outside of the 499 * so it needs to be defined outside of the
501 * CONFIG_TRACER_MAX_TRACE. 500 * CONFIG_TRACER_MAX_TRACE.
502 */ 501 */
503static raw_spinlock_t ftrace_max_lock = 502static arch_spinlock_t ftrace_max_lock =
504 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 503 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
505 504
506#ifdef CONFIG_TRACER_MAX_TRACE 505#ifdef CONFIG_TRACER_MAX_TRACE
507unsigned long __read_mostly tracing_max_latency; 506unsigned long __read_mostly tracing_max_latency;
@@ -555,13 +554,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
555 return; 554 return;
556 555
557 WARN_ON_ONCE(!irqs_disabled()); 556 WARN_ON_ONCE(!irqs_disabled());
558 __raw_spin_lock(&ftrace_max_lock); 557 arch_spin_lock(&ftrace_max_lock);
559 558
560 tr->buffer = max_tr.buffer; 559 tr->buffer = max_tr.buffer;
561 max_tr.buffer = buf; 560 max_tr.buffer = buf;
562 561
563 __update_max_tr(tr, tsk, cpu); 562 __update_max_tr(tr, tsk, cpu);
564 __raw_spin_unlock(&ftrace_max_lock); 563 arch_spin_unlock(&ftrace_max_lock);
565} 564}
566 565
567/** 566/**
@@ -581,7 +580,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
581 return; 580 return;
582 581
583 WARN_ON_ONCE(!irqs_disabled()); 582 WARN_ON_ONCE(!irqs_disabled());
584 __raw_spin_lock(&ftrace_max_lock); 583 arch_spin_lock(&ftrace_max_lock);
585 584
586 ftrace_disable_cpu(); 585 ftrace_disable_cpu();
587 586
@@ -603,7 +602,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
603 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 602 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
604 603
605 __update_max_tr(tr, tsk, cpu); 604 __update_max_tr(tr, tsk, cpu);
606 __raw_spin_unlock(&ftrace_max_lock); 605 arch_spin_unlock(&ftrace_max_lock);
607} 606}
608#endif /* CONFIG_TRACER_MAX_TRACE */ 607#endif /* CONFIG_TRACER_MAX_TRACE */
609 608
@@ -802,7 +801,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
802static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 801static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
803static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; 802static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
804static int cmdline_idx; 803static int cmdline_idx;
805static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; 804static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
806 805
807/* temporary disable recording */ 806/* temporary disable recording */
808static atomic_t trace_record_cmdline_disabled __read_mostly; 807static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -915,7 +914,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
915 * nor do we want to disable interrupts, 914 * nor do we want to disable interrupts,
916 * so if we miss here, then better luck next time. 915 * so if we miss here, then better luck next time.
917 */ 916 */
918 if (!__raw_spin_trylock(&trace_cmdline_lock)) 917 if (!arch_spin_trylock(&trace_cmdline_lock))
919 return; 918 return;
920 919
921 idx = map_pid_to_cmdline[tsk->pid]; 920 idx = map_pid_to_cmdline[tsk->pid];
@@ -940,7 +939,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
940 939
941 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 940 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
942 941
943 __raw_spin_unlock(&trace_cmdline_lock); 942 arch_spin_unlock(&trace_cmdline_lock);
944} 943}
945 944
946void trace_find_cmdline(int pid, char comm[]) 945void trace_find_cmdline(int pid, char comm[])
@@ -952,20 +951,25 @@ void trace_find_cmdline(int pid, char comm[])
952 return; 951 return;
953 } 952 }
954 953
954 if (WARN_ON_ONCE(pid < 0)) {
955 strcpy(comm, "<XXX>");
956 return;
957 }
958
955 if (pid > PID_MAX_DEFAULT) { 959 if (pid > PID_MAX_DEFAULT) {
956 strcpy(comm, "<...>"); 960 strcpy(comm, "<...>");
957 return; 961 return;
958 } 962 }
959 963
960 preempt_disable(); 964 preempt_disable();
961 __raw_spin_lock(&trace_cmdline_lock); 965 arch_spin_lock(&trace_cmdline_lock);
962 map = map_pid_to_cmdline[pid]; 966 map = map_pid_to_cmdline[pid];
963 if (map != NO_CMDLINE_MAP) 967 if (map != NO_CMDLINE_MAP)
964 strcpy(comm, saved_cmdlines[map]); 968 strcpy(comm, saved_cmdlines[map]);
965 else 969 else
966 strcpy(comm, "<...>"); 970 strcpy(comm, "<...>");
967 971
968 __raw_spin_unlock(&trace_cmdline_lock); 972 arch_spin_unlock(&trace_cmdline_lock);
969 preempt_enable(); 973 preempt_enable();
970} 974}
971 975
@@ -1085,7 +1089,7 @@ trace_function(struct trace_array *tr,
1085 struct ftrace_entry *entry; 1089 struct ftrace_entry *entry;
1086 1090
1087 /* If we are reading the ring buffer, don't trace */ 1091 /* If we are reading the ring buffer, don't trace */
1088 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1092 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
1089 return; 1093 return;
1090 1094
1091 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1095 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1151,6 +1155,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1151 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1155 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1152} 1156}
1153 1157
1158/**
1159 * trace_dump_stack - record a stack back trace in the trace buffer
1160 */
1161void trace_dump_stack(void)
1162{
1163 unsigned long flags;
1164
1165 if (tracing_disabled || tracing_selftest_running)
1166 return;
1167
1168 local_save_flags(flags);
1169
1170 /* skipping 3 traces, seems to get us at the caller of this function */
1171 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1172}
1173
1154void 1174void
1155ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1175ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1156{ 1176{
@@ -1251,8 +1271,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1251 */ 1271 */
1252int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1272int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1253{ 1273{
1254 static raw_spinlock_t trace_buf_lock = 1274 static arch_spinlock_t trace_buf_lock =
1255 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1275 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1256 static u32 trace_buf[TRACE_BUF_SIZE]; 1276 static u32 trace_buf[TRACE_BUF_SIZE];
1257 1277
1258 struct ftrace_event_call *call = &event_bprint; 1278 struct ftrace_event_call *call = &event_bprint;
@@ -1283,7 +1303,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1283 1303
1284 /* Lockdep uses trace_printk for lock tracing */ 1304 /* Lockdep uses trace_printk for lock tracing */
1285 local_irq_save(flags); 1305 local_irq_save(flags);
1286 __raw_spin_lock(&trace_buf_lock); 1306 arch_spin_lock(&trace_buf_lock);
1287 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1307 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1288 1308
1289 if (len > TRACE_BUF_SIZE || len < 0) 1309 if (len > TRACE_BUF_SIZE || len < 0)
@@ -1304,7 +1324,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1304 ring_buffer_unlock_commit(buffer, event); 1324 ring_buffer_unlock_commit(buffer, event);
1305 1325
1306out_unlock: 1326out_unlock:
1307 __raw_spin_unlock(&trace_buf_lock); 1327 arch_spin_unlock(&trace_buf_lock);
1308 local_irq_restore(flags); 1328 local_irq_restore(flags);
1309 1329
1310out: 1330out:
@@ -1334,7 +1354,7 @@ int trace_array_printk(struct trace_array *tr,
1334int trace_array_vprintk(struct trace_array *tr, 1354int trace_array_vprintk(struct trace_array *tr,
1335 unsigned long ip, const char *fmt, va_list args) 1355 unsigned long ip, const char *fmt, va_list args)
1336{ 1356{
1337 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1357 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1338 static char trace_buf[TRACE_BUF_SIZE]; 1358 static char trace_buf[TRACE_BUF_SIZE];
1339 1359
1340 struct ftrace_event_call *call = &event_print; 1360 struct ftrace_event_call *call = &event_print;
@@ -1360,12 +1380,8 @@ int trace_array_vprintk(struct trace_array *tr,
1360 1380
1361 pause_graph_tracing(); 1381 pause_graph_tracing();
1362 raw_local_irq_save(irq_flags); 1382 raw_local_irq_save(irq_flags);
1363 __raw_spin_lock(&trace_buf_lock); 1383 arch_spin_lock(&trace_buf_lock);
1364 if (args == NULL) { 1384 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1365 strncpy(trace_buf, fmt, TRACE_BUF_SIZE);
1366 len = strlen(trace_buf);
1367 } else
1368 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1369 1385
1370 size = sizeof(*entry) + len + 1; 1386 size = sizeof(*entry) + len + 1;
1371 buffer = tr->buffer; 1387 buffer = tr->buffer;
@@ -1382,7 +1398,7 @@ int trace_array_vprintk(struct trace_array *tr,
1382 ring_buffer_unlock_commit(buffer, event); 1398 ring_buffer_unlock_commit(buffer, event);
1383 1399
1384 out_unlock: 1400 out_unlock:
1385 __raw_spin_unlock(&trace_buf_lock); 1401 arch_spin_unlock(&trace_buf_lock);
1386 raw_local_irq_restore(irq_flags); 1402 raw_local_irq_restore(irq_flags);
1387 unpause_graph_tracing(); 1403 unpause_graph_tracing();
1388 out: 1404 out:
@@ -1516,6 +1532,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1516 int i = (int)*pos; 1532 int i = (int)*pos;
1517 void *ent; 1533 void *ent;
1518 1534
1535 WARN_ON_ONCE(iter->leftover);
1536
1519 (*pos)++; 1537 (*pos)++;
1520 1538
1521 /* can't go backwards */ 1539 /* can't go backwards */
@@ -1614,8 +1632,16 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1614 ; 1632 ;
1615 1633
1616 } else { 1634 } else {
1617 l = *pos - 1; 1635 /*
1618 p = s_next(m, p, &l); 1636 * If we overflowed the seq_file before, then we want
1637 * to just reuse the trace_seq buffer again.
1638 */
1639 if (iter->leftover)
1640 p = iter;
1641 else {
1642 l = *pos - 1;
1643 p = s_next(m, p, &l);
1644 }
1619 } 1645 }
1620 1646
1621 trace_event_read_lock(); 1647 trace_event_read_lock();
@@ -1923,6 +1949,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1923static int s_show(struct seq_file *m, void *v) 1949static int s_show(struct seq_file *m, void *v)
1924{ 1950{
1925 struct trace_iterator *iter = v; 1951 struct trace_iterator *iter = v;
1952 int ret;
1926 1953
1927 if (iter->ent == NULL) { 1954 if (iter->ent == NULL) {
1928 if (iter->tr) { 1955 if (iter->tr) {
@@ -1942,9 +1969,27 @@ static int s_show(struct seq_file *m, void *v)
1942 if (!(trace_flags & TRACE_ITER_VERBOSE)) 1969 if (!(trace_flags & TRACE_ITER_VERBOSE))
1943 print_func_help_header(m); 1970 print_func_help_header(m);
1944 } 1971 }
1972 } else if (iter->leftover) {
1973 /*
1974 * If we filled the seq_file buffer earlier, we
1975 * want to just show it now.
1976 */
1977 ret = trace_print_seq(m, &iter->seq);
1978
1979 /* ret should this time be zero, but you never know */
1980 iter->leftover = ret;
1981
1945 } else { 1982 } else {
1946 print_trace_line(iter); 1983 print_trace_line(iter);
1947 trace_print_seq(m, &iter->seq); 1984 ret = trace_print_seq(m, &iter->seq);
1985 /*
1986 * If we overflow the seq_file buffer, then it will
1987 * ask us for this data again at start up.
1988 * Use that instead.
1989 * ret is 0 if seq_file write succeeded.
1990 * -1 otherwise.
1991 */
1992 iter->leftover = ret;
1948 } 1993 }
1949 1994
1950 return 0; 1995 return 0;
@@ -2254,7 +2299,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2254 mutex_lock(&tracing_cpumask_update_lock); 2299 mutex_lock(&tracing_cpumask_update_lock);
2255 2300
2256 local_irq_disable(); 2301 local_irq_disable();
2257 __raw_spin_lock(&ftrace_max_lock); 2302 arch_spin_lock(&ftrace_max_lock);
2258 for_each_tracing_cpu(cpu) { 2303 for_each_tracing_cpu(cpu) {
2259 /* 2304 /*
2260 * Increase/decrease the disabled counter if we are 2305 * Increase/decrease the disabled counter if we are
@@ -2269,7 +2314,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2269 atomic_dec(&global_trace.data[cpu]->disabled); 2314 atomic_dec(&global_trace.data[cpu]->disabled);
2270 } 2315 }
2271 } 2316 }
2272 __raw_spin_unlock(&ftrace_max_lock); 2317 arch_spin_unlock(&ftrace_max_lock);
2273 local_irq_enable(); 2318 local_irq_enable();
2274 2319
2275 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 2320 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
@@ -2291,67 +2336,49 @@ static const struct file_operations tracing_cpumask_fops = {
2291 .write = tracing_cpumask_write, 2336 .write = tracing_cpumask_write,
2292}; 2337};
2293 2338
2294static ssize_t 2339static int tracing_trace_options_show(struct seq_file *m, void *v)
2295tracing_trace_options_read(struct file *filp, char __user *ubuf,
2296 size_t cnt, loff_t *ppos)
2297{ 2340{
2298 struct tracer_opt *trace_opts; 2341 struct tracer_opt *trace_opts;
2299 u32 tracer_flags; 2342 u32 tracer_flags;
2300 int len = 0;
2301 char *buf;
2302 int r = 0;
2303 int i; 2343 int i;
2304 2344
2305
2306 /* calculate max size */
2307 for (i = 0; trace_options[i]; i++) {
2308 len += strlen(trace_options[i]);
2309 len += 3; /* "no" and newline */
2310 }
2311
2312 mutex_lock(&trace_types_lock); 2345 mutex_lock(&trace_types_lock);
2313 tracer_flags = current_trace->flags->val; 2346 tracer_flags = current_trace->flags->val;
2314 trace_opts = current_trace->flags->opts; 2347 trace_opts = current_trace->flags->opts;
2315 2348
2316 /*
2317 * Increase the size with names of options specific
2318 * of the current tracer.
2319 */
2320 for (i = 0; trace_opts[i].name; i++) {
2321 len += strlen(trace_opts[i].name);
2322 len += 3; /* "no" and newline */
2323 }
2324
2325 /* +1 for \0 */
2326 buf = kmalloc(len + 1, GFP_KERNEL);
2327 if (!buf) {
2328 mutex_unlock(&trace_types_lock);
2329 return -ENOMEM;
2330 }
2331
2332 for (i = 0; trace_options[i]; i++) { 2349 for (i = 0; trace_options[i]; i++) {
2333 if (trace_flags & (1 << i)) 2350 if (trace_flags & (1 << i))
2334 r += sprintf(buf + r, "%s\n", trace_options[i]); 2351 seq_printf(m, "%s\n", trace_options[i]);
2335 else 2352 else
2336 r += sprintf(buf + r, "no%s\n", trace_options[i]); 2353 seq_printf(m, "no%s\n", trace_options[i]);
2337 } 2354 }
2338 2355
2339 for (i = 0; trace_opts[i].name; i++) { 2356 for (i = 0; trace_opts[i].name; i++) {
2340 if (tracer_flags & trace_opts[i].bit) 2357 if (tracer_flags & trace_opts[i].bit)
2341 r += sprintf(buf + r, "%s\n", 2358 seq_printf(m, "%s\n", trace_opts[i].name);
2342 trace_opts[i].name);
2343 else 2359 else
2344 r += sprintf(buf + r, "no%s\n", 2360 seq_printf(m, "no%s\n", trace_opts[i].name);
2345 trace_opts[i].name);
2346 } 2361 }
2347 mutex_unlock(&trace_types_lock); 2362 mutex_unlock(&trace_types_lock);
2348 2363
2349 WARN_ON(r >= len + 1); 2364 return 0;
2365}
2350 2366
2351 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2367static int __set_tracer_option(struct tracer *trace,
2368 struct tracer_flags *tracer_flags,
2369 struct tracer_opt *opts, int neg)
2370{
2371 int ret;
2352 2372
2353 kfree(buf); 2373 ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
2354 return r; 2374 if (ret)
2375 return ret;
2376
2377 if (neg)
2378 tracer_flags->val &= ~opts->bit;
2379 else
2380 tracer_flags->val |= opts->bit;
2381 return 0;
2355} 2382}
2356 2383
2357/* Try to assign a tracer specific option */ 2384/* Try to assign a tracer specific option */
@@ -2359,33 +2386,17 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2359{ 2386{
2360 struct tracer_flags *tracer_flags = trace->flags; 2387 struct tracer_flags *tracer_flags = trace->flags;
2361 struct tracer_opt *opts = NULL; 2388 struct tracer_opt *opts = NULL;
2362 int ret = 0, i = 0; 2389 int i;
2363 int len;
2364 2390
2365 for (i = 0; tracer_flags->opts[i].name; i++) { 2391 for (i = 0; tracer_flags->opts[i].name; i++) {
2366 opts = &tracer_flags->opts[i]; 2392 opts = &tracer_flags->opts[i];
2367 len = strlen(opts->name);
2368 2393
2369 if (strncmp(cmp, opts->name, len) == 0) { 2394 if (strcmp(cmp, opts->name) == 0)
2370 ret = trace->set_flag(tracer_flags->val, 2395 return __set_tracer_option(trace, trace->flags,
2371 opts->bit, !neg); 2396 opts, neg);
2372 break;
2373 }
2374 } 2397 }
2375 /* Not found */
2376 if (!tracer_flags->opts[i].name)
2377 return -EINVAL;
2378
2379 /* Refused to handle */
2380 if (ret)
2381 return ret;
2382
2383 if (neg)
2384 tracer_flags->val &= ~opts->bit;
2385 else
2386 tracer_flags->val |= opts->bit;
2387 2398
2388 return 0; 2399 return -EINVAL;
2389} 2400}
2390 2401
2391static void set_tracer_flags(unsigned int mask, int enabled) 2402static void set_tracer_flags(unsigned int mask, int enabled)
@@ -2405,7 +2416,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2405 size_t cnt, loff_t *ppos) 2416 size_t cnt, loff_t *ppos)
2406{ 2417{
2407 char buf[64]; 2418 char buf[64];
2408 char *cmp = buf; 2419 char *cmp;
2409 int neg = 0; 2420 int neg = 0;
2410 int ret; 2421 int ret;
2411 int i; 2422 int i;
@@ -2417,16 +2428,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2417 return -EFAULT; 2428 return -EFAULT;
2418 2429
2419 buf[cnt] = 0; 2430 buf[cnt] = 0;
2431 cmp = strstrip(buf);
2420 2432
2421 if (strncmp(buf, "no", 2) == 0) { 2433 if (strncmp(cmp, "no", 2) == 0) {
2422 neg = 1; 2434 neg = 1;
2423 cmp += 2; 2435 cmp += 2;
2424 } 2436 }
2425 2437
2426 for (i = 0; trace_options[i]; i++) { 2438 for (i = 0; trace_options[i]; i++) {
2427 int len = strlen(trace_options[i]); 2439 if (strcmp(cmp, trace_options[i]) == 0) {
2428
2429 if (strncmp(cmp, trace_options[i], len) == 0) {
2430 set_tracer_flags(1 << i, !neg); 2440 set_tracer_flags(1 << i, !neg);
2431 break; 2441 break;
2432 } 2442 }
@@ -2446,9 +2456,18 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2446 return cnt; 2456 return cnt;
2447} 2457}
2448 2458
2459static int tracing_trace_options_open(struct inode *inode, struct file *file)
2460{
2461 if (tracing_disabled)
2462 return -ENODEV;
2463 return single_open(file, tracing_trace_options_show, NULL);
2464}
2465
2449static const struct file_operations tracing_iter_fops = { 2466static const struct file_operations tracing_iter_fops = {
2450 .open = tracing_open_generic, 2467 .open = tracing_trace_options_open,
2451 .read = tracing_trace_options_read, 2468 .read = seq_read,
2469 .llseek = seq_lseek,
2470 .release = single_release,
2452 .write = tracing_trace_options_write, 2471 .write = tracing_trace_options_write,
2453}; 2472};
2454 2473
@@ -2898,6 +2917,10 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2898 else 2917 else
2899 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); 2918 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2900 2919
2920
2921 if (iter->trace->pipe_close)
2922 iter->trace->pipe_close(iter);
2923
2901 mutex_unlock(&trace_types_lock); 2924 mutex_unlock(&trace_types_lock);
2902 2925
2903 free_cpumask_var(iter->started); 2926 free_cpumask_var(iter->started);
@@ -3104,7 +3127,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
3104 __free_page(spd->pages[idx]); 3127 __free_page(spd->pages[idx]);
3105} 3128}
3106 3129
3107static struct pipe_buf_operations tracing_pipe_buf_ops = { 3130static const struct pipe_buf_operations tracing_pipe_buf_ops = {
3108 .can_merge = 0, 3131 .can_merge = 0,
3109 .map = generic_pipe_buf_map, 3132 .map = generic_pipe_buf_map,
3110 .unmap = generic_pipe_buf_unmap, 3133 .unmap = generic_pipe_buf_unmap,
@@ -3320,6 +3343,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3320 return cnt; 3343 return cnt;
3321} 3344}
3322 3345
3346static int mark_printk(const char *fmt, ...)
3347{
3348 int ret;
3349 va_list args;
3350 va_start(args, fmt);
3351 ret = trace_vprintk(0, fmt, args);
3352 va_end(args);
3353 return ret;
3354}
3355
3323static ssize_t 3356static ssize_t
3324tracing_mark_write(struct file *filp, const char __user *ubuf, 3357tracing_mark_write(struct file *filp, const char __user *ubuf,
3325 size_t cnt, loff_t *fpos) 3358 size_t cnt, loff_t *fpos)
@@ -3346,28 +3379,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3346 } else 3379 } else
3347 buf[cnt] = '\0'; 3380 buf[cnt] = '\0';
3348 3381
3349 cnt = trace_vprintk(0, buf, NULL); 3382 cnt = mark_printk("%s", buf);
3350 kfree(buf); 3383 kfree(buf);
3351 *fpos += cnt; 3384 *fpos += cnt;
3352 3385
3353 return cnt; 3386 return cnt;
3354} 3387}
3355 3388
3356static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, 3389static int tracing_clock_show(struct seq_file *m, void *v)
3357 size_t cnt, loff_t *ppos)
3358{ 3390{
3359 char buf[64];
3360 int bufiter = 0;
3361 int i; 3391 int i;
3362 3392
3363 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 3393 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3364 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, 3394 seq_printf(m,
3365 "%s%s%s%s", i ? " " : "", 3395 "%s%s%s%s", i ? " " : "",
3366 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 3396 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3367 i == trace_clock_id ? "]" : ""); 3397 i == trace_clock_id ? "]" : "");
3368 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); 3398 seq_putc(m, '\n');
3369 3399
3370 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); 3400 return 0;
3371} 3401}
3372 3402
3373static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 3403static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
@@ -3409,6 +3439,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3409 return cnt; 3439 return cnt;
3410} 3440}
3411 3441
3442static int tracing_clock_open(struct inode *inode, struct file *file)
3443{
3444 if (tracing_disabled)
3445 return -ENODEV;
3446 return single_open(file, tracing_clock_show, NULL);
3447}
3448
3412static const struct file_operations tracing_max_lat_fops = { 3449static const struct file_operations tracing_max_lat_fops = {
3413 .open = tracing_open_generic, 3450 .open = tracing_open_generic,
3414 .read = tracing_max_lat_read, 3451 .read = tracing_max_lat_read,
@@ -3447,8 +3484,10 @@ static const struct file_operations tracing_mark_fops = {
3447}; 3484};
3448 3485
3449static const struct file_operations trace_clock_fops = { 3486static const struct file_operations trace_clock_fops = {
3450 .open = tracing_open_generic, 3487 .open = tracing_clock_open,
3451 .read = tracing_clock_read, 3488 .read = seq_read,
3489 .llseek = seq_lseek,
3490 .release = single_release,
3452 .write = tracing_clock_write, 3491 .write = tracing_clock_write,
3453}; 3492};
3454 3493
@@ -3578,7 +3617,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
3578} 3617}
3579 3618
3580/* Pipe buffer operations for a buffer. */ 3619/* Pipe buffer operations for a buffer. */
3581static struct pipe_buf_operations buffer_pipe_buf_ops = { 3620static const struct pipe_buf_operations buffer_pipe_buf_ops = {
3582 .can_merge = 0, 3621 .can_merge = 0,
3583 .map = generic_pipe_buf_map, 3622 .map = generic_pipe_buf_map,
3584 .unmap = generic_pipe_buf_unmap, 3623 .unmap = generic_pipe_buf_unmap,
@@ -3909,39 +3948,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3909 if (ret < 0) 3948 if (ret < 0)
3910 return ret; 3949 return ret;
3911 3950
3912 ret = 0; 3951 if (val != 0 && val != 1)
3913 switch (val) { 3952 return -EINVAL;
3914 case 0:
3915 /* do nothing if already cleared */
3916 if (!(topt->flags->val & topt->opt->bit))
3917 break;
3918
3919 mutex_lock(&trace_types_lock);
3920 if (current_trace->set_flag)
3921 ret = current_trace->set_flag(topt->flags->val,
3922 topt->opt->bit, 0);
3923 mutex_unlock(&trace_types_lock);
3924 if (ret)
3925 return ret;
3926 topt->flags->val &= ~topt->opt->bit;
3927 break;
3928 case 1:
3929 /* do nothing if already set */
3930 if (topt->flags->val & topt->opt->bit)
3931 break;
3932 3953
3954 if (!!(topt->flags->val & topt->opt->bit) != val) {
3933 mutex_lock(&trace_types_lock); 3955 mutex_lock(&trace_types_lock);
3934 if (current_trace->set_flag) 3956 ret = __set_tracer_option(current_trace, topt->flags,
3935 ret = current_trace->set_flag(topt->flags->val, 3957 topt->opt, !val);
3936 topt->opt->bit, 1);
3937 mutex_unlock(&trace_types_lock); 3958 mutex_unlock(&trace_types_lock);
3938 if (ret) 3959 if (ret)
3939 return ret; 3960 return ret;
3940 topt->flags->val |= topt->opt->bit;
3941 break;
3942
3943 default:
3944 return -EINVAL;
3945 } 3961 }
3946 3962
3947 *ppos += cnt; 3963 *ppos += cnt;
@@ -4268,8 +4284,8 @@ trace_printk_seq(struct trace_seq *s)
4268 4284
4269static void __ftrace_dump(bool disable_tracing) 4285static void __ftrace_dump(bool disable_tracing)
4270{ 4286{
4271 static raw_spinlock_t ftrace_dump_lock = 4287 static arch_spinlock_t ftrace_dump_lock =
4272 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 4288 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
4273 /* use static because iter can be a bit big for the stack */ 4289 /* use static because iter can be a bit big for the stack */
4274 static struct trace_iterator iter; 4290 static struct trace_iterator iter;
4275 unsigned int old_userobj; 4291 unsigned int old_userobj;
@@ -4279,7 +4295,7 @@ static void __ftrace_dump(bool disable_tracing)
4279 4295
4280 /* only one dump */ 4296 /* only one dump */
4281 local_irq_save(flags); 4297 local_irq_save(flags);
4282 __raw_spin_lock(&ftrace_dump_lock); 4298 arch_spin_lock(&ftrace_dump_lock);
4283 if (dump_ran) 4299 if (dump_ran)
4284 goto out; 4300 goto out;
4285 4301
@@ -4354,7 +4370,7 @@ static void __ftrace_dump(bool disable_tracing)
4354 } 4370 }
4355 4371
4356 out: 4372 out:
4357 __raw_spin_unlock(&ftrace_dump_lock); 4373 arch_spin_unlock(&ftrace_dump_lock);
4358 local_irq_restore(flags); 4374 local_irq_restore(flags);
4359} 4375}
4360 4376
@@ -4415,7 +4431,7 @@ __init static int tracer_alloc_buffers(void)
4415 /* Allocate the first page for all buffers */ 4431 /* Allocate the first page for all buffers */
4416 for_each_tracing_cpu(i) { 4432 for_each_tracing_cpu(i) {
4417 global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4433 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4418 max_tr.data[i] = &per_cpu(max_data, i); 4434 max_tr.data[i] = &per_cpu(max_tr_data, i);
4419 } 4435 }
4420 4436
4421 trace_init_cmdlines(); 4437 trace_init_cmdlines();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1d7f4830a80..4df6a77eb19 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,6 +272,7 @@ struct tracer_flags {
272 * @pipe_open: called when the trace_pipe file is opened 272 * @pipe_open: called when the trace_pipe file is opened
273 * @wait_pipe: override how the user waits for traces on trace_pipe 273 * @wait_pipe: override how the user waits for traces on trace_pipe
274 * @close: called when the trace file is released 274 * @close: called when the trace file is released
275 * @pipe_close: called when the trace_pipe file is released
275 * @read: override the default read callback on trace_pipe 276 * @read: override the default read callback on trace_pipe
276 * @splice_read: override the default splice_read callback on trace_pipe 277 * @splice_read: override the default splice_read callback on trace_pipe
277 * @selftest: selftest to run on boot (see trace_selftest.c) 278 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -290,6 +291,7 @@ struct tracer {
290 void (*pipe_open)(struct trace_iterator *iter); 291 void (*pipe_open)(struct trace_iterator *iter);
291 void (*wait_pipe)(struct trace_iterator *iter); 292 void (*wait_pipe)(struct trace_iterator *iter);
292 void (*close)(struct trace_iterator *iter); 293 void (*close)(struct trace_iterator *iter);
294 void (*pipe_close)(struct trace_iterator *iter);
293 ssize_t (*read)(struct trace_iterator *iter, 295 ssize_t (*read)(struct trace_iterator *iter,
294 struct file *filp, char __user *ubuf, 296 struct file *filp, char __user *ubuf,
295 size_t cnt, loff_t *ppos); 297 size_t cnt, loff_t *ppos);
@@ -441,7 +443,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
441 443
442extern int ring_buffer_expanded; 444extern int ring_buffer_expanded;
443extern bool tracing_selftest_disabled; 445extern bool tracing_selftest_disabled;
444DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); 446DECLARE_PER_CPU(int, ftrace_cpu_disabled);
445 447
446#ifdef CONFIG_FTRACE_STARTUP_TEST 448#ifdef CONFIG_FTRACE_STARTUP_TEST
447extern int trace_selftest_startup_function(struct tracer *trace, 449extern int trace_selftest_startup_function(struct tracer *trace,
@@ -595,18 +597,17 @@ enum trace_iterator_flags {
595 TRACE_ITER_BIN = 0x40, 597 TRACE_ITER_BIN = 0x40,
596 TRACE_ITER_BLOCK = 0x80, 598 TRACE_ITER_BLOCK = 0x80,
597 TRACE_ITER_STACKTRACE = 0x100, 599 TRACE_ITER_STACKTRACE = 0x100,
598 TRACE_ITER_SCHED_TREE = 0x200, 600 TRACE_ITER_PRINTK = 0x200,
599 TRACE_ITER_PRINTK = 0x400, 601 TRACE_ITER_PREEMPTONLY = 0x400,
600 TRACE_ITER_PREEMPTONLY = 0x800, 602 TRACE_ITER_BRANCH = 0x800,
601 TRACE_ITER_BRANCH = 0x1000, 603 TRACE_ITER_ANNOTATE = 0x1000,
602 TRACE_ITER_ANNOTATE = 0x2000, 604 TRACE_ITER_USERSTACKTRACE = 0x2000,
603 TRACE_ITER_USERSTACKTRACE = 0x4000, 605 TRACE_ITER_SYM_USEROBJ = 0x4000,
604 TRACE_ITER_SYM_USEROBJ = 0x8000, 606 TRACE_ITER_PRINTK_MSGONLY = 0x8000,
605 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 607 TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */
606 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 608 TRACE_ITER_LATENCY_FMT = 0x20000,
607 TRACE_ITER_LATENCY_FMT = 0x40000, 609 TRACE_ITER_SLEEP_TIME = 0x40000,
608 TRACE_ITER_SLEEP_TIME = 0x80000, 610 TRACE_ITER_GRAPH_TIME = 0x80000,
609 TRACE_ITER_GRAPH_TIME = 0x100000,
610}; 611};
611 612
612/* 613/*
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 878c03f386b..84a3a7ba072 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -71,10 +71,10 @@ u64 notrace trace_clock(void)
71/* keep prev_time and lock in the same cacheline. */ 71/* keep prev_time and lock in the same cacheline. */
72static struct { 72static struct {
73 u64 prev_time; 73 u64 prev_time;
74 raw_spinlock_t lock; 74 arch_spinlock_t lock;
75} trace_clock_struct ____cacheline_aligned_in_smp = 75} trace_clock_struct ____cacheline_aligned_in_smp =
76 { 76 {
77 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, 77 .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
78 }; 78 };
79 79
80u64 notrace trace_clock_global(void) 80u64 notrace trace_clock_global(void)
@@ -94,7 +94,7 @@ u64 notrace trace_clock_global(void)
94 if (unlikely(in_nmi())) 94 if (unlikely(in_nmi()))
95 goto out; 95 goto out;
96 96
97 __raw_spin_lock(&trace_clock_struct.lock); 97 arch_spin_lock(&trace_clock_struct.lock);
98 98
99 /* 99 /*
100 * TODO: if this happens often then maybe we should reset 100 * TODO: if this happens often then maybe we should reset
@@ -106,7 +106,7 @@ u64 notrace trace_clock_global(void)
106 106
107 trace_clock_struct.prev_time = now; 107 trace_clock_struct.prev_time = now;
108 108
109 __raw_spin_unlock(&trace_clock_struct.lock); 109 arch_spin_unlock(&trace_clock_struct.lock);
110 110
111 out: 111 out:
112 raw_local_irq_restore(flags); 112 raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index d9c60f80aa0..9e25573242c 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -25,7 +25,7 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
25 char *buf; 25 char *buf;
26 int ret = -ENOMEM; 26 int ret = -ENOMEM;
27 27
28 if (atomic_inc_return(&event->profile_count)) 28 if (event->profile_count++ > 0)
29 return 0; 29 return 0;
30 30
31 if (!total_profile_count) { 31 if (!total_profile_count) {
@@ -56,7 +56,7 @@ fail_buf_nmi:
56 perf_trace_buf = NULL; 56 perf_trace_buf = NULL;
57 } 57 }
58fail_buf: 58fail_buf:
59 atomic_dec(&event->profile_count); 59 event->profile_count--;
60 60
61 return ret; 61 return ret;
62} 62}
@@ -83,7 +83,7 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
83{ 83{
84 char *buf, *nmi_buf; 84 char *buf, *nmi_buf;
85 85
86 if (!atomic_add_negative(-1, &event->profile_count)) 86 if (--event->profile_count > 0)
87 return; 87 return;
88 88
89 event->profile_disable(event); 89 event->profile_disable(event);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1d18315dc83..189b09baf4f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -78,7 +78,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
78 if (ret) \ 78 if (ret) \
79 return ret; 79 return ret;
80 80
81int trace_define_common_fields(struct ftrace_event_call *call) 81static int trace_define_common_fields(struct ftrace_event_call *call)
82{ 82{
83 int ret; 83 int ret;
84 struct trace_entry ent; 84 struct trace_entry ent;
@@ -91,7 +91,6 @@ int trace_define_common_fields(struct ftrace_event_call *call)
91 91
92 return ret; 92 return ret;
93} 93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 94
96void trace_destroy_fields(struct ftrace_event_call *call) 95void trace_destroy_fields(struct ftrace_event_call *call)
97{ 96{
@@ -105,9 +104,25 @@ void trace_destroy_fields(struct ftrace_event_call *call)
105 } 104 }
106} 105}
107 106
108static void ftrace_event_enable_disable(struct ftrace_event_call *call, 107int trace_event_raw_init(struct ftrace_event_call *call)
108{
109 int id;
110
111 id = register_ftrace_event(call->event);
112 if (!id)
113 return -ENODEV;
114 call->id = id;
115 INIT_LIST_HEAD(&call->fields);
116
117 return 0;
118}
119EXPORT_SYMBOL_GPL(trace_event_raw_init);
120
121static int ftrace_event_enable_disable(struct ftrace_event_call *call,
109 int enable) 122 int enable)
110{ 123{
124 int ret = 0;
125
111 switch (enable) { 126 switch (enable) {
112 case 0: 127 case 0:
113 if (call->enabled) { 128 if (call->enabled) {
@@ -118,12 +133,20 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
118 break; 133 break;
119 case 1: 134 case 1:
120 if (!call->enabled) { 135 if (!call->enabled) {
121 call->enabled = 1;
122 tracing_start_cmdline_record(); 136 tracing_start_cmdline_record();
123 call->regfunc(call); 137 ret = call->regfunc(call);
138 if (ret) {
139 tracing_stop_cmdline_record();
140 pr_info("event trace: Could not enable event "
141 "%s\n", call->name);
142 break;
143 }
144 call->enabled = 1;
124 } 145 }
125 break; 146 break;
126 } 147 }
148
149 return ret;
127} 150}
128 151
129static void ftrace_clear_events(void) 152static void ftrace_clear_events(void)
@@ -402,7 +425,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
402 case 0: 425 case 0:
403 case 1: 426 case 1:
404 mutex_lock(&event_mutex); 427 mutex_lock(&event_mutex);
405 ftrace_event_enable_disable(call, val); 428 ret = ftrace_event_enable_disable(call, val);
406 mutex_unlock(&event_mutex); 429 mutex_unlock(&event_mutex);
407 break; 430 break;
408 431
@@ -412,7 +435,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
412 435
413 *ppos += cnt; 436 *ppos += cnt;
414 437
415 return cnt; 438 return ret ? ret : cnt;
416} 439}
417 440
418static ssize_t 441static ssize_t
@@ -913,7 +936,9 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
913 id); 936 id);
914 937
915 if (call->define_fields) { 938 if (call->define_fields) {
916 ret = call->define_fields(call); 939 ret = trace_define_common_fields(call);
940 if (!ret)
941 ret = call->define_fields(call);
917 if (ret < 0) { 942 if (ret < 0) {
918 pr_warning("Could not initialize trace point" 943 pr_warning("Could not initialize trace point"
919 " events/%s\n", call->name); 944 " events/%s\n", call->name);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 50504cb228d..e42af9aad69 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -211,8 +211,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
211{ 211{
212 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
213 int cmp, match; 213 int cmp, match;
214 int len = strlen(*addr) + 1; /* including tailing '\0' */
214 215
215 cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); 216 cmp = pred->regex.match(*addr, &pred->regex, len);
216 217
217 match = cmp ^ pred->not; 218 match = cmp ^ pred->not;
218 219
@@ -251,7 +252,18 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
251 return 0; 252 return 0;
252} 253}
253 254
254/* Basic regex callbacks */ 255/*
256 * regex_match_foo - Basic regex callbacks
257 *
258 * @str: the string to be searched
259 * @r: the regex structure containing the pattern string
260 * @len: the length of the string to be searched (including '\0')
261 *
262 * Note:
263 * - @str might not be NULL-terminated if it's of type DYN_STRING
264 * or STATIC_STRING
265 */
266
255static int regex_match_full(char *str, struct regex *r, int len) 267static int regex_match_full(char *str, struct regex *r, int len)
256{ 268{
257 if (strncmp(str, r->pattern, len) == 0) 269 if (strncmp(str, r->pattern, len) == 0)
@@ -261,23 +273,24 @@ static int regex_match_full(char *str, struct regex *r, int len)
261 273
262static int regex_match_front(char *str, struct regex *r, int len) 274static int regex_match_front(char *str, struct regex *r, int len)
263{ 275{
264 if (strncmp(str, r->pattern, len) == 0) 276 if (strncmp(str, r->pattern, r->len) == 0)
265 return 1; 277 return 1;
266 return 0; 278 return 0;
267} 279}
268 280
269static int regex_match_middle(char *str, struct regex *r, int len) 281static int regex_match_middle(char *str, struct regex *r, int len)
270{ 282{
271 if (strstr(str, r->pattern)) 283 if (strnstr(str, r->pattern, len))
272 return 1; 284 return 1;
273 return 0; 285 return 0;
274} 286}
275 287
276static int regex_match_end(char *str, struct regex *r, int len) 288static int regex_match_end(char *str, struct regex *r, int len)
277{ 289{
278 char *ptr = strstr(str, r->pattern); 290 int strlen = len - 1;
279 291
280 if (ptr && (ptr[r->len] == 0)) 292 if (strlen >= r->len &&
293 memcmp(str + strlen - r->len, r->pattern, r->len) == 0)
281 return 1; 294 return 1;
282 return 0; 295 return 0;
283} 296}
@@ -781,10 +794,8 @@ static int filter_add_pred(struct filter_parse_state *ps,
781 pred->regex.field_len = field->size; 794 pred->regex.field_len = field->size;
782 } else if (field->filter_type == FILTER_DYN_STRING) 795 } else if (field->filter_type == FILTER_DYN_STRING)
783 fn = filter_pred_strloc; 796 fn = filter_pred_strloc;
784 else { 797 else
785 fn = filter_pred_pchar; 798 fn = filter_pred_pchar;
786 pred->regex.field_len = strlen(pred->regex.pattern);
787 }
788 } else { 799 } else {
789 if (field->is_signed) 800 if (field->is_signed)
790 ret = strict_strtoll(pred->regex.pattern, 0, &val); 801 ret = strict_strtoll(pred->regex.pattern, 0, &val);
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index dff8c84ddf1..d4fa5dc1ee4 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -158,7 +158,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
160 offsetof(typeof(field), item), \ 160 offsetof(typeof(field), item), \
161 sizeof(field.item), 0, FILTER_OTHER); \ 161 sizeof(field.item), \
162 is_signed_type(type), FILTER_OTHER); \
162 if (ret) \ 163 if (ret) \
163 return ret; 164 return ret;
164 165
@@ -168,8 +169,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
168 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 169 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
169 offsetof(typeof(field), \ 170 offsetof(typeof(field), \
170 container.item), \ 171 container.item), \
171 sizeof(field.container.item), 0, \ 172 sizeof(field.container.item), \
172 FILTER_OTHER); \ 173 is_signed_type(type), FILTER_OTHER); \
173 if (ret) \ 174 if (ret) \
174 return ret; 175 return ret;
175 176
@@ -184,10 +185,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
184 struct struct_name field; \ 185 struct struct_name field; \
185 int ret; \ 186 int ret; \
186 \ 187 \
187 ret = trace_define_common_fields(event_call); \
188 if (ret) \
189 return ret; \
190 \
191 tstruct; \ 188 tstruct; \
192 \ 189 \
193 return ret; \ 190 return ret; \
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4..b1342c5d37c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,9 +14,20 @@
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h" 15#include "trace_output.h"
16 16
17struct fgraph_data { 17struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore;
21};
22
23struct fgraph_data {
24 struct fgraph_cpu_data *cpu_data;
25
26 /* Place to preserve last processed entry. */
27 struct ftrace_graph_ent_entry ent;
28 struct ftrace_graph_ret_entry ret;
29 int failed;
30 int cpu;
20}; 31};
21 32
22#define TRACE_GRAPH_INDENT 2 33#define TRACE_GRAPH_INDENT 2
@@ -176,7 +187,7 @@ static int __trace_graph_entry(struct trace_array *tr,
176 struct ring_buffer *buffer = tr->buffer; 187 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry; 188 struct ftrace_graph_ent_entry *entry;
178 189
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 190 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
180 return 0; 191 return 0;
181 192
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 193 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -240,7 +251,7 @@ static void __trace_graph_return(struct trace_array *tr,
240 struct ring_buffer *buffer = tr->buffer; 251 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry; 252 struct ftrace_graph_ret_entry *entry;
242 253
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 254 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
244 return; 255 return;
245 256
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 257 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -384,7 +395,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
384 if (!data) 395 if (!data)
385 return TRACE_TYPE_HANDLED; 396 return TRACE_TYPE_HANDLED;
386 397
387 last_pid = &(per_cpu_ptr(data, cpu)->last_pid); 398 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
388 399
389 if (*last_pid == pid) 400 if (*last_pid == pid)
390 return TRACE_TYPE_HANDLED; 401 return TRACE_TYPE_HANDLED;
@@ -435,26 +446,49 @@ static struct ftrace_graph_ret_entry *
435get_return_for_leaf(struct trace_iterator *iter, 446get_return_for_leaf(struct trace_iterator *iter,
436 struct ftrace_graph_ent_entry *curr) 447 struct ftrace_graph_ent_entry *curr)
437{ 448{
438 struct ring_buffer_iter *ring_iter; 449 struct fgraph_data *data = iter->private;
450 struct ring_buffer_iter *ring_iter = NULL;
439 struct ring_buffer_event *event; 451 struct ring_buffer_event *event;
440 struct ftrace_graph_ret_entry *next; 452 struct ftrace_graph_ret_entry *next;
441 453
442 ring_iter = iter->buffer_iter[iter->cpu]; 454 /*
455 * If the previous output failed to write to the seq buffer,
456 * then we just reuse the data from before.
457 */
458 if (data && data->failed) {
459 curr = &data->ent;
460 next = &data->ret;
461 } else {
443 462
444 /* First peek to compare current entry and the next one */ 463 ring_iter = iter->buffer_iter[iter->cpu];
445 if (ring_iter) 464
446 event = ring_buffer_iter_peek(ring_iter, NULL); 465 /* First peek to compare current entry and the next one */
447 else { 466 if (ring_iter)
448 /* We need to consume the current entry to see the next one */ 467 event = ring_buffer_iter_peek(ring_iter, NULL);
449 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 468 else {
450 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 469 /*
451 NULL); 470 * We need to consume the current entry to see
452 } 471 * the next one.
472 */
473 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
474 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
475 NULL);
476 }
453 477
454 if (!event) 478 if (!event)
455 return NULL; 479 return NULL;
480
481 next = ring_buffer_event_data(event);
456 482
457 next = ring_buffer_event_data(event); 483 if (data) {
484 /*
485 * Save current and next entries for later reference
486 * if the output fails.
487 */
488 data->ent = *curr;
489 data->ret = *next;
490 }
491 }
458 492
459 if (next->ent.type != TRACE_GRAPH_RET) 493 if (next->ent.type != TRACE_GRAPH_RET)
460 return NULL; 494 return NULL;
@@ -640,7 +674,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
640 674
641 if (data) { 675 if (data) {
642 int cpu = iter->cpu; 676 int cpu = iter->cpu;
643 int *depth = &(per_cpu_ptr(data, cpu)->depth); 677 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
644 678
645 /* 679 /*
646 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
@@ -688,7 +722,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
688 722
689 if (data) { 723 if (data) {
690 int cpu = iter->cpu; 724 int cpu = iter->cpu;
691 int *depth = &(per_cpu_ptr(data, cpu)->depth); 725 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
692 726
693 *depth = call->depth; 727 *depth = call->depth;
694 } 728 }
@@ -782,19 +816,34 @@ static enum print_line_t
782print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 816print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
783 struct trace_iterator *iter) 817 struct trace_iterator *iter)
784{ 818{
785 int cpu = iter->cpu; 819 struct fgraph_data *data = iter->private;
786 struct ftrace_graph_ent *call = &field->graph_ent; 820 struct ftrace_graph_ent *call = &field->graph_ent;
787 struct ftrace_graph_ret_entry *leaf_ret; 821 struct ftrace_graph_ret_entry *leaf_ret;
822 static enum print_line_t ret;
823 int cpu = iter->cpu;
788 824
789 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 825 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
790 return TRACE_TYPE_PARTIAL_LINE; 826 return TRACE_TYPE_PARTIAL_LINE;
791 827
792 leaf_ret = get_return_for_leaf(iter, field); 828 leaf_ret = get_return_for_leaf(iter, field);
793 if (leaf_ret) 829 if (leaf_ret)
794 return print_graph_entry_leaf(iter, field, leaf_ret, s); 830 ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
795 else 831 else
796 return print_graph_entry_nested(iter, field, s, cpu); 832 ret = print_graph_entry_nested(iter, field, s, cpu);
797 833
834 if (data) {
835 /*
836 * If we failed to write our output, then we need to make
837 * note of it. Because we already consumed our entry.
838 */
839 if (s->full) {
840 data->failed = 1;
841 data->cpu = cpu;
842 } else
843 data->failed = 0;
844 }
845
846 return ret;
798} 847}
799 848
800static enum print_line_t 849static enum print_line_t
@@ -810,7 +859,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
810 859
811 if (data) { 860 if (data) {
812 int cpu = iter->cpu; 861 int cpu = iter->cpu;
813 int *depth = &(per_cpu_ptr(data, cpu)->depth); 862 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
814 863
815 /* 864 /*
816 * Comments display at + 1 to depth. This is the 865 * Comments display at + 1 to depth. This is the
@@ -873,7 +922,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
873 int i; 922 int i;
874 923
875 if (data) 924 if (data)
876 depth = per_cpu_ptr(data, iter->cpu)->depth; 925 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
877 926
878 if (print_graph_prologue(iter, s, 0, 0)) 927 if (print_graph_prologue(iter, s, 0, 0))
879 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +990,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
941enum print_line_t 990enum print_line_t
942print_graph_function(struct trace_iterator *iter) 991print_graph_function(struct trace_iterator *iter)
943{ 992{
993 struct ftrace_graph_ent_entry *field;
994 struct fgraph_data *data = iter->private;
944 struct trace_entry *entry = iter->ent; 995 struct trace_entry *entry = iter->ent;
945 struct trace_seq *s = &iter->seq; 996 struct trace_seq *s = &iter->seq;
997 int cpu = iter->cpu;
998 int ret;
999
1000 if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
1001 per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
1002 return TRACE_TYPE_HANDLED;
1003 }
1004
1005 /*
1006 * If the last output failed, there's a possibility we need
1007 * to print out the missing entry which would never go out.
1008 */
1009 if (data && data->failed) {
1010 field = &data->ent;
1011 iter->cpu = data->cpu;
1012 ret = print_graph_entry(field, s, iter);
1013 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1014 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1015 ret = TRACE_TYPE_NO_CONSUME;
1016 }
1017 iter->cpu = cpu;
1018 return ret;
1019 }
946 1020
947 switch (entry->type) { 1021 switch (entry->type) {
948 case TRACE_GRAPH_ENT: { 1022 case TRACE_GRAPH_ENT: {
@@ -952,7 +1026,7 @@ print_graph_function(struct trace_iterator *iter)
952 * sizeof(struct ftrace_graph_ent_entry) is very small, 1026 * sizeof(struct ftrace_graph_ent_entry) is very small,
953 * it can be safely saved at the stack. 1027 * it can be safely saved at the stack.
954 */ 1028 */
955 struct ftrace_graph_ent_entry *field, saved; 1029 struct ftrace_graph_ent_entry saved;
956 trace_assign_type(field, entry); 1030 trace_assign_type(field, entry);
957 saved = *field; 1031 saved = *field;
958 return print_graph_entry(&saved, s, iter); 1032 return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1104,54 @@ static void print_graph_headers(struct seq_file *s)
1030static void graph_trace_open(struct trace_iterator *iter) 1104static void graph_trace_open(struct trace_iterator *iter)
1031{ 1105{
1032 /* pid and depth on the last trace processed */ 1106 /* pid and depth on the last trace processed */
1033 struct fgraph_data *data = alloc_percpu(struct fgraph_data); 1107 struct fgraph_data *data;
1034 int cpu; 1108 int cpu;
1035 1109
1110 iter->private = NULL;
1111
1112 data = kzalloc(sizeof(*data), GFP_KERNEL);
1036 if (!data) 1113 if (!data)
1037 pr_warning("function graph tracer: not enough memory\n"); 1114 goto out_err;
1038 else 1115
1039 for_each_possible_cpu(cpu) { 1116 data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
1040 pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); 1117 if (!data->cpu_data)
1041 int *depth = &(per_cpu_ptr(data, cpu)->depth); 1118 goto out_err_free;
1042 *pid = -1; 1119
1043 *depth = 0; 1120 for_each_possible_cpu(cpu) {
1044 } 1121 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1122 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1123 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1124 *pid = -1;
1125 *depth = 0;
1126 *ignore = 0;
1127 }
1045 1128
1046 iter->private = data; 1129 iter->private = data;
1130
1131 return;
1132
1133 out_err_free:
1134 kfree(data);
1135 out_err:
1136 pr_warning("function graph tracer: not enough memory\n");
1047} 1137}
1048 1138
1049static void graph_trace_close(struct trace_iterator *iter) 1139static void graph_trace_close(struct trace_iterator *iter)
1050{ 1140{
1051 free_percpu(iter->private); 1141 struct fgraph_data *data = iter->private;
1142
1143 if (data) {
1144 free_percpu(data->cpu_data);
1145 kfree(data);
1146 }
1052} 1147}
1053 1148
1054static struct tracer graph_trace __read_mostly = { 1149static struct tracer graph_trace __read_mostly = {
1055 .name = "function_graph", 1150 .name = "function_graph",
1056 .open = graph_trace_open, 1151 .open = graph_trace_open,
1152 .pipe_open = graph_trace_open,
1057 .close = graph_trace_close, 1153 .close = graph_trace_close,
1154 .pipe_close = graph_trace_close,
1058 .wait_pipe = poll_wait_pipe, 1155 .wait_pipe = poll_wait_pipe,
1059 .init = graph_trace_init, 1156 .init = graph_trace_init,
1060 .reset = graph_trace_reset, 1157 .reset = graph_trace_reset,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 69543a905cd..7b97000745f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -20,10 +20,10 @@
20 20
21#define BTS_BUFFER_SIZE (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
22 22
23static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25 25
26#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27 27
28static int trace_hw_branches_enabled __read_mostly; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly; 29static int trace_hw_branches_suspended __read_mostly;
@@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly;
32 32
33static void bts_trace_init_cpu(int cpu) 33static void bts_trace_init_cpu(int cpu)
34{ 34{
35 per_cpu(tracer, cpu) = 35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, 36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 NULL, (size_t)-1, BTS_KERNEL); 37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
38 39
39 if (IS_ERR(per_cpu(tracer, cpu))) 40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
40 per_cpu(tracer, cpu) = NULL; 41 per_cpu(hwb_tracer, cpu) = NULL;
41} 42}
42 43
43static int bts_trace_init(struct trace_array *tr) 44static int bts_trace_init(struct trace_array *tr)
@@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr)
51 for_each_online_cpu(cpu) { 52 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu); 53 bts_trace_init_cpu(cpu);
53 54
54 if (likely(per_cpu(tracer, cpu))) 55 if (likely(per_cpu(hwb_tracer, cpu)))
55 trace_hw_branches_enabled = 1; 56 trace_hw_branches_enabled = 1;
56 } 57 }
57 trace_hw_branches_suspended = 0; 58 trace_hw_branches_suspended = 0;
@@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr)
67 68
68 get_online_cpus(); 69 get_online_cpus();
69 for_each_online_cpu(cpu) { 70 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) { 71 if (likely(per_cpu(hwb_tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu)); 72 ds_release_bts(per_cpu(hwb_tracer, cpu));
72 per_cpu(tracer, cpu) = NULL; 73 per_cpu(hwb_tracer, cpu) = NULL;
73 } 74 }
74 } 75 }
75 trace_hw_branches_enabled = 0; 76 trace_hw_branches_enabled = 0;
@@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr)
83 84
84 get_online_cpus(); 85 get_online_cpus();
85 for_each_online_cpu(cpu) 86 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu))) 87 if (likely(per_cpu(hwb_tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu)); 88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
88 trace_hw_branches_suspended = 0; 89 trace_hw_branches_suspended = 0;
89 put_online_cpus(); 90 put_online_cpus();
90} 91}
@@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr)
95 96
96 get_online_cpus(); 97 get_online_cpus();
97 for_each_online_cpu(cpu) 98 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu))) 99 if (likely(per_cpu(hwb_tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu)); 100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
100 trace_hw_branches_suspended = 1; 101 trace_hw_branches_suspended = 1;
101 put_online_cpus(); 102 put_online_cpus();
102} 103}
@@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
114 bts_trace_init_cpu(cpu); 115 bts_trace_init_cpu(cpu);
115 116
116 if (trace_hw_branches_suspended && 117 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu))) 118 likely(per_cpu(hwb_tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu)); 119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
119 } 120 }
120 break; 121 break;
121 122
122 case CPU_DOWN_PREPARE: 123 case CPU_DOWN_PREPARE:
123 /* The notification is sent with interrupts enabled. */ 124 /* The notification is sent with interrupts enabled. */
124 if (likely(per_cpu(tracer, cpu))) { 125 if (likely(per_cpu(hwb_tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu)); 126 ds_release_bts(per_cpu(hwb_tracer, cpu));
126 per_cpu(tracer, cpu) = NULL; 127 per_cpu(hwb_tracer, cpu) = NULL;
127 } 128 }
128 } 129 }
129 130
@@ -258,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
258 259
259 get_online_cpus(); 260 get_online_cpus();
260 for_each_online_cpu(cpu) 261 for_each_online_cpu(cpu)
261 if (likely(per_cpu(tracer, cpu))) 262 if (likely(per_cpu(hwb_tracer, cpu)))
262 ds_suspend_bts(per_cpu(tracer, cpu)); 263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
263 /* 264 /*
264 * We need to collect the trace on the respective cpu since ftrace 265 * We need to collect the trace on the respective cpu since ftrace
265 * implicitly adds the record for the current cpu. 266 * implicitly adds the record for the current cpu.
@@ -268,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
268 on_each_cpu(trace_bts_cpu, iter->tr, 1); 269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
269 270
270 for_each_online_cpu(cpu) 271 for_each_online_cpu(cpu)
271 if (likely(per_cpu(tracer, cpu))) 272 if (likely(per_cpu(hwb_tracer, cpu)))
272 ds_resume_bts(per_cpu(tracer, cpu)); 273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
273 put_online_cpus(); 274 put_online_cpus();
274} 275}
275 276
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 3aa7eaa2114..2974bc7538c 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,6 +151,8 @@ check_critical_timing(struct trace_array *tr,
151 goto out_unlock; 151 goto out_unlock;
152 152
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc);
154 156
155 if (data->critical_sequence != max_sequence) 157 if (data->critical_sequence != max_sequence)
156 goto out_unlock; 158 goto out_unlock;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aff5f80b59b..50b1b823980 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -282,6 +282,18 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri, 282static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs); 283 struct pt_regs *regs);
284 284
285/* Check the name is good for event/group */
286static int check_event_name(const char *name)
287{
288 if (!isalpha(*name) && *name != '_')
289 return 0;
290 while (*++name != '\0') {
291 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
292 return 0;
293 }
294 return 1;
295}
296
285/* 297/*
286 * Allocate new trace_probe and initialize it (including kprobes). 298 * Allocate new trace_probe and initialize it (including kprobes).
287 */ 299 */
@@ -293,10 +305,11 @@ static struct trace_probe *alloc_trace_probe(const char *group,
293 int nargs, int is_return) 305 int nargs, int is_return)
294{ 306{
295 struct trace_probe *tp; 307 struct trace_probe *tp;
308 int ret = -ENOMEM;
296 309
297 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); 310 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
298 if (!tp) 311 if (!tp)
299 return ERR_PTR(-ENOMEM); 312 return ERR_PTR(ret);
300 313
301 if (symbol) { 314 if (symbol) {
302 tp->symbol = kstrdup(symbol, GFP_KERNEL); 315 tp->symbol = kstrdup(symbol, GFP_KERNEL);
@@ -312,14 +325,20 @@ static struct trace_probe *alloc_trace_probe(const char *group,
312 else 325 else
313 tp->rp.kp.pre_handler = kprobe_dispatcher; 326 tp->rp.kp.pre_handler = kprobe_dispatcher;
314 327
315 if (!event) 328 if (!event || !check_event_name(event)) {
329 ret = -EINVAL;
316 goto error; 330 goto error;
331 }
332
317 tp->call.name = kstrdup(event, GFP_KERNEL); 333 tp->call.name = kstrdup(event, GFP_KERNEL);
318 if (!tp->call.name) 334 if (!tp->call.name)
319 goto error; 335 goto error;
320 336
321 if (!group) 337 if (!group || !check_event_name(group)) {
338 ret = -EINVAL;
322 goto error; 339 goto error;
340 }
341
323 tp->call.system = kstrdup(group, GFP_KERNEL); 342 tp->call.system = kstrdup(group, GFP_KERNEL);
324 if (!tp->call.system) 343 if (!tp->call.system)
325 goto error; 344 goto error;
@@ -330,7 +349,7 @@ error:
330 kfree(tp->call.name); 349 kfree(tp->call.name);
331 kfree(tp->symbol); 350 kfree(tp->symbol);
332 kfree(tp); 351 kfree(tp);
333 return ERR_PTR(-ENOMEM); 352 return ERR_PTR(ret);
334} 353}
335 354
336static void free_probe_arg(struct probe_arg *arg) 355static void free_probe_arg(struct probe_arg *arg)
@@ -606,23 +625,22 @@ static int create_trace_probe(int argc, char **argv)
606 */ 625 */
607 struct trace_probe *tp; 626 struct trace_probe *tp;
608 int i, ret = 0; 627 int i, ret = 0;
609 int is_return = 0; 628 int is_return = 0, is_delete = 0;
610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 629 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
611 unsigned long offset = 0; 630 unsigned long offset = 0;
612 void *addr = NULL; 631 void *addr = NULL;
613 char buf[MAX_EVENT_NAME_LEN]; 632 char buf[MAX_EVENT_NAME_LEN];
614 633
615 if (argc < 2) { 634 /* argc must be >= 1 */
616 pr_info("Probe point is not specified.\n");
617 return -EINVAL;
618 }
619
620 if (argv[0][0] == 'p') 635 if (argv[0][0] == 'p')
621 is_return = 0; 636 is_return = 0;
622 else if (argv[0][0] == 'r') 637 else if (argv[0][0] == 'r')
623 is_return = 1; 638 is_return = 1;
639 else if (argv[0][0] == '-')
640 is_delete = 1;
624 else { 641 else {
625 pr_info("Probe definition must be started with 'p' or 'r'.\n"); 642 pr_info("Probe definition must be started with 'p', 'r' or"
643 " '-'.\n");
626 return -EINVAL; 644 return -EINVAL;
627 } 645 }
628 646
@@ -642,14 +660,36 @@ static int create_trace_probe(int argc, char **argv)
642 return -EINVAL; 660 return -EINVAL;
643 } 661 }
644 } 662 }
663 if (!group)
664 group = KPROBE_EVENT_SYSTEM;
665
666 if (is_delete) {
667 if (!event) {
668 pr_info("Delete command needs an event name.\n");
669 return -EINVAL;
670 }
671 tp = find_probe_event(event, group);
672 if (!tp) {
673 pr_info("Event %s/%s doesn't exist.\n", group, event);
674 return -ENOENT;
675 }
676 /* delete an event */
677 unregister_trace_probe(tp);
678 free_trace_probe(tp);
679 return 0;
680 }
645 681
682 if (argc < 2) {
683 pr_info("Probe point is not specified.\n");
684 return -EINVAL;
685 }
646 if (isdigit(argv[1][0])) { 686 if (isdigit(argv[1][0])) {
647 if (is_return) { 687 if (is_return) {
648 pr_info("Return probe point must be a symbol.\n"); 688 pr_info("Return probe point must be a symbol.\n");
649 return -EINVAL; 689 return -EINVAL;
650 } 690 }
651 /* an address specified */ 691 /* an address specified */
652 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 692 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
653 if (ret) { 693 if (ret) {
654 pr_info("Failed to parse address.\n"); 694 pr_info("Failed to parse address.\n");
655 return ret; 695 return ret;
@@ -671,15 +711,13 @@ static int create_trace_probe(int argc, char **argv)
671 argc -= 2; argv += 2; 711 argc -= 2; argv += 2;
672 712
673 /* setup a probe */ 713 /* setup a probe */
674 if (!group)
675 group = KPROBE_EVENT_SYSTEM;
676 if (!event) { 714 if (!event) {
677 /* Make a new event name */ 715 /* Make a new event name */
678 if (symbol) 716 if (symbol)
679 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", 717 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
680 is_return ? 'r' : 'p', symbol, offset); 718 is_return ? 'r' : 'p', symbol, offset);
681 else 719 else
682 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", 720 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
683 is_return ? 'r' : 'p', addr); 721 is_return ? 'r' : 'p', addr);
684 event = buf; 722 event = buf;
685 } 723 }
@@ -1113,10 +1151,6 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1113 struct kprobe_trace_entry field; 1151 struct kprobe_trace_entry field;
1114 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1152 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1115 1153
1116 ret = trace_define_common_fields(event_call);
1117 if (!ret)
1118 return ret;
1119
1120 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1154 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1121 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); 1155 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1122 /* Set argument names as fields */ 1156 /* Set argument names as fields */
@@ -1131,10 +1165,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1131 struct kretprobe_trace_entry field; 1165 struct kretprobe_trace_entry field;
1132 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1166 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1133 1167
1134 ret = trace_define_common_fields(event_call);
1135 if (!ret)
1136 return ret;
1137
1138 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1168 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1139 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1169 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1140 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); 1170 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
@@ -1171,10 +1201,11 @@ static int __probe_event_show_format(struct trace_seq *s,
1171#undef SHOW_FIELD 1201#undef SHOW_FIELD
1172#define SHOW_FIELD(type, item, name) \ 1202#define SHOW_FIELD(type, item, name) \
1173 do { \ 1203 do { \
1174 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ 1204 ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \
1175 "offset:%u;\tsize:%u;\n", name, \ 1205 "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\
1176 (unsigned int)offsetof(typeof(field), item),\ 1206 (unsigned int)offsetof(typeof(field), item),\
1177 (unsigned int)sizeof(type)); \ 1207 (unsigned int)sizeof(type), \
1208 is_signed_type(type)); \
1178 if (!ret) \ 1209 if (!ret) \
1179 return 0; \ 1210 return 0; \
1180 } while (0) 1211 } while (0)
@@ -1434,7 +1465,6 @@ static int register_probe_event(struct trace_probe *tp)
1434 call->unregfunc = probe_event_disable; 1465 call->unregfunc = probe_event_disable;
1435 1466
1436#ifdef CONFIG_EVENT_PROFILE 1467#ifdef CONFIG_EVENT_PROFILE
1437 atomic_set(&call->profile_count, -1);
1438 call->profile_enable = probe_profile_enable; 1468 call->profile_enable = probe_profile_enable;
1439 call->profile_disable = probe_profile_disable; 1469 call->profile_disable = probe_profile_disable;
1440#endif 1470#endif
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index ddfa0fd43bc..94103cdcf9d 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -26,12 +26,13 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27 27
28#include "trace_output.h" 28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h" 29#include "trace.h"
31 30
32#include <linux/hw_breakpoint.h> 31#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h> 32#include <asm/hw_breakpoint.h>
34 33
34#include <asm/atomic.h>
35
35/* 36/*
36 * For now, let us restrict the no. of symbols traced simultaneously to number 37 * For now, let us restrict the no. of symbols traced simultaneously to number
37 * of available hardware breakpoint registers. 38 * of available hardware breakpoint registers.
@@ -44,7 +45,7 @@ struct trace_ksym {
44 struct perf_event **ksym_hbp; 45 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr; 46 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER 47#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter; 48 atomic64_t counter;
48#endif 49#endif
49 struct hlist_node ksym_hlist; 50 struct hlist_node ksym_hlist;
50}; 51};
@@ -69,9 +70,8 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
69 70
70 rcu_read_lock(); 71 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { 72 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) && 73 if (entry->attr.bp_addr == hbp_hit_addr) {
73 (entry->counter <= MAX_UL_INT)) { 74 atomic64_inc(&entry->counter);
74 entry->counter++;
75 break; 75 break;
76 } 76 }
77 } 77 }
@@ -79,11 +79,12 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
79} 79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */ 80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81 81
82void ksym_hbp_handler(struct perf_event *hbp, void *data) 82void ksym_hbp_handler(struct perf_event *hbp, int nmi,
83 struct perf_sample_data *data,
84 struct pt_regs *regs)
83{ 85{
84 struct ring_buffer_event *event; 86 struct ring_buffer_event *event;
85 struct ksym_trace_entry *entry; 87 struct ksym_trace_entry *entry;
86 struct pt_regs *regs = data;
87 struct ring_buffer *buffer; 88 struct ring_buffer *buffer;
88 int pc; 89 int pc;
89 90
@@ -196,7 +197,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
196 entry->attr.bp_addr = addr; 197 entry->attr.bp_addr = addr;
197 entry->attr.bp_len = HW_BREAKPOINT_LEN_4; 198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
198 199
199 ret = -EAGAIN;
200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, 200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
201 ksym_hbp_handler); 201 ksym_hbp_handler);
202 202
@@ -235,7 +235,8 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
235 mutex_lock(&ksym_tracer_mutex); 235 mutex_lock(&ksym_tracer_mutex);
236 236
237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { 237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
238 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr); 238 ret = trace_seq_printf(s, "%pS:",
239 (void *)(unsigned long)entry->attr.bp_addr);
239 if (entry->attr.bp_type == HW_BREAKPOINT_R) 240 if (entry->attr.bp_type == HW_BREAKPOINT_R)
240 ret = trace_seq_puts(s, "r--\n"); 241 ret = trace_seq_puts(s, "r--\n");
241 else if (entry->attr.bp_type == HW_BREAKPOINT_W) 242 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
@@ -277,21 +278,20 @@ static ssize_t ksym_trace_filter_write(struct file *file,
277{ 278{
278 struct trace_ksym *entry; 279 struct trace_ksym *entry;
279 struct hlist_node *node; 280 struct hlist_node *node;
280 char *input_string, *ksymname = NULL; 281 char *buf, *input_string, *ksymname = NULL;
281 unsigned long ksym_addr = 0; 282 unsigned long ksym_addr = 0;
282 int ret, op, changed = 0; 283 int ret, op, changed = 0;
283 284
284 input_string = kzalloc(count + 1, GFP_KERNEL); 285 buf = kzalloc(count + 1, GFP_KERNEL);
285 if (!input_string) 286 if (!buf)
286 return -ENOMEM; 287 return -ENOMEM;
287 288
288 if (copy_from_user(input_string, buffer, count)) { 289 ret = -EFAULT;
289 kfree(input_string); 290 if (copy_from_user(buf, buffer, count))
290 return -EFAULT; 291 goto out;
291 }
292 input_string[count] = '\0';
293 292
294 strstrip(input_string); 293 buf[count] = '\0';
294 input_string = strstrip(buf);
295 295
296 /* 296 /*
297 * Clear all breakpoints if: 297 * Clear all breakpoints if:
@@ -302,15 +302,13 @@ static ssize_t ksym_trace_filter_write(struct file *file,
302 if (!input_string[0] || !strcmp(input_string, "0") || 302 if (!input_string[0] || !strcmp(input_string, "0") ||
303 !strcmp(input_string, "*:---")) { 303 !strcmp(input_string, "*:---")) {
304 __ksym_trace_reset(); 304 __ksym_trace_reset();
305 kfree(input_string); 305 ret = 0;
306 return count; 306 goto out;
307 } 307 }
308 308
309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); 309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
310 if (ret < 0) { 310 if (ret < 0)
311 kfree(input_string); 311 goto out;
312 return ret;
313 }
314 312
315 mutex_lock(&ksym_tracer_mutex); 313 mutex_lock(&ksym_tracer_mutex);
316 314
@@ -321,7 +319,7 @@ static ssize_t ksym_trace_filter_write(struct file *file,
321 if (entry->attr.bp_type != op) 319 if (entry->attr.bp_type != op)
322 changed = 1; 320 changed = 1;
323 else 321 else
324 goto out; 322 goto out_unlock;
325 break; 323 break;
326 } 324 }
327 } 325 }
@@ -336,28 +334,24 @@ static ssize_t ksym_trace_filter_write(struct file *file,
336 if (IS_ERR(entry->ksym_hbp)) 334 if (IS_ERR(entry->ksym_hbp))
337 ret = PTR_ERR(entry->ksym_hbp); 335 ret = PTR_ERR(entry->ksym_hbp);
338 else 336 else
339 goto out; 337 goto out_unlock;
340 } 338 }
341 /* Error or "symbol:---" case: drop it */ 339 /* Error or "symbol:---" case: drop it */
342 ksym_filter_entry_count--; 340 ksym_filter_entry_count--;
343 hlist_del_rcu(&(entry->ksym_hlist)); 341 hlist_del_rcu(&(entry->ksym_hlist));
344 synchronize_rcu(); 342 synchronize_rcu();
345 kfree(entry); 343 kfree(entry);
346 goto out; 344 goto out_unlock;
347 } else { 345 } else {
348 /* Check for malformed request: (4) */ 346 /* Check for malformed request: (4) */
349 if (op == 0) 347 if (op)
350 goto out; 348 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
351 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
352 } 349 }
353out: 350out_unlock:
354 mutex_unlock(&ksym_tracer_mutex); 351 mutex_unlock(&ksym_tracer_mutex);
355 352out:
356 kfree(input_string); 353 kfree(buf);
357 354 return !ret ? count : ret;
358 if (!ret)
359 ret = count;
360 return ret;
361} 355}
362 356
363static const struct file_operations ksym_tracing_fops = { 357static const struct file_operations ksym_tracing_fops = {
@@ -449,102 +443,77 @@ struct tracer ksym_tracer __read_mostly =
449 .print_line = ksym_trace_output 443 .print_line = ksym_trace_output
450}; 444};
451 445
452__init static int init_ksym_trace(void)
453{
454 struct dentry *d_tracer;
455 struct dentry *entry;
456
457 d_tracer = tracing_init_dentry();
458 ksym_filter_entry_count = 0;
459
460 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
461 NULL, &ksym_tracing_fops);
462 if (!entry)
463 pr_warning("Could not create debugfs "
464 "'ksym_trace_filter' file\n");
465
466 return register_tracer(&ksym_tracer);
467}
468device_initcall(init_ksym_trace);
469
470
471#ifdef CONFIG_PROFILE_KSYM_TRACER 446#ifdef CONFIG_PROFILE_KSYM_TRACER
472static int ksym_tracer_stat_headers(struct seq_file *m) 447static int ksym_profile_show(struct seq_file *m, void *v)
473{ 448{
449 struct hlist_node *node;
450 struct trace_ksym *entry;
451 int access_type = 0;
452 char fn_name[KSYM_NAME_LEN];
453
474 seq_puts(m, " Access Type "); 454 seq_puts(m, " Access Type ");
475 seq_puts(m, " Symbol Counter\n"); 455 seq_puts(m, " Symbol Counter\n");
476 seq_puts(m, " ----------- "); 456 seq_puts(m, " ----------- ");
477 seq_puts(m, " ------ -------\n"); 457 seq_puts(m, " ------ -------\n");
478 return 0;
479}
480 458
481static int ksym_tracer_stat_show(struct seq_file *m, void *v) 459 rcu_read_lock();
482{ 460 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
483 struct hlist_node *stat = v;
484 struct trace_ksym *entry;
485 int access_type = 0;
486 char fn_name[KSYM_NAME_LEN];
487 461
488 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); 462 access_type = entry->attr.bp_type;
489 463
490 access_type = entry->attr.bp_type; 464 switch (access_type) {
465 case HW_BREAKPOINT_R:
466 seq_puts(m, " R ");
467 break;
468 case HW_BREAKPOINT_W:
469 seq_puts(m, " W ");
470 break;
471 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
472 seq_puts(m, " RW ");
473 break;
474 default:
475 seq_puts(m, " NA ");
476 }
491 477
492 switch (access_type) { 478 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
493 case HW_BREAKPOINT_R: 479 seq_printf(m, " %-36s", fn_name);
494 seq_puts(m, " R "); 480 else
495 break; 481 seq_printf(m, " %-36s", "<NA>");
496 case HW_BREAKPOINT_W: 482 seq_printf(m, " %15llu\n",
497 seq_puts(m, " W "); 483 (unsigned long long)atomic64_read(&entry->counter));
498 break;
499 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
500 seq_puts(m, " RW ");
501 break;
502 default:
503 seq_puts(m, " NA ");
504 } 484 }
505 485 rcu_read_unlock();
506 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
507 seq_printf(m, " %-36s", fn_name);
508 else
509 seq_printf(m, " %-36s", "<NA>");
510 seq_printf(m, " %15lu\n", entry->counter);
511 486
512 return 0; 487 return 0;
513} 488}
514 489
515static void *ksym_tracer_stat_start(struct tracer_stat *trace) 490static int ksym_profile_open(struct inode *node, struct file *file)
516{ 491{
517 return ksym_filter_head.first; 492 return single_open(file, ksym_profile_show, NULL);
518} 493}
519 494
520static void * 495static const struct file_operations ksym_profile_fops = {
521ksym_tracer_stat_next(void *v, int idx) 496 .open = ksym_profile_open,
522{ 497 .read = seq_read,
523 struct hlist_node *stat = v; 498 .llseek = seq_lseek,
524 499 .release = single_release,
525 return stat->next;
526}
527
528static struct tracer_stat ksym_tracer_stats = {
529 .name = "ksym_tracer",
530 .stat_start = ksym_tracer_stat_start,
531 .stat_next = ksym_tracer_stat_next,
532 .stat_headers = ksym_tracer_stat_headers,
533 .stat_show = ksym_tracer_stat_show
534}; 500};
501#endif /* CONFIG_PROFILE_KSYM_TRACER */
535 502
536__init static int ksym_tracer_stat_init(void) 503__init static int init_ksym_trace(void)
537{ 504{
538 int ret; 505 struct dentry *d_tracer;
539 506
540 ret = register_stat_tracer(&ksym_tracer_stats); 507 d_tracer = tracing_init_dentry();
541 if (ret) {
542 printk(KERN_WARNING "Warning: could not register "
543 "ksym tracer stats\n");
544 return 1;
545 }
546 508
547 return 0; 509 trace_create_file("ksym_trace_filter", 0644, d_tracer,
510 NULL, &ksym_tracing_fops);
511
512#ifdef CONFIG_PROFILE_KSYM_TRACER
513 trace_create_file("ksym_profile", 0444, d_tracer,
514 NULL, &ksym_profile_fops);
515#endif
516
517 return register_tracer(&ksym_tracer);
548} 518}
549fs_initcall(ksym_tracer_stat_init); 519device_initcall(init_ksym_trace);
550#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b6c12c6a1bc..8e46b3323cd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 23
24static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
25 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s) 26int trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 int ret;
30
31 ret = seq_write(m, s->buffer, len);
29 32
30 seq_write(m, s->buffer, len); 33 /*
34 * Only reset this buffer if we successfully wrote to the
35 * seq_file buffer.
36 */
37 if (!ret)
38 trace_seq_init(s);
31 39
32 trace_seq_init(s); 40 return ret;
33} 41}
34 42
35enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 43enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -85,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
85 va_list ap; 93 va_list ap;
86 int ret; 94 int ret;
87 95
88 if (!len) 96 if (s->full || !len)
89 return 0; 97 return 0;
90 98
91 va_start(ap, fmt); 99 va_start(ap, fmt);
@@ -93,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
93 va_end(ap); 101 va_end(ap);
94 102
95 /* If we can't write it all, don't bother writing anything */ 103 /* If we can't write it all, don't bother writing anything */
96 if (ret >= len) 104 if (ret >= len) {
105 s->full = 1;
97 return 0; 106 return 0;
107 }
98 108
99 s->len += ret; 109 s->len += ret;
100 110
@@ -119,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
119 int len = (PAGE_SIZE - 1) - s->len; 129 int len = (PAGE_SIZE - 1) - s->len;
120 int ret; 130 int ret;
121 131
122 if (!len) 132 if (s->full || !len)
123 return 0; 133 return 0;
124 134
125 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 135 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
126 136
127 /* If we can't write it all, don't bother writing anything */ 137 /* If we can't write it all, don't bother writing anything */
128 if (ret >= len) 138 if (ret >= len) {
139 s->full = 1;
129 return 0; 140 return 0;
141 }
130 142
131 s->len += ret; 143 s->len += ret;
132 144
@@ -139,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
139 int len = (PAGE_SIZE - 1) - s->len; 151 int len = (PAGE_SIZE - 1) - s->len;
140 int ret; 152 int ret;
141 153
142 if (!len) 154 if (s->full || !len)
143 return 0; 155 return 0;
144 156
145 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 157 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
146 158
147 /* If we can't write it all, don't bother writing anything */ 159 /* If we can't write it all, don't bother writing anything */
148 if (ret >= len) 160 if (ret >= len) {
161 s->full = 1;
149 return 0; 162 return 0;
163 }
150 164
151 s->len += ret; 165 s->len += ret;
152 166
@@ -167,8 +181,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
167{ 181{
168 int len = strlen(str); 182 int len = strlen(str);
169 183
170 if (len > ((PAGE_SIZE - 1) - s->len)) 184 if (s->full)
185 return 0;
186
187 if (len > ((PAGE_SIZE - 1) - s->len)) {
188 s->full = 1;
171 return 0; 189 return 0;
190 }
172 191
173 memcpy(s->buffer + s->len, str, len); 192 memcpy(s->buffer + s->len, str, len);
174 s->len += len; 193 s->len += len;
@@ -178,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
178 197
179int trace_seq_putc(struct trace_seq *s, unsigned char c) 198int trace_seq_putc(struct trace_seq *s, unsigned char c)
180{ 199{
181 if (s->len >= (PAGE_SIZE - 1)) 200 if (s->full)
182 return 0; 201 return 0;
183 202
203 if (s->len >= (PAGE_SIZE - 1)) {
204 s->full = 1;
205 return 0;
206 }
207
184 s->buffer[s->len++] = c; 208 s->buffer[s->len++] = c;
185 209
186 return 1; 210 return 1;
@@ -188,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
188 212
189int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
190{ 214{
191 if (len > ((PAGE_SIZE - 1) - s->len)) 215 if (s->full)
192 return 0; 216 return 0;
193 217
218 if (len > ((PAGE_SIZE - 1) - s->len)) {
219 s->full = 1;
220 return 0;
221 }
222
194 memcpy(s->buffer + s->len, mem, len); 223 memcpy(s->buffer + s->len, mem, len);
195 s->len += len; 224 s->len += len;
196 225
@@ -203,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
203 const unsigned char *data = mem; 232 const unsigned char *data = mem;
204 int i, j; 233 int i, j;
205 234
235 if (s->full)
236 return 0;
237
206#ifdef __BIG_ENDIAN 238#ifdef __BIG_ENDIAN
207 for (i = 0, j = 0; i < len; i++) { 239 for (i = 0, j = 0; i < len; i++) {
208#else 240#else
@@ -220,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
220{ 252{
221 void *ret; 253 void *ret;
222 254
223 if (len > ((PAGE_SIZE - 1) - s->len)) 255 if (s->full)
256 return 0;
257
258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1;
224 return NULL; 260 return NULL;
261 }
225 262
226 ret = s->buffer + s->len; 263 ret = s->buffer + s->len;
227 s->len += len; 264 s->len += len;
@@ -233,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
233{ 270{
234 unsigned char *p; 271 unsigned char *p;
235 272
236 if (s->len >= (PAGE_SIZE - 1)) 273 if (s->full)
274 return 0;
275
276 if (s->len >= (PAGE_SIZE - 1)) {
277 s->full = 1;
237 return 0; 278 return 0;
279 }
280
238 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 281 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
239 if (!IS_ERR(p)) { 282 if (!IS_ERR(p)) {
240 p = mangle_path(s->buffer + s->len, p, "\n"); 283 p = mangle_path(s->buffer + s->len, p, "\n");
@@ -247,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
247 return 1; 290 return 1;
248 } 291 }
249 292
293 s->full = 1;
250 return 0; 294 return 0;
251} 295}
252 296
@@ -373,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
373 unsigned long vmstart = 0; 417 unsigned long vmstart = 0;
374 int ret = 1; 418 int ret = 1;
375 419
420 if (s->full)
421 return 0;
422
376 if (mm) { 423 if (mm) {
377 const struct vm_area_struct *vma; 424 const struct vm_area_struct *vma;
378 425
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 26185d72767..0271742abb8 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -28,8 +28,8 @@ static int wakeup_current_cpu;
28static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
29static int wakeup_rt; 29static int wakeup_rt;
30 30
31static raw_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void __wakeup_reset(struct trace_array *tr); 34static void __wakeup_reset(struct trace_array *tr);
35 35
@@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
143 goto out; 143 goto out;
144 144
145 local_irq_save(flags); 145 local_irq_save(flags);
146 __raw_spin_lock(&wakeup_lock); 146 arch_spin_lock(&wakeup_lock);
147 147
148 /* We could race with grabbing wakeup_lock */ 148 /* We could race with grabbing wakeup_lock */
149 if (unlikely(!tracer_enabled || next != wakeup_task)) 149 if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
169 169
170out_unlock: 170out_unlock:
171 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
172 __raw_spin_unlock(&wakeup_lock); 172 arch_spin_unlock(&wakeup_lock);
173 local_irq_restore(flags); 173 local_irq_restore(flags);
174out: 174out:
175 atomic_dec(&wakeup_trace->data[cpu]->disabled); 175 atomic_dec(&wakeup_trace->data[cpu]->disabled);
@@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr)
193 tracing_reset_online_cpus(tr); 193 tracing_reset_online_cpus(tr);
194 194
195 local_irq_save(flags); 195 local_irq_save(flags);
196 __raw_spin_lock(&wakeup_lock); 196 arch_spin_lock(&wakeup_lock);
197 __wakeup_reset(tr); 197 __wakeup_reset(tr);
198 __raw_spin_unlock(&wakeup_lock); 198 arch_spin_unlock(&wakeup_lock);
199 local_irq_restore(flags); 199 local_irq_restore(flags);
200} 200}
201 201
@@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
225 goto out; 225 goto out;
226 226
227 /* interrupts should be off from try_to_wake_up */ 227 /* interrupts should be off from try_to_wake_up */
228 __raw_spin_lock(&wakeup_lock); 228 arch_spin_lock(&wakeup_lock);
229 229
230 /* check for races. */ 230 /* check for races. */
231 if (!tracer_enabled || p->prio >= wakeup_prio) 231 if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 256
257out_locked: 257out_locked:
258 __raw_spin_unlock(&wakeup_lock); 258 arch_spin_unlock(&wakeup_lock);
259out: 259out:
260 atomic_dec(&wakeup_trace->data[cpu]->disabled); 260 atomic_dec(&wakeup_trace->data[cpu]->disabled);
261} 261}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index dc98309e839..280fea470d6 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
67 67
68 /* Don't allow flipping of max traces now */ 68 /* Don't allow flipping of max traces now */
69 local_irq_save(flags); 69 local_irq_save(flags);
70 __raw_spin_lock(&ftrace_max_lock); 70 arch_spin_lock(&ftrace_max_lock);
71 71
72 cnt = ring_buffer_entries(tr->buffer); 72 cnt = ring_buffer_entries(tr->buffer);
73 73
@@ -85,7 +85,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
85 break; 85 break;
86 } 86 }
87 tracing_on(); 87 tracing_on();
88 __raw_spin_unlock(&ftrace_max_lock); 88 arch_spin_unlock(&ftrace_max_lock);
89 local_irq_restore(flags); 89 local_irq_restore(flags);
90 90
91 if (count) 91 if (count)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8504ac71e4e..f4bc9b27de5 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = {
27}; 27};
28 28
29static unsigned long max_stack_size; 29static unsigned long max_stack_size;
30static raw_spinlock_t max_stack_lock = 30static arch_spinlock_t max_stack_lock =
31 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 31 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
32 32
33static int stack_trace_disabled __read_mostly; 33static int stack_trace_disabled __read_mostly;
34static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
@@ -54,7 +54,7 @@ static inline void check_stack(void)
54 return; 54 return;
55 55
56 local_irq_save(flags); 56 local_irq_save(flags);
57 __raw_spin_lock(&max_stack_lock); 57 arch_spin_lock(&max_stack_lock);
58 58
59 /* a race could have already updated it */ 59 /* a race could have already updated it */
60 if (this_size <= max_stack_size) 60 if (this_size <= max_stack_size)
@@ -103,7 +103,7 @@ static inline void check_stack(void)
103 } 103 }
104 104
105 out: 105 out:
106 __raw_spin_unlock(&max_stack_lock); 106 arch_spin_unlock(&max_stack_lock);
107 local_irq_restore(flags); 107 local_irq_restore(flags);
108} 108}
109 109
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
174 __raw_spin_lock(&max_stack_lock); 175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 __raw_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,8 +218,14 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
210 __raw_spin_lock(&max_stack_lock); 224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
213 return SEQ_START_TOKEN; 231 return SEQ_START_TOKEN;
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
220 __raw_spin_unlock(&max_stack_lock); 238 int cpu;
239
240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 57501d90096..75289f372dd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -217,10 +217,6 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
217 int i; 217 int i;
218 int offset = offsetof(typeof(trace), args); 218 int offset = offsetof(typeof(trace), args);
219 219
220 ret = trace_define_common_fields(call);
221 if (ret)
222 return ret;
223
224 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 220 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
225 if (ret) 221 if (ret)
226 return ret; 222 return ret;
@@ -241,10 +237,6 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
241 struct syscall_trace_exit trace; 237 struct syscall_trace_exit trace;
242 int ret; 238 int ret;
243 239
244 ret = trace_define_common_fields(call);
245 if (ret)
246 return ret;
247
248 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 240 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
249 if (ret) 241 if (ret)
250 return ret; 242 return ret;
@@ -333,10 +325,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
333 mutex_lock(&syscall_trace_lock); 325 mutex_lock(&syscall_trace_lock);
334 if (!sys_refcount_enter) 326 if (!sys_refcount_enter)
335 ret = register_trace_sys_enter(ftrace_syscall_enter); 327 ret = register_trace_sys_enter(ftrace_syscall_enter);
336 if (ret) { 328 if (!ret) {
337 pr_info("event trace: Could not activate"
338 "syscall entry trace point");
339 } else {
340 set_bit(num, enabled_enter_syscalls); 329 set_bit(num, enabled_enter_syscalls);
341 sys_refcount_enter++; 330 sys_refcount_enter++;
342 } 331 }
@@ -370,10 +359,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
370 mutex_lock(&syscall_trace_lock); 359 mutex_lock(&syscall_trace_lock);
371 if (!sys_refcount_exit) 360 if (!sys_refcount_exit)
372 ret = register_trace_sys_exit(ftrace_syscall_exit); 361 ret = register_trace_sys_exit(ftrace_syscall_exit);
373 if (ret) { 362 if (!ret) {
374 pr_info("event trace: Could not activate"
375 "syscall exit trace point");
376 } else {
377 set_bit(num, enabled_exit_syscalls); 363 set_bit(num, enabled_exit_syscalls);
378 sys_refcount_exit++; 364 sys_refcount_exit++;
379 } 365 }
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f6693969287..a7974a552ca 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -93,6 +93,7 @@ static const struct stacktrace_ops backtrace_ops = {
93 .warning_symbol = backtrace_warning_symbol, 93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack, 94 .stack = backtrace_stack,
95 .address = backtrace_address, 95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
96}; 97};
97 98
98static int 99static int
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 03e2d6fd9b1..eb27fd3430a 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -6,8 +6,6 @@
6 6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); 7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8 8
9#define URN_LIST_HEAD per_cpu(return_notifier_list, raw_smp_processor_id())
10
11/* 9/*
12 * Request a notification when the current cpu returns to userspace. Must be 10 * Request a notification when the current cpu returns to userspace. Must be
13 * called in atomic context. The notifier will also be called in atomic 11 * called in atomic context. The notifier will also be called in atomic
@@ -16,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
16void user_return_notifier_register(struct user_return_notifier *urn) 14void user_return_notifier_register(struct user_return_notifier *urn)
17{ 15{
18 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); 16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
19 hlist_add_head(&urn->link, &URN_LIST_HEAD); 17 hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list));
20} 18}
21EXPORT_SYMBOL_GPL(user_return_notifier_register); 19EXPORT_SYMBOL_GPL(user_return_notifier_register);
22 20
@@ -27,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
27void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
28{ 26{
29 hlist_del(&urn->link); 27 hlist_del(&urn->link);
30 if (hlist_empty(&URN_LIST_HEAD)) 28 if (hlist_empty(&__get_cpu_var(return_notifier_list)))
31 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); 29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
32} 30}
33EXPORT_SYMBOL_GPL(user_return_notifier_unregister); 31EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b6ae8..dee48658805 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
68#endif 68#endif
69}; 69};
70 70
71#ifdef CONFIG_DEBUG_OBJECTS_WORK
72
73static struct debug_obj_descr work_debug_descr;
74
75/*
76 * fixup_init is called when:
77 * - an active object is initialized
78 */
79static int work_fixup_init(void *addr, enum debug_obj_state state)
80{
81 struct work_struct *work = addr;
82
83 switch (state) {
84 case ODEBUG_STATE_ACTIVE:
85 cancel_work_sync(work);
86 debug_object_init(work, &work_debug_descr);
87 return 1;
88 default:
89 return 0;
90 }
91}
92
93/*
94 * fixup_activate is called when:
95 * - an active object is activated
96 * - an unknown object is activated (might be a statically initialized object)
97 */
98static int work_fixup_activate(void *addr, enum debug_obj_state state)
99{
100 struct work_struct *work = addr;
101
102 switch (state) {
103
104 case ODEBUG_STATE_NOTAVAILABLE:
105 /*
106 * This is not really a fixup. The work struct was
107 * statically initialized. We just make sure that it
108 * is tracked in the object tracker.
109 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr);
113 return 0;
114 }
115 WARN_ON_ONCE(1);
116 return 0;
117
118 case ODEBUG_STATE_ACTIVE:
119 WARN_ON(1);
120
121 default:
122 return 0;
123 }
124}
125
126/*
127 * fixup_free is called when:
128 * - an active object is freed
129 */
130static int work_fixup_free(void *addr, enum debug_obj_state state)
131{
132 struct work_struct *work = addr;
133
134 switch (state) {
135 case ODEBUG_STATE_ACTIVE:
136 cancel_work_sync(work);
137 debug_object_free(work, &work_debug_descr);
138 return 1;
139 default:
140 return 0;
141 }
142}
143
144static struct debug_obj_descr work_debug_descr = {
145 .name = "work_struct",
146 .fixup_init = work_fixup_init,
147 .fixup_activate = work_fixup_activate,
148 .fixup_free = work_fixup_free,
149};
150
151static inline void debug_work_activate(struct work_struct *work)
152{
153 debug_object_activate(work, &work_debug_descr);
154}
155
156static inline void debug_work_deactivate(struct work_struct *work)
157{
158 debug_object_deactivate(work, &work_debug_descr);
159}
160
161void __init_work(struct work_struct *work, int onstack)
162{
163 if (onstack)
164 debug_object_init_on_stack(work, &work_debug_descr);
165 else
166 debug_object_init(work, &work_debug_descr);
167}
168EXPORT_SYMBOL_GPL(__init_work);
169
170void destroy_work_on_stack(struct work_struct *work)
171{
172 debug_object_free(work, &work_debug_descr);
173}
174EXPORT_SYMBOL_GPL(destroy_work_on_stack);
175
176#else
177static inline void debug_work_activate(struct work_struct *work) { }
178static inline void debug_work_deactivate(struct work_struct *work) { }
179#endif
180
71/* Serializes the accesses to the list of workqueues. */ 181/* Serializes the accesses to the list of workqueues. */
72static DEFINE_SPINLOCK(workqueue_lock); 182static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 183static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
145{ 255{
146 unsigned long flags; 256 unsigned long flags;
147 257
258 debug_work_activate(work);
148 spin_lock_irqsave(&cwq->lock, flags); 259 spin_lock_irqsave(&cwq->lock, flags);
149 insert_work(cwq, work, &cwq->worklist); 260 insert_work(cwq, work, &cwq->worklist);
150 spin_unlock_irqrestore(&cwq->lock, flags); 261 spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
280 struct lockdep_map lockdep_map = work->lockdep_map; 391 struct lockdep_map lockdep_map = work->lockdep_map;
281#endif 392#endif
282 trace_workqueue_execution(cwq->thread, work); 393 trace_workqueue_execution(cwq->thread, work);
394 debug_work_deactivate(work);
283 cwq->current_work = work; 395 cwq->current_work = work;
284 list_del_init(cwq->worklist.next); 396 list_del_init(cwq->worklist.next);
285 spin_unlock_irq(&cwq->lock); 397 spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
350static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 462static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
351 struct wq_barrier *barr, struct list_head *head) 463 struct wq_barrier *barr, struct list_head *head)
352{ 464{
353 INIT_WORK(&barr->work, wq_barrier_func); 465 /*
466 * debugobject calls are safe here even with cwq->lock locked
467 * as we know for sure that this will not trigger any of the
468 * checks and call back into the fixup functions where we
469 * might deadlock.
470 */
471 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
354 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 472 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
355 473
356 init_completion(&barr->done); 474 init_completion(&barr->done);
357 475
476 debug_work_activate(&barr->work);
358 insert_work(cwq, &barr->work, head); 477 insert_work(cwq, &barr->work, head);
359} 478}
360 479
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
372 } 491 }
373 spin_unlock_irq(&cwq->lock); 492 spin_unlock_irq(&cwq->lock);
374 493
375 if (active) 494 if (active) {
376 wait_for_completion(&barr.done); 495 wait_for_completion(&barr.done);
496 destroy_work_on_stack(&barr.work);
497 }
377 498
378 return active; 499 return active;
379} 500}
@@ -451,6 +572,7 @@ out:
451 return 0; 572 return 0;
452 573
453 wait_for_completion(&barr.done); 574 wait_for_completion(&barr.done);
575 destroy_work_on_stack(&barr.work);
454 return 1; 576 return 1;
455} 577}
456EXPORT_SYMBOL_GPL(flush_work); 578EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
485 */ 607 */
486 smp_rmb(); 608 smp_rmb();
487 if (cwq == get_wq_data(work)) { 609 if (cwq == get_wq_data(work)) {
610 debug_work_deactivate(work);
488 list_del_init(&work->entry); 611 list_del_init(&work->entry);
489 ret = 1; 612 ret = 1;
490 } 613 }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
507 } 630 }
508 spin_unlock_irq(&cwq->lock); 631 spin_unlock_irq(&cwq->lock);
509 632
510 if (unlikely(running)) 633 if (unlikely(running)) {
511 wait_for_completion(&barr.done); 634 wait_for_completion(&barr.done);
635 destroy_work_on_stack(&barr.work);
636 }
512} 637}
513 638
514static void wait_on_work(struct work_struct *work) 639static void wait_on_work(struct work_struct *work)