aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile1
-rw-r--r--kernel/acct.c3
-rw-r--r--kernel/audit_tree.c13
-rw-r--r--kernel/auditsc.c1
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/cgroup.c23
-rw-r--r--kernel/cpu.c36
-rw-r--r--kernel/cpuset.c18
-rw-r--r--kernel/cred.c2
-rw-r--r--kernel/exit.c54
-rw-r--r--kernel/fork.c31
-rw-r--r--kernel/futex.c117
-rw-r--r--kernel/hrtimer.c171
-rw-r--r--kernel/hw_breakpoint.c212
-rw-r--r--kernel/irq/autoprobe.c20
-rw-r--r--kernel/irq/chip.c86
-rw-r--r--kernel/irq/handle.c22
-rw-r--r--kernel/irq/internals.h2
-rw-r--r--kernel/irq/manage.c50
-rw-r--r--kernel/irq/migration.c2
-rw-r--r--kernel/irq/numa_migrate.c8
-rw-r--r--kernel/irq/pm.c8
-rw-r--r--kernel/irq/proc.c4
-rw-r--r--kernel/irq/spurious.c16
-rw-r--r--kernel/kexec.c65
-rw-r--r--kernel/kfifo.c410
-rw-r--r--kernel/kgdb.c65
-rw-r--r--kernel/kmod.c12
-rw-r--r--kernel/kprobes.c36
-rw-r--r--kernel/ksysfs.c29
-rw-r--r--kernel/kthread.c25
-rw-r--r--kernel/lockdep.c67
-rw-r--r--kernel/module.c208
-rw-r--r--kernel/mutex-debug.h12
-rw-r--r--kernel/notifier.c6
-rw-r--r--kernel/padata.c690
-rw-r--r--kernel/panic.c4
-rw-r--r--kernel/params.c8
-rw-r--r--kernel/perf_event.c832
-rw-r--r--kernel/pid.c14
-rw-r--r--kernel/pm_qos_params.c20
-rw-r--r--kernel/posix-timers.c2
-rw-r--r--kernel/power/Kconfig19
-rw-r--r--kernel/power/console.c7
-rw-r--r--kernel/power/main.c31
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/swap.c4
-rw-r--r--kernel/power/swsusp.c58
-rw-r--r--kernel/power/user.c23
-rw-r--r--kernel/printk.c120
-rw-r--r--kernel/ptrace.c88
-rw-r--r--kernel/rcupdate.c29
-rw-r--r--kernel/rcutorture.c102
-rw-r--r--kernel/rcutree.c268
-rw-r--r--kernel/rcutree.h61
-rw-r--r--kernel/rcutree_plugin.h229
-rw-r--r--kernel/rcutree_trace.c14
-rw-r--r--kernel/relay.c2
-rw-r--r--kernel/resource.c85
-rw-r--r--kernel/rtmutex-debug.c4
-rw-r--r--kernel/rtmutex.c106
-rw-r--r--kernel/sched.c2884
-rw-r--r--kernel/sched_clock.c23
-rw-r--r--kernel/sched_cpupri.c14
-rw-r--r--kernel/sched_cpupri.h2
-rw-r--r--kernel/sched_debug.c17
-rw-r--r--kernel/sched_fair.c1907
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_idletask.c27
-rw-r--r--kernel/sched_rt.c118
-rw-r--r--kernel/signal.c66
-rw-r--r--kernel/smp.c45
-rw-r--r--kernel/softirq.c19
-rw-r--r--kernel/softlockup.c69
-rw-r--r--kernel/spinlock.c306
-rw-r--r--kernel/srcu.c52
-rw-r--r--kernel/sys.c31
-rw-r--r--kernel/sysctl.c50
-rw-r--r--kernel/sysctl_binary.c38
-rw-r--r--kernel/time.c1
-rw-r--r--kernel/time/clockevents.c33
-rw-r--r--kernel/time/clocksource.c34
-rw-r--r--kernel/time/ntp.c10
-rw-r--r--kernel/time/tick-broadcast.c42
-rw-r--r--kernel/time/tick-common.c20
-rw-r--r--kernel/time/tick-internal.h1
-rw-r--r--kernel/time/timecompare.c2
-rw-r--r--kernel/time/timekeeping.c30
-rw-r--r--kernel/time/timer_list.c15
-rw-r--r--kernel/time/timer_stats.c18
-rw-r--r--kernel/timer.c5
-rw-r--r--kernel/trace/Kconfig125
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/ftrace.c135
-rw-r--r--kernel/trace/power-traces.c2
-rw-r--r--kernel/trace/ring_buffer.c73
-rw-r--r--kernel/trace/trace.c452
-rw-r--r--kernel/trace/trace.h33
-rw-r--r--kernel/trace/trace_branch.c19
-rw-r--r--kernel/trace/trace_clock.c8
-rw-r--r--kernel/trace/trace_event_profile.c58
-rw-r--r--kernel/trace/trace_events.c122
-rw-r--r--kernel/trace/trace_events_filter.c33
-rw-r--r--kernel/trace/trace_export.c98
-rw-r--r--kernel/trace/trace_functions_graph.c241
-rw-r--r--kernel/trace/trace_hw_branches.c51
-rw-r--r--kernel/trace/trace_irqsoff.c2
-rw-r--r--kernel/trace/trace_kprobe.c382
-rw-r--r--kernel/trace/trace_ksym.c193
-rw-r--r--kernel/trace/trace_output.c75
-rw-r--r--kernel/trace/trace_sched_wakeup.c16
-rw-r--r--kernel/trace/trace_selftest.c4
-rw-r--r--kernel/trace/trace_stack.c40
-rw-r--r--kernel/trace/trace_syscalls.c207
-rw-r--r--kernel/trace/trace_sysprof.c1
-rw-r--r--kernel/user.c305
-rw-r--r--kernel/workqueue.c131
117 files changed, 7593 insertions, 5762 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 864ff75d65f2..6aebdeb2aa34 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -100,6 +100,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
100obj-$(CONFIG_PERF_EVENTS) += perf_event.o 100obj-$(CONFIG_PERF_EVENTS) += perf_event.o
101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 101obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 102obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
103obj-$(CONFIG_PADATA) += padata.o
103 104
104ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 105ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
105# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 106# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 9a4715a2f6bf..a6605ca921b6 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -536,7 +536,8 @@ static void do_acct_process(struct bsd_acct_struct *acct,
536 do_div(elapsed, AHZ); 536 do_div(elapsed, AHZ);
537 ac.ac_btime = get_seconds() - elapsed; 537 ac.ac_btime = get_seconds() - elapsed;
538 /* we really need to bite the bullet and change layout */ 538 /* we really need to bite the bullet and change layout */
539 current_uid_gid(&ac.ac_uid, &ac.ac_gid); 539 ac.ac_uid = orig_cred->uid;
540 ac.ac_gid = orig_cred->gid;
540#if ACCT_VERSION==2 541#if ACCT_VERSION==2
541 ac.ac_ahz = AHZ; 542 ac.ac_ahz = AHZ;
542#endif 543#endif
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 2451dc6f3282..4b05bd9479db 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -277,7 +277,7 @@ static void untag_chunk(struct node *p)
277 owner->root = NULL; 277 owner->root = NULL;
278 } 278 }
279 279
280 for (i = j = 0; i < size; i++, j++) { 280 for (i = j = 0; j <= size; i++, j++) {
281 struct audit_tree *s; 281 struct audit_tree *s;
282 if (&chunk->owners[j] == p) { 282 if (&chunk->owners[j] == p) {
283 list_del_init(&p->list); 283 list_del_init(&p->list);
@@ -290,7 +290,7 @@ static void untag_chunk(struct node *p)
290 if (!s) /* result of earlier fallback */ 290 if (!s) /* result of earlier fallback */
291 continue; 291 continue;
292 get_tree(s); 292 get_tree(s);
293 list_replace_init(&chunk->owners[i].list, &new->owners[j].list); 293 list_replace_init(&chunk->owners[j].list, &new->owners[i].list);
294 } 294 }
295 295
296 list_replace_rcu(&chunk->hash, &new->hash); 296 list_replace_rcu(&chunk->hash, &new->hash);
@@ -373,15 +373,17 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
373 for (n = 0; n < old->count; n++) { 373 for (n = 0; n < old->count; n++) {
374 if (old->owners[n].owner == tree) { 374 if (old->owners[n].owner == tree) {
375 spin_unlock(&hash_lock); 375 spin_unlock(&hash_lock);
376 put_inotify_watch(watch); 376 put_inotify_watch(&old->watch);
377 return 0; 377 return 0;
378 } 378 }
379 } 379 }
380 spin_unlock(&hash_lock); 380 spin_unlock(&hash_lock);
381 381
382 chunk = alloc_chunk(old->count + 1); 382 chunk = alloc_chunk(old->count + 1);
383 if (!chunk) 383 if (!chunk) {
384 put_inotify_watch(&old->watch);
384 return -ENOMEM; 385 return -ENOMEM;
386 }
385 387
386 mutex_lock(&inode->inotify_mutex); 388 mutex_lock(&inode->inotify_mutex);
387 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) { 389 if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
@@ -425,7 +427,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
425 spin_unlock(&hash_lock); 427 spin_unlock(&hash_lock);
426 inotify_evict_watch(&old->watch); 428 inotify_evict_watch(&old->watch);
427 mutex_unlock(&inode->inotify_mutex); 429 mutex_unlock(&inode->inotify_mutex);
428 put_inotify_watch(&old->watch); 430 put_inotify_watch(&old->watch); /* pair to inotify_find_watch */
431 put_inotify_watch(&old->watch); /* and kill it */
429 return 0; 432 return 0;
430} 433}
431 434
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 267e484f0198..fc0f928167e7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -250,7 +250,6 @@ struct audit_context {
250#endif 250#endif
251}; 251};
252 252
253#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
254static inline int open_arg(int flags, int mask) 253static inline int open_arg(int flags, int mask)
255{ 254{
256 int n = ACC_MODE(flags); 255 int n = ACC_MODE(flags);
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 3c5301381837..98a51f26c136 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -12,7 +12,7 @@
12 12
13void foo(void) 13void foo(void)
14{ 14{
15 /* The enum constants to put into include/linux/bounds.h */ 15 /* The enum constants to put into include/generated/bounds.h */
16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 16 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 17 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
18 /* End of constants */ 18 /* End of constants */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0249f4be9b5c..4fd90e129772 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -23,6 +23,7 @@
23 */ 23 */
24 24
25#include <linux/cgroup.h> 25#include <linux/cgroup.h>
26#include <linux/module.h>
26#include <linux/ctype.h> 27#include <linux/ctype.h>
27#include <linux/errno.h> 28#include <linux/errno.h>
28#include <linux/fs.h> 29#include <linux/fs.h>
@@ -166,6 +167,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock);
166 */ 167 */
167static int need_forkexit_callback __read_mostly; 168static int need_forkexit_callback __read_mostly;
168 169
170#ifdef CONFIG_PROVE_LOCKING
171int cgroup_lock_is_held(void)
172{
173 return lockdep_is_held(&cgroup_mutex);
174}
175#else /* #ifdef CONFIG_PROVE_LOCKING */
176int cgroup_lock_is_held(void)
177{
178 return mutex_is_locked(&cgroup_mutex);
179}
180#endif /* #else #ifdef CONFIG_PROVE_LOCKING */
181
182EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
183
169/* convenient tests for these bits */ 184/* convenient tests for these bits */
170inline int cgroup_is_removed(const struct cgroup *cgrp) 185inline int cgroup_is_removed(const struct cgroup *cgrp)
171{ 186{
@@ -2468,7 +2483,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
2468 /* make sure l doesn't vanish out from under us */ 2483 /* make sure l doesn't vanish out from under us */
2469 down_write(&l->mutex); 2484 down_write(&l->mutex);
2470 mutex_unlock(&cgrp->pidlist_mutex); 2485 mutex_unlock(&cgrp->pidlist_mutex);
2471 l->use_count++;
2472 return l; 2486 return l;
2473 } 2487 }
2474 } 2488 }
@@ -2937,14 +2951,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2937 2951
2938 for_each_subsys(root, ss) { 2952 for_each_subsys(root, ss) {
2939 struct cgroup_subsys_state *css = ss->create(ss, cgrp); 2953 struct cgroup_subsys_state *css = ss->create(ss, cgrp);
2954
2940 if (IS_ERR(css)) { 2955 if (IS_ERR(css)) {
2941 err = PTR_ERR(css); 2956 err = PTR_ERR(css);
2942 goto err_destroy; 2957 goto err_destroy;
2943 } 2958 }
2944 init_cgroup_css(css, ss, cgrp); 2959 init_cgroup_css(css, ss, cgrp);
2945 if (ss->use_id) 2960 if (ss->use_id) {
2946 if (alloc_css_id(ss, parent, cgrp)) 2961 err = alloc_css_id(ss, parent, cgrp);
2962 if (err)
2947 goto err_destroy; 2963 goto err_destroy;
2964 }
2948 /* At error, ->destroy() callback has to free assigned ID. */ 2965 /* At error, ->destroy() callback has to free assigned ID. */
2949 } 2966 }
2950 2967
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 7c4e2713df0a..677f25376a38 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu)
151 151
152 write_lock_irq(&tasklist_lock); 152 write_lock_irq(&tasklist_lock);
153 for_each_process(p) { 153 for_each_process(p) {
154 if (task_cpu(p) == cpu && 154 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
155 (!cputime_eq(p->utime, cputime_zero) || 155 (!cputime_eq(p->utime, cputime_zero) ||
156 !cputime_eq(p->stime, cputime_zero))) 156 !cputime_eq(p->stime, cputime_zero)))
157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ 157 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
158 (state = %ld, flags = %x) \n", 158 "(state = %ld, flags = %x)\n",
159 p->comm, task_pid_nr(p), cpu, 159 p->comm, task_pid_nr(p), cpu,
160 p->state, p->flags); 160 p->state, p->flags);
161 } 161 }
162 write_unlock_irq(&tasklist_lock); 162 write_unlock_irq(&tasklist_lock);
163} 163}
@@ -209,9 +209,12 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
209 return -ENOMEM; 209 return -ENOMEM;
210 210
211 cpu_hotplug_begin(); 211 cpu_hotplug_begin();
212 set_cpu_active(cpu, false);
212 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 213 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
213 hcpu, -1, &nr_calls); 214 hcpu, -1, &nr_calls);
214 if (err == NOTIFY_BAD) { 215 if (err == NOTIFY_BAD) {
216 set_cpu_active(cpu, true);
217
215 nr_calls--; 218 nr_calls--;
216 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 219 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
217 hcpu, nr_calls, NULL); 220 hcpu, nr_calls, NULL);
@@ -223,11 +226,11 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
223 226
224 /* Ensure that we are not runnable on dying cpu */ 227 /* Ensure that we are not runnable on dying cpu */
225 cpumask_copy(old_allowed, &current->cpus_allowed); 228 cpumask_copy(old_allowed, &current->cpus_allowed);
226 set_cpus_allowed_ptr(current, 229 set_cpus_allowed_ptr(current, cpu_active_mask);
227 cpumask_of(cpumask_any_but(cpu_online_mask, cpu)));
228 230
229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 231 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
230 if (err) { 232 if (err) {
233 set_cpu_active(cpu, true);
231 /* CPU didn't die: tell everyone. Can't complain. */ 234 /* CPU didn't die: tell everyone. Can't complain. */
232 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
233 hcpu) == NOTIFY_BAD) 236 hcpu) == NOTIFY_BAD)
@@ -278,23 +281,8 @@ int __ref cpu_down(unsigned int cpu)
278 goto out; 281 goto out;
279 } 282 }
280 283
281 set_cpu_active(cpu, false);
282
283 /*
284 * Make sure the all cpus did the reschedule and are not
285 * using stale version of the cpu_active_mask.
286 * This is not strictly necessary becuase stop_machine()
287 * that we run down the line already provides the required
288 * synchronization. But it's really a side effect and we do not
289 * want to depend on the innards of the stop_machine here.
290 */
291 synchronize_sched();
292
293 err = _cpu_down(cpu, 0); 284 err = _cpu_down(cpu, 0);
294 285
295 if (cpu_online(cpu))
296 set_cpu_active(cpu, true);
297
298out: 286out:
299 cpu_maps_update_done(); 287 cpu_maps_update_done();
300 stop_machine_destroy(); 288 stop_machine_destroy();
@@ -383,10 +371,12 @@ int disable_nonboot_cpus(void)
383 return error; 371 return error;
384 cpu_maps_update_begin(); 372 cpu_maps_update_begin();
385 first_cpu = cpumask_first(cpu_online_mask); 373 first_cpu = cpumask_first(cpu_online_mask);
386 /* We take down all of the non-boot CPUs in one shot to avoid races 374 /*
375 * We take down all of the non-boot CPUs in one shot to avoid races
387 * with the userspace trying to use the CPU hotplug at the same time 376 * with the userspace trying to use the CPU hotplug at the same time
388 */ 377 */
389 cpumask_clear(frozen_cpus); 378 cpumask_clear(frozen_cpus);
379
390 printk("Disabling non-boot CPUs ...\n"); 380 printk("Disabling non-boot CPUs ...\n");
391 for_each_online_cpu(cpu) { 381 for_each_online_cpu(cpu) {
392 if (cpu == first_cpu) 382 if (cpu == first_cpu)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3cf2183b472d..ba401fab459f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -737,7 +737,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
737{ 737{
738} 738}
739 739
740static int generate_sched_domains(struct cpumask **domains, 740static int generate_sched_domains(cpumask_var_t **domains,
741 struct sched_domain_attr **attributes) 741 struct sched_domain_attr **attributes)
742{ 742{
743 *domains = NULL; 743 *domains = NULL;
@@ -872,7 +872,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
872 if (retval < 0) 872 if (retval < 0)
873 return retval; 873 return retval;
874 874
875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_online_mask)) 875 if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
876 return -EINVAL; 876 return -EINVAL;
877 } 877 }
878 retval = validate_change(cs, trialcs); 878 retval = validate_change(cs, trialcs);
@@ -2010,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2010 } 2010 }
2011 2011
2012 /* Continue past cpusets with all cpus, mems online */ 2012 /* Continue past cpusets with all cpus, mems online */
2013 if (cpumask_subset(cp->cpus_allowed, cpu_online_mask) && 2013 if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) &&
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2015 continue;
2016 2016
@@ -2019,7 +2019,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2019 /* Remove offline cpus and mems from this cpuset. */ 2019 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2020 mutex_lock(&callback_mutex);
2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed, 2021 cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
2022 cpu_online_mask); 2022 cpu_active_mask);
2023 nodes_and(cp->mems_allowed, cp->mems_allowed, 2023 nodes_and(cp->mems_allowed, cp->mems_allowed,
2024 node_states[N_HIGH_MEMORY]); 2024 node_states[N_HIGH_MEMORY]);
2025 mutex_unlock(&callback_mutex); 2025 mutex_unlock(&callback_mutex);
@@ -2057,8 +2057,10 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2057 switch (phase) { 2057 switch (phase) {
2058 case CPU_ONLINE: 2058 case CPU_ONLINE:
2059 case CPU_ONLINE_FROZEN: 2059 case CPU_ONLINE_FROZEN:
2060 case CPU_DEAD: 2060 case CPU_DOWN_PREPARE:
2061 case CPU_DEAD_FROZEN: 2061 case CPU_DOWN_PREPARE_FROZEN:
2062 case CPU_DOWN_FAILED:
2063 case CPU_DOWN_FAILED_FROZEN:
2062 break; 2064 break;
2063 2065
2064 default: 2066 default:
@@ -2067,7 +2069,7 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2067 2069
2068 cgroup_lock(); 2070 cgroup_lock();
2069 mutex_lock(&callback_mutex); 2071 mutex_lock(&callback_mutex);
2070 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2072 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2071 mutex_unlock(&callback_mutex); 2073 mutex_unlock(&callback_mutex);
2072 scan_for_empty_cpusets(&top_cpuset); 2074 scan_for_empty_cpusets(&top_cpuset);
2073 ndoms = generate_sched_domains(&doms, &attr); 2075 ndoms = generate_sched_domains(&doms, &attr);
@@ -2114,7 +2116,7 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2114 2116
2115void __init cpuset_init_smp(void) 2117void __init cpuset_init_smp(void)
2116{ 2118{
2117 cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask); 2119 cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
2118 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2120 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2119 2121
2120 hotcpu_notifier(cpuset_track_online_cpus, 0); 2122 hotcpu_notifier(cpuset_track_online_cpus, 0);
diff --git a/kernel/cred.c b/kernel/cred.c
index dd76cfe5f5b0..1ed8ca18790c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void)
224#ifdef CONFIG_KEYS 224#ifdef CONFIG_KEYS
225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); 225 new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL);
226 if (!new->tgcred) { 226 if (!new->tgcred) {
227 kfree(new); 227 kmem_cache_free(cred_jar, new);
228 return NULL; 228 return NULL;
229 } 229 }
230 atomic_set(&new->tgcred->usage, 1); 230 atomic_set(&new->tgcred->usage, 1);
diff --git a/kernel/exit.c b/kernel/exit.c
index 1143012951e9..45ed043b8bf5 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -68,10 +68,10 @@ static void __unhash_process(struct task_struct *p)
68 detach_pid(p, PIDTYPE_SID); 68 detach_pid(p, PIDTYPE_SID);
69 69
70 list_del_rcu(&p->tasks); 70 list_del_rcu(&p->tasks);
71 list_del_init(&p->sibling);
71 __get_cpu_var(process_counts)--; 72 __get_cpu_var(process_counts)--;
72 } 73 }
73 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
74 list_del_init(&p->sibling);
75} 75}
76 76
77/* 77/*
@@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk)
85 BUG_ON(!sig); 85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count)); 86 BUG_ON(!atomic_read(&sig->count));
87 87
88 sighand = rcu_dereference(tsk->sighand); 88 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() ||
90 lockdep_is_held(&tasklist_lock));
89 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
90 92
91 posix_cpu_timers_exit(tsk); 93 posix_cpu_timers_exit(tsk);
@@ -170,8 +172,10 @@ void release_task(struct task_struct * p)
170repeat: 172repeat:
171 tracehook_prepare_release_task(p); 173 tracehook_prepare_release_task(p);
172 /* don't need to get the RCU readlock here - the process is dead and 174 /* don't need to get the RCU readlock here - the process is dead and
173 * can't be modifying its own credentials */ 175 * can't be modifying its own credentials. But shut RCU-lockdep up */
176 rcu_read_lock();
174 atomic_dec(&__task_cred(p)->user->processes); 177 atomic_dec(&__task_cred(p)->user->processes);
178 rcu_read_unlock();
175 179
176 proc_flush_task(p); 180 proc_flush_task(p);
177 181
@@ -473,9 +477,11 @@ static void close_files(struct files_struct * files)
473 /* 477 /*
474 * It is safe to dereference the fd table without RCU or 478 * It is safe to dereference the fd table without RCU or
475 * ->file_lock because this is the last reference to the 479 * ->file_lock because this is the last reference to the
476 * files structure. 480 * files structure. But use RCU to shut RCU-lockdep up.
477 */ 481 */
482 rcu_read_lock();
478 fdt = files_fdtable(files); 483 fdt = files_fdtable(files);
484 rcu_read_unlock();
479 for (;;) { 485 for (;;) {
480 unsigned long set; 486 unsigned long set;
481 i = j * __NFDBITS; 487 i = j * __NFDBITS;
@@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files)
521 * at the end of the RCU grace period. Otherwise, 527 * at the end of the RCU grace period. Otherwise,
522 * you can free files immediately. 528 * you can free files immediately.
523 */ 529 */
530 rcu_read_lock();
524 fdt = files_fdtable(files); 531 fdt = files_fdtable(files);
525 if (fdt != &files->fdtab) 532 if (fdt != &files->fdtab)
526 kmem_cache_free(files_cachep, files); 533 kmem_cache_free(files_cachep, files);
527 free_fdtable(fdt); 534 free_fdtable(fdt);
535 rcu_read_unlock();
528 } 536 }
529} 537}
530 538
@@ -736,12 +744,9 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
736/* 744/*
737* Any that need to be release_task'd are put on the @dead list. 745* Any that need to be release_task'd are put on the @dead list.
738 */ 746 */
739static void reparent_thread(struct task_struct *father, struct task_struct *p, 747static void reparent_leader(struct task_struct *father, struct task_struct *p,
740 struct list_head *dead) 748 struct list_head *dead)
741{ 749{
742 if (p->pdeath_signal)
743 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
744
745 list_move_tail(&p->sibling, &p->real_parent->children); 750 list_move_tail(&p->sibling, &p->real_parent->children);
746 751
747 if (task_detached(p)) 752 if (task_detached(p))
@@ -780,12 +785,18 @@ static void forget_original_parent(struct task_struct *father)
780 reaper = find_new_reaper(father); 785 reaper = find_new_reaper(father);
781 786
782 list_for_each_entry_safe(p, n, &father->children, sibling) { 787 list_for_each_entry_safe(p, n, &father->children, sibling) {
783 p->real_parent = reaper; 788 struct task_struct *t = p;
784 if (p->parent == father) { 789 do {
785 BUG_ON(task_ptrace(p)); 790 t->real_parent = reaper;
786 p->parent = p->real_parent; 791 if (t->parent == father) {
787 } 792 BUG_ON(task_ptrace(t));
788 reparent_thread(father, p, &dead_children); 793 t->parent = t->real_parent;
794 }
795 if (t->pdeath_signal)
796 group_send_sig_info(t->pdeath_signal,
797 SEND_SIG_NOINFO, t);
798 } while_each_thread(p, t);
799 reparent_leader(father, p, &dead_children);
789 } 800 }
790 write_unlock_irq(&tasklist_lock); 801 write_unlock_irq(&tasklist_lock);
791 802
@@ -933,7 +944,7 @@ NORET_TYPE void do_exit(long code)
933 * an exiting task cleaning up the robust pi futexes. 944 * an exiting task cleaning up the robust pi futexes.
934 */ 945 */
935 smp_mb(); 946 smp_mb();
936 spin_unlock_wait(&tsk->pi_lock); 947 raw_spin_unlock_wait(&tsk->pi_lock);
937 948
938 if (unlikely(in_atomic())) 949 if (unlikely(in_atomic()))
939 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 950 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
@@ -971,7 +982,7 @@ NORET_TYPE void do_exit(long code)
971 exit_thread(); 982 exit_thread();
972 cgroup_exit(tsk, 1); 983 cgroup_exit(tsk, 1);
973 984
974 if (group_dead && tsk->signal->leader) 985 if (group_dead)
975 disassociate_ctty(1); 986 disassociate_ctty(1);
976 987
977 module_put(task_thread_info(tsk)->exec_domain->module); 988 module_put(task_thread_info(tsk)->exec_domain->module);
@@ -1551,14 +1562,9 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1551 struct task_struct *p; 1562 struct task_struct *p;
1552 1563
1553 list_for_each_entry(p, &tsk->children, sibling) { 1564 list_for_each_entry(p, &tsk->children, sibling) {
1554 /* 1565 int ret = wait_consider_task(wo, 0, p);
1555 * Do not consider detached threads. 1566 if (ret)
1556 */ 1567 return ret;
1557 if (!task_detached(p)) {
1558 int ret = wait_consider_task(wo, 0, p);
1559 if (ret)
1560 return ret;
1561 }
1562 } 1568 }
1563 1569
1564 return 0; 1570 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 1415dc4598ae..17bbf093356d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -86,6 +86,7 @@ int max_threads; /* tunable limit on nr_threads */
86DEFINE_PER_CPU(unsigned long, process_counts) = 0; 86DEFINE_PER_CPU(unsigned long, process_counts) = 0;
87 87
88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 88__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
89EXPORT_SYMBOL_GPL(tasklist_lock);
89 90
90int nr_processes(void) 91int nr_processes(void)
91{ 92{
@@ -939,9 +940,9 @@ SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
939 940
940static void rt_mutex_init_task(struct task_struct *p) 941static void rt_mutex_init_task(struct task_struct *p)
941{ 942{
942 spin_lock_init(&p->pi_lock); 943 raw_spin_lock_init(&p->pi_lock);
943#ifdef CONFIG_RT_MUTEXES 944#ifdef CONFIG_RT_MUTEXES
944 plist_head_init(&p->pi_waiters, &p->pi_lock); 945 plist_head_init_raw(&p->pi_waiters, &p->pi_lock);
945 p->pi_blocked_on = NULL; 946 p->pi_blocked_on = NULL;
946#endif 947#endif
947} 948}
@@ -1127,6 +1128,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1127#ifdef CONFIG_DEBUG_MUTEXES 1128#ifdef CONFIG_DEBUG_MUTEXES
1128 p->blocked_on = NULL; /* not blocked yet */ 1129 p->blocked_on = NULL; /* not blocked yet */
1129#endif 1130#endif
1131#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1132 p->memcg_batch.do_batch = 0;
1133 p->memcg_batch.memcg = NULL;
1134#endif
1130 1135
1131 p->bts = NULL; 1136 p->bts = NULL;
1132 1137
@@ -1206,9 +1211,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1206 p->sas_ss_sp = p->sas_ss_size = 0; 1211 p->sas_ss_sp = p->sas_ss_size = 0;
1207 1212
1208 /* 1213 /*
1209 * Syscall tracing should be turned off in the child regardless 1214 * Syscall tracing and stepping should be turned off in the
1210 * of CLONE_PTRACE. 1215 * child regardless of CLONE_PTRACE.
1211 */ 1216 */
1217 user_disable_single_step(p);
1212 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); 1218 clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
1213#ifdef TIF_SYSCALL_EMU 1219#ifdef TIF_SYSCALL_EMU
1214 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1220 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
@@ -1236,21 +1242,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1236 /* Need tasklist lock for parent etc handling! */ 1242 /* Need tasklist lock for parent etc handling! */
1237 write_lock_irq(&tasklist_lock); 1243 write_lock_irq(&tasklist_lock);
1238 1244
1239 /*
1240 * The task hasn't been attached yet, so its cpus_allowed mask will
1241 * not be changed, nor will its assigned CPU.
1242 *
1243 * The cpus_allowed mask of the parent may have changed after it was
1244 * copied first time - so re-copy it here, then check the child's CPU
1245 * to ensure it is on a valid CPU (and if not, just force it back to
1246 * parent's CPU). This avoids alot of nasty races.
1247 */
1248 p->cpus_allowed = current->cpus_allowed;
1249 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1250 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1251 !cpu_online(task_cpu(p))))
1252 set_task_cpu(p, smp_processor_id());
1253
1254 /* CLONE_PARENT re-uses the old parent */ 1245 /* CLONE_PARENT re-uses the old parent */
1255 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { 1246 if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
1256 p->real_parent = current->real_parent; 1247 p->real_parent = current->real_parent;
@@ -1286,7 +1277,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1286 } 1277 }
1287 1278
1288 if (likely(p->pid)) { 1279 if (likely(p->pid)) {
1289 list_add_tail(&p->sibling, &p->real_parent->children);
1290 tracehook_finish_clone(p, clone_flags, trace); 1280 tracehook_finish_clone(p, clone_flags, trace);
1291 1281
1292 if (thread_group_leader(p)) { 1282 if (thread_group_leader(p)) {
@@ -1298,6 +1288,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1298 p->signal->tty = tty_kref_get(current->signal->tty); 1288 p->signal->tty = tty_kref_get(current->signal->tty);
1299 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1289 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1300 attach_pid(p, PIDTYPE_SID, task_session(current)); 1290 attach_pid(p, PIDTYPE_SID, task_session(current));
1291 list_add_tail(&p->sibling, &p->real_parent->children);
1301 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1292 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1302 __get_cpu_var(process_counts)++; 1293 __get_cpu_var(process_counts)++;
1303 } 1294 }
diff --git a/kernel/futex.c b/kernel/futex.c
index fb65e822fc41..e7a35f1039e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
203 * @uaddr: virtual address of the futex 203 * @uaddr: virtual address of the futex
204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED 204 * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
205 * @key: address where result is stored. 205 * @key: address where result is stored.
206 * @rw: mapping needs to be read/write (values: VERIFY_READ,
207 * VERIFY_WRITE)
208 * 206 *
209 * Returns a negative error code or 0 207 * Returns a negative error code or 0
210 * The key words are stored in *key on success. 208 * The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
216 * lock_page() might sleep, the caller should not hold a spinlock. 214 * lock_page() might sleep, the caller should not hold a spinlock.
217 */ 215 */
218static int 216static int
219get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) 217get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
220{ 218{
221 unsigned long address = (unsigned long)uaddr; 219 unsigned long address = (unsigned long)uaddr;
222 struct mm_struct *mm = current->mm; 220 struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
239 * but access_ok() should be faster than find_vma() 237 * but access_ok() should be faster than find_vma()
240 */ 238 */
241 if (!fshared) { 239 if (!fshared) {
242 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 240 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
243 return -EFAULT; 241 return -EFAULT;
244 key->private.mm = mm; 242 key->private.mm = mm;
245 key->private.address = address; 243 key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
248 } 246 }
249 247
250again: 248again:
251 err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page); 249 err = get_user_pages_fast(address, 1, 1, &page);
252 if (err < 0) 250 if (err < 0)
253 return err; 251 return err;
254 252
@@ -304,8 +302,14 @@ void put_futex_key(int fshared, union futex_key *key)
304 */ 302 */
305static int fault_in_user_writeable(u32 __user *uaddr) 303static int fault_in_user_writeable(u32 __user *uaddr)
306{ 304{
307 int ret = get_user_pages(current, current->mm, (unsigned long)uaddr, 305 struct mm_struct *mm = current->mm;
308 1, 1, 0, NULL, NULL); 306 int ret;
307
308 down_read(&mm->mmap_sem);
309 ret = get_user_pages(current, mm, (unsigned long)uaddr,
310 1, 1, 0, NULL, NULL);
311 up_read(&mm->mmap_sem);
312
309 return ret < 0 ? ret : 0; 313 return ret < 0 ? ret : 0;
310} 314}
311 315
@@ -397,9 +401,9 @@ static void free_pi_state(struct futex_pi_state *pi_state)
397 * and has cleaned up the pi_state already 401 * and has cleaned up the pi_state already
398 */ 402 */
399 if (pi_state->owner) { 403 if (pi_state->owner) {
400 spin_lock_irq(&pi_state->owner->pi_lock); 404 raw_spin_lock_irq(&pi_state->owner->pi_lock);
401 list_del_init(&pi_state->list); 405 list_del_init(&pi_state->list);
402 spin_unlock_irq(&pi_state->owner->pi_lock); 406 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
403 407
404 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); 408 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
405 } 409 }
@@ -464,18 +468,18 @@ void exit_pi_state_list(struct task_struct *curr)
464 * pi_state_list anymore, but we have to be careful 468 * pi_state_list anymore, but we have to be careful
465 * versus waiters unqueueing themselves: 469 * versus waiters unqueueing themselves:
466 */ 470 */
467 spin_lock_irq(&curr->pi_lock); 471 raw_spin_lock_irq(&curr->pi_lock);
468 while (!list_empty(head)) { 472 while (!list_empty(head)) {
469 473
470 next = head->next; 474 next = head->next;
471 pi_state = list_entry(next, struct futex_pi_state, list); 475 pi_state = list_entry(next, struct futex_pi_state, list);
472 key = pi_state->key; 476 key = pi_state->key;
473 hb = hash_futex(&key); 477 hb = hash_futex(&key);
474 spin_unlock_irq(&curr->pi_lock); 478 raw_spin_unlock_irq(&curr->pi_lock);
475 479
476 spin_lock(&hb->lock); 480 spin_lock(&hb->lock);
477 481
478 spin_lock_irq(&curr->pi_lock); 482 raw_spin_lock_irq(&curr->pi_lock);
479 /* 483 /*
480 * We dropped the pi-lock, so re-check whether this 484 * We dropped the pi-lock, so re-check whether this
481 * task still owns the PI-state: 485 * task still owns the PI-state:
@@ -489,15 +493,15 @@ void exit_pi_state_list(struct task_struct *curr)
489 WARN_ON(list_empty(&pi_state->list)); 493 WARN_ON(list_empty(&pi_state->list));
490 list_del_init(&pi_state->list); 494 list_del_init(&pi_state->list);
491 pi_state->owner = NULL; 495 pi_state->owner = NULL;
492 spin_unlock_irq(&curr->pi_lock); 496 raw_spin_unlock_irq(&curr->pi_lock);
493 497
494 rt_mutex_unlock(&pi_state->pi_mutex); 498 rt_mutex_unlock(&pi_state->pi_mutex);
495 499
496 spin_unlock(&hb->lock); 500 spin_unlock(&hb->lock);
497 501
498 spin_lock_irq(&curr->pi_lock); 502 raw_spin_lock_irq(&curr->pi_lock);
499 } 503 }
500 spin_unlock_irq(&curr->pi_lock); 504 raw_spin_unlock_irq(&curr->pi_lock);
501} 505}
502 506
503static int 507static int
@@ -526,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
526 return -EINVAL; 530 return -EINVAL;
527 531
528 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
529 WARN_ON(pid && pi_state->owner && 533
530 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
531 552
532 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
533 *ps = pi_state; 554 *ps = pi_state;
@@ -552,7 +573,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
552 * change of the task flags, we do this protected by 573 * change of the task flags, we do this protected by
553 * p->pi_lock: 574 * p->pi_lock:
554 */ 575 */
555 spin_lock_irq(&p->pi_lock); 576 raw_spin_lock_irq(&p->pi_lock);
556 if (unlikely(p->flags & PF_EXITING)) { 577 if (unlikely(p->flags & PF_EXITING)) {
557 /* 578 /*
558 * The task is on the way out. When PF_EXITPIDONE is 579 * The task is on the way out. When PF_EXITPIDONE is
@@ -561,7 +582,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
561 */ 582 */
562 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; 583 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
563 584
564 spin_unlock_irq(&p->pi_lock); 585 raw_spin_unlock_irq(&p->pi_lock);
565 put_task_struct(p); 586 put_task_struct(p);
566 return ret; 587 return ret;
567 } 588 }
@@ -580,7 +601,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
580 WARN_ON(!list_empty(&pi_state->list)); 601 WARN_ON(!list_empty(&pi_state->list));
581 list_add(&pi_state->list, &p->pi_state_list); 602 list_add(&pi_state->list, &p->pi_state_list);
582 pi_state->owner = p; 603 pi_state->owner = p;
583 spin_unlock_irq(&p->pi_lock); 604 raw_spin_unlock_irq(&p->pi_lock);
584 605
585 put_task_struct(p); 606 put_task_struct(p);
586 607
@@ -754,7 +775,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
754 if (!pi_state) 775 if (!pi_state)
755 return -EINVAL; 776 return -EINVAL;
756 777
757 spin_lock(&pi_state->pi_mutex.wait_lock); 778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
758 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
759 787
760 /* 788 /*
@@ -783,23 +811,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
783 else if (curval != uval) 811 else if (curval != uval)
784 ret = -EINVAL; 812 ret = -EINVAL;
785 if (ret) { 813 if (ret) {
786 spin_unlock(&pi_state->pi_mutex.wait_lock); 814 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
787 return ret; 815 return ret;
788 } 816 }
789 } 817 }
790 818
791 spin_lock_irq(&pi_state->owner->pi_lock); 819 raw_spin_lock_irq(&pi_state->owner->pi_lock);
792 WARN_ON(list_empty(&pi_state->list)); 820 WARN_ON(list_empty(&pi_state->list));
793 list_del_init(&pi_state->list); 821 list_del_init(&pi_state->list);
794 spin_unlock_irq(&pi_state->owner->pi_lock); 822 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
795 823
796 spin_lock_irq(&new_owner->pi_lock); 824 raw_spin_lock_irq(&new_owner->pi_lock);
797 WARN_ON(!list_empty(&pi_state->list)); 825 WARN_ON(!list_empty(&pi_state->list));
798 list_add(&pi_state->list, &new_owner->pi_state_list); 826 list_add(&pi_state->list, &new_owner->pi_state_list);
799 pi_state->owner = new_owner; 827 pi_state->owner = new_owner;
800 spin_unlock_irq(&new_owner->pi_lock); 828 raw_spin_unlock_irq(&new_owner->pi_lock);
801 829
802 spin_unlock(&pi_state->pi_mutex.wait_lock); 830 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
803 rt_mutex_unlock(&pi_state->pi_mutex); 831 rt_mutex_unlock(&pi_state->pi_mutex);
804 832
805 return 0; 833 return 0;
@@ -861,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
861 if (!bitset) 889 if (!bitset)
862 return -EINVAL; 890 return -EINVAL;
863 891
864 ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ); 892 ret = get_futex_key(uaddr, fshared, &key);
865 if (unlikely(ret != 0)) 893 if (unlikely(ret != 0))
866 goto out; 894 goto out;
867 895
@@ -907,10 +935,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
907 int ret, op_ret; 935 int ret, op_ret;
908 936
909retry: 937retry:
910 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 938 ret = get_futex_key(uaddr1, fshared, &key1);
911 if (unlikely(ret != 0)) 939 if (unlikely(ret != 0))
912 goto out; 940 goto out;
913 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 941 ret = get_futex_key(uaddr2, fshared, &key2);
914 if (unlikely(ret != 0)) 942 if (unlikely(ret != 0))
915 goto out_put_key1; 943 goto out_put_key1;
916 944
@@ -1004,7 +1032,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1004 plist_add(&q->list, &hb2->chain); 1032 plist_add(&q->list, &hb2->chain);
1005 q->lock_ptr = &hb2->lock; 1033 q->lock_ptr = &hb2->lock;
1006#ifdef CONFIG_DEBUG_PI_LIST 1034#ifdef CONFIG_DEBUG_PI_LIST
1007 q->list.plist.lock = &hb2->lock; 1035 q->list.plist.spinlock = &hb2->lock;
1008#endif 1036#endif
1009 } 1037 }
1010 get_futex_key_refs(key2); 1038 get_futex_key_refs(key2);
@@ -1040,7 +1068,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1040 1068
1041 q->lock_ptr = &hb->lock; 1069 q->lock_ptr = &hb->lock;
1042#ifdef CONFIG_DEBUG_PI_LIST 1070#ifdef CONFIG_DEBUG_PI_LIST
1043 q->list.plist.lock = &hb->lock; 1071 q->list.plist.spinlock = &hb->lock;
1044#endif 1072#endif
1045 1073
1046 wake_up_state(q->task, TASK_NORMAL); 1074 wake_up_state(q->task, TASK_NORMAL);
@@ -1169,11 +1197,10 @@ retry:
1169 pi_state = NULL; 1197 pi_state = NULL;
1170 } 1198 }
1171 1199
1172 ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ); 1200 ret = get_futex_key(uaddr1, fshared, &key1);
1173 if (unlikely(ret != 0)) 1201 if (unlikely(ret != 0))
1174 goto out; 1202 goto out;
1175 ret = get_futex_key(uaddr2, fshared, &key2, 1203 ret = get_futex_key(uaddr2, fshared, &key2);
1176 requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1177 if (unlikely(ret != 0)) 1204 if (unlikely(ret != 0))
1178 goto out_put_key1; 1205 goto out_put_key1;
1179 1206
@@ -1388,7 +1415,7 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1388 1415
1389 plist_node_init(&q->list, prio); 1416 plist_node_init(&q->list, prio);
1390#ifdef CONFIG_DEBUG_PI_LIST 1417#ifdef CONFIG_DEBUG_PI_LIST
1391 q->list.plist.lock = &hb->lock; 1418 q->list.plist.spinlock = &hb->lock;
1392#endif 1419#endif
1393 plist_add(&q->list, &hb->chain); 1420 plist_add(&q->list, &hb->chain);
1394 q->task = current; 1421 q->task = current;
@@ -1523,18 +1550,18 @@ retry:
1523 * itself. 1550 * itself.
1524 */ 1551 */
1525 if (pi_state->owner != NULL) { 1552 if (pi_state->owner != NULL) {
1526 spin_lock_irq(&pi_state->owner->pi_lock); 1553 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1527 WARN_ON(list_empty(&pi_state->list)); 1554 WARN_ON(list_empty(&pi_state->list));
1528 list_del_init(&pi_state->list); 1555 list_del_init(&pi_state->list);
1529 spin_unlock_irq(&pi_state->owner->pi_lock); 1556 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1530 } 1557 }
1531 1558
1532 pi_state->owner = newowner; 1559 pi_state->owner = newowner;
1533 1560
1534 spin_lock_irq(&newowner->pi_lock); 1561 raw_spin_lock_irq(&newowner->pi_lock);
1535 WARN_ON(!list_empty(&pi_state->list)); 1562 WARN_ON(!list_empty(&pi_state->list));
1536 list_add(&pi_state->list, &newowner->pi_state_list); 1563 list_add(&pi_state->list, &newowner->pi_state_list);
1537 spin_unlock_irq(&newowner->pi_lock); 1564 raw_spin_unlock_irq(&newowner->pi_lock);
1538 return 0; 1565 return 0;
1539 1566
1540 /* 1567 /*
@@ -1732,7 +1759,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
1732 */ 1759 */
1733retry: 1760retry:
1734 q->key = FUTEX_KEY_INIT; 1761 q->key = FUTEX_KEY_INIT;
1735 ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ); 1762 ret = get_futex_key(uaddr, fshared, &q->key);
1736 if (unlikely(ret != 0)) 1763 if (unlikely(ret != 0))
1737 return ret; 1764 return ret;
1738 1765
@@ -1898,7 +1925,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
1898 q.requeue_pi_key = NULL; 1925 q.requeue_pi_key = NULL;
1899retry: 1926retry:
1900 q.key = FUTEX_KEY_INIT; 1927 q.key = FUTEX_KEY_INIT;
1901 ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE); 1928 ret = get_futex_key(uaddr, fshared, &q.key);
1902 if (unlikely(ret != 0)) 1929 if (unlikely(ret != 0))
1903 goto out; 1930 goto out;
1904 1931
@@ -1968,7 +1995,7 @@ retry_private:
1968 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1969 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1970 1997
1971 goto out; 1998 goto out_put_key;
1972 1999
1973out_unlock_put_key: 2000out_unlock_put_key:
1974 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
@@ -2017,7 +2044,7 @@ retry:
2017 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) 2044 if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
2018 return -EPERM; 2045 return -EPERM;
2019 2046
2020 ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE); 2047 ret = get_futex_key(uaddr, fshared, &key);
2021 if (unlikely(ret != 0)) 2048 if (unlikely(ret != 0))
2022 goto out; 2049 goto out;
2023 2050
@@ -2209,7 +2236,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
2209 rt_waiter.task = NULL; 2236 rt_waiter.task = NULL;
2210 2237
2211 key2 = FUTEX_KEY_INIT; 2238 key2 = FUTEX_KEY_INIT;
2212 ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE); 2239 ret = get_futex_key(uaddr2, fshared, &key2);
2213 if (unlikely(ret != 0)) 2240 if (unlikely(ret != 0))
2214 goto out; 2241 goto out;
2215 2242
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index ede527708123..0086628b6e97 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -127,11 +127,11 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
127 for (;;) { 127 for (;;) {
128 base = timer->base; 128 base = timer->base;
129 if (likely(base != NULL)) { 129 if (likely(base != NULL)) {
130 spin_lock_irqsave(&base->cpu_base->lock, *flags); 130 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
131 if (likely(base == timer->base)) 131 if (likely(base == timer->base))
132 return base; 132 return base;
133 /* The timer has migrated to another CPU: */ 133 /* The timer has migrated to another CPU: */
134 spin_unlock_irqrestore(&base->cpu_base->lock, *flags); 134 raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags);
135 } 135 }
136 cpu_relax(); 136 cpu_relax();
137 } 137 }
@@ -208,13 +208,13 @@ again:
208 208
209 /* See the comment in lock_timer_base() */ 209 /* See the comment in lock_timer_base() */
210 timer->base = NULL; 210 timer->base = NULL;
211 spin_unlock(&base->cpu_base->lock); 211 raw_spin_unlock(&base->cpu_base->lock);
212 spin_lock(&new_base->cpu_base->lock); 212 raw_spin_lock(&new_base->cpu_base->lock);
213 213
214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { 214 if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
215 cpu = this_cpu; 215 cpu = this_cpu;
216 spin_unlock(&new_base->cpu_base->lock); 216 raw_spin_unlock(&new_base->cpu_base->lock);
217 spin_lock(&base->cpu_base->lock); 217 raw_spin_lock(&base->cpu_base->lock);
218 timer->base = base; 218 timer->base = base;
219 goto again; 219 goto again;
220 } 220 }
@@ -230,7 +230,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
230{ 230{
231 struct hrtimer_clock_base *base = timer->base; 231 struct hrtimer_clock_base *base = timer->base;
232 232
233 spin_lock_irqsave(&base->cpu_base->lock, *flags); 233 raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
234 234
235 return base; 235 return base;
236} 236}
@@ -557,7 +557,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
557static int hrtimer_reprogram(struct hrtimer *timer, 557static int hrtimer_reprogram(struct hrtimer *timer,
558 struct hrtimer_clock_base *base) 558 struct hrtimer_clock_base *base)
559{ 559{
560 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 560 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 561 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
562 int res; 562 int res;
563 563
@@ -582,7 +582,16 @@ static int hrtimer_reprogram(struct hrtimer *timer,
582 if (expires.tv64 < 0) 582 if (expires.tv64 < 0)
583 return -ETIME; 583 return -ETIME;
584 584
585 if (expires.tv64 >= expires_next->tv64) 585 if (expires.tv64 >= cpu_base->expires_next.tv64)
586 return 0;
587
588 /*
589 * If a hang was detected in the last timer interrupt then we
590 * do not schedule a timer which is earlier than the expiry
591 * which we enforced in the hang detection. We want the system
592 * to make progress.
593 */
594 if (cpu_base->hang_detected)
586 return 0; 595 return 0;
587 596
588 /* 597 /*
@@ -590,7 +599,7 @@ static int hrtimer_reprogram(struct hrtimer *timer,
590 */ 599 */
591 res = tick_program_event(expires, 0); 600 res = tick_program_event(expires, 0);
592 if (!IS_ERR_VALUE(res)) 601 if (!IS_ERR_VALUE(res))
593 *expires_next = expires; 602 cpu_base->expires_next = expires;
594 return res; 603 return res;
595} 604}
596 605
@@ -619,12 +628,12 @@ static void retrigger_next_event(void *arg)
619 base = &__get_cpu_var(hrtimer_bases); 628 base = &__get_cpu_var(hrtimer_bases);
620 629
621 /* Adjust CLOCK_REALTIME offset */ 630 /* Adjust CLOCK_REALTIME offset */
622 spin_lock(&base->lock); 631 raw_spin_lock(&base->lock);
623 base->clock_base[CLOCK_REALTIME].offset = 632 base->clock_base[CLOCK_REALTIME].offset =
624 timespec_to_ktime(realtime_offset); 633 timespec_to_ktime(realtime_offset);
625 634
626 hrtimer_force_reprogram(base, 0); 635 hrtimer_force_reprogram(base, 0);
627 spin_unlock(&base->lock); 636 raw_spin_unlock(&base->lock);
628} 637}
629 638
630/* 639/*
@@ -685,9 +694,9 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
685{ 694{
686 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { 695 if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
687 if (wakeup) { 696 if (wakeup) {
688 spin_unlock(&base->cpu_base->lock); 697 raw_spin_unlock(&base->cpu_base->lock);
689 raise_softirq_irqoff(HRTIMER_SOFTIRQ); 698 raise_softirq_irqoff(HRTIMER_SOFTIRQ);
690 spin_lock(&base->cpu_base->lock); 699 raw_spin_lock(&base->cpu_base->lock);
691 } else 700 } else
692 __raise_softirq_irqoff(HRTIMER_SOFTIRQ); 701 __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
693 702
@@ -747,17 +756,33 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
747 756
748#endif /* CONFIG_HIGH_RES_TIMERS */ 757#endif /* CONFIG_HIGH_RES_TIMERS */
749 758
750#ifdef CONFIG_TIMER_STATS 759static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
751void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
752{ 760{
761#ifdef CONFIG_TIMER_STATS
753 if (timer->start_site) 762 if (timer->start_site)
754 return; 763 return;
755 764 timer->start_site = __builtin_return_address(0);
756 timer->start_site = addr;
757 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); 765 memcpy(timer->start_comm, current->comm, TASK_COMM_LEN);
758 timer->start_pid = current->pid; 766 timer->start_pid = current->pid;
767#endif
759} 768}
769
770static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer)
771{
772#ifdef CONFIG_TIMER_STATS
773 timer->start_site = NULL;
774#endif
775}
776
777static inline void timer_stats_account_hrtimer(struct hrtimer *timer)
778{
779#ifdef CONFIG_TIMER_STATS
780 if (likely(!timer_stats_active))
781 return;
782 timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
783 timer->function, timer->start_comm, 0);
760#endif 784#endif
785}
761 786
762/* 787/*
763 * Counterpart to lock_hrtimer_base above: 788 * Counterpart to lock_hrtimer_base above:
@@ -765,7 +790,7 @@ void __timer_stats_hrtimer_set_start_info(struct hrtimer *timer, void *addr)
765static inline 790static inline
766void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) 791void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
767{ 792{
768 spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); 793 raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags);
769} 794}
770 795
771/** 796/**
@@ -1098,7 +1123,7 @@ ktime_t hrtimer_get_next_event(void)
1098 unsigned long flags; 1123 unsigned long flags;
1099 int i; 1124 int i;
1100 1125
1101 spin_lock_irqsave(&cpu_base->lock, flags); 1126 raw_spin_lock_irqsave(&cpu_base->lock, flags);
1102 1127
1103 if (!hrtimer_hres_active()) { 1128 if (!hrtimer_hres_active()) {
1104 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { 1129 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
@@ -1115,7 +1140,7 @@ ktime_t hrtimer_get_next_event(void)
1115 } 1140 }
1116 } 1141 }
1117 1142
1118 spin_unlock_irqrestore(&cpu_base->lock, flags); 1143 raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
1119 1144
1120 if (mindelta.tv64 < 0) 1145 if (mindelta.tv64 < 0)
1121 mindelta.tv64 = 0; 1146 mindelta.tv64 = 0;
@@ -1197,11 +1222,11 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1197 * they get migrated to another cpu, therefore its safe to unlock 1222 * they get migrated to another cpu, therefore its safe to unlock
1198 * the timer base. 1223 * the timer base.
1199 */ 1224 */
1200 spin_unlock(&cpu_base->lock); 1225 raw_spin_unlock(&cpu_base->lock);
1201 trace_hrtimer_expire_entry(timer, now); 1226 trace_hrtimer_expire_entry(timer, now);
1202 restart = fn(timer); 1227 restart = fn(timer);
1203 trace_hrtimer_expire_exit(timer); 1228 trace_hrtimer_expire_exit(timer);
1204 spin_lock(&cpu_base->lock); 1229 raw_spin_lock(&cpu_base->lock);
1205 1230
1206 /* 1231 /*
1207 * Note: We clear the CALLBACK bit after enqueue_hrtimer and 1232 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
@@ -1217,30 +1242,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1217 1242
1218#ifdef CONFIG_HIGH_RES_TIMERS 1243#ifdef CONFIG_HIGH_RES_TIMERS
1219 1244
1220static int force_clock_reprogram;
1221
1222/*
1223 * After 5 iteration's attempts, we consider that hrtimer_interrupt()
1224 * is hanging, which could happen with something that slows the interrupt
1225 * such as the tracing. Then we force the clock reprogramming for each future
1226 * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
1227 * threshold that we will overwrite.
1228 * The next tick event will be scheduled to 3 times we currently spend on
1229 * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
1230 * 1/4 of their time to process the hrtimer interrupts. This is enough to
1231 * let it running without serious starvation.
1232 */
1233
1234static inline void
1235hrtimer_interrupt_hanging(struct clock_event_device *dev,
1236 ktime_t try_time)
1237{
1238 force_clock_reprogram = 1;
1239 dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
1240 printk(KERN_WARNING "hrtimer: interrupt too slow, "
1241 "forcing clock min delta to %llu ns\n",
1242 (unsigned long long) dev->min_delta_ns);
1243}
1244/* 1245/*
1245 * High resolution timer interrupt 1246 * High resolution timer interrupt
1246 * Called with interrupts disabled 1247 * Called with interrupts disabled
@@ -1249,24 +1250,18 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1249{ 1250{
1250 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1251 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1251 struct hrtimer_clock_base *base; 1252 struct hrtimer_clock_base *base;
1252 ktime_t expires_next, now; 1253 ktime_t expires_next, now, entry_time, delta;
1253 int nr_retries = 0; 1254 int i, retries = 0;
1254 int i;
1255 1255
1256 BUG_ON(!cpu_base->hres_active); 1256 BUG_ON(!cpu_base->hres_active);
1257 cpu_base->nr_events++; 1257 cpu_base->nr_events++;
1258 dev->next_event.tv64 = KTIME_MAX; 1258 dev->next_event.tv64 = KTIME_MAX;
1259 1259
1260 retry: 1260 entry_time = now = ktime_get();
1261 /* 5 retries is enough to notice a hang */ 1261retry:
1262 if (!(++nr_retries % 5))
1263 hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
1264
1265 now = ktime_get();
1266
1267 expires_next.tv64 = KTIME_MAX; 1262 expires_next.tv64 = KTIME_MAX;
1268 1263
1269 spin_lock(&cpu_base->lock); 1264 raw_spin_lock(&cpu_base->lock);
1270 /* 1265 /*
1271 * We set expires_next to KTIME_MAX here with cpu_base->lock 1266 * We set expires_next to KTIME_MAX here with cpu_base->lock
1272 * held to prevent that a timer is enqueued in our queue via 1267 * held to prevent that a timer is enqueued in our queue via
@@ -1322,13 +1317,51 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1322 * against it. 1317 * against it.
1323 */ 1318 */
1324 cpu_base->expires_next = expires_next; 1319 cpu_base->expires_next = expires_next;
1325 spin_unlock(&cpu_base->lock); 1320 raw_spin_unlock(&cpu_base->lock);
1326 1321
1327 /* Reprogramming necessary ? */ 1322 /* Reprogramming necessary ? */
1328 if (expires_next.tv64 != KTIME_MAX) { 1323 if (expires_next.tv64 == KTIME_MAX ||
1329 if (tick_program_event(expires_next, force_clock_reprogram)) 1324 !tick_program_event(expires_next, 0)) {
1330 goto retry; 1325 cpu_base->hang_detected = 0;
1326 return;
1331 } 1327 }
1328
1329 /*
1330 * The next timer was already expired due to:
1331 * - tracing
1332 * - long lasting callbacks
1333 * - being scheduled away when running in a VM
1334 *
1335 * We need to prevent that we loop forever in the hrtimer
1336 * interrupt routine. We give it 3 attempts to avoid
1337 * overreacting on some spurious event.
1338 */
1339 now = ktime_get();
1340 cpu_base->nr_retries++;
1341 if (++retries < 3)
1342 goto retry;
1343 /*
1344 * Give the system a chance to do something else than looping
1345 * here. We stored the entry time, so we know exactly how long
1346 * we spent here. We schedule the next event this amount of
1347 * time away.
1348 */
1349 cpu_base->nr_hangs++;
1350 cpu_base->hang_detected = 1;
1351 delta = ktime_sub(now, entry_time);
1352 if (delta.tv64 > cpu_base->max_hang_time.tv64)
1353 cpu_base->max_hang_time = delta;
1354 /*
1355 * Limit it to a sensible value as we enforce a longer
1356 * delay. Give the CPU at least 100ms to catch up.
1357 */
1358 if (delta.tv64 > 100 * NSEC_PER_MSEC)
1359 expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC);
1360 else
1361 expires_next = ktime_add(now, delta);
1362 tick_program_event(expires_next, 1);
1363 printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n",
1364 ktime_to_ns(delta));
1332} 1365}
1333 1366
1334/* 1367/*
@@ -1424,7 +1457,7 @@ void hrtimer_run_queues(void)
1424 gettime = 0; 1457 gettime = 0;
1425 } 1458 }
1426 1459
1427 spin_lock(&cpu_base->lock); 1460 raw_spin_lock(&cpu_base->lock);
1428 1461
1429 while ((node = base->first)) { 1462 while ((node = base->first)) {
1430 struct hrtimer *timer; 1463 struct hrtimer *timer;
@@ -1436,7 +1469,7 @@ void hrtimer_run_queues(void)
1436 1469
1437 __run_hrtimer(timer, &base->softirq_time); 1470 __run_hrtimer(timer, &base->softirq_time);
1438 } 1471 }
1439 spin_unlock(&cpu_base->lock); 1472 raw_spin_unlock(&cpu_base->lock);
1440 } 1473 }
1441} 1474}
1442 1475
@@ -1592,7 +1625,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1592 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1625 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1593 int i; 1626 int i;
1594 1627
1595 spin_lock_init(&cpu_base->lock); 1628 raw_spin_lock_init(&cpu_base->lock);
1596 1629
1597 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1630 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1598 cpu_base->clock_base[i].cpu_base = cpu_base; 1631 cpu_base->clock_base[i].cpu_base = cpu_base;
@@ -1650,16 +1683,16 @@ static void migrate_hrtimers(int scpu)
1650 * The caller is globally serialized and nobody else 1683 * The caller is globally serialized and nobody else
1651 * takes two locks at once, deadlock is not possible. 1684 * takes two locks at once, deadlock is not possible.
1652 */ 1685 */
1653 spin_lock(&new_base->lock); 1686 raw_spin_lock(&new_base->lock);
1654 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1687 raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1655 1688
1656 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1689 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1657 migrate_hrtimer_list(&old_base->clock_base[i], 1690 migrate_hrtimer_list(&old_base->clock_base[i],
1658 &new_base->clock_base[i]); 1691 &new_base->clock_base[i]);
1659 } 1692 }
1660 1693
1661 spin_unlock(&old_base->lock); 1694 raw_spin_unlock(&old_base->lock);
1662 spin_unlock(&new_base->lock); 1695 raw_spin_unlock(&new_base->lock);
1663 1696
1664 /* Check, if we got expired work to do */ 1697 /* Check, if we got expired work to do */
1665 __hrtimer_peek_ahead_timers(); 1698 __hrtimer_peek_ahead_timers();
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index cf5ee1628411..967e66143e11 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,6 +40,7 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/cpu.h>
43#include <linux/smp.h> 44#include <linux/smp.h>
44 45
45#include <linux/hw_breakpoint.h> 46#include <linux/hw_breakpoint.h>
@@ -52,7 +53,7 @@
52static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned);
53 54
54/* Number of pinned task breakpoints in a cpu */ 55/* Number of pinned task breakpoints in a cpu */
55static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); 56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]);
56 57
57/* Number of non-pinned cpu/task breakpoints in a cpu */ 58/* Number of non-pinned cpu/task breakpoints in a cpu */
58static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible);
@@ -73,7 +74,7 @@ static DEFINE_MUTEX(nr_bp_mutex);
73static unsigned int max_task_bp_pinned(int cpu) 74static unsigned int max_task_bp_pinned(int cpu)
74{ 75{
75 int i; 76 int i;
76 unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); 77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
77 78
78 for (i = HBP_NUM -1; i >= 0; i--) { 79 for (i = HBP_NUM -1; i >= 0; i--) {
79 if (tsk_pinned[i] > 0) 80 if (tsk_pinned[i] > 0)
@@ -83,15 +84,51 @@ static unsigned int max_task_bp_pinned(int cpu)
83 return 0; 84 return 0;
84} 85}
85 86
87static int task_bp_pinned(struct task_struct *tsk)
88{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list;
91 struct perf_event *bp;
92 unsigned long flags;
93 int count = 0;
94
95 if (WARN_ONCE(!ctx, "No perf context for this task"))
96 return 0;
97
98 list = &ctx->event_list;
99
100 raw_spin_lock_irqsave(&ctx->lock, flags);
101
102 /*
103 * The current breakpoint counter is not included in the list
104 * at the open() callback time
105 */
106 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++;
109 }
110
111 raw_spin_unlock_irqrestore(&ctx->lock, flags);
112
113 return count;
114}
115
86/* 116/*
87 * Report the number of pinned/un-pinned breakpoints we have in 117 * Report the number of pinned/un-pinned breakpoints we have in
88 * a given cpu (cpu > -1) or in all of them (cpu = -1). 118 * a given cpu (cpu > -1) or in all of them (cpu = -1).
89 */ 119 */
90static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) 120static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
91{ 122{
123 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task;
125
92 if (cpu >= 0) { 126 if (cpu >= 0) {
93 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu);
94 slots->pinned += max_task_bp_pinned(cpu); 128 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu);
130 else
131 slots->pinned += task_bp_pinned(tsk);
95 slots->flexible = per_cpu(nr_bp_flexible, cpu); 132 slots->flexible = per_cpu(nr_bp_flexible, cpu);
96 133
97 return; 134 return;
@@ -101,7 +138,10 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
101 unsigned int nr; 138 unsigned int nr;
102 139
103 nr = per_cpu(nr_cpu_bp_pinned, cpu); 140 nr = per_cpu(nr_cpu_bp_pinned, cpu);
104 nr += max_task_bp_pinned(cpu); 141 if (!tsk)
142 nr += max_task_bp_pinned(cpu);
143 else
144 nr += task_bp_pinned(tsk);
105 145
106 if (nr > slots->pinned) 146 if (nr > slots->pinned)
107 slots->pinned = nr; 147 slots->pinned = nr;
@@ -118,35 +158,12 @@ static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu)
118 */ 158 */
119static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable)
120{ 160{
121 int count = 0;
122 struct perf_event *bp;
123 struct perf_event_context *ctx = tsk->perf_event_ctxp;
124 unsigned int *tsk_pinned; 161 unsigned int *tsk_pinned;
125 struct list_head *list; 162 int count = 0;
126 unsigned long flags;
127
128 if (WARN_ONCE(!ctx, "No perf context for this task"))
129 return;
130
131 list = &ctx->event_list;
132
133 spin_lock_irqsave(&ctx->lock, flags);
134
135 /*
136 * The current breakpoint counter is not included in the list
137 * at the open() callback time
138 */
139 list_for_each_entry(bp, list, event_entry) {
140 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
141 count++;
142 }
143 163
144 spin_unlock_irqrestore(&ctx->lock, flags); 164 count = task_bp_pinned(tsk);
145 165
146 if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) 166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu);
147 return;
148
149 tsk_pinned = per_cpu(task_bp_pinned, cpu);
150 if (enable) { 167 if (enable) {
151 tsk_pinned[count]++; 168 tsk_pinned[count]++;
152 if (count > 0) 169 if (count > 0)
@@ -193,7 +210,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
193 * - If attached to a single cpu, check: 210 * - If attached to a single cpu, check:
194 * 211 *
195 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) 212 * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
196 * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM 213 * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
197 * 214 *
198 * -> If there are already non-pinned counters in this cpu, it means 215 * -> If there are already non-pinned counters in this cpu, it means
199 * there is already a free slot for them. 216 * there is already a free slot for them.
@@ -204,7 +221,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
204 * - If attached to every cpus, check: 221 * - If attached to every cpus, check:
205 * 222 *
206 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) 223 * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
207 * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM 224 * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
208 * 225 *
209 * -> This is roughly the same, except we check the number of per cpu 226 * -> This is roughly the same, except we check the number of per cpu
210 * bp for every cpu and we keep the max one. Same for the per tasks 227 * bp for every cpu and we keep the max one. Same for the per tasks
@@ -216,7 +233,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
216 * - If attached to a single cpu, check: 233 * - If attached to a single cpu, check:
217 * 234 *
218 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) 235 * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
219 * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM 236 * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
220 * 237 *
221 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep 238 * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
222 * one register at least (or they will never be fed). 239 * one register at least (or they will never be fed).
@@ -224,42 +241,74 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
224 * - If attached to every cpus, check: 241 * - If attached to every cpus, check:
225 * 242 *
226 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) 243 * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
227 * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM 244 * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
228 */ 245 */
229int reserve_bp_slot(struct perf_event *bp) 246static int __reserve_bp_slot(struct perf_event *bp)
230{ 247{
231 struct bp_busy_slots slots = {0}; 248 struct bp_busy_slots slots = {0};
232 int ret = 0;
233
234 mutex_lock(&nr_bp_mutex);
235 249
236 fetch_bp_busy_slots(&slots, bp->cpu); 250 fetch_bp_busy_slots(&slots, bp);
237 251
238 /* Flexible counters need to keep at least one slot */ 252 /* Flexible counters need to keep at least one slot */
239 if (slots.pinned + (!!slots.flexible) == HBP_NUM) { 253 if (slots.pinned + (!!slots.flexible) == HBP_NUM)
240 ret = -ENOSPC; 254 return -ENOSPC;
241 goto end;
242 }
243 255
244 toggle_bp_slot(bp, true); 256 toggle_bp_slot(bp, true);
245 257
246end: 258 return 0;
259}
260
261int reserve_bp_slot(struct perf_event *bp)
262{
263 int ret;
264
265 mutex_lock(&nr_bp_mutex);
266
267 ret = __reserve_bp_slot(bp);
268
247 mutex_unlock(&nr_bp_mutex); 269 mutex_unlock(&nr_bp_mutex);
248 270
249 return ret; 271 return ret;
250} 272}
251 273
274static void __release_bp_slot(struct perf_event *bp)
275{
276 toggle_bp_slot(bp, false);
277}
278
252void release_bp_slot(struct perf_event *bp) 279void release_bp_slot(struct perf_event *bp)
253{ 280{
254 mutex_lock(&nr_bp_mutex); 281 mutex_lock(&nr_bp_mutex);
255 282
256 toggle_bp_slot(bp, false); 283 __release_bp_slot(bp);
257 284
258 mutex_unlock(&nr_bp_mutex); 285 mutex_unlock(&nr_bp_mutex);
259} 286}
260 287
288/*
289 * Allow the kernel debugger to reserve breakpoint slots without
290 * taking a lock using the dbg_* variant of for the reserve and
291 * release breakpoint slots.
292 */
293int dbg_reserve_bp_slot(struct perf_event *bp)
294{
295 if (mutex_is_locked(&nr_bp_mutex))
296 return -1;
297
298 return __reserve_bp_slot(bp);
299}
300
301int dbg_release_bp_slot(struct perf_event *bp)
302{
303 if (mutex_is_locked(&nr_bp_mutex))
304 return -1;
305
306 __release_bp_slot(bp);
261 307
262int __register_perf_hw_breakpoint(struct perf_event *bp) 308 return 0;
309}
310
311int register_perf_hw_breakpoint(struct perf_event *bp)
263{ 312{
264 int ret; 313 int ret;
265 314
@@ -276,17 +325,14 @@ int __register_perf_hw_breakpoint(struct perf_event *bp)
276 * This is a quick hack that will be removed soon, once we remove 325 * This is a quick hack that will be removed soon, once we remove
277 * the tmp breakpoints from ptrace 326 * the tmp breakpoints from ptrace
278 */ 327 */
279 if (!bp->attr.disabled || bp->callback == perf_bp_event) 328 if (!bp->attr.disabled || !bp->overflow_handler)
280 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
281 330
282 return ret; 331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
283} 332 if (ret)
284 333 release_bp_slot(bp);
285int register_perf_hw_breakpoint(struct perf_event *bp)
286{
287 bp->callback = perf_bp_event;
288 334
289 return __register_perf_hw_breakpoint(bp); 335 return ret;
290} 336}
291 337
292/** 338/**
@@ -297,7 +343,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
297 */ 343 */
298struct perf_event * 344struct perf_event *
299register_user_hw_breakpoint(struct perf_event_attr *attr, 345register_user_hw_breakpoint(struct perf_event_attr *attr,
300 perf_callback_t triggered, 346 perf_overflow_handler_t triggered,
301 struct task_struct *tsk) 347 struct task_struct *tsk)
302{ 348{
303 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 349 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
@@ -311,19 +357,40 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
311 * @triggered: callback to trigger when we hit the breakpoint 357 * @triggered: callback to trigger when we hit the breakpoint
312 * @tsk: pointer to 'task_struct' of the process to which the address belongs 358 * @tsk: pointer to 'task_struct' of the process to which the address belongs
313 */ 359 */
314struct perf_event * 360int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr)
315modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr,
316 perf_callback_t triggered,
317 struct task_struct *tsk)
318{ 361{
319 /* 362 u64 old_addr = bp->attr.bp_addr;
320 * FIXME: do it without unregistering 363 u64 old_len = bp->attr.bp_len;
321 * - We don't want to lose our slot 364 int old_type = bp->attr.bp_type;
322 * - If the new bp is incorrect, don't lose the older one 365 int err = 0;
323 */
324 unregister_hw_breakpoint(bp);
325 366
326 return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); 367 perf_event_disable(bp);
368
369 bp->attr.bp_addr = attr->bp_addr;
370 bp->attr.bp_type = attr->bp_type;
371 bp->attr.bp_len = attr->bp_len;
372
373 if (attr->disabled)
374 goto end;
375
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
377 if (!err)
378 perf_event_enable(bp);
379
380 if (err) {
381 bp->attr.bp_addr = old_addr;
382 bp->attr.bp_type = old_type;
383 bp->attr.bp_len = old_len;
384 if (!bp->attr.disabled)
385 perf_event_enable(bp);
386
387 return err;
388 }
389
390end:
391 bp->attr.disabled = attr->disabled;
392
393 return 0;
327} 394}
328EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); 395EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint);
329 396
@@ -348,7 +415,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
348 */ 415 */
349struct perf_event ** 416struct perf_event **
350register_wide_hw_breakpoint(struct perf_event_attr *attr, 417register_wide_hw_breakpoint(struct perf_event_attr *attr,
351 perf_callback_t triggered) 418 perf_overflow_handler_t triggered)
352{ 419{
353 struct perf_event **cpu_events, **pevent, *bp; 420 struct perf_event **cpu_events, **pevent, *bp;
354 long err; 421 long err;
@@ -358,7 +425,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
358 if (!cpu_events) 425 if (!cpu_events)
359 return ERR_PTR(-ENOMEM); 426 return ERR_PTR(-ENOMEM);
360 427
361 for_each_possible_cpu(cpu) { 428 get_online_cpus();
429 for_each_online_cpu(cpu) {
362 pevent = per_cpu_ptr(cpu_events, cpu); 430 pevent = per_cpu_ptr(cpu_events, cpu);
363 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); 431 bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered);
364 432
@@ -369,18 +437,20 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
369 goto fail; 437 goto fail;
370 } 438 }
371 } 439 }
440 put_online_cpus();
372 441
373 return cpu_events; 442 return cpu_events;
374 443
375fail: 444fail:
376 for_each_possible_cpu(cpu) { 445 for_each_online_cpu(cpu) {
377 pevent = per_cpu_ptr(cpu_events, cpu); 446 pevent = per_cpu_ptr(cpu_events, cpu);
378 if (IS_ERR(*pevent)) 447 if (IS_ERR(*pevent))
379 break; 448 break;
380 unregister_hw_breakpoint(*pevent); 449 unregister_hw_breakpoint(*pevent);
381 } 450 }
451 put_online_cpus();
452
382 free_percpu(cpu_events); 453 free_percpu(cpu_events);
383 /* return the error if any */
384 return ERR_PTR(err); 454 return ERR_PTR(err);
385} 455}
386EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 1de9700f416e..2295a31ef110 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -45,7 +45,7 @@ unsigned long probe_irq_on(void)
45 * flush such a longstanding irq before considering it as spurious. 45 * flush such a longstanding irq before considering it as spurious.
46 */ 46 */
47 for_each_irq_desc_reverse(i, desc) { 47 for_each_irq_desc_reverse(i, desc) {
48 spin_lock_irq(&desc->lock); 48 raw_spin_lock_irq(&desc->lock);
49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 49 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
50 /* 50 /*
51 * An old-style architecture might still have 51 * An old-style architecture might still have
@@ -61,7 +61,7 @@ unsigned long probe_irq_on(void)
61 desc->chip->set_type(i, IRQ_TYPE_PROBE); 61 desc->chip->set_type(i, IRQ_TYPE_PROBE);
62 desc->chip->startup(i); 62 desc->chip->startup(i);
63 } 63 }
64 spin_unlock_irq(&desc->lock); 64 raw_spin_unlock_irq(&desc->lock);
65 } 65 }
66 66
67 /* Wait for longstanding interrupts to trigger. */ 67 /* Wait for longstanding interrupts to trigger. */
@@ -73,13 +73,13 @@ unsigned long probe_irq_on(void)
73 * happened in the previous stage, it may have masked itself) 73 * happened in the previous stage, it may have masked itself)
74 */ 74 */
75 for_each_irq_desc_reverse(i, desc) { 75 for_each_irq_desc_reverse(i, desc) {
76 spin_lock_irq(&desc->lock); 76 raw_spin_lock_irq(&desc->lock);
77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 77 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 78 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
79 if (desc->chip->startup(i)) 79 if (desc->chip->startup(i))
80 desc->status |= IRQ_PENDING; 80 desc->status |= IRQ_PENDING;
81 } 81 }
82 spin_unlock_irq(&desc->lock); 82 raw_spin_unlock_irq(&desc->lock);
83 } 83 }
84 84
85 /* 85 /*
@@ -91,7 +91,7 @@ unsigned long probe_irq_on(void)
91 * Now filter out any obviously spurious interrupts 91 * Now filter out any obviously spurious interrupts
92 */ 92 */
93 for_each_irq_desc(i, desc) { 93 for_each_irq_desc(i, desc) {
94 spin_lock_irq(&desc->lock); 94 raw_spin_lock_irq(&desc->lock);
95 status = desc->status; 95 status = desc->status;
96 96
97 if (status & IRQ_AUTODETECT) { 97 if (status & IRQ_AUTODETECT) {
@@ -103,7 +103,7 @@ unsigned long probe_irq_on(void)
103 if (i < 32) 103 if (i < 32)
104 mask |= 1 << i; 104 mask |= 1 << i;
105 } 105 }
106 spin_unlock_irq(&desc->lock); 106 raw_spin_unlock_irq(&desc->lock);
107 } 107 }
108 108
109 return mask; 109 return mask;
@@ -129,7 +129,7 @@ unsigned int probe_irq_mask(unsigned long val)
129 int i; 129 int i;
130 130
131 for_each_irq_desc(i, desc) { 131 for_each_irq_desc(i, desc) {
132 spin_lock_irq(&desc->lock); 132 raw_spin_lock_irq(&desc->lock);
133 status = desc->status; 133 status = desc->status;
134 134
135 if (status & IRQ_AUTODETECT) { 135 if (status & IRQ_AUTODETECT) {
@@ -139,7 +139,7 @@ unsigned int probe_irq_mask(unsigned long val)
139 desc->status = status & ~IRQ_AUTODETECT; 139 desc->status = status & ~IRQ_AUTODETECT;
140 desc->chip->shutdown(i); 140 desc->chip->shutdown(i);
141 } 141 }
142 spin_unlock_irq(&desc->lock); 142 raw_spin_unlock_irq(&desc->lock);
143 } 143 }
144 mutex_unlock(&probing_active); 144 mutex_unlock(&probing_active);
145 145
@@ -171,7 +171,7 @@ int probe_irq_off(unsigned long val)
171 unsigned int status; 171 unsigned int status;
172 172
173 for_each_irq_desc(i, desc) { 173 for_each_irq_desc(i, desc) {
174 spin_lock_irq(&desc->lock); 174 raw_spin_lock_irq(&desc->lock);
175 status = desc->status; 175 status = desc->status;
176 176
177 if (status & IRQ_AUTODETECT) { 177 if (status & IRQ_AUTODETECT) {
@@ -183,7 +183,7 @@ int probe_irq_off(unsigned long val)
183 desc->status = status & ~IRQ_AUTODETECT; 183 desc->status = status & ~IRQ_AUTODETECT;
184 desc->chip->shutdown(i); 184 desc->chip->shutdown(i);
185 } 185 }
186 spin_unlock_irq(&desc->lock); 186 raw_spin_unlock_irq(&desc->lock);
187 } 187 }
188 mutex_unlock(&probing_active); 188 mutex_unlock(&probing_active);
189 189
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index ba566c261adc..ecc3fa28f666 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -34,7 +34,7 @@ void dynamic_irq_init(unsigned int irq)
34 } 34 }
35 35
36 /* Ensure we don't have left over values from a previous use of this irq */ 36 /* Ensure we don't have left over values from a previous use of this irq */
37 spin_lock_irqsave(&desc->lock, flags); 37 raw_spin_lock_irqsave(&desc->lock, flags);
38 desc->status = IRQ_DISABLED; 38 desc->status = IRQ_DISABLED;
39 desc->chip = &no_irq_chip; 39 desc->chip = &no_irq_chip;
40 desc->handle_irq = handle_bad_irq; 40 desc->handle_irq = handle_bad_irq;
@@ -51,7 +51,7 @@ void dynamic_irq_init(unsigned int irq)
51 cpumask_clear(desc->pending_mask); 51 cpumask_clear(desc->pending_mask);
52#endif 52#endif
53#endif 53#endif
54 spin_unlock_irqrestore(&desc->lock, flags); 54 raw_spin_unlock_irqrestore(&desc->lock, flags);
55} 55}
56 56
57/** 57/**
@@ -68,9 +68,9 @@ void dynamic_irq_cleanup(unsigned int irq)
68 return; 68 return;
69 } 69 }
70 70
71 spin_lock_irqsave(&desc->lock, flags); 71 raw_spin_lock_irqsave(&desc->lock, flags);
72 if (desc->action) { 72 if (desc->action) {
73 spin_unlock_irqrestore(&desc->lock, flags); 73 raw_spin_unlock_irqrestore(&desc->lock, flags);
74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n", 74 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
75 irq); 75 irq);
76 return; 76 return;
@@ -82,7 +82,7 @@ void dynamic_irq_cleanup(unsigned int irq)
82 desc->chip = &no_irq_chip; 82 desc->chip = &no_irq_chip;
83 desc->name = NULL; 83 desc->name = NULL;
84 clear_kstat_irqs(desc); 84 clear_kstat_irqs(desc);
85 spin_unlock_irqrestore(&desc->lock, flags); 85 raw_spin_unlock_irqrestore(&desc->lock, flags);
86} 86}
87 87
88 88
@@ -104,10 +104,10 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
104 if (!chip) 104 if (!chip)
105 chip = &no_irq_chip; 105 chip = &no_irq_chip;
106 106
107 spin_lock_irqsave(&desc->lock, flags); 107 raw_spin_lock_irqsave(&desc->lock, flags);
108 irq_chip_set_defaults(chip); 108 irq_chip_set_defaults(chip);
109 desc->chip = chip; 109 desc->chip = chip;
110 spin_unlock_irqrestore(&desc->lock, flags); 110 raw_spin_unlock_irqrestore(&desc->lock, flags);
111 111
112 return 0; 112 return 0;
113} 113}
@@ -133,9 +133,9 @@ int set_irq_type(unsigned int irq, unsigned int type)
133 if (type == IRQ_TYPE_NONE) 133 if (type == IRQ_TYPE_NONE)
134 return 0; 134 return 0;
135 135
136 spin_lock_irqsave(&desc->lock, flags); 136 raw_spin_lock_irqsave(&desc->lock, flags);
137 ret = __irq_set_trigger(desc, irq, type); 137 ret = __irq_set_trigger(desc, irq, type);
138 spin_unlock_irqrestore(&desc->lock, flags); 138 raw_spin_unlock_irqrestore(&desc->lock, flags);
139 return ret; 139 return ret;
140} 140}
141EXPORT_SYMBOL(set_irq_type); 141EXPORT_SYMBOL(set_irq_type);
@@ -158,9 +158,9 @@ int set_irq_data(unsigned int irq, void *data)
158 return -EINVAL; 158 return -EINVAL;
159 } 159 }
160 160
161 spin_lock_irqsave(&desc->lock, flags); 161 raw_spin_lock_irqsave(&desc->lock, flags);
162 desc->handler_data = data; 162 desc->handler_data = data;
163 spin_unlock_irqrestore(&desc->lock, flags); 163 raw_spin_unlock_irqrestore(&desc->lock, flags);
164 return 0; 164 return 0;
165} 165}
166EXPORT_SYMBOL(set_irq_data); 166EXPORT_SYMBOL(set_irq_data);
@@ -183,11 +183,11 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
183 return -EINVAL; 183 return -EINVAL;
184 } 184 }
185 185
186 spin_lock_irqsave(&desc->lock, flags); 186 raw_spin_lock_irqsave(&desc->lock, flags);
187 desc->msi_desc = entry; 187 desc->msi_desc = entry;
188 if (entry) 188 if (entry)
189 entry->irq = irq; 189 entry->irq = irq;
190 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
191 return 0; 191 return 0;
192} 192}
193 193
@@ -214,9 +214,9 @@ int set_irq_chip_data(unsigned int irq, void *data)
214 return -EINVAL; 214 return -EINVAL;
215 } 215 }
216 216
217 spin_lock_irqsave(&desc->lock, flags); 217 raw_spin_lock_irqsave(&desc->lock, flags);
218 desc->chip_data = data; 218 desc->chip_data = data;
219 spin_unlock_irqrestore(&desc->lock, flags); 219 raw_spin_unlock_irqrestore(&desc->lock, flags);
220 220
221 return 0; 221 return 0;
222} 222}
@@ -241,12 +241,12 @@ void set_irq_nested_thread(unsigned int irq, int nest)
241 if (!desc) 241 if (!desc)
242 return; 242 return;
243 243
244 spin_lock_irqsave(&desc->lock, flags); 244 raw_spin_lock_irqsave(&desc->lock, flags);
245 if (nest) 245 if (nest)
246 desc->status |= IRQ_NESTED_THREAD; 246 desc->status |= IRQ_NESTED_THREAD;
247 else 247 else
248 desc->status &= ~IRQ_NESTED_THREAD; 248 desc->status &= ~IRQ_NESTED_THREAD;
249 spin_unlock_irqrestore(&desc->lock, flags); 249 raw_spin_unlock_irqrestore(&desc->lock, flags);
250} 250}
251EXPORT_SYMBOL_GPL(set_irq_nested_thread); 251EXPORT_SYMBOL_GPL(set_irq_nested_thread);
252 252
@@ -343,7 +343,7 @@ void handle_nested_irq(unsigned int irq)
343 343
344 might_sleep(); 344 might_sleep();
345 345
346 spin_lock_irq(&desc->lock); 346 raw_spin_lock_irq(&desc->lock);
347 347
348 kstat_incr_irqs_this_cpu(irq, desc); 348 kstat_incr_irqs_this_cpu(irq, desc);
349 349
@@ -352,17 +352,17 @@ void handle_nested_irq(unsigned int irq)
352 goto out_unlock; 352 goto out_unlock;
353 353
354 desc->status |= IRQ_INPROGRESS; 354 desc->status |= IRQ_INPROGRESS;
355 spin_unlock_irq(&desc->lock); 355 raw_spin_unlock_irq(&desc->lock);
356 356
357 action_ret = action->thread_fn(action->irq, action->dev_id); 357 action_ret = action->thread_fn(action->irq, action->dev_id);
358 if (!noirqdebug) 358 if (!noirqdebug)
359 note_interrupt(irq, desc, action_ret); 359 note_interrupt(irq, desc, action_ret);
360 360
361 spin_lock_irq(&desc->lock); 361 raw_spin_lock_irq(&desc->lock);
362 desc->status &= ~IRQ_INPROGRESS; 362 desc->status &= ~IRQ_INPROGRESS;
363 363
364out_unlock: 364out_unlock:
365 spin_unlock_irq(&desc->lock); 365 raw_spin_unlock_irq(&desc->lock);
366} 366}
367EXPORT_SYMBOL_GPL(handle_nested_irq); 367EXPORT_SYMBOL_GPL(handle_nested_irq);
368 368
@@ -384,7 +384,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
384 struct irqaction *action; 384 struct irqaction *action;
385 irqreturn_t action_ret; 385 irqreturn_t action_ret;
386 386
387 spin_lock(&desc->lock); 387 raw_spin_lock(&desc->lock);
388 388
389 if (unlikely(desc->status & IRQ_INPROGRESS)) 389 if (unlikely(desc->status & IRQ_INPROGRESS))
390 goto out_unlock; 390 goto out_unlock;
@@ -396,16 +396,16 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
396 goto out_unlock; 396 goto out_unlock;
397 397
398 desc->status |= IRQ_INPROGRESS; 398 desc->status |= IRQ_INPROGRESS;
399 spin_unlock(&desc->lock); 399 raw_spin_unlock(&desc->lock);
400 400
401 action_ret = handle_IRQ_event(irq, action); 401 action_ret = handle_IRQ_event(irq, action);
402 if (!noirqdebug) 402 if (!noirqdebug)
403 note_interrupt(irq, desc, action_ret); 403 note_interrupt(irq, desc, action_ret);
404 404
405 spin_lock(&desc->lock); 405 raw_spin_lock(&desc->lock);
406 desc->status &= ~IRQ_INPROGRESS; 406 desc->status &= ~IRQ_INPROGRESS;
407out_unlock: 407out_unlock:
408 spin_unlock(&desc->lock); 408 raw_spin_unlock(&desc->lock);
409} 409}
410 410
411/** 411/**
@@ -424,7 +424,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
424 struct irqaction *action; 424 struct irqaction *action;
425 irqreturn_t action_ret; 425 irqreturn_t action_ret;
426 426
427 spin_lock(&desc->lock); 427 raw_spin_lock(&desc->lock);
428 mask_ack_irq(desc, irq); 428 mask_ack_irq(desc, irq);
429 429
430 if (unlikely(desc->status & IRQ_INPROGRESS)) 430 if (unlikely(desc->status & IRQ_INPROGRESS))
@@ -441,13 +441,13 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
441 goto out_unlock; 441 goto out_unlock;
442 442
443 desc->status |= IRQ_INPROGRESS; 443 desc->status |= IRQ_INPROGRESS;
444 spin_unlock(&desc->lock); 444 raw_spin_unlock(&desc->lock);
445 445
446 action_ret = handle_IRQ_event(irq, action); 446 action_ret = handle_IRQ_event(irq, action);
447 if (!noirqdebug) 447 if (!noirqdebug)
448 note_interrupt(irq, desc, action_ret); 448 note_interrupt(irq, desc, action_ret);
449 449
450 spin_lock(&desc->lock); 450 raw_spin_lock(&desc->lock);
451 desc->status &= ~IRQ_INPROGRESS; 451 desc->status &= ~IRQ_INPROGRESS;
452 452
453 if (unlikely(desc->status & IRQ_ONESHOT)) 453 if (unlikely(desc->status & IRQ_ONESHOT))
@@ -455,7 +455,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) 455 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
456 desc->chip->unmask(irq); 456 desc->chip->unmask(irq);
457out_unlock: 457out_unlock:
458 spin_unlock(&desc->lock); 458 raw_spin_unlock(&desc->lock);
459} 459}
460EXPORT_SYMBOL_GPL(handle_level_irq); 460EXPORT_SYMBOL_GPL(handle_level_irq);
461 461
@@ -475,7 +475,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
475 struct irqaction *action; 475 struct irqaction *action;
476 irqreturn_t action_ret; 476 irqreturn_t action_ret;
477 477
478 spin_lock(&desc->lock); 478 raw_spin_lock(&desc->lock);
479 479
480 if (unlikely(desc->status & IRQ_INPROGRESS)) 480 if (unlikely(desc->status & IRQ_INPROGRESS))
481 goto out; 481 goto out;
@@ -497,18 +497,18 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
497 497
498 desc->status |= IRQ_INPROGRESS; 498 desc->status |= IRQ_INPROGRESS;
499 desc->status &= ~IRQ_PENDING; 499 desc->status &= ~IRQ_PENDING;
500 spin_unlock(&desc->lock); 500 raw_spin_unlock(&desc->lock);
501 501
502 action_ret = handle_IRQ_event(irq, action); 502 action_ret = handle_IRQ_event(irq, action);
503 if (!noirqdebug) 503 if (!noirqdebug)
504 note_interrupt(irq, desc, action_ret); 504 note_interrupt(irq, desc, action_ret);
505 505
506 spin_lock(&desc->lock); 506 raw_spin_lock(&desc->lock);
507 desc->status &= ~IRQ_INPROGRESS; 507 desc->status &= ~IRQ_INPROGRESS;
508out: 508out:
509 desc->chip->eoi(irq); 509 desc->chip->eoi(irq);
510 510
511 spin_unlock(&desc->lock); 511 raw_spin_unlock(&desc->lock);
512} 512}
513 513
514/** 514/**
@@ -530,7 +530,7 @@ out:
530void 530void
531handle_edge_irq(unsigned int irq, struct irq_desc *desc) 531handle_edge_irq(unsigned int irq, struct irq_desc *desc)
532{ 532{
533 spin_lock(&desc->lock); 533 raw_spin_lock(&desc->lock);
534 534
535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 535 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
536 536
@@ -576,17 +576,17 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
576 } 576 }
577 577
578 desc->status &= ~IRQ_PENDING; 578 desc->status &= ~IRQ_PENDING;
579 spin_unlock(&desc->lock); 579 raw_spin_unlock(&desc->lock);
580 action_ret = handle_IRQ_event(irq, action); 580 action_ret = handle_IRQ_event(irq, action);
581 if (!noirqdebug) 581 if (!noirqdebug)
582 note_interrupt(irq, desc, action_ret); 582 note_interrupt(irq, desc, action_ret);
583 spin_lock(&desc->lock); 583 raw_spin_lock(&desc->lock);
584 584
585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING); 585 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
586 586
587 desc->status &= ~IRQ_INPROGRESS; 587 desc->status &= ~IRQ_INPROGRESS;
588out_unlock: 588out_unlock:
589 spin_unlock(&desc->lock); 589 raw_spin_unlock(&desc->lock);
590} 590}
591 591
592/** 592/**
@@ -643,7 +643,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
643 } 643 }
644 644
645 chip_bus_lock(irq, desc); 645 chip_bus_lock(irq, desc);
646 spin_lock_irqsave(&desc->lock, flags); 646 raw_spin_lock_irqsave(&desc->lock, flags);
647 647
648 /* Uninstall? */ 648 /* Uninstall? */
649 if (handle == handle_bad_irq) { 649 if (handle == handle_bad_irq) {
@@ -661,7 +661,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
661 desc->depth = 0; 661 desc->depth = 0;
662 desc->chip->startup(irq); 662 desc->chip->startup(irq);
663 } 663 }
664 spin_unlock_irqrestore(&desc->lock, flags); 664 raw_spin_unlock_irqrestore(&desc->lock, flags);
665 chip_bus_sync_unlock(irq, desc); 665 chip_bus_sync_unlock(irq, desc);
666} 666}
667EXPORT_SYMBOL_GPL(__set_irq_handler); 667EXPORT_SYMBOL_GPL(__set_irq_handler);
@@ -692,9 +692,9 @@ void __init set_irq_noprobe(unsigned int irq)
692 return; 692 return;
693 } 693 }
694 694
695 spin_lock_irqsave(&desc->lock, flags); 695 raw_spin_lock_irqsave(&desc->lock, flags);
696 desc->status |= IRQ_NOPROBE; 696 desc->status |= IRQ_NOPROBE;
697 spin_unlock_irqrestore(&desc->lock, flags); 697 raw_spin_unlock_irqrestore(&desc->lock, flags);
698} 698}
699 699
700void __init set_irq_probe(unsigned int irq) 700void __init set_irq_probe(unsigned int irq)
@@ -707,7 +707,7 @@ void __init set_irq_probe(unsigned int irq)
707 return; 707 return;
708 } 708 }
709 709
710 spin_lock_irqsave(&desc->lock, flags); 710 raw_spin_lock_irqsave(&desc->lock, flags);
711 desc->status &= ~IRQ_NOPROBE; 711 desc->status &= ~IRQ_NOPROBE;
712 spin_unlock_irqrestore(&desc->lock, flags); 712 raw_spin_unlock_irqrestore(&desc->lock, flags);
713} 713}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 17c71bb565c6..814940e7f485 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -80,7 +80,7 @@ static struct irq_desc irq_desc_init = {
80 .chip = &no_irq_chip, 80 .chip = &no_irq_chip,
81 .handle_irq = handle_bad_irq, 81 .handle_irq = handle_bad_irq,
82 .depth = 1, 82 .depth = 1,
83 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 83 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
84}; 84};
85 85
86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) 86void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
@@ -108,7 +108,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
108{ 108{
109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); 109 memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
110 110
111 spin_lock_init(&desc->lock); 111 raw_spin_lock_init(&desc->lock);
112 desc->irq = irq; 112 desc->irq = irq;
113#ifdef CONFIG_SMP 113#ifdef CONFIG_SMP
114 desc->node = node; 114 desc->node = node;
@@ -130,7 +130,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
130/* 130/*
131 * Protect the sparse_irqs: 131 * Protect the sparse_irqs:
132 */ 132 */
133DEFINE_SPINLOCK(sparse_irq_lock); 133DEFINE_RAW_SPINLOCK(sparse_irq_lock);
134 134
135struct irq_desc **irq_desc_ptrs __read_mostly; 135struct irq_desc **irq_desc_ptrs __read_mostly;
136 136
@@ -141,7 +141,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
141 .chip = &no_irq_chip, 141 .chip = &no_irq_chip,
142 .handle_irq = handle_bad_irq, 142 .handle_irq = handle_bad_irq,
143 .depth = 1, 143 .depth = 1,
144 .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), 144 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
145 } 145 }
146}; 146};
147 147
@@ -212,7 +212,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
212 if (desc) 212 if (desc)
213 return desc; 213 return desc;
214 214
215 spin_lock_irqsave(&sparse_irq_lock, flags); 215 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
216 216
217 /* We have to check it to avoid races with another CPU */ 217 /* We have to check it to avoid races with another CPU */
218 desc = irq_desc_ptrs[irq]; 218 desc = irq_desc_ptrs[irq];
@@ -234,7 +234,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
234 irq_desc_ptrs[irq] = desc; 234 irq_desc_ptrs[irq] = desc;
235 235
236out_unlock: 236out_unlock:
237 spin_unlock_irqrestore(&sparse_irq_lock, flags); 237 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
238 238
239 return desc; 239 return desc;
240} 240}
@@ -247,7 +247,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
247 .chip = &no_irq_chip, 247 .chip = &no_irq_chip,
248 .handle_irq = handle_bad_irq, 248 .handle_irq = handle_bad_irq,
249 .depth = 1, 249 .depth = 1,
250 .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), 250 .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
251 } 251 }
252}; 252};
253 253
@@ -473,7 +473,7 @@ unsigned int __do_IRQ(unsigned int irq)
473 return 1; 473 return 1;
474 } 474 }
475 475
476 spin_lock(&desc->lock); 476 raw_spin_lock(&desc->lock);
477 if (desc->chip->ack) 477 if (desc->chip->ack)
478 desc->chip->ack(irq); 478 desc->chip->ack(irq);
479 /* 479 /*
@@ -517,13 +517,13 @@ unsigned int __do_IRQ(unsigned int irq)
517 for (;;) { 517 for (;;) {
518 irqreturn_t action_ret; 518 irqreturn_t action_ret;
519 519
520 spin_unlock(&desc->lock); 520 raw_spin_unlock(&desc->lock);
521 521
522 action_ret = handle_IRQ_event(irq, action); 522 action_ret = handle_IRQ_event(irq, action);
523 if (!noirqdebug) 523 if (!noirqdebug)
524 note_interrupt(irq, desc, action_ret); 524 note_interrupt(irq, desc, action_ret);
525 525
526 spin_lock(&desc->lock); 526 raw_spin_lock(&desc->lock);
527 if (likely(!(desc->status & IRQ_PENDING))) 527 if (likely(!(desc->status & IRQ_PENDING)))
528 break; 528 break;
529 desc->status &= ~IRQ_PENDING; 529 desc->status &= ~IRQ_PENDING;
@@ -536,7 +536,7 @@ out:
536 * disabled while the handler was running. 536 * disabled while the handler was running.
537 */ 537 */
538 desc->chip->end(irq); 538 desc->chip->end(irq);
539 spin_unlock(&desc->lock); 539 raw_spin_unlock(&desc->lock);
540 540
541 return 1; 541 return 1;
542} 542}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 1b5d742c6a77..b2821f070a3d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -18,7 +18,7 @@ extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
18extern struct lock_class_key irq_desc_lock_class; 18extern struct lock_class_key irq_desc_lock_class;
19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); 19extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
20extern void clear_kstat_irqs(struct irq_desc *desc); 20extern void clear_kstat_irqs(struct irq_desc *desc);
21extern spinlock_t sparse_irq_lock; 21extern raw_spinlock_t sparse_irq_lock;
22 22
23#ifdef CONFIG_SPARSE_IRQ 23#ifdef CONFIG_SPARSE_IRQ
24/* irq_desc_ptrs allocated at boot time */ 24/* irq_desc_ptrs allocated at boot time */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7305b297d1eb..eb6078ca60c7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -46,9 +46,9 @@ void synchronize_irq(unsigned int irq)
46 cpu_relax(); 46 cpu_relax();
47 47
48 /* Ok, that indicated we're done: double-check carefully. */ 48 /* Ok, that indicated we're done: double-check carefully. */
49 spin_lock_irqsave(&desc->lock, flags); 49 raw_spin_lock_irqsave(&desc->lock, flags);
50 status = desc->status; 50 status = desc->status;
51 spin_unlock_irqrestore(&desc->lock, flags); 51 raw_spin_unlock_irqrestore(&desc->lock, flags);
52 52
53 /* Oops, that failed? */ 53 /* Oops, that failed? */
54 } while (status & IRQ_INPROGRESS); 54 } while (status & IRQ_INPROGRESS);
@@ -114,7 +114,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
114 if (!desc->chip->set_affinity) 114 if (!desc->chip->set_affinity)
115 return -EINVAL; 115 return -EINVAL;
116 116
117 spin_lock_irqsave(&desc->lock, flags); 117 raw_spin_lock_irqsave(&desc->lock, flags);
118 118
119#ifdef CONFIG_GENERIC_PENDING_IRQ 119#ifdef CONFIG_GENERIC_PENDING_IRQ
120 if (desc->status & IRQ_MOVE_PCNTXT) { 120 if (desc->status & IRQ_MOVE_PCNTXT) {
@@ -134,7 +134,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
134 } 134 }
135#endif 135#endif
136 desc->status |= IRQ_AFFINITY_SET; 136 desc->status |= IRQ_AFFINITY_SET;
137 spin_unlock_irqrestore(&desc->lock, flags); 137 raw_spin_unlock_irqrestore(&desc->lock, flags);
138 return 0; 138 return 0;
139} 139}
140 140
@@ -181,11 +181,11 @@ int irq_select_affinity_usr(unsigned int irq)
181 unsigned long flags; 181 unsigned long flags;
182 int ret; 182 int ret;
183 183
184 spin_lock_irqsave(&desc->lock, flags); 184 raw_spin_lock_irqsave(&desc->lock, flags);
185 ret = setup_affinity(irq, desc); 185 ret = setup_affinity(irq, desc);
186 if (!ret) 186 if (!ret)
187 irq_set_thread_affinity(desc); 187 irq_set_thread_affinity(desc);
188 spin_unlock_irqrestore(&desc->lock, flags); 188 raw_spin_unlock_irqrestore(&desc->lock, flags);
189 189
190 return ret; 190 return ret;
191} 191}
@@ -231,9 +231,9 @@ void disable_irq_nosync(unsigned int irq)
231 return; 231 return;
232 232
233 chip_bus_lock(irq, desc); 233 chip_bus_lock(irq, desc);
234 spin_lock_irqsave(&desc->lock, flags); 234 raw_spin_lock_irqsave(&desc->lock, flags);
235 __disable_irq(desc, irq, false); 235 __disable_irq(desc, irq, false);
236 spin_unlock_irqrestore(&desc->lock, flags); 236 raw_spin_unlock_irqrestore(&desc->lock, flags);
237 chip_bus_sync_unlock(irq, desc); 237 chip_bus_sync_unlock(irq, desc);
238} 238}
239EXPORT_SYMBOL(disable_irq_nosync); 239EXPORT_SYMBOL(disable_irq_nosync);
@@ -308,9 +308,9 @@ void enable_irq(unsigned int irq)
308 return; 308 return;
309 309
310 chip_bus_lock(irq, desc); 310 chip_bus_lock(irq, desc);
311 spin_lock_irqsave(&desc->lock, flags); 311 raw_spin_lock_irqsave(&desc->lock, flags);
312 __enable_irq(desc, irq, false); 312 __enable_irq(desc, irq, false);
313 spin_unlock_irqrestore(&desc->lock, flags); 313 raw_spin_unlock_irqrestore(&desc->lock, flags);
314 chip_bus_sync_unlock(irq, desc); 314 chip_bus_sync_unlock(irq, desc);
315} 315}
316EXPORT_SYMBOL(enable_irq); 316EXPORT_SYMBOL(enable_irq);
@@ -347,7 +347,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
347 /* wakeup-capable irqs can be shared between drivers that 347 /* wakeup-capable irqs can be shared between drivers that
348 * don't need to have the same sleep mode behaviors. 348 * don't need to have the same sleep mode behaviors.
349 */ 349 */
350 spin_lock_irqsave(&desc->lock, flags); 350 raw_spin_lock_irqsave(&desc->lock, flags);
351 if (on) { 351 if (on) {
352 if (desc->wake_depth++ == 0) { 352 if (desc->wake_depth++ == 0) {
353 ret = set_irq_wake_real(irq, on); 353 ret = set_irq_wake_real(irq, on);
@@ -368,7 +368,7 @@ int set_irq_wake(unsigned int irq, unsigned int on)
368 } 368 }
369 } 369 }
370 370
371 spin_unlock_irqrestore(&desc->lock, flags); 371 raw_spin_unlock_irqrestore(&desc->lock, flags);
372 return ret; 372 return ret;
373} 373}
374EXPORT_SYMBOL(set_irq_wake); 374EXPORT_SYMBOL(set_irq_wake);
@@ -484,12 +484,12 @@ static int irq_wait_for_interrupt(struct irqaction *action)
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 485{
486 chip_bus_lock(irq, desc); 486 chip_bus_lock(irq, desc);
487 spin_lock_irq(&desc->lock); 487 raw_spin_lock_irq(&desc->lock);
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 489 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 490 desc->chip->unmask(irq);
491 } 491 }
492 spin_unlock_irq(&desc->lock); 492 raw_spin_unlock_irq(&desc->lock);
493 chip_bus_sync_unlock(irq, desc); 493 chip_bus_sync_unlock(irq, desc);
494} 494}
495 495
@@ -514,9 +514,9 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
514 return; 514 return;
515 } 515 }
516 516
517 spin_lock_irq(&desc->lock); 517 raw_spin_lock_irq(&desc->lock);
518 cpumask_copy(mask, desc->affinity); 518 cpumask_copy(mask, desc->affinity);
519 spin_unlock_irq(&desc->lock); 519 raw_spin_unlock_irq(&desc->lock);
520 520
521 set_cpus_allowed_ptr(current, mask); 521 set_cpus_allowed_ptr(current, mask);
522 free_cpumask_var(mask); 522 free_cpumask_var(mask);
@@ -545,7 +545,7 @@ static int irq_thread(void *data)
545 545
546 atomic_inc(&desc->threads_active); 546 atomic_inc(&desc->threads_active);
547 547
548 spin_lock_irq(&desc->lock); 548 raw_spin_lock_irq(&desc->lock);
549 if (unlikely(desc->status & IRQ_DISABLED)) { 549 if (unlikely(desc->status & IRQ_DISABLED)) {
550 /* 550 /*
551 * CHECKME: We might need a dedicated 551 * CHECKME: We might need a dedicated
@@ -555,9 +555,9 @@ static int irq_thread(void *data)
555 * retriggers the interrupt itself --- tglx 555 * retriggers the interrupt itself --- tglx
556 */ 556 */
557 desc->status |= IRQ_PENDING; 557 desc->status |= IRQ_PENDING;
558 spin_unlock_irq(&desc->lock); 558 raw_spin_unlock_irq(&desc->lock);
559 } else { 559 } else {
560 spin_unlock_irq(&desc->lock); 560 raw_spin_unlock_irq(&desc->lock);
561 561
562 action->thread_fn(action->irq, action->dev_id); 562 action->thread_fn(action->irq, action->dev_id);
563 563
@@ -679,7 +679,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
679 /* 679 /*
680 * The following block of code has to be executed atomically 680 * The following block of code has to be executed atomically
681 */ 681 */
682 spin_lock_irqsave(&desc->lock, flags); 682 raw_spin_lock_irqsave(&desc->lock, flags);
683 old_ptr = &desc->action; 683 old_ptr = &desc->action;
684 old = *old_ptr; 684 old = *old_ptr;
685 if (old) { 685 if (old) {
@@ -775,7 +775,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
775 __enable_irq(desc, irq, false); 775 __enable_irq(desc, irq, false);
776 } 776 }
777 777
778 spin_unlock_irqrestore(&desc->lock, flags); 778 raw_spin_unlock_irqrestore(&desc->lock, flags);
779 779
780 /* 780 /*
781 * Strictly no need to wake it up, but hung_task complains 781 * Strictly no need to wake it up, but hung_task complains
@@ -802,7 +802,7 @@ mismatch:
802 ret = -EBUSY; 802 ret = -EBUSY;
803 803
804out_thread: 804out_thread:
805 spin_unlock_irqrestore(&desc->lock, flags); 805 raw_spin_unlock_irqrestore(&desc->lock, flags);
806 if (new->thread) { 806 if (new->thread) {
807 struct task_struct *t = new->thread; 807 struct task_struct *t = new->thread;
808 808
@@ -844,7 +844,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
844 if (!desc) 844 if (!desc)
845 return NULL; 845 return NULL;
846 846
847 spin_lock_irqsave(&desc->lock, flags); 847 raw_spin_lock_irqsave(&desc->lock, flags);
848 848
849 /* 849 /*
850 * There can be multiple actions per IRQ descriptor, find the right 850 * There can be multiple actions per IRQ descriptor, find the right
@@ -856,7 +856,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
856 856
857 if (!action) { 857 if (!action) {
858 WARN(1, "Trying to free already-free IRQ %d\n", irq); 858 WARN(1, "Trying to free already-free IRQ %d\n", irq);
859 spin_unlock_irqrestore(&desc->lock, flags); 859 raw_spin_unlock_irqrestore(&desc->lock, flags);
860 860
861 return NULL; 861 return NULL;
862 } 862 }
@@ -884,7 +884,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
884 desc->chip->disable(irq); 884 desc->chip->disable(irq);
885 } 885 }
886 886
887 spin_unlock_irqrestore(&desc->lock, flags); 887 raw_spin_unlock_irqrestore(&desc->lock, flags);
888 888
889 unregister_handler_proc(irq, action); 889 unregister_handler_proc(irq, action);
890 890
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index fcb6c96f2627..241962280836 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -27,7 +27,7 @@ void move_masked_irq(int irq)
27 if (!desc->chip->set_affinity) 27 if (!desc->chip->set_affinity)
28 return; 28 return;
29 29
30 assert_spin_locked(&desc->lock); 30 assert_raw_spin_locked(&desc->lock);
31 31
32 /* 32 /*
33 * If there was a valid mask to work with, please 33 * If there was a valid mask to work with, please
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 3fd30197da2e..26bac9d8f860 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -42,7 +42,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
42 "for migration.\n", irq); 42 "for migration.\n", irq);
43 return false; 43 return false;
44 } 44 }
45 spin_lock_init(&desc->lock); 45 raw_spin_lock_init(&desc->lock);
46 desc->node = node; 46 desc->node = node;
47 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 47 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids); 48 init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
@@ -67,7 +67,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
67 67
68 irq = old_desc->irq; 68 irq = old_desc->irq;
69 69
70 spin_lock_irqsave(&sparse_irq_lock, flags); 70 raw_spin_lock_irqsave(&sparse_irq_lock, flags);
71 71
72 /* We have to check it to avoid races with another CPU */ 72 /* We have to check it to avoid races with another CPU */
73 desc = irq_desc_ptrs[irq]; 73 desc = irq_desc_ptrs[irq];
@@ -91,7 +91,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
91 } 91 }
92 92
93 irq_desc_ptrs[irq] = desc; 93 irq_desc_ptrs[irq] = desc;
94 spin_unlock_irqrestore(&sparse_irq_lock, flags); 94 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
95 95
96 /* free the old one */ 96 /* free the old one */
97 free_one_irq_desc(old_desc, desc); 97 free_one_irq_desc(old_desc, desc);
@@ -100,7 +100,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
100 return desc; 100 return desc;
101 101
102out_unlock: 102out_unlock:
103 spin_unlock_irqrestore(&sparse_irq_lock, flags); 103 raw_spin_unlock_irqrestore(&sparse_irq_lock, flags);
104 104
105 return desc; 105 return desc;
106} 106}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index a0bb09e79867..0d4005d85b03 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -28,9 +28,9 @@ void suspend_device_irqs(void)
28 for_each_irq_desc(irq, desc) { 28 for_each_irq_desc(irq, desc) {
29 unsigned long flags; 29 unsigned long flags;
30 30
31 spin_lock_irqsave(&desc->lock, flags); 31 raw_spin_lock_irqsave(&desc->lock, flags);
32 __disable_irq(desc, irq, true); 32 __disable_irq(desc, irq, true);
33 spin_unlock_irqrestore(&desc->lock, flags); 33 raw_spin_unlock_irqrestore(&desc->lock, flags);
34 } 34 }
35 35
36 for_each_irq_desc(irq, desc) 36 for_each_irq_desc(irq, desc)
@@ -56,9 +56,9 @@ void resume_device_irqs(void)
56 if (!(desc->status & IRQ_SUSPENDED)) 56 if (!(desc->status & IRQ_SUSPENDED))
57 continue; 57 continue;
58 58
59 spin_lock_irqsave(&desc->lock, flags); 59 raw_spin_lock_irqsave(&desc->lock, flags);
60 __enable_irq(desc, irq, true); 60 __enable_irq(desc, irq, true);
61 spin_unlock_irqrestore(&desc->lock, flags); 61 raw_spin_unlock_irqrestore(&desc->lock, flags);
62 } 62 }
63} 63}
64EXPORT_SYMBOL_GPL(resume_device_irqs); 64EXPORT_SYMBOL_GPL(resume_device_irqs);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 0832145fea97..6f50eccc79c0 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -179,7 +179,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
179 unsigned long flags; 179 unsigned long flags;
180 int ret = 1; 180 int ret = 1;
181 181
182 spin_lock_irqsave(&desc->lock, flags); 182 raw_spin_lock_irqsave(&desc->lock, flags);
183 for (action = desc->action ; action; action = action->next) { 183 for (action = desc->action ; action; action = action->next) {
184 if ((action != new_action) && action->name && 184 if ((action != new_action) && action->name &&
185 !strcmp(new_action->name, action->name)) { 185 !strcmp(new_action->name, action->name)) {
@@ -187,7 +187,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
187 break; 187 break;
188 } 188 }
189 } 189 }
190 spin_unlock_irqrestore(&desc->lock, flags); 190 raw_spin_unlock_irqrestore(&desc->lock, flags);
191 return ret; 191 return ret;
192} 192}
193 193
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 22b0a6eedf24..89fb90ae534f 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -28,7 +28,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
28 struct irqaction *action; 28 struct irqaction *action;
29 int ok = 0, work = 0; 29 int ok = 0, work = 0;
30 30
31 spin_lock(&desc->lock); 31 raw_spin_lock(&desc->lock);
32 /* Already running on another processor */ 32 /* Already running on another processor */
33 if (desc->status & IRQ_INPROGRESS) { 33 if (desc->status & IRQ_INPROGRESS) {
34 /* 34 /*
@@ -37,13 +37,13 @@ static int try_one_irq(int irq, struct irq_desc *desc)
37 */ 37 */
38 if (desc->action && (desc->action->flags & IRQF_SHARED)) 38 if (desc->action && (desc->action->flags & IRQF_SHARED))
39 desc->status |= IRQ_PENDING; 39 desc->status |= IRQ_PENDING;
40 spin_unlock(&desc->lock); 40 raw_spin_unlock(&desc->lock);
41 return ok; 41 return ok;
42 } 42 }
43 /* Honour the normal IRQ locking */ 43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS; 44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action; 45 action = desc->action;
46 spin_unlock(&desc->lock); 46 raw_spin_unlock(&desc->lock);
47 47
48 while (action) { 48 while (action) {
49 /* Only shared IRQ handlers are safe to call */ 49 /* Only shared IRQ handlers are safe to call */
@@ -56,7 +56,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
56 } 56 }
57 local_irq_disable(); 57 local_irq_disable();
58 /* Now clean up the flags */ 58 /* Now clean up the flags */
59 spin_lock(&desc->lock); 59 raw_spin_lock(&desc->lock);
60 action = desc->action; 60 action = desc->action;
61 61
62 /* 62 /*
@@ -68,9 +68,9 @@ static int try_one_irq(int irq, struct irq_desc *desc)
68 * Perform real IRQ processing for the IRQ we deferred 68 * Perform real IRQ processing for the IRQ we deferred
69 */ 69 */
70 work = 1; 70 work = 1;
71 spin_unlock(&desc->lock); 71 raw_spin_unlock(&desc->lock);
72 handle_IRQ_event(irq, action); 72 handle_IRQ_event(irq, action);
73 spin_lock(&desc->lock); 73 raw_spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING; 74 desc->status &= ~IRQ_PENDING;
75 } 75 }
76 desc->status &= ~IRQ_INPROGRESS; 76 desc->status &= ~IRQ_INPROGRESS;
@@ -80,7 +80,7 @@ static int try_one_irq(int irq, struct irq_desc *desc)
80 */ 80 */
81 if (work && desc->chip && desc->chip->end) 81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq); 82 desc->chip->end(irq);
83 spin_unlock(&desc->lock); 83 raw_spin_unlock(&desc->lock);
84 84
85 return ok; 85 return ok;
86} 86}
@@ -220,7 +220,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
220 /* 220 /*
221 * If we are seeing only the odd spurious IRQ caused by 221 * If we are seeing only the odd spurious IRQ caused by
222 * bus asynchronicity then don't eventually trigger an error, 222 * bus asynchronicity then don't eventually trigger an error,
223 * otherwise the couter becomes a doomsday timer for otherwise 223 * otherwise the counter becomes a doomsday timer for otherwise
224 * working systems 224 * working systems
225 */ 225 */
226 if (time_after(jiffies, desc->last_unhandled + HZ/10)) 226 if (time_after(jiffies, desc->last_unhandled + HZ/10))
diff --git a/kernel/kexec.c b/kernel/kexec.c
index f336e2107f98..ef077fb73155 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -21,7 +21,7 @@
21#include <linux/hardirq.h> 21#include <linux/hardirq.h>
22#include <linux/elf.h> 22#include <linux/elf.h>
23#include <linux/elfcore.h> 23#include <linux/elfcore.h>
24#include <linux/utsrelease.h> 24#include <generated/utsrelease.h>
25#include <linux/utsname.h> 25#include <linux/utsname.h>
26#include <linux/numa.h> 26#include <linux/numa.h>
27#include <linux/suspend.h> 27#include <linux/suspend.h>
@@ -31,6 +31,8 @@
31#include <linux/cpu.h> 31#include <linux/cpu.h>
32#include <linux/console.h> 32#include <linux/console.h>
33#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
34#include <linux/swap.h>
35#include <linux/kmsg_dump.h>
34 36
35#include <asm/page.h> 37#include <asm/page.h>
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
@@ -1073,6 +1075,9 @@ void crash_kexec(struct pt_regs *regs)
1073 if (mutex_trylock(&kexec_mutex)) { 1075 if (mutex_trylock(&kexec_mutex)) {
1074 if (kexec_crash_image) { 1076 if (kexec_crash_image) {
1075 struct pt_regs fixed_regs; 1077 struct pt_regs fixed_regs;
1078
1079 kmsg_dump(KMSG_DUMP_KEXEC);
1080
1076 crash_setup_regs(&fixed_regs, regs); 1081 crash_setup_regs(&fixed_regs, regs);
1077 crash_save_vmcoreinfo(); 1082 crash_save_vmcoreinfo();
1078 machine_crash_shutdown(&fixed_regs); 1083 machine_crash_shutdown(&fixed_regs);
@@ -1082,6 +1087,64 @@ void crash_kexec(struct pt_regs *regs)
1082 } 1087 }
1083} 1088}
1084 1089
1090size_t crash_get_memory_size(void)
1091{
1092 size_t size;
1093 mutex_lock(&kexec_mutex);
1094 size = crashk_res.end - crashk_res.start + 1;
1095 mutex_unlock(&kexec_mutex);
1096 return size;
1097}
1098
1099static void free_reserved_phys_range(unsigned long begin, unsigned long end)
1100{
1101 unsigned long addr;
1102
1103 for (addr = begin; addr < end; addr += PAGE_SIZE) {
1104 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
1105 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1106 free_page((unsigned long)__va(addr));
1107 totalram_pages++;
1108 }
1109}
1110
1111int crash_shrink_memory(unsigned long new_size)
1112{
1113 int ret = 0;
1114 unsigned long start, end;
1115
1116 mutex_lock(&kexec_mutex);
1117
1118 if (kexec_crash_image) {
1119 ret = -ENOENT;
1120 goto unlock;
1121 }
1122 start = crashk_res.start;
1123 end = crashk_res.end;
1124
1125 if (new_size >= end - start + 1) {
1126 ret = -EINVAL;
1127 if (new_size == end - start + 1)
1128 ret = 0;
1129 goto unlock;
1130 }
1131
1132 start = roundup(start, PAGE_SIZE);
1133 end = roundup(start + new_size, PAGE_SIZE);
1134
1135 free_reserved_phys_range(end, crashk_res.end);
1136
1137 if (start == end) {
1138 crashk_res.end = end;
1139 release_resource(&crashk_res);
1140 } else
1141 crashk_res.end = end - 1;
1142
1143unlock:
1144 mutex_unlock(&kexec_mutex);
1145 return ret;
1146}
1147
1085static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, 1148static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1086 size_t data_len) 1149 size_t data_len)
1087{ 1150{
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 3765ff3c1bbe..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -1,6 +1,7 @@
1/* 1/*
2 * A simple kernel FIFO implementation. 2 * A generic kernel FIFO implementation.
3 * 3 *
4 * Copyright (C) 2009 Stefani Seibold <stefani@seibold.net>
4 * Copyright (C) 2004 Stelian Pop <stelian@popies.net> 5 * Copyright (C) 2004 Stelian Pop <stelian@popies.net>
5 * 6 *
6 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
@@ -25,50 +26,48 @@
25#include <linux/err.h> 26#include <linux/err.h>
26#include <linux/kfifo.h> 27#include <linux/kfifo.h>
27#include <linux/log2.h> 28#include <linux/log2.h>
29#include <linux/uaccess.h>
30
31static void _kfifo_init(struct kfifo *fifo, void *buffer,
32 unsigned int size)
33{
34 fifo->buffer = buffer;
35 fifo->size = size;
36
37 kfifo_reset(fifo);
38}
28 39
29/** 40/**
30 * kfifo_init - allocates a new FIFO using a preallocated buffer 41 * kfifo_init - initialize a FIFO using a preallocated buffer
42 * @fifo: the fifo to assign the buffer
31 * @buffer: the preallocated buffer to be used. 43 * @buffer: the preallocated buffer to be used.
32 * @size: the size of the internal buffer, this have to be a power of 2. 44 * @size: the size of the internal buffer, this has to be a power of 2.
33 * @gfp_mask: get_free_pages mask, passed to kmalloc()
34 * @lock: the lock to be used to protect the fifo buffer
35 * 45 *
36 * Do NOT pass the kfifo to kfifo_free() after use! Simply free the
37 * &struct kfifo with kfree().
38 */ 46 */
39struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, 47void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
40 gfp_t gfp_mask, spinlock_t *lock)
41{ 48{
42 struct kfifo *fifo;
43
44 /* size must be a power of 2 */ 49 /* size must be a power of 2 */
45 BUG_ON(!is_power_of_2(size)); 50 BUG_ON(!is_power_of_2(size));
46 51
47 fifo = kmalloc(sizeof(struct kfifo), gfp_mask); 52 _kfifo_init(fifo, buffer, size);
48 if (!fifo)
49 return ERR_PTR(-ENOMEM);
50
51 fifo->buffer = buffer;
52 fifo->size = size;
53 fifo->in = fifo->out = 0;
54 fifo->lock = lock;
55
56 return fifo;
57} 53}
58EXPORT_SYMBOL(kfifo_init); 54EXPORT_SYMBOL(kfifo_init);
59 55
60/** 56/**
61 * kfifo_alloc - allocates a new FIFO and its internal buffer 57 * kfifo_alloc - allocates a new FIFO internal buffer
62 * @size: the size of the internal buffer to be allocated. 58 * @fifo: the fifo to assign then new buffer
59 * @size: the size of the buffer to be allocated, this have to be a power of 2.
63 * @gfp_mask: get_free_pages mask, passed to kmalloc() 60 * @gfp_mask: get_free_pages mask, passed to kmalloc()
64 * @lock: the lock to be used to protect the fifo buffer 61 *
62 * This function dynamically allocates a new fifo internal buffer
65 * 63 *
66 * The size will be rounded-up to a power of 2. 64 * The size will be rounded-up to a power of 2.
65 * The buffer will be release with kfifo_free().
66 * Return 0 if no error, otherwise the an error code
67 */ 67 */
68struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock) 68int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
69{ 69{
70 unsigned char *buffer; 70 unsigned char *buffer;
71 struct kfifo *ret;
72 71
73 /* 72 /*
74 * round up to the next power of 2, since our 'let the indices 73 * round up to the next power of 2, since our 'let the indices
@@ -80,48 +79,93 @@ struct kfifo *kfifo_alloc(unsigned int size, gfp_t gfp_mask, spinlock_t *lock)
80 } 79 }
81 80
82 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
83 if (!buffer) 82 if (!buffer) {
84 return ERR_PTR(-ENOMEM); 83 _kfifo_init(fifo, NULL, 0);
85 84 return -ENOMEM;
86 ret = kfifo_init(buffer, size, gfp_mask, lock); 85 }
87 86
88 if (IS_ERR(ret)) 87 _kfifo_init(fifo, buffer, size);
89 kfree(buffer);
90 88
91 return ret; 89 return 0;
92} 90}
93EXPORT_SYMBOL(kfifo_alloc); 91EXPORT_SYMBOL(kfifo_alloc);
94 92
95/** 93/**
96 * kfifo_free - frees the FIFO 94 * kfifo_free - frees the FIFO internal buffer
97 * @fifo: the fifo to be freed. 95 * @fifo: the fifo to be freed.
98 */ 96 */
99void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
100{ 98{
101 kfree(fifo->buffer); 99 kfree(fifo->buffer);
102 kfree(fifo); 100 _kfifo_init(fifo, NULL, 0);
103} 101}
104EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
105 103
106/** 104/**
107 * __kfifo_put - puts some data into the FIFO, no locking version 105 * kfifo_skip - skip output data
108 * @fifo: the fifo to be used. 106 * @fifo: the fifo to be used.
109 * @buffer: the data to be added. 107 * @len: number of bytes to skip
110 * @len: the length of the data to be added.
111 *
112 * This function copies at most @len bytes from the @buffer into
113 * the FIFO depending on the free space, and returns the number of
114 * bytes copied.
115 *
116 * Note that with only one concurrent reader and one concurrent
117 * writer, you don't need extra locking to use these functions.
118 */ 108 */
119unsigned int __kfifo_put(struct kfifo *fifo, 109void kfifo_skip(struct kfifo *fifo, unsigned int len)
120 const unsigned char *buffer, unsigned int len) 110{
111 if (len < kfifo_len(fifo)) {
112 __kfifo_add_out(fifo, len);
113 return;
114 }
115 kfifo_reset_out(fifo);
116}
117EXPORT_SYMBOL(kfifo_skip);
118
119static inline void __kfifo_in_data(struct kfifo *fifo,
120 const void *from, unsigned int len, unsigned int off)
121{ 121{
122 unsigned int l; 122 unsigned int l;
123 123
124 len = min(len, fifo->size - fifo->in + fifo->out); 124 /*
125 * Ensure that we sample the fifo->out index -before- we
126 * start putting bytes into the kfifo.
127 */
128
129 smp_mb();
130
131 off = __kfifo_off(fifo, fifo->in + off);
132
133 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + off, from, l);
136
137 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, from + l, len - l);
139}
140
141static inline void __kfifo_out_data(struct kfifo *fifo,
142 void *to, unsigned int len, unsigned int off)
143{
144 unsigned int l;
145
146 /*
147 * Ensure that we sample the fifo->in index -before- we
148 * start removing bytes from the kfifo.
149 */
150
151 smp_rmb();
152
153 off = __kfifo_off(fifo, fifo->out + off);
154
155 /* first get the data from fifo->out until the end of the buffer */
156 l = min(len, fifo->size - off);
157 memcpy(to, fifo->buffer + off, l);
158
159 /* then get the rest (if any) from the beginning of the buffer */
160 memcpy(to + l, fifo->buffer, len - l);
161}
162
163static inline int __kfifo_from_user_data(struct kfifo *fifo,
164 const void __user *from, unsigned int len, unsigned int off,
165 unsigned *lenout)
166{
167 unsigned int l;
168 int ret;
125 169
126 /* 170 /*
127 * Ensure that we sample the fifo->out index -before- we 171 * Ensure that we sample the fifo->out index -before- we
@@ -130,68 +174,272 @@ unsigned int __kfifo_put(struct kfifo *fifo,
130 174
131 smp_mb(); 175 smp_mb();
132 176
177 off = __kfifo_off(fifo, fifo->in + off);
178
133 /* first put the data starting from fifo->in to buffer end */ 179 /* first put the data starting from fifo->in to buffer end */
134 l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); 180 l = min(len, fifo->size - off);
135 memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); 181 ret = copy_from_user(fifo->buffer + off, from, l);
182 if (unlikely(ret)) {
183 *lenout = ret;
184 return -EFAULT;
185 }
186 *lenout = l;
136 187
137 /* then put the rest (if any) at the beginning of the buffer */ 188 /* then put the rest (if any) at the beginning of the buffer */
138 memcpy(fifo->buffer, buffer + l, len - l); 189 ret = copy_from_user(fifo->buffer, from + l, len - l);
190 *lenout += ret ? ret : len - l;
191 return ret ? -EFAULT : 0;
192}
193
194static inline int __kfifo_to_user_data(struct kfifo *fifo,
195 void __user *to, unsigned int len, unsigned int off, unsigned *lenout)
196{
197 unsigned int l;
198 int ret;
139 199
140 /* 200 /*
141 * Ensure that we add the bytes to the kfifo -before- 201 * Ensure that we sample the fifo->in index -before- we
142 * we update the fifo->in index. 202 * start removing bytes from the kfifo.
143 */ 203 */
144 204
145 smp_wmb(); 205 smp_rmb();
206
207 off = __kfifo_off(fifo, fifo->out + off);
208
209 /* first get the data from fifo->out until the end of the buffer */
210 l = min(len, fifo->size - off);
211 ret = copy_to_user(to, fifo->buffer + off, l);
212 *lenout = l;
213 if (unlikely(ret)) {
214 *lenout -= ret;
215 return -EFAULT;
216 }
217
218 /* then get the rest (if any) from the beginning of the buffer */
219 len -= l;
220 ret = copy_to_user(to + l, fifo->buffer, len);
221 if (unlikely(ret)) {
222 *lenout += len - ret;
223 return -EFAULT;
224 }
225 *lenout += len;
226 return 0;
227}
228
229unsigned int __kfifo_in_n(struct kfifo *fifo,
230 const void *from, unsigned int len, unsigned int recsize)
231{
232 if (kfifo_avail(fifo) < len + recsize)
233 return len + 1;
234
235 __kfifo_in_data(fifo, from, len, recsize);
236 return 0;
237}
238EXPORT_SYMBOL(__kfifo_in_n);
146 239
147 fifo->in += len; 240/**
241 * kfifo_in - puts some data into the FIFO
242 * @fifo: the fifo to be used.
243 * @from: the data to be added.
244 * @len: the length of the data to be added.
245 *
246 * This function copies at most @len bytes from the @from buffer into
247 * the FIFO depending on the free space, and returns the number of
248 * bytes copied.
249 *
250 * Note that with only one concurrent reader and one concurrent
251 * writer, you don't need extra locking to use these functions.
252 */
253unsigned int kfifo_in(struct kfifo *fifo, const void *from,
254 unsigned int len)
255{
256 len = min(kfifo_avail(fifo), len);
148 257
258 __kfifo_in_data(fifo, from, len, 0);
259 __kfifo_add_in(fifo, len);
149 return len; 260 return len;
150} 261}
151EXPORT_SYMBOL(__kfifo_put); 262EXPORT_SYMBOL(kfifo_in);
263
264unsigned int __kfifo_in_generic(struct kfifo *fifo,
265 const void *from, unsigned int len, unsigned int recsize)
266{
267 return __kfifo_in_rec(fifo, from, len, recsize);
268}
269EXPORT_SYMBOL(__kfifo_in_generic);
270
271unsigned int __kfifo_out_n(struct kfifo *fifo,
272 void *to, unsigned int len, unsigned int recsize)
273{
274 if (kfifo_len(fifo) < len + recsize)
275 return len;
276
277 __kfifo_out_data(fifo, to, len, recsize);
278 __kfifo_add_out(fifo, len + recsize);
279 return 0;
280}
281EXPORT_SYMBOL(__kfifo_out_n);
152 282
153/** 283/**
154 * __kfifo_get - gets some data from the FIFO, no locking version 284 * kfifo_out - gets some data from the FIFO
155 * @fifo: the fifo to be used. 285 * @fifo: the fifo to be used.
156 * @buffer: where the data must be copied. 286 * @to: where the data must be copied.
157 * @len: the size of the destination buffer. 287 * @len: the size of the destination buffer.
158 * 288 *
159 * This function copies at most @len bytes from the FIFO into the 289 * This function copies at most @len bytes from the FIFO into the
160 * @buffer and returns the number of copied bytes. 290 * @to buffer and returns the number of copied bytes.
161 * 291 *
162 * Note that with only one concurrent reader and one concurrent 292 * Note that with only one concurrent reader and one concurrent
163 * writer, you don't need extra locking to use these functions. 293 * writer, you don't need extra locking to use these functions.
164 */ 294 */
165unsigned int __kfifo_get(struct kfifo *fifo, 295unsigned int kfifo_out(struct kfifo *fifo, void *to, unsigned int len)
166 unsigned char *buffer, unsigned int len)
167{ 296{
168 unsigned int l; 297 len = min(kfifo_len(fifo), len);
169 298
170 len = min(len, fifo->in - fifo->out); 299 __kfifo_out_data(fifo, to, len, 0);
300 __kfifo_add_out(fifo, len);
171 301
172 /* 302 return len;
173 * Ensure that we sample the fifo->in index -before- we 303}
174 * start removing bytes from the kfifo. 304EXPORT_SYMBOL(kfifo_out);
175 */
176 305
177 smp_rmb(); 306/**
307 * kfifo_out_peek - copy some data from the FIFO, but do not remove it
308 * @fifo: the fifo to be used.
309 * @to: where the data must be copied.
310 * @len: the size of the destination buffer.
311 * @offset: offset into the fifo
312 *
313 * This function copies at most @len bytes at @offset from the FIFO
314 * into the @to buffer and returns the number of copied bytes.
315 * The data is not removed from the FIFO.
316 */
317unsigned int kfifo_out_peek(struct kfifo *fifo, void *to, unsigned int len,
318 unsigned offset)
319{
320 len = min(kfifo_len(fifo), len + offset);
178 321
179 /* first get the data from fifo->out until the end of the buffer */ 322 __kfifo_out_data(fifo, to, len, offset);
180 l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); 323 return len;
181 memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); 324}
325EXPORT_SYMBOL(kfifo_out_peek);
182 326
183 /* then get the rest (if any) from the beginning of the buffer */ 327unsigned int __kfifo_out_generic(struct kfifo *fifo,
184 memcpy(buffer + l, fifo->buffer, len - l); 328 void *to, unsigned int len, unsigned int recsize,
329 unsigned int *total)
330{
331 return __kfifo_out_rec(fifo, to, len, recsize, total);
332}
333EXPORT_SYMBOL(__kfifo_out_generic);
185 334
186 /* 335unsigned int __kfifo_from_user_n(struct kfifo *fifo,
187 * Ensure that we remove the bytes from the kfifo -before- 336 const void __user *from, unsigned int len, unsigned int recsize)
188 * we update the fifo->out index. 337{
189 */ 338 unsigned total;
190 339
191 smp_mb(); 340 if (kfifo_avail(fifo) < len + recsize)
341 return len + 1;
192 342
193 fifo->out += len; 343 __kfifo_from_user_data(fifo, from, len, recsize, &total);
344 return total;
345}
346EXPORT_SYMBOL(__kfifo_from_user_n);
194 347
195 return len; 348/**
349 * kfifo_from_user - puts some data from user space into the FIFO
350 * @fifo: the fifo to be used.
351 * @from: pointer to the data to be added.
352 * @len: the length of the data to be added.
353 * @total: the actual returned data length.
354 *
355 * This function copies at most @len bytes from the @from into the
356 * FIFO depending and returns -EFAULT/0.
357 *
358 * Note that with only one concurrent reader and one concurrent
359 * writer, you don't need extra locking to use these functions.
360 */
361int kfifo_from_user(struct kfifo *fifo,
362 const void __user *from, unsigned int len, unsigned *total)
363{
364 int ret;
365 len = min(kfifo_avail(fifo), len);
366 ret = __kfifo_from_user_data(fifo, from, len, 0, total);
367 if (ret)
368 return ret;
369 __kfifo_add_in(fifo, len);
370 return 0;
196} 371}
197EXPORT_SYMBOL(__kfifo_get); 372EXPORT_SYMBOL(kfifo_from_user);
373
374unsigned int __kfifo_from_user_generic(struct kfifo *fifo,
375 const void __user *from, unsigned int len, unsigned int recsize)
376{
377 return __kfifo_from_user_rec(fifo, from, len, recsize);
378}
379EXPORT_SYMBOL(__kfifo_from_user_generic);
380
381unsigned int __kfifo_to_user_n(struct kfifo *fifo,
382 void __user *to, unsigned int len, unsigned int reclen,
383 unsigned int recsize)
384{
385 unsigned int ret, total;
386
387 if (kfifo_len(fifo) < reclen + recsize)
388 return len;
389
390 ret = __kfifo_to_user_data(fifo, to, reclen, recsize, &total);
391
392 if (likely(ret == 0))
393 __kfifo_add_out(fifo, reclen + recsize);
394
395 return total;
396}
397EXPORT_SYMBOL(__kfifo_to_user_n);
398
399/**
400 * kfifo_to_user - gets data from the FIFO and write it to user space
401 * @fifo: the fifo to be used.
402 * @to: where the data must be copied.
403 * @len: the size of the destination buffer.
404 * @lenout: pointer to output variable with copied data
405 *
406 * This function copies at most @len bytes from the FIFO into the
407 * @to buffer and 0 or -EFAULT.
408 *
409 * Note that with only one concurrent reader and one concurrent
410 * writer, you don't need extra locking to use these functions.
411 */
412int kfifo_to_user(struct kfifo *fifo,
413 void __user *to, unsigned int len, unsigned *lenout)
414{
415 int ret;
416 len = min(kfifo_len(fifo), len);
417 ret = __kfifo_to_user_data(fifo, to, len, 0, lenout);
418 __kfifo_add_out(fifo, *lenout);
419 return ret;
420}
421EXPORT_SYMBOL(kfifo_to_user);
422
423unsigned int __kfifo_to_user_generic(struct kfifo *fifo,
424 void __user *to, unsigned int len, unsigned int recsize,
425 unsigned int *total)
426{
427 return __kfifo_to_user_rec(fifo, to, len, recsize, total);
428}
429EXPORT_SYMBOL(__kfifo_to_user_generic);
430
431unsigned int __kfifo_peek_generic(struct kfifo *fifo, unsigned int recsize)
432{
433 if (recsize == 0)
434 return kfifo_avail(fifo);
435
436 return __kfifo_peek_n(fifo, recsize);
437}
438EXPORT_SYMBOL(__kfifo_peek_generic);
439
440void __kfifo_skip_generic(struct kfifo *fifo, unsigned int recsize)
441{
442 __kfifo_skip_rec(fifo, recsize);
443}
444EXPORT_SYMBOL(__kfifo_skip_generic);
445
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 7d7014634022..761fdd2b3034 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -129,6 +129,7 @@ struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread; 129struct task_struct *kgdb_contthread;
130 130
131int kgdb_single_step; 131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
132 133
133/* Our I/O buffers. */ 134/* Our I/O buffers. */
134static char remcom_in_buffer[BUFMAX]; 135static char remcom_in_buffer[BUFMAX];
@@ -541,12 +542,17 @@ static struct task_struct *getthread(struct pt_regs *regs, int tid)
541 */ 542 */
542 if (tid == 0 || tid == -1) 543 if (tid == 0 || tid == -1)
543 tid = -atomic_read(&kgdb_active) - 2; 544 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) { 545 if (tid < -1 && tid > -NR_CPUS - 2) {
545 if (kgdb_info[-tid - 2].task) 546 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task; 547 return kgdb_info[-tid - 2].task;
547 else 548 else
548 return idle_task(-tid - 2); 549 return idle_task(-tid - 2);
549 } 550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
550 556
551 /* 557 /*
552 * find_task_by_pid_ns() does not take the tasklist lock anymore 558 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -577,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs)
577 smp_wmb(); 583 smp_wmb();
578 atomic_set(&cpu_in_kgdb[cpu], 1); 584 atomic_set(&cpu_in_kgdb[cpu], 1);
579 585
586 /* Disable any cpu specific hw breakpoints */
587 kgdb_disable_hw_debug(regs);
588
580 /* Wait till primary CPU is done with debugging */ 589 /* Wait till primary CPU is done with debugging */
581 while (atomic_read(&passive_cpu_wait[cpu])) 590 while (atomic_read(&passive_cpu_wait[cpu]))
582 cpu_relax(); 591 cpu_relax();
@@ -590,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs)
590 599
591 /* Signal the primary CPU that we are done: */ 600 /* Signal the primary CPU that we are done: */
592 atomic_set(&cpu_in_kgdb[cpu], 0); 601 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog(); 602 touch_softlockup_watchdog_sync();
594 clocksource_touch_watchdog(); 603 clocksource_touch_watchdog();
595 local_irq_restore(flags); 604 local_irq_restore(flags);
596} 605}
@@ -619,7 +628,8 @@ static void kgdb_flush_swbreak_addr(unsigned long addr)
619static int kgdb_activate_sw_breakpoints(void) 628static int kgdb_activate_sw_breakpoints(void)
620{ 629{
621 unsigned long addr; 630 unsigned long addr;
622 int error = 0; 631 int error;
632 int ret = 0;
623 int i; 633 int i;
624 634
625 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 635 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -629,13 +639,16 @@ static int kgdb_activate_sw_breakpoints(void)
629 addr = kgdb_break[i].bpt_addr; 639 addr = kgdb_break[i].bpt_addr;
630 error = kgdb_arch_set_breakpoint(addr, 640 error = kgdb_arch_set_breakpoint(addr,
631 kgdb_break[i].saved_instr); 641 kgdb_break[i].saved_instr);
632 if (error) 642 if (error) {
633 return error; 643 ret = error;
644 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
645 continue;
646 }
634 647
635 kgdb_flush_swbreak_addr(addr); 648 kgdb_flush_swbreak_addr(addr);
636 kgdb_break[i].state = BP_ACTIVE; 649 kgdb_break[i].state = BP_ACTIVE;
637 } 650 }
638 return 0; 651 return ret;
639} 652}
640 653
641static int kgdb_set_sw_break(unsigned long addr) 654static int kgdb_set_sw_break(unsigned long addr)
@@ -682,7 +695,8 @@ static int kgdb_set_sw_break(unsigned long addr)
682static int kgdb_deactivate_sw_breakpoints(void) 695static int kgdb_deactivate_sw_breakpoints(void)
683{ 696{
684 unsigned long addr; 697 unsigned long addr;
685 int error = 0; 698 int error;
699 int ret = 0;
686 int i; 700 int i;
687 701
688 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { 702 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
@@ -691,13 +705,15 @@ static int kgdb_deactivate_sw_breakpoints(void)
691 addr = kgdb_break[i].bpt_addr; 705 addr = kgdb_break[i].bpt_addr;
692 error = kgdb_arch_remove_breakpoint(addr, 706 error = kgdb_arch_remove_breakpoint(addr,
693 kgdb_break[i].saved_instr); 707 kgdb_break[i].saved_instr);
694 if (error) 708 if (error) {
695 return error; 709 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
710 ret = error;
711 }
696 712
697 kgdb_flush_swbreak_addr(addr); 713 kgdb_flush_swbreak_addr(addr);
698 kgdb_break[i].state = BP_SET; 714 kgdb_break[i].state = BP_SET;
699 } 715 }
700 return 0; 716 return ret;
701} 717}
702 718
703static int kgdb_remove_sw_break(unsigned long addr) 719static int kgdb_remove_sw_break(unsigned long addr)
@@ -1204,8 +1220,10 @@ static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1204 return 1; 1220 return 1;
1205 1221
1206 } else { 1222 } else {
1207 error_packet(remcom_out_buffer, -EINVAL); 1223 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1208 return 0; 1224 " and 15 (pass and disconnect)\n"
1225 "Executing a continue without signal passing\n", 0);
1226 remcom_in_buffer[0] = 'c';
1209 } 1227 }
1210 1228
1211 /* Indicate fall through */ 1229 /* Indicate fall through */
@@ -1395,6 +1413,7 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1395 struct kgdb_state kgdb_var; 1413 struct kgdb_state kgdb_var;
1396 struct kgdb_state *ks = &kgdb_var; 1414 struct kgdb_state *ks = &kgdb_var;
1397 unsigned long flags; 1415 unsigned long flags;
1416 int sstep_tries = 100;
1398 int error = 0; 1417 int error = 0;
1399 int i, cpu; 1418 int i, cpu;
1400 1419
@@ -1425,15 +1444,16 @@ acquirelock:
1425 cpu_relax(); 1444 cpu_relax();
1426 1445
1427 /* 1446 /*
1428 * Do not start the debugger connection on this CPU if the last 1447 * For single stepping, try to only enter on the processor
1429 * instance of the exception handler wanted to come into the 1448 * that was single stepping. To gaurd against a deadlock, the
1430 * debugger on a different CPU via a single step 1449 * kernel will only try for the value of sstep_tries before
1450 * giving up and continuing on.
1431 */ 1451 */
1432 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && 1452 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1453 (kgdb_info[cpu].task &&
1434 1454 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1435 atomic_set(&kgdb_active, -1); 1455 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog(); 1456 touch_softlockup_watchdog_sync();
1437 clocksource_touch_watchdog(); 1457 clocksource_touch_watchdog();
1438 local_irq_restore(flags); 1458 local_irq_restore(flags);
1439 1459
@@ -1524,9 +1544,16 @@ acquirelock:
1524 } 1544 }
1525 1545
1526kgdb_restore: 1546kgdb_restore:
1547 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1548 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1549 if (kgdb_info[sstep_cpu].task)
1550 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1551 else
1552 kgdb_sstep_pid = 0;
1553 }
1527 /* Free kgdb_active */ 1554 /* Free kgdb_active */
1528 atomic_set(&kgdb_active, -1); 1555 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog(); 1556 touch_softlockup_watchdog_sync();
1530 clocksource_touch_watchdog(); 1557 clocksource_touch_watchdog();
1531 local_irq_restore(flags); 1558 local_irq_restore(flags);
1532 1559
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 25b103190364..bf0e231d9702 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -520,13 +520,15 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
520 return -ENOMEM; 520 return -ENOMEM;
521 521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp); 522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) 523 if (ret < 0) {
524 goto out; 524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
525 527
526 return call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); 528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
527 531
528 out:
529 call_usermodehelper_freeinfo(sub_info);
530 return ret; 532 return ret;
531} 533}
532EXPORT_SYMBOL(call_usermodehelper_pipe); 534EXPORT_SYMBOL(call_usermodehelper_pipe);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e5342a344c43..ccec774c716d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/kdebug.h> 45#include <linux/kdebug.h>
46#include <linux/memory.h> 46#include <linux/memory.h>
47#include <linux/ftrace.h>
47 48
48#include <asm-generic/sections.h> 49#include <asm-generic/sections.h>
49#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
@@ -93,6 +94,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
93 {"native_get_debugreg",}, 94 {"native_get_debugreg",},
94 {"irq_entries_start",}, 95 {"irq_entries_start",},
95 {"common_interrupt",}, 96 {"common_interrupt",},
97 {"mcount",}, /* mcount can be called from everywhere */
96 {NULL} /* Terminator */ 98 {NULL} /* Terminator */
97}; 99};
98 100
@@ -124,30 +126,6 @@ static LIST_HEAD(kprobe_insn_pages);
124static int kprobe_garbage_slots; 126static int kprobe_garbage_slots;
125static int collect_garbage_slots(void); 127static int collect_garbage_slots(void);
126 128
127static int __kprobes check_safety(void)
128{
129 int ret = 0;
130#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
131 ret = freeze_processes();
132 if (ret == 0) {
133 struct task_struct *p, *q;
134 do_each_thread(p, q) {
135 if (p != current && p->state == TASK_RUNNING &&
136 p->pid != 0) {
137 printk("Check failed: %s is running\n",p->comm);
138 ret = -1;
139 goto loop_end;
140 }
141 } while_each_thread(p, q);
142 }
143loop_end:
144 thaw_processes();
145#else
146 synchronize_sched();
147#endif
148 return ret;
149}
150
151/** 129/**
152 * __get_insn_slot() - Find a slot on an executable page for an instruction. 130 * __get_insn_slot() - Find a slot on an executable page for an instruction.
153 * We allocate an executable page if there's no room on existing ones. 131 * We allocate an executable page if there's no room on existing ones.
@@ -235,9 +213,8 @@ static int __kprobes collect_garbage_slots(void)
235{ 213{
236 struct kprobe_insn_page *kip, *next; 214 struct kprobe_insn_page *kip, *next;
237 215
238 /* Ensure no-one is preepmted on the garbages */ 216 /* Ensure no-one is interrupted on the garbages */
239 if (check_safety()) 217 synchronize_sched();
240 return -EAGAIN;
241 218
242 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 219 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
243 int i; 220 int i;
@@ -728,7 +705,8 @@ int __kprobes register_kprobe(struct kprobe *p)
728 705
729 preempt_disable(); 706 preempt_disable();
730 if (!kernel_text_address((unsigned long) p->addr) || 707 if (!kernel_text_address((unsigned long) p->addr) ||
731 in_kprobes_functions((unsigned long) p->addr)) { 708 in_kprobes_functions((unsigned long) p->addr) ||
709 ftrace_text_reserved(p->addr, p->addr)) {
732 preempt_enable(); 710 preempt_enable();
733 return -EINVAL; 711 return -EINVAL;
734 } 712 }
@@ -1035,7 +1013,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1035 /* Pre-allocate memory for max kretprobe instances */ 1013 /* Pre-allocate memory for max kretprobe instances */
1036 if (rp->maxactive <= 0) { 1014 if (rp->maxactive <= 0) {
1037#ifdef CONFIG_PREEMPT 1015#ifdef CONFIG_PREEMPT
1038 rp->maxactive = max(10, 2 * num_possible_cpus()); 1016 rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
1039#else 1017#else
1040 rp->maxactive = num_possible_cpus(); 1018 rp->maxactive = num_possible_cpus();
1041#endif 1019#endif
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 528dd78e7e7e..6b1ccc3f0205 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -100,6 +100,26 @@ static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
100} 100}
101KERNEL_ATTR_RO(kexec_crash_loaded); 101KERNEL_ATTR_RO(kexec_crash_loaded);
102 102
103static ssize_t kexec_crash_size_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf)
105{
106 return sprintf(buf, "%zu\n", crash_get_memory_size());
107}
108static ssize_t kexec_crash_size_store(struct kobject *kobj,
109 struct kobj_attribute *attr,
110 const char *buf, size_t count)
111{
112 unsigned long cnt;
113 int ret;
114
115 if (strict_strtoul(buf, 0, &cnt))
116 return -EINVAL;
117
118 ret = crash_shrink_memory(cnt);
119 return ret < 0 ? ret : count;
120}
121KERNEL_ATTR_RW(kexec_crash_size);
122
103static ssize_t vmcoreinfo_show(struct kobject *kobj, 123static ssize_t vmcoreinfo_show(struct kobject *kobj,
104 struct kobj_attribute *attr, char *buf) 124 struct kobj_attribute *attr, char *buf)
105{ 125{
@@ -147,6 +167,7 @@ static struct attribute * kernel_attrs[] = {
147#ifdef CONFIG_KEXEC 167#ifdef CONFIG_KEXEC
148 &kexec_loaded_attr.attr, 168 &kexec_loaded_attr.attr,
149 &kexec_crash_loaded_attr.attr, 169 &kexec_crash_loaded_attr.attr,
170 &kexec_crash_size_attr.attr,
150 &vmcoreinfo_attr.attr, 171 &vmcoreinfo_attr.attr,
151#endif 172#endif
152 NULL 173 NULL
@@ -176,16 +197,8 @@ static int __init ksysfs_init(void)
176 goto group_exit; 197 goto group_exit;
177 } 198 }
178 199
179 /* create the /sys/kernel/uids/ directory */
180 error = uids_sysfs_init();
181 if (error)
182 goto notes_exit;
183
184 return 0; 200 return 0;
185 201
186notes_exit:
187 if (notes_size > 0)
188 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
189group_exit: 202group_exit:
190 sysfs_remove_group(kernel_kobj, &kernel_attr_group); 203 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
191kset_exit: 204kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ab7ae57773e1..82ed0ea15194 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
101 * 101 *
102 * Description: This helper function creates and names a kernel 102 * Description: This helper function creates and names a kernel
103 * thread. The thread will be stopped: use wake_up_process() to start 103 * thread. The thread will be stopped: use wake_up_process() to start
104 * it. See also kthread_run(), kthread_create_on_cpu(). 104 * it. See also kthread_run().
105 * 105 *
106 * When woken, the thread will run @threadfn() with @data as its 106 * When woken, the thread will run @threadfn() with @data as its
107 * argument. @threadfn() can either call do_exit() directly if it is a 107 * argument. @threadfn() can either call do_exit() directly if it is a
@@ -150,6 +150,29 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
150EXPORT_SYMBOL(kthread_create); 150EXPORT_SYMBOL(kthread_create);
151 151
152/** 152/**
153 * kthread_bind - bind a just-created kthread to a cpu.
154 * @p: thread created by kthread_create().
155 * @cpu: cpu (might not be online, must be possible) for @k to run on.
156 *
157 * Description: This function is equivalent to set_cpus_allowed(),
158 * except that @cpu doesn't need to be online, and the thread must be
159 * stopped (i.e., just returned from kthread_create()).
160 */
161void kthread_bind(struct task_struct *p, unsigned int cpu)
162{
163 /* Must have done schedule() in kthread() before we set_task_cpu */
164 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
165 WARN_ON(1);
166 return;
167 }
168
169 p->cpus_allowed = cpumask_of_cpu(cpu);
170 p->rt.nr_cpus_allowed = 1;
171 p->flags |= PF_THREAD_BOUND;
172}
173EXPORT_SYMBOL(kthread_bind);
174
175/**
153 * kthread_stop - stop a thread created by kthread_create(). 176 * kthread_stop - stop a thread created by kthread_create().
154 * @k: thread created by kthread_create(). 177 * @k: thread created by kthread_create().
155 * 178 *
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f5dcd36d3151..0c30d0455de1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -73,11 +73,11 @@ module_param(lock_stat, int, 0644);
73 * to use a raw spinlock - we really dont want the spinlock 73 * to use a raw spinlock - we really dont want the spinlock
74 * code to recurse back into the lockdep code... 74 * code to recurse back into the lockdep code...
75 */ 75 */
76static raw_spinlock_t lockdep_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 76static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
77 77
78static int graph_lock(void) 78static int graph_lock(void)
79{ 79{
80 __raw_spin_lock(&lockdep_lock); 80 arch_spin_lock(&lockdep_lock);
81 /* 81 /*
82 * Make sure that if another CPU detected a bug while 82 * Make sure that if another CPU detected a bug while
83 * walking the graph we dont change it (while the other 83 * walking the graph we dont change it (while the other
@@ -85,7 +85,7 @@ static int graph_lock(void)
85 * dropped already) 85 * dropped already)
86 */ 86 */
87 if (!debug_locks) { 87 if (!debug_locks) {
88 __raw_spin_unlock(&lockdep_lock); 88 arch_spin_unlock(&lockdep_lock);
89 return 0; 89 return 0;
90 } 90 }
91 /* prevent any recursions within lockdep from causing deadlocks */ 91 /* prevent any recursions within lockdep from causing deadlocks */
@@ -95,11 +95,11 @@ static int graph_lock(void)
95 95
96static inline int graph_unlock(void) 96static inline int graph_unlock(void)
97{ 97{
98 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) 98 if (debug_locks && !arch_spin_is_locked(&lockdep_lock))
99 return DEBUG_LOCKS_WARN_ON(1); 99 return DEBUG_LOCKS_WARN_ON(1);
100 100
101 current->lockdep_recursion--; 101 current->lockdep_recursion--;
102 __raw_spin_unlock(&lockdep_lock); 102 arch_spin_unlock(&lockdep_lock);
103 return 0; 103 return 0;
104} 104}
105 105
@@ -111,7 +111,7 @@ static inline int debug_locks_off_graph_unlock(void)
111{ 111{
112 int ret = debug_locks_off(); 112 int ret = debug_locks_off();
113 113
114 __raw_spin_unlock(&lockdep_lock); 114 arch_spin_unlock(&lockdep_lock);
115 115
116 return ret; 116 return ret;
117} 117}
@@ -140,7 +140,8 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
140} 140}
141 141
142#ifdef CONFIG_LOCK_STAT 142#ifdef CONFIG_LOCK_STAT
143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 143static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS],
144 cpu_lock_stats);
144 145
145static inline u64 lockstat_clock(void) 146static inline u64 lockstat_clock(void)
146{ 147{
@@ -168,7 +169,7 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
168 if (time > lt->max) 169 if (time > lt->max)
169 lt->max = time; 170 lt->max = time;
170 171
171 if (time < lt->min || !lt->min) 172 if (time < lt->min || !lt->nr)
172 lt->min = time; 173 lt->min = time;
173 174
174 lt->total += time; 175 lt->total += time;
@@ -177,8 +178,15 @@ static void lock_time_inc(struct lock_time *lt, u64 time)
177 178
178static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) 179static inline void lock_time_add(struct lock_time *src, struct lock_time *dst)
179{ 180{
180 dst->min += src->min; 181 if (!src->nr)
181 dst->max += src->max; 182 return;
183
184 if (src->max > dst->max)
185 dst->max = src->max;
186
187 if (src->min < dst->min || !dst->nr)
188 dst->min = src->min;
189
182 dst->total += src->total; 190 dst->total += src->total;
183 dst->nr += src->nr; 191 dst->nr += src->nr;
184} 192}
@@ -191,7 +199,7 @@ struct lock_class_stats lock_stats(struct lock_class *class)
191 memset(&stats, 0, sizeof(struct lock_class_stats)); 199 memset(&stats, 0, sizeof(struct lock_class_stats));
192 for_each_possible_cpu(cpu) { 200 for_each_possible_cpu(cpu) {
193 struct lock_class_stats *pcs = 201 struct lock_class_stats *pcs =
194 &per_cpu(lock_stats, cpu)[class - lock_classes]; 202 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
195 203
196 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) 204 for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++)
197 stats.contention_point[i] += pcs->contention_point[i]; 205 stats.contention_point[i] += pcs->contention_point[i];
@@ -218,7 +226,7 @@ void clear_lock_stats(struct lock_class *class)
218 226
219 for_each_possible_cpu(cpu) { 227 for_each_possible_cpu(cpu) {
220 struct lock_class_stats *cpu_stats = 228 struct lock_class_stats *cpu_stats =
221 &per_cpu(lock_stats, cpu)[class - lock_classes]; 229 &per_cpu(cpu_lock_stats, cpu)[class - lock_classes];
222 230
223 memset(cpu_stats, 0, sizeof(struct lock_class_stats)); 231 memset(cpu_stats, 0, sizeof(struct lock_class_stats));
224 } 232 }
@@ -228,12 +236,12 @@ void clear_lock_stats(struct lock_class *class)
228 236
229static struct lock_class_stats *get_lock_stats(struct lock_class *class) 237static struct lock_class_stats *get_lock_stats(struct lock_class *class)
230{ 238{
231 return &get_cpu_var(lock_stats)[class - lock_classes]; 239 return &get_cpu_var(cpu_lock_stats)[class - lock_classes];
232} 240}
233 241
234static void put_lock_stats(struct lock_class_stats *stats) 242static void put_lock_stats(struct lock_class_stats *stats)
235{ 243{
236 put_cpu_var(lock_stats); 244 put_cpu_var(cpu_lock_stats);
237} 245}
238 246
239static void lock_release_holdtime(struct held_lock *hlock) 247static void lock_release_holdtime(struct held_lock *hlock)
@@ -379,7 +387,8 @@ static int save_trace(struct stack_trace *trace)
379 * complete trace that maxes out the entries provided will be reported 387 * complete trace that maxes out the entries provided will be reported
380 * as incomplete, friggin useless </rant> 388 * as incomplete, friggin useless </rant>
381 */ 389 */
382 if (trace->entries[trace->nr_entries-1] == ULONG_MAX) 390 if (trace->nr_entries != 0 &&
391 trace->entries[trace->nr_entries-1] == ULONG_MAX)
383 trace->nr_entries--; 392 trace->nr_entries--;
384 393
385 trace->max_entries = trace->nr_entries; 394 trace->max_entries = trace->nr_entries;
@@ -1161,9 +1170,9 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
1161 this.class = class; 1170 this.class = class;
1162 1171
1163 local_irq_save(flags); 1172 local_irq_save(flags);
1164 __raw_spin_lock(&lockdep_lock); 1173 arch_spin_lock(&lockdep_lock);
1165 ret = __lockdep_count_forward_deps(&this); 1174 ret = __lockdep_count_forward_deps(&this);
1166 __raw_spin_unlock(&lockdep_lock); 1175 arch_spin_unlock(&lockdep_lock);
1167 local_irq_restore(flags); 1176 local_irq_restore(flags);
1168 1177
1169 return ret; 1178 return ret;
@@ -1188,9 +1197,9 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
1188 this.class = class; 1197 this.class = class;
1189 1198
1190 local_irq_save(flags); 1199 local_irq_save(flags);
1191 __raw_spin_lock(&lockdep_lock); 1200 arch_spin_lock(&lockdep_lock);
1192 ret = __lockdep_count_backward_deps(&this); 1201 ret = __lockdep_count_backward_deps(&this);
1193 __raw_spin_unlock(&lockdep_lock); 1202 arch_spin_unlock(&lockdep_lock);
1194 local_irq_restore(flags); 1203 local_irq_restore(flags);
1195 1204
1196 return ret; 1205 return ret;
@@ -2138,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
2138 return ret; 2147 return ret;
2139 2148
2140 return print_irq_inversion_bug(curr, &root, target_entry, 2149 return print_irq_inversion_bug(curr, &root, target_entry,
2141 this, 1, irqclass); 2150 this, 0, irqclass);
2142} 2151}
2143 2152
2144void print_irqtrace_events(struct task_struct *curr) 2153void print_irqtrace_events(struct task_struct *curr)
@@ -3800,3 +3809,21 @@ void lockdep_sys_exit(void)
3800 lockdep_print_held_locks(curr); 3809 lockdep_print_held_locks(curr);
3801 } 3810 }
3802} 3811}
3812
3813void lockdep_rcu_dereference(const char *file, const int line)
3814{
3815 struct task_struct *curr = current;
3816
3817 if (!debug_locks_off())
3818 return;
3819 printk("\n===================================================\n");
3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3821 printk( "---------------------------------------------------\n");
3822 printk("%s:%d invoked rcu_dereference_check() without protection!\n",
3823 file, line);
3824 printk("\nother info that might help us debug this:\n\n");
3825 lockdep_print_held_locks(curr);
3826 printk("\nstack backtrace:\n");
3827 dump_stack();
3828}
3829EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
diff --git a/kernel/module.c b/kernel/module.c
index 5842a71cf052..f82386bd9ee9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -370,8 +370,6 @@ EXPORT_SYMBOL_GPL(find_module);
370 370
371#ifdef CONFIG_SMP 371#ifdef CONFIG_SMP
372 372
373#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
374
375static void *percpu_modalloc(unsigned long size, unsigned long align, 373static void *percpu_modalloc(unsigned long size, unsigned long align,
376 const char *name) 374 const char *name)
377{ 375{
@@ -395,154 +393,6 @@ static void percpu_modfree(void *freeme)
395 free_percpu(freeme); 393 free_percpu(freeme);
396} 394}
397 395
398#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
399
400/* Number of blocks used and allocated. */
401static unsigned int pcpu_num_used, pcpu_num_allocated;
402/* Size of each block. -ve means used. */
403static int *pcpu_size;
404
405static int split_block(unsigned int i, unsigned short size)
406{
407 /* Reallocation required? */
408 if (pcpu_num_used + 1 > pcpu_num_allocated) {
409 int *new;
410
411 new = krealloc(pcpu_size, sizeof(new[0])*pcpu_num_allocated*2,
412 GFP_KERNEL);
413 if (!new)
414 return 0;
415
416 pcpu_num_allocated *= 2;
417 pcpu_size = new;
418 }
419
420 /* Insert a new subblock */
421 memmove(&pcpu_size[i+1], &pcpu_size[i],
422 sizeof(pcpu_size[0]) * (pcpu_num_used - i));
423 pcpu_num_used++;
424
425 pcpu_size[i+1] -= size;
426 pcpu_size[i] = size;
427 return 1;
428}
429
430static inline unsigned int block_size(int val)
431{
432 if (val < 0)
433 return -val;
434 return val;
435}
436
437static void *percpu_modalloc(unsigned long size, unsigned long align,
438 const char *name)
439{
440 unsigned long extra;
441 unsigned int i;
442 void *ptr;
443 int cpu;
444
445 if (align > PAGE_SIZE) {
446 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
447 name, align, PAGE_SIZE);
448 align = PAGE_SIZE;
449 }
450
451 ptr = __per_cpu_start;
452 for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
453 /* Extra for alignment requirement. */
454 extra = ALIGN((unsigned long)ptr, align) - (unsigned long)ptr;
455 BUG_ON(i == 0 && extra != 0);
456
457 if (pcpu_size[i] < 0 || pcpu_size[i] < extra + size)
458 continue;
459
460 /* Transfer extra to previous block. */
461 if (pcpu_size[i-1] < 0)
462 pcpu_size[i-1] -= extra;
463 else
464 pcpu_size[i-1] += extra;
465 pcpu_size[i] -= extra;
466 ptr += extra;
467
468 /* Split block if warranted */
469 if (pcpu_size[i] - size > sizeof(unsigned long))
470 if (!split_block(i, size))
471 return NULL;
472
473 /* add the per-cpu scanning areas */
474 for_each_possible_cpu(cpu)
475 kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
476 GFP_KERNEL);
477
478 /* Mark allocated */
479 pcpu_size[i] = -pcpu_size[i];
480 return ptr;
481 }
482
483 printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n",
484 size);
485 return NULL;
486}
487
488static void percpu_modfree(void *freeme)
489{
490 unsigned int i;
491 void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
492 int cpu;
493
494 /* First entry is core kernel percpu data. */
495 for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
496 if (ptr == freeme) {
497 pcpu_size[i] = -pcpu_size[i];
498 goto free;
499 }
500 }
501 BUG();
502
503 free:
504 /* remove the per-cpu scanning areas */
505 for_each_possible_cpu(cpu)
506 kmemleak_free(freeme + per_cpu_offset(cpu));
507
508 /* Merge with previous? */
509 if (pcpu_size[i-1] >= 0) {
510 pcpu_size[i-1] += pcpu_size[i];
511 pcpu_num_used--;
512 memmove(&pcpu_size[i], &pcpu_size[i+1],
513 (pcpu_num_used - i) * sizeof(pcpu_size[0]));
514 i--;
515 }
516 /* Merge with next? */
517 if (i+1 < pcpu_num_used && pcpu_size[i+1] >= 0) {
518 pcpu_size[i] += pcpu_size[i+1];
519 pcpu_num_used--;
520 memmove(&pcpu_size[i+1], &pcpu_size[i+2],
521 (pcpu_num_used - (i+1)) * sizeof(pcpu_size[0]));
522 }
523}
524
525static int percpu_modinit(void)
526{
527 pcpu_num_used = 2;
528 pcpu_num_allocated = 2;
529 pcpu_size = kmalloc(sizeof(pcpu_size[0]) * pcpu_num_allocated,
530 GFP_KERNEL);
531 /* Static in-kernel percpu data (used). */
532 pcpu_size[0] = -(__per_cpu_end-__per_cpu_start);
533 /* Free room. */
534 pcpu_size[1] = PERCPU_ENOUGH_ROOM + pcpu_size[0];
535 if (pcpu_size[1] < 0) {
536 printk(KERN_ERR "No per-cpu room for modules.\n");
537 pcpu_num_used = 1;
538 }
539
540 return 0;
541}
542__initcall(percpu_modinit);
543
544#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
545
546static unsigned int find_pcpusec(Elf_Ehdr *hdr, 396static unsigned int find_pcpusec(Elf_Ehdr *hdr,
547 Elf_Shdr *sechdrs, 397 Elf_Shdr *sechdrs,
548 const char *secstrings) 398 const char *secstrings)
@@ -1030,11 +880,23 @@ static int try_to_force_load(struct module *mod, const char *reason)
1030} 880}
1031 881
1032#ifdef CONFIG_MODVERSIONS 882#ifdef CONFIG_MODVERSIONS
883/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. */
884static unsigned long maybe_relocated(unsigned long crc,
885 const struct module *crc_owner)
886{
887#ifdef ARCH_RELOCATES_KCRCTAB
888 if (crc_owner == NULL)
889 return crc - (unsigned long)reloc_start;
890#endif
891 return crc;
892}
893
1033static int check_version(Elf_Shdr *sechdrs, 894static int check_version(Elf_Shdr *sechdrs,
1034 unsigned int versindex, 895 unsigned int versindex,
1035 const char *symname, 896 const char *symname,
1036 struct module *mod, 897 struct module *mod,
1037 const unsigned long *crc) 898 const unsigned long *crc,
899 const struct module *crc_owner)
1038{ 900{
1039 unsigned int i, num_versions; 901 unsigned int i, num_versions;
1040 struct modversion_info *versions; 902 struct modversion_info *versions;
@@ -1055,10 +917,10 @@ static int check_version(Elf_Shdr *sechdrs,
1055 if (strcmp(versions[i].name, symname) != 0) 917 if (strcmp(versions[i].name, symname) != 0)
1056 continue; 918 continue;
1057 919
1058 if (versions[i].crc == *crc) 920 if (versions[i].crc == maybe_relocated(*crc, crc_owner))
1059 return 1; 921 return 1;
1060 DEBUGP("Found checksum %lX vs module %lX\n", 922 DEBUGP("Found checksum %lX vs module %lX\n",
1061 *crc, versions[i].crc); 923 maybe_relocated(*crc, crc_owner), versions[i].crc);
1062 goto bad_version; 924 goto bad_version;
1063 } 925 }
1064 926
@@ -1081,7 +943,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
1081 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 943 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
1082 &crc, true, false)) 944 &crc, true, false))
1083 BUG(); 945 BUG();
1084 return check_version(sechdrs, versindex, "module_layout", mod, crc); 946 return check_version(sechdrs, versindex, "module_layout", mod, crc,
947 NULL);
1085} 948}
1086 949
1087/* First part is kernel version, which we ignore if module has crcs. */ 950/* First part is kernel version, which we ignore if module has crcs. */
@@ -1099,7 +962,8 @@ static inline int check_version(Elf_Shdr *sechdrs,
1099 unsigned int versindex, 962 unsigned int versindex,
1100 const char *symname, 963 const char *symname,
1101 struct module *mod, 964 struct module *mod,
1102 const unsigned long *crc) 965 const unsigned long *crc,
966 const struct module *crc_owner)
1103{ 967{
1104 return 1; 968 return 1;
1105} 969}
@@ -1134,8 +998,8 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1134 /* use_module can fail due to OOM, 998 /* use_module can fail due to OOM,
1135 or module initialization or unloading */ 999 or module initialization or unloading */
1136 if (sym) { 1000 if (sym) {
1137 if (!check_version(sechdrs, versindex, name, mod, crc) || 1001 if (!check_version(sechdrs, versindex, name, mod, crc, owner)
1138 !use_module(mod, owner)) 1002 || !use_module(mod, owner))
1139 sym = NULL; 1003 sym = NULL;
1140 } 1004 }
1141 return sym; 1005 return sym;
@@ -1146,6 +1010,12 @@ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
1146 * J. Corbet <corbet@lwn.net> 1010 * J. Corbet <corbet@lwn.net>
1147 */ 1011 */
1148#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) 1012#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS)
1013
1014static inline bool sect_empty(const Elf_Shdr *sect)
1015{
1016 return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
1017}
1018
1149struct module_sect_attr 1019struct module_sect_attr
1150{ 1020{
1151 struct module_attribute mattr; 1021 struct module_attribute mattr;
@@ -1187,8 +1057,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1187 1057
1188 /* Count loaded sections and allocate structures */ 1058 /* Count loaded sections and allocate structures */
1189 for (i = 0; i < nsect; i++) 1059 for (i = 0; i < nsect; i++)
1190 if (sechdrs[i].sh_flags & SHF_ALLOC 1060 if (!sect_empty(&sechdrs[i]))
1191 && sechdrs[i].sh_size)
1192 nloaded++; 1061 nloaded++;
1193 size[0] = ALIGN(sizeof(*sect_attrs) 1062 size[0] = ALIGN(sizeof(*sect_attrs)
1194 + nloaded * sizeof(sect_attrs->attrs[0]), 1063 + nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1206,9 +1075,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
1206 sattr = &sect_attrs->attrs[0]; 1075 sattr = &sect_attrs->attrs[0];
1207 gattr = &sect_attrs->grp.attrs[0]; 1076 gattr = &sect_attrs->grp.attrs[0];
1208 for (i = 0; i < nsect; i++) { 1077 for (i = 0; i < nsect; i++) {
1209 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 1078 if (sect_empty(&sechdrs[i]))
1210 continue;
1211 if (!sechdrs[i].sh_size)
1212 continue; 1079 continue;
1213 sattr->address = sechdrs[i].sh_addr; 1080 sattr->address = sechdrs[i].sh_addr;
1214 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name, 1081 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
@@ -1292,7 +1159,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1292 /* Count notes sections and allocate structures. */ 1159 /* Count notes sections and allocate structures. */
1293 notes = 0; 1160 notes = 0;
1294 for (i = 0; i < nsect; i++) 1161 for (i = 0; i < nsect; i++)
1295 if ((sechdrs[i].sh_flags & SHF_ALLOC) && 1162 if (!sect_empty(&sechdrs[i]) &&
1296 (sechdrs[i].sh_type == SHT_NOTE)) 1163 (sechdrs[i].sh_type == SHT_NOTE))
1297 ++notes; 1164 ++notes;
1298 1165
@@ -1308,7 +1175,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1308 notes_attrs->notes = notes; 1175 notes_attrs->notes = notes;
1309 nattr = &notes_attrs->attrs[0]; 1176 nattr = &notes_attrs->attrs[0];
1310 for (loaded = i = 0; i < nsect; ++i) { 1177 for (loaded = i = 0; i < nsect; ++i) {
1311 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1178 if (sect_empty(&sechdrs[i]))
1312 continue; 1179 continue;
1313 if (sechdrs[i].sh_type == SHT_NOTE) { 1180 if (sechdrs[i].sh_type == SHT_NOTE) {
1314 nattr->attr.name = mod->sect_attrs->attrs[loaded].name; 1181 nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
@@ -2046,9 +1913,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
2046 unsigned int i; 1913 unsigned int i;
2047 1914
2048 /* only scan the sections containing data */ 1915 /* only scan the sections containing data */
2049 kmemleak_scan_area(mod->module_core, (unsigned long)mod - 1916 kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL);
2050 (unsigned long)mod->module_core,
2051 sizeof(struct module), GFP_KERNEL);
2052 1917
2053 for (i = 1; i < hdr->e_shnum; i++) { 1918 for (i = 1; i < hdr->e_shnum; i++) {
2054 if (!(sechdrs[i].sh_flags & SHF_ALLOC)) 1919 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
@@ -2057,8 +1922,7 @@ static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
2057 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0) 1922 && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
2058 continue; 1923 continue;
2059 1924
2060 kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr - 1925 kmemleak_scan_area((void *)sechdrs[i].sh_addr,
2061 (unsigned long)mod->module_core,
2062 sechdrs[i].sh_size, GFP_KERNEL); 1926 sechdrs[i].sh_size, GFP_KERNEL);
2063 } 1927 }
2064} 1928}
@@ -2386,6 +2250,12 @@ static noinline struct module *load_module(void __user *umod,
2386 "_ftrace_events", 2250 "_ftrace_events",
2387 sizeof(*mod->trace_events), 2251 sizeof(*mod->trace_events),
2388 &mod->num_trace_events); 2252 &mod->num_trace_events);
2253 /*
2254 * This section contains pointers to allocated objects in the trace
2255 * code and not scanning it leads to false positives.
2256 */
2257 kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) *
2258 mod->num_trace_events, GFP_KERNEL);
2389#endif 2259#endif
2390#ifdef CONFIG_FTRACE_MCOUNT_RECORD 2260#ifdef CONFIG_FTRACE_MCOUNT_RECORD
2391 /* sechdrs[0].sh_size is always zero */ 2261 /* sechdrs[0].sh_size is always zero */
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 6b2d735846a5..57d527a16f9d 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -43,13 +43,13 @@ static inline void mutex_clear_owner(struct mutex *lock)
43 \ 43 \
44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \ 44 DEBUG_LOCKS_WARN_ON(in_interrupt()); \
45 local_irq_save(flags); \ 45 local_irq_save(flags); \
46 __raw_spin_lock(&(lock)->raw_lock); \ 46 arch_spin_lock(&(lock)->rlock.raw_lock);\
47 DEBUG_LOCKS_WARN_ON(l->magic != l); \ 47 DEBUG_LOCKS_WARN_ON(l->magic != l); \
48 } while (0) 48 } while (0)
49 49
50#define spin_unlock_mutex(lock, flags) \ 50#define spin_unlock_mutex(lock, flags) \
51 do { \ 51 do { \
52 __raw_spin_unlock(&(lock)->raw_lock); \ 52 arch_spin_unlock(&(lock)->rlock.raw_lock); \
53 local_irq_restore(flags); \ 53 local_irq_restore(flags); \
54 preempt_check_resched(); \ 54 preempt_check_resched(); \
55 } while (0) 55 } while (0)
diff --git a/kernel/notifier.c b/kernel/notifier.c
index acd24e7643eb..2488ba7eb568 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
78 int ret = NOTIFY_DONE; 78 int ret = NOTIFY_DONE;
79 struct notifier_block *nb, *next_nb; 79 struct notifier_block *nb, *next_nb;
80 80
81 nb = rcu_dereference(*nl); 81 nb = rcu_dereference_raw(*nl);
82 82
83 while (nb && nr_to_call) { 83 while (nb && nr_to_call) {
84 next_nb = rcu_dereference(nb->next); 84 next_nb = rcu_dereference_raw(nb->next);
85 85
86#ifdef CONFIG_DEBUG_NOTIFIERS 86#ifdef CONFIG_DEBUG_NOTIFIERS
87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { 87 if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
@@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh,
309 * racy then it does not matter what the result of the test 309 * racy then it does not matter what the result of the test
310 * is, we re-check the list after having taken the lock anyway: 310 * is, we re-check the list after having taken the lock anyway:
311 */ 311 */
312 if (rcu_dereference(nh->head)) { 312 if (rcu_dereference_raw(nh->head)) {
313 down_read(&nh->rwsem); 313 down_read(&nh->rwsem);
314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call, 314 ret = notifier_call_chain(&nh->head, val, v, nr_to_call,
315 nr_calls); 315 nr_calls);
diff --git a/kernel/padata.c b/kernel/padata.c
new file mode 100644
index 000000000000..6f9bcb8313d6
--- /dev/null
+++ b/kernel/padata.c
@@ -0,0 +1,690 @@
1/*
2 * padata.c - generic interface to process data streams in parallel
3 *
4 * Copyright (C) 2008, 2009 secunet Security Networks AG
5 * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 *
16 * You should have received a copy of the GNU General Public License along with
17 * this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */
20
21#include <linux/module.h>
22#include <linux/cpumask.h>
23#include <linux/err.h>
24#include <linux/cpu.h>
25#include <linux/padata.h>
26#include <linux/mutex.h>
27#include <linux/sched.h>
28#include <linux/rcupdate.h>
29
30#define MAX_SEQ_NR INT_MAX - NR_CPUS
31#define MAX_OBJ_NUM 10000 * NR_CPUS
32
33static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
34{
35 int cpu, target_cpu;
36
37 target_cpu = cpumask_first(pd->cpumask);
38 for (cpu = 0; cpu < cpu_index; cpu++)
39 target_cpu = cpumask_next(target_cpu, pd->cpumask);
40
41 return target_cpu;
42}
43
44static int padata_cpu_hash(struct padata_priv *padata)
45{
46 int cpu_index;
47 struct parallel_data *pd;
48
49 pd = padata->pd;
50
51 /*
52 * Hash the sequence numbers to the cpus by taking
53 * seq_nr mod. number of cpus in use.
54 */
55 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask);
56
57 return padata_index_to_cpu(pd, cpu_index);
58}
59
60static void padata_parallel_worker(struct work_struct *work)
61{
62 struct padata_queue *queue;
63 struct parallel_data *pd;
64 struct padata_instance *pinst;
65 LIST_HEAD(local_list);
66
67 local_bh_disable();
68 queue = container_of(work, struct padata_queue, pwork);
69 pd = queue->pd;
70 pinst = pd->pinst;
71
72 spin_lock(&queue->parallel.lock);
73 list_replace_init(&queue->parallel.list, &local_list);
74 spin_unlock(&queue->parallel.lock);
75
76 while (!list_empty(&local_list)) {
77 struct padata_priv *padata;
78
79 padata = list_entry(local_list.next,
80 struct padata_priv, list);
81
82 list_del_init(&padata->list);
83
84 padata->parallel(padata);
85 }
86
87 local_bh_enable();
88}
89
90/*
91 * padata_do_parallel - padata parallelization function
92 *
93 * @pinst: padata instance
94 * @padata: object to be parallelized
95 * @cb_cpu: cpu the serialization callback function will run on,
96 * must be in the cpumask of padata.
97 *
98 * The parallelization callback function will run with BHs off.
99 * Note: Every object which is parallelized by padata_do_parallel
100 * must be seen by padata_do_serial.
101 */
102int padata_do_parallel(struct padata_instance *pinst,
103 struct padata_priv *padata, int cb_cpu)
104{
105 int target_cpu, err;
106 struct padata_queue *queue;
107 struct parallel_data *pd;
108
109 rcu_read_lock_bh();
110
111 pd = rcu_dereference(pinst->pd);
112
113 err = 0;
114 if (!(pinst->flags & PADATA_INIT))
115 goto out;
116
117 err = -EBUSY;
118 if ((pinst->flags & PADATA_RESET))
119 goto out;
120
121 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
122 goto out;
123
124 err = -EINVAL;
125 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
126 goto out;
127
128 err = -EINPROGRESS;
129 atomic_inc(&pd->refcnt);
130 padata->pd = pd;
131 padata->cb_cpu = cb_cpu;
132
133 if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
134 atomic_set(&pd->seq_nr, -1);
135
136 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
137
138 target_cpu = padata_cpu_hash(padata);
139 queue = per_cpu_ptr(pd->queue, target_cpu);
140
141 spin_lock(&queue->parallel.lock);
142 list_add_tail(&padata->list, &queue->parallel.list);
143 spin_unlock(&queue->parallel.lock);
144
145 queue_work_on(target_cpu, pinst->wq, &queue->pwork);
146
147out:
148 rcu_read_unlock_bh();
149
150 return err;
151}
152EXPORT_SYMBOL(padata_do_parallel);
153
154static struct padata_priv *padata_get_next(struct parallel_data *pd)
155{
156 int cpu, num_cpus, empty, calc_seq_nr;
157 int seq_nr, next_nr, overrun, next_overrun;
158 struct padata_queue *queue, *next_queue;
159 struct padata_priv *padata;
160 struct padata_list *reorder;
161
162 empty = 0;
163 next_nr = -1;
164 next_overrun = 0;
165 next_queue = NULL;
166
167 num_cpus = cpumask_weight(pd->cpumask);
168
169 for_each_cpu(cpu, pd->cpumask) {
170 queue = per_cpu_ptr(pd->queue, cpu);
171 reorder = &queue->reorder;
172
173 /*
174 * Calculate the seq_nr of the object that should be
175 * next in this queue.
176 */
177 overrun = 0;
178 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
179 + queue->cpu_index;
180
181 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
182 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
183 overrun = 1;
184 }
185
186 if (!list_empty(&reorder->list)) {
187 padata = list_entry(reorder->list.next,
188 struct padata_priv, list);
189
190 seq_nr = padata->seq_nr;
191 BUG_ON(calc_seq_nr != seq_nr);
192 } else {
193 seq_nr = calc_seq_nr;
194 empty++;
195 }
196
197 if (next_nr < 0 || seq_nr < next_nr
198 || (next_overrun && !overrun)) {
199 next_nr = seq_nr;
200 next_overrun = overrun;
201 next_queue = queue;
202 }
203 }
204
205 padata = NULL;
206
207 if (empty == num_cpus)
208 goto out;
209
210 reorder = &next_queue->reorder;
211
212 if (!list_empty(&reorder->list)) {
213 padata = list_entry(reorder->list.next,
214 struct padata_priv, list);
215
216 if (unlikely(next_overrun)) {
217 for_each_cpu(cpu, pd->cpumask) {
218 queue = per_cpu_ptr(pd->queue, cpu);
219 atomic_set(&queue->num_obj, 0);
220 }
221 }
222
223 spin_lock(&reorder->lock);
224 list_del_init(&padata->list);
225 atomic_dec(&pd->reorder_objects);
226 spin_unlock(&reorder->lock);
227
228 atomic_inc(&next_queue->num_obj);
229
230 goto out;
231 }
232
233 if (next_nr % num_cpus == next_queue->cpu_index) {
234 padata = ERR_PTR(-ENODATA);
235 goto out;
236 }
237
238 padata = ERR_PTR(-EINPROGRESS);
239out:
240 return padata;
241}
242
243static void padata_reorder(struct parallel_data *pd)
244{
245 struct padata_priv *padata;
246 struct padata_queue *queue;
247 struct padata_instance *pinst = pd->pinst;
248
249try_again:
250 if (!spin_trylock_bh(&pd->lock))
251 goto out;
252
253 while (1) {
254 padata = padata_get_next(pd);
255
256 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
257 break;
258
259 if (PTR_ERR(padata) == -ENODATA) {
260 spin_unlock_bh(&pd->lock);
261 goto out;
262 }
263
264 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
265
266 spin_lock(&queue->serial.lock);
267 list_add_tail(&padata->list, &queue->serial.list);
268 spin_unlock(&queue->serial.lock);
269
270 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork);
271 }
272
273 spin_unlock_bh(&pd->lock);
274
275 if (atomic_read(&pd->reorder_objects))
276 goto try_again;
277
278out:
279 return;
280}
281
282static void padata_serial_worker(struct work_struct *work)
283{
284 struct padata_queue *queue;
285 struct parallel_data *pd;
286 LIST_HEAD(local_list);
287
288 local_bh_disable();
289 queue = container_of(work, struct padata_queue, swork);
290 pd = queue->pd;
291
292 spin_lock(&queue->serial.lock);
293 list_replace_init(&queue->serial.list, &local_list);
294 spin_unlock(&queue->serial.lock);
295
296 while (!list_empty(&local_list)) {
297 struct padata_priv *padata;
298
299 padata = list_entry(local_list.next,
300 struct padata_priv, list);
301
302 list_del_init(&padata->list);
303
304 padata->serial(padata);
305 atomic_dec(&pd->refcnt);
306 }
307 local_bh_enable();
308}
309
310/*
311 * padata_do_serial - padata serialization function
312 *
313 * @padata: object to be serialized.
314 *
315 * padata_do_serial must be called for every parallelized object.
316 * The serialization callback function will run with BHs off.
317 */
318void padata_do_serial(struct padata_priv *padata)
319{
320 int cpu;
321 struct padata_queue *queue;
322 struct parallel_data *pd;
323
324 pd = padata->pd;
325
326 cpu = get_cpu();
327 queue = per_cpu_ptr(pd->queue, cpu);
328
329 spin_lock(&queue->reorder.lock);
330 atomic_inc(&pd->reorder_objects);
331 list_add_tail(&padata->list, &queue->reorder.list);
332 spin_unlock(&queue->reorder.lock);
333
334 put_cpu();
335
336 padata_reorder(pd);
337}
338EXPORT_SYMBOL(padata_do_serial);
339
340static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
341 const struct cpumask *cpumask)
342{
343 int cpu, cpu_index, num_cpus;
344 struct padata_queue *queue;
345 struct parallel_data *pd;
346
347 cpu_index = 0;
348
349 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
350 if (!pd)
351 goto err;
352
353 pd->queue = alloc_percpu(struct padata_queue);
354 if (!pd->queue)
355 goto err_free_pd;
356
357 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
358 goto err_free_queue;
359
360 for_each_possible_cpu(cpu) {
361 queue = per_cpu_ptr(pd->queue, cpu);
362
363 queue->pd = pd;
364
365 if (cpumask_test_cpu(cpu, cpumask)
366 && cpumask_test_cpu(cpu, cpu_active_mask)) {
367 queue->cpu_index = cpu_index;
368 cpu_index++;
369 } else
370 queue->cpu_index = -1;
371
372 INIT_LIST_HEAD(&queue->reorder.list);
373 INIT_LIST_HEAD(&queue->parallel.list);
374 INIT_LIST_HEAD(&queue->serial.list);
375 spin_lock_init(&queue->reorder.lock);
376 spin_lock_init(&queue->parallel.lock);
377 spin_lock_init(&queue->serial.lock);
378
379 INIT_WORK(&queue->pwork, padata_parallel_worker);
380 INIT_WORK(&queue->swork, padata_serial_worker);
381 atomic_set(&queue->num_obj, 0);
382 }
383
384 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
385
386 num_cpus = cpumask_weight(pd->cpumask);
387 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
388
389 atomic_set(&pd->seq_nr, -1);
390 atomic_set(&pd->reorder_objects, 0);
391 atomic_set(&pd->refcnt, 0);
392 pd->pinst = pinst;
393 spin_lock_init(&pd->lock);
394
395 return pd;
396
397err_free_queue:
398 free_percpu(pd->queue);
399err_free_pd:
400 kfree(pd);
401err:
402 return NULL;
403}
404
405static void padata_free_pd(struct parallel_data *pd)
406{
407 free_cpumask_var(pd->cpumask);
408 free_percpu(pd->queue);
409 kfree(pd);
410}
411
412static void padata_replace(struct padata_instance *pinst,
413 struct parallel_data *pd_new)
414{
415 struct parallel_data *pd_old = pinst->pd;
416
417 pinst->flags |= PADATA_RESET;
418
419 rcu_assign_pointer(pinst->pd, pd_new);
420
421 synchronize_rcu();
422
423 while (atomic_read(&pd_old->refcnt) != 0)
424 yield();
425
426 flush_workqueue(pinst->wq);
427
428 padata_free_pd(pd_old);
429
430 pinst->flags &= ~PADATA_RESET;
431}
432
433/*
434 * padata_set_cpumask - set the cpumask that padata should use
435 *
436 * @pinst: padata instance
437 * @cpumask: the cpumask to use
438 */
439int padata_set_cpumask(struct padata_instance *pinst,
440 cpumask_var_t cpumask)
441{
442 struct parallel_data *pd;
443 int err = 0;
444
445 might_sleep();
446
447 mutex_lock(&pinst->lock);
448
449 pd = padata_alloc_pd(pinst, cpumask);
450 if (!pd) {
451 err = -ENOMEM;
452 goto out;
453 }
454
455 cpumask_copy(pinst->cpumask, cpumask);
456
457 padata_replace(pinst, pd);
458
459out:
460 mutex_unlock(&pinst->lock);
461
462 return err;
463}
464EXPORT_SYMBOL(padata_set_cpumask);
465
466static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
467{
468 struct parallel_data *pd;
469
470 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
471 pd = padata_alloc_pd(pinst, pinst->cpumask);
472 if (!pd)
473 return -ENOMEM;
474
475 padata_replace(pinst, pd);
476 }
477
478 return 0;
479}
480
481/*
482 * padata_add_cpu - add a cpu to the padata cpumask
483 *
484 * @pinst: padata instance
485 * @cpu: cpu to add
486 */
487int padata_add_cpu(struct padata_instance *pinst, int cpu)
488{
489 int err;
490
491 might_sleep();
492
493 mutex_lock(&pinst->lock);
494
495 cpumask_set_cpu(cpu, pinst->cpumask);
496 err = __padata_add_cpu(pinst, cpu);
497
498 mutex_unlock(&pinst->lock);
499
500 return err;
501}
502EXPORT_SYMBOL(padata_add_cpu);
503
504static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
505{
506 struct parallel_data *pd;
507
508 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
509 pd = padata_alloc_pd(pinst, pinst->cpumask);
510 if (!pd)
511 return -ENOMEM;
512
513 padata_replace(pinst, pd);
514 }
515
516 return 0;
517}
518
519/*
520 * padata_remove_cpu - remove a cpu from the padata cpumask
521 *
522 * @pinst: padata instance
523 * @cpu: cpu to remove
524 */
525int padata_remove_cpu(struct padata_instance *pinst, int cpu)
526{
527 int err;
528
529 might_sleep();
530
531 mutex_lock(&pinst->lock);
532
533 cpumask_clear_cpu(cpu, pinst->cpumask);
534 err = __padata_remove_cpu(pinst, cpu);
535
536 mutex_unlock(&pinst->lock);
537
538 return err;
539}
540EXPORT_SYMBOL(padata_remove_cpu);
541
542/*
543 * padata_start - start the parallel processing
544 *
545 * @pinst: padata instance to start
546 */
547void padata_start(struct padata_instance *pinst)
548{
549 might_sleep();
550
551 mutex_lock(&pinst->lock);
552 pinst->flags |= PADATA_INIT;
553 mutex_unlock(&pinst->lock);
554}
555EXPORT_SYMBOL(padata_start);
556
557/*
558 * padata_stop - stop the parallel processing
559 *
560 * @pinst: padata instance to stop
561 */
562void padata_stop(struct padata_instance *pinst)
563{
564 might_sleep();
565
566 mutex_lock(&pinst->lock);
567 pinst->flags &= ~PADATA_INIT;
568 mutex_unlock(&pinst->lock);
569}
570EXPORT_SYMBOL(padata_stop);
571
572static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
573 unsigned long action, void *hcpu)
574{
575 int err;
576 struct padata_instance *pinst;
577 int cpu = (unsigned long)hcpu;
578
579 pinst = container_of(nfb, struct padata_instance, cpu_notifier);
580
581 switch (action) {
582 case CPU_ONLINE:
583 case CPU_ONLINE_FROZEN:
584 if (!cpumask_test_cpu(cpu, pinst->cpumask))
585 break;
586 mutex_lock(&pinst->lock);
587 err = __padata_add_cpu(pinst, cpu);
588 mutex_unlock(&pinst->lock);
589 if (err)
590 return NOTIFY_BAD;
591 break;
592
593 case CPU_DOWN_PREPARE:
594 case CPU_DOWN_PREPARE_FROZEN:
595 if (!cpumask_test_cpu(cpu, pinst->cpumask))
596 break;
597 mutex_lock(&pinst->lock);
598 err = __padata_remove_cpu(pinst, cpu);
599 mutex_unlock(&pinst->lock);
600 if (err)
601 return NOTIFY_BAD;
602 break;
603
604 case CPU_UP_CANCELED:
605 case CPU_UP_CANCELED_FROZEN:
606 if (!cpumask_test_cpu(cpu, pinst->cpumask))
607 break;
608 mutex_lock(&pinst->lock);
609 __padata_remove_cpu(pinst, cpu);
610 mutex_unlock(&pinst->lock);
611
612 case CPU_DOWN_FAILED:
613 case CPU_DOWN_FAILED_FROZEN:
614 if (!cpumask_test_cpu(cpu, pinst->cpumask))
615 break;
616 mutex_lock(&pinst->lock);
617 __padata_add_cpu(pinst, cpu);
618 mutex_unlock(&pinst->lock);
619 }
620
621 return NOTIFY_OK;
622}
623
624/*
625 * padata_alloc - allocate and initialize a padata instance
626 *
627 * @cpumask: cpumask that padata uses for parallelization
628 * @wq: workqueue to use for the allocated padata instance
629 */
630struct padata_instance *padata_alloc(const struct cpumask *cpumask,
631 struct workqueue_struct *wq)
632{
633 int err;
634 struct padata_instance *pinst;
635 struct parallel_data *pd;
636
637 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
638 if (!pinst)
639 goto err;
640
641 pd = padata_alloc_pd(pinst, cpumask);
642 if (!pd)
643 goto err_free_inst;
644
645 rcu_assign_pointer(pinst->pd, pd);
646
647 pinst->wq = wq;
648
649 cpumask_copy(pinst->cpumask, cpumask);
650
651 pinst->flags = 0;
652
653 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
654 pinst->cpu_notifier.priority = 0;
655 err = register_hotcpu_notifier(&pinst->cpu_notifier);
656 if (err)
657 goto err_free_pd;
658
659 mutex_init(&pinst->lock);
660
661 return pinst;
662
663err_free_pd:
664 padata_free_pd(pd);
665err_free_inst:
666 kfree(pinst);
667err:
668 return NULL;
669}
670EXPORT_SYMBOL(padata_alloc);
671
672/*
673 * padata_free - free a padata instance
674 *
675 * @ padata_inst: padata instance to free
676 */
677void padata_free(struct padata_instance *pinst)
678{
679 padata_stop(pinst);
680
681 synchronize_rcu();
682
683 while (atomic_read(&pinst->pd->refcnt) != 0)
684 yield();
685
686 unregister_hotcpu_notifier(&pinst->cpu_notifier);
687 padata_free_pd(pinst->pd);
688 kfree(pinst);
689}
690EXPORT_SYMBOL(padata_free);
diff --git a/kernel/panic.c b/kernel/panic.c
index 96b45d0b4ba5..c787333282b8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -10,6 +10,7 @@
10 */ 10 */
11#include <linux/debug_locks.h> 11#include <linux/debug_locks.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/kmsg_dump.h>
13#include <linux/kallsyms.h> 14#include <linux/kallsyms.h>
14#include <linux/notifier.h> 15#include <linux/notifier.h>
15#include <linux/module.h> 16#include <linux/module.h>
@@ -81,6 +82,8 @@ NORET_TYPE void panic(const char * fmt, ...)
81 */ 82 */
82 crash_kexec(NULL); 83 crash_kexec(NULL);
83 84
85 kmsg_dump(KMSG_DUMP_PANIC);
86
84 /* 87 /*
85 * Note smp_send_stop is the usual smp shutdown function, which 88 * Note smp_send_stop is the usual smp shutdown function, which
86 * unfortunately means it may not be hardened to work in a panic 89 * unfortunately means it may not be hardened to work in a panic
@@ -339,6 +342,7 @@ void oops_exit(void)
339{ 342{
340 do_oops_enter_exit(); 343 do_oops_enter_exit();
341 print_oops_end_marker(); 344 print_oops_end_marker();
345 kmsg_dump(KMSG_DUMP_OOPS);
342} 346}
343 347
344#ifdef WANT_WARN_ON_SLOWPATH 348#ifdef WANT_WARN_ON_SLOWPATH
diff --git a/kernel/params.c b/kernel/params.c
index d656c276508d..cf1b69183127 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,7 @@
24#include <linux/err.h> 24#include <linux/err.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ctype.h> 26#include <linux/ctype.h>
27#include <linux/string.h>
27 28
28#if 0 29#if 0
29#define DEBUGP printk 30#define DEBUGP printk
@@ -122,9 +123,7 @@ static char *next_arg(char *args, char **param, char **val)
122 next = args + i; 123 next = args + i;
123 124
124 /* Chew up trailing spaces. */ 125 /* Chew up trailing spaces. */
125 while (isspace(*next)) 126 return skip_spaces(next);
126 next++;
127 return next;
128} 127}
129 128
130/* Args looks like "foo=bar,bar2 baz=fuz wiz". */ 129/* Args looks like "foo=bar,bar2 baz=fuz wiz". */
@@ -139,8 +138,7 @@ int parse_args(const char *name,
139 DEBUGP("Parsing ARGS: %s\n", args); 138 DEBUGP("Parsing ARGS: %s\n", args);
140 139
141 /* Chew leading spaces */ 140 /* Chew leading spaces */
142 while (isspace(*args)) 141 args = skip_spaces(args);
143 args++;
144 142
145 while (*args) { 143 while (*args) {
146 int ret; 144 int ret;
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 6b7ddba1dd64..a661e7991865 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -36,7 +36,7 @@
36/* 36/*
37 * Each CPU has a list of per CPU events: 37 * Each CPU has a list of per CPU events:
38 */ 38 */
39DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); 39static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
40 40
41int perf_max_events __read_mostly = 1; 41int perf_max_events __read_mostly = 1;
42static int perf_reserved_percpu __read_mostly; 42static int perf_reserved_percpu __read_mostly;
@@ -98,11 +98,12 @@ void __weak hw_perf_enable(void) { barrier(); }
98 98
99void __weak hw_perf_event_setup(int cpu) { barrier(); } 99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); } 100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
101 102
102int __weak 103int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader, 104hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx, 105 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu) 106 struct perf_event_context *ctx)
106{ 107{
107 return 0; 108 return 0;
108} 109}
@@ -203,14 +204,14 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags)
203 * if so. If we locked the right context, then it 204 * if so. If we locked the right context, then it
204 * can't get swapped on us any more. 205 * can't get swapped on us any more.
205 */ 206 */
206 spin_lock_irqsave(&ctx->lock, *flags); 207 raw_spin_lock_irqsave(&ctx->lock, *flags);
207 if (ctx != rcu_dereference(task->perf_event_ctxp)) { 208 if (ctx != rcu_dereference(task->perf_event_ctxp)) {
208 spin_unlock_irqrestore(&ctx->lock, *flags); 209 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
209 goto retry; 210 goto retry;
210 } 211 }
211 212
212 if (!atomic_inc_not_zero(&ctx->refcount)) { 213 if (!atomic_inc_not_zero(&ctx->refcount)) {
213 spin_unlock_irqrestore(&ctx->lock, *flags); 214 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
214 ctx = NULL; 215 ctx = NULL;
215 } 216 }
216 } 217 }
@@ -231,7 +232,7 @@ static struct perf_event_context *perf_pin_task_context(struct task_struct *task
231 ctx = perf_lock_task_context(task, &flags); 232 ctx = perf_lock_task_context(task, &flags);
232 if (ctx) { 233 if (ctx) {
233 ++ctx->pin_count; 234 ++ctx->pin_count;
234 spin_unlock_irqrestore(&ctx->lock, flags); 235 raw_spin_unlock_irqrestore(&ctx->lock, flags);
235 } 236 }
236 return ctx; 237 return ctx;
237} 238}
@@ -240,15 +241,15 @@ static void perf_unpin_context(struct perf_event_context *ctx)
240{ 241{
241 unsigned long flags; 242 unsigned long flags;
242 243
243 spin_lock_irqsave(&ctx->lock, flags); 244 raw_spin_lock_irqsave(&ctx->lock, flags);
244 --ctx->pin_count; 245 --ctx->pin_count;
245 spin_unlock_irqrestore(&ctx->lock, flags); 246 raw_spin_unlock_irqrestore(&ctx->lock, flags);
246 put_ctx(ctx); 247 put_ctx(ctx);
247} 248}
248 249
249static inline u64 perf_clock(void) 250static inline u64 perf_clock(void)
250{ 251{
251 return cpu_clock(smp_processor_id()); 252 return cpu_clock(raw_smp_processor_id());
252} 253}
253 254
254/* 255/*
@@ -289,6 +290,15 @@ static void update_event_times(struct perf_event *event)
289 event->total_time_running = run_end - event->tstamp_running; 290 event->total_time_running = run_end - event->tstamp_running;
290} 291}
291 292
293static struct list_head *
294ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
295{
296 if (event->attr.pinned)
297 return &ctx->pinned_groups;
298 else
299 return &ctx->flexible_groups;
300}
301
292/* 302/*
293 * Add a event from the lists for its context. 303 * Add a event from the lists for its context.
294 * Must be called with ctx->mutex and ctx->lock held. 304 * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +313,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
303 * add it straight to the context's event list, or to the group 313 * add it straight to the context's event list, or to the group
304 * leader's sibling list: 314 * leader's sibling list:
305 */ 315 */
306 if (group_leader == event) 316 if (group_leader == event) {
307 list_add_tail(&event->group_entry, &ctx->group_list); 317 struct list_head *list;
308 else { 318
319 if (is_software_event(event))
320 event->group_flags |= PERF_GROUP_SOFTWARE;
321
322 list = ctx_group_list(event, ctx);
323 list_add_tail(&event->group_entry, list);
324 } else {
325 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
326 !is_software_event(event))
327 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
328
309 list_add_tail(&event->group_entry, &group_leader->sibling_list); 329 list_add_tail(&event->group_entry, &group_leader->sibling_list);
310 group_leader->nr_siblings++; 330 group_leader->nr_siblings++;
311 } 331 }
@@ -355,9 +375,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
355 * to the context list directly: 375 * to the context list directly:
356 */ 376 */
357 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 377 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
378 struct list_head *list;
358 379
359 list_move_tail(&sibling->group_entry, &ctx->group_list); 380 list = ctx_group_list(event, ctx);
381 list_move_tail(&sibling->group_entry, list);
360 sibling->group_leader = sibling; 382 sibling->group_leader = sibling;
383
384 /* Inherit group flags from the previous leader */
385 sibling->group_flags = event->group_flags;
361 } 386 }
362} 387}
363 388
@@ -427,7 +452,7 @@ static void __perf_event_remove_from_context(void *info)
427 if (ctx->task && cpuctx->task_ctx != ctx) 452 if (ctx->task && cpuctx->task_ctx != ctx)
428 return; 453 return;
429 454
430 spin_lock(&ctx->lock); 455 raw_spin_lock(&ctx->lock);
431 /* 456 /*
432 * Protect the list operation against NMI by disabling the 457 * Protect the list operation against NMI by disabling the
433 * events on a global level. 458 * events on a global level.
@@ -449,7 +474,7 @@ static void __perf_event_remove_from_context(void *info)
449 } 474 }
450 475
451 perf_enable(); 476 perf_enable();
452 spin_unlock(&ctx->lock); 477 raw_spin_unlock(&ctx->lock);
453} 478}
454 479
455 480
@@ -476,7 +501,7 @@ static void perf_event_remove_from_context(struct perf_event *event)
476 if (!task) { 501 if (!task) {
477 /* 502 /*
478 * Per cpu events are removed via an smp call and 503 * Per cpu events are removed via an smp call and
479 * the removal is always sucessful. 504 * the removal is always successful.
480 */ 505 */
481 smp_call_function_single(event->cpu, 506 smp_call_function_single(event->cpu,
482 __perf_event_remove_from_context, 507 __perf_event_remove_from_context,
@@ -488,12 +513,12 @@ retry:
488 task_oncpu_function_call(task, __perf_event_remove_from_context, 513 task_oncpu_function_call(task, __perf_event_remove_from_context,
489 event); 514 event);
490 515
491 spin_lock_irq(&ctx->lock); 516 raw_spin_lock_irq(&ctx->lock);
492 /* 517 /*
493 * If the context is active we need to retry the smp call. 518 * If the context is active we need to retry the smp call.
494 */ 519 */
495 if (ctx->nr_active && !list_empty(&event->group_entry)) { 520 if (ctx->nr_active && !list_empty(&event->group_entry)) {
496 spin_unlock_irq(&ctx->lock); 521 raw_spin_unlock_irq(&ctx->lock);
497 goto retry; 522 goto retry;
498 } 523 }
499 524
@@ -504,7 +529,7 @@ retry:
504 */ 529 */
505 if (!list_empty(&event->group_entry)) 530 if (!list_empty(&event->group_entry))
506 list_del_event(event, ctx); 531 list_del_event(event, ctx);
507 spin_unlock_irq(&ctx->lock); 532 raw_spin_unlock_irq(&ctx->lock);
508} 533}
509 534
510/* 535/*
@@ -535,7 +560,7 @@ static void __perf_event_disable(void *info)
535 if (ctx->task && cpuctx->task_ctx != ctx) 560 if (ctx->task && cpuctx->task_ctx != ctx)
536 return; 561 return;
537 562
538 spin_lock(&ctx->lock); 563 raw_spin_lock(&ctx->lock);
539 564
540 /* 565 /*
541 * If the event is on, turn it off. 566 * If the event is on, turn it off.
@@ -551,7 +576,7 @@ static void __perf_event_disable(void *info)
551 event->state = PERF_EVENT_STATE_OFF; 576 event->state = PERF_EVENT_STATE_OFF;
552 } 577 }
553 578
554 spin_unlock(&ctx->lock); 579 raw_spin_unlock(&ctx->lock);
555} 580}
556 581
557/* 582/*
@@ -567,7 +592,7 @@ static void __perf_event_disable(void *info)
567 * is the current context on this CPU and preemption is disabled, 592 * is the current context on this CPU and preemption is disabled,
568 * hence we can't get into perf_event_task_sched_out for this context. 593 * hence we can't get into perf_event_task_sched_out for this context.
569 */ 594 */
570static void perf_event_disable(struct perf_event *event) 595void perf_event_disable(struct perf_event *event)
571{ 596{
572 struct perf_event_context *ctx = event->ctx; 597 struct perf_event_context *ctx = event->ctx;
573 struct task_struct *task = ctx->task; 598 struct task_struct *task = ctx->task;
@@ -584,12 +609,12 @@ static void perf_event_disable(struct perf_event *event)
584 retry: 609 retry:
585 task_oncpu_function_call(task, __perf_event_disable, event); 610 task_oncpu_function_call(task, __perf_event_disable, event);
586 611
587 spin_lock_irq(&ctx->lock); 612 raw_spin_lock_irq(&ctx->lock);
588 /* 613 /*
589 * If the event is still active, we need to retry the cross-call. 614 * If the event is still active, we need to retry the cross-call.
590 */ 615 */
591 if (event->state == PERF_EVENT_STATE_ACTIVE) { 616 if (event->state == PERF_EVENT_STATE_ACTIVE) {
592 spin_unlock_irq(&ctx->lock); 617 raw_spin_unlock_irq(&ctx->lock);
593 goto retry; 618 goto retry;
594 } 619 }
595 620
@@ -602,20 +627,19 @@ static void perf_event_disable(struct perf_event *event)
602 event->state = PERF_EVENT_STATE_OFF; 627 event->state = PERF_EVENT_STATE_OFF;
603 } 628 }
604 629
605 spin_unlock_irq(&ctx->lock); 630 raw_spin_unlock_irq(&ctx->lock);
606} 631}
607 632
608static int 633static int
609event_sched_in(struct perf_event *event, 634event_sched_in(struct perf_event *event,
610 struct perf_cpu_context *cpuctx, 635 struct perf_cpu_context *cpuctx,
611 struct perf_event_context *ctx, 636 struct perf_event_context *ctx)
612 int cpu)
613{ 637{
614 if (event->state <= PERF_EVENT_STATE_OFF) 638 if (event->state <= PERF_EVENT_STATE_OFF)
615 return 0; 639 return 0;
616 640
617 event->state = PERF_EVENT_STATE_ACTIVE; 641 event->state = PERF_EVENT_STATE_ACTIVE;
618 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 642 event->oncpu = smp_processor_id();
619 /* 643 /*
620 * The new state must be visible before we turn it on in the hardware: 644 * The new state must be visible before we turn it on in the hardware:
621 */ 645 */
@@ -642,8 +666,7 @@ event_sched_in(struct perf_event *event,
642static int 666static int
643group_sched_in(struct perf_event *group_event, 667group_sched_in(struct perf_event *group_event,
644 struct perf_cpu_context *cpuctx, 668 struct perf_cpu_context *cpuctx,
645 struct perf_event_context *ctx, 669 struct perf_event_context *ctx)
646 int cpu)
647{ 670{
648 struct perf_event *event, *partial_group; 671 struct perf_event *event, *partial_group;
649 int ret; 672 int ret;
@@ -651,18 +674,18 @@ group_sched_in(struct perf_event *group_event,
651 if (group_event->state == PERF_EVENT_STATE_OFF) 674 if (group_event->state == PERF_EVENT_STATE_OFF)
652 return 0; 675 return 0;
653 676
654 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 677 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
655 if (ret) 678 if (ret)
656 return ret < 0 ? ret : 0; 679 return ret < 0 ? ret : 0;
657 680
658 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 681 if (event_sched_in(group_event, cpuctx, ctx))
659 return -EAGAIN; 682 return -EAGAIN;
660 683
661 /* 684 /*
662 * Schedule in siblings as one group (if any): 685 * Schedule in siblings as one group (if any):
663 */ 686 */
664 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 687 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
665 if (event_sched_in(event, cpuctx, ctx, cpu)) { 688 if (event_sched_in(event, cpuctx, ctx)) {
666 partial_group = event; 689 partial_group = event;
667 goto group_error; 690 goto group_error;
668 } 691 }
@@ -686,24 +709,6 @@ group_error:
686} 709}
687 710
688/* 711/*
689 * Return 1 for a group consisting entirely of software events,
690 * 0 if the group contains any hardware events.
691 */
692static int is_software_only_group(struct perf_event *leader)
693{
694 struct perf_event *event;
695
696 if (!is_software_event(leader))
697 return 0;
698
699 list_for_each_entry(event, &leader->sibling_list, group_entry)
700 if (!is_software_event(event))
701 return 0;
702
703 return 1;
704}
705
706/*
707 * Work out whether we can put this event group on the CPU now. 712 * Work out whether we can put this event group on the CPU now.
708 */ 713 */
709static int group_can_go_on(struct perf_event *event, 714static int group_can_go_on(struct perf_event *event,
@@ -713,7 +718,7 @@ static int group_can_go_on(struct perf_event *event,
713 /* 718 /*
714 * Groups consisting entirely of software events can always go on. 719 * Groups consisting entirely of software events can always go on.
715 */ 720 */
716 if (is_software_only_group(event)) 721 if (event->group_flags & PERF_GROUP_SOFTWARE)
717 return 1; 722 return 1;
718 /* 723 /*
719 * If an exclusive group is already on, no other hardware 724 * If an exclusive group is already on, no other hardware
@@ -754,7 +759,6 @@ static void __perf_install_in_context(void *info)
754 struct perf_event *event = info; 759 struct perf_event *event = info;
755 struct perf_event_context *ctx = event->ctx; 760 struct perf_event_context *ctx = event->ctx;
756 struct perf_event *leader = event->group_leader; 761 struct perf_event *leader = event->group_leader;
757 int cpu = smp_processor_id();
758 int err; 762 int err;
759 763
760 /* 764 /*
@@ -770,7 +774,7 @@ static void __perf_install_in_context(void *info)
770 cpuctx->task_ctx = ctx; 774 cpuctx->task_ctx = ctx;
771 } 775 }
772 776
773 spin_lock(&ctx->lock); 777 raw_spin_lock(&ctx->lock);
774 ctx->is_active = 1; 778 ctx->is_active = 1;
775 update_context_time(ctx); 779 update_context_time(ctx);
776 780
@@ -782,6 +786,9 @@ static void __perf_install_in_context(void *info)
782 786
783 add_event_to_ctx(event, ctx); 787 add_event_to_ctx(event, ctx);
784 788
789 if (event->cpu != -1 && event->cpu != smp_processor_id())
790 goto unlock;
791
785 /* 792 /*
786 * Don't put the event on if it is disabled or if 793 * Don't put the event on if it is disabled or if
787 * it is in a group and the group isn't on. 794 * it is in a group and the group isn't on.
@@ -798,7 +805,7 @@ static void __perf_install_in_context(void *info)
798 if (!group_can_go_on(event, cpuctx, 1)) 805 if (!group_can_go_on(event, cpuctx, 1))
799 err = -EEXIST; 806 err = -EEXIST;
800 else 807 else
801 err = event_sched_in(event, cpuctx, ctx, cpu); 808 err = event_sched_in(event, cpuctx, ctx);
802 809
803 if (err) { 810 if (err) {
804 /* 811 /*
@@ -820,7 +827,7 @@ static void __perf_install_in_context(void *info)
820 unlock: 827 unlock:
821 perf_enable(); 828 perf_enable();
822 829
823 spin_unlock(&ctx->lock); 830 raw_spin_unlock(&ctx->lock);
824} 831}
825 832
826/* 833/*
@@ -845,7 +852,7 @@ perf_install_in_context(struct perf_event_context *ctx,
845 if (!task) { 852 if (!task) {
846 /* 853 /*
847 * Per cpu events are installed via an smp call and 854 * Per cpu events are installed via an smp call and
848 * the install is always sucessful. 855 * the install is always successful.
849 */ 856 */
850 smp_call_function_single(cpu, __perf_install_in_context, 857 smp_call_function_single(cpu, __perf_install_in_context,
851 event, 1); 858 event, 1);
@@ -856,12 +863,12 @@ retry:
856 task_oncpu_function_call(task, __perf_install_in_context, 863 task_oncpu_function_call(task, __perf_install_in_context,
857 event); 864 event);
858 865
859 spin_lock_irq(&ctx->lock); 866 raw_spin_lock_irq(&ctx->lock);
860 /* 867 /*
861 * we need to retry the smp call. 868 * we need to retry the smp call.
862 */ 869 */
863 if (ctx->is_active && list_empty(&event->group_entry)) { 870 if (ctx->is_active && list_empty(&event->group_entry)) {
864 spin_unlock_irq(&ctx->lock); 871 raw_spin_unlock_irq(&ctx->lock);
865 goto retry; 872 goto retry;
866 } 873 }
867 874
@@ -872,7 +879,7 @@ retry:
872 */ 879 */
873 if (list_empty(&event->group_entry)) 880 if (list_empty(&event->group_entry))
874 add_event_to_ctx(event, ctx); 881 add_event_to_ctx(event, ctx);
875 spin_unlock_irq(&ctx->lock); 882 raw_spin_unlock_irq(&ctx->lock);
876} 883}
877 884
878/* 885/*
@@ -917,7 +924,7 @@ static void __perf_event_enable(void *info)
917 cpuctx->task_ctx = ctx; 924 cpuctx->task_ctx = ctx;
918 } 925 }
919 926
920 spin_lock(&ctx->lock); 927 raw_spin_lock(&ctx->lock);
921 ctx->is_active = 1; 928 ctx->is_active = 1;
922 update_context_time(ctx); 929 update_context_time(ctx);
923 930
@@ -925,6 +932,9 @@ static void __perf_event_enable(void *info)
925 goto unlock; 932 goto unlock;
926 __perf_event_mark_enabled(event, ctx); 933 __perf_event_mark_enabled(event, ctx);
927 934
935 if (event->cpu != -1 && event->cpu != smp_processor_id())
936 goto unlock;
937
928 /* 938 /*
929 * If the event is in a group and isn't the group leader, 939 * If the event is in a group and isn't the group leader,
930 * then don't put it on unless the group is on. 940 * then don't put it on unless the group is on.
@@ -937,11 +947,9 @@ static void __perf_event_enable(void *info)
937 } else { 947 } else {
938 perf_disable(); 948 perf_disable();
939 if (event == leader) 949 if (event == leader)
940 err = group_sched_in(event, cpuctx, ctx, 950 err = group_sched_in(event, cpuctx, ctx);
941 smp_processor_id());
942 else 951 else
943 err = event_sched_in(event, cpuctx, ctx, 952 err = event_sched_in(event, cpuctx, ctx);
944 smp_processor_id());
945 perf_enable(); 953 perf_enable();
946 } 954 }
947 955
@@ -959,7 +967,7 @@ static void __perf_event_enable(void *info)
959 } 967 }
960 968
961 unlock: 969 unlock:
962 spin_unlock(&ctx->lock); 970 raw_spin_unlock(&ctx->lock);
963} 971}
964 972
965/* 973/*
@@ -971,7 +979,7 @@ static void __perf_event_enable(void *info)
971 * perf_event_for_each_child or perf_event_for_each as described 979 * perf_event_for_each_child or perf_event_for_each as described
972 * for perf_event_disable. 980 * for perf_event_disable.
973 */ 981 */
974static void perf_event_enable(struct perf_event *event) 982void perf_event_enable(struct perf_event *event)
975{ 983{
976 struct perf_event_context *ctx = event->ctx; 984 struct perf_event_context *ctx = event->ctx;
977 struct task_struct *task = ctx->task; 985 struct task_struct *task = ctx->task;
@@ -985,7 +993,7 @@ static void perf_event_enable(struct perf_event *event)
985 return; 993 return;
986 } 994 }
987 995
988 spin_lock_irq(&ctx->lock); 996 raw_spin_lock_irq(&ctx->lock);
989 if (event->state >= PERF_EVENT_STATE_INACTIVE) 997 if (event->state >= PERF_EVENT_STATE_INACTIVE)
990 goto out; 998 goto out;
991 999
@@ -1000,10 +1008,10 @@ static void perf_event_enable(struct perf_event *event)
1000 event->state = PERF_EVENT_STATE_OFF; 1008 event->state = PERF_EVENT_STATE_OFF;
1001 1009
1002 retry: 1010 retry:
1003 spin_unlock_irq(&ctx->lock); 1011 raw_spin_unlock_irq(&ctx->lock);
1004 task_oncpu_function_call(task, __perf_event_enable, event); 1012 task_oncpu_function_call(task, __perf_event_enable, event);
1005 1013
1006 spin_lock_irq(&ctx->lock); 1014 raw_spin_lock_irq(&ctx->lock);
1007 1015
1008 /* 1016 /*
1009 * If the context is active and the event is still off, 1017 * If the context is active and the event is still off,
@@ -1020,7 +1028,7 @@ static void perf_event_enable(struct perf_event *event)
1020 __perf_event_mark_enabled(event, ctx); 1028 __perf_event_mark_enabled(event, ctx);
1021 1029
1022 out: 1030 out:
1023 spin_unlock_irq(&ctx->lock); 1031 raw_spin_unlock_irq(&ctx->lock);
1024} 1032}
1025 1033
1026static int perf_event_refresh(struct perf_event *event, int refresh) 1034static int perf_event_refresh(struct perf_event *event, int refresh)
@@ -1037,25 +1045,40 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1037 return 0; 1045 return 0;
1038} 1046}
1039 1047
1040void __perf_event_sched_out(struct perf_event_context *ctx, 1048enum event_type_t {
1041 struct perf_cpu_context *cpuctx) 1049 EVENT_FLEXIBLE = 0x1,
1050 EVENT_PINNED = 0x2,
1051 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1052};
1053
1054static void ctx_sched_out(struct perf_event_context *ctx,
1055 struct perf_cpu_context *cpuctx,
1056 enum event_type_t event_type)
1042{ 1057{
1043 struct perf_event *event; 1058 struct perf_event *event;
1044 1059
1045 spin_lock(&ctx->lock); 1060 raw_spin_lock(&ctx->lock);
1046 ctx->is_active = 0; 1061 ctx->is_active = 0;
1047 if (likely(!ctx->nr_events)) 1062 if (likely(!ctx->nr_events))
1048 goto out; 1063 goto out;
1049 update_context_time(ctx); 1064 update_context_time(ctx);
1050 1065
1051 perf_disable(); 1066 perf_disable();
1052 if (ctx->nr_active) { 1067 if (!ctx->nr_active)
1053 list_for_each_entry(event, &ctx->group_list, group_entry) 1068 goto out_enable;
1069
1070 if (event_type & EVENT_PINNED)
1071 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1054 group_sched_out(event, cpuctx, ctx); 1072 group_sched_out(event, cpuctx, ctx);
1055 } 1073
1074 if (event_type & EVENT_FLEXIBLE)
1075 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1076 group_sched_out(event, cpuctx, ctx);
1077
1078 out_enable:
1056 perf_enable(); 1079 perf_enable();
1057 out: 1080 out:
1058 spin_unlock(&ctx->lock); 1081 raw_spin_unlock(&ctx->lock);
1059} 1082}
1060 1083
1061/* 1084/*
@@ -1164,9 +1187,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1164 * not restart the event. 1187 * not restart the event.
1165 */ 1188 */
1166void perf_event_task_sched_out(struct task_struct *task, 1189void perf_event_task_sched_out(struct task_struct *task,
1167 struct task_struct *next, int cpu) 1190 struct task_struct *next)
1168{ 1191{
1169 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1192 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1170 struct perf_event_context *ctx = task->perf_event_ctxp; 1193 struct perf_event_context *ctx = task->perf_event_ctxp;
1171 struct perf_event_context *next_ctx; 1194 struct perf_event_context *next_ctx;
1172 struct perf_event_context *parent; 1195 struct perf_event_context *parent;
@@ -1193,8 +1216,8 @@ void perf_event_task_sched_out(struct task_struct *task,
1193 * order we take the locks because no other cpu could 1216 * order we take the locks because no other cpu could
1194 * be trying to lock both of these tasks. 1217 * be trying to lock both of these tasks.
1195 */ 1218 */
1196 spin_lock(&ctx->lock); 1219 raw_spin_lock(&ctx->lock);
1197 spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); 1220 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
1198 if (context_equiv(ctx, next_ctx)) { 1221 if (context_equiv(ctx, next_ctx)) {
1199 /* 1222 /*
1200 * XXX do we need a memory barrier of sorts 1223 * XXX do we need a memory barrier of sorts
@@ -1208,21 +1231,19 @@ void perf_event_task_sched_out(struct task_struct *task,
1208 1231
1209 perf_event_sync_stat(ctx, next_ctx); 1232 perf_event_sync_stat(ctx, next_ctx);
1210 } 1233 }
1211 spin_unlock(&next_ctx->lock); 1234 raw_spin_unlock(&next_ctx->lock);
1212 spin_unlock(&ctx->lock); 1235 raw_spin_unlock(&ctx->lock);
1213 } 1236 }
1214 rcu_read_unlock(); 1237 rcu_read_unlock();
1215 1238
1216 if (do_switch) { 1239 if (do_switch) {
1217 __perf_event_sched_out(ctx, cpuctx); 1240 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1218 cpuctx->task_ctx = NULL; 1241 cpuctx->task_ctx = NULL;
1219 } 1242 }
1220} 1243}
1221 1244
1222/* 1245static void task_ctx_sched_out(struct perf_event_context *ctx,
1223 * Called with IRQs disabled 1246 enum event_type_t event_type)
1224 */
1225static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1226{ 1247{
1227 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1248 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1228 1249
@@ -1232,47 +1253,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1232 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1253 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1233 return; 1254 return;
1234 1255
1235 __perf_event_sched_out(ctx, cpuctx); 1256 ctx_sched_out(ctx, cpuctx, event_type);
1236 cpuctx->task_ctx = NULL; 1257 cpuctx->task_ctx = NULL;
1237} 1258}
1238 1259
1239/* 1260/*
1240 * Called with IRQs disabled 1261 * Called with IRQs disabled
1241 */ 1262 */
1242static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1263static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1243{ 1264{
1244 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1265 task_ctx_sched_out(ctx, EVENT_ALL);
1266}
1267
1268/*
1269 * Called with IRQs disabled
1270 */
1271static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1272 enum event_type_t event_type)
1273{
1274 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1245} 1275}
1246 1276
1247static void 1277static void
1248__perf_event_sched_in(struct perf_event_context *ctx, 1278ctx_pinned_sched_in(struct perf_event_context *ctx,
1249 struct perf_cpu_context *cpuctx, int cpu) 1279 struct perf_cpu_context *cpuctx)
1250{ 1280{
1251 struct perf_event *event; 1281 struct perf_event *event;
1252 int can_add_hw = 1;
1253 1282
1254 spin_lock(&ctx->lock); 1283 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1255 ctx->is_active = 1; 1284 if (event->state <= PERF_EVENT_STATE_OFF)
1256 if (likely(!ctx->nr_events))
1257 goto out;
1258
1259 ctx->timestamp = perf_clock();
1260
1261 perf_disable();
1262
1263 /*
1264 * First go through the list and put on any pinned groups
1265 * in order to give them the best chance of going on.
1266 */
1267 list_for_each_entry(event, &ctx->group_list, group_entry) {
1268 if (event->state <= PERF_EVENT_STATE_OFF ||
1269 !event->attr.pinned)
1270 continue; 1285 continue;
1271 if (event->cpu != -1 && event->cpu != cpu) 1286 if (event->cpu != -1 && event->cpu != smp_processor_id())
1272 continue; 1287 continue;
1273 1288
1274 if (group_can_go_on(event, cpuctx, 1)) 1289 if (group_can_go_on(event, cpuctx, 1))
1275 group_sched_in(event, cpuctx, ctx, cpu); 1290 group_sched_in(event, cpuctx, ctx);
1276 1291
1277 /* 1292 /*
1278 * If this pinned group hasn't been scheduled, 1293 * If this pinned group hasn't been scheduled,
@@ -1283,32 +1298,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1283 event->state = PERF_EVENT_STATE_ERROR; 1298 event->state = PERF_EVENT_STATE_ERROR;
1284 } 1299 }
1285 } 1300 }
1301}
1286 1302
1287 list_for_each_entry(event, &ctx->group_list, group_entry) { 1303static void
1288 /* 1304ctx_flexible_sched_in(struct perf_event_context *ctx,
1289 * Ignore events in OFF or ERROR state, and 1305 struct perf_cpu_context *cpuctx)
1290 * ignore pinned events since we did them already. 1306{
1291 */ 1307 struct perf_event *event;
1292 if (event->state <= PERF_EVENT_STATE_OFF || 1308 int can_add_hw = 1;
1293 event->attr.pinned)
1294 continue;
1295 1309
1310 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1311 /* Ignore events in OFF or ERROR state */
1312 if (event->state <= PERF_EVENT_STATE_OFF)
1313 continue;
1296 /* 1314 /*
1297 * Listen to the 'cpu' scheduling filter constraint 1315 * Listen to the 'cpu' scheduling filter constraint
1298 * of events: 1316 * of events:
1299 */ 1317 */
1300 if (event->cpu != -1 && event->cpu != cpu) 1318 if (event->cpu != -1 && event->cpu != smp_processor_id())
1301 continue; 1319 continue;
1302 1320
1303 if (group_can_go_on(event, cpuctx, can_add_hw)) 1321 if (group_can_go_on(event, cpuctx, can_add_hw))
1304 if (group_sched_in(event, cpuctx, ctx, cpu)) 1322 if (group_sched_in(event, cpuctx, ctx))
1305 can_add_hw = 0; 1323 can_add_hw = 0;
1306 } 1324 }
1325}
1326
1327static void
1328ctx_sched_in(struct perf_event_context *ctx,
1329 struct perf_cpu_context *cpuctx,
1330 enum event_type_t event_type)
1331{
1332 raw_spin_lock(&ctx->lock);
1333 ctx->is_active = 1;
1334 if (likely(!ctx->nr_events))
1335 goto out;
1336
1337 ctx->timestamp = perf_clock();
1338
1339 perf_disable();
1340
1341 /*
1342 * First go through the list and put on any pinned groups
1343 * in order to give them the best chance of going on.
1344 */
1345 if (event_type & EVENT_PINNED)
1346 ctx_pinned_sched_in(ctx, cpuctx);
1347
1348 /* Then walk through the lower prio flexible groups */
1349 if (event_type & EVENT_FLEXIBLE)
1350 ctx_flexible_sched_in(ctx, cpuctx);
1351
1307 perf_enable(); 1352 perf_enable();
1308 out: 1353 out:
1309 spin_unlock(&ctx->lock); 1354 raw_spin_unlock(&ctx->lock);
1310} 1355}
1311 1356
1357static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1358 enum event_type_t event_type)
1359{
1360 struct perf_event_context *ctx = &cpuctx->ctx;
1361
1362 ctx_sched_in(ctx, cpuctx, event_type);
1363}
1364
1365static void task_ctx_sched_in(struct task_struct *task,
1366 enum event_type_t event_type)
1367{
1368 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1369 struct perf_event_context *ctx = task->perf_event_ctxp;
1370
1371 if (likely(!ctx))
1372 return;
1373 if (cpuctx->task_ctx == ctx)
1374 return;
1375 ctx_sched_in(ctx, cpuctx, event_type);
1376 cpuctx->task_ctx = ctx;
1377}
1312/* 1378/*
1313 * Called from scheduler to add the events of the current task 1379 * Called from scheduler to add the events of the current task
1314 * with interrupts disabled. 1380 * with interrupts disabled.
@@ -1320,38 +1386,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1320 * accessing the event control register. If a NMI hits, then it will 1386 * accessing the event control register. If a NMI hits, then it will
1321 * keep the event running. 1387 * keep the event running.
1322 */ 1388 */
1323void perf_event_task_sched_in(struct task_struct *task, int cpu) 1389void perf_event_task_sched_in(struct task_struct *task)
1324{ 1390{
1325 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1391 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1326 struct perf_event_context *ctx = task->perf_event_ctxp; 1392 struct perf_event_context *ctx = task->perf_event_ctxp;
1327 1393
1328 if (likely(!ctx)) 1394 if (likely(!ctx))
1329 return; 1395 return;
1396
1330 if (cpuctx->task_ctx == ctx) 1397 if (cpuctx->task_ctx == ctx)
1331 return; 1398 return;
1332 __perf_event_sched_in(ctx, cpuctx, cpu); 1399
1400 /*
1401 * We want to keep the following priority order:
1402 * cpu pinned (that don't need to move), task pinned,
1403 * cpu flexible, task flexible.
1404 */
1405 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1406
1407 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1408 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1409 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1410
1333 cpuctx->task_ctx = ctx; 1411 cpuctx->task_ctx = ctx;
1334} 1412}
1335 1413
1336static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1414#define MAX_INTERRUPTS (~0ULL)
1415
1416static void perf_log_throttle(struct perf_event *event, int enable);
1417
1418static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1419{
1420 u64 frequency = event->attr.sample_freq;
1421 u64 sec = NSEC_PER_SEC;
1422 u64 divisor, dividend;
1423
1424 int count_fls, nsec_fls, frequency_fls, sec_fls;
1425
1426 count_fls = fls64(count);
1427 nsec_fls = fls64(nsec);
1428 frequency_fls = fls64(frequency);
1429 sec_fls = 30;
1430
1431 /*
1432 * We got @count in @nsec, with a target of sample_freq HZ
1433 * the target period becomes:
1434 *
1435 * @count * 10^9
1436 * period = -------------------
1437 * @nsec * sample_freq
1438 *
1439 */
1440
1441 /*
1442 * Reduce accuracy by one bit such that @a and @b converge
1443 * to a similar magnitude.
1444 */
1445#define REDUCE_FLS(a, b) \
1446do { \
1447 if (a##_fls > b##_fls) { \
1448 a >>= 1; \
1449 a##_fls--; \
1450 } else { \
1451 b >>= 1; \
1452 b##_fls--; \
1453 } \
1454} while (0)
1455
1456 /*
1457 * Reduce accuracy until either term fits in a u64, then proceed with
1458 * the other, so that finally we can do a u64/u64 division.
1459 */
1460 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1461 REDUCE_FLS(nsec, frequency);
1462 REDUCE_FLS(sec, count);
1463 }
1464
1465 if (count_fls + sec_fls > 64) {
1466 divisor = nsec * frequency;
1467
1468 while (count_fls + sec_fls > 64) {
1469 REDUCE_FLS(count, sec);
1470 divisor >>= 1;
1471 }
1472
1473 dividend = count * sec;
1474 } else {
1475 dividend = count * sec;
1476
1477 while (nsec_fls + frequency_fls > 64) {
1478 REDUCE_FLS(nsec, frequency);
1479 dividend >>= 1;
1480 }
1481
1482 divisor = nsec * frequency;
1483 }
1484
1485 return div64_u64(dividend, divisor);
1486}
1487
1488static void perf_event_stop(struct perf_event *event)
1337{ 1489{
1338 struct perf_event_context *ctx = &cpuctx->ctx; 1490 if (!event->pmu->stop)
1491 return event->pmu->disable(event);
1339 1492
1340 __perf_event_sched_in(ctx, cpuctx, cpu); 1493 return event->pmu->stop(event);
1341} 1494}
1342 1495
1343#define MAX_INTERRUPTS (~0ULL) 1496static int perf_event_start(struct perf_event *event)
1497{
1498 if (!event->pmu->start)
1499 return event->pmu->enable(event);
1344 1500
1345static void perf_log_throttle(struct perf_event *event, int enable); 1501 return event->pmu->start(event);
1502}
1346 1503
1347static void perf_adjust_period(struct perf_event *event, u64 events) 1504static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1348{ 1505{
1349 struct hw_perf_event *hwc = &event->hw; 1506 struct hw_perf_event *hwc = &event->hw;
1350 u64 period, sample_period; 1507 u64 period, sample_period;
1351 s64 delta; 1508 s64 delta;
1352 1509
1353 events *= hwc->sample_period; 1510 period = perf_calculate_period(event, nsec, count);
1354 period = div64_u64(events, event->attr.sample_freq);
1355 1511
1356 delta = (s64)(period - hwc->sample_period); 1512 delta = (s64)(period - hwc->sample_period);
1357 delta = (delta + 7) / 8; /* low pass filter */ 1513 delta = (delta + 7) / 8; /* low pass filter */
@@ -1362,19 +1518,31 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1362 sample_period = 1; 1518 sample_period = 1;
1363 1519
1364 hwc->sample_period = sample_period; 1520 hwc->sample_period = sample_period;
1521
1522 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1523 perf_disable();
1524 perf_event_stop(event);
1525 atomic64_set(&hwc->period_left, 0);
1526 perf_event_start(event);
1527 perf_enable();
1528 }
1365} 1529}
1366 1530
1367static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1531static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1368{ 1532{
1369 struct perf_event *event; 1533 struct perf_event *event;
1370 struct hw_perf_event *hwc; 1534 struct hw_perf_event *hwc;
1371 u64 interrupts, freq; 1535 u64 interrupts, now;
1536 s64 delta;
1372 1537
1373 spin_lock(&ctx->lock); 1538 raw_spin_lock(&ctx->lock);
1374 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1539 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
1375 if (event->state != PERF_EVENT_STATE_ACTIVE) 1540 if (event->state != PERF_EVENT_STATE_ACTIVE)
1376 continue; 1541 continue;
1377 1542
1543 if (event->cpu != -1 && event->cpu != smp_processor_id())
1544 continue;
1545
1378 hwc = &event->hw; 1546 hwc = &event->hw;
1379 1547
1380 interrupts = hwc->interrupts; 1548 interrupts = hwc->interrupts;
@@ -1386,46 +1554,20 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1386 if (interrupts == MAX_INTERRUPTS) { 1554 if (interrupts == MAX_INTERRUPTS) {
1387 perf_log_throttle(event, 1); 1555 perf_log_throttle(event, 1);
1388 event->pmu->unthrottle(event); 1556 event->pmu->unthrottle(event);
1389 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1390 } 1557 }
1391 1558
1392 if (!event->attr.freq || !event->attr.sample_freq) 1559 if (!event->attr.freq || !event->attr.sample_freq)
1393 continue; 1560 continue;
1394 1561
1395 /* 1562 event->pmu->read(event);
1396 * if the specified freq < HZ then we need to skip ticks 1563 now = atomic64_read(&event->count);
1397 */ 1564 delta = now - hwc->freq_count_stamp;
1398 if (event->attr.sample_freq < HZ) { 1565 hwc->freq_count_stamp = now;
1399 freq = event->attr.sample_freq;
1400
1401 hwc->freq_count += freq;
1402 hwc->freq_interrupts += interrupts;
1403
1404 if (hwc->freq_count < HZ)
1405 continue;
1406
1407 interrupts = hwc->freq_interrupts;
1408 hwc->freq_interrupts = 0;
1409 hwc->freq_count -= HZ;
1410 } else
1411 freq = HZ;
1412
1413 perf_adjust_period(event, freq * interrupts);
1414 1566
1415 /* 1567 if (delta > 0)
1416 * In order to avoid being stalled by an (accidental) huge 1568 perf_adjust_period(event, TICK_NSEC, delta);
1417 * sample period, force reset the sample period if we didn't
1418 * get any events in this freq period.
1419 */
1420 if (!interrupts) {
1421 perf_disable();
1422 event->pmu->disable(event);
1423 atomic64_set(&hwc->period_left, 0);
1424 event->pmu->enable(event);
1425 perf_enable();
1426 }
1427 } 1569 }
1428 spin_unlock(&ctx->lock); 1570 raw_spin_unlock(&ctx->lock);
1429} 1571}
1430 1572
1431/* 1573/*
@@ -1433,26 +1575,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1433 */ 1575 */
1434static void rotate_ctx(struct perf_event_context *ctx) 1576static void rotate_ctx(struct perf_event_context *ctx)
1435{ 1577{
1436 struct perf_event *event;
1437
1438 if (!ctx->nr_events) 1578 if (!ctx->nr_events)
1439 return; 1579 return;
1440 1580
1441 spin_lock(&ctx->lock); 1581 raw_spin_lock(&ctx->lock);
1442 /* 1582
1443 * Rotate the first entry last (works just fine for group events too): 1583 /* Rotate the first entry last of non-pinned groups */
1444 */ 1584 list_rotate_left(&ctx->flexible_groups);
1445 perf_disable();
1446 list_for_each_entry(event, &ctx->group_list, group_entry) {
1447 list_move_tail(&event->group_entry, &ctx->group_list);
1448 break;
1449 }
1450 perf_enable();
1451 1585
1452 spin_unlock(&ctx->lock); 1586 raw_spin_unlock(&ctx->lock);
1453} 1587}
1454 1588
1455void perf_event_task_tick(struct task_struct *curr, int cpu) 1589void perf_event_task_tick(struct task_struct *curr)
1456{ 1590{
1457 struct perf_cpu_context *cpuctx; 1591 struct perf_cpu_context *cpuctx;
1458 struct perf_event_context *ctx; 1592 struct perf_event_context *ctx;
@@ -1460,24 +1594,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
1460 if (!atomic_read(&nr_events)) 1594 if (!atomic_read(&nr_events))
1461 return; 1595 return;
1462 1596
1463 cpuctx = &per_cpu(perf_cpu_context, cpu); 1597 cpuctx = &__get_cpu_var(perf_cpu_context);
1464 ctx = curr->perf_event_ctxp; 1598 ctx = curr->perf_event_ctxp;
1465 1599
1600 perf_disable();
1601
1466 perf_ctx_adjust_freq(&cpuctx->ctx); 1602 perf_ctx_adjust_freq(&cpuctx->ctx);
1467 if (ctx) 1603 if (ctx)
1468 perf_ctx_adjust_freq(ctx); 1604 perf_ctx_adjust_freq(ctx);
1469 1605
1470 perf_event_cpu_sched_out(cpuctx); 1606 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1471 if (ctx) 1607 if (ctx)
1472 __perf_event_task_sched_out(ctx); 1608 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1473 1609
1474 rotate_ctx(&cpuctx->ctx); 1610 rotate_ctx(&cpuctx->ctx);
1475 if (ctx) 1611 if (ctx)
1476 rotate_ctx(ctx); 1612 rotate_ctx(ctx);
1477 1613
1478 perf_event_cpu_sched_in(cpuctx, cpu); 1614 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1479 if (ctx) 1615 if (ctx)
1480 perf_event_task_sched_in(curr, cpu); 1616 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1617
1618 perf_enable();
1619}
1620
1621static int event_enable_on_exec(struct perf_event *event,
1622 struct perf_event_context *ctx)
1623{
1624 if (!event->attr.enable_on_exec)
1625 return 0;
1626
1627 event->attr.enable_on_exec = 0;
1628 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1629 return 0;
1630
1631 __perf_event_mark_enabled(event, ctx);
1632
1633 return 1;
1481} 1634}
1482 1635
1483/* 1636/*
@@ -1490,6 +1643,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1490 struct perf_event *event; 1643 struct perf_event *event;
1491 unsigned long flags; 1644 unsigned long flags;
1492 int enabled = 0; 1645 int enabled = 0;
1646 int ret;
1493 1647
1494 local_irq_save(flags); 1648 local_irq_save(flags);
1495 ctx = task->perf_event_ctxp; 1649 ctx = task->perf_event_ctxp;
@@ -1498,16 +1652,18 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1498 1652
1499 __perf_event_task_sched_out(ctx); 1653 __perf_event_task_sched_out(ctx);
1500 1654
1501 spin_lock(&ctx->lock); 1655 raw_spin_lock(&ctx->lock);
1502 1656
1503 list_for_each_entry(event, &ctx->group_list, group_entry) { 1657 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1504 if (!event->attr.enable_on_exec) 1658 ret = event_enable_on_exec(event, ctx);
1505 continue; 1659 if (ret)
1506 event->attr.enable_on_exec = 0; 1660 enabled = 1;
1507 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1661 }
1508 continue; 1662
1509 __perf_event_mark_enabled(event, ctx); 1663 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1510 enabled = 1; 1664 ret = event_enable_on_exec(event, ctx);
1665 if (ret)
1666 enabled = 1;
1511 } 1667 }
1512 1668
1513 /* 1669 /*
@@ -1516,9 +1672,9 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1516 if (enabled) 1672 if (enabled)
1517 unclone_ctx(ctx); 1673 unclone_ctx(ctx);
1518 1674
1519 spin_unlock(&ctx->lock); 1675 raw_spin_unlock(&ctx->lock);
1520 1676
1521 perf_event_task_sched_in(task, smp_processor_id()); 1677 perf_event_task_sched_in(task);
1522 out: 1678 out:
1523 local_irq_restore(flags); 1679 local_irq_restore(flags);
1524} 1680}
@@ -1542,10 +1698,10 @@ static void __perf_event_read(void *info)
1542 if (ctx->task && cpuctx->task_ctx != ctx) 1698 if (ctx->task && cpuctx->task_ctx != ctx)
1543 return; 1699 return;
1544 1700
1545 spin_lock(&ctx->lock); 1701 raw_spin_lock(&ctx->lock);
1546 update_context_time(ctx); 1702 update_context_time(ctx);
1547 update_event_times(event); 1703 update_event_times(event);
1548 spin_unlock(&ctx->lock); 1704 raw_spin_unlock(&ctx->lock);
1549 1705
1550 event->pmu->read(event); 1706 event->pmu->read(event);
1551} 1707}
@@ -1563,10 +1719,10 @@ static u64 perf_event_read(struct perf_event *event)
1563 struct perf_event_context *ctx = event->ctx; 1719 struct perf_event_context *ctx = event->ctx;
1564 unsigned long flags; 1720 unsigned long flags;
1565 1721
1566 spin_lock_irqsave(&ctx->lock, flags); 1722 raw_spin_lock_irqsave(&ctx->lock, flags);
1567 update_context_time(ctx); 1723 update_context_time(ctx);
1568 update_event_times(event); 1724 update_event_times(event);
1569 spin_unlock_irqrestore(&ctx->lock, flags); 1725 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1570 } 1726 }
1571 1727
1572 return atomic64_read(&event->count); 1728 return atomic64_read(&event->count);
@@ -1579,10 +1735,10 @@ static void
1579__perf_event_init_context(struct perf_event_context *ctx, 1735__perf_event_init_context(struct perf_event_context *ctx,
1580 struct task_struct *task) 1736 struct task_struct *task)
1581{ 1737{
1582 memset(ctx, 0, sizeof(*ctx)); 1738 raw_spin_lock_init(&ctx->lock);
1583 spin_lock_init(&ctx->lock);
1584 mutex_init(&ctx->mutex); 1739 mutex_init(&ctx->mutex);
1585 INIT_LIST_HEAD(&ctx->group_list); 1740 INIT_LIST_HEAD(&ctx->pinned_groups);
1741 INIT_LIST_HEAD(&ctx->flexible_groups);
1586 INIT_LIST_HEAD(&ctx->event_list); 1742 INIT_LIST_HEAD(&ctx->event_list);
1587 atomic_set(&ctx->refcount, 1); 1743 atomic_set(&ctx->refcount, 1);
1588 ctx->task = task; 1744 ctx->task = task;
@@ -1596,15 +1752,12 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1596 unsigned long flags; 1752 unsigned long flags;
1597 int err; 1753 int err;
1598 1754
1599 /* 1755 if (pid == -1 && cpu != -1) {
1600 * If cpu is not a wildcard then this is a percpu event:
1601 */
1602 if (cpu != -1) {
1603 /* Must be root to operate on a CPU event: */ 1756 /* Must be root to operate on a CPU event: */
1604 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) 1757 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
1605 return ERR_PTR(-EACCES); 1758 return ERR_PTR(-EACCES);
1606 1759
1607 if (cpu < 0 || cpu > num_possible_cpus()) 1760 if (cpu < 0 || cpu >= nr_cpumask_bits)
1608 return ERR_PTR(-EINVAL); 1761 return ERR_PTR(-EINVAL);
1609 1762
1610 /* 1763 /*
@@ -1612,7 +1765,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1612 * offline CPU and activate it when the CPU comes up, but 1765 * offline CPU and activate it when the CPU comes up, but
1613 * that's for later. 1766 * that's for later.
1614 */ 1767 */
1615 if (!cpu_isset(cpu, cpu_online_map)) 1768 if (!cpu_online(cpu))
1616 return ERR_PTR(-ENODEV); 1769 return ERR_PTR(-ENODEV);
1617 1770
1618 cpuctx = &per_cpu(perf_cpu_context, cpu); 1771 cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -1650,11 +1803,11 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu)
1650 ctx = perf_lock_task_context(task, &flags); 1803 ctx = perf_lock_task_context(task, &flags);
1651 if (ctx) { 1804 if (ctx) {
1652 unclone_ctx(ctx); 1805 unclone_ctx(ctx);
1653 spin_unlock_irqrestore(&ctx->lock, flags); 1806 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1654 } 1807 }
1655 1808
1656 if (!ctx) { 1809 if (!ctx) {
1657 ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); 1810 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
1658 err = -ENOMEM; 1811 err = -ENOMEM;
1659 if (!ctx) 1812 if (!ctx)
1660 goto errout; 1813 goto errout;
@@ -1988,7 +2141,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
1988 if (!value) 2141 if (!value)
1989 return -EINVAL; 2142 return -EINVAL;
1990 2143
1991 spin_lock_irq(&ctx->lock); 2144 raw_spin_lock_irq(&ctx->lock);
1992 if (event->attr.freq) { 2145 if (event->attr.freq) {
1993 if (value > sysctl_perf_event_sample_rate) { 2146 if (value > sysctl_perf_event_sample_rate) {
1994 ret = -EINVAL; 2147 ret = -EINVAL;
@@ -2001,7 +2154,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
2001 event->hw.sample_period = value; 2154 event->hw.sample_period = value;
2002 } 2155 }
2003unlock: 2156unlock:
2004 spin_unlock_irq(&ctx->lock); 2157 raw_spin_unlock_irq(&ctx->lock);
2005 2158
2006 return ret; 2159 return ret;
2007} 2160}
@@ -3254,8 +3407,6 @@ static void perf_event_task_output(struct perf_event *event,
3254 task_event->event_id.tid = perf_event_tid(event, task); 3407 task_event->event_id.tid = perf_event_tid(event, task);
3255 task_event->event_id.ptid = perf_event_tid(event, current); 3408 task_event->event_id.ptid = perf_event_tid(event, current);
3256 3409
3257 task_event->event_id.time = perf_clock();
3258
3259 perf_output_put(&handle, task_event->event_id); 3410 perf_output_put(&handle, task_event->event_id);
3260 3411
3261 perf_output_end(&handle); 3412 perf_output_end(&handle);
@@ -3263,6 +3414,12 @@ static void perf_event_task_output(struct perf_event *event,
3263 3414
3264static int perf_event_task_match(struct perf_event *event) 3415static int perf_event_task_match(struct perf_event *event)
3265{ 3416{
3417 if (event->state < PERF_EVENT_STATE_INACTIVE)
3418 return 0;
3419
3420 if (event->cpu != -1 && event->cpu != smp_processor_id())
3421 return 0;
3422
3266 if (event->attr.comm || event->attr.mmap || event->attr.task) 3423 if (event->attr.comm || event->attr.mmap || event->attr.task)
3267 return 1; 3424 return 1;
3268 3425
@@ -3288,12 +3445,11 @@ static void perf_event_task_event(struct perf_task_event *task_event)
3288 rcu_read_lock(); 3445 rcu_read_lock();
3289 cpuctx = &get_cpu_var(perf_cpu_context); 3446 cpuctx = &get_cpu_var(perf_cpu_context);
3290 perf_event_task_ctx(&cpuctx->ctx, task_event); 3447 perf_event_task_ctx(&cpuctx->ctx, task_event);
3291 put_cpu_var(perf_cpu_context);
3292
3293 if (!ctx) 3448 if (!ctx)
3294 ctx = rcu_dereference(task_event->task->perf_event_ctxp); 3449 ctx = rcu_dereference(current->perf_event_ctxp);
3295 if (ctx) 3450 if (ctx)
3296 perf_event_task_ctx(ctx, task_event); 3451 perf_event_task_ctx(ctx, task_event);
3452 put_cpu_var(perf_cpu_context);
3297 rcu_read_unlock(); 3453 rcu_read_unlock();
3298} 3454}
3299 3455
@@ -3321,6 +3477,7 @@ static void perf_event_task(struct task_struct *task,
3321 /* .ppid */ 3477 /* .ppid */
3322 /* .tid */ 3478 /* .tid */
3323 /* .ptid */ 3479 /* .ptid */
3480 .time = perf_clock(),
3324 }, 3481 },
3325 }; 3482 };
3326 3483
@@ -3370,6 +3527,12 @@ static void perf_event_comm_output(struct perf_event *event,
3370 3527
3371static int perf_event_comm_match(struct perf_event *event) 3528static int perf_event_comm_match(struct perf_event *event)
3372{ 3529{
3530 if (event->state < PERF_EVENT_STATE_INACTIVE)
3531 return 0;
3532
3533 if (event->cpu != -1 && event->cpu != smp_processor_id())
3534 return 0;
3535
3373 if (event->attr.comm) 3536 if (event->attr.comm)
3374 return 1; 3537 return 1;
3375 3538
@@ -3406,15 +3569,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
3406 rcu_read_lock(); 3569 rcu_read_lock();
3407 cpuctx = &get_cpu_var(perf_cpu_context); 3570 cpuctx = &get_cpu_var(perf_cpu_context);
3408 perf_event_comm_ctx(&cpuctx->ctx, comm_event); 3571 perf_event_comm_ctx(&cpuctx->ctx, comm_event);
3409 put_cpu_var(perf_cpu_context);
3410
3411 /*
3412 * doesn't really matter which of the child contexts the
3413 * events ends up in.
3414 */
3415 ctx = rcu_dereference(current->perf_event_ctxp); 3572 ctx = rcu_dereference(current->perf_event_ctxp);
3416 if (ctx) 3573 if (ctx)
3417 perf_event_comm_ctx(ctx, comm_event); 3574 perf_event_comm_ctx(ctx, comm_event);
3575 put_cpu_var(perf_cpu_context);
3418 rcu_read_unlock(); 3576 rcu_read_unlock();
3419} 3577}
3420 3578
@@ -3489,6 +3647,12 @@ static void perf_event_mmap_output(struct perf_event *event,
3489static int perf_event_mmap_match(struct perf_event *event, 3647static int perf_event_mmap_match(struct perf_event *event,
3490 struct perf_mmap_event *mmap_event) 3648 struct perf_mmap_event *mmap_event)
3491{ 3649{
3650 if (event->state < PERF_EVENT_STATE_INACTIVE)
3651 return 0;
3652
3653 if (event->cpu != -1 && event->cpu != smp_processor_id())
3654 return 0;
3655
3492 if (event->attr.mmap) 3656 if (event->attr.mmap)
3493 return 1; 3657 return 1;
3494 3658
@@ -3562,15 +3726,10 @@ got_name:
3562 rcu_read_lock(); 3726 rcu_read_lock();
3563 cpuctx = &get_cpu_var(perf_cpu_context); 3727 cpuctx = &get_cpu_var(perf_cpu_context);
3564 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); 3728 perf_event_mmap_ctx(&cpuctx->ctx, mmap_event);
3565 put_cpu_var(perf_cpu_context);
3566
3567 /*
3568 * doesn't really matter which of the child contexts the
3569 * events ends up in.
3570 */
3571 ctx = rcu_dereference(current->perf_event_ctxp); 3729 ctx = rcu_dereference(current->perf_event_ctxp);
3572 if (ctx) 3730 if (ctx)
3573 perf_event_mmap_ctx(ctx, mmap_event); 3731 perf_event_mmap_ctx(ctx, mmap_event);
3732 put_cpu_var(perf_cpu_context);
3574 rcu_read_unlock(); 3733 rcu_read_unlock();
3575 3734
3576 kfree(buf); 3735 kfree(buf);
@@ -3597,7 +3756,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3597 /* .tid */ 3756 /* .tid */
3598 .start = vma->vm_start, 3757 .start = vma->vm_start,
3599 .len = vma->vm_end - vma->vm_start, 3758 .len = vma->vm_end - vma->vm_start,
3600 .pgoff = vma->vm_pgoff, 3759 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3601 }, 3760 },
3602 }; 3761 };
3603 3762
@@ -3677,12 +3836,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3677 3836
3678 if (event->attr.freq) { 3837 if (event->attr.freq) {
3679 u64 now = perf_clock(); 3838 u64 now = perf_clock();
3680 s64 delta = now - hwc->freq_stamp; 3839 s64 delta = now - hwc->freq_time_stamp;
3681 3840
3682 hwc->freq_stamp = now; 3841 hwc->freq_time_stamp = now;
3683 3842
3684 if (delta > 0 && delta < TICK_NSEC) 3843 if (delta > 0 && delta < 2*TICK_NSEC)
3685 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3844 perf_adjust_period(event, delta, hwc->last_period);
3686 } 3845 }
3687 3846
3688 /* 3847 /*
@@ -3861,6 +4020,9 @@ static int perf_swevent_match(struct perf_event *event,
3861 struct perf_sample_data *data, 4020 struct perf_sample_data *data,
3862 struct pt_regs *regs) 4021 struct pt_regs *regs)
3863{ 4022{
4023 if (event->cpu != -1 && event->cpu != smp_processor_id())
4024 return 0;
4025
3864 if (!perf_swevent_is_counting(event)) 4026 if (!perf_swevent_is_counting(event))
3865 return 0; 4027 return 0;
3866 4028
@@ -4011,6 +4173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4011 event->pmu->read(event); 4173 event->pmu->read(event);
4012 4174
4013 data.addr = 0; 4175 data.addr = 0;
4176 data.raw = NULL;
4014 data.period = event->hw.last_period; 4177 data.period = event->hw.last_period;
4015 regs = get_irq_regs(); 4178 regs = get_irq_regs();
4016 /* 4179 /*
@@ -4080,8 +4243,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event)
4080 u64 now; 4243 u64 now;
4081 4244
4082 now = cpu_clock(cpu); 4245 now = cpu_clock(cpu);
4083 prev = atomic64_read(&event->hw.prev_count); 4246 prev = atomic64_xchg(&event->hw.prev_count, now);
4084 atomic64_set(&event->hw.prev_count, now);
4085 atomic64_add(now - prev, &event->count); 4247 atomic64_add(now - prev, &event->count);
4086} 4248}
4087 4249
@@ -4170,7 +4332,7 @@ static const struct pmu perf_ops_task_clock = {
4170 .read = task_clock_perf_event_read, 4332 .read = task_clock_perf_event_read,
4171}; 4333};
4172 4334
4173#ifdef CONFIG_EVENT_PROFILE 4335#ifdef CONFIG_EVENT_TRACING
4174 4336
4175void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4337void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4176 int entry_size) 4338 int entry_size)
@@ -4275,7 +4437,7 @@ static void perf_event_free_filter(struct perf_event *event)
4275{ 4437{
4276} 4438}
4277 4439
4278#endif /* CONFIG_EVENT_PROFILE */ 4440#endif /* CONFIG_EVENT_TRACING */
4279 4441
4280#ifdef CONFIG_HAVE_HW_BREAKPOINT 4442#ifdef CONFIG_HAVE_HW_BREAKPOINT
4281static void bp_perf_event_destroy(struct perf_event *event) 4443static void bp_perf_event_destroy(struct perf_event *event)
@@ -4286,15 +4448,8 @@ static void bp_perf_event_destroy(struct perf_event *event)
4286static const struct pmu *bp_perf_event_init(struct perf_event *bp) 4448static const struct pmu *bp_perf_event_init(struct perf_event *bp)
4287{ 4449{
4288 int err; 4450 int err;
4289 /* 4451
4290 * The breakpoint is already filled if we haven't created the counter 4452 err = register_perf_hw_breakpoint(bp);
4291 * through perf syscall
4292 * FIXME: manage to get trigerred to NULL if it comes from syscalls
4293 */
4294 if (!bp->callback)
4295 err = register_perf_hw_breakpoint(bp);
4296 else
4297 err = __register_perf_hw_breakpoint(bp);
4298 if (err) 4453 if (err)
4299 return ERR_PTR(err); 4454 return ERR_PTR(err);
4300 4455
@@ -4308,6 +4463,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
4308 struct perf_sample_data sample; 4463 struct perf_sample_data sample;
4309 struct pt_regs *regs = data; 4464 struct pt_regs *regs = data;
4310 4465
4466 sample.raw = NULL;
4311 sample.addr = bp->attr.bp_addr; 4467 sample.addr = bp->attr.bp_addr;
4312 4468
4313 if (!perf_exclude_event(bp, regs)) 4469 if (!perf_exclude_event(bp, regs))
@@ -4390,7 +4546,7 @@ perf_event_alloc(struct perf_event_attr *attr,
4390 struct perf_event_context *ctx, 4546 struct perf_event_context *ctx,
4391 struct perf_event *group_leader, 4547 struct perf_event *group_leader,
4392 struct perf_event *parent_event, 4548 struct perf_event *parent_event,
4393 perf_callback_t callback, 4549 perf_overflow_handler_t overflow_handler,
4394 gfp_t gfpflags) 4550 gfp_t gfpflags)
4395{ 4551{
4396 const struct pmu *pmu; 4552 const struct pmu *pmu;
@@ -4433,10 +4589,10 @@ perf_event_alloc(struct perf_event_attr *attr,
4433 4589
4434 event->state = PERF_EVENT_STATE_INACTIVE; 4590 event->state = PERF_EVENT_STATE_INACTIVE;
4435 4591
4436 if (!callback && parent_event) 4592 if (!overflow_handler && parent_event)
4437 callback = parent_event->callback; 4593 overflow_handler = parent_event->overflow_handler;
4438 4594
4439 event->callback = callback; 4595 event->overflow_handler = overflow_handler;
4440 4596
4441 if (attr->disabled) 4597 if (attr->disabled)
4442 event->state = PERF_EVENT_STATE_OFF; 4598 event->state = PERF_EVENT_STATE_OFF;
@@ -4571,7 +4727,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
4571 if (attr->type >= PERF_TYPE_MAX) 4727 if (attr->type >= PERF_TYPE_MAX)
4572 return -EINVAL; 4728 return -EINVAL;
4573 4729
4574 if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) 4730 if (attr->__reserved_1)
4575 return -EINVAL; 4731 return -EINVAL;
4576 4732
4577 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) 4733 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
@@ -4724,7 +4880,7 @@ SYSCALL_DEFINE5(perf_event_open,
4724 if (IS_ERR(event)) 4880 if (IS_ERR(event))
4725 goto err_put_context; 4881 goto err_put_context;
4726 4882
4727 err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); 4883 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR);
4728 if (err < 0) 4884 if (err < 0)
4729 goto err_free_put_context; 4885 goto err_free_put_context;
4730 4886
@@ -4776,7 +4932,8 @@ err_put_context:
4776 */ 4932 */
4777struct perf_event * 4933struct perf_event *
4778perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 4934perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4779 pid_t pid, perf_callback_t callback) 4935 pid_t pid,
4936 perf_overflow_handler_t overflow_handler)
4780{ 4937{
4781 struct perf_event *event; 4938 struct perf_event *event;
4782 struct perf_event_context *ctx; 4939 struct perf_event_context *ctx;
@@ -4793,7 +4950,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
4793 } 4950 }
4794 4951
4795 event = perf_event_alloc(attr, cpu, ctx, NULL, 4952 event = perf_event_alloc(attr, cpu, ctx, NULL,
4796 NULL, callback, GFP_KERNEL); 4953 NULL, overflow_handler, GFP_KERNEL);
4797 if (IS_ERR(event)) { 4954 if (IS_ERR(event)) {
4798 err = PTR_ERR(event); 4955 err = PTR_ERR(event);
4799 goto err_put_context; 4956 goto err_put_context;
@@ -4861,8 +5018,15 @@ inherit_event(struct perf_event *parent_event,
4861 else 5018 else
4862 child_event->state = PERF_EVENT_STATE_OFF; 5019 child_event->state = PERF_EVENT_STATE_OFF;
4863 5020
4864 if (parent_event->attr.freq) 5021 if (parent_event->attr.freq) {
4865 child_event->hw.sample_period = parent_event->hw.sample_period; 5022 u64 sample_period = parent_event->hw.sample_period;
5023 struct hw_perf_event *hwc = &child_event->hw;
5024
5025 hwc->sample_period = sample_period;
5026 hwc->last_period = sample_period;
5027
5028 atomic64_set(&hwc->period_left, sample_period);
5029 }
4866 5030
4867 child_event->overflow_handler = parent_event->overflow_handler; 5031 child_event->overflow_handler = parent_event->overflow_handler;
4868 5032
@@ -4998,7 +5162,7 @@ void perf_event_exit_task(struct task_struct *child)
4998 * reading child->perf_event_ctxp, we wait until it has 5162 * reading child->perf_event_ctxp, we wait until it has
4999 * incremented the context's refcount before we do put_ctx below. 5163 * incremented the context's refcount before we do put_ctx below.
5000 */ 5164 */
5001 spin_lock(&child_ctx->lock); 5165 raw_spin_lock(&child_ctx->lock);
5002 child->perf_event_ctxp = NULL; 5166 child->perf_event_ctxp = NULL;
5003 /* 5167 /*
5004 * If this context is a clone; unclone it so it can't get 5168 * If this context is a clone; unclone it so it can't get
@@ -5007,7 +5171,7 @@ void perf_event_exit_task(struct task_struct *child)
5007 */ 5171 */
5008 unclone_ctx(child_ctx); 5172 unclone_ctx(child_ctx);
5009 update_context_time(child_ctx); 5173 update_context_time(child_ctx);
5010 spin_unlock_irqrestore(&child_ctx->lock, flags); 5174 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
5011 5175
5012 /* 5176 /*
5013 * Report the task dead after unscheduling the events so that we 5177 * Report the task dead after unscheduling the events so that we
@@ -5030,7 +5194,11 @@ void perf_event_exit_task(struct task_struct *child)
5030 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5194 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5031 5195
5032again: 5196again:
5033 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5197 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5198 group_entry)
5199 __perf_event_exit_task(child_event, child_ctx, child);
5200
5201 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5034 group_entry) 5202 group_entry)
5035 __perf_event_exit_task(child_event, child_ctx, child); 5203 __perf_event_exit_task(child_event, child_ctx, child);
5036 5204
@@ -5039,7 +5207,8 @@ again:
5039 * its siblings to the list, but we obtained 'tmp' before that which 5207 * its siblings to the list, but we obtained 'tmp' before that which
5040 * will still point to the list head terminating the iteration. 5208 * will still point to the list head terminating the iteration.
5041 */ 5209 */
5042 if (!list_empty(&child_ctx->group_list)) 5210 if (!list_empty(&child_ctx->pinned_groups) ||
5211 !list_empty(&child_ctx->flexible_groups))
5043 goto again; 5212 goto again;
5044 5213
5045 mutex_unlock(&child_ctx->mutex); 5214 mutex_unlock(&child_ctx->mutex);
@@ -5047,6 +5216,24 @@ again:
5047 put_ctx(child_ctx); 5216 put_ctx(child_ctx);
5048} 5217}
5049 5218
5219static void perf_free_event(struct perf_event *event,
5220 struct perf_event_context *ctx)
5221{
5222 struct perf_event *parent = event->parent;
5223
5224 if (WARN_ON_ONCE(!parent))
5225 return;
5226
5227 mutex_lock(&parent->child_mutex);
5228 list_del_init(&event->child_list);
5229 mutex_unlock(&parent->child_mutex);
5230
5231 fput(parent->filp);
5232
5233 list_del_event(event, ctx);
5234 free_event(event);
5235}
5236
5050/* 5237/*
5051 * free an unexposed, unused context as created by inheritance by 5238 * free an unexposed, unused context as created by inheritance by
5052 * init_task below, used by fork() in case of fail. 5239 * init_task below, used by fork() in case of fail.
@@ -5061,30 +5248,64 @@ void perf_event_free_task(struct task_struct *task)
5061 5248
5062 mutex_lock(&ctx->mutex); 5249 mutex_lock(&ctx->mutex);
5063again: 5250again:
5064 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5251 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5065 struct perf_event *parent = event->parent; 5252 perf_free_event(event, ctx);
5066 5253
5067 if (WARN_ON_ONCE(!parent)) 5254 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5068 continue; 5255 group_entry)
5256 perf_free_event(event, ctx);
5069 5257
5070 mutex_lock(&parent->child_mutex); 5258 if (!list_empty(&ctx->pinned_groups) ||
5071 list_del_init(&event->child_list); 5259 !list_empty(&ctx->flexible_groups))
5072 mutex_unlock(&parent->child_mutex); 5260 goto again;
5073 5261
5074 fput(parent->filp); 5262 mutex_unlock(&ctx->mutex);
5075 5263
5076 list_del_event(event, ctx); 5264 put_ctx(ctx);
5077 free_event(event); 5265}
5266
5267static int
5268inherit_task_group(struct perf_event *event, struct task_struct *parent,
5269 struct perf_event_context *parent_ctx,
5270 struct task_struct *child,
5271 int *inherited_all)
5272{
5273 int ret;
5274 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5275
5276 if (!event->attr.inherit) {
5277 *inherited_all = 0;
5278 return 0;
5078 } 5279 }
5079 5280
5080 if (!list_empty(&ctx->group_list)) 5281 if (!child_ctx) {
5081 goto again; 5282 /*
5283 * This is executed from the parent task context, so
5284 * inherit events that have been marked for cloning.
5285 * First allocate and initialize a context for the
5286 * child.
5287 */
5082 5288
5083 mutex_unlock(&ctx->mutex); 5289 child_ctx = kzalloc(sizeof(struct perf_event_context),
5290 GFP_KERNEL);
5291 if (!child_ctx)
5292 return -ENOMEM;
5084 5293
5085 put_ctx(ctx); 5294 __perf_event_init_context(child_ctx, child);
5295 child->perf_event_ctxp = child_ctx;
5296 get_task_struct(child);
5297 }
5298
5299 ret = inherit_group(event, parent, parent_ctx,
5300 child, child_ctx);
5301
5302 if (ret)
5303 *inherited_all = 0;
5304
5305 return ret;
5086} 5306}
5087 5307
5308
5088/* 5309/*
5089 * Initialize the perf_event context in task_struct 5310 * Initialize the perf_event context in task_struct
5090 */ 5311 */
@@ -5106,20 +5327,6 @@ int perf_event_init_task(struct task_struct *child)
5106 return 0; 5327 return 0;
5107 5328
5108 /* 5329 /*
5109 * This is executed from the parent task context, so inherit
5110 * events that have been marked for cloning.
5111 * First allocate and initialize a context for the child.
5112 */
5113
5114 child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL);
5115 if (!child_ctx)
5116 return -ENOMEM;
5117
5118 __perf_event_init_context(child_ctx, child);
5119 child->perf_event_ctxp = child_ctx;
5120 get_task_struct(child);
5121
5122 /*
5123 * If the parent's context is a clone, pin it so it won't get 5330 * If the parent's context is a clone, pin it so it won't get
5124 * swapped under us. 5331 * swapped under us.
5125 */ 5332 */
@@ -5142,22 +5349,23 @@ int perf_event_init_task(struct task_struct *child)
5142 * We dont have to disable NMIs - we are only looking at 5349 * We dont have to disable NMIs - we are only looking at
5143 * the list, not manipulating it: 5350 * the list, not manipulating it:
5144 */ 5351 */
5145 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5352 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5146 5353 ret = inherit_task_group(event, parent, parent_ctx, child,
5147 if (!event->attr.inherit) { 5354 &inherited_all);
5148 inherited_all = 0; 5355 if (ret)
5149 continue; 5356 break;
5150 } 5357 }
5151 5358
5152 ret = inherit_group(event, parent, parent_ctx, 5359 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5153 child, child_ctx); 5360 ret = inherit_task_group(event, parent, parent_ctx, child,
5154 if (ret) { 5361 &inherited_all);
5155 inherited_all = 0; 5362 if (ret)
5156 break; 5363 break;
5157 }
5158 } 5364 }
5159 5365
5160 if (inherited_all) { 5366 child_ctx = child->perf_event_ctxp;
5367
5368 if (child_ctx && inherited_all) {
5161 /* 5369 /*
5162 * Mark the child context as a clone of the parent 5370 * Mark the child context as a clone of the parent
5163 * context, or of whatever the parent is a clone of. 5371 * context, or of whatever the parent is a clone of.
@@ -5205,7 +5413,9 @@ static void __perf_event_exit_cpu(void *info)
5205 struct perf_event_context *ctx = &cpuctx->ctx; 5413 struct perf_event_context *ctx = &cpuctx->ctx;
5206 struct perf_event *event, *tmp; 5414 struct perf_event *event, *tmp;
5207 5415
5208 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5416 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5417 __perf_event_remove_from_context(event);
5418 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5209 __perf_event_remove_from_context(event); 5419 __perf_event_remove_from_context(event);
5210} 5420}
5211static void perf_event_exit_cpu(int cpu) 5421static void perf_event_exit_cpu(int cpu)
@@ -5243,6 +5453,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5243 perf_event_exit_cpu(cpu); 5453 perf_event_exit_cpu(cpu);
5244 break; 5454 break;
5245 5455
5456 case CPU_DEAD:
5457 hw_perf_event_setup_offline(cpu);
5458 break;
5459
5246 default: 5460 default:
5247 break; 5461 break;
5248 } 5462 }
@@ -5291,11 +5505,11 @@ perf_set_reserve_percpu(struct sysdev_class *class,
5291 perf_reserved_percpu = val; 5505 perf_reserved_percpu = val;
5292 for_each_online_cpu(cpu) { 5506 for_each_online_cpu(cpu) {
5293 cpuctx = &per_cpu(perf_cpu_context, cpu); 5507 cpuctx = &per_cpu(perf_cpu_context, cpu);
5294 spin_lock_irq(&cpuctx->ctx.lock); 5508 raw_spin_lock_irq(&cpuctx->ctx.lock);
5295 mpt = min(perf_max_events - cpuctx->ctx.nr_events, 5509 mpt = min(perf_max_events - cpuctx->ctx.nr_events,
5296 perf_max_events - perf_reserved_percpu); 5510 perf_max_events - perf_reserved_percpu);
5297 cpuctx->max_pertask = mpt; 5511 cpuctx->max_pertask = mpt;
5298 spin_unlock_irq(&cpuctx->ctx.lock); 5512 raw_spin_unlock_irq(&cpuctx->ctx.lock);
5299 } 5513 }
5300 spin_unlock(&perf_resource_lock); 5514 spin_unlock(&perf_resource_lock);
5301 5515
diff --git a/kernel/pid.c b/kernel/pid.c
index d3f722d20f9c..b08e697cd83f 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -141,11 +141,12 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
141 * installing it: 141 * installing it:
142 */ 142 */
143 spin_lock_irq(&pidmap_lock); 143 spin_lock_irq(&pidmap_lock);
144 if (map->page) 144 if (!map->page) {
145 kfree(page);
146 else
147 map->page = page; 145 map->page = page;
146 page = NULL;
147 }
148 spin_unlock_irq(&pidmap_lock); 148 spin_unlock_irq(&pidmap_lock);
149 kfree(page);
149 if (unlikely(!map->page)) 150 if (unlikely(!map->page))
150 break; 151 break;
151 } 152 }
@@ -268,12 +269,11 @@ struct pid *alloc_pid(struct pid_namespace *ns)
268 for (type = 0; type < PIDTYPE_MAX; ++type) 269 for (type = 0; type < PIDTYPE_MAX; ++type)
269 INIT_HLIST_HEAD(&pid->tasks[type]); 270 INIT_HLIST_HEAD(&pid->tasks[type]);
270 271
272 upid = pid->numbers + ns->level;
271 spin_lock_irq(&pidmap_lock); 273 spin_lock_irq(&pidmap_lock);
272 for (i = ns->level; i >= 0; i--) { 274 for ( ; upid >= pid->numbers; --upid)
273 upid = &pid->numbers[i];
274 hlist_add_head_rcu(&upid->pid_chain, 275 hlist_add_head_rcu(&upid->pid_chain,
275 &pid_hash[pid_hashfn(upid->nr, upid->ns)]); 276 &pid_hash[pid_hashfn(upid->nr, upid->ns)]);
276 }
277 spin_unlock_irq(&pidmap_lock); 277 spin_unlock_irq(&pidmap_lock);
278 278
279out: 279out:
@@ -367,7 +367,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
367 struct task_struct *result = NULL; 367 struct task_struct *result = NULL;
368 if (pid) { 368 if (pid) {
369 struct hlist_node *first; 369 struct hlist_node *first;
370 first = rcu_dereference(pid->tasks[type].first); 370 first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock));
371 if (first) 371 if (first)
372 result = hlist_entry(first, struct task_struct, pids[(type)].node); 372 result = hlist_entry(first, struct task_struct, pids[(type)].node);
373 } 373 }
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index dfdec524d1b7..3db49b9ca374 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,7 +29,6 @@
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
33#include <linux/spinlock.h> 32#include <linux/spinlock.h>
34#include <linux/slab.h> 33#include <linux/slab.h>
35#include <linux/time.h> 34#include <linux/time.h>
@@ -344,37 +343,33 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344} 343}
345EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
346 345
347#define PID_NAME_LEN sizeof("process_1234567890") 346#define PID_NAME_LEN 32
348static char name[PID_NAME_LEN];
349 347
350static int pm_qos_power_open(struct inode *inode, struct file *filp) 348static int pm_qos_power_open(struct inode *inode, struct file *filp)
351{ 349{
352 int ret; 350 int ret;
353 long pm_qos_class; 351 long pm_qos_class;
352 char name[PID_NAME_LEN];
354 353
355 lock_kernel();
356 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
357 if (pm_qos_class >= 0) { 355 if (pm_qos_class >= 0) {
358 filp->private_data = (void *)pm_qos_class; 356 filp->private_data = (void *)pm_qos_class;
359 sprintf(name, "process_%d", current->pid); 357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
360 ret = pm_qos_add_requirement(pm_qos_class, name, 358 ret = pm_qos_add_requirement(pm_qos_class, name,
361 PM_QOS_DEFAULT_VALUE); 359 PM_QOS_DEFAULT_VALUE);
362 if (ret >= 0) { 360 if (ret >= 0)
363 unlock_kernel();
364 return 0; 361 return 0;
365 }
366 } 362 }
367 unlock_kernel();
368
369 return -EPERM; 363 return -EPERM;
370} 364}
371 365
372static int pm_qos_power_release(struct inode *inode, struct file *filp) 366static int pm_qos_power_release(struct inode *inode, struct file *filp)
373{ 367{
374 int pm_qos_class; 368 int pm_qos_class;
369 char name[PID_NAME_LEN];
375 370
376 pm_qos_class = (long)filp->private_data; 371 pm_qos_class = (long)filp->private_data;
377 sprintf(name, "process_%d", current->pid); 372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
378 pm_qos_remove_requirement(pm_qos_class, name); 373 pm_qos_remove_requirement(pm_qos_class, name);
379 374
380 return 0; 375 return 0;
@@ -385,13 +380,14 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
385{ 380{
386 s32 value; 381 s32 value;
387 int pm_qos_class; 382 int pm_qos_class;
383 char name[PID_NAME_LEN];
388 384
389 pm_qos_class = (long)filp->private_data; 385 pm_qos_class = (long)filp->private_data;
390 if (count != sizeof(s32)) 386 if (count != sizeof(s32))
391 return -EINVAL; 387 return -EINVAL;
392 if (copy_from_user(&value, buf, sizeof(s32))) 388 if (copy_from_user(&value, buf, sizeof(s32)))
393 return -EFAULT; 389 return -EFAULT;
394 sprintf(name, "process_%d", current->pid); 390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
395 pm_qos_update_requirement(pm_qos_class, name, value); 391 pm_qos_update_requirement(pm_qos_class, name, value);
396 392
397 return sizeof(s32); 393 return sizeof(s32);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 495440779ce3..00d1fda58ab6 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -256,7 +256,7 @@ static int posix_get_monotonic_coarse(clockid_t which_clock,
256 return 0; 256 return 0;
257} 257}
258 258
259int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) 259static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp)
260{ 260{
261 *tp = ktime_to_timespec(KTIME_LOW_RES); 261 *tp = ktime_to_timespec(KTIME_LOW_RES);
262 return 0; 262 return 0;
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 91e09d3b2eb2..5c36ea9d55d2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,15 @@ config PM_DEBUG
27 code. This is helpful when debugging and reporting PM bugs, like 27 code. This is helpful when debugging and reporting PM bugs, like
28 suspend support. 28 suspend support.
29 29
30config PM_ADVANCED_DEBUG
31 bool "Extra PM attributes in sysfs for low-level debugging/testing"
32 depends on PM_DEBUG
33 default n
34 ---help---
35 Add extra sysfs attributes allowing one to access some Power Management
36 fields of device objects from user space. If you are not a kernel
37 developer interested in debugging/testing Power Management, say "no".
38
30config PM_VERBOSE 39config PM_VERBOSE
31 bool "Verbose Power Management debugging" 40 bool "Verbose Power Management debugging"
32 depends on PM_DEBUG 41 depends on PM_DEBUG
@@ -85,6 +94,11 @@ config PM_SLEEP
85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE 94 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 95 default y
87 96
97config PM_SLEEP_ADVANCED_DEBUG
98 bool
99 depends on PM_ADVANCED_DEBUG
100 default n
101
88config SUSPEND 102config SUSPEND
89 bool "Suspend to RAM and standby" 103 bool "Suspend to RAM and standby"
90 depends on PM && ARCH_SUSPEND_POSSIBLE 104 depends on PM && ARCH_SUSPEND_POSSIBLE
@@ -222,3 +236,8 @@ config PM_RUNTIME
222 and the bus type drivers of the buses the devices are on are 236 and the bus type drivers of the buses the devices are on are
223 responsible for the actual handling of the autosuspend requests and 237 responsible for the actual handling of the autosuspend requests and
224 wake-up events. 238 wake-up events.
239
240config PM_OPS
241 bool
242 depends on PM_SLEEP || PM_RUNTIME
243 default y
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 5187136fe1de..218e5af90156 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/vt_kern.h> 7#include <linux/vt_kern.h>
8#include <linux/kbd_kern.h> 8#include <linux/kbd_kern.h>
9#include <linux/console.h> 9#include <linux/vt.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
@@ -21,8 +21,7 @@ int pm_prepare_console(void)
21 if (orig_fgconsole < 0) 21 if (orig_fgconsole < 0)
22 return 1; 22 return 1;
23 23
24 orig_kmsg = kmsg_redirect; 24 orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
25 kmsg_redirect = SUSPEND_CONSOLE;
26 return 0; 25 return 0;
27} 26}
28 27
@@ -30,7 +29,7 @@ void pm_restore_console(void)
30{ 29{
31 if (orig_fgconsole >= 0) { 30 if (orig_fgconsole >= 0) {
32 vt_move_to_console(orig_fgconsole, 0); 31 vt_move_to_console(orig_fgconsole, 0);
33 kmsg_redirect = orig_kmsg; 32 vt_kmsg_redirect(orig_kmsg);
34 } 33 }
35} 34}
36#endif 35#endif
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0998c7139053..b58800b21fc0 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val)
44 == NOTIFY_BAD) ? -EINVAL : 0; 44 == NOTIFY_BAD) ? -EINVAL : 0;
45} 45}
46 46
47/* If set, devices may be suspended and resumed asynchronously. */
48int pm_async_enabled = 1;
49
50static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr,
51 char *buf)
52{
53 return sprintf(buf, "%d\n", pm_async_enabled);
54}
55
56static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr,
57 const char *buf, size_t n)
58{
59 unsigned long val;
60
61 if (strict_strtoul(buf, 10, &val))
62 return -EINVAL;
63
64 if (val > 1)
65 return -EINVAL;
66
67 pm_async_enabled = val;
68 return n;
69}
70
71power_attr(pm_async);
72
47#ifdef CONFIG_PM_DEBUG 73#ifdef CONFIG_PM_DEBUG
48int pm_test_level = TEST_NONE; 74int pm_test_level = TEST_NONE;
49 75
@@ -208,9 +234,12 @@ static struct attribute * g[] = {
208#ifdef CONFIG_PM_TRACE 234#ifdef CONFIG_PM_TRACE
209 &pm_trace_attr.attr, 235 &pm_trace_attr.attr,
210#endif 236#endif
211#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) 237#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr,
239#ifdef CONFIG_PM_DEBUG
212 &pm_test_attr.attr, 240 &pm_test_attr.attr,
213#endif 241#endif
242#endif
214 NULL, 243 NULL,
215}; 244};
216 245
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 36cb168e4330..830cadecbdfc 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void)
1181 1181
1182 memory_bm_position_reset(&copy_bm); 1182 memory_bm_position_reset(&copy_bm);
1183 1183
1184 while (to_free_normal > 0 && to_free_highmem > 0) { 1184 while (to_free_normal > 0 || to_free_highmem > 0) {
1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm); 1185 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
1186 struct page *page = pfn_to_page(pfn); 1186 struct page *page = pfn_to_page(pfn);
1187 1187
@@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void)
1500{ 1500{
1501 unsigned int nr_pages, nr_highmem; 1501 unsigned int nr_pages, nr_highmem;
1502 1502
1503 printk(KERN_INFO "PM: Creating hibernation image: \n"); 1503 printk(KERN_INFO "PM: Creating hibernation image:\n");
1504 1504
1505 drain_local_pages(NULL); 1505 drain_local_pages(NULL);
1506 nr_pages = count_data_pages(); 1506 nr_pages = count_data_pages();
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 09b2b0ae9e9d..1d575733d4e1 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p)
657 struct swsusp_info *header; 657 struct swsusp_info *header;
658 658
659 *flags_p = swsusp_header->flags; 659 *flags_p = swsusp_header->flags;
660 if (IS_ERR(resume_bdev)) {
661 pr_debug("PM: Image device not initialised\n");
662 return PTR_ERR(resume_bdev);
663 }
664 660
665 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 661 memset(&snapshot, 0, sizeof(struct snapshot_handle));
666 error = snapshot_write_next(&snapshot, PAGE_SIZE); 662 error = snapshot_write_next(&snapshot, PAGE_SIZE);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
deleted file mode 100644
index 5b3601bd1893..000000000000
--- a/kernel/power/swsusp.c
+++ /dev/null
@@ -1,58 +0,0 @@
1/*
2 * linux/kernel/power/swsusp.c
3 *
4 * This file provides code to write suspend image to swap and read it back.
5 *
6 * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu>
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz>
8 *
9 * This file is released under the GPLv2.
10 *
11 * I'd like to thank the following people for their work:
12 *
13 * Pavel Machek <pavel@ucw.cz>:
14 * Modifications, defectiveness pointing, being with me at the very beginning,
15 * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17.
16 *
17 * Steve Doddi <dirk@loth.demon.co.uk>:
18 * Support the possibility of hardware state restoring.
19 *
20 * Raph <grey.havens@earthling.net>:
21 * Support for preserving states of network devices and virtual console
22 * (including X and svgatextmode)
23 *
24 * Kurt Garloff <garloff@suse.de>:
25 * Straightened the critical function in order to prevent compilers from
26 * playing tricks with local variables.
27 *
28 * Andreas Mohr <a.mohr@mailto.de>
29 *
30 * Alex Badea <vampire@go.ro>:
31 * Fixed runaway init
32 *
33 * Rafael J. Wysocki <rjw@sisk.pl>
34 * Reworked the freeing of memory and the handling of swap
35 *
36 * More state savers are welcome. Especially for the scsi layer...
37 *
38 * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt
39 */
40
41#include <linux/mm.h>
42#include <linux/suspend.h>
43#include <linux/spinlock.h>
44#include <linux/kernel.h>
45#include <linux/major.h>
46#include <linux/swap.h>
47#include <linux/pm.h>
48#include <linux/swapops.h>
49#include <linux/bootmem.h>
50#include <linux/syscalls.h>
51#include <linux/highmem.h>
52#include <linux/time.h>
53#include <linux/rbtree.h>
54#include <linux/io.h>
55
56#include "power.h"
57
58int in_suspend __nosavedata = 0;
diff --git a/kernel/power/user.c b/kernel/power/user.c
index bf0014d6a5f0..4d2289626a84 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
195 return res; 195 return res;
196} 196}
197 197
198static void snapshot_deprecated_ioctl(unsigned int cmd)
199{
200 if (printk_ratelimit())
201 printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will "
202 "be removed soon, update your suspend-to-disk "
203 "utilities\n",
204 __builtin_return_address(0), cmd);
205}
206
198static long snapshot_ioctl(struct file *filp, unsigned int cmd, 207static long snapshot_ioctl(struct file *filp, unsigned int cmd,
199 unsigned long arg) 208 unsigned long arg)
200{ 209{
@@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
246 data->frozen = 0; 255 data->frozen = 0;
247 break; 256 break;
248 257
249 case SNAPSHOT_CREATE_IMAGE:
250 case SNAPSHOT_ATOMIC_SNAPSHOT: 258 case SNAPSHOT_ATOMIC_SNAPSHOT:
259 snapshot_deprecated_ioctl(cmd);
260 case SNAPSHOT_CREATE_IMAGE:
251 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 261 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
252 error = -EPERM; 262 error = -EPERM;
253 break; 263 break;
@@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
275 data->ready = 0; 285 data->ready = 0;
276 break; 286 break;
277 287
278 case SNAPSHOT_PREF_IMAGE_SIZE:
279 case SNAPSHOT_SET_IMAGE_SIZE: 288 case SNAPSHOT_SET_IMAGE_SIZE:
289 snapshot_deprecated_ioctl(cmd);
290 case SNAPSHOT_PREF_IMAGE_SIZE:
280 image_size = arg; 291 image_size = arg;
281 break; 292 break;
282 293
@@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
290 error = put_user(size, (loff_t __user *)arg); 301 error = put_user(size, (loff_t __user *)arg);
291 break; 302 break;
292 303
293 case SNAPSHOT_AVAIL_SWAP_SIZE:
294 case SNAPSHOT_AVAIL_SWAP: 304 case SNAPSHOT_AVAIL_SWAP:
305 snapshot_deprecated_ioctl(cmd);
306 case SNAPSHOT_AVAIL_SWAP_SIZE:
295 size = count_swap_pages(data->swap, 1); 307 size = count_swap_pages(data->swap, 1);
296 size <<= PAGE_SHIFT; 308 size <<= PAGE_SHIFT;
297 error = put_user(size, (loff_t __user *)arg); 309 error = put_user(size, (loff_t __user *)arg);
298 break; 310 break;
299 311
300 case SNAPSHOT_ALLOC_SWAP_PAGE:
301 case SNAPSHOT_GET_SWAP_PAGE: 312 case SNAPSHOT_GET_SWAP_PAGE:
313 snapshot_deprecated_ioctl(cmd);
314 case SNAPSHOT_ALLOC_SWAP_PAGE:
302 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 315 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
303 error = -ENODEV; 316 error = -ENODEV;
304 break; 317 break;
@@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
321 break; 334 break;
322 335
323 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ 336 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
337 snapshot_deprecated_ioctl(cmd);
324 if (!swsusp_swap_in_use()) { 338 if (!swsusp_swap_in_use()) {
325 /* 339 /*
326 * User space encodes device types as two-byte values, 340 * User space encodes device types as two-byte values,
@@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
362 break; 376 break;
363 377
364 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ 378 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
379 snapshot_deprecated_ioctl(cmd);
365 error = -EINVAL; 380 error = -EINVAL;
366 381
367 switch (arg) { 382 switch (arg) {
diff --git a/kernel/printk.c b/kernel/printk.c
index b5ac4d99c667..1751c456b71f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -34,6 +34,7 @@
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/ratelimit.h> 36#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h>
37 38
38#include <asm/uaccess.h> 39#include <asm/uaccess.h>
39 40
@@ -1405,4 +1406,123 @@ bool printk_timed_ratelimit(unsigned long *caller_jiffies,
1405 return false; 1406 return false;
1406} 1407}
1407EXPORT_SYMBOL(printk_timed_ratelimit); 1408EXPORT_SYMBOL(printk_timed_ratelimit);
1409
1410static DEFINE_SPINLOCK(dump_list_lock);
1411static LIST_HEAD(dump_list);
1412
1413/**
1414 * kmsg_dump_register - register a kernel log dumper.
1415 * @dumper: pointer to the kmsg_dumper structure
1416 *
1417 * Adds a kernel log dumper to the system. The dump callback in the
1418 * structure will be called when the kernel oopses or panics and must be
1419 * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise.
1420 */
1421int kmsg_dump_register(struct kmsg_dumper *dumper)
1422{
1423 unsigned long flags;
1424 int err = -EBUSY;
1425
1426 /* The dump callback needs to be set */
1427 if (!dumper->dump)
1428 return -EINVAL;
1429
1430 spin_lock_irqsave(&dump_list_lock, flags);
1431 /* Don't allow registering multiple times */
1432 if (!dumper->registered) {
1433 dumper->registered = 1;
1434 list_add_tail(&dumper->list, &dump_list);
1435 err = 0;
1436 }
1437 spin_unlock_irqrestore(&dump_list_lock, flags);
1438
1439 return err;
1440}
1441EXPORT_SYMBOL_GPL(kmsg_dump_register);
1442
1443/**
1444 * kmsg_dump_unregister - unregister a kmsg dumper.
1445 * @dumper: pointer to the kmsg_dumper structure
1446 *
1447 * Removes a dump device from the system. Returns zero on success and
1448 * %-EINVAL otherwise.
1449 */
1450int kmsg_dump_unregister(struct kmsg_dumper *dumper)
1451{
1452 unsigned long flags;
1453 int err = -EINVAL;
1454
1455 spin_lock_irqsave(&dump_list_lock, flags);
1456 if (dumper->registered) {
1457 dumper->registered = 0;
1458 list_del(&dumper->list);
1459 err = 0;
1460 }
1461 spin_unlock_irqrestore(&dump_list_lock, flags);
1462
1463 return err;
1464}
1465EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
1466
1467static const char const *kmsg_reasons[] = {
1468 [KMSG_DUMP_OOPS] = "oops",
1469 [KMSG_DUMP_PANIC] = "panic",
1470 [KMSG_DUMP_KEXEC] = "kexec",
1471};
1472
1473static const char *kmsg_to_str(enum kmsg_dump_reason reason)
1474{
1475 if (reason >= ARRAY_SIZE(kmsg_reasons) || reason < 0)
1476 return "unknown";
1477
1478 return kmsg_reasons[reason];
1479}
1480
1481/**
1482 * kmsg_dump - dump kernel log to kernel message dumpers.
1483 * @reason: the reason (oops, panic etc) for dumping
1484 *
1485 * Iterate through each of the dump devices and call the oops/panic
1486 * callbacks with the log buffer.
1487 */
1488void kmsg_dump(enum kmsg_dump_reason reason)
1489{
1490 unsigned long end;
1491 unsigned chars;
1492 struct kmsg_dumper *dumper;
1493 const char *s1, *s2;
1494 unsigned long l1, l2;
1495 unsigned long flags;
1496
1497 /* Theoretically, the log could move on after we do this, but
1498 there's not a lot we can do about that. The new messages
1499 will overwrite the start of what we dump. */
1500 spin_lock_irqsave(&logbuf_lock, flags);
1501 end = log_end & LOG_BUF_MASK;
1502 chars = logged_chars;
1503 spin_unlock_irqrestore(&logbuf_lock, flags);
1504
1505 if (logged_chars > end) {
1506 s1 = log_buf + log_buf_len - logged_chars + end;
1507 l1 = logged_chars - end;
1508
1509 s2 = log_buf;
1510 l2 = end;
1511 } else {
1512 s1 = "";
1513 l1 = 0;
1514
1515 s2 = log_buf + end - logged_chars;
1516 l2 = logged_chars;
1517 }
1518
1519 if (!spin_trylock_irqsave(&dump_list_lock, flags)) {
1520 printk(KERN_ERR "dump_kmsg: dump list lock is held during %s, skipping dump\n",
1521 kmsg_to_str(reason));
1522 return;
1523 }
1524 list_for_each_entry(dumper, &dump_list, list)
1525 dumper->dump(dumper, reason, s1, l1, s2, l2);
1526 spin_unlock_irqrestore(&dump_list_lock, flags);
1527}
1408#endif 1528#endif
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 23bd09cd042e..42ad8ae729a0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/regset.h>
25 26
26 27
27/* 28/*
@@ -511,6 +512,47 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
511 return 0; 512 return 0;
512} 513}
513 514
515#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
516
517static const struct user_regset *
518find_regset(const struct user_regset_view *view, unsigned int type)
519{
520 const struct user_regset *regset;
521 int n;
522
523 for (n = 0; n < view->n; ++n) {
524 regset = view->regsets + n;
525 if (regset->core_note_type == type)
526 return regset;
527 }
528
529 return NULL;
530}
531
532static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
533 struct iovec *kiov)
534{
535 const struct user_regset_view *view = task_user_regset_view(task);
536 const struct user_regset *regset = find_regset(view, type);
537 int regset_no;
538
539 if (!regset || (kiov->iov_len % regset->size) != 0)
540 return -EINVAL;
541
542 regset_no = regset - view->regsets;
543 kiov->iov_len = min(kiov->iov_len,
544 (__kernel_size_t) (regset->n * regset->size));
545
546 if (req == PTRACE_GETREGSET)
547 return copy_regset_to_user(task, view, regset_no, 0,
548 kiov->iov_len, kiov->iov_base);
549 else
550 return copy_regset_from_user(task, view, regset_no, 0,
551 kiov->iov_len, kiov->iov_base);
552}
553
554#endif
555
514int ptrace_request(struct task_struct *child, long request, 556int ptrace_request(struct task_struct *child, long request,
515 long addr, long data) 557 long addr, long data)
516{ 558{
@@ -573,6 +615,26 @@ int ptrace_request(struct task_struct *child, long request,
573 return 0; 615 return 0;
574 return ptrace_resume(child, request, SIGKILL); 616 return ptrace_resume(child, request, SIGKILL);
575 617
618#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
619 case PTRACE_GETREGSET:
620 case PTRACE_SETREGSET:
621 {
622 struct iovec kiov;
623 struct iovec __user *uiov = (struct iovec __user *) data;
624
625 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
626 return -EFAULT;
627
628 if (__get_user(kiov.iov_base, &uiov->iov_base) ||
629 __get_user(kiov.iov_len, &uiov->iov_len))
630 return -EFAULT;
631
632 ret = ptrace_regset(child, request, addr, &kiov);
633 if (!ret)
634 ret = __put_user(kiov.iov_len, &uiov->iov_len);
635 break;
636 }
637#endif
576 default: 638 default:
577 break; 639 break;
578 } 640 }
@@ -711,6 +773,32 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request,
711 else 773 else
712 ret = ptrace_setsiginfo(child, &siginfo); 774 ret = ptrace_setsiginfo(child, &siginfo);
713 break; 775 break;
776#ifdef CONFIG_HAVE_ARCH_TRACEHOOK
777 case PTRACE_GETREGSET:
778 case PTRACE_SETREGSET:
779 {
780 struct iovec kiov;
781 struct compat_iovec __user *uiov =
782 (struct compat_iovec __user *) datap;
783 compat_uptr_t ptr;
784 compat_size_t len;
785
786 if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
787 return -EFAULT;
788
789 if (__get_user(ptr, &uiov->iov_base) ||
790 __get_user(len, &uiov->iov_len))
791 return -EFAULT;
792
793 kiov.iov_base = compat_ptr(ptr);
794 kiov.iov_len = len;
795
796 ret = ptrace_regset(child, request, addr, &kiov);
797 if (!ret)
798 ret = __put_user(kiov.iov_len, &uiov->iov_len);
799 break;
800 }
801#endif
714 802
715 default: 803 default:
716 ret = ptrace_request(child, request, addr, data); 804 ret = ptrace_request(child, request, addr, data);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 9b7fd4723878..f1125c1a6321 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,14 +44,43 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
47 48
48#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
49static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
50struct lockdep_map rcu_lock_map = 51struct lockdep_map rcu_lock_map =
51 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); 52 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
52EXPORT_SYMBOL_GPL(rcu_lock_map); 53EXPORT_SYMBOL_GPL(rcu_lock_map);
54
55static struct lock_class_key rcu_bh_lock_key;
56struct lockdep_map rcu_bh_lock_map =
57 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key);
58EXPORT_SYMBOL_GPL(rcu_bh_lock_map);
59
60static struct lock_class_key rcu_sched_lock_key;
61struct lockdep_map rcu_sched_lock_map =
62 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key);
63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
53#endif 64#endif
54 65
66int rcu_scheduler_active __read_mostly;
67EXPORT_SYMBOL_GPL(rcu_scheduler_active);
68
69/*
70 * This function is invoked towards the end of the scheduler's initialization
71 * process. Before this is called, the idle task might contain
72 * RCU read-side critical sections (during which time, this idle
73 * task is booting the system). After this function is called, the
74 * idle tasks are prohibited from containing RCU read-side critical
75 * sections.
76 */
77void rcu_scheduler_starting(void)
78{
79 WARN_ON(num_online_cpus() != 1);
80 WARN_ON(nr_context_switches() > 0);
81 rcu_scheduler_active = 1;
82}
83
55/* 84/*
56 * Awaken the corresponding synchronize_rcu() instance now that a 85 * Awaken the corresponding synchronize_rcu() instance now that a
57 * grace period has elapsed. 86 * grace period has elapsed.
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index a621a67ef4e3..258cdf0a91eb 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */
64static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 67static char *torture_type = "rcu"; /* What RCU implementation to torture. */
65 68
66module_param(nreaders, int, 0444); 69module_param(nreaders, int, 0444);
@@ -79,6 +82,12 @@ module_param(stutter, int, 0444);
79MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); 82MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
80module_param(irqreader, int, 0444); 83module_param(irqreader, int, 0444);
81MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); 84MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
85module_param(fqs_duration, int, 0444);
86MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)");
87module_param(fqs_holdoff, int, 0444);
88MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
89module_param(fqs_stutter, int, 0444);
90MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
82module_param(torture_type, charp, 0444); 91module_param(torture_type, charp, 0444);
83MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 92MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
84 93
@@ -99,6 +108,7 @@ static struct task_struct **reader_tasks;
99static struct task_struct *stats_task; 108static struct task_struct *stats_task;
100static struct task_struct *shuffler_task; 109static struct task_struct *shuffler_task;
101static struct task_struct *stutter_task; 110static struct task_struct *stutter_task;
111static struct task_struct *fqs_task;
102 112
103#define RCU_TORTURE_PIPE_LEN 10 113#define RCU_TORTURE_PIPE_LEN 10
104 114
@@ -263,6 +273,7 @@ struct rcu_torture_ops {
263 void (*deferred_free)(struct rcu_torture *p); 273 void (*deferred_free)(struct rcu_torture *p);
264 void (*sync)(void); 274 void (*sync)(void);
265 void (*cb_barrier)(void); 275 void (*cb_barrier)(void);
276 void (*fqs)(void);
266 int (*stats)(char *page); 277 int (*stats)(char *page);
267 int irq_capable; 278 int irq_capable;
268 char *name; 279 char *name;
@@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = {
347 .deferred_free = rcu_torture_deferred_free, 358 .deferred_free = rcu_torture_deferred_free,
348 .sync = synchronize_rcu, 359 .sync = synchronize_rcu,
349 .cb_barrier = rcu_barrier, 360 .cb_barrier = rcu_barrier,
361 .fqs = rcu_force_quiescent_state,
350 .stats = NULL, 362 .stats = NULL,
351 .irq_capable = 1, 363 .irq_capable = 1,
352 .name = "rcu" 364 .name = "rcu"
@@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = {
388 .deferred_free = rcu_sync_torture_deferred_free, 400 .deferred_free = rcu_sync_torture_deferred_free,
389 .sync = synchronize_rcu, 401 .sync = synchronize_rcu,
390 .cb_barrier = NULL, 402 .cb_barrier = NULL,
403 .fqs = rcu_force_quiescent_state,
391 .stats = NULL, 404 .stats = NULL,
392 .irq_capable = 1, 405 .irq_capable = 1,
393 .name = "rcu_sync" 406 .name = "rcu_sync"
@@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = {
403 .deferred_free = rcu_sync_torture_deferred_free, 416 .deferred_free = rcu_sync_torture_deferred_free,
404 .sync = synchronize_rcu_expedited, 417 .sync = synchronize_rcu_expedited,
405 .cb_barrier = NULL, 418 .cb_barrier = NULL,
419 .fqs = rcu_force_quiescent_state,
406 .stats = NULL, 420 .stats = NULL,
407 .irq_capable = 1, 421 .irq_capable = 1,
408 .name = "rcu_expedited" 422 .name = "rcu_expedited"
@@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
465 .deferred_free = rcu_bh_torture_deferred_free, 479 .deferred_free = rcu_bh_torture_deferred_free,
466 .sync = rcu_bh_torture_synchronize, 480 .sync = rcu_bh_torture_synchronize,
467 .cb_barrier = rcu_barrier_bh, 481 .cb_barrier = rcu_barrier_bh,
482 .fqs = rcu_bh_force_quiescent_state,
468 .stats = NULL, 483 .stats = NULL,
469 .irq_capable = 1, 484 .irq_capable = 1,
470 .name = "rcu_bh" 485 .name = "rcu_bh"
@@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
480 .deferred_free = rcu_sync_torture_deferred_free, 495 .deferred_free = rcu_sync_torture_deferred_free,
481 .sync = rcu_bh_torture_synchronize, 496 .sync = rcu_bh_torture_synchronize,
482 .cb_barrier = NULL, 497 .cb_barrier = NULL,
498 .fqs = rcu_bh_force_quiescent_state,
483 .stats = NULL, 499 .stats = NULL,
484 .irq_capable = 1, 500 .irq_capable = 1,
485 .name = "rcu_bh_sync" 501 .name = "rcu_bh_sync"
@@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = {
621 .deferred_free = rcu_sched_torture_deferred_free, 637 .deferred_free = rcu_sched_torture_deferred_free,
622 .sync = sched_torture_synchronize, 638 .sync = sched_torture_synchronize,
623 .cb_barrier = rcu_barrier_sched, 639 .cb_barrier = rcu_barrier_sched,
640 .fqs = rcu_sched_force_quiescent_state,
624 .stats = NULL, 641 .stats = NULL,
625 .irq_capable = 1, 642 .irq_capable = 1,
626 .name = "sched" 643 .name = "sched"
@@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = {
636 .deferred_free = rcu_sync_torture_deferred_free, 653 .deferred_free = rcu_sync_torture_deferred_free,
637 .sync = sched_torture_synchronize, 654 .sync = sched_torture_synchronize,
638 .cb_barrier = NULL, 655 .cb_barrier = NULL,
656 .fqs = rcu_sched_force_quiescent_state,
639 .stats = NULL, 657 .stats = NULL,
640 .name = "sched_sync" 658 .name = "sched_sync"
641}; 659};
@@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = {
650 .deferred_free = rcu_sync_torture_deferred_free, 668 .deferred_free = rcu_sync_torture_deferred_free,
651 .sync = synchronize_sched_expedited, 669 .sync = synchronize_sched_expedited,
652 .cb_barrier = NULL, 670 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state,
653 .stats = rcu_expedited_torture_stats, 672 .stats = rcu_expedited_torture_stats,
654 .irq_capable = 1, 673 .irq_capable = 1,
655 .name = "sched_expedited" 674 .name = "sched_expedited"
656}; 675};
657 676
658/* 677/*
678 * RCU torture force-quiescent-state kthread. Repeatedly induces
679 * bursts of calls to force_quiescent_state(), increasing the probability
680 * of occurrence of some important types of race conditions.
681 */
682static int
683rcu_torture_fqs(void *arg)
684{
685 unsigned long fqs_resume_time;
686 int fqs_burst_remaining;
687
688 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
689 do {
690 fqs_resume_time = jiffies + fqs_stutter * HZ;
691 while (jiffies - fqs_resume_time > LONG_MAX) {
692 schedule_timeout_interruptible(1);
693 }
694 fqs_burst_remaining = fqs_duration;
695 while (fqs_burst_remaining > 0) {
696 cur_ops->fqs();
697 udelay(fqs_holdoff);
698 fqs_burst_remaining -= fqs_holdoff;
699 }
700 rcu_stutter_wait("rcu_torture_fqs");
701 } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
702 VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping");
703 rcutorture_shutdown_absorb("rcu_torture_fqs");
704 while (!kthread_should_stop())
705 schedule_timeout_uninterruptible(1);
706 return 0;
707}
708
709/*
659 * RCU torture writer kthread. Repeatedly substitutes a new structure 710 * RCU torture writer kthread. Repeatedly substitutes a new structure
660 * for that pointed to by rcu_torture_current, freeing the old structure 711 * for that pointed to by rcu_torture_current, freeing the old structure
661 * after a series of grace periods (the "pipeline"). 712 * after a series of grace periods (the "pipeline").
@@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused)
745 796
746 idx = cur_ops->readlock(); 797 idx = cur_ops->readlock();
747 completed = cur_ops->completed(); 798 completed = cur_ops->completed();
748 p = rcu_dereference(rcu_torture_current); 799 p = rcu_dereference_check(rcu_torture_current,
800 rcu_read_lock_held() ||
801 rcu_read_lock_bh_held() ||
802 rcu_read_lock_sched_held() ||
803 srcu_read_lock_held(&srcu_ctl));
749 if (p == NULL) { 804 if (p == NULL) {
750 /* Leave because rcu_torture_writer is not yet underway */ 805 /* Leave because rcu_torture_writer is not yet underway */
751 cur_ops->readunlock(idx); 806 cur_ops->readunlock(idx);
@@ -763,13 +818,13 @@ static void rcu_torture_timer(unsigned long unused)
763 /* Should not happen, but... */ 818 /* Should not happen, but... */
764 pipe_count = RCU_TORTURE_PIPE_LEN; 819 pipe_count = RCU_TORTURE_PIPE_LEN;
765 } 820 }
766 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 821 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
767 completed = cur_ops->completed() - completed; 822 completed = cur_ops->completed() - completed;
768 if (completed > RCU_TORTURE_PIPE_LEN) { 823 if (completed > RCU_TORTURE_PIPE_LEN) {
769 /* Should not happen, but... */ 824 /* Should not happen, but... */
770 completed = RCU_TORTURE_PIPE_LEN; 825 completed = RCU_TORTURE_PIPE_LEN;
771 } 826 }
772 ++__get_cpu_var(rcu_torture_batch)[completed]; 827 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
773 preempt_enable(); 828 preempt_enable();
774 cur_ops->readunlock(idx); 829 cur_ops->readunlock(idx);
775} 830}
@@ -798,11 +853,15 @@ rcu_torture_reader(void *arg)
798 do { 853 do {
799 if (irqreader && cur_ops->irq_capable) { 854 if (irqreader && cur_ops->irq_capable) {
800 if (!timer_pending(&t)) 855 if (!timer_pending(&t))
801 mod_timer(&t, 1); 856 mod_timer(&t, jiffies + 1);
802 } 857 }
803 idx = cur_ops->readlock(); 858 idx = cur_ops->readlock();
804 completed = cur_ops->completed(); 859 completed = cur_ops->completed();
805 p = rcu_dereference(rcu_torture_current); 860 p = rcu_dereference_check(rcu_torture_current,
861 rcu_read_lock_held() ||
862 rcu_read_lock_bh_held() ||
863 rcu_read_lock_sched_held() ||
864 srcu_read_lock_held(&srcu_ctl));
806 if (p == NULL) { 865 if (p == NULL) {
807 /* Wait for rcu_torture_writer to get underway */ 866 /* Wait for rcu_torture_writer to get underway */
808 cur_ops->readunlock(idx); 867 cur_ops->readunlock(idx);
@@ -818,13 +877,13 @@ rcu_torture_reader(void *arg)
818 /* Should not happen, but... */ 877 /* Should not happen, but... */
819 pipe_count = RCU_TORTURE_PIPE_LEN; 878 pipe_count = RCU_TORTURE_PIPE_LEN;
820 } 879 }
821 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 880 __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]);
822 completed = cur_ops->completed() - completed; 881 completed = cur_ops->completed() - completed;
823 if (completed > RCU_TORTURE_PIPE_LEN) { 882 if (completed > RCU_TORTURE_PIPE_LEN) {
824 /* Should not happen, but... */ 883 /* Should not happen, but... */
825 completed = RCU_TORTURE_PIPE_LEN; 884 completed = RCU_TORTURE_PIPE_LEN;
826 } 885 }
827 ++__get_cpu_var(rcu_torture_batch)[completed]; 886 __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]);
828 preempt_enable(); 887 preempt_enable();
829 cur_ops->readunlock(idx); 888 cur_ops->readunlock(idx);
830 schedule(); 889 schedule();
@@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag)
1030 printk(KERN_ALERT "%s" TORTURE_FLAG 1089 printk(KERN_ALERT "%s" TORTURE_FLAG
1031 "--- %s: nreaders=%d nfakewriters=%d " 1090 "--- %s: nreaders=%d nfakewriters=%d "
1032 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 1091 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
1033 "shuffle_interval=%d stutter=%d irqreader=%d\n", 1092 "shuffle_interval=%d stutter=%d irqreader=%d "
1093 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n",
1034 torture_type, tag, nrealreaders, nfakewriters, 1094 torture_type, tag, nrealreaders, nfakewriters,
1035 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1095 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1036 stutter, irqreader); 1096 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter);
1037} 1097}
1038 1098
1039static struct notifier_block rcutorture_nb = { 1099static struct notifier_block rcutorture_nb = {
@@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void)
1109 } 1169 }
1110 stats_task = NULL; 1170 stats_task = NULL;
1111 1171
1172 if (fqs_task) {
1173 VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task");
1174 kthread_stop(fqs_task);
1175 }
1176 fqs_task = NULL;
1177
1112 /* Wait for all RCU callbacks to fire. */ 1178 /* Wait for all RCU callbacks to fire. */
1113 1179
1114 if (cur_ops->cb_barrier != NULL) 1180 if (cur_ops->cb_barrier != NULL)
@@ -1154,6 +1220,11 @@ rcu_torture_init(void)
1154 mutex_unlock(&fullstop_mutex); 1220 mutex_unlock(&fullstop_mutex);
1155 return -EINVAL; 1221 return -EINVAL;
1156 } 1222 }
1223 if (cur_ops->fqs == NULL && fqs_duration != 0) {
1224 printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero "
1225 "fqs_duration, fqs disabled.\n");
1226 fqs_duration = 0;
1227 }
1157 if (cur_ops->init) 1228 if (cur_ops->init)
1158 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 1229 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
1159 1230
@@ -1282,6 +1353,19 @@ rcu_torture_init(void)
1282 goto unwind; 1353 goto unwind;
1283 } 1354 }
1284 } 1355 }
1356 if (fqs_duration < 0)
1357 fqs_duration = 0;
1358 if (fqs_duration) {
1359 /* Create the stutter thread */
1360 fqs_task = kthread_run(rcu_torture_fqs, NULL,
1361 "rcu_torture_fqs");
1362 if (IS_ERR(fqs_task)) {
1363 firsterr = PTR_ERR(fqs_task);
1364 VERBOSE_PRINTK_ERRSTRING("Failed to create fqs");
1365 fqs_task = NULL;
1366 goto unwind;
1367 }
1368 }
1285 register_reboot_notifier(&rcutorture_nb); 1369 register_reboot_notifier(&rcutorture_nb);
1286 mutex_unlock(&fullstop_mutex); 1370 mutex_unlock(&fullstop_mutex);
1287 return 0; 1371 return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 53ae9598f798..3ec8160fc75f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,7 +46,6 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
50 49
51#include "rcutree.h" 50#include "rcutree.h"
52 51
@@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
66 .signaled = RCU_GP_IDLE, \ 65 .signaled = RCU_GP_IDLE, \
67 .gpnum = -300, \ 66 .gpnum = -300, \
68 .completed = -300, \ 67 .completed = -300, \
69 .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ 68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \
70 .orphan_cbs_list = NULL, \ 69 .orphan_cbs_list = NULL, \
71 .orphan_cbs_tail = &name.orphan_cbs_list, \ 70 .orphan_cbs_tail = &name.orphan_cbs_list, \
72 .orphan_qlen = 0, \ 71 .orphan_qlen = 0, \
73 .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ 72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \
74 .n_force_qs = 0, \ 73 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 74 .n_force_qs_ngp = 0, \
76} 75}
@@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
82DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
83 82
84static int rcu_scheduler_active __read_mostly;
85
86
87/* 83/*
88 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
89 * permit this function to be invoked without holding the root rcu_node 85 * permit this function to be invoked without holding the root rcu_node
@@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void)
157EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 153EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
158 154
159/* 155/*
156 * Force a quiescent state for RCU BH.
157 */
158void rcu_bh_force_quiescent_state(void)
159{
160 force_quiescent_state(&rcu_bh_state, 0);
161}
162EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
163
164/*
165 * Force a quiescent state for RCU-sched.
166 */
167void rcu_sched_force_quiescent_state(void)
168{
169 force_quiescent_state(&rcu_sched_state, 0);
170}
171EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
172
173/*
160 * Does the CPU have callbacks ready to be invoked? 174 * Does the CPU have callbacks ready to be invoked?
161 */ 175 */
162static int 176static int
@@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
439 453
440 /* Only let one CPU complain about others per time interval. */ 454 /* Only let one CPU complain about others per time interval. */
441 455
442 spin_lock_irqsave(&rnp->lock, flags); 456 raw_spin_lock_irqsave(&rnp->lock, flags);
443 delta = jiffies - rsp->jiffies_stall; 457 delta = jiffies - rsp->jiffies_stall;
444 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 458 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
445 spin_unlock_irqrestore(&rnp->lock, flags); 459 raw_spin_unlock_irqrestore(&rnp->lock, flags);
446 return; 460 return;
447 } 461 }
448 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 462 rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
@@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
452 * due to CPU offlining. 466 * due to CPU offlining.
453 */ 467 */
454 rcu_print_task_stall(rnp); 468 rcu_print_task_stall(rnp);
455 spin_unlock_irqrestore(&rnp->lock, flags); 469 raw_spin_unlock_irqrestore(&rnp->lock, flags);
456 470
457 /* OK, time to rat on our buddy... */ 471 /* OK, time to rat on our buddy... */
458 472
459 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 473 printk(KERN_ERR "INFO: RCU detected CPU stalls:");
460 rcu_for_each_leaf_node(rsp, rnp) { 474 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags);
461 rcu_print_task_stall(rnp); 476 rcu_print_task_stall(rnp);
477 raw_spin_unlock_irqrestore(&rnp->lock, flags);
462 if (rnp->qsmask == 0) 478 if (rnp->qsmask == 0)
463 continue; 479 continue;
464 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 480 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
@@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
469 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 485 smp_processor_id(), (long)(jiffies - rsp->gp_start));
470 trigger_all_cpu_backtrace(); 486 trigger_all_cpu_backtrace();
471 487
488 /* If so configured, complain about tasks blocking the grace period. */
489
490 rcu_print_detail_task_stall(rsp);
491
472 force_quiescent_state(rsp, 0); /* Kick them all. */ 492 force_quiescent_state(rsp, 0); /* Kick them all. */
473} 493}
474 494
@@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp)
481 smp_processor_id(), jiffies - rsp->gp_start); 501 smp_processor_id(), jiffies - rsp->gp_start);
482 trigger_all_cpu_backtrace(); 502 trigger_all_cpu_backtrace();
483 503
484 spin_lock_irqsave(&rnp->lock, flags); 504 raw_spin_lock_irqsave(&rnp->lock, flags);
485 if ((long)(jiffies - rsp->jiffies_stall) >= 0) 505 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
486 rsp->jiffies_stall = 506 rsp->jiffies_stall =
487 jiffies + RCU_SECONDS_TILL_STALL_RECHECK; 507 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
488 spin_unlock_irqrestore(&rnp->lock, flags); 508 raw_spin_unlock_irqrestore(&rnp->lock, flags);
489 509
490 set_need_resched(); /* kick ourselves to get things going. */ 510 set_need_resched(); /* kick ourselves to get things going. */
491} 511}
@@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)
545 local_irq_save(flags); 565 local_irq_save(flags);
546 rnp = rdp->mynode; 566 rnp = rdp->mynode;
547 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ 567 if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */
548 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 568 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
549 local_irq_restore(flags); 569 local_irq_restore(flags);
550 return; 570 return;
551 } 571 }
552 __note_new_gpnum(rsp, rnp, rdp); 572 __note_new_gpnum(rsp, rnp, rdp);
553 spin_unlock_irqrestore(&rnp->lock, flags); 573 raw_spin_unlock_irqrestore(&rnp->lock, flags);
554} 574}
555 575
556/* 576/*
@@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
609 local_irq_save(flags); 629 local_irq_save(flags);
610 rnp = rdp->mynode; 630 rnp = rdp->mynode;
611 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ 631 if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */
612 !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ 632 !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
613 local_irq_restore(flags); 633 local_irq_restore(flags);
614 return; 634 return;
615 } 635 }
616 __rcu_process_gp_end(rsp, rnp, rdp); 636 __rcu_process_gp_end(rsp, rnp, rdp);
617 spin_unlock_irqrestore(&rnp->lock, flags); 637 raw_spin_unlock_irqrestore(&rnp->lock, flags);
618} 638}
619 639
620/* 640/*
@@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
659 struct rcu_data *rdp = rsp->rda[smp_processor_id()]; 679 struct rcu_data *rdp = rsp->rda[smp_processor_id()];
660 struct rcu_node *rnp = rcu_get_root(rsp); 680 struct rcu_node *rnp = rcu_get_root(rsp);
661 681
662 if (!cpu_needs_another_gp(rsp, rdp)) { 682 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) {
683 if (cpu_needs_another_gp(rsp, rdp))
684 rsp->fqs_need_gp = 1;
663 if (rnp->completed == rsp->completed) { 685 if (rnp->completed == rsp->completed) {
664 spin_unlock_irqrestore(&rnp->lock, flags); 686 raw_spin_unlock_irqrestore(&rnp->lock, flags);
665 return; 687 return;
666 } 688 }
667 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 689 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
668 690
669 /* 691 /*
670 * Propagate new ->completed value to rcu_node structures 692 * Propagate new ->completed value to rcu_node structures
@@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
672 * of the next grace period to process their callbacks. 694 * of the next grace period to process their callbacks.
673 */ 695 */
674 rcu_for_each_node_breadth_first(rsp, rnp) { 696 rcu_for_each_node_breadth_first(rsp, rnp) {
675 spin_lock(&rnp->lock); /* irqs already disabled. */ 697 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
676 rnp->completed = rsp->completed; 698 rnp->completed = rsp->completed;
677 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 699 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
678 } 700 }
679 local_irq_restore(flags); 701 local_irq_restore(flags);
680 return; 702 return;
@@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
695 rnp->completed = rsp->completed; 717 rnp->completed = rsp->completed;
696 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 718 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
697 rcu_start_gp_per_cpu(rsp, rnp, rdp); 719 rcu_start_gp_per_cpu(rsp, rnp, rdp);
698 spin_unlock_irqrestore(&rnp->lock, flags); 720 raw_spin_unlock_irqrestore(&rnp->lock, flags);
699 return; 721 return;
700 } 722 }
701 723
702 spin_unlock(&rnp->lock); /* leave irqs disabled. */ 724 raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */
703 725
704 726
705 /* Exclude any concurrent CPU-hotplug operations. */ 727 /* Exclude any concurrent CPU-hotplug operations. */
706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 728 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
707 729
708 /* 730 /*
709 * Set the quiescent-state-needed bits in all the rcu_node 731 * Set the quiescent-state-needed bits in all the rcu_node
@@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
723 * irqs disabled. 745 * irqs disabled.
724 */ 746 */
725 rcu_for_each_node_breadth_first(rsp, rnp) { 747 rcu_for_each_node_breadth_first(rsp, rnp) {
726 spin_lock(&rnp->lock); /* irqs already disabled. */ 748 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
727 rcu_preempt_check_blocked_tasks(rnp); 749 rcu_preempt_check_blocked_tasks(rnp);
728 rnp->qsmask = rnp->qsmaskinit; 750 rnp->qsmask = rnp->qsmaskinit;
729 rnp->gpnum = rsp->gpnum; 751 rnp->gpnum = rsp->gpnum;
730 rnp->completed = rsp->completed; 752 rnp->completed = rsp->completed;
731 if (rnp == rdp->mynode) 753 if (rnp == rdp->mynode)
732 rcu_start_gp_per_cpu(rsp, rnp, rdp); 754 rcu_start_gp_per_cpu(rsp, rnp, rdp);
733 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 755 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
734 } 756 }
735 757
736 rnp = rcu_get_root(rsp); 758 rnp = rcu_get_root(rsp);
737 spin_lock(&rnp->lock); /* irqs already disabled. */ 759 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
738 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 760 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
739 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 761 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
740 spin_unlock_irqrestore(&rsp->onofflock, flags); 762 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
741} 763}
742 764
743/* 765/*
@@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
776 if (!(rnp->qsmask & mask)) { 798 if (!(rnp->qsmask & mask)) {
777 799
778 /* Our bit has already been cleared, so done. */ 800 /* Our bit has already been cleared, so done. */
779 spin_unlock_irqrestore(&rnp->lock, flags); 801 raw_spin_unlock_irqrestore(&rnp->lock, flags);
780 return; 802 return;
781 } 803 }
782 rnp->qsmask &= ~mask; 804 rnp->qsmask &= ~mask;
783 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 805 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
784 806
785 /* Other bits still set at this level, so done. */ 807 /* Other bits still set at this level, so done. */
786 spin_unlock_irqrestore(&rnp->lock, flags); 808 raw_spin_unlock_irqrestore(&rnp->lock, flags);
787 return; 809 return;
788 } 810 }
789 mask = rnp->grpmask; 811 mask = rnp->grpmask;
@@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
793 815
794 break; 816 break;
795 } 817 }
796 spin_unlock_irqrestore(&rnp->lock, flags); 818 raw_spin_unlock_irqrestore(&rnp->lock, flags);
797 rnp_c = rnp; 819 rnp_c = rnp;
798 rnp = rnp->parent; 820 rnp = rnp->parent;
799 spin_lock_irqsave(&rnp->lock, flags); 821 raw_spin_lock_irqsave(&rnp->lock, flags);
800 WARN_ON_ONCE(rnp_c->qsmask); 822 WARN_ON_ONCE(rnp_c->qsmask);
801 } 823 }
802 824
@@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
825 struct rcu_node *rnp; 847 struct rcu_node *rnp;
826 848
827 rnp = rdp->mynode; 849 rnp = rdp->mynode;
828 spin_lock_irqsave(&rnp->lock, flags); 850 raw_spin_lock_irqsave(&rnp->lock, flags);
829 if (lastcomp != rnp->completed) { 851 if (lastcomp != rnp->completed) {
830 852
831 /* 853 /*
@@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
837 * race occurred. 859 * race occurred.
838 */ 860 */
839 rdp->passed_quiesc = 0; /* try again later! */ 861 rdp->passed_quiesc = 0; /* try again later! */
840 spin_unlock_irqrestore(&rnp->lock, flags); 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
841 return; 863 return;
842 } 864 }
843 mask = rdp->grpmask; 865 mask = rdp->grpmask;
844 if ((rnp->qsmask & mask) == 0) { 866 if ((rnp->qsmask & mask) == 0) {
845 spin_unlock_irqrestore(&rnp->lock, flags); 867 raw_spin_unlock_irqrestore(&rnp->lock, flags);
846 } else { 868 } else {
847 rdp->qs_pending = 0; 869 rdp->qs_pending = 0;
848 870
@@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
906 928
907 if (rdp->nxtlist == NULL) 929 if (rdp->nxtlist == NULL)
908 return; /* irqs disabled, so comparison is stable. */ 930 return; /* irqs disabled, so comparison is stable. */
909 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 931 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
910 *rsp->orphan_cbs_tail = rdp->nxtlist; 932 *rsp->orphan_cbs_tail = rdp->nxtlist;
911 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; 933 rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
912 rdp->nxtlist = NULL; 934 rdp->nxtlist = NULL;
@@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
914 rdp->nxttail[i] = &rdp->nxtlist; 936 rdp->nxttail[i] = &rdp->nxtlist;
915 rsp->orphan_qlen += rdp->qlen; 937 rsp->orphan_qlen += rdp->qlen;
916 rdp->qlen = 0; 938 rdp->qlen = 0;
917 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 939 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
918} 940}
919 941
920/* 942/*
@@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
925 unsigned long flags; 947 unsigned long flags;
926 struct rcu_data *rdp; 948 struct rcu_data *rdp;
927 949
928 spin_lock_irqsave(&rsp->onofflock, flags); 950 raw_spin_lock_irqsave(&rsp->onofflock, flags);
929 rdp = rsp->rda[smp_processor_id()]; 951 rdp = rsp->rda[smp_processor_id()];
930 if (rsp->orphan_cbs_list == NULL) { 952 if (rsp->orphan_cbs_list == NULL) {
931 spin_unlock_irqrestore(&rsp->onofflock, flags); 953 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
932 return; 954 return;
933 } 955 }
934 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; 956 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
@@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
937 rsp->orphan_cbs_list = NULL; 959 rsp->orphan_cbs_list = NULL;
938 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; 960 rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
939 rsp->orphan_qlen = 0; 961 rsp->orphan_qlen = 0;
940 spin_unlock_irqrestore(&rsp->onofflock, flags); 962 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
941} 963}
942 964
943/* 965/*
@@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
953 struct rcu_node *rnp; 975 struct rcu_node *rnp;
954 976
955 /* Exclude any attempts to start a new grace period. */ 977 /* Exclude any attempts to start a new grace period. */
956 spin_lock_irqsave(&rsp->onofflock, flags); 978 raw_spin_lock_irqsave(&rsp->onofflock, flags);
957 979
958 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ 980 /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */
959 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ 981 rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */
960 mask = rdp->grpmask; /* rnp->grplo is constant. */ 982 mask = rdp->grpmask; /* rnp->grplo is constant. */
961 do { 983 do {
962 spin_lock(&rnp->lock); /* irqs already disabled. */ 984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
963 rnp->qsmaskinit &= ~mask; 985 rnp->qsmaskinit &= ~mask;
964 if (rnp->qsmaskinit != 0) { 986 if (rnp->qsmaskinit != 0) {
965 if (rnp != rdp->mynode) 987 if (rnp != rdp->mynode)
966 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 988 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
967 break; 989 break;
968 } 990 }
969 if (rnp == rdp->mynode) 991 if (rnp == rdp->mynode)
970 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 992 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
971 else 993 else
972 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 994 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
973 mask = rnp->grpmask; 995 mask = rnp->grpmask;
974 rnp = rnp->parent; 996 rnp = rnp->parent;
975 } while (rnp != NULL); 997 } while (rnp != NULL);
@@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
980 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock 1002 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock
981 * held leads to deadlock. 1003 * held leads to deadlock.
982 */ 1004 */
983 spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ 1005 raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */
984 rnp = rdp->mynode; 1006 rnp = rdp->mynode;
985 if (need_report & RCU_OFL_TASKS_NORM_GP) 1007 if (need_report & RCU_OFL_TASKS_NORM_GP)
986 rcu_report_unblock_qs_rnp(rnp, flags); 1008 rcu_report_unblock_qs_rnp(rnp, flags);
987 else 1009 else
988 spin_unlock_irqrestore(&rnp->lock, flags); 1010 raw_spin_unlock_irqrestore(&rnp->lock, flags);
989 if (need_report & RCU_OFL_TASKS_EXP_GP) 1011 if (need_report & RCU_OFL_TASKS_EXP_GP)
990 rcu_report_exp_rnp(rsp, rnp); 1012 rcu_report_exp_rnp(rsp, rnp);
991 1013
@@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user)
1144/* 1166/*
1145 * Scan the leaf rcu_node structures, processing dyntick state for any that 1167 * Scan the leaf rcu_node structures, processing dyntick state for any that
1146 * have not yet encountered a quiescent state, using the function specified. 1168 * have not yet encountered a quiescent state, using the function specified.
1147 * Returns 1 if the current grace period ends while scanning (possibly 1169 * The caller must have suppressed start of new grace periods.
1148 * because we made it end).
1149 */ 1170 */
1150static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, 1171static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
1151 int (*f)(struct rcu_data *))
1152{ 1172{
1153 unsigned long bit; 1173 unsigned long bit;
1154 int cpu; 1174 int cpu;
@@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1158 1178
1159 rcu_for_each_leaf_node(rsp, rnp) { 1179 rcu_for_each_leaf_node(rsp, rnp) {
1160 mask = 0; 1180 mask = 0;
1161 spin_lock_irqsave(&rnp->lock, flags); 1181 raw_spin_lock_irqsave(&rnp->lock, flags);
1162 if (rnp->completed != lastcomp) { 1182 if (!rcu_gp_in_progress(rsp)) {
1163 spin_unlock_irqrestore(&rnp->lock, flags); 1183 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1164 return 1; 1184 return;
1165 } 1185 }
1166 if (rnp->qsmask == 0) { 1186 if (rnp->qsmask == 0) {
1167 spin_unlock_irqrestore(&rnp->lock, flags); 1187 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1168 continue; 1188 continue;
1169 } 1189 }
1170 cpu = rnp->grplo; 1190 cpu = rnp->grplo;
@@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1173 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) 1193 if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
1174 mask |= bit; 1194 mask |= bit;
1175 } 1195 }
1176 if (mask != 0 && rnp->completed == lastcomp) { 1196 if (mask != 0) {
1177 1197
1178 /* rcu_report_qs_rnp() releases rnp->lock. */ 1198 /* rcu_report_qs_rnp() releases rnp->lock. */
1179 rcu_report_qs_rnp(mask, rsp, rnp, flags); 1199 rcu_report_qs_rnp(mask, rsp, rnp, flags);
1180 continue; 1200 continue;
1181 } 1201 }
1182 spin_unlock_irqrestore(&rnp->lock, flags); 1202 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1183 } 1203 }
1184 return 0;
1185} 1204}
1186 1205
1187/* 1206/*
@@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
1191static void force_quiescent_state(struct rcu_state *rsp, int relaxed) 1210static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1192{ 1211{
1193 unsigned long flags; 1212 unsigned long flags;
1194 long lastcomp;
1195 struct rcu_node *rnp = rcu_get_root(rsp); 1213 struct rcu_node *rnp = rcu_get_root(rsp);
1196 u8 signaled;
1197 u8 forcenow;
1198 1214
1199 if (!rcu_gp_in_progress(rsp)) 1215 if (!rcu_gp_in_progress(rsp))
1200 return; /* No grace period in progress, nothing to force. */ 1216 return; /* No grace period in progress, nothing to force. */
1201 if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { 1217 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1202 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1218 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1203 return; /* Someone else is already on the job. */ 1219 return; /* Someone else is already on the job. */
1204 } 1220 }
1205 if (relaxed && 1221 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
1206 (long)(rsp->jiffies_force_qs - jiffies) >= 0) 1222 goto unlock_fqs_ret; /* no emergency and done recently. */
1207 goto unlock_ret; /* no emergency and done recently. */
1208 rsp->n_force_qs++; 1223 rsp->n_force_qs++;
1209 spin_lock(&rnp->lock); 1224 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1210 lastcomp = rsp->gpnum - 1;
1211 signaled = rsp->signaled;
1212 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 1225 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
1213 if(!rcu_gp_in_progress(rsp)) { 1226 if(!rcu_gp_in_progress(rsp)) {
1214 rsp->n_force_qs_ngp++; 1227 rsp->n_force_qs_ngp++;
1215 spin_unlock(&rnp->lock); 1228 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1216 goto unlock_ret; /* no GP in progress, time updated. */ 1229 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1217 } 1230 }
1218 spin_unlock(&rnp->lock); 1231 rsp->fqs_active = 1;
1219 switch (signaled) { 1232 switch (rsp->signaled) {
1220 case RCU_GP_IDLE: 1233 case RCU_GP_IDLE:
1221 case RCU_GP_INIT: 1234 case RCU_GP_INIT:
1222 1235
@@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1224 1237
1225 case RCU_SAVE_DYNTICK: 1238 case RCU_SAVE_DYNTICK:
1226 1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1227 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1228 break; /* So gcc recognizes the dead code. */ 1242 break; /* So gcc recognizes the dead code. */
1229 1243
1230 /* Record dyntick-idle state. */ 1244 /* Record dyntick-idle state. */
1231 if (rcu_process_dyntick(rsp, lastcomp, 1245 force_qs_rnp(rsp, dyntick_save_progress_counter);
1232 dyntick_save_progress_counter)) 1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1233 goto unlock_ret; 1247 if (rcu_gp_in_progress(rsp))
1234 /* fall into next case. */
1235
1236 case RCU_SAVE_COMPLETED:
1237
1238 /* Update state, record completion counter. */
1239 forcenow = 0;
1240 spin_lock(&rnp->lock);
1241 if (lastcomp + 1 == rsp->gpnum &&
1242 lastcomp == rsp->completed &&
1243 rsp->signaled == signaled) {
1244 rsp->signaled = RCU_FORCE_QS; 1248 rsp->signaled = RCU_FORCE_QS;
1245 rsp->completed_fqs = lastcomp; 1249 break;
1246 forcenow = signaled == RCU_SAVE_COMPLETED;
1247 }
1248 spin_unlock(&rnp->lock);
1249 if (!forcenow)
1250 break;
1251 /* fall into next case. */
1252 1250
1253 case RCU_FORCE_QS: 1251 case RCU_FORCE_QS:
1254 1252
1255 /* Check dyntick-idle state, send IPI to laggarts. */ 1253 /* Check dyntick-idle state, send IPI to laggarts. */
1256 if (rcu_process_dyntick(rsp, rsp->completed_fqs, 1254 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1257 rcu_implicit_dynticks_qs)) 1255 force_qs_rnp(rsp, rcu_implicit_dynticks_qs);
1258 goto unlock_ret;
1259 1256
1260 /* Leave state in case more forcing is required. */ 1257 /* Leave state in case more forcing is required. */
1261 1258
1259 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1262 break; 1260 break;
1263 } 1261 }
1264unlock_ret: 1262 rsp->fqs_active = 0;
1265 spin_unlock_irqrestore(&rsp->fqslock, flags); 1263 if (rsp->fqs_need_gp) {
1264 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1265 rsp->fqs_need_gp = 0;
1266 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1267 return;
1268 }
1269 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1270unlock_fqs_ret:
1271 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1266} 1272}
1267 1273
1268#else /* #ifdef CONFIG_SMP */ 1274#else /* #ifdef CONFIG_SMP */
@@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1290 * If an RCU GP has gone long enough, go check for dyntick 1296 * If an RCU GP has gone long enough, go check for dyntick
1291 * idle CPUs and, if needed, send resched IPIs. 1297 * idle CPUs and, if needed, send resched IPIs.
1292 */ 1298 */
1293 if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1299 if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1294 force_quiescent_state(rsp, 1); 1300 force_quiescent_state(rsp, 1);
1295 1301
1296 /* 1302 /*
@@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1304 1310
1305 /* Does this CPU require a not-yet-started grace period? */ 1311 /* Does this CPU require a not-yet-started grace period? */
1306 if (cpu_needs_another_gp(rsp, rdp)) { 1312 if (cpu_needs_another_gp(rsp, rdp)) {
1307 spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); 1313 raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
1308 rcu_start_gp(rsp, flags); /* releases above lock */ 1314 rcu_start_gp(rsp, flags); /* releases above lock */
1309 } 1315 }
1310 1316
@@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused)
1335 * grace-period manipulations above. 1341 * grace-period manipulations above.
1336 */ 1342 */
1337 smp_mb(); /* See above block comment. */ 1343 smp_mb(); /* See above block comment. */
1344
1345 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1346 rcu_needs_cpu_flush();
1338} 1347}
1339 1348
1340static void 1349static void
@@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1369 unsigned long nestflag; 1378 unsigned long nestflag;
1370 struct rcu_node *rnp_root = rcu_get_root(rsp); 1379 struct rcu_node *rnp_root = rcu_get_root(rsp);
1371 1380
1372 spin_lock_irqsave(&rnp_root->lock, nestflag); 1381 raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
1373 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ 1382 rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */
1374 } 1383 }
1375 1384
@@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1387 force_quiescent_state(rsp, 0); 1396 force_quiescent_state(rsp, 0);
1388 rdp->n_force_qs_snap = rsp->n_force_qs; 1397 rdp->n_force_qs_snap = rsp->n_force_qs;
1389 rdp->qlen_last_fqs_check = rdp->qlen; 1398 rdp->qlen_last_fqs_check = rdp->qlen;
1390 } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) 1399 } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
1391 force_quiescent_state(rsp, 1); 1400 force_quiescent_state(rsp, 1);
1392 local_irq_restore(flags); 1401 local_irq_restore(flags);
1393} 1402}
@@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1520 1529
1521 /* Has an RCU GP gone long enough to send resched IPIs &c? */ 1530 /* Has an RCU GP gone long enough to send resched IPIs &c? */
1522 if (rcu_gp_in_progress(rsp) && 1531 if (rcu_gp_in_progress(rsp) &&
1523 ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { 1532 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) {
1524 rdp->n_rp_need_fqs++; 1533 rdp->n_rp_need_fqs++;
1525 return 1; 1534 return 1;
1526 } 1535 }
@@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu)
1545/* 1554/*
1546 * Check to see if any future RCU-related work will need to be done 1555 * Check to see if any future RCU-related work will need to be done
1547 * by the current CPU, even if none need be done immediately, returning 1556 * by the current CPU, even if none need be done immediately, returning
1548 * 1 if so. This function is part of the RCU implementation; it is -not- 1557 * 1 if so.
1549 * an exported member of the RCU API.
1550 */ 1558 */
1551int rcu_needs_cpu(int cpu) 1559static int rcu_needs_cpu_quick_check(int cpu)
1552{ 1560{
1553 /* RCU callbacks either ready or pending? */ 1561 /* RCU callbacks either ready or pending? */
1554 return per_cpu(rcu_sched_data, cpu).nxtlist || 1562 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu)
1556 rcu_preempt_needs_cpu(cpu); 1564 rcu_preempt_needs_cpu(cpu);
1557} 1565}
1558 1566
1559/*
1560 * This function is invoked towards the end of the scheduler's initialization
1561 * process. Before this is called, the idle task might contain
1562 * RCU read-side critical sections (during which time, this idle
1563 * task is booting the system). After this function is called, the
1564 * idle tasks are prohibited from containing RCU read-side critical
1565 * sections.
1566 */
1567void rcu_scheduler_starting(void)
1568{
1569 WARN_ON(num_online_cpus() != 1);
1570 WARN_ON(nr_context_switches() > 0);
1571 rcu_scheduler_active = 1;
1572}
1573
1574static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 1567static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
1575static atomic_t rcu_barrier_cpu_count; 1568static atomic_t rcu_barrier_cpu_count;
1576static DEFINE_MUTEX(rcu_barrier_mutex); 1569static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1659 struct rcu_node *rnp = rcu_get_root(rsp); 1652 struct rcu_node *rnp = rcu_get_root(rsp);
1660 1653
1661 /* Set up local state, ensuring consistent view of global state. */ 1654 /* Set up local state, ensuring consistent view of global state. */
1662 spin_lock_irqsave(&rnp->lock, flags); 1655 raw_spin_lock_irqsave(&rnp->lock, flags);
1663 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); 1656 rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
1664 rdp->nxtlist = NULL; 1657 rdp->nxtlist = NULL;
1665 for (i = 0; i < RCU_NEXT_SIZE; i++) 1658 for (i = 0; i < RCU_NEXT_SIZE; i++)
@@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1669 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1662 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1670#endif /* #ifdef CONFIG_NO_HZ */ 1663#endif /* #ifdef CONFIG_NO_HZ */
1671 rdp->cpu = cpu; 1664 rdp->cpu = cpu;
1672 spin_unlock_irqrestore(&rnp->lock, flags); 1665 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1673} 1666}
1674 1667
1675/* 1668/*
@@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1687 struct rcu_node *rnp = rcu_get_root(rsp); 1680 struct rcu_node *rnp = rcu_get_root(rsp);
1688 1681
1689 /* Set up local state, ensuring consistent view of global state. */ 1682 /* Set up local state, ensuring consistent view of global state. */
1690 spin_lock_irqsave(&rnp->lock, flags); 1683 raw_spin_lock_irqsave(&rnp->lock, flags);
1691 rdp->passed_quiesc = 0; /* We could be racing with new GP, */ 1684 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1692 rdp->qs_pending = 1; /* so set up to respond to current GP. */ 1685 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1693 rdp->beenonline = 1; /* We have now been online. */ 1686 rdp->beenonline = 1; /* We have now been online. */
@@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1695 rdp->qlen_last_fqs_check = 0; 1688 rdp->qlen_last_fqs_check = 0;
1696 rdp->n_force_qs_snap = rsp->n_force_qs; 1689 rdp->n_force_qs_snap = rsp->n_force_qs;
1697 rdp->blimit = blimit; 1690 rdp->blimit = blimit;
1698 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1691 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1699 1692
1700 /* 1693 /*
1701 * A new grace period might start here. If so, we won't be part 1694 * A new grace period might start here. If so, we won't be part
@@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1703 */ 1696 */
1704 1697
1705 /* Exclude any attempts to start a new GP on large systems. */ 1698 /* Exclude any attempts to start a new GP on large systems. */
1706 spin_lock(&rsp->onofflock); /* irqs already disabled. */ 1699 raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */
1707 1700
1708 /* Add CPU to rcu_node bitmasks. */ 1701 /* Add CPU to rcu_node bitmasks. */
1709 rnp = rdp->mynode; 1702 rnp = rdp->mynode;
1710 mask = rdp->grpmask; 1703 mask = rdp->grpmask;
1711 do { 1704 do {
1712 /* Exclude any attempts to start a new GP on small systems. */ 1705 /* Exclude any attempts to start a new GP on small systems. */
1713 spin_lock(&rnp->lock); /* irqs already disabled. */ 1706 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1714 rnp->qsmaskinit |= mask; 1707 rnp->qsmaskinit |= mask;
1715 mask = rnp->grpmask; 1708 mask = rnp->grpmask;
1716 if (rnp == rdp->mynode) { 1709 if (rnp == rdp->mynode) {
@@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
1718 rdp->completed = rnp->completed; 1711 rdp->completed = rnp->completed;
1719 rdp->passed_quiesc_completed = rnp->completed - 1; 1712 rdp->passed_quiesc_completed = rnp->completed - 1;
1720 } 1713 }
1721 spin_unlock(&rnp->lock); /* irqs already disabled. */ 1714 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1722 rnp = rnp->parent; 1715 rnp = rnp->parent;
1723 } while (rnp != NULL && !(rnp->qsmaskinit & mask)); 1716 } while (rnp != NULL && !(rnp->qsmaskinit & mask));
1724 1717
1725 spin_unlock_irqrestore(&rsp->onofflock, flags); 1718 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
1726} 1719}
1727 1720
1728static void __cpuinit rcu_online_cpu(int cpu) 1721static void __cpuinit rcu_online_cpu(int cpu)
@@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
1806 */ 1799 */
1807static void __init rcu_init_one(struct rcu_state *rsp) 1800static void __init rcu_init_one(struct rcu_state *rsp)
1808{ 1801{
1802 static char *buf[] = { "rcu_node_level_0",
1803 "rcu_node_level_1",
1804 "rcu_node_level_2",
1805 "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */
1809 int cpustride = 1; 1806 int cpustride = 1;
1810 int i; 1807 int i;
1811 int j; 1808 int j;
1812 struct rcu_node *rnp; 1809 struct rcu_node *rnp;
1813 1810
1811 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
1812
1814 /* Initialize the level-tracking arrays. */ 1813 /* Initialize the level-tracking arrays. */
1815 1814
1816 for (i = 1; i < NUM_RCU_LVLS; i++) 1815 for (i = 1; i < NUM_RCU_LVLS; i++)
@@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1823 cpustride *= rsp->levelspread[i]; 1822 cpustride *= rsp->levelspread[i];
1824 rnp = rsp->level[i]; 1823 rnp = rsp->level[i];
1825 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 1824 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
1826 spin_lock_init(&rnp->lock); 1825 raw_spin_lock_init(&rnp->lock);
1827 lockdep_set_class(&rnp->lock, &rcu_node_class[i]); 1826 lockdep_set_class_and_name(&rnp->lock,
1827 &rcu_node_class[i], buf[i]);
1828 rnp->gpnum = 0; 1828 rnp->gpnum = 0;
1829 rnp->qsmask = 0; 1829 rnp->qsmask = 0;
1830 rnp->qsmaskinit = 0; 1830 rnp->qsmaskinit = 0;
@@ -1876,7 +1876,7 @@ do { \
1876 1876
1877void __init rcu_init(void) 1877void __init rcu_init(void)
1878{ 1878{
1879 int i; 1879 int cpu;
1880 1880
1881 rcu_bootup_announce(); 1881 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
@@ -1896,8 +1896,8 @@ void __init rcu_init(void)
1896 * or the scheduler are operational. 1896 * or the scheduler are operational.
1897 */ 1897 */
1898 cpu_notifier(rcu_cpu_notify, 0); 1898 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(i) 1899 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); 1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1901} 1901}
1902 1902
1903#include "rcutree_plugin.h" 1903#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index d2a0046f63b2..1439eb504c22 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -90,12 +90,12 @@ struct rcu_dynticks {
90 * Definition for node within the RCU grace-period-detection hierarchy. 90 * Definition for node within the RCU grace-period-detection hierarchy.
91 */ 91 */
92struct rcu_node { 92struct rcu_node {
93 spinlock_t lock; /* Root rcu_node's lock protects some */ 93 raw_spinlock_t lock; /* Root rcu_node's lock protects some */
94 /* rcu_state fields as well as following. */ 94 /* rcu_state fields as well as following. */
95 long gpnum; /* Current grace period for this node. */ 95 unsigned long gpnum; /* Current grace period for this node. */
96 /* This will either be equal to or one */ 96 /* This will either be equal to or one */
97 /* behind the root rcu_node's gpnum. */ 97 /* behind the root rcu_node's gpnum. */
98 long completed; /* Last grace period completed for this node. */ 98 unsigned long completed; /* Last GP completed for this node. */
99 /* This will either be equal to or one */ 99 /* This will either be equal to or one */
100 /* behind the root rcu_node's gpnum. */ 100 /* behind the root rcu_node's gpnum. */
101 unsigned long qsmask; /* CPUs or groups that need to switch in */ 101 unsigned long qsmask; /* CPUs or groups that need to switch in */
@@ -161,11 +161,11 @@ struct rcu_node {
161/* Per-CPU data for read-copy update. */ 161/* Per-CPU data for read-copy update. */
162struct rcu_data { 162struct rcu_data {
163 /* 1) quiescent-state and grace-period handling : */ 163 /* 1) quiescent-state and grace-period handling : */
164 long completed; /* Track rsp->completed gp number */ 164 unsigned long completed; /* Track rsp->completed gp number */
165 /* in order to detect GP end. */ 165 /* in order to detect GP end. */
166 long gpnum; /* Highest gp number that this CPU */ 166 unsigned long gpnum; /* Highest gp number that this CPU */
167 /* is aware of having started. */ 167 /* is aware of having started. */
168 long passed_quiesc_completed; 168 unsigned long passed_quiesc_completed;
169 /* Value of completed at time of qs. */ 169 /* Value of completed at time of qs. */
170 bool passed_quiesc; /* User-mode/idle loop etc. */ 170 bool passed_quiesc; /* User-mode/idle loop etc. */
171 bool qs_pending; /* Core waits for quiesc state. */ 171 bool qs_pending; /* Core waits for quiesc state. */
@@ -221,14 +221,14 @@ struct rcu_data {
221 unsigned long resched_ipi; /* Sent a resched IPI. */ 221 unsigned long resched_ipi; /* Sent a resched IPI. */
222 222
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 long n_rp_cb_ready; 226 unsigned long n_rp_cb_ready;
227 long n_rp_cpu_needs_gp; 227 unsigned long n_rp_cpu_needs_gp;
228 long n_rp_gp_completed; 228 unsigned long n_rp_gp_completed;
229 long n_rp_gp_started; 229 unsigned long n_rp_gp_started;
230 long n_rp_need_fqs; 230 unsigned long n_rp_need_fqs;
231 long n_rp_need_nothing; 231 unsigned long n_rp_need_nothing;
232 232
233 int cpu; 233 int cpu;
234}; 234};
@@ -237,12 +237,11 @@ struct rcu_data {
237#define RCU_GP_IDLE 0 /* No grace period in progress. */ 237#define RCU_GP_IDLE 0 /* No grace period in progress. */
238#define RCU_GP_INIT 1 /* Grace period being initialized. */ 238#define RCU_GP_INIT 1 /* Grace period being initialized. */
239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 239#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
240#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ 240#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
241#define RCU_FORCE_QS 4 /* Need to force quiescent state. */
242#ifdef CONFIG_NO_HZ 241#ifdef CONFIG_NO_HZ
243#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 242#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
244#else /* #ifdef CONFIG_NO_HZ */ 243#else /* #ifdef CONFIG_NO_HZ */
245#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED 244#define RCU_SIGNAL_INIT RCU_FORCE_QS
246#endif /* #else #ifdef CONFIG_NO_HZ */ 245#endif /* #else #ifdef CONFIG_NO_HZ */
247 246
248#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 247#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
@@ -256,6 +255,9 @@ struct rcu_data {
256 255
257#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 256#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
258 257
258#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
259#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
260
259/* 261/*
260 * RCU global state, including node hierarchy. This hierarchy is 262 * RCU global state, including node hierarchy. This hierarchy is
261 * represented in "heap" form in a dense array. The root (first level) 263 * represented in "heap" form in a dense array. The root (first level)
@@ -277,12 +279,19 @@ struct rcu_state {
277 279
278 u8 signaled ____cacheline_internodealigned_in_smp; 280 u8 signaled ____cacheline_internodealigned_in_smp;
279 /* Force QS state. */ 281 /* Force QS state. */
280 long gpnum; /* Current gp number. */ 282 u8 fqs_active; /* force_quiescent_state() */
281 long completed; /* # of last completed gp. */ 283 /* is running. */
284 u8 fqs_need_gp; /* A CPU was prevented from */
285 /* starting a new grace */
286 /* period because */
287 /* force_quiescent_state() */
288 /* was running. */
289 unsigned long gpnum; /* Current gp number. */
290 unsigned long completed; /* # of last completed gp. */
282 291
283 /* End of fields guarded by root rcu_node's lock. */ 292 /* End of fields guarded by root rcu_node's lock. */
284 293
285 spinlock_t onofflock; /* exclude on/offline and */ 294 raw_spinlock_t onofflock; /* exclude on/offline and */
286 /* starting new GP. Also */ 295 /* starting new GP. Also */
287 /* protects the following */ 296 /* protects the following */
288 /* orphan_cbs fields. */ 297 /* orphan_cbs fields. */
@@ -292,10 +301,8 @@ struct rcu_state {
292 /* going offline. */ 301 /* going offline. */
293 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ 302 struct rcu_head **orphan_cbs_tail; /* And tail pointer. */
294 long orphan_qlen; /* Number of orphaned cbs. */ 303 long orphan_qlen; /* Number of orphaned cbs. */
295 spinlock_t fqslock; /* Only one task forcing */ 304 raw_spinlock_t fqslock; /* Only one task forcing */
296 /* quiescent states. */ 305 /* quiescent states. */
297 long completed_fqs; /* Value of completed @ snap. */
298 /* Protected by fqslock. */
299 unsigned long jiffies_force_qs; /* Time at which to invoke */ 306 unsigned long jiffies_force_qs; /* Time at which to invoke */
300 /* force_quiescent_state(). */ 307 /* force_quiescent_state(). */
301 unsigned long n_force_qs; /* Number of calls to */ 308 unsigned long n_force_qs; /* Number of calls to */
@@ -319,8 +326,6 @@ struct rcu_state {
319#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ 326#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */
320 /* GP were moved to root. */ 327 /* GP were moved to root. */
321 328
322#ifdef RCU_TREE_NONCORE
323
324/* 329/*
325 * RCU implementation internal declarations: 330 * RCU implementation internal declarations:
326 */ 331 */
@@ -335,7 +340,7 @@ extern struct rcu_state rcu_preempt_state;
335DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 340DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
336#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 341#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
337 342
338#else /* #ifdef RCU_TREE_NONCORE */ 343#ifndef RCU_TREE_NONCORE
339 344
340/* Forward declarations for rcutree_plugin.h */ 345/* Forward declarations for rcutree_plugin.h */
341static void rcu_bootup_announce(void); 346static void rcu_bootup_announce(void);
@@ -347,6 +352,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
347 unsigned long flags); 352 unsigned long flags);
348#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 353#endif /* #ifdef CONFIG_HOTPLUG_CPU */
349#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 354#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
355static void rcu_print_detail_task_stall(struct rcu_state *rsp);
350static void rcu_print_task_stall(struct rcu_node *rnp); 356static void rcu_print_task_stall(struct rcu_node *rnp);
351#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 357#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
352static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 358static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
@@ -367,5 +373,6 @@ static int rcu_preempt_needs_cpu(int cpu);
367static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 373static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
368static void rcu_preempt_send_cbs_to_orphanage(void); 374static void rcu_preempt_send_cbs_to_orphanage(void);
369static void __init __rcu_init_preempt(void); 375static void __init __rcu_init_preempt(void);
376static void rcu_needs_cpu_flush(void);
370 377
371#endif /* #else #ifdef RCU_TREE_NONCORE */ 378#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 37fbccdf41d5..464ad2cdee00 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -62,6 +62,15 @@ long rcu_batches_completed(void)
62EXPORT_SYMBOL_GPL(rcu_batches_completed); 62EXPORT_SYMBOL_GPL(rcu_batches_completed);
63 63
64/* 64/*
65 * Force a quiescent state for preemptible RCU.
66 */
67void rcu_force_quiescent_state(void)
68{
69 force_quiescent_state(&rcu_preempt_state, 0);
70}
71EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
72
73/*
65 * Record a preemptable-RCU quiescent state for the specified CPU. Note 74 * Record a preemptable-RCU quiescent state for the specified CPU. Note
66 * that this just means that the task currently running on the CPU is 75 * that this just means that the task currently running on the CPU is
67 * not in a quiescent state. There might be any number of tasks blocked 76 * not in a quiescent state. There might be any number of tasks blocked
@@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu)
102 /* Possibly blocking in an RCU read-side critical section. */ 111 /* Possibly blocking in an RCU read-side critical section. */
103 rdp = rcu_preempt_state.rda[cpu]; 112 rdp = rcu_preempt_state.rda[cpu];
104 rnp = rdp->mynode; 113 rnp = rdp->mynode;
105 spin_lock_irqsave(&rnp->lock, flags); 114 raw_spin_lock_irqsave(&rnp->lock, flags);
106 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 115 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED;
107 t->rcu_blocked_node = rnp; 116 t->rcu_blocked_node = rnp;
108 117
@@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu)
123 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); 132 WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
124 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; 133 phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
125 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); 134 list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
126 spin_unlock_irqrestore(&rnp->lock, flags); 135 raw_spin_unlock_irqrestore(&rnp->lock, flags);
127 } 136 }
128 137
129 /* 138 /*
@@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
180 struct rcu_node *rnp_p; 189 struct rcu_node *rnp_p;
181 190
182 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { 191 if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
183 spin_unlock_irqrestore(&rnp->lock, flags); 192 raw_spin_unlock_irqrestore(&rnp->lock, flags);
184 return; /* Still need more quiescent states! */ 193 return; /* Still need more quiescent states! */
185 } 194 }
186 195
@@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
197 206
198 /* Report up the rest of the hierarchy. */ 207 /* Report up the rest of the hierarchy. */
199 mask = rnp->grpmask; 208 mask = rnp->grpmask;
200 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 209 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
201 spin_lock(&rnp_p->lock); /* irqs already disabled. */ 210 raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */
202 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); 211 rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
203} 212}
204 213
@@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t)
248 */ 257 */
249 for (;;) { 258 for (;;) {
250 rnp = t->rcu_blocked_node; 259 rnp = t->rcu_blocked_node;
251 spin_lock(&rnp->lock); /* irqs already disabled. */ 260 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
252 if (rnp == t->rcu_blocked_node) 261 if (rnp == t->rcu_blocked_node)
253 break; 262 break;
254 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 263 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
255 } 264 }
256 empty = !rcu_preempted_readers(rnp); 265 empty = !rcu_preempted_readers(rnp);
257 empty_exp = !rcu_preempted_readers_exp(rnp); 266 empty_exp = !rcu_preempted_readers_exp(rnp);
@@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
265 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 274 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
266 */ 275 */
267 if (empty) 276 if (empty)
268 spin_unlock_irqrestore(&rnp->lock, flags); 277 raw_spin_unlock_irqrestore(&rnp->lock, flags);
269 else 278 else
270 rcu_report_unblock_qs_rnp(rnp, flags); 279 rcu_report_unblock_qs_rnp(rnp, flags);
271 280
@@ -295,29 +304,73 @@ void __rcu_read_unlock(void)
295 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && 304 if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 &&
296 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 305 unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
297 rcu_read_unlock_special(t); 306 rcu_read_unlock_special(t);
307#ifdef CONFIG_PROVE_LOCKING
308 WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0);
309#endif /* #ifdef CONFIG_PROVE_LOCKING */
298} 310}
299EXPORT_SYMBOL_GPL(__rcu_read_unlock); 311EXPORT_SYMBOL_GPL(__rcu_read_unlock);
300 312
301#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 313#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
302 314
315#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
316
317/*
318 * Dump detailed information for all tasks blocking the current RCU
319 * grace period on the specified rcu_node structure.
320 */
321static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
322{
323 unsigned long flags;
324 struct list_head *lp;
325 int phase;
326 struct task_struct *t;
327
328 if (rcu_preempted_readers(rnp)) {
329 raw_spin_lock_irqsave(&rnp->lock, flags);
330 phase = rnp->gpnum & 0x1;
331 lp = &rnp->blocked_tasks[phase];
332 list_for_each_entry(t, lp, rcu_node_entry)
333 sched_show_task(t);
334 raw_spin_unlock_irqrestore(&rnp->lock, flags);
335 }
336}
337
338/*
339 * Dump detailed information for all tasks blocking the current RCU
340 * grace period.
341 */
342static void rcu_print_detail_task_stall(struct rcu_state *rsp)
343{
344 struct rcu_node *rnp = rcu_get_root(rsp);
345
346 rcu_print_detail_task_stall_rnp(rnp);
347 rcu_for_each_leaf_node(rsp, rnp)
348 rcu_print_detail_task_stall_rnp(rnp);
349}
350
351#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
352
353static void rcu_print_detail_task_stall(struct rcu_state *rsp)
354{
355}
356
357#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */
358
303/* 359/*
304 * Scan the current list of tasks blocked within RCU read-side critical 360 * Scan the current list of tasks blocked within RCU read-side critical
305 * sections, printing out the tid of each. 361 * sections, printing out the tid of each.
306 */ 362 */
307static void rcu_print_task_stall(struct rcu_node *rnp) 363static void rcu_print_task_stall(struct rcu_node *rnp)
308{ 364{
309 unsigned long flags;
310 struct list_head *lp; 365 struct list_head *lp;
311 int phase; 366 int phase;
312 struct task_struct *t; 367 struct task_struct *t;
313 368
314 if (rcu_preempted_readers(rnp)) { 369 if (rcu_preempted_readers(rnp)) {
315 spin_lock_irqsave(&rnp->lock, flags);
316 phase = rnp->gpnum & 0x1; 370 phase = rnp->gpnum & 0x1;
317 lp = &rnp->blocked_tasks[phase]; 371 lp = &rnp->blocked_tasks[phase];
318 list_for_each_entry(t, lp, rcu_node_entry) 372 list_for_each_entry(t, lp, rcu_node_entry)
319 printk(" P%d", t->pid); 373 printk(" P%d", t->pid);
320 spin_unlock_irqrestore(&rnp->lock, flags);
321 } 374 }
322} 375}
323 376
@@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
388 lp_root = &rnp_root->blocked_tasks[i]; 441 lp_root = &rnp_root->blocked_tasks[i];
389 while (!list_empty(lp)) { 442 while (!list_empty(lp)) {
390 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); 443 tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
391 spin_lock(&rnp_root->lock); /* irqs already disabled */ 444 raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
392 list_del(&tp->rcu_node_entry); 445 list_del(&tp->rcu_node_entry);
393 tp->rcu_blocked_node = rnp_root; 446 tp->rcu_blocked_node = rnp_root;
394 list_add(&tp->rcu_node_entry, lp_root); 447 list_add(&tp->rcu_node_entry, lp_root);
395 spin_unlock(&rnp_root->lock); /* irqs remain disabled */ 448 raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
396 } 449 }
397 } 450 }
398 return retval; 451 return retval;
@@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
516 unsigned long flags; 569 unsigned long flags;
517 unsigned long mask; 570 unsigned long mask;
518 571
519 spin_lock_irqsave(&rnp->lock, flags); 572 raw_spin_lock_irqsave(&rnp->lock, flags);
520 for (;;) { 573 for (;;) {
521 if (!sync_rcu_preempt_exp_done(rnp)) 574 if (!sync_rcu_preempt_exp_done(rnp))
522 break; 575 break;
@@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
525 break; 578 break;
526 } 579 }
527 mask = rnp->grpmask; 580 mask = rnp->grpmask;
528 spin_unlock(&rnp->lock); /* irqs remain disabled */ 581 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
529 rnp = rnp->parent; 582 rnp = rnp->parent;
530 spin_lock(&rnp->lock); /* irqs already disabled */ 583 raw_spin_lock(&rnp->lock); /* irqs already disabled */
531 rnp->expmask &= ~mask; 584 rnp->expmask &= ~mask;
532 } 585 }
533 spin_unlock_irqrestore(&rnp->lock, flags); 586 raw_spin_unlock_irqrestore(&rnp->lock, flags);
534} 587}
535 588
536/* 589/*
@@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
545{ 598{
546 int must_wait; 599 int must_wait;
547 600
548 spin_lock(&rnp->lock); /* irqs already disabled */ 601 raw_spin_lock(&rnp->lock); /* irqs already disabled */
549 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); 602 list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
550 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); 603 list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
551 must_wait = rcu_preempted_readers_exp(rnp); 604 must_wait = rcu_preempted_readers_exp(rnp);
552 spin_unlock(&rnp->lock); /* irqs remain disabled */ 605 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
553 if (!must_wait) 606 if (!must_wait)
554 rcu_report_exp_rnp(rsp, rnp); 607 rcu_report_exp_rnp(rsp, rnp);
555} 608}
@@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void)
594 /* force all RCU readers onto blocked_tasks[]. */ 647 /* force all RCU readers onto blocked_tasks[]. */
595 synchronize_sched_expedited(); 648 synchronize_sched_expedited();
596 649
597 spin_lock_irqsave(&rsp->onofflock, flags); 650 raw_spin_lock_irqsave(&rsp->onofflock, flags);
598 651
599 /* Initialize ->expmask for all non-leaf rcu_node structures. */ 652 /* Initialize ->expmask for all non-leaf rcu_node structures. */
600 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { 653 rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
601 spin_lock(&rnp->lock); /* irqs already disabled. */ 654 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
602 rnp->expmask = rnp->qsmaskinit; 655 rnp->expmask = rnp->qsmaskinit;
603 spin_unlock(&rnp->lock); /* irqs remain disabled. */ 656 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
604 } 657 }
605 658
606 /* Snapshot current state of ->blocked_tasks[] lists. */ 659 /* Snapshot current state of ->blocked_tasks[] lists. */
@@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void)
609 if (NUM_RCU_NODES > 1) 662 if (NUM_RCU_NODES > 1)
610 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); 663 sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
611 664
612 spin_unlock_irqrestore(&rsp->onofflock, flags); 665 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
613 666
614 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ 667 /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
615 rnp = rcu_get_root(rsp); 668 rnp = rcu_get_root(rsp);
@@ -713,6 +766,16 @@ long rcu_batches_completed(void)
713EXPORT_SYMBOL_GPL(rcu_batches_completed); 766EXPORT_SYMBOL_GPL(rcu_batches_completed);
714 767
715/* 768/*
769 * Force a quiescent state for RCU, which, because there is no preemptible
770 * RCU, becomes the same as rcu-sched.
771 */
772void rcu_force_quiescent_state(void)
773{
774 rcu_sched_force_quiescent_state();
775}
776EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
777
778/*
716 * Because preemptable RCU does not exist, we never have to check for 779 * Because preemptable RCU does not exist, we never have to check for
717 * CPUs being in quiescent states. 780 * CPUs being in quiescent states.
718 */ 781 */
@@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp)
734/* Because preemptible RCU does not exist, no quieting of tasks. */ 797/* Because preemptible RCU does not exist, no quieting of tasks. */
735static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) 798static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
736{ 799{
737 spin_unlock_irqrestore(&rnp->lock, flags); 800 raw_spin_unlock_irqrestore(&rnp->lock, flags);
738} 801}
739 802
740#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 803#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
745 * Because preemptable RCU does not exist, we never have to check for 808 * Because preemptable RCU does not exist, we never have to check for
746 * tasks blocked within RCU read-side critical sections. 809 * tasks blocked within RCU read-side critical sections.
747 */ 810 */
811static void rcu_print_detail_task_stall(struct rcu_state *rsp)
812{
813}
814
815/*
816 * Because preemptable RCU does not exist, we never have to check for
817 * tasks blocked within RCU read-side critical sections.
818 */
748static void rcu_print_task_stall(struct rcu_node *rnp) 819static void rcu_print_task_stall(struct rcu_node *rnp)
749{ 820{
750} 821}
@@ -884,3 +955,113 @@ static void __init __rcu_init_preempt(void)
884} 955}
885 956
886#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ 957#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
958
959#if !defined(CONFIG_RCU_FAST_NO_HZ)
960
961/*
962 * Check to see if any future RCU-related work will need to be done
963 * by the current CPU, even if none need be done immediately, returning
964 * 1 if so. This function is part of the RCU implementation; it is -not-
965 * an exported member of the RCU API.
966 *
967 * Because we have preemptible RCU, just check whether this CPU needs
968 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption
969 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
970 */
971int rcu_needs_cpu(int cpu)
972{
973 return rcu_needs_cpu_quick_check(cpu);
974}
975
976/*
977 * Check to see if we need to continue a callback-flush operations to
978 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
979 * entry is not configured, so we never do need to.
980 */
981static void rcu_needs_cpu_flush(void)
982{
983}
984
985#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
986
987#define RCU_NEEDS_CPU_FLUSHES 5
988static DEFINE_PER_CPU(int, rcu_dyntick_drain);
989static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
990
991/*
992 * Check to see if any future RCU-related work will need to be done
993 * by the current CPU, even if none need be done immediately, returning
994 * 1 if so. This function is part of the RCU implementation; it is -not-
995 * an exported member of the RCU API.
996 *
997 * Because we are not supporting preemptible RCU, attempt to accelerate
998 * any current grace periods so that RCU no longer needs this CPU, but
999 * only if all other CPUs are already in dynticks-idle mode. This will
1000 * allow the CPU cores to be powered down immediately, as opposed to after
1001 * waiting many milliseconds for grace periods to elapse.
1002 *
1003 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1004 * disabled, we do one pass of force_quiescent_state(), then do a
1005 * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
1006 * The per-cpu rcu_dyntick_drain variable controls the sequencing.
1007 */
1008int rcu_needs_cpu(int cpu)
1009{
1010 int c = 0;
1011 int thatcpu;
1012
1013 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1014 for_each_cpu_not(thatcpu, nohz_cpu_mask)
1015 if (thatcpu != cpu) {
1016 per_cpu(rcu_dyntick_drain, cpu) = 0;
1017 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1018 return rcu_needs_cpu_quick_check(cpu);
1019 }
1020
1021 /* Check and update the rcu_dyntick_drain sequencing. */
1022 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1023 /* First time through, initialize the counter. */
1024 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES;
1025 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
1026 /* We have hit the limit, so time to give up. */
1027 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1028 return rcu_needs_cpu_quick_check(cpu);
1029 }
1030
1031 /* Do one step pushing remaining RCU callbacks through. */
1032 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
1033 rcu_sched_qs(cpu);
1034 force_quiescent_state(&rcu_sched_state, 0);
1035 c = c || per_cpu(rcu_sched_data, cpu).nxtlist;
1036 }
1037 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
1038 rcu_bh_qs(cpu);
1039 force_quiescent_state(&rcu_bh_state, 0);
1040 c = c || per_cpu(rcu_bh_data, cpu).nxtlist;
1041 }
1042
1043 /* If RCU callbacks are still pending, RCU still needs this CPU. */
1044 if (c) {
1045 raise_softirq(RCU_SOFTIRQ);
1046 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
1047 }
1048 return c;
1049}
1050
1051/*
1052 * Check to see if we need to continue a callback-flush operations to
1053 * allow the last CPU to enter dyntick-idle mode.
1054 */
1055static void rcu_needs_cpu_flush(void)
1056{
1057 int cpu = smp_processor_id();
1058 unsigned long flags;
1059
1060 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
1061 return;
1062 local_irq_save(flags);
1063 (void)rcu_needs_cpu(cpu);
1064 local_irq_restore(flags);
1065}
1066
1067#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9d2c88423b31..d45db2e35d27 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
50{ 50{
51 if (!rdp->beenonline) 51 if (!rdp->beenonline)
52 return; 52 return;
53 seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", 53 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d",
54 rdp->cpu, 54 rdp->cpu,
55 cpu_is_offline(rdp->cpu) ? '!' : ' ', 55 cpu_is_offline(rdp->cpu) ? '!' : ' ',
56 rdp->completed, rdp->gpnum, 56 rdp->completed, rdp->gpnum,
@@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
105{ 105{
106 if (!rdp->beenonline) 106 if (!rdp->beenonline)
107 return; 107 return;
108 seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", 108 seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d",
109 rdp->cpu, 109 rdp->cpu,
110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 110 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
111 rdp->completed, rdp->gpnum, 111 rdp->completed, rdp->gpnum,
@@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = {
155 155
156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) 156static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
157{ 157{
158 long gpnum; 158 unsigned long gpnum;
159 int level = 0; 159 int level = 0;
160 int phase; 160 int phase;
161 struct rcu_node *rnp; 161 struct rcu_node *rnp;
162 162
163 gpnum = rsp->gpnum; 163 gpnum = rsp->gpnum;
164 seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " 164 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", 165 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
166 rsp->completed, gpnum, rsp->signaled, 166 rsp->completed, gpnum, rsp->signaled,
167 (long)(rsp->jiffies_force_qs - jiffies), 167 (long)(rsp->jiffies_force_qs - jiffies),
@@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = {
215static int show_rcugp(struct seq_file *m, void *unused) 215static int show_rcugp(struct seq_file *m, void *unused)
216{ 216{
217#ifdef CONFIG_TREE_PREEMPT_RCU 217#ifdef CONFIG_TREE_PREEMPT_RCU
218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", 218 seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
219 rcu_preempt_state.completed, rcu_preempt_state.gpnum); 219 rcu_preempt_state.completed, rcu_preempt_state.gpnum);
220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 220#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", 221 seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
222 rcu_sched_state.completed, rcu_sched_state.gpnum); 222 rcu_sched_state.completed, rcu_sched_state.gpnum);
223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", 223 seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
224 rcu_bh_state.completed, rcu_bh_state.gpnum); 224 rcu_bh_state.completed, rcu_bh_state.gpnum);
225 return 0; 225 return 0;
226} 226}
diff --git a/kernel/relay.c b/kernel/relay.c
index 760c26209a3c..c705a41b4ba3 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1198,7 +1198,7 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
1198 relay_consume_bytes(rbuf, buf->private); 1198 relay_consume_bytes(rbuf, buf->private);
1199} 1199}
1200 1200
1201static struct pipe_buf_operations relay_pipe_buf_ops = { 1201static const struct pipe_buf_operations relay_pipe_buf_ops = {
1202 .can_merge = 0, 1202 .can_merge = 0,
1203 .map = generic_pipe_buf_map, 1203 .map = generic_pipe_buf_map,
1204 .unmap = generic_pipe_buf_unmap, 1204 .unmap = generic_pipe_buf_unmap,
diff --git a/kernel/resource.c b/kernel/resource.c
index fb11a58b9594..4e9d87fd7bc5 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -188,6 +188,36 @@ static int __release_resource(struct resource *old)
188 return -EINVAL; 188 return -EINVAL;
189} 189}
190 190
191static void __release_child_resources(struct resource *r)
192{
193 struct resource *tmp, *p;
194 resource_size_t size;
195
196 p = r->child;
197 r->child = NULL;
198 while (p) {
199 tmp = p;
200 p = p->sibling;
201
202 tmp->parent = NULL;
203 tmp->sibling = NULL;
204 __release_child_resources(tmp);
205
206 printk(KERN_DEBUG "release child resource %pR\n", tmp);
207 /* need to restore size, and keep flags */
208 size = resource_size(tmp);
209 tmp->start = 0;
210 tmp->end = size - 1;
211 }
212}
213
214void release_child_resources(struct resource *r)
215{
216 write_lock(&resource_lock);
217 __release_child_resources(r);
218 write_unlock(&resource_lock);
219}
220
191/** 221/**
192 * request_resource - request and reserve an I/O or memory resource 222 * request_resource - request and reserve an I/O or memory resource
193 * @root: root resource descriptor 223 * @root: root resource descriptor
@@ -297,46 +327,63 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
297 327
298#endif 328#endif
299 329
330static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
331{
332 return 1;
333}
334/*
335 * This generic page_is_ram() returns true if specified address is
336 * registered as "System RAM" in iomem_resource list.
337 */
338int __weak page_is_ram(unsigned long pfn)
339{
340 return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
341}
342
300/* 343/*
301 * Find empty slot in the resource tree given range and alignment. 344 * Find empty slot in the resource tree given range and alignment.
302 */ 345 */
303static int find_resource(struct resource *root, struct resource *new, 346static int find_resource(struct resource *root, struct resource *new,
304 resource_size_t size, resource_size_t min, 347 resource_size_t size, resource_size_t min,
305 resource_size_t max, resource_size_t align, 348 resource_size_t max, resource_size_t align,
306 void (*alignf)(void *, struct resource *, 349 resource_size_t (*alignf)(void *,
307 resource_size_t, resource_size_t), 350 const struct resource *,
351 resource_size_t,
352 resource_size_t),
308 void *alignf_data) 353 void *alignf_data)
309{ 354{
310 struct resource *this = root->child; 355 struct resource *this = root->child;
356 struct resource tmp = *new;
311 357
312 new->start = root->start; 358 tmp.start = root->start;
313 /* 359 /*
314 * Skip past an allocated resource that starts at 0, since the assignment 360 * Skip past an allocated resource that starts at 0, since the assignment
315 * of this->start - 1 to new->end below would cause an underflow. 361 * of this->start - 1 to tmp->end below would cause an underflow.
316 */ 362 */
317 if (this && this->start == 0) { 363 if (this && this->start == 0) {
318 new->start = this->end + 1; 364 tmp.start = this->end + 1;
319 this = this->sibling; 365 this = this->sibling;
320 } 366 }
321 for(;;) { 367 for(;;) {
322 if (this) 368 if (this)
323 new->end = this->start - 1; 369 tmp.end = this->start - 1;
324 else 370 else
325 new->end = root->end; 371 tmp.end = root->end;
326 if (new->start < min) 372 if (tmp.start < min)
327 new->start = min; 373 tmp.start = min;
328 if (new->end > max) 374 if (tmp.end > max)
329 new->end = max; 375 tmp.end = max;
330 new->start = ALIGN(new->start, align); 376 tmp.start = ALIGN(tmp.start, align);
331 if (alignf) 377 if (alignf)
332 alignf(alignf_data, new, size, align); 378 tmp.start = alignf(alignf_data, &tmp, size, align);
333 if (new->start < new->end && new->end - new->start >= size - 1) { 379 if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
334 new->end = new->start + size - 1; 380 new->start = tmp.start;
381 new->end = tmp.start + size - 1;
335 return 0; 382 return 0;
336 } 383 }
337 if (!this) 384 if (!this)
338 break; 385 break;
339 new->start = this->end + 1; 386 tmp.start = this->end + 1;
340 this = this->sibling; 387 this = this->sibling;
341 } 388 }
342 return -EBUSY; 389 return -EBUSY;
@@ -356,8 +403,10 @@ static int find_resource(struct resource *root, struct resource *new,
356int allocate_resource(struct resource *root, struct resource *new, 403int allocate_resource(struct resource *root, struct resource *new,
357 resource_size_t size, resource_size_t min, 404 resource_size_t size, resource_size_t min,
358 resource_size_t max, resource_size_t align, 405 resource_size_t max, resource_size_t align,
359 void (*alignf)(void *, struct resource *, 406 resource_size_t (*alignf)(void *,
360 resource_size_t, resource_size_t), 407 const struct resource *,
408 resource_size_t,
409 resource_size_t),
361 void *alignf_data) 410 void *alignf_data)
362{ 411{
363 int err; 412 int err;
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 5fcb4fe645e2..ddabb54bb5c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -37,8 +37,8 @@ do { \
37 if (rt_trace_on) { \ 37 if (rt_trace_on) { \
38 rt_trace_on = 0; \ 38 rt_trace_on = 0; \
39 console_verbose(); \ 39 console_verbose(); \
40 if (spin_is_locked(&current->pi_lock)) \ 40 if (raw_spin_is_locked(&current->pi_lock)) \
41 spin_unlock(&current->pi_lock); \ 41 raw_spin_unlock(&current->pi_lock); \
42 } \ 42 } \
43} while (0) 43} while (0)
44 44
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 29bd4baf9e75..a9604815786a 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -138,9 +138,9 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
138{ 138{
139 unsigned long flags; 139 unsigned long flags;
140 140
141 spin_lock_irqsave(&task->pi_lock, flags); 141 raw_spin_lock_irqsave(&task->pi_lock, flags);
142 __rt_mutex_adjust_prio(task); 142 __rt_mutex_adjust_prio(task);
143 spin_unlock_irqrestore(&task->pi_lock, flags); 143 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
144} 144}
145 145
146/* 146/*
@@ -195,7 +195,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
195 /* 195 /*
196 * Task can not go away as we did a get_task() before ! 196 * Task can not go away as we did a get_task() before !
197 */ 197 */
198 spin_lock_irqsave(&task->pi_lock, flags); 198 raw_spin_lock_irqsave(&task->pi_lock, flags);
199 199
200 waiter = task->pi_blocked_on; 200 waiter = task->pi_blocked_on;
201 /* 201 /*
@@ -231,8 +231,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
231 goto out_unlock_pi; 231 goto out_unlock_pi;
232 232
233 lock = waiter->lock; 233 lock = waiter->lock;
234 if (!spin_trylock(&lock->wait_lock)) { 234 if (!raw_spin_trylock(&lock->wait_lock)) {
235 spin_unlock_irqrestore(&task->pi_lock, flags); 235 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
236 cpu_relax(); 236 cpu_relax();
237 goto retry; 237 goto retry;
238 } 238 }
@@ -240,7 +240,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
240 /* Deadlock detection */ 240 /* Deadlock detection */
241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { 241 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); 242 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
243 spin_unlock(&lock->wait_lock); 243 raw_spin_unlock(&lock->wait_lock);
244 ret = deadlock_detect ? -EDEADLK : 0; 244 ret = deadlock_detect ? -EDEADLK : 0;
245 goto out_unlock_pi; 245 goto out_unlock_pi;
246 } 246 }
@@ -253,13 +253,13 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
253 plist_add(&waiter->list_entry, &lock->wait_list); 253 plist_add(&waiter->list_entry, &lock->wait_list);
254 254
255 /* Release the task */ 255 /* Release the task */
256 spin_unlock_irqrestore(&task->pi_lock, flags); 256 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
257 put_task_struct(task); 257 put_task_struct(task);
258 258
259 /* Grab the next task */ 259 /* Grab the next task */
260 task = rt_mutex_owner(lock); 260 task = rt_mutex_owner(lock);
261 get_task_struct(task); 261 get_task_struct(task);
262 spin_lock_irqsave(&task->pi_lock, flags); 262 raw_spin_lock_irqsave(&task->pi_lock, flags);
263 263
264 if (waiter == rt_mutex_top_waiter(lock)) { 264 if (waiter == rt_mutex_top_waiter(lock)) {
265 /* Boost the owner */ 265 /* Boost the owner */
@@ -277,10 +277,10 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
277 __rt_mutex_adjust_prio(task); 277 __rt_mutex_adjust_prio(task);
278 } 278 }
279 279
280 spin_unlock_irqrestore(&task->pi_lock, flags); 280 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
281 281
282 top_waiter = rt_mutex_top_waiter(lock); 282 top_waiter = rt_mutex_top_waiter(lock);
283 spin_unlock(&lock->wait_lock); 283 raw_spin_unlock(&lock->wait_lock);
284 284
285 if (!detect_deadlock && waiter != top_waiter) 285 if (!detect_deadlock && waiter != top_waiter)
286 goto out_put_task; 286 goto out_put_task;
@@ -288,7 +288,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
288 goto again; 288 goto again;
289 289
290 out_unlock_pi: 290 out_unlock_pi:
291 spin_unlock_irqrestore(&task->pi_lock, flags); 291 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
292 out_put_task: 292 out_put_task:
293 put_task_struct(task); 293 put_task_struct(task);
294 294
@@ -313,9 +313,9 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
313 if (pendowner == task) 313 if (pendowner == task)
314 return 1; 314 return 1;
315 315
316 spin_lock_irqsave(&pendowner->pi_lock, flags); 316 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
317 if (task->prio >= pendowner->prio) { 317 if (task->prio >= pendowner->prio) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 318 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 0; 319 return 0;
320 } 320 }
321 321
@@ -325,7 +325,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
325 * priority. 325 * priority.
326 */ 326 */
327 if (likely(!rt_mutex_has_waiters(lock))) { 327 if (likely(!rt_mutex_has_waiters(lock))) {
328 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 328 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329 return 1; 329 return 1;
330 } 330 }
331 331
@@ -333,7 +333,7 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
333 next = rt_mutex_top_waiter(lock); 333 next = rt_mutex_top_waiter(lock);
334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters); 334 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
335 __rt_mutex_adjust_prio(pendowner); 335 __rt_mutex_adjust_prio(pendowner);
336 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 336 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
337 337
338 /* 338 /*
339 * We are going to steal the lock and a waiter was 339 * We are going to steal the lock and a waiter was
@@ -350,10 +350,10 @@ static inline int try_to_steal_lock(struct rt_mutex *lock,
350 * might be task: 350 * might be task:
351 */ 351 */
352 if (likely(next->task != task)) { 352 if (likely(next->task != task)) {
353 spin_lock_irqsave(&task->pi_lock, flags); 353 raw_spin_lock_irqsave(&task->pi_lock, flags);
354 plist_add(&next->pi_list_entry, &task->pi_waiters); 354 plist_add(&next->pi_list_entry, &task->pi_waiters);
355 __rt_mutex_adjust_prio(task); 355 __rt_mutex_adjust_prio(task);
356 spin_unlock_irqrestore(&task->pi_lock, flags); 356 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
357 } 357 }
358 return 1; 358 return 1;
359} 359}
@@ -420,7 +420,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
420 unsigned long flags; 420 unsigned long flags;
421 int chain_walk = 0, res; 421 int chain_walk = 0, res;
422 422
423 spin_lock_irqsave(&task->pi_lock, flags); 423 raw_spin_lock_irqsave(&task->pi_lock, flags);
424 __rt_mutex_adjust_prio(task); 424 __rt_mutex_adjust_prio(task);
425 waiter->task = task; 425 waiter->task = task;
426 waiter->lock = lock; 426 waiter->lock = lock;
@@ -434,17 +434,17 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
434 434
435 task->pi_blocked_on = waiter; 435 task->pi_blocked_on = waiter;
436 436
437 spin_unlock_irqrestore(&task->pi_lock, flags); 437 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
438 438
439 if (waiter == rt_mutex_top_waiter(lock)) { 439 if (waiter == rt_mutex_top_waiter(lock)) {
440 spin_lock_irqsave(&owner->pi_lock, flags); 440 raw_spin_lock_irqsave(&owner->pi_lock, flags);
441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); 441 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters); 442 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
443 443
444 __rt_mutex_adjust_prio(owner); 444 __rt_mutex_adjust_prio(owner);
445 if (owner->pi_blocked_on) 445 if (owner->pi_blocked_on)
446 chain_walk = 1; 446 chain_walk = 1;
447 spin_unlock_irqrestore(&owner->pi_lock, flags); 447 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
448 } 448 }
449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) 449 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
450 chain_walk = 1; 450 chain_walk = 1;
@@ -459,12 +459,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
459 */ 459 */
460 get_task_struct(owner); 460 get_task_struct(owner);
461 461
462 spin_unlock(&lock->wait_lock); 462 raw_spin_unlock(&lock->wait_lock);
463 463
464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, 464 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
465 task); 465 task);
466 466
467 spin_lock(&lock->wait_lock); 467 raw_spin_lock(&lock->wait_lock);
468 468
469 return res; 469 return res;
470} 470}
@@ -483,7 +483,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
483 struct task_struct *pendowner; 483 struct task_struct *pendowner;
484 unsigned long flags; 484 unsigned long flags;
485 485
486 spin_lock_irqsave(&current->pi_lock, flags); 486 raw_spin_lock_irqsave(&current->pi_lock, flags);
487 487
488 waiter = rt_mutex_top_waiter(lock); 488 waiter = rt_mutex_top_waiter(lock);
489 plist_del(&waiter->list_entry, &lock->wait_list); 489 plist_del(&waiter->list_entry, &lock->wait_list);
@@ -500,7 +500,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
500 500
501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); 501 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
502 502
503 spin_unlock_irqrestore(&current->pi_lock, flags); 503 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
504 504
505 /* 505 /*
506 * Clear the pi_blocked_on variable and enqueue a possible 506 * Clear the pi_blocked_on variable and enqueue a possible
@@ -509,7 +509,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
509 * waiter with higher priority than pending-owner->normal_prio 509 * waiter with higher priority than pending-owner->normal_prio
510 * is blocked on the unboosted (pending) owner. 510 * is blocked on the unboosted (pending) owner.
511 */ 511 */
512 spin_lock_irqsave(&pendowner->pi_lock, flags); 512 raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
513 513
514 WARN_ON(!pendowner->pi_blocked_on); 514 WARN_ON(!pendowner->pi_blocked_on);
515 WARN_ON(pendowner->pi_blocked_on != waiter); 515 WARN_ON(pendowner->pi_blocked_on != waiter);
@@ -523,7 +523,7 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
523 next = rt_mutex_top_waiter(lock); 523 next = rt_mutex_top_waiter(lock);
524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters); 524 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
525 } 525 }
526 spin_unlock_irqrestore(&pendowner->pi_lock, flags); 526 raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
527 527
528 wake_up_process(pendowner); 528 wake_up_process(pendowner);
529} 529}
@@ -541,15 +541,15 @@ static void remove_waiter(struct rt_mutex *lock,
541 unsigned long flags; 541 unsigned long flags;
542 int chain_walk = 0; 542 int chain_walk = 0;
543 543
544 spin_lock_irqsave(&current->pi_lock, flags); 544 raw_spin_lock_irqsave(&current->pi_lock, flags);
545 plist_del(&waiter->list_entry, &lock->wait_list); 545 plist_del(&waiter->list_entry, &lock->wait_list);
546 waiter->task = NULL; 546 waiter->task = NULL;
547 current->pi_blocked_on = NULL; 547 current->pi_blocked_on = NULL;
548 spin_unlock_irqrestore(&current->pi_lock, flags); 548 raw_spin_unlock_irqrestore(&current->pi_lock, flags);
549 549
550 if (first && owner != current) { 550 if (first && owner != current) {
551 551
552 spin_lock_irqsave(&owner->pi_lock, flags); 552 raw_spin_lock_irqsave(&owner->pi_lock, flags);
553 553
554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters); 554 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
555 555
@@ -564,7 +564,7 @@ static void remove_waiter(struct rt_mutex *lock,
564 if (owner->pi_blocked_on) 564 if (owner->pi_blocked_on)
565 chain_walk = 1; 565 chain_walk = 1;
566 566
567 spin_unlock_irqrestore(&owner->pi_lock, flags); 567 raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
568 } 568 }
569 569
570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 570 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
@@ -575,11 +575,11 @@ static void remove_waiter(struct rt_mutex *lock,
575 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 575 /* gets dropped in rt_mutex_adjust_prio_chain()! */
576 get_task_struct(owner); 576 get_task_struct(owner);
577 577
578 spin_unlock(&lock->wait_lock); 578 raw_spin_unlock(&lock->wait_lock);
579 579
580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); 580 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
581 581
582 spin_lock(&lock->wait_lock); 582 raw_spin_lock(&lock->wait_lock);
583} 583}
584 584
585/* 585/*
@@ -592,15 +592,15 @@ void rt_mutex_adjust_pi(struct task_struct *task)
592 struct rt_mutex_waiter *waiter; 592 struct rt_mutex_waiter *waiter;
593 unsigned long flags; 593 unsigned long flags;
594 594
595 spin_lock_irqsave(&task->pi_lock, flags); 595 raw_spin_lock_irqsave(&task->pi_lock, flags);
596 596
597 waiter = task->pi_blocked_on; 597 waiter = task->pi_blocked_on;
598 if (!waiter || waiter->list_entry.prio == task->prio) { 598 if (!waiter || waiter->list_entry.prio == task->prio) {
599 spin_unlock_irqrestore(&task->pi_lock, flags); 599 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
600 return; 600 return;
601 } 601 }
602 602
603 spin_unlock_irqrestore(&task->pi_lock, flags); 603 raw_spin_unlock_irqrestore(&task->pi_lock, flags);
604 604
605 /* gets dropped in rt_mutex_adjust_prio_chain()! */ 605 /* gets dropped in rt_mutex_adjust_prio_chain()! */
606 get_task_struct(task); 606 get_task_struct(task);
@@ -672,14 +672,14 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
672 break; 672 break;
673 } 673 }
674 674
675 spin_unlock(&lock->wait_lock); 675 raw_spin_unlock(&lock->wait_lock);
676 676
677 debug_rt_mutex_print_deadlock(waiter); 677 debug_rt_mutex_print_deadlock(waiter);
678 678
679 if (waiter->task) 679 if (waiter->task)
680 schedule_rt_mutex(lock); 680 schedule_rt_mutex(lock);
681 681
682 spin_lock(&lock->wait_lock); 682 raw_spin_lock(&lock->wait_lock);
683 set_current_state(state); 683 set_current_state(state);
684 } 684 }
685 685
@@ -700,11 +700,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
700 debug_rt_mutex_init_waiter(&waiter); 700 debug_rt_mutex_init_waiter(&waiter);
701 waiter.task = NULL; 701 waiter.task = NULL;
702 702
703 spin_lock(&lock->wait_lock); 703 raw_spin_lock(&lock->wait_lock);
704 704
705 /* Try to acquire the lock again: */ 705 /* Try to acquire the lock again: */
706 if (try_to_take_rt_mutex(lock)) { 706 if (try_to_take_rt_mutex(lock)) {
707 spin_unlock(&lock->wait_lock); 707 raw_spin_unlock(&lock->wait_lock);
708 return 0; 708 return 0;
709 } 709 }
710 710
@@ -731,7 +731,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
731 */ 731 */
732 fixup_rt_mutex_waiters(lock); 732 fixup_rt_mutex_waiters(lock);
733 733
734 spin_unlock(&lock->wait_lock); 734 raw_spin_unlock(&lock->wait_lock);
735 735
736 /* Remove pending timer: */ 736 /* Remove pending timer: */
737 if (unlikely(timeout)) 737 if (unlikely(timeout))
@@ -758,7 +758,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
758{ 758{
759 int ret = 0; 759 int ret = 0;
760 760
761 spin_lock(&lock->wait_lock); 761 raw_spin_lock(&lock->wait_lock);
762 762
763 if (likely(rt_mutex_owner(lock) != current)) { 763 if (likely(rt_mutex_owner(lock) != current)) {
764 764
@@ -770,7 +770,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
770 fixup_rt_mutex_waiters(lock); 770 fixup_rt_mutex_waiters(lock);
771 } 771 }
772 772
773 spin_unlock(&lock->wait_lock); 773 raw_spin_unlock(&lock->wait_lock);
774 774
775 return ret; 775 return ret;
776} 776}
@@ -781,7 +781,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
781static void __sched 781static void __sched
782rt_mutex_slowunlock(struct rt_mutex *lock) 782rt_mutex_slowunlock(struct rt_mutex *lock)
783{ 783{
784 spin_lock(&lock->wait_lock); 784 raw_spin_lock(&lock->wait_lock);
785 785
786 debug_rt_mutex_unlock(lock); 786 debug_rt_mutex_unlock(lock);
787 787
@@ -789,13 +789,13 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
789 789
790 if (!rt_mutex_has_waiters(lock)) { 790 if (!rt_mutex_has_waiters(lock)) {
791 lock->owner = NULL; 791 lock->owner = NULL;
792 spin_unlock(&lock->wait_lock); 792 raw_spin_unlock(&lock->wait_lock);
793 return; 793 return;
794 } 794 }
795 795
796 wakeup_next_waiter(lock); 796 wakeup_next_waiter(lock);
797 797
798 spin_unlock(&lock->wait_lock); 798 raw_spin_unlock(&lock->wait_lock);
799 799
800 /* Undo pi boosting if necessary: */ 800 /* Undo pi boosting if necessary: */
801 rt_mutex_adjust_prio(current); 801 rt_mutex_adjust_prio(current);
@@ -970,8 +970,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
970void __rt_mutex_init(struct rt_mutex *lock, const char *name) 970void __rt_mutex_init(struct rt_mutex *lock, const char *name)
971{ 971{
972 lock->owner = NULL; 972 lock->owner = NULL;
973 spin_lock_init(&lock->wait_lock); 973 raw_spin_lock_init(&lock->wait_lock);
974 plist_head_init(&lock->wait_list, &lock->wait_lock); 974 plist_head_init_raw(&lock->wait_list, &lock->wait_lock);
975 975
976 debug_rt_mutex_init(lock, name); 976 debug_rt_mutex_init(lock, name);
977} 977}
@@ -1032,7 +1032,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1032{ 1032{
1033 int ret; 1033 int ret;
1034 1034
1035 spin_lock(&lock->wait_lock); 1035 raw_spin_lock(&lock->wait_lock);
1036 1036
1037 mark_rt_mutex_waiters(lock); 1037 mark_rt_mutex_waiters(lock);
1038 1038
@@ -1040,7 +1040,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1040 /* We got the lock for task. */ 1040 /* We got the lock for task. */
1041 debug_rt_mutex_lock(lock); 1041 debug_rt_mutex_lock(lock);
1042 rt_mutex_set_owner(lock, task, 0); 1042 rt_mutex_set_owner(lock, task, 0);
1043 spin_unlock(&lock->wait_lock); 1043 raw_spin_unlock(&lock->wait_lock);
1044 rt_mutex_deadlock_account_lock(lock, task); 1044 rt_mutex_deadlock_account_lock(lock, task);
1045 return 1; 1045 return 1;
1046 } 1046 }
@@ -1056,7 +1056,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
1056 */ 1056 */
1057 ret = 0; 1057 ret = 0;
1058 } 1058 }
1059 spin_unlock(&lock->wait_lock); 1059 raw_spin_unlock(&lock->wait_lock);
1060 1060
1061 debug_rt_mutex_print_deadlock(waiter); 1061 debug_rt_mutex_print_deadlock(waiter);
1062 1062
@@ -1106,7 +1106,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1106{ 1106{
1107 int ret; 1107 int ret;
1108 1108
1109 spin_lock(&lock->wait_lock); 1109 raw_spin_lock(&lock->wait_lock);
1110 1110
1111 set_current_state(TASK_INTERRUPTIBLE); 1111 set_current_state(TASK_INTERRUPTIBLE);
1112 1112
@@ -1124,7 +1124,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
1124 */ 1124 */
1125 fixup_rt_mutex_waiters(lock); 1125 fixup_rt_mutex_waiters(lock);
1126 1126
1127 spin_unlock(&lock->wait_lock); 1127 raw_spin_unlock(&lock->wait_lock);
1128 1128
1129 /* 1129 /*
1130 * Readjust priority, when we did not get the lock. We might have been 1130 * Readjust priority, when we did not get the lock. We might have been
diff --git a/kernel/sched.c b/kernel/sched.c
index e7f2cfa6a257..6a212c97f523 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -141,7 +141,7 @@ struct rt_prio_array {
141 141
142struct rt_bandwidth { 142struct rt_bandwidth {
143 /* nests inside the rq lock: */ 143 /* nests inside the rq lock: */
144 spinlock_t rt_runtime_lock; 144 raw_spinlock_t rt_runtime_lock;
145 ktime_t rt_period; 145 ktime_t rt_period;
146 u64 rt_runtime; 146 u64 rt_runtime;
147 struct hrtimer rt_period_timer; 147 struct hrtimer rt_period_timer;
@@ -178,7 +178,7 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
178 rt_b->rt_period = ns_to_ktime(period); 178 rt_b->rt_period = ns_to_ktime(period);
179 rt_b->rt_runtime = runtime; 179 rt_b->rt_runtime = runtime;
180 180
181 spin_lock_init(&rt_b->rt_runtime_lock); 181 raw_spin_lock_init(&rt_b->rt_runtime_lock);
182 182
183 hrtimer_init(&rt_b->rt_period_timer, 183 hrtimer_init(&rt_b->rt_period_timer,
184 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 184 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -200,7 +200,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
200 if (hrtimer_active(&rt_b->rt_period_timer)) 200 if (hrtimer_active(&rt_b->rt_period_timer))
201 return; 201 return;
202 202
203 spin_lock(&rt_b->rt_runtime_lock); 203 raw_spin_lock(&rt_b->rt_runtime_lock);
204 for (;;) { 204 for (;;) {
205 unsigned long delta; 205 unsigned long delta;
206 ktime_t soft, hard; 206 ktime_t soft, hard;
@@ -217,7 +217,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta, 217 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
218 HRTIMER_MODE_ABS_PINNED, 0); 218 HRTIMER_MODE_ABS_PINNED, 0);
219 } 219 }
220 spin_unlock(&rt_b->rt_runtime_lock); 220 raw_spin_unlock(&rt_b->rt_runtime_lock);
221} 221}
222 222
223#ifdef CONFIG_RT_GROUP_SCHED 223#ifdef CONFIG_RT_GROUP_SCHED
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -470,7 +428,7 @@ struct rt_rq {
470 u64 rt_time; 428 u64 rt_time;
471 u64 rt_runtime; 429 u64 rt_runtime;
472 /* Nests inside the rq lock: */ 430 /* Nests inside the rq lock: */
473 spinlock_t rt_runtime_lock; 431 raw_spinlock_t rt_runtime_lock;
474 432
475#ifdef CONFIG_RT_GROUP_SCHED 433#ifdef CONFIG_RT_GROUP_SCHED
476 unsigned long rt_nr_boosted; 434 unsigned long rt_nr_boosted;
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -525,7 +482,7 @@ static struct root_domain def_root_domain;
525 */ 482 */
526struct rq { 483struct rq {
527 /* runqueue lock: */ 484 /* runqueue lock: */
528 spinlock_t lock; 485 raw_spinlock_t lock;
529 486
530 /* 487 /*
531 * nr_running and cpu_load should be in the same cacheline because 488 * nr_running and cpu_load should be in the same cacheline because
@@ -645,6 +602,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 602#endif
646} 603}
647 604
605#define rcu_dereference_check_sched_domain(p) \
606 rcu_dereference_check((p), \
607 rcu_read_lock_sched_held() || \
608 lockdep_is_held(&sched_domains_mutex))
609
648/* 610/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 611 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 612 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +615,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 615 * preempt-disabled sections.
654 */ 616 */
655#define for_each_domain(cpu, __sd) \ 617#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 618 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 619
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 620#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 621#define this_rq() (&__get_cpu_var(runqueues))
@@ -685,7 +647,7 @@ inline void update_rq_clock(struct rq *rq)
685 */ 647 */
686int runqueue_is_locked(int cpu) 648int runqueue_is_locked(int cpu)
687{ 649{
688 return spin_is_locked(&cpu_rq(cpu)->lock); 650 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
689} 651}
690 652
691/* 653/*
@@ -814,6 +776,7 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
814 * default: 0.25ms 776 * default: 0.25ms
815 */ 777 */
816unsigned int sysctl_sched_shares_ratelimit = 250000; 778unsigned int sysctl_sched_shares_ratelimit = 250000;
779unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
817 780
818/* 781/*
819 * Inject some fuzzyness into changing the per-cpu group shares 782 * Inject some fuzzyness into changing the per-cpu group shares
@@ -892,7 +855,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
892 */ 855 */
893 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); 856 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
894 857
895 spin_unlock_irq(&rq->lock); 858 raw_spin_unlock_irq(&rq->lock);
896} 859}
897 860
898#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 861#else /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -916,9 +879,9 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
916 next->oncpu = 1; 879 next->oncpu = 1;
917#endif 880#endif
918#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 881#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
919 spin_unlock_irq(&rq->lock); 882 raw_spin_unlock_irq(&rq->lock);
920#else 883#else
921 spin_unlock(&rq->lock); 884 raw_spin_unlock(&rq->lock);
922#endif 885#endif
923} 886}
924 887
@@ -940,18 +903,35 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
940#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
941 904
942/* 905/*
906 * Check whether the task is waking, we use this to synchronize against
907 * ttwu() so that task_cpu() reports a stable number.
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */
913static inline int task_is_waking(struct task_struct *p)
914{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
916}
917
918/*
943 * __task_rq_lock - lock the runqueue a given task resides on. 919 * __task_rq_lock - lock the runqueue a given task resides on.
944 * Must be called interrupts disabled. 920 * Must be called interrupts disabled.
945 */ 921 */
946static inline struct rq *__task_rq_lock(struct task_struct *p) 922static inline struct rq *__task_rq_lock(struct task_struct *p)
947 __acquires(rq->lock) 923 __acquires(rq->lock)
948{ 924{
925 struct rq *rq;
926
949 for (;;) { 927 for (;;) {
950 struct rq *rq = task_rq(p); 928 while (task_is_waking(p))
951 spin_lock(&rq->lock); 929 cpu_relax();
952 if (likely(rq == task_rq(p))) 930 rq = task_rq(p);
931 raw_spin_lock(&rq->lock);
932 if (likely(rq == task_rq(p) && !task_is_waking(p)))
953 return rq; 933 return rq;
954 spin_unlock(&rq->lock); 934 raw_spin_unlock(&rq->lock);
955 } 935 }
956} 936}
957 937
@@ -966,12 +946,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
966 struct rq *rq; 946 struct rq *rq;
967 947
968 for (;;) { 948 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
969 local_irq_save(*flags); 951 local_irq_save(*flags);
970 rq = task_rq(p); 952 rq = task_rq(p);
971 spin_lock(&rq->lock); 953 raw_spin_lock(&rq->lock);
972 if (likely(rq == task_rq(p))) 954 if (likely(rq == task_rq(p) && !task_is_waking(p)))
973 return rq; 955 return rq;
974 spin_unlock_irqrestore(&rq->lock, *flags); 956 raw_spin_unlock_irqrestore(&rq->lock, *flags);
975 } 957 }
976} 958}
977 959
@@ -980,19 +962,19 @@ void task_rq_unlock_wait(struct task_struct *p)
980 struct rq *rq = task_rq(p); 962 struct rq *rq = task_rq(p);
981 963
982 smp_mb(); /* spin-unlock-wait is not a full memory barrier */ 964 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
983 spin_unlock_wait(&rq->lock); 965 raw_spin_unlock_wait(&rq->lock);
984} 966}
985 967
986static void __task_rq_unlock(struct rq *rq) 968static void __task_rq_unlock(struct rq *rq)
987 __releases(rq->lock) 969 __releases(rq->lock)
988{ 970{
989 spin_unlock(&rq->lock); 971 raw_spin_unlock(&rq->lock);
990} 972}
991 973
992static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) 974static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
993 __releases(rq->lock) 975 __releases(rq->lock)
994{ 976{
995 spin_unlock_irqrestore(&rq->lock, *flags); 977 raw_spin_unlock_irqrestore(&rq->lock, *flags);
996} 978}
997 979
998/* 980/*
@@ -1005,7 +987,7 @@ static struct rq *this_rq_lock(void)
1005 987
1006 local_irq_disable(); 988 local_irq_disable();
1007 rq = this_rq(); 989 rq = this_rq();
1008 spin_lock(&rq->lock); 990 raw_spin_lock(&rq->lock);
1009 991
1010 return rq; 992 return rq;
1011} 993}
@@ -1052,10 +1034,10 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1052 1034
1053 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); 1035 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1054 1036
1055 spin_lock(&rq->lock); 1037 raw_spin_lock(&rq->lock);
1056 update_rq_clock(rq); 1038 update_rq_clock(rq);
1057 rq->curr->sched_class->task_tick(rq, rq->curr, 1); 1039 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
1058 spin_unlock(&rq->lock); 1040 raw_spin_unlock(&rq->lock);
1059 1041
1060 return HRTIMER_NORESTART; 1042 return HRTIMER_NORESTART;
1061} 1043}
@@ -1068,10 +1050,10 @@ static void __hrtick_start(void *arg)
1068{ 1050{
1069 struct rq *rq = arg; 1051 struct rq *rq = arg;
1070 1052
1071 spin_lock(&rq->lock); 1053 raw_spin_lock(&rq->lock);
1072 hrtimer_restart(&rq->hrtick_timer); 1054 hrtimer_restart(&rq->hrtick_timer);
1073 rq->hrtick_csd_pending = 0; 1055 rq->hrtick_csd_pending = 0;
1074 spin_unlock(&rq->lock); 1056 raw_spin_unlock(&rq->lock);
1075} 1057}
1076 1058
1077/* 1059/*
@@ -1178,7 +1160,7 @@ static void resched_task(struct task_struct *p)
1178{ 1160{
1179 int cpu; 1161 int cpu;
1180 1162
1181 assert_spin_locked(&task_rq(p)->lock); 1163 assert_raw_spin_locked(&task_rq(p)->lock);
1182 1164
1183 if (test_tsk_need_resched(p)) 1165 if (test_tsk_need_resched(p))
1184 return; 1166 return;
@@ -1200,10 +1182,10 @@ static void resched_cpu(int cpu)
1200 struct rq *rq = cpu_rq(cpu); 1182 struct rq *rq = cpu_rq(cpu);
1201 unsigned long flags; 1183 unsigned long flags;
1202 1184
1203 if (!spin_trylock_irqsave(&rq->lock, flags)) 1185 if (!raw_spin_trylock_irqsave(&rq->lock, flags))
1204 return; 1186 return;
1205 resched_task(cpu_curr(cpu)); 1187 resched_task(cpu_curr(cpu));
1206 spin_unlock_irqrestore(&rq->lock, flags); 1188 raw_spin_unlock_irqrestore(&rq->lock, flags);
1207} 1189}
1208 1190
1209#ifdef CONFIG_NO_HZ 1191#ifdef CONFIG_NO_HZ
@@ -1272,7 +1254,7 @@ static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1272#else /* !CONFIG_SMP */ 1254#else /* !CONFIG_SMP */
1273static void resched_task(struct task_struct *p) 1255static void resched_task(struct task_struct *p)
1274{ 1256{
1275 assert_spin_locked(&task_rq(p)->lock); 1257 assert_raw_spin_locked(&task_rq(p)->lock);
1276 set_tsk_need_resched(p); 1258 set_tsk_need_resched(p);
1277} 1259}
1278 1260
@@ -1389,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1389 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1390}; 1372};
1391 1373
1392static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1393
1394/*
1395 * runqueue iterator, to support SMP load-balancing between different
1396 * scheduling classes, without having to expose their internal data
1397 * structures to the load-balancing proper:
1398 */
1399struct rq_iterator {
1400 void *arg;
1401 struct task_struct *(*start)(void *);
1402 struct task_struct *(*next)(void *);
1403};
1404
1405#ifdef CONFIG_SMP
1406static unsigned long
1407balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1408 unsigned long max_load_move, struct sched_domain *sd,
1409 enum cpu_idle_type idle, int *all_pinned,
1410 int *this_best_prio, struct rq_iterator *iterator);
1411
1412static int
1413iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1414 struct sched_domain *sd, enum cpu_idle_type idle,
1415 struct rq_iterator *iterator);
1416#endif
1417
1418/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1419enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1420 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1530,7 +1486,7 @@ static unsigned long target_load(int cpu, int type)
1530 1486
1531static struct sched_group *group_of(int cpu) 1487static struct sched_group *group_of(int cpu)
1532{ 1488{
1533 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1534 1490
1535 if (!sd) 1491 if (!sd)
1536 return NULL; 1492 return NULL;
@@ -1599,11 +1555,11 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1599 struct rq *rq = cpu_rq(cpu); 1555 struct rq *rq = cpu_rq(cpu);
1600 unsigned long flags; 1556 unsigned long flags;
1601 1557
1602 spin_lock_irqsave(&rq->lock, flags); 1558 raw_spin_lock_irqsave(&rq->lock, flags);
1603 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; 1559 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1604 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1560 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1605 __set_se_shares(tg->se[cpu], shares); 1561 __set_se_shares(tg->se[cpu], shares);
1606 spin_unlock_irqrestore(&rq->lock, flags); 1562 raw_spin_unlock_irqrestore(&rq->lock, flags);
1607 } 1563 }
1608} 1564}
1609 1565
@@ -1614,7 +1570,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
1614 */ 1570 */
1615static int tg_shares_up(struct task_group *tg, void *data) 1571static int tg_shares_up(struct task_group *tg, void *data)
1616{ 1572{
1617 unsigned long weight, rq_weight = 0, shares = 0; 1573 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1618 unsigned long *usd_rq_weight; 1574 unsigned long *usd_rq_weight;
1619 struct sched_domain *sd = data; 1575 struct sched_domain *sd = data;
1620 unsigned long flags; 1576 unsigned long flags;
@@ -1630,6 +1586,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
1630 weight = tg->cfs_rq[i]->load.weight; 1586 weight = tg->cfs_rq[i]->load.weight;
1631 usd_rq_weight[i] = weight; 1587 usd_rq_weight[i] = weight;
1632 1588
1589 rq_weight += weight;
1633 /* 1590 /*
1634 * If there are currently no tasks on the cpu pretend there 1591 * If there are currently no tasks on the cpu pretend there
1635 * is one of average load so that when a new task gets to 1592 * is one of average load so that when a new task gets to
@@ -1638,10 +1595,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
1638 if (!weight) 1595 if (!weight)
1639 weight = NICE_0_LOAD; 1596 weight = NICE_0_LOAD;
1640 1597
1641 rq_weight += weight; 1598 sum_weight += weight;
1642 shares += tg->cfs_rq[i]->shares; 1599 shares += tg->cfs_rq[i]->shares;
1643 } 1600 }
1644 1601
1602 if (!rq_weight)
1603 rq_weight = sum_weight;
1604
1645 if ((!shares && rq_weight) || shares > tg->shares) 1605 if ((!shares && rq_weight) || shares > tg->shares)
1646 shares = tg->shares; 1606 shares = tg->shares;
1647 1607
@@ -1696,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1696 } 1656 }
1697} 1657}
1698 1658
1699static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1700{
1701 if (root_task_group_empty())
1702 return;
1703
1704 spin_unlock(&rq->lock);
1705 update_shares(sd);
1706 spin_lock(&rq->lock);
1707}
1708
1709static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1710{ 1660{
1711 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1720,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1720{ 1670{
1721} 1671}
1722 1672
1723static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1724{
1725}
1726
1727#endif 1673#endif
1728 1674
1729#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1743,7 +1689,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1743 __acquires(busiest->lock) 1689 __acquires(busiest->lock)
1744 __acquires(this_rq->lock) 1690 __acquires(this_rq->lock)
1745{ 1691{
1746 spin_unlock(&this_rq->lock); 1692 raw_spin_unlock(&this_rq->lock);
1747 double_rq_lock(this_rq, busiest); 1693 double_rq_lock(this_rq, busiest);
1748 1694
1749 return 1; 1695 return 1;
@@ -1764,14 +1710,16 @@ static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1764{ 1710{
1765 int ret = 0; 1711 int ret = 0;
1766 1712
1767 if (unlikely(!spin_trylock(&busiest->lock))) { 1713 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1768 if (busiest < this_rq) { 1714 if (busiest < this_rq) {
1769 spin_unlock(&this_rq->lock); 1715 raw_spin_unlock(&this_rq->lock);
1770 spin_lock(&busiest->lock); 1716 raw_spin_lock(&busiest->lock);
1771 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING); 1717 raw_spin_lock_nested(&this_rq->lock,
1718 SINGLE_DEPTH_NESTING);
1772 ret = 1; 1719 ret = 1;
1773 } else 1720 } else
1774 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING); 1721 raw_spin_lock_nested(&busiest->lock,
1722 SINGLE_DEPTH_NESTING);
1775 } 1723 }
1776 return ret; 1724 return ret;
1777} 1725}
@@ -1785,7 +1733,7 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1785{ 1733{
1786 if (unlikely(!irqs_disabled())) { 1734 if (unlikely(!irqs_disabled())) {
1787 /* printk() doesn't work good under rq->lock */ 1735 /* printk() doesn't work good under rq->lock */
1788 spin_unlock(&this_rq->lock); 1736 raw_spin_unlock(&this_rq->lock);
1789 BUG_ON(1); 1737 BUG_ON(1);
1790 } 1738 }
1791 1739
@@ -1795,9 +1743,54 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1795static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1743static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1796 __releases(busiest->lock) 1744 __releases(busiest->lock)
1797{ 1745{
1798 spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1799 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1800} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1801#endif 1794#endif
1802 1795
1803#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1810,19 +1803,31 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1810#endif 1803#endif
1811 1804
1812static void calc_load_account_active(struct rq *this_rq); 1805static void calc_load_account_active(struct rq *this_rq);
1806static void update_sysctl(void);
1807static int get_update_sysctl_factor(void);
1813 1808
1814#include "sched_stats.h" 1809static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1815#include "sched_idletask.c" 1810{
1816#include "sched_fair.c" 1811 set_task_rq(p, cpu);
1817#include "sched_rt.c" 1812#ifdef CONFIG_SMP
1818#ifdef CONFIG_SCHED_DEBUG 1813 /*
1819# include "sched_debug.c" 1814 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1815 * successfuly executed on another CPU. We must ensure that updates of
1816 * per-task data have been completed by this moment.
1817 */
1818 smp_wmb();
1819 task_thread_info(p)->cpu = cpu;
1820#endif 1820#endif
1821}
1822
1823static const struct sched_class rt_sched_class;
1821 1824
1822#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1823#define for_each_class(class) \ 1826#define for_each_class(class) \
1824 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1825 1828
1829#include "sched_stats.h"
1830
1826static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1827{ 1832{
1828 rq->nr_running++; 1833 rq->nr_running++;
@@ -1860,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1860 *avg += diff >> 3; 1865 *avg += diff >> 3;
1861} 1866}
1862 1867
1863static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1864{ 1870{
1865 if (wakeup) 1871 if (wakeup)
1866 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1867 1873
1868 sched_info_queued(p); 1874 sched_info_queued(p);
1869 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1870 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1871} 1877}
1872 1878
@@ -1889,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1889} 1895}
1890 1896
1891/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1892 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1893 */ 1930 */
1894static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
@@ -1934,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1934 return p->prio; 1971 return p->prio;
1935} 1972}
1936 1973
1937/*
1938 * activate_task - move a task to the runqueue.
1939 */
1940static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1941{
1942 if (task_contributes_to_load(p))
1943 rq->nr_uninterruptible--;
1944
1945 enqueue_task(rq, p, wakeup);
1946 inc_nr_running(rq);
1947}
1948
1949/*
1950 * deactivate_task - remove a task from the runqueue.
1951 */
1952static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1953{
1954 if (task_contributes_to_load(p))
1955 rq->nr_uninterruptible++;
1956
1957 dequeue_task(rq, p, sleep);
1958 dec_nr_running(rq);
1959}
1960
1961/** 1974/**
1962 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
1963 * @p: the task in question. 1976 * @p: the task in question.
@@ -1967,20 +1980,6 @@ inline int task_curr(const struct task_struct *p)
1967 return cpu_curr(task_cpu(p)) == p; 1980 return cpu_curr(task_cpu(p)) == p;
1968} 1981}
1969 1982
1970static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1971{
1972 set_task_rq(p, cpu);
1973#ifdef CONFIG_SMP
1974 /*
1975 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1976 * successfuly executed on another CPU. We must ensure that updates of
1977 * per-task data have been completed by this moment.
1978 */
1979 smp_wmb();
1980 task_thread_info(p)->cpu = cpu;
1981#endif
1982}
1983
1984static inline void check_class_changed(struct rq *rq, struct task_struct *p, 1983static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1985 const struct sched_class *prev_class, 1984 const struct sched_class *prev_class,
1986 int oldprio, int running) 1985 int oldprio, int running)
@@ -1993,39 +1992,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1993 p->sched_class->prio_changed(rq, p, oldprio, running); 1992 p->sched_class->prio_changed(rq, p, oldprio, running);
1994} 1993}
1995 1994
1996/**
1997 * kthread_bind - bind a just-created kthread to a cpu.
1998 * @p: thread created by kthread_create().
1999 * @cpu: cpu (might not be online, must be possible) for @k to run on.
2000 *
2001 * Description: This function is equivalent to set_cpus_allowed(),
2002 * except that @cpu doesn't need to be online, and the thread must be
2003 * stopped (i.e., just returned from kthread_create()).
2004 *
2005 * Function lives here instead of kthread.c because it messes with
2006 * scheduler internals which require locking.
2007 */
2008void kthread_bind(struct task_struct *p, unsigned int cpu)
2009{
2010 struct rq *rq = cpu_rq(cpu);
2011 unsigned long flags;
2012
2013 /* Must have done schedule() in kthread() before we set_task_cpu */
2014 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2015 WARN_ON(1);
2016 return;
2017 }
2018
2019 spin_lock_irqsave(&rq->lock, flags);
2020 update_rq_clock(rq);
2021 set_task_cpu(p, cpu);
2022 p->cpus_allowed = cpumask_of_cpu(cpu);
2023 p->rt.nr_cpus_allowed = 1;
2024 p->flags |= PF_THREAD_BOUND;
2025 spin_unlock_irqrestore(&rq->lock, flags);
2026}
2027EXPORT_SYMBOL(kthread_bind);
2028
2029#ifdef CONFIG_SMP 1995#ifdef CONFIG_SMP
2030/* 1996/*
2031 * Is this task likely cache-hot: 1997 * Is this task likely cache-hot:
@@ -2035,6 +2001,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2035{ 2001{
2036 s64 delta; 2002 s64 delta;
2037 2003
2004 if (p->sched_class != &fair_sched_class)
2005 return 0;
2006
2038 /* 2007 /*
2039 * Buddy candidates are cache hot: 2008 * Buddy candidates are cache hot:
2040 */ 2009 */
@@ -2043,9 +2012,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2043 &p->se == cfs_rq_of(&p->se)->last)) 2012 &p->se == cfs_rq_of(&p->se)->last))
2044 return 1; 2013 return 1;
2045 2014
2046 if (p->sched_class != &fair_sched_class)
2047 return 0;
2048
2049 if (sysctl_sched_migration_cost == -1) 2015 if (sysctl_sched_migration_cost == -1)
2050 return 1; 2016 return 1;
2051 if (sysctl_sched_migration_cost == 0) 2017 if (sysctl_sched_migration_cost == 0)
@@ -2056,38 +2022,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2056 return delta < (s64)sysctl_sched_migration_cost; 2022 return delta < (s64)sysctl_sched_migration_cost;
2057} 2023}
2058 2024
2059
2060void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2025void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2061{ 2026{
2062 int old_cpu = task_cpu(p); 2027#ifdef CONFIG_SCHED_DEBUG
2063 struct rq *old_rq = cpu_rq(old_cpu), *new_rq = cpu_rq(new_cpu); 2028 /*
2064 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2029 * We should never call set_task_cpu() on a blocked task,
2065 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2030 * ttwu() will sort out the placement.
2066 u64 clock_offset; 2031 */
2067 2032 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2068 clock_offset = old_rq->clock - new_rq->clock; 2033 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2034#endif
2069 2035
2070 trace_sched_migrate_task(p, new_cpu); 2036 trace_sched_migrate_task(p, new_cpu);
2071 2037
2072#ifdef CONFIG_SCHEDSTATS 2038 if (task_cpu(p) != new_cpu) {
2073 if (p->se.wait_start)
2074 p->se.wait_start -= clock_offset;
2075 if (p->se.sleep_start)
2076 p->se.sleep_start -= clock_offset;
2077 if (p->se.block_start)
2078 p->se.block_start -= clock_offset;
2079#endif
2080 if (old_cpu != new_cpu) {
2081 p->se.nr_migrations++; 2039 p->se.nr_migrations++;
2082#ifdef CONFIG_SCHEDSTATS 2040 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2083 if (task_hot(p, old_rq->clock, NULL))
2084 schedstat_inc(p, se.nr_forced2_migrations);
2085#endif
2086 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS,
2087 1, 1, NULL, 0);
2088 } 2041 }
2089 p->se.vruntime -= old_cfsrq->min_vruntime -
2090 new_cfsrq->min_vruntime;
2091 2042
2092 __set_task_cpu(p, new_cpu); 2043 __set_task_cpu(p, new_cpu);
2093} 2044}
@@ -2112,13 +2063,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2112 2063
2113 /* 2064 /*
2114 * If the task is not on a runqueue (and not running), then 2065 * If the task is not on a runqueue (and not running), then
2115 * it is sufficient to simply update the task's cpu field. 2066 * the next wake-up will properly place the task.
2116 */ 2067 */
2117 if (!p->se.on_rq && !task_running(rq, p)) { 2068 if (!p->se.on_rq && !task_running(rq, p))
2118 update_rq_clock(rq);
2119 set_task_cpu(p, dest_cpu);
2120 return 0; 2069 return 0;
2121 }
2122 2070
2123 init_completion(&req->done); 2071 init_completion(&req->done);
2124 req->task = p; 2072 req->task = p;
@@ -2323,6 +2271,75 @@ void task_oncpu_function_call(struct task_struct *p,
2323 preempt_enable(); 2271 preempt_enable();
2324} 2272}
2325 2273
2274#ifdef CONFIG_SMP
2275static int select_fallback_rq(int cpu, struct task_struct *p)
2276{
2277 int dest_cpu;
2278 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2279
2280 /* Look for allowed, online CPU in same node. */
2281 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2282 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2283 return dest_cpu;
2284
2285 /* Any allowed, online CPU? */
2286 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2287 if (dest_cpu < nr_cpu_ids)
2288 return dest_cpu;
2289
2290 /* No more Mr. Nice Guy. */
2291 if (dest_cpu >= nr_cpu_ids) {
2292 rcu_read_lock();
2293 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2294 rcu_read_unlock();
2295 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2296
2297 /*
2298 * Don't tell them about moving exiting tasks or
2299 * kernel threads (both mm NULL), since they never
2300 * leave kernel.
2301 */
2302 if (p->mm && printk_ratelimit()) {
2303 printk(KERN_INFO "process %d (%s) no "
2304 "longer affine to cpu%d\n",
2305 task_pid_nr(p), p->comm, cpu);
2306 }
2307 }
2308
2309 return dest_cpu;
2310}
2311
2312/*
2313 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2314 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2315 * by:
2316 *
2317 * exec: is unstable, retry loop
2318 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2319 */
2320static inline
2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2322{
2323 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2324
2325 /*
2326 * In order not to call set_task_cpu() on a blocking task we need
2327 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2328 * cpu.
2329 *
2330 * Since this is common to all placement strategies, this lives here.
2331 *
2332 * [ this allows ->select_task() to simply return task_cpu(p) and
2333 * not worry about this generic constraint ]
2334 */
2335 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2336 !cpu_online(cpu)))
2337 cpu = select_fallback_rq(task_cpu(p), p);
2338
2339 return cpu;
2340}
2341#endif
2342
2326/*** 2343/***
2327 * try_to_wake_up - wake up a thread 2344 * try_to_wake_up - wake up a thread
2328 * @p: the to-be-woken-up thread 2345 * @p: the to-be-woken-up thread
@@ -2374,20 +2391,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2374 if (task_contributes_to_load(p)) 2391 if (task_contributes_to_load(p))
2375 rq->nr_uninterruptible--; 2392 rq->nr_uninterruptible--;
2376 p->state = TASK_WAKING; 2393 p->state = TASK_WAKING;
2377 task_rq_unlock(rq, &flags);
2378 2394
2379 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2395 if (p->sched_class->task_waking)
2396 p->sched_class->task_waking(rq, p);
2397
2398 __task_rq_unlock(rq);
2399
2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2380 if (cpu != orig_cpu) { 2401 if (cpu != orig_cpu) {
2381 local_irq_save(flags); 2402 /*
2382 rq = cpu_rq(cpu); 2403 * Since we migrate the task without holding any rq->lock,
2383 update_rq_clock(rq); 2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2384 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2385 local_irq_restore(flags);
2386 } 2408 }
2387 rq = task_rq_lock(p, &flags);
2388 2409
2410 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock);
2412 update_rq_clock(rq);
2413
2414 /*
2415 * We migrated the task without holding either rq->lock, however
2416 * since the task is not on the task list itself, nobody else
2417 * will try and migrate the task, hence the rq should match the
2418 * cpu we just moved it to.
2419 */
2420 WARN_ON(task_cpu(p) != cpu);
2389 WARN_ON(p->state != TASK_WAKING); 2421 WARN_ON(p->state != TASK_WAKING);
2390 cpu = task_cpu(p);
2391 2422
2392#ifdef CONFIG_SCHEDSTATS 2423#ifdef CONFIG_SCHEDSTATS
2393 schedstat_inc(rq, ttwu_count); 2424 schedstat_inc(rq, ttwu_count);
@@ -2440,8 +2471,8 @@ out_running:
2440 2471
2441 p->state = TASK_RUNNING; 2472 p->state = TASK_RUNNING;
2442#ifdef CONFIG_SMP 2473#ifdef CONFIG_SMP
2443 if (p->sched_class->task_wake_up) 2474 if (p->sched_class->task_woken)
2444 p->sched_class->task_wake_up(rq, p); 2475 p->sched_class->task_woken(rq, p);
2445 2476
2446 if (unlikely(rq->idle_stamp)) { 2477 if (unlikely(rq->idle_stamp)) {
2447 u64 delta = rq->clock - rq->idle_stamp; 2478 u64 delta = rq->clock - rq->idle_stamp;
@@ -2499,7 +2530,6 @@ static void __sched_fork(struct task_struct *p)
2499 p->se.avg_overlap = 0; 2530 p->se.avg_overlap = 0;
2500 p->se.start_runtime = 0; 2531 p->se.start_runtime = 0;
2501 p->se.avg_wakeup = sysctl_sched_wakeup_granularity; 2532 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2502 p->se.avg_running = 0;
2503 2533
2504#ifdef CONFIG_SCHEDSTATS 2534#ifdef CONFIG_SCHEDSTATS
2505 p->se.wait_start = 0; 2535 p->se.wait_start = 0;
@@ -2521,7 +2551,6 @@ static void __sched_fork(struct task_struct *p)
2521 p->se.nr_failed_migrations_running = 0; 2551 p->se.nr_failed_migrations_running = 0;
2522 p->se.nr_failed_migrations_hot = 0; 2552 p->se.nr_failed_migrations_hot = 0;
2523 p->se.nr_forced_migrations = 0; 2553 p->se.nr_forced_migrations = 0;
2524 p->se.nr_forced2_migrations = 0;
2525 2554
2526 p->se.nr_wakeups = 0; 2555 p->se.nr_wakeups = 0;
2527 p->se.nr_wakeups_sync = 0; 2556 p->se.nr_wakeups_sync = 0;
@@ -2542,14 +2571,6 @@ static void __sched_fork(struct task_struct *p)
2542#ifdef CONFIG_PREEMPT_NOTIFIERS 2571#ifdef CONFIG_PREEMPT_NOTIFIERS
2543 INIT_HLIST_HEAD(&p->preempt_notifiers); 2572 INIT_HLIST_HEAD(&p->preempt_notifiers);
2544#endif 2573#endif
2545
2546 /*
2547 * We mark the process as running here, but have not actually
2548 * inserted it onto the runqueue yet. This guarantees that
2549 * nobody will actually run it, and a signal or other external
2550 * event cannot wake it up and insert it on the runqueue either.
2551 */
2552 p->state = TASK_RUNNING;
2553} 2574}
2554 2575
2555/* 2576/*
@@ -2558,9 +2579,14 @@ static void __sched_fork(struct task_struct *p)
2558void sched_fork(struct task_struct *p, int clone_flags) 2579void sched_fork(struct task_struct *p, int clone_flags)
2559{ 2580{
2560 int cpu = get_cpu(); 2581 int cpu = get_cpu();
2561 unsigned long flags;
2562 2582
2563 __sched_fork(p); 2583 __sched_fork(p);
2584 /*
2585 * We mark the process as waking here. This guarantees that
2586 * nobody will actually run it, and a signal or other external
2587 * event cannot wake it up and insert it on the runqueue either.
2588 */
2589 p->state = TASK_WAKING;
2564 2590
2565 /* 2591 /*
2566 * Revert to default priority/policy on fork if requested. 2592 * Revert to default priority/policy on fork if requested.
@@ -2592,13 +2618,10 @@ void sched_fork(struct task_struct *p, int clone_flags)
2592 if (!rt_prio(p->prio)) 2618 if (!rt_prio(p->prio))
2593 p->sched_class = &fair_sched_class; 2619 p->sched_class = &fair_sched_class;
2594 2620
2595#ifdef CONFIG_SMP 2621 if (p->sched_class->task_fork)
2596 cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); 2622 p->sched_class->task_fork(p);
2597#endif 2623
2598 local_irq_save(flags);
2599 update_rq_clock(cpu_rq(cpu));
2600 set_task_cpu(p, cpu); 2624 set_task_cpu(p, cpu);
2601 local_irq_restore(flags);
2602 2625
2603#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2626#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2604 if (likely(sched_info_on())) 2627 if (likely(sched_info_on()))
@@ -2627,28 +2650,41 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2627{ 2650{
2628 unsigned long flags; 2651 unsigned long flags;
2629 struct rq *rq; 2652 struct rq *rq;
2653 int cpu = get_cpu();
2630 2654
2631 rq = task_rq_lock(p, &flags); 2655#ifdef CONFIG_SMP
2632 BUG_ON(p->state != TASK_RUNNING); 2656 /*
2633 update_rq_clock(rq); 2657 * Fork balancing, do it here and not earlier because:
2658 * - cpus_allowed can change in the fork path
2659 * - any previously selected cpu might disappear through hotplug
2660 *
2661 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2662 * ->cpus_allowed is stable, we have preemption disabled, meaning
2663 * cpu_online_mask is stable.
2664 */
2665 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2666 set_task_cpu(p, cpu);
2667#endif
2634 2668
2635 if (!p->sched_class->task_new || !current->se.on_rq) { 2669 /*
2636 activate_task(rq, p, 0); 2670 * Since the task is not on the rq and we still have TASK_WAKING set
2637 } else { 2671 * nobody else will migrate this task.
2638 /* 2672 */
2639 * Let the scheduling class do new task startup 2673 rq = cpu_rq(cpu);
2640 * management (if any): 2674 raw_spin_lock_irqsave(&rq->lock, flags);
2641 */ 2675
2642 p->sched_class->task_new(rq, p); 2676 BUG_ON(p->state != TASK_WAKING);
2643 inc_nr_running(rq); 2677 p->state = TASK_RUNNING;
2644 } 2678 update_rq_clock(rq);
2679 activate_task(rq, p, 0);
2645 trace_sched_wakeup_new(rq, p, 1); 2680 trace_sched_wakeup_new(rq, p, 1);
2646 check_preempt_curr(rq, p, WF_FORK); 2681 check_preempt_curr(rq, p, WF_FORK);
2647#ifdef CONFIG_SMP 2682#ifdef CONFIG_SMP
2648 if (p->sched_class->task_wake_up) 2683 if (p->sched_class->task_woken)
2649 p->sched_class->task_wake_up(rq, p); 2684 p->sched_class->task_woken(rq, p);
2650#endif 2685#endif
2651 task_rq_unlock(rq, &flags); 2686 task_rq_unlock(rq, &flags);
2687 put_cpu();
2652} 2688}
2653 2689
2654#ifdef CONFIG_PREEMPT_NOTIFIERS 2690#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2767,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2767 */ 2803 */
2768 prev_state = prev->state; 2804 prev_state = prev->state;
2769 finish_arch_switch(prev); 2805 finish_arch_switch(prev);
2770 perf_event_task_sched_in(current, cpu_of(rq)); 2806#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2807 local_irq_disable();
2808#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2809 perf_event_task_sched_in(current);
2810#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2811 local_irq_enable();
2812#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2771 finish_lock_switch(rq, prev); 2813 finish_lock_switch(rq, prev);
2772 2814
2773 fire_sched_in_preempt_notifiers(current); 2815 fire_sched_in_preempt_notifiers(current);
@@ -2798,10 +2840,10 @@ static inline void post_schedule(struct rq *rq)
2798 if (rq->post_schedule) { 2840 if (rq->post_schedule) {
2799 unsigned long flags; 2841 unsigned long flags;
2800 2842
2801 spin_lock_irqsave(&rq->lock, flags); 2843 raw_spin_lock_irqsave(&rq->lock, flags);
2802 if (rq->curr->sched_class->post_schedule) 2844 if (rq->curr->sched_class->post_schedule)
2803 rq->curr->sched_class->post_schedule(rq); 2845 rq->curr->sched_class->post_schedule(rq);
2804 spin_unlock_irqrestore(&rq->lock, flags); 2846 raw_spin_unlock_irqrestore(&rq->lock, flags);
2805 2847
2806 rq->post_schedule = 0; 2848 rq->post_schedule = 0;
2807 } 2849 }
@@ -3072,65 +3114,36 @@ static void update_cpu_load(struct rq *this_rq)
3072#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3073 3115
3074/* 3116/*
3075 * double_rq_lock - safely lock two runqueues 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3076 * 3118 * this point the task has the smallest effective memory and cache footprint.
3077 * Note this does not disable interrupts like task_rq_lock,
3078 * you need to do so manually before calling.
3079 */
3080static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3081 __acquires(rq1->lock)
3082 __acquires(rq2->lock)
3083{
3084 BUG_ON(!irqs_disabled());
3085 if (rq1 == rq2) {
3086 spin_lock(&rq1->lock);
3087 __acquire(rq2->lock); /* Fake it out ;) */
3088 } else {
3089 if (rq1 < rq2) {
3090 spin_lock(&rq1->lock);
3091 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3092 } else {
3093 spin_lock(&rq2->lock);
3094 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3095 }
3096 }
3097 update_rq_clock(rq1);
3098 update_rq_clock(rq2);
3099}
3100
3101/*
3102 * double_rq_unlock - safely unlock two runqueues
3103 *
3104 * Note this does not restore interrupts like task_rq_unlock,
3105 * you need to do so manually after calling.
3106 */
3107static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3108 __releases(rq1->lock)
3109 __releases(rq2->lock)
3110{
3111 spin_unlock(&rq1->lock);
3112 if (rq1 != rq2)
3113 spin_unlock(&rq2->lock);
3114 else
3115 __release(rq2->lock);
3116}
3117
3118/*
3119 * If dest_cpu is allowed for this process, migrate the task to it.
3120 * This is accomplished by forcing the cpu_allowed mask to only
3121 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3122 * the cpu_allowed mask is restored.
3123 */ 3119 */
3124static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3120void sched_exec(void)
3125{ 3121{
3122 struct task_struct *p = current;
3126 struct migration_req req; 3123 struct migration_req req;
3124 int dest_cpu, this_cpu;
3127 unsigned long flags; 3125 unsigned long flags;
3128 struct rq *rq; 3126 struct rq *rq;
3129 3127
3128again:
3129 this_cpu = get_cpu();
3130 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3131 if (dest_cpu == this_cpu) {
3132 put_cpu();
3133 return;
3134 }
3135
3130 rq = task_rq_lock(p, &flags); 3136 rq = task_rq_lock(p, &flags);
3137 put_cpu();
3138
3139 /*
3140 * select_task_rq() can race against ->cpus_allowed
3141 */
3131 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3142 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3132 || unlikely(!cpu_active(dest_cpu))) 3143 || unlikely(!cpu_active(dest_cpu))) {
3133 goto out; 3144 task_rq_unlock(rq, &flags);
3145 goto again;
3146 }
3134 3147
3135 /* force the process onto the specified CPU */ 3148 /* force the process onto the specified CPU */
3136 if (migrate_task(p, dest_cpu, &req)) { 3149 if (migrate_task(p, dest_cpu, &req)) {
@@ -3145,1791 +3158,9 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3145 3158
3146 return; 3159 return;
3147 } 3160 }
3148out:
3149 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3150} 3162}
3151 3163
3152/*
3153 * sched_exec - execve() is a valuable balancing opportunity, because at
3154 * this point the task has the smallest effective memory and cache footprint.
3155 */
3156void sched_exec(void)
3157{
3158 int new_cpu, this_cpu = get_cpu();
3159 new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);
3160 put_cpu();
3161 if (new_cpu != this_cpu)
3162 sched_migrate_task(current, new_cpu);
3163}
3164
3165/*
3166 * pull_task - move a task from a remote runqueue to the local runqueue.
3167 * Both runqueues must be locked.
3168 */
static void pull_task(struct rq *src_rq, struct task_struct *p,
		      struct rq *this_rq, int this_cpu)
{
	/* Dequeue from the source runqueue... */
	deactivate_task(src_rq, p, 0);
	/* ...re-home the task before enqueueing it on the destination... */
	set_task_cpu(p, this_cpu);
	/* ...and make it runnable on this runqueue. */
	activate_task(this_rq, p, 0);
	/*
	 * Note that idle threads have a prio of MAX_PRIO, for this test
	 * to be always true for them.
	 */
	check_preempt_curr(this_rq, p, 0);
}
3181
3182/*
3183 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3184 */
3185static
3186int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3187 struct sched_domain *sd, enum cpu_idle_type idle,
3188 int *all_pinned)
3189{
3190 int tsk_cache_hot = 0;
3191 /*
3192 * We do not migrate tasks that are:
3193 * 1) running (obviously), or
3194 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3195 * 3) are cache-hot on their current CPU.
3196 */
3197 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3198 schedstat_inc(p, se.nr_failed_migrations_affine);
3199 return 0;
3200 }
3201 *all_pinned = 0;
3202
3203 if (task_running(rq, p)) {
3204 schedstat_inc(p, se.nr_failed_migrations_running);
3205 return 0;
3206 }
3207
3208 /*
3209 * Aggressive migration if:
3210 * 1) task is cache cold, or
3211 * 2) too many balance attempts have failed.
3212 */
3213
3214 tsk_cache_hot = task_hot(p, rq->clock, sd);
3215 if (!tsk_cache_hot ||
3216 sd->nr_balance_failed > sd->cache_nice_tries) {
3217#ifdef CONFIG_SCHEDSTATS
3218 if (tsk_cache_hot) {
3219 schedstat_inc(sd, lb_hot_gained[idle]);
3220 schedstat_inc(p, se.nr_forced_migrations);
3221 }
3222#endif
3223 return 1;
3224 }
3225
3226 if (tsk_cache_hot) {
3227 schedstat_inc(p, se.nr_failed_migrations_hot);
3228 return 0;
3229 }
3230 return 1;
3231}
3232
3233static unsigned long
3234balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3235 unsigned long max_load_move, struct sched_domain *sd,
3236 enum cpu_idle_type idle, int *all_pinned,
3237 int *this_best_prio, struct rq_iterator *iterator)
3238{
3239 int loops = 0, pulled = 0, pinned = 0;
3240 struct task_struct *p;
3241 long rem_load_move = max_load_move;
3242
3243 if (max_load_move == 0)
3244 goto out;
3245
3246 pinned = 1;
3247
3248 /*
3249 * Start the load-balancing iterator:
3250 */
3251 p = iterator->start(iterator->arg);
3252next:
3253 if (!p || loops++ > sysctl_sched_nr_migrate)
3254 goto out;
3255
3256 if ((p->se.load.weight >> 1) > rem_load_move ||
3257 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3258 p = iterator->next(iterator->arg);
3259 goto next;
3260 }
3261
3262 pull_task(busiest, p, this_rq, this_cpu);
3263 pulled++;
3264 rem_load_move -= p->se.load.weight;
3265
3266#ifdef CONFIG_PREEMPT
3267 /*
3268 * NEWIDLE balancing is a source of latency, so preemptible kernels
3269 * will stop after the first task is pulled to minimize the critical
3270 * section.
3271 */
3272 if (idle == CPU_NEWLY_IDLE)
3273 goto out;
3274#endif
3275
3276 /*
3277 * We only want to steal up to the prescribed amount of weighted load.
3278 */
3279 if (rem_load_move > 0) {
3280 if (p->prio < *this_best_prio)
3281 *this_best_prio = p->prio;
3282 p = iterator->next(iterator->arg);
3283 goto next;
3284 }
3285out:
3286 /*
3287 * Right now, this is one of only two places pull_task() is called,
3288 * so we can safely collect pull_task() stats here rather than
3289 * inside pull_task().
3290 */
3291 schedstat_add(sd, lb_gained[idle], pulled);
3292
3293 if (all_pinned)
3294 *all_pinned = pinned;
3295
3296 return max_load_move - rem_load_move;
3297}
3298
3299/*
3300 * move_tasks tries to move up to max_load_move weighted load from busiest to
3301 * this_rq, as part of a balancing operation within domain "sd".
3302 * Returns 1 if successful and 0 otherwise.
3303 *
3304 * Called with both runqueues locked.
3305 */
3306static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3307 unsigned long max_load_move,
3308 struct sched_domain *sd, enum cpu_idle_type idle,
3309 int *all_pinned)
3310{
3311 const struct sched_class *class = sched_class_highest;
3312 unsigned long total_load_moved = 0;
3313 int this_best_prio = this_rq->curr->prio;
3314
3315 do {
3316 total_load_moved +=
3317 class->load_balance(this_rq, this_cpu, busiest,
3318 max_load_move - total_load_moved,
3319 sd, idle, all_pinned, &this_best_prio);
3320 class = class->next;
3321
3322#ifdef CONFIG_PREEMPT
3323 /*
3324 * NEWIDLE balancing is a source of latency, so preemptible
3325 * kernels will stop after the first task is pulled to minimize
3326 * the critical section.
3327 */
3328 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3329 break;
3330#endif
3331 } while (class && max_load_move > total_load_moved);
3332
3333 return total_load_moved > 0;
3334}
3335
3336static int
3337iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3338 struct sched_domain *sd, enum cpu_idle_type idle,
3339 struct rq_iterator *iterator)
3340{
3341 struct task_struct *p = iterator->start(iterator->arg);
3342 int pinned = 0;
3343
3344 while (p) {
3345 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3346 pull_task(busiest, p, this_rq, this_cpu);
3347 /*
3348 * Right now, this is only the second place pull_task()
3349 * is called, so we can safely collect pull_task()
3350 * stats here rather than inside pull_task().
3351 */
3352 schedstat_inc(sd, lb_gained[idle]);
3353
3354 return 1;
3355 }
3356 p = iterator->next(iterator->arg);
3357 }
3358
3359 return 0;
3360}
3361
3362/*
3363 * move_one_task tries to move exactly one task from busiest to this_rq, as
3364 * part of active balancing operations within "domain".
3365 * Returns 1 if successful and 0 otherwise.
3366 *
3367 * Called with both runqueues locked.
3368 */
3369static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3370 struct sched_domain *sd, enum cpu_idle_type idle)
3371{
3372 const struct sched_class *class;
3373
3374 for_each_class(class) {
3375 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3376 return 1;
3377 }
3378
3379 return 0;
3380}
3381/********** Helpers for find_busiest_group ************************/
3382/*
3383 * sd_lb_stats - Structure to store the statistics of a sched_domain
3384 * during load balancing.
3385 */
struct sd_lb_stats {
	struct sched_group *busiest; /* Busiest group in this sd */
	struct sched_group *this;  /* Local group in this sd */
	unsigned long total_load;  /* Total load of all groups in sd */
	unsigned long total_pwr;   /* Total power of all groups in sd */
	unsigned long avg_load;	   /* Average load across all groups in sd */

	/* Statistics of the local ("this") group */
	unsigned long this_load;		/* avg_load of the local group */
	unsigned long this_load_per_task;	/* sum of its tasks' weighted load */
	unsigned long this_nr_running;		/* nr of tasks running in it */

	/* Statistics of the busiest group */
	unsigned long max_load;			/* avg_load of the busiest group */
	unsigned long busiest_load_per_task;	/* sum of its tasks' weighted load */
	unsigned long busiest_nr_running;	/* nr of tasks running in it */

	int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
	int power_savings_balance; /* Is powersave balance needed for this sd */
	struct sched_group *group_min; /* Least loaded group in sd */
	struct sched_group *group_leader; /* Group which relieves group_min */
	unsigned long min_load_per_task; /* load_per_task in group_min */
	unsigned long leader_nr_running; /* Nr running of group_leader */
	unsigned long min_nr_running; /* Nr running of group_min */
#endif
};
3413
3414/*
3415 * sg_lb_stats - stats of a sched_group required for load_balancing
3416 */
struct sg_lb_stats {
	unsigned long avg_load; /*Avg load across the CPUs of the group */
	unsigned long group_load; /* Total load over the CPUs of the group */
	unsigned long sum_nr_running; /* Nr tasks running in the group */
	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
	unsigned long group_capacity; /* cpu_power / SCHED_LOAD_SCALE, in tasks */
	int group_imb; /* Is there an imbalance in the group ? */
};
3425
3426/**
3427 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3428 * @group: The group whose first cpu is to be returned.
3429 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
	const struct cpumask *span = sched_group_cpus(group);

	/* Lowest-numbered CPU in the group's cpumask. */
	return cpumask_first(span);
}
3434
3435/**
3436 * get_sd_load_idx - Obtain the load index for a given sched domain.
3437 * @sd: The sched_domain whose load_idx is to be obtained.
3438 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3439 */
3440static inline int get_sd_load_idx(struct sched_domain *sd,
3441 enum cpu_idle_type idle)
3442{
3443 int load_idx;
3444
3445 switch (idle) {
3446 case CPU_NOT_IDLE:
3447 load_idx = sd->busy_idx;
3448 break;
3449
3450 case CPU_NEWLY_IDLE:
3451 load_idx = sd->newidle_idx;
3452 break;
3453 default:
3454 load_idx = sd->idle_idx;
3455 break;
3456 }
3457
3458 return load_idx;
3459}
3460
3461
3462#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3463/**
3464 * init_sd_power_savings_stats - Initialize power savings statistics for
3465 * the given sched_domain, during load balancing.
3466 *
3467 * @sd: Sched domain whose power-savings statistics are to be initialized.
3468 * @sds: Variable containing the statistics for sd.
3469 * @idle: Idle status of the CPU at which we're performing load-balancing.
3470 */
3471static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3472 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3473{
3474 /*
3475 * Busy processors will not participate in power savings
3476 * balance.
3477 */
3478 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3479 sds->power_savings_balance = 0;
3480 else {
3481 sds->power_savings_balance = 1;
3482 sds->min_nr_running = ULONG_MAX;
3483 sds->leader_nr_running = 0;
3484 }
3485}
3486
3487/**
3488 * update_sd_power_savings_stats - Update the power saving stats for a
3489 * sched_domain while performing load balancing.
3490 *
3491 * @group: sched_group belonging to the sched_domain under consideration.
3492 * @sds: Variable containing the statistics of the sched_domain
3493 * @local_group: Does group contain the CPU for which we're performing
3494 * load balancing ?
3495 * @sgs: Variable containing the statistics of the group.
3496 */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{

	if (!sds->power_savings_balance)
		return;

	/*
	 * If the local group is idle or completely loaded
	 * no need to do power savings balance at this domain
	 */
	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
				!sds->this_nr_running))
		sds->power_savings_balance = 0;

	/*
	 * If a group is already running at full capacity or idle,
	 * don't include that group in power savings calculations
	 * (note: the check above may have just cleared the flag).
	 */
	if (!sds->power_savings_balance ||
		sgs->sum_nr_running >= sgs->group_capacity ||
		!sgs->sum_nr_running)
		return;

	/*
	 * Calculate the group which has the least non-idle load.
	 * This is the group from where we need to pick up the load
	 * for saving power.  Ties are broken by the higher first CPU.
	 */
	if ((sgs->sum_nr_running < sds->min_nr_running) ||
	    (sgs->sum_nr_running == sds->min_nr_running &&
	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
		sds->group_min = group;
		sds->min_nr_running = sgs->sum_nr_running;
		sds->min_load_per_task = sgs->sum_weighted_load /
						sgs->sum_nr_running;
	}

	/*
	 * Calculate the group which is almost near its
	 * capacity but still has some space to pick up some load
	 * from other group and save more power
	 */
	if (sgs->sum_nr_running + 1 > sgs->group_capacity)
		return;

	/* Ties for the leader are broken by the lower first CPU. */
	if (sgs->sum_nr_running > sds->leader_nr_running ||
	    (sgs->sum_nr_running == sds->leader_nr_running &&
	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
		sds->group_leader = group;
		sds->leader_nr_running = sgs->sum_nr_running;
	}
}
3550
3551/**
3552 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3553 * @sds: Variable containing the statistics of the sched_domain
3554 * under consideration.
3555 * @this_cpu: Cpu at which we're currently performing load-balancing.
3556 * @imbalance: Variable to store the imbalance.
3557 *
3558 * Description:
3559 * Check if we have potential to perform some power-savings balance.
3560 * If yes, set the busiest group to be the least loaded group in the
3561 * sched_domain, so that it's CPUs can be put to idle.
3562 *
3563 * Returns 1 if there is potential to perform power-savings balance.
3564 * Else returns 0.
3565 */
3566static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3567 int this_cpu, unsigned long *imbalance)
3568{
3569 if (!sds->power_savings_balance)
3570 return 0;
3571
3572 if (sds->this != sds->group_leader ||
3573 sds->group_leader == sds->group_min)
3574 return 0;
3575
3576 *imbalance = sds->min_load_per_task;
3577 sds->busiest = sds->group_min;
3578
3579 return 1;
3580
3581}
3582#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3583static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3584 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3585{
3586 return;
3587}
3588
/* Power-savings balancing compiled out: stats update is a no-op. */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
}
3594
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
	int this_cpu, unsigned long *imbalance)
{
	/* Power-savings balancing compiled out: never report a busiest group. */
	return 0;
}
3600#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3601
3602
/* Default frequency scaling factor: full SCHED_LOAD_SCALE (no adjustment). */
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return SCHED_LOAD_SCALE;
}
3607
/* Weak hook: architectures may override; falls back to the default scaling. */
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
{
	return default_scale_freq_power(sd, cpu);
}
3612
3613unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3614{
3615 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3616 unsigned long smt_gain = sd->smt_gain;
3617
3618 smt_gain /= weight;
3619
3620 return smt_gain;
3621}
3622
/* Weak hook: architectures may override; falls back to the default scaling. */
unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
{
	return default_scale_smt_power(sd, cpu);
}
3627
3628unsigned long scale_rt_power(int cpu)
3629{
3630 struct rq *rq = cpu_rq(cpu);
3631 u64 total, available;
3632
3633 sched_avg_update(rq);
3634
3635 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3636 available = total - rq->rt_avg;
3637
3638 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3639 total = SCHED_LOAD_SCALE;
3640
3641 total >>= SCHED_LOAD_SHIFT;
3642
3643 return div_u64(available, total);
3644}
3645
3646static void update_cpu_power(struct sched_domain *sd, int cpu)
3647{
3648 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3649 unsigned long power = SCHED_LOAD_SCALE;
3650 struct sched_group *sdg = sd->groups;
3651
3652 if (sched_feat(ARCH_POWER))
3653 power *= arch_scale_freq_power(sd, cpu);
3654 else
3655 power *= default_scale_freq_power(sd, cpu);
3656
3657 power >>= SCHED_LOAD_SHIFT;
3658
3659 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3660 if (sched_feat(ARCH_POWER))
3661 power *= arch_scale_smt_power(sd, cpu);
3662 else
3663 power *= default_scale_smt_power(sd, cpu);
3664
3665 power >>= SCHED_LOAD_SHIFT;
3666 }
3667
3668 power *= scale_rt_power(cpu);
3669 power >>= SCHED_LOAD_SHIFT;
3670
3671 if (!power)
3672 power = 1;
3673
3674 sdg->cpu_power = power;
3675}
3676
3677static void update_group_power(struct sched_domain *sd, int cpu)
3678{
3679 struct sched_domain *child = sd->child;
3680 struct sched_group *group, *sdg = sd->groups;
3681 unsigned long power;
3682
3683 if (!child) {
3684 update_cpu_power(sd, cpu);
3685 return;
3686 }
3687
3688 power = 0;
3689
3690 group = child->groups;
3691 do {
3692 power += group->cpu_power;
3693 group = group->next;
3694 } while (group != child->groups);
3695
3696 sdg->cpu_power = power;
3697}
3698
3699/**
3700 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3701 * @sd: The sched_domain whose statistics are to be updated.
3702 * @group: sched_group whose statistics are to be updated.
3703 * @this_cpu: Cpu for which load balance is currently performed.
3704 * @idle: Idle status of this_cpu
3705 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3706 * @sd_idle: Idle status of the sched_domain containing group.
3707 * @local_group: Does group contain this_cpu.
3708 * @cpus: Set of cpus considered for load balancing.
3709 * @balance: Should we balance.
3710 * @sgs: variable to hold the statistics for this group.
3711 */
static inline void update_sg_lb_stats(struct sched_domain *sd,
			struct sched_group *group, int this_cpu,
			enum cpu_idle_type idle, int load_idx, int *sd_idle,
			int local_group, const struct cpumask *cpus,
			int *balance, struct sg_lb_stats *sgs)
{
	unsigned long load, max_cpu_load, min_cpu_load;
	int i;
	unsigned int balance_cpu = -1, first_idle_cpu = 0;
	unsigned long sum_avg_load_per_task;
	unsigned long avg_load_per_task;

	if (local_group) {
		balance_cpu = group_first_cpu(group);
		/* Only the designated balance cpu refreshes the group power. */
		if (balance_cpu == this_cpu)
			update_group_power(sd, this_cpu);
	}

	/* Tally up the load of all CPUs in the group */
	sum_avg_load_per_task = avg_load_per_task = 0;
	max_cpu_load = 0;
	min_cpu_load = ~0UL;

	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
		struct rq *rq = cpu_rq(i);

		/* Any runnable task means the domain is not fully idle. */
		if (*sd_idle && rq->nr_running)
			*sd_idle = 0;

		/* Bias balancing toward cpus of our domain */
		if (local_group) {
			/* Prefer the first idle cpu as the balance cpu. */
			if (idle_cpu(i) && !first_idle_cpu) {
				first_idle_cpu = 1;
				balance_cpu = i;
			}

			load = target_load(i, load_idx);
		} else {
			load = source_load(i, load_idx);
			if (load > max_cpu_load)
				max_cpu_load = load;
			if (min_cpu_load > load)
				min_cpu_load = load;
		}

		sgs->group_load += load;
		sgs->sum_nr_running += rq->nr_running;
		sgs->sum_weighted_load += weighted_cpuload(i);

		sum_avg_load_per_task += cpu_avg_load_per_task(i);
	}

	/*
	 * First idle cpu or the first cpu(busiest) in this sched group
	 * is eligible for doing load balancing at this and above
	 * domains. In the newly idle case, we will allow all the cpu's
	 * to do the newly idle load balance.
	 */
	if (idle != CPU_NEWLY_IDLE && local_group &&
	    balance_cpu != this_cpu && balance) {
		*balance = 0;
		return;
	}

	/* Adjust by relative CPU power of the group */
	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;


	/*
	 * Consider the group unbalanced when the imbalance is larger
	 * than the average weight of two tasks.
	 *
	 * APZ: with cgroup the avg task weight can vary wildly and
	 * might not be a suitable number - should we keep a
	 * normalized nr_running number somewhere that negates
	 * the hierarchy?
	 */
	avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
		group->cpu_power;

	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
		sgs->group_imb = 1;

	sgs->group_capacity =
		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
}
3798
3799/**
3800 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3801 * @sd: sched_domain whose statistics are to be updated.
3802 * @this_cpu: Cpu for which load balance is currently performed.
3803 * @idle: Idle status of this_cpu
3804 * @sd_idle: Idle status of the sched_domain containing group.
3805 * @cpus: Set of cpus considered for load balancing.
3806 * @balance: Should we balance.
3807 * @sds: variable to hold the statistics for this sched_domain.
3808 */
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
			enum cpu_idle_type idle, int *sd_idle,
			const struct cpumask *cpus, int *balance,
			struct sd_lb_stats *sds)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group = sd->groups;
	struct sg_lb_stats sgs;
	int load_idx, prefer_sibling = 0;

	if (child && child->flags & SD_PREFER_SIBLING)
		prefer_sibling = 1;

	init_sd_power_savings_stats(sd, sds, idle);
	load_idx = get_sd_load_idx(sd, idle);

	/* Walk the domain's circular list of groups. */
	do {
		int local_group;

		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_cpus(group));
		memset(&sgs, 0, sizeof(sgs));
		update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
				local_group, cpus, balance, &sgs);

		/* this_cpu is not the one to balance at this level: bail. */
		if (local_group && balance && !(*balance))
			return;

		sds->total_load += sgs.group_load;
		sds->total_pwr += group->cpu_power;

		/*
		 * In case the child domain prefers tasks go to siblings
		 * first, lower the group capacity to one so that we'll try
		 * and move all the excess tasks away.
		 */
		if (prefer_sibling)
			sgs.group_capacity = min(sgs.group_capacity, 1UL);

		if (local_group) {
			sds->this_load = sgs.avg_load;
			sds->this = group;
			sds->this_nr_running = sgs.sum_nr_running;
			sds->this_load_per_task = sgs.sum_weighted_load;
		} else if (sgs.avg_load > sds->max_load &&
			   (sgs.sum_nr_running > sgs.group_capacity ||
				sgs.group_imb)) {
			/* New busiest candidate: over capacity or imbalanced. */
			sds->max_load = sgs.avg_load;
			sds->busiest = group;
			sds->busiest_nr_running = sgs.sum_nr_running;
			sds->busiest_load_per_task = sgs.sum_weighted_load;
			sds->group_imb = sgs.group_imb;
		}

		update_sd_power_savings_stats(group, sds, local_group, &sgs);
		group = group->next;
	} while (group != sd->groups);
}
3867
3868/**
3869 * fix_small_imbalance - Calculate the minor imbalance that exists
3870 * amongst the groups of a sched_domain, during
3871 * load balancing.
3872 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3873 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3874 * @imbalance: Variable to store the imbalance.
3875 */
static inline void fix_small_imbalance(struct sd_lb_stats *sds,
				int this_cpu, unsigned long *imbalance)
{
	unsigned long tmp, pwr_now = 0, pwr_move = 0;
	unsigned int imbn = 2;

	if (sds->this_nr_running) {
		sds->this_load_per_task /= sds->this_nr_running;
		if (sds->busiest_load_per_task >
				sds->this_load_per_task)
			imbn = 1;
	} else
		/* Local group idle: estimate from this cpu's average. */
		sds->this_load_per_task =
			cpu_avg_load_per_task(this_cpu);

	/* Large enough gap: move one busiest-group task's worth of load. */
	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
			sds->busiest_load_per_task * imbn) {
		*imbalance = sds->busiest_load_per_task;
		return;
	}

	/*
	 * OK, we don't have enough imbalance to justify moving tasks,
	 * however we may be able to increase total CPU power used by
	 * moving them.
	 */

	pwr_now += sds->busiest->cpu_power *
			min(sds->busiest_load_per_task, sds->max_load);
	pwr_now += sds->this->cpu_power *
			min(sds->this_load_per_task, sds->this_load);
	pwr_now /= SCHED_LOAD_SCALE;

	/* Amount of load we'd subtract */
	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
		sds->busiest->cpu_power;
	if (sds->max_load > tmp)
		pwr_move += sds->busiest->cpu_power *
			min(sds->busiest_load_per_task, sds->max_load - tmp);

	/* Amount of load we'd add */
	if (sds->max_load * sds->busiest->cpu_power <
		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
		tmp = (sds->max_load * sds->busiest->cpu_power) /
			sds->this->cpu_power;
	else
		tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
			sds->this->cpu_power;
	pwr_move += sds->this->cpu_power *
			min(sds->this_load_per_task, sds->this_load + tmp);
	pwr_move /= SCHED_LOAD_SCALE;

	/* Move if we gain throughput */
	if (pwr_move > pwr_now)
		*imbalance = sds->busiest_load_per_task;
}
3932
3933/**
3934 * calculate_imbalance - Calculate the amount of imbalance present within the
3935 * groups of a given sched_domain during load balance.
3936 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3937 * @this_cpu: Cpu for which currently load balance is being performed.
3938 * @imbalance: The variable to store the imbalance.
3939 */
static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
		unsigned long *imbalance)
{
	unsigned long max_pull;
	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load(as we skip the groups at or below
	 * its cpu_power, while calculating max_load..)
	 */
	if (sds->max_load < sds->avg_load) {
		*imbalance = 0;
		return fix_small_imbalance(sds, this_cpu, imbalance);
	}

	/* Don't want to pull so many tasks that a group would go idle */
	max_pull = min(sds->max_load - sds->avg_load,
			sds->max_load - sds->busiest_load_per_task);

	/* How much load to actually move to equalise the imbalance */
	*imbalance = min(max_pull * sds->busiest->cpu_power,
		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
			/ SCHED_LOAD_SCALE;

	/*
	 * if *imbalance is less than the average load per runnable task
	 * there is no guarantee that any tasks will be moved so we'll have
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
	if (*imbalance < sds->busiest_load_per_task)
		return fix_small_imbalance(sds, this_cpu, imbalance);

}
3973/******* find_busiest_group() helpers end here *********************/
3974
3975/**
3976 * find_busiest_group - Returns the busiest group within the sched_domain
3977 * if there is an imbalance. If there isn't an imbalance, and
3978 * the user has opted for power-savings, it returns a group whose
3979 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3980 * such a group exists.
3981 *
3982 * Also calculates the amount of weighted load which should be moved
3983 * to restore balance.
3984 *
3985 * @sd: The sched_domain whose busiest group is to be returned.
3986 * @this_cpu: The cpu for which load balancing is currently being performed.
3987 * @imbalance: Variable which stores amount of weighted load which should
3988 * be moved to restore balance/put a group to idle.
3989 * @idle: The idle status of this_cpu.
3990 * @sd_idle: The idleness of sd
3991 * @cpus: The set of CPUs under consideration for load-balancing.
3992 * @balance: Pointer to a variable indicating if this_cpu
3993 * is the appropriate cpu to perform load balancing at this_level.
3994 *
3995 * Returns: - the busiest group if imbalance exists.
3996 * - If no imbalance and user has opted for power-savings balance,
3997 * return the least loaded group whose CPUs can be
3998 * put to idle by rebalancing its tasks onto our group.
3999 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
		   unsigned long *imbalance, enum cpu_idle_type idle,
		   int *sd_idle, const struct cpumask *cpus, int *balance)
{
	struct sd_lb_stats sds;

	memset(&sds, 0, sizeof(sds));

	/*
	 * Compute the various statistics relevant for load balancing at
	 * this level.
	 */
	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
					balance, &sds);

	/* Cases where imbalance does not exist from POV of this_cpu */
	/* 1) this_cpu is not the appropriate cpu to perform load balancing
	 *    at this level.
	 * 2) There is no busy sibling group to pull from.
	 * 3) This group is the busiest group.
	 * 4) This group is more busy than the avg busyness at this
	 *    sched_domain.
	 * 5) The imbalance is within the specified limit.
	 * 6) Any rebalance would lead to ping-pong
	 */
	if (balance && !(*balance))
		goto ret;

	if (!sds.busiest || sds.busiest_nr_running == 0)
		goto out_balanced;

	if (sds.this_load >= sds.max_load)
		goto out_balanced;

	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;

	if (sds.this_load >= sds.avg_load)
		goto out_balanced;

	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
		goto out_balanced;

	sds.busiest_load_per_task /= sds.busiest_nr_running;
	if (sds.group_imb)
		sds.busiest_load_per_task =
			min(sds.busiest_load_per_task, sds.avg_load);

	/*
	 * We're trying to get all the cpus to the average_load, so we don't
	 * want to push ourselves above the average load, nor do we wish to
	 * reduce the max loaded cpu below the average load, as either of these
	 * actions would just result in more rebalancing later, and ping-pong
	 * tasks around. Thus we look for the minimum possible imbalance.
	 * Negative imbalances (*we* are more loaded than anyone else) will
	 * be counted as no imbalance for these purposes -- we can't fix that
	 * by pulling tasks to us. Be careful of negative numbers as they'll
	 * appear as very large values with unsigned longs.
	 */
	if (sds.max_load <= sds.busiest_load_per_task)
		goto out_balanced;

	/* Looks like there is an imbalance. Compute it */
	calculate_imbalance(&sds, this_cpu, imbalance);
	return sds.busiest;

out_balanced:
	/*
	 * There is no obvious imbalance. But check if we can do some balancing
	 * to save power.
	 */
	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
		return sds.busiest;
ret:
	*imbalance = 0;
	return NULL;
}
4077
4078/*
4079 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4080 */
4081static struct rq *
4082find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4083 unsigned long imbalance, const struct cpumask *cpus)
4084{
4085 struct rq *busiest = NULL, *rq;
4086 unsigned long max_load = 0;
4087 int i;
4088
4089 for_each_cpu(i, sched_group_cpus(group)) {
4090 unsigned long power = power_of(i);
4091 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4092 unsigned long wl;
4093
4094 if (!cpumask_test_cpu(i, cpus))
4095 continue;
4096
4097 rq = cpu_rq(i);
4098 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4099 wl /= power;
4100
4101 if (capacity && rq->nr_running == 1 && wl > imbalance)
4102 continue;
4103
4104 if (wl > max_load) {
4105 max_load = wl;
4106 busiest = rq;
4107 }
4108 }
4109
4110 return busiest;
4111}
4112
4113/*
4114 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4115 * so long as it is large enough.
4116 */
4117#define MAX_PINNED_INTERVAL 512
4118
4119/* Working cpumask for load_balance and load_balance_newidle. */
4120static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4121
4122/*
4123 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4124 * tasks if there is an imbalance.
4125 */
4126static int load_balance(int this_cpu, struct rq *this_rq,
4127 struct sched_domain *sd, enum cpu_idle_type idle,
4128 int *balance)
4129{
4130 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4131 struct sched_group *group;
4132 unsigned long imbalance;
4133 struct rq *busiest;
4134 unsigned long flags;
4135 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4136
4137 cpumask_copy(cpus, cpu_online_mask);
4138
4139 /*
4140 * When power savings policy is enabled for the parent domain, idle
4141 * sibling can pick up load irrespective of busy siblings. In this case,
4142 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4143 * portraying it as CPU_NOT_IDLE.
4144 */
4145 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4146 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4147 sd_idle = 1;
4148
4149 schedstat_inc(sd, lb_count[idle]);
4150
4151redo:
4152 update_shares(sd);
4153 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4154 cpus, balance);
4155
4156 if (*balance == 0)
4157 goto out_balanced;
4158
4159 if (!group) {
4160 schedstat_inc(sd, lb_nobusyg[idle]);
4161 goto out_balanced;
4162 }
4163
4164 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4165 if (!busiest) {
4166 schedstat_inc(sd, lb_nobusyq[idle]);
4167 goto out_balanced;
4168 }
4169
4170 BUG_ON(busiest == this_rq);
4171
4172 schedstat_add(sd, lb_imbalance[idle], imbalance);
4173
4174 ld_moved = 0;
4175 if (busiest->nr_running > 1) {
4176 /*
4177 * Attempt to move tasks. If find_busiest_group has found
4178 * an imbalance but busiest->nr_running <= 1, the group is
4179 * still unbalanced. ld_moved simply stays zero, so it is
4180 * correctly treated as an imbalance.
4181 */
4182 local_irq_save(flags);
4183 double_rq_lock(this_rq, busiest);
4184 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4185 imbalance, sd, idle, &all_pinned);
4186 double_rq_unlock(this_rq, busiest);
4187 local_irq_restore(flags);
4188
4189 /*
4190 * some other cpu did the load balance for us.
4191 */
4192 if (ld_moved && this_cpu != smp_processor_id())
4193 resched_cpu(this_cpu);
4194
4195 /* All tasks on this runqueue were pinned by CPU affinity */
4196 if (unlikely(all_pinned)) {
4197 cpumask_clear_cpu(cpu_of(busiest), cpus);
4198 if (!cpumask_empty(cpus))
4199 goto redo;
4200 goto out_balanced;
4201 }
4202 }
4203
4204 if (!ld_moved) {
4205 schedstat_inc(sd, lb_failed[idle]);
4206 sd->nr_balance_failed++;
4207
4208 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4209
4210 spin_lock_irqsave(&busiest->lock, flags);
4211
4212 /* don't kick the migration_thread, if the curr
4213 * task on busiest cpu can't be moved to this_cpu
4214 */
4215 if (!cpumask_test_cpu(this_cpu,
4216 &busiest->curr->cpus_allowed)) {
4217 spin_unlock_irqrestore(&busiest->lock, flags);
4218 all_pinned = 1;
4219 goto out_one_pinned;
4220 }
4221
4222 if (!busiest->active_balance) {
4223 busiest->active_balance = 1;
4224 busiest->push_cpu = this_cpu;
4225 active_balance = 1;
4226 }
4227 spin_unlock_irqrestore(&busiest->lock, flags);
4228 if (active_balance)
4229 wake_up_process(busiest->migration_thread);
4230
4231 /*
4232 * We've kicked active balancing, reset the failure
4233 * counter.
4234 */
4235 sd->nr_balance_failed = sd->cache_nice_tries+1;
4236 }
4237 } else
4238 sd->nr_balance_failed = 0;
4239
4240 if (likely(!active_balance)) {
4241 /* We were unbalanced, so reset the balancing interval */
4242 sd->balance_interval = sd->min_interval;
4243 } else {
4244 /*
4245 * If we've begun active balancing, start to back off. This
4246 * case may not be covered by the all_pinned logic if there
4247 * is only 1 task on the busy runqueue (because we don't call
4248 * move_tasks).
4249 */
4250 if (sd->balance_interval < sd->max_interval)
4251 sd->balance_interval *= 2;
4252 }
4253
4254 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4255 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4256 ld_moved = -1;
4257
4258 goto out;
4259
4260out_balanced:
4261 schedstat_inc(sd, lb_balanced[idle]);
4262
4263 sd->nr_balance_failed = 0;
4264
4265out_one_pinned:
4266 /* tune up the balancing interval */
4267 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4268 (sd->balance_interval < sd->max_interval))
4269 sd->balance_interval *= 2;
4270
4271 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4272 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4273 ld_moved = -1;
4274 else
4275 ld_moved = 0;
4276out:
4277 if (ld_moved)
4278 update_shares(sd);
4279 return ld_moved;
4280}
4281
4282/*
4283 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4284 * tasks if there is an imbalance.
4285 *
4286 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4287 * this_rq is locked.
4288 */
4289static int
4290load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4291{
4292 struct sched_group *group;
4293 struct rq *busiest = NULL;
4294 unsigned long imbalance;
4295 int ld_moved = 0;
4296 int sd_idle = 0;
4297 int all_pinned = 0;
4298 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4299
4300 cpumask_copy(cpus, cpu_online_mask);
4301
4302 /*
4303 * When power savings policy is enabled for the parent domain, idle
4304 * sibling can pick up load irrespective of busy siblings. In this case,
4305 * let the state of idle sibling percolate up as IDLE, instead of
4306 * portraying it as CPU_NOT_IDLE.
4307 */
4308 if (sd->flags & SD_SHARE_CPUPOWER &&
4309 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4310 sd_idle = 1;
4311
4312 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4313redo:
4314 update_shares_locked(this_rq, sd);
4315 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4316 &sd_idle, cpus, NULL);
4317 if (!group) {
4318 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4319 goto out_balanced;
4320 }
4321
4322 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4323 if (!busiest) {
4324 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4325 goto out_balanced;
4326 }
4327
4328 BUG_ON(busiest == this_rq);
4329
4330 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4331
4332 ld_moved = 0;
4333 if (busiest->nr_running > 1) {
4334 /* Attempt to move tasks */
4335 double_lock_balance(this_rq, busiest);
4336 /* this_rq->clock is already updated */
4337 update_rq_clock(busiest);
4338 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4339 imbalance, sd, CPU_NEWLY_IDLE,
4340 &all_pinned);
4341 double_unlock_balance(this_rq, busiest);
4342
4343 if (unlikely(all_pinned)) {
4344 cpumask_clear_cpu(cpu_of(busiest), cpus);
4345 if (!cpumask_empty(cpus))
4346 goto redo;
4347 }
4348 }
4349
4350 if (!ld_moved) {
4351 int active_balance = 0;
4352
4353 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4354 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4355 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4356 return -1;
4357
4358 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4359 return -1;
4360
4361 if (sd->nr_balance_failed++ < 2)
4362 return -1;
4363
4364 /*
4365 * The only task running in a non-idle cpu can be moved to this
4366 * cpu in an attempt to completely freeup the other CPU
4367 * package. The same method used to move task in load_balance()
4368 * have been extended for load_balance_newidle() to speedup
4369 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4370 *
4371 * The package power saving logic comes from
4372 * find_busiest_group(). If there are no imbalance, then
4373 * f_b_g() will return NULL. However when sched_mc={1,2} then
4374 * f_b_g() will select a group from which a running task may be
4375 * pulled to this cpu in order to make the other package idle.
4376 * If there is no opportunity to make a package idle and if
4377 * there are no imbalance, then f_b_g() will return NULL and no
4378 * action will be taken in load_balance_newidle().
4379 *
4380 * Under normal task pull operation due to imbalance, there
4381 * will be more than one task in the source run queue and
4382 * move_tasks() will succeed. ld_moved will be true and this
4383 * active balance code will not be triggered.
4384 */
4385
4386 /* Lock busiest in correct order while this_rq is held */
4387 double_lock_balance(this_rq, busiest);
4388
4389 /*
4390 * don't kick the migration_thread, if the curr
4391 * task on busiest cpu can't be moved to this_cpu
4392 */
4393 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4394 double_unlock_balance(this_rq, busiest);
4395 all_pinned = 1;
4396 return ld_moved;
4397 }
4398
4399 if (!busiest->active_balance) {
4400 busiest->active_balance = 1;
4401 busiest->push_cpu = this_cpu;
4402 active_balance = 1;
4403 }
4404
4405 double_unlock_balance(this_rq, busiest);
4406 /*
4407 * Should not call ttwu while holding a rq->lock
4408 */
4409 spin_unlock(&this_rq->lock);
4410 if (active_balance)
4411 wake_up_process(busiest->migration_thread);
4412 spin_lock(&this_rq->lock);
4413
4414 } else
4415 sd->nr_balance_failed = 0;
4416
4417 update_shares_locked(this_rq, sd);
4418 return ld_moved;
4419
4420out_balanced:
4421 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4422 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4423 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4424 return -1;
4425 sd->nr_balance_failed = 0;
4426
4427 return 0;
4428}
4429
4430/*
4431 * idle_balance is called by schedule() if this_cpu is about to become
4432 * idle. Attempts to pull tasks from other CPUs.
4433 */
4434static void idle_balance(int this_cpu, struct rq *this_rq)
4435{
4436 struct sched_domain *sd;
4437 int pulled_task = 0;
4438 unsigned long next_balance = jiffies + HZ;
4439
4440 this_rq->idle_stamp = this_rq->clock;
4441
4442 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4443 return;
4444
4445 for_each_domain(this_cpu, sd) {
4446 unsigned long interval;
4447
4448 if (!(sd->flags & SD_LOAD_BALANCE))
4449 continue;
4450
4451 if (sd->flags & SD_BALANCE_NEWIDLE)
4452 /* If we've pulled tasks over stop searching: */
4453 pulled_task = load_balance_newidle(this_cpu, this_rq,
4454 sd);
4455
4456 interval = msecs_to_jiffies(sd->balance_interval);
4457 if (time_after(next_balance, sd->last_balance + interval))
4458 next_balance = sd->last_balance + interval;
4459 if (pulled_task) {
4460 this_rq->idle_stamp = 0;
4461 break;
4462 }
4463 }
4464 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4465 /*
4466 * We are going idle. next_balance may be set based on
4467 * a busy processor. So reset next_balance.
4468 */
4469 this_rq->next_balance = next_balance;
4470 }
4471}
4472
4473/*
4474 * active_load_balance is run by migration threads. It pushes running tasks
4475 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4476 * running on each physical CPU where possible, and avoids physical /
4477 * logical imbalances.
4478 *
4479 * Called with busiest_rq locked.
4480 */
4481static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4482{
4483 int target_cpu = busiest_rq->push_cpu;
4484 struct sched_domain *sd;
4485 struct rq *target_rq;
4486
4487 /* Is there any task to move? */
4488 if (busiest_rq->nr_running <= 1)
4489 return;
4490
4491 target_rq = cpu_rq(target_cpu);
4492
4493 /*
4494 * This condition is "impossible", if it occurs
4495 * we need to fix it. Originally reported by
4496 * Bjorn Helgaas on a 128-cpu setup.
4497 */
4498 BUG_ON(busiest_rq == target_rq);
4499
4500 /* move a task from busiest_rq to target_rq */
4501 double_lock_balance(busiest_rq, target_rq);
4502 update_rq_clock(busiest_rq);
4503 update_rq_clock(target_rq);
4504
4505 /* Search for an sd spanning us and the target CPU. */
4506 for_each_domain(target_cpu, sd) {
4507 if ((sd->flags & SD_LOAD_BALANCE) &&
4508 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4509 break;
4510 }
4511
4512 if (likely(sd)) {
4513 schedstat_inc(sd, alb_count);
4514
4515 if (move_one_task(target_rq, target_cpu, busiest_rq,
4516 sd, CPU_IDLE))
4517 schedstat_inc(sd, alb_pushed);
4518 else
4519 schedstat_inc(sd, alb_failed);
4520 }
4521 double_unlock_balance(busiest_rq, target_rq);
4522}
4523
4524#ifdef CONFIG_NO_HZ
static struct {
	/* cpu id of the current idle load balance owner, -1 if none */
	atomic_t load_balancer;
	/* cpus whose ticks are currently stopped (tickless idle) */
	cpumask_var_t cpu_mask;
	/* scratch mask used by is_semi_idle_group() */
	cpumask_var_t ilb_grp_nohz_mask;
} nohz ____cacheline_aligned = {
	.load_balancer = ATOMIC_INIT(-1),
};
4532
/*
 * Return the cpu currently nominated as the idle load balancer,
 * or -1 when there is no ilb owner.
 */
int get_nohz_load_balancer(void)
{
	return atomic_read(&nohz.load_balancer);
}
4537
4538#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4539/**
4540 * lowest_flag_domain - Return lowest sched_domain containing flag.
4541 * @cpu: The cpu whose lowest level of sched domain is to
4542 * be returned.
4543 * @flag: The flag to check for the lowest sched_domain
4544 * for the given cpu.
4545 *
4546 * Returns the lowest sched_domain of a cpu which contains the given flag.
4547 */
4548static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4549{
4550 struct sched_domain *sd;
4551
4552 for_each_domain(cpu, sd)
4553 if (sd && (sd->flags & flag))
4554 break;
4555
4556 return sd;
4557}
4558
4559/**
4560 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4561 * @cpu: The cpu whose domains we're iterating over.
4562 * @sd: variable holding the value of the power_savings_sd
4563 * for cpu.
4564 * @flag: The flag to filter the sched_domains to be iterated.
4565 *
4566 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4567 * set, starting from the lowest sched_domain to the highest.
4568 */
4569#define for_each_flag_domain(cpu, sd, flag) \
4570 for (sd = lowest_flag_domain(cpu, flag); \
4571 (sd && (sd->flags & flag)); sd = sd->parent)
4572
4573/**
4574 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4575 * @ilb_group: group to be checked for semi-idleness
4576 *
4577 * Returns: 1 if the group is semi-idle. 0 otherwise.
4578 *
4579 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4580 * and atleast one non-idle CPU. This helper function checks if the given
4581 * sched_group is semi-idle or not.
4582 */
4583static inline int is_semi_idle_group(struct sched_group *ilb_group)
4584{
4585 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4586 sched_group_cpus(ilb_group));
4587
4588 /*
4589 * A sched_group is semi-idle when it has atleast one busy cpu
4590 * and atleast one idle cpu.
4591 */
4592 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4593 return 0;
4594
4595 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4596 return 0;
4597
4598 return 1;
4599}
4600/**
4601 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4602 * @cpu: The cpu which is nominating a new idle_load_balancer.
4603 *
4604 * Returns: Returns the id of the idle load balancer if it exists,
4605 * Else, returns >= nr_cpu_ids.
4606 *
4607 * This algorithm picks the idle load balancer such that it belongs to a
4608 * semi-idle powersavings sched_domain. The idea is to try and avoid
4609 * completely idle packages/cores just for the purpose of idle load balancing
4610 * when there are other idle cpu's which are better suited for that job.
4611 */
4612static int find_new_ilb(int cpu)
4613{
4614 struct sched_domain *sd;
4615 struct sched_group *ilb_group;
4616
4617 /*
4618 * Have idle load balancer selection from semi-idle packages only
4619 * when power-aware load balancing is enabled
4620 */
4621 if (!(sched_smt_power_savings || sched_mc_power_savings))
4622 goto out_done;
4623
4624 /*
4625 * Optimize for the case when we have no idle CPUs or only one
4626 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4627 */
4628 if (cpumask_weight(nohz.cpu_mask) < 2)
4629 goto out_done;
4630
4631 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4632 ilb_group = sd->groups;
4633
4634 do {
4635 if (is_semi_idle_group(ilb_group))
4636 return cpumask_first(nohz.ilb_grp_nohz_mask);
4637
4638 ilb_group = ilb_group->next;
4639
4640 } while (ilb_group != sd->groups);
4641 }
4642
4643out_done:
4644 return cpumask_first(nohz.cpu_mask);
4645}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
/*
 * Without MC/SMT power savings there is no semi-idle heuristic:
 * simply nominate the first tickless cpu.
 */
static inline int find_new_ilb(int call_cpu)
{
	return cpumask_first(nohz.cpu_mask);
}
#endif
4652
4653/*
4654 * This routine will try to nominate the ilb (idle load balancing)
4655 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4656 * load balancing on behalf of all those cpus. If all the cpus in the system
4657 * go into this tickless mode, then there will be no ilb owner (as there is
4658 * no need for one) and all the cpus will sleep till the next wakeup event
4659 * arrives...
4660 *
4661 * For the ilb owner, tick is not stopped. And this tick will be used
4662 * for idle load balancing. ilb owner will still be part of
4663 * nohz.cpu_mask..
4664 *
4665 * While stopping the tick, this cpu will become the ilb owner if there
4666 * is no other owner. And will be the owner till that cpu becomes busy
4667 * or if all cpus in the system stop their ticks at which point
4668 * there is no need for ilb owner.
4669 *
4670 * When the ilb owner becomes busy, it nominates another owner, during the
4671 * next busy scheduler_tick()
4672 */
4673int select_nohz_load_balancer(int stop_tick)
4674{
4675 int cpu = smp_processor_id();
4676
4677 if (stop_tick) {
4678 cpu_rq(cpu)->in_nohz_recently = 1;
4679
4680 if (!cpu_active(cpu)) {
4681 if (atomic_read(&nohz.load_balancer) != cpu)
4682 return 0;
4683
4684 /*
4685 * If we are going offline and still the leader,
4686 * give up!
4687 */
4688 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4689 BUG();
4690
4691 return 0;
4692 }
4693
4694 cpumask_set_cpu(cpu, nohz.cpu_mask);
4695
4696 /* time for ilb owner also to sleep */
4697 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4698 if (atomic_read(&nohz.load_balancer) == cpu)
4699 atomic_set(&nohz.load_balancer, -1);
4700 return 0;
4701 }
4702
4703 if (atomic_read(&nohz.load_balancer) == -1) {
4704 /* make me the ilb owner */
4705 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4706 return 1;
4707 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4708 int new_ilb;
4709
4710 if (!(sched_smt_power_savings ||
4711 sched_mc_power_savings))
4712 return 1;
4713 /*
4714 * Check to see if there is a more power-efficient
4715 * ilb.
4716 */
4717 new_ilb = find_new_ilb(cpu);
4718 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4719 atomic_set(&nohz.load_balancer, -1);
4720 resched_cpu(new_ilb);
4721 return 0;
4722 }
4723 return 1;
4724 }
4725 } else {
4726 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4727 return 0;
4728
4729 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4730
4731 if (atomic_read(&nohz.load_balancer) == cpu)
4732 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4733 BUG();
4734 }
4735 return 0;
4736}
4737#endif
4738
/* Serializes balancing of sched_domains that have SD_SERIALIZE set. */
static DEFINE_SPINLOCK(balancing);
4740
4741/*
4742 * It checks each scheduling domain to see if it is due to be balanced,
4743 * and initiates a balancing operation if so.
4744 *
4745 * Balancing parameters are set up in arch_init_sched_domains.
4746 */
4747static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4748{
4749 int balance = 1;
4750 struct rq *rq = cpu_rq(cpu);
4751 unsigned long interval;
4752 struct sched_domain *sd;
4753 /* Earliest time when we have to do rebalance again */
4754 unsigned long next_balance = jiffies + 60*HZ;
4755 int update_next_balance = 0;
4756 int need_serialize;
4757
4758 for_each_domain(cpu, sd) {
4759 if (!(sd->flags & SD_LOAD_BALANCE))
4760 continue;
4761
4762 interval = sd->balance_interval;
4763 if (idle != CPU_IDLE)
4764 interval *= sd->busy_factor;
4765
4766 /* scale ms to jiffies */
4767 interval = msecs_to_jiffies(interval);
4768 if (unlikely(!interval))
4769 interval = 1;
4770 if (interval > HZ*NR_CPUS/10)
4771 interval = HZ*NR_CPUS/10;
4772
4773 need_serialize = sd->flags & SD_SERIALIZE;
4774
4775 if (need_serialize) {
4776 if (!spin_trylock(&balancing))
4777 goto out;
4778 }
4779
4780 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4781 if (load_balance(cpu, rq, sd, idle, &balance)) {
4782 /*
4783 * We've pulled tasks over so either we're no
4784 * longer idle, or one of our SMT siblings is
4785 * not idle.
4786 */
4787 idle = CPU_NOT_IDLE;
4788 }
4789 sd->last_balance = jiffies;
4790 }
4791 if (need_serialize)
4792 spin_unlock(&balancing);
4793out:
4794 if (time_after(next_balance, sd->last_balance + interval)) {
4795 next_balance = sd->last_balance + interval;
4796 update_next_balance = 1;
4797 }
4798
4799 /*
4800 * Stop the load balance at this level. There is another
4801 * CPU in our sched group which is doing load balancing more
4802 * actively.
4803 */
4804 if (!balance)
4805 break;
4806 }
4807
4808 /*
4809 * next_balance will be updated only when there is a need.
4810 * When the cpu is attached to null domain for ex, it will not be
4811 * updated.
4812 */
4813 if (likely(update_next_balance))
4814 rq->next_balance = next_balance;
4815}
4816
4817/*
4818 * run_rebalance_domains is triggered when needed from the scheduler tick.
4819 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4820 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4821 */
4822static void run_rebalance_domains(struct softirq_action *h)
4823{
4824 int this_cpu = smp_processor_id();
4825 struct rq *this_rq = cpu_rq(this_cpu);
4826 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4827 CPU_IDLE : CPU_NOT_IDLE;
4828
4829 rebalance_domains(this_cpu, idle);
4830
4831#ifdef CONFIG_NO_HZ
4832 /*
4833 * If this cpu is the owner for idle load balancing, then do the
4834 * balancing on behalf of the other idle cpus whose ticks are
4835 * stopped.
4836 */
4837 if (this_rq->idle_at_tick &&
4838 atomic_read(&nohz.load_balancer) == this_cpu) {
4839 struct rq *rq;
4840 int balance_cpu;
4841
4842 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4843 if (balance_cpu == this_cpu)
4844 continue;
4845
4846 /*
4847 * If this cpu gets work to do, stop the load balancing
4848 * work being done for other cpus. Next load
4849 * balancing owner will pick it up.
4850 */
4851 if (need_resched())
4852 break;
4853
4854 rebalance_domains(balance_cpu, CPU_IDLE);
4855
4856 rq = cpu_rq(balance_cpu);
4857 if (time_after(this_rq->next_balance, rq->next_balance))
4858 this_rq->next_balance = rq->next_balance;
4859 }
4860 }
4861#endif
4862}
4863
4864static inline int on_null_domain(int cpu)
4865{
4866 return !rcu_dereference(cpu_rq(cpu)->sd);
4867}
4868
4869/*
4870 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4871 *
4872 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4873 * idle load balancing owner or decide to stop the periodic load balancing,
4874 * if the whole system is idle.
4875 */
4876static inline void trigger_load_balance(struct rq *rq, int cpu)
4877{
4878#ifdef CONFIG_NO_HZ
4879 /*
4880 * If we were in the nohz mode recently and busy at the current
4881 * scheduler tick, then check if we need to nominate new idle
4882 * load balancer.
4883 */
4884 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4885 rq->in_nohz_recently = 0;
4886
4887 if (atomic_read(&nohz.load_balancer) == cpu) {
4888 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4889 atomic_set(&nohz.load_balancer, -1);
4890 }
4891
4892 if (atomic_read(&nohz.load_balancer) == -1) {
4893 int ilb = find_new_ilb(cpu);
4894
4895 if (ilb < nr_cpu_ids)
4896 resched_cpu(ilb);
4897 }
4898 }
4899
4900 /*
4901 * If this cpu is idle and doing idle load balancing for all the
4902 * cpus with ticks stopped, is it time for that to stop?
4903 */
4904 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4905 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4906 resched_cpu(cpu);
4907 return;
4908 }
4909
4910 /*
4911 * If this cpu is idle and the idle load balancing is done by
4912 * someone else, then no need raise the SCHED_SOFTIRQ
4913 */
4914 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4915 cpumask_test_cpu(cpu, nohz.cpu_mask))
4916 return;
4917#endif
4918 /* Don't need to rebalance while attached to NULL domain */
4919 if (time_after_eq(jiffies, rq->next_balance) &&
4920 likely(!on_null_domain(cpu)))
4921 raise_softirq(SCHED_SOFTIRQ);
4922}
4923
4924#else /* CONFIG_SMP */
4925
4926/*
4927 * on UP we do not need to balance between CPUs:
4928 */
4929static inline void idle_balance(int cpu, struct rq *rq)
4930{
4931}
4932
4933#endif 3164#endif
4934 3165
4935DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5278,13 +3509,13 @@ void scheduler_tick(void)
5278 3509
5279 sched_clock_tick(); 3510 sched_clock_tick();
5280 3511
5281 spin_lock(&rq->lock); 3512 raw_spin_lock(&rq->lock);
5282 update_rq_clock(rq); 3513 update_rq_clock(rq);
5283 update_cpu_load(rq); 3514 update_cpu_load(rq);
5284 curr->sched_class->task_tick(rq, curr, 0); 3515 curr->sched_class->task_tick(rq, curr, 0);
5285 spin_unlock(&rq->lock); 3516 raw_spin_unlock(&rq->lock);
5286 3517
5287 perf_event_task_tick(curr, cpu); 3518 perf_event_task_tick(curr);
5288 3519
5289#ifdef CONFIG_SMP 3520#ifdef CONFIG_SMP
5290 rq->idle_at_tick = idle_cpu(cpu); 3521 rq->idle_at_tick = idle_cpu(cpu);
@@ -5396,13 +3627,14 @@ static inline void schedule_debug(struct task_struct *prev)
5396#endif 3627#endif
5397} 3628}
5398 3629
5399static void put_prev_task(struct rq *rq, struct task_struct *p) 3630static void put_prev_task(struct rq *rq, struct task_struct *prev)
5400{ 3631{
5401 u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; 3632 if (prev->state == TASK_RUNNING) {
3633 u64 runtime = prev->se.sum_exec_runtime;
5402 3634
5403 update_avg(&p->se.avg_running, runtime); 3635 runtime -= prev->se.prev_sum_exec_runtime;
3636 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
5404 3637
5405 if (p->state == TASK_RUNNING) {
5406 /* 3638 /*
5407 * In order to avoid avg_overlap growing stale when we are 3639 * In order to avoid avg_overlap growing stale when we are
5408 * indeed overlapping and hence not getting put to sleep, grow 3640 * indeed overlapping and hence not getting put to sleep, grow
@@ -5412,12 +3644,9 @@ static void put_prev_task(struct rq *rq, struct task_struct *p)
5412 * correlates to the amount of cache footprint a task can 3644 * correlates to the amount of cache footprint a task can
5413 * build up. 3645 * build up.
5414 */ 3646 */
5415 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); 3647 update_avg(&prev->se.avg_overlap, runtime);
5416 update_avg(&p->se.avg_overlap, runtime);
5417 } else {
5418 update_avg(&p->se.avg_running, 0);
5419 } 3648 }
5420 p->sched_class->put_prev_task(rq, p); 3649 prev->sched_class->put_prev_task(rq, prev);
5421} 3650}
5422 3651
5423/* 3652/*
@@ -5478,7 +3707,7 @@ need_resched_nonpreemptible:
5478 if (sched_feat(HRTICK)) 3707 if (sched_feat(HRTICK))
5479 hrtick_clear(rq); 3708 hrtick_clear(rq);
5480 3709
5481 spin_lock_irq(&rq->lock); 3710 raw_spin_lock_irq(&rq->lock);
5482 update_rq_clock(rq); 3711 update_rq_clock(rq);
5483 clear_tsk_need_resched(prev); 3712 clear_tsk_need_resched(prev);
5484 3713
@@ -5500,7 +3729,7 @@ need_resched_nonpreemptible:
5500 3729
5501 if (likely(prev != next)) { 3730 if (likely(prev != next)) {
5502 sched_info_switch(prev, next); 3731 sched_info_switch(prev, next);
5503 perf_event_task_sched_out(prev, next, cpu); 3732 perf_event_task_sched_out(prev, next);
5504 3733
5505 rq->nr_switches++; 3734 rq->nr_switches++;
5506 rq->curr = next; 3735 rq->curr = next;
@@ -5514,12 +3743,15 @@ need_resched_nonpreemptible:
5514 cpu = smp_processor_id(); 3743 cpu = smp_processor_id();
5515 rq = cpu_rq(cpu); 3744 rq = cpu_rq(cpu);
5516 } else 3745 } else
5517 spin_unlock_irq(&rq->lock); 3746 raw_spin_unlock_irq(&rq->lock);
5518 3747
5519 post_schedule(rq); 3748 post_schedule(rq);
5520 3749
5521 if (unlikely(reacquire_kernel_lock(current) < 0)) 3750 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3751 prev = rq->curr;
3752 switch_count = &prev->nivcsw;
5522 goto need_resched_nonpreemptible; 3753 goto need_resched_nonpreemptible;
3754 }
5523 3755
5524 preempt_enable_no_resched(); 3756 preempt_enable_no_resched();
5525 if (need_resched()) 3757 if (need_resched())
@@ -5931,14 +4163,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
5931 */ 4163 */
5932bool try_wait_for_completion(struct completion *x) 4164bool try_wait_for_completion(struct completion *x)
5933{ 4165{
4166 unsigned long flags;
5934 int ret = 1; 4167 int ret = 1;
5935 4168
5936 spin_lock_irq(&x->wait.lock); 4169 spin_lock_irqsave(&x->wait.lock, flags);
5937 if (!x->done) 4170 if (!x->done)
5938 ret = 0; 4171 ret = 0;
5939 else 4172 else
5940 x->done--; 4173 x->done--;
5941 spin_unlock_irq(&x->wait.lock); 4174 spin_unlock_irqrestore(&x->wait.lock, flags);
5942 return ret; 4175 return ret;
5943} 4176}
5944EXPORT_SYMBOL(try_wait_for_completion); 4177EXPORT_SYMBOL(try_wait_for_completion);
@@ -5953,12 +4186,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
5953 */ 4186 */
5954bool completion_done(struct completion *x) 4187bool completion_done(struct completion *x)
5955{ 4188{
4189 unsigned long flags;
5956 int ret = 1; 4190 int ret = 1;
5957 4191
5958 spin_lock_irq(&x->wait.lock); 4192 spin_lock_irqsave(&x->wait.lock, flags);
5959 if (!x->done) 4193 if (!x->done)
5960 ret = 0; 4194 ret = 0;
5961 spin_unlock_irq(&x->wait.lock); 4195 spin_unlock_irqrestore(&x->wait.lock, flags);
5962 return ret; 4196 return ret;
5963} 4197}
5964EXPORT_SYMBOL(completion_done); 4198EXPORT_SYMBOL(completion_done);
@@ -6026,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6026 unsigned long flags; 4260 unsigned long flags;
6027 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6028 struct rq *rq; 4262 struct rq *rq;
6029 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6030 4264
6031 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6032 4266
@@ -6034,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6034 update_rq_clock(rq); 4268 update_rq_clock(rq);
6035 4269
6036 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6037 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6038 running = task_current(rq, p); 4273 running = task_current(rq, p);
6039 if (on_rq) 4274 if (on_rq)
@@ -6051,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6051 if (running) 4286 if (running)
6052 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6053 if (on_rq) { 4288 if (on_rq) {
6054 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6055 4290
6056 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6057 } 4292 }
@@ -6095,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6095 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6096 4331
6097 if (on_rq) { 4332 if (on_rq) {
6098 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6099 /* 4334 /*
6100 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6101 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6253,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6253{ 4488{
6254 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6255 unsigned long flags; 4490 unsigned long flags;
6256 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6257 struct rq *rq; 4492 struct rq *rq;
6258 int reset_on_fork; 4493 int reset_on_fork;
6259 4494
@@ -6343,7 +4578,7 @@ recheck:
6343 * make sure no PI-waiters arrive (or leave) while we are 4578 * make sure no PI-waiters arrive (or leave) while we are
6344 * changing the priority of the task: 4579 * changing the priority of the task:
6345 */ 4580 */
6346 spin_lock_irqsave(&p->pi_lock, flags); 4581 raw_spin_lock_irqsave(&p->pi_lock, flags);
6347 /* 4582 /*
6348 * To be able to change p->policy safely, the apropriate 4583 * To be able to change p->policy safely, the apropriate
6349 * runqueue lock must be held. 4584 * runqueue lock must be held.
@@ -6353,7 +4588,7 @@ recheck:
6353 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4588 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
6354 policy = oldpolicy = -1; 4589 policy = oldpolicy = -1;
6355 __task_rq_unlock(rq); 4590 __task_rq_unlock(rq);
6356 spin_unlock_irqrestore(&p->pi_lock, flags); 4591 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6357 goto recheck; 4592 goto recheck;
6358 } 4593 }
6359 update_rq_clock(rq); 4594 update_rq_clock(rq);
@@ -6367,6 +4602,7 @@ recheck:
6367 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6368 4603
6369 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6370 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6371 4607
6372 if (running) 4608 if (running)
@@ -6377,7 +4613,7 @@ recheck:
6377 check_class_changed(rq, p, prev_class, oldprio, running); 4613 check_class_changed(rq, p, prev_class, oldprio, running);
6378 } 4614 }
6379 __task_rq_unlock(rq); 4615 __task_rq_unlock(rq);
6380 spin_unlock_irqrestore(&p->pi_lock, flags); 4616 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6381 4617
6382 rt_mutex_adjust_pi(p); 4618 rt_mutex_adjust_pi(p);
6383 4619
@@ -6477,7 +4713,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6477 return -EINVAL; 4713 return -EINVAL;
6478 4714
6479 retval = -ESRCH; 4715 retval = -ESRCH;
6480 read_lock(&tasklist_lock); 4716 rcu_read_lock();
6481 p = find_process_by_pid(pid); 4717 p = find_process_by_pid(pid);
6482 if (p) { 4718 if (p) {
6483 retval = security_task_getscheduler(p); 4719 retval = security_task_getscheduler(p);
@@ -6485,7 +4721,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6485 retval = p->policy 4721 retval = p->policy
6486 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4722 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6487 } 4723 }
6488 read_unlock(&tasklist_lock); 4724 rcu_read_unlock();
6489 return retval; 4725 return retval;
6490} 4726}
6491 4727
@@ -6503,7 +4739,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6503 if (!param || pid < 0) 4739 if (!param || pid < 0)
6504 return -EINVAL; 4740 return -EINVAL;
6505 4741
6506 read_lock(&tasklist_lock); 4742 rcu_read_lock();
6507 p = find_process_by_pid(pid); 4743 p = find_process_by_pid(pid);
6508 retval = -ESRCH; 4744 retval = -ESRCH;
6509 if (!p) 4745 if (!p)
@@ -6514,7 +4750,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6514 goto out_unlock; 4750 goto out_unlock;
6515 4751
6516 lp.sched_priority = p->rt_priority; 4752 lp.sched_priority = p->rt_priority;
6517 read_unlock(&tasklist_lock); 4753 rcu_read_unlock();
6518 4754
6519 /* 4755 /*
6520 * This one might sleep, we cannot do it with a spinlock held ... 4756 * This one might sleep, we cannot do it with a spinlock held ...
@@ -6524,7 +4760,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6524 return retval; 4760 return retval;
6525 4761
6526out_unlock: 4762out_unlock:
6527 read_unlock(&tasklist_lock); 4763 rcu_read_unlock();
6528 return retval; 4764 return retval;
6529} 4765}
6530 4766
@@ -6535,22 +4771,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6535 int retval; 4771 int retval;
6536 4772
6537 get_online_cpus(); 4773 get_online_cpus();
6538 read_lock(&tasklist_lock); 4774 rcu_read_lock();
6539 4775
6540 p = find_process_by_pid(pid); 4776 p = find_process_by_pid(pid);
6541 if (!p) { 4777 if (!p) {
6542 read_unlock(&tasklist_lock); 4778 rcu_read_unlock();
6543 put_online_cpus(); 4779 put_online_cpus();
6544 return -ESRCH; 4780 return -ESRCH;
6545 } 4781 }
6546 4782
6547 /* 4783 /* Prevent p going away */
6548 * It is not safe to call set_cpus_allowed with the
6549 * tasklist_lock held. We will bump the task_struct's
6550 * usage count and then drop tasklist_lock.
6551 */
6552 get_task_struct(p); 4784 get_task_struct(p);
6553 read_unlock(&tasklist_lock); 4785 rcu_read_unlock();
6554 4786
6555 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4787 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6556 retval = -ENOMEM; 4788 retval = -ENOMEM;
@@ -6631,10 +4863,12 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6631long sched_getaffinity(pid_t pid, struct cpumask *mask) 4863long sched_getaffinity(pid_t pid, struct cpumask *mask)
6632{ 4864{
6633 struct task_struct *p; 4865 struct task_struct *p;
4866 unsigned long flags;
4867 struct rq *rq;
6634 int retval; 4868 int retval;
6635 4869
6636 get_online_cpus(); 4870 get_online_cpus();
6637 read_lock(&tasklist_lock); 4871 rcu_read_lock();
6638 4872
6639 retval = -ESRCH; 4873 retval = -ESRCH;
6640 p = find_process_by_pid(pid); 4874 p = find_process_by_pid(pid);
@@ -6645,10 +4879,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6645 if (retval) 4879 if (retval)
6646 goto out_unlock; 4880 goto out_unlock;
6647 4881
4882 rq = task_rq_lock(p, &flags);
6648 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 4883 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
4884 task_rq_unlock(rq, &flags);
6649 4885
6650out_unlock: 4886out_unlock:
6651 read_unlock(&tasklist_lock); 4887 rcu_read_unlock();
6652 put_online_cpus(); 4888 put_online_cpus();
6653 4889
6654 return retval; 4890 return retval;
@@ -6703,7 +4939,7 @@ SYSCALL_DEFINE0(sched_yield)
6703 */ 4939 */
6704 __release(rq->lock); 4940 __release(rq->lock);
6705 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 4941 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
6706 _raw_spin_unlock(&rq->lock); 4942 do_raw_spin_unlock(&rq->lock);
6707 preempt_enable_no_resched(); 4943 preempt_enable_no_resched();
6708 4944
6709 schedule(); 4945 schedule();
@@ -6883,6 +5119,8 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6883{ 5119{
6884 struct task_struct *p; 5120 struct task_struct *p;
6885 unsigned int time_slice; 5121 unsigned int time_slice;
5122 unsigned long flags;
5123 struct rq *rq;
6886 int retval; 5124 int retval;
6887 struct timespec t; 5125 struct timespec t;
6888 5126
@@ -6890,7 +5128,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6890 return -EINVAL; 5128 return -EINVAL;
6891 5129
6892 retval = -ESRCH; 5130 retval = -ESRCH;
6893 read_lock(&tasklist_lock); 5131 rcu_read_lock();
6894 p = find_process_by_pid(pid); 5132 p = find_process_by_pid(pid);
6895 if (!p) 5133 if (!p)
6896 goto out_unlock; 5134 goto out_unlock;
@@ -6899,15 +5137,17 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6899 if (retval) 5137 if (retval)
6900 goto out_unlock; 5138 goto out_unlock;
6901 5139
6902 time_slice = p->sched_class->get_rr_interval(p); 5140 rq = task_rq_lock(p, &flags);
5141 time_slice = p->sched_class->get_rr_interval(rq, p);
5142 task_rq_unlock(rq, &flags);
6903 5143
6904 read_unlock(&tasklist_lock); 5144 rcu_read_unlock();
6905 jiffies_to_timespec(time_slice, &t); 5145 jiffies_to_timespec(time_slice, &t);
6906 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5146 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6907 return retval; 5147 return retval;
6908 5148
6909out_unlock: 5149out_unlock:
6910 read_unlock(&tasklist_lock); 5150 rcu_read_unlock();
6911 return retval; 5151 return retval;
6912} 5152}
6913 5153
@@ -6995,12 +5235,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6995 struct rq *rq = cpu_rq(cpu); 5235 struct rq *rq = cpu_rq(cpu);
6996 unsigned long flags; 5236 unsigned long flags;
6997 5237
6998 spin_lock_irqsave(&rq->lock, flags); 5238 raw_spin_lock_irqsave(&rq->lock, flags);
6999 5239
7000 __sched_fork(idle); 5240 __sched_fork(idle);
5241 idle->state = TASK_RUNNING;
7001 idle->se.exec_start = sched_clock(); 5242 idle->se.exec_start = sched_clock();
7002 5243
7003 idle->prio = idle->normal_prio = MAX_PRIO;
7004 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5244 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
7005 __set_task_cpu(idle, cpu); 5245 __set_task_cpu(idle, cpu);
7006 5246
@@ -7008,7 +5248,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
7008#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5248#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
7009 idle->oncpu = 1; 5249 idle->oncpu = 1;
7010#endif 5250#endif
7011 spin_unlock_irqrestore(&rq->lock, flags); 5251 raw_spin_unlock_irqrestore(&rq->lock, flags);
7012 5252
7013 /* Set the preempt count _outside_ the spinlocks! */ 5253 /* Set the preempt count _outside_ the spinlocks! */
7014#if defined(CONFIG_PREEMPT) 5254#if defined(CONFIG_PREEMPT)
@@ -7041,22 +5281,43 @@ cpumask_var_t nohz_cpu_mask;
7041 * 5281 *
7042 * This idea comes from the SD scheduler of Con Kolivas: 5282 * This idea comes from the SD scheduler of Con Kolivas:
7043 */ 5283 */
7044static inline void sched_init_granularity(void) 5284static int get_update_sysctl_factor(void)
7045{ 5285{
7046 unsigned int factor = 1 + ilog2(num_online_cpus()); 5286 unsigned int cpus = min_t(int, num_online_cpus(), 8);
7047 const unsigned long limit = 200000000; 5287 unsigned int factor;
7048 5288
7049 sysctl_sched_min_granularity *= factor; 5289 switch (sysctl_sched_tunable_scaling) {
7050 if (sysctl_sched_min_granularity > limit) 5290 case SCHED_TUNABLESCALING_NONE:
7051 sysctl_sched_min_granularity = limit; 5291 factor = 1;
5292 break;
5293 case SCHED_TUNABLESCALING_LINEAR:
5294 factor = cpus;
5295 break;
5296 case SCHED_TUNABLESCALING_LOG:
5297 default:
5298 factor = 1 + ilog2(cpus);
5299 break;
5300 }
7052 5301
7053 sysctl_sched_latency *= factor; 5302 return factor;
7054 if (sysctl_sched_latency > limit) 5303}
7055 sysctl_sched_latency = limit;
7056 5304
7057 sysctl_sched_wakeup_granularity *= factor; 5305static void update_sysctl(void)
5306{
5307 unsigned int factor = get_update_sysctl_factor();
7058 5308
7059 sysctl_sched_shares_ratelimit *= factor; 5309#define SET_SYSCTL(name) \
5310 (sysctl_##name = (factor) * normalized_sysctl_##name)
5311 SET_SYSCTL(sched_min_granularity);
5312 SET_SYSCTL(sched_latency);
5313 SET_SYSCTL(sched_wakeup_granularity);
5314 SET_SYSCTL(sched_shares_ratelimit);
5315#undef SET_SYSCTL
5316}
5317
5318static inline void sched_init_granularity(void)
5319{
5320 update_sysctl();
7060} 5321}
7061 5322
7062#ifdef CONFIG_SMP 5323#ifdef CONFIG_SMP
@@ -7093,7 +5354,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7093 int ret = 0; 5354 int ret = 0;
7094 5355
7095 rq = task_rq_lock(p, &flags); 5356 rq = task_rq_lock(p, &flags);
7096 if (!cpumask_intersects(new_mask, cpu_online_mask)) { 5357
5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7097 ret = -EINVAL; 5359 ret = -EINVAL;
7098 goto out; 5360 goto out;
7099 } 5361 }
@@ -7115,7 +5377,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7115 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5377 if (cpumask_test_cpu(task_cpu(p), new_mask))
7116 goto out; 5378 goto out;
7117 5379
7118 if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { 5380 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) {
7119 /* Need help from migration thread: drop lock and wait. */ 5381 /* Need help from migration thread: drop lock and wait. */
7120 struct task_struct *mt = rq->migration_thread; 5382 struct task_struct *mt = rq->migration_thread;
7121 5383
@@ -7148,7 +5410,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7148static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 5410static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7149{ 5411{
7150 struct rq *rq_dest, *rq_src; 5412 struct rq *rq_dest, *rq_src;
7151 int ret = 0, on_rq; 5413 int ret = 0;
7152 5414
7153 if (unlikely(!cpu_active(dest_cpu))) 5415 if (unlikely(!cpu_active(dest_cpu)))
7154 return ret; 5416 return ret;
@@ -7164,12 +5426,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7164 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 5426 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7165 goto fail; 5427 goto fail;
7166 5428
7167 on_rq = p->se.on_rq; 5429 /*
7168 if (on_rq) 5430 * If we're not on a rq, the next wake-up will ensure we're
5431 * placed properly.
5432 */
5433 if (p->se.on_rq) {
7169 deactivate_task(rq_src, p, 0); 5434 deactivate_task(rq_src, p, 0);
7170 5435 set_task_cpu(p, dest_cpu);
7171 set_task_cpu(p, dest_cpu);
7172 if (on_rq) {
7173 activate_task(rq_dest, p, 0); 5436 activate_task(rq_dest, p, 0);
7174 check_preempt_curr(rq_dest, p, 0); 5437 check_preempt_curr(rq_dest, p, 0);
7175 } 5438 }
@@ -7204,10 +5467,10 @@ static int migration_thread(void *data)
7204 struct migration_req *req; 5467 struct migration_req *req;
7205 struct list_head *head; 5468 struct list_head *head;
7206 5469
7207 spin_lock_irq(&rq->lock); 5470 raw_spin_lock_irq(&rq->lock);
7208 5471
7209 if (cpu_is_offline(cpu)) { 5472 if (cpu_is_offline(cpu)) {
7210 spin_unlock_irq(&rq->lock); 5473 raw_spin_unlock_irq(&rq->lock);
7211 break; 5474 break;
7212 } 5475 }
7213 5476
@@ -7219,7 +5482,7 @@ static int migration_thread(void *data)
7219 head = &rq->migration_queue; 5482 head = &rq->migration_queue;
7220 5483
7221 if (list_empty(head)) { 5484 if (list_empty(head)) {
7222 spin_unlock_irq(&rq->lock); 5485 raw_spin_unlock_irq(&rq->lock);
7223 schedule(); 5486 schedule();
7224 set_current_state(TASK_INTERRUPTIBLE); 5487 set_current_state(TASK_INTERRUPTIBLE);
7225 continue; 5488 continue;
@@ -7228,14 +5491,14 @@ static int migration_thread(void *data)
7228 list_del_init(head->next); 5491 list_del_init(head->next);
7229 5492
7230 if (req->task != NULL) { 5493 if (req->task != NULL) {
7231 spin_unlock(&rq->lock); 5494 raw_spin_unlock(&rq->lock);
7232 __migrate_task(req->task, cpu, req->dest_cpu); 5495 __migrate_task(req->task, cpu, req->dest_cpu);
7233 } else if (likely(cpu == (badcpu = smp_processor_id()))) { 5496 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
7234 req->dest_cpu = RCU_MIGRATION_GOT_QS; 5497 req->dest_cpu = RCU_MIGRATION_GOT_QS;
7235 spin_unlock(&rq->lock); 5498 raw_spin_unlock(&rq->lock);
7236 } else { 5499 } else {
7237 req->dest_cpu = RCU_MIGRATION_MUST_SYNC; 5500 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
7238 spin_unlock(&rq->lock); 5501 raw_spin_unlock(&rq->lock);
7239 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu); 5502 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
7240 } 5503 }
7241 local_irq_enable(); 5504 local_irq_enable();
@@ -7265,37 +5528,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7265static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5528static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7266{ 5529{
7267 int dest_cpu; 5530 int dest_cpu;
7268 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7269 5531
7270again: 5532again:
7271 /* Look for allowed, online CPU in same node. */ 5533 dest_cpu = select_fallback_rq(dead_cpu, p);
7272 for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
7273 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7274 goto move;
7275
7276 /* Any allowed, online CPU? */
7277 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
7278 if (dest_cpu < nr_cpu_ids)
7279 goto move;
7280
7281 /* No more Mr. Nice Guy. */
7282 if (dest_cpu >= nr_cpu_ids) {
7283 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7284 dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
7285
7286 /*
7287 * Don't tell them about moving exiting tasks or
7288 * kernel threads (both mm NULL), since they never
7289 * leave kernel.
7290 */
7291 if (p->mm && printk_ratelimit()) {
7292 printk(KERN_INFO "process %d (%s) no "
7293 "longer affine to cpu%d\n",
7294 task_pid_nr(p), p->comm, dead_cpu);
7295 }
7296 }
7297 5534
7298move:
7299 /* It can have affinity changed while we were choosing. */ 5535 /* It can have affinity changed while we were choosing. */
7300 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5536 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7301 goto again; 5537 goto again;
@@ -7310,7 +5546,7 @@ move:
7310 */ 5546 */
7311static void migrate_nr_uninterruptible(struct rq *rq_src) 5547static void migrate_nr_uninterruptible(struct rq *rq_src)
7312{ 5548{
7313 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask)); 5549 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
7314 unsigned long flags; 5550 unsigned long flags;
7315 5551
7316 local_irq_save(flags); 5552 local_irq_save(flags);
@@ -7358,14 +5594,14 @@ void sched_idle_next(void)
7358 * Strictly not necessary since rest of the CPUs are stopped by now 5594 * Strictly not necessary since rest of the CPUs are stopped by now
7359 * and interrupts disabled on the current cpu. 5595 * and interrupts disabled on the current cpu.
7360 */ 5596 */
7361 spin_lock_irqsave(&rq->lock, flags); 5597 raw_spin_lock_irqsave(&rq->lock, flags);
7362 5598
7363 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5599 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
7364 5600
7365 update_rq_clock(rq); 5601 update_rq_clock(rq);
7366 activate_task(rq, p, 0); 5602 activate_task(rq, p, 0);
7367 5603
7368 spin_unlock_irqrestore(&rq->lock, flags); 5604 raw_spin_unlock_irqrestore(&rq->lock, flags);
7369} 5605}
7370 5606
7371/* 5607/*
@@ -7401,9 +5637,9 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
7401 * that's OK. No task can be added to this CPU, so iteration is 5637 * that's OK. No task can be added to this CPU, so iteration is
7402 * fine. 5638 * fine.
7403 */ 5639 */
7404 spin_unlock_irq(&rq->lock); 5640 raw_spin_unlock_irq(&rq->lock);
7405 move_task_off_dead_cpu(dead_cpu, p); 5641 move_task_off_dead_cpu(dead_cpu, p);
7406 spin_lock_irq(&rq->lock); 5642 raw_spin_lock_irq(&rq->lock);
7407 5643
7408 put_task_struct(p); 5644 put_task_struct(p);
7409} 5645}
@@ -7563,7 +5799,7 @@ static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
7563static struct ctl_table_header *sd_sysctl_header; 5799static struct ctl_table_header *sd_sysctl_header;
7564static void register_sched_domain_sysctl(void) 5800static void register_sched_domain_sysctl(void)
7565{ 5801{
7566 int i, cpu_num = num_online_cpus(); 5802 int i, cpu_num = num_possible_cpus();
7567 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); 5803 struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
7568 char buf[32]; 5804 char buf[32];
7569 5805
@@ -7573,7 +5809,7 @@ static void register_sched_domain_sysctl(void)
7573 if (entry == NULL) 5809 if (entry == NULL)
7574 return; 5810 return;
7575 5811
7576 for_each_online_cpu(i) { 5812 for_each_possible_cpu(i) {
7577 snprintf(buf, 32, "cpu%d", i); 5813 snprintf(buf, 32, "cpu%d", i);
7578 entry->procname = kstrdup(buf, GFP_KERNEL); 5814 entry->procname = kstrdup(buf, GFP_KERNEL);
7579 entry->mode = 0555; 5815 entry->mode = 0555;
@@ -7669,13 +5905,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7669 5905
7670 /* Update our root-domain */ 5906 /* Update our root-domain */
7671 rq = cpu_rq(cpu); 5907 rq = cpu_rq(cpu);
7672 spin_lock_irqsave(&rq->lock, flags); 5908 raw_spin_lock_irqsave(&rq->lock, flags);
7673 if (rq->rd) { 5909 if (rq->rd) {
7674 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5910 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7675 5911
7676 set_rq_online(rq); 5912 set_rq_online(rq);
7677 } 5913 }
7678 spin_unlock_irqrestore(&rq->lock, flags); 5914 raw_spin_unlock_irqrestore(&rq->lock, flags);
7679 break; 5915 break;
7680 5916
7681#ifdef CONFIG_HOTPLUG_CPU 5917#ifdef CONFIG_HOTPLUG_CPU
@@ -7700,14 +5936,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7700 put_task_struct(rq->migration_thread); 5936 put_task_struct(rq->migration_thread);
7701 rq->migration_thread = NULL; 5937 rq->migration_thread = NULL;
7702 /* Idle task back to normal (off runqueue, low prio) */ 5938 /* Idle task back to normal (off runqueue, low prio) */
7703 spin_lock_irq(&rq->lock); 5939 raw_spin_lock_irq(&rq->lock);
7704 update_rq_clock(rq); 5940 update_rq_clock(rq);
7705 deactivate_task(rq, rq->idle, 0); 5941 deactivate_task(rq, rq->idle, 0);
7706 rq->idle->static_prio = MAX_PRIO;
7707 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5942 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
7708 rq->idle->sched_class = &idle_sched_class; 5943 rq->idle->sched_class = &idle_sched_class;
7709 migrate_dead_tasks(cpu); 5944 migrate_dead_tasks(cpu);
7710 spin_unlock_irq(&rq->lock); 5945 raw_spin_unlock_irq(&rq->lock);
7711 cpuset_unlock(); 5946 cpuset_unlock();
7712 migrate_nr_uninterruptible(rq); 5947 migrate_nr_uninterruptible(rq);
7713 BUG_ON(rq->nr_running != 0); 5948 BUG_ON(rq->nr_running != 0);
@@ -7717,30 +5952,30 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
7717 * they didn't take sched_hotcpu_mutex. Just wake up 5952 * they didn't take sched_hotcpu_mutex. Just wake up
7718 * the requestors. 5953 * the requestors.
7719 */ 5954 */
7720 spin_lock_irq(&rq->lock); 5955 raw_spin_lock_irq(&rq->lock);
7721 while (!list_empty(&rq->migration_queue)) { 5956 while (!list_empty(&rq->migration_queue)) {
7722 struct migration_req *req; 5957 struct migration_req *req;
7723 5958
7724 req = list_entry(rq->migration_queue.next, 5959 req = list_entry(rq->migration_queue.next,
7725 struct migration_req, list); 5960 struct migration_req, list);
7726 list_del_init(&req->list); 5961 list_del_init(&req->list);
7727 spin_unlock_irq(&rq->lock); 5962 raw_spin_unlock_irq(&rq->lock);
7728 complete(&req->done); 5963 complete(&req->done);
7729 spin_lock_irq(&rq->lock); 5964 raw_spin_lock_irq(&rq->lock);
7730 } 5965 }
7731 spin_unlock_irq(&rq->lock); 5966 raw_spin_unlock_irq(&rq->lock);
7732 break; 5967 break;
7733 5968
7734 case CPU_DYING: 5969 case CPU_DYING:
7735 case CPU_DYING_FROZEN: 5970 case CPU_DYING_FROZEN:
7736 /* Update our root-domain */ 5971 /* Update our root-domain */
7737 rq = cpu_rq(cpu); 5972 rq = cpu_rq(cpu);
7738 spin_lock_irqsave(&rq->lock, flags); 5973 raw_spin_lock_irqsave(&rq->lock, flags);
7739 if (rq->rd) { 5974 if (rq->rd) {
7740 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5975 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7741 set_rq_offline(rq); 5976 set_rq_offline(rq);
7742 } 5977 }
7743 spin_unlock_irqrestore(&rq->lock, flags); 5978 raw_spin_unlock_irqrestore(&rq->lock, flags);
7744 break; 5979 break;
7745#endif 5980#endif
7746 } 5981 }
@@ -7970,7 +6205,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7970 struct root_domain *old_rd = NULL; 6205 struct root_domain *old_rd = NULL;
7971 unsigned long flags; 6206 unsigned long flags;
7972 6207
7973 spin_lock_irqsave(&rq->lock, flags); 6208 raw_spin_lock_irqsave(&rq->lock, flags);
7974 6209
7975 if (rq->rd) { 6210 if (rq->rd) {
7976 old_rd = rq->rd; 6211 old_rd = rq->rd;
@@ -7996,7 +6231,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7996 if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) 6231 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
7997 set_rq_online(rq); 6232 set_rq_online(rq);
7998 6233
7999 spin_unlock_irqrestore(&rq->lock, flags); 6234 raw_spin_unlock_irqrestore(&rq->lock, flags);
8000 6235
8001 if (old_rd) 6236 if (old_rd)
8002 free_rootdomain(old_rd); 6237 free_rootdomain(old_rd);
@@ -8282,14 +6517,14 @@ enum s_alloc {
8282 */ 6517 */
8283#ifdef CONFIG_SCHED_SMT 6518#ifdef CONFIG_SCHED_SMT
8284static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains); 6519static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
8285static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus); 6520static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
8286 6521
8287static int 6522static int
8288cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6523cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
8289 struct sched_group **sg, struct cpumask *unused) 6524 struct sched_group **sg, struct cpumask *unused)
8290{ 6525{
8291 if (sg) 6526 if (sg)
8292 *sg = &per_cpu(sched_group_cpus, cpu).sg; 6527 *sg = &per_cpu(sched_groups, cpu).sg;
8293 return cpu; 6528 return cpu;
8294} 6529}
8295#endif /* CONFIG_SCHED_SMT */ 6530#endif /* CONFIG_SCHED_SMT */
@@ -9099,7 +7334,7 @@ match1:
9099 if (doms_new == NULL) { 7334 if (doms_new == NULL) {
9100 ndoms_cur = 0; 7335 ndoms_cur = 0;
9101 doms_new = &fallback_doms; 7336 doms_new = &fallback_doms;
9102 cpumask_andnot(doms_new[0], cpu_online_mask, cpu_isolated_map); 7337 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
9103 WARN_ON_ONCE(dattr_new); 7338 WARN_ON_ONCE(dattr_new);
9104 } 7339 }
9105 7340
@@ -9230,8 +7465,10 @@ static int update_sched_domains(struct notifier_block *nfb,
9230 switch (action) { 7465 switch (action) {
9231 case CPU_ONLINE: 7466 case CPU_ONLINE:
9232 case CPU_ONLINE_FROZEN: 7467 case CPU_ONLINE_FROZEN:
9233 case CPU_DEAD: 7468 case CPU_DOWN_PREPARE:
9234 case CPU_DEAD_FROZEN: 7469 case CPU_DOWN_PREPARE_FROZEN:
7470 case CPU_DOWN_FAILED:
7471 case CPU_DOWN_FAILED_FROZEN:
9235 partition_sched_domains(1, NULL, NULL); 7472 partition_sched_domains(1, NULL, NULL);
9236 return NOTIFY_OK; 7473 return NOTIFY_OK;
9237 7474
@@ -9278,7 +7515,7 @@ void __init sched_init_smp(void)
9278#endif 7515#endif
9279 get_online_cpus(); 7516 get_online_cpus();
9280 mutex_lock(&sched_domains_mutex); 7517 mutex_lock(&sched_domains_mutex);
9281 arch_init_sched_domains(cpu_online_mask); 7518 arch_init_sched_domains(cpu_active_mask);
9282 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7519 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
9283 if (cpumask_empty(non_isolated_cpus)) 7520 if (cpumask_empty(non_isolated_cpus))
9284 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7521 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -9351,13 +7588,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
9351#ifdef CONFIG_SMP 7588#ifdef CONFIG_SMP
9352 rt_rq->rt_nr_migratory = 0; 7589 rt_rq->rt_nr_migratory = 0;
9353 rt_rq->overloaded = 0; 7590 rt_rq->overloaded = 0;
9354 plist_head_init(&rt_rq->pushable_tasks, &rq->lock); 7591 plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
9355#endif 7592#endif
9356 7593
9357 rt_rq->rt_time = 0; 7594 rt_rq->rt_time = 0;
9358 rt_rq->rt_throttled = 0; 7595 rt_rq->rt_throttled = 0;
9359 rt_rq->rt_runtime = 0; 7596 rt_rq->rt_runtime = 0;
9360 spin_lock_init(&rt_rq->rt_runtime_lock); 7597 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
9361 7598
9362#ifdef CONFIG_RT_GROUP_SCHED 7599#ifdef CONFIG_RT_GROUP_SCHED
9363 rt_rq->rt_nr_boosted = 0; 7600 rt_rq->rt_nr_boosted = 0;
@@ -9404,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9404 tg->rt_rq[cpu] = rt_rq; 7641 tg->rt_rq[cpu] = rt_rq;
9405 init_rt_rq(rt_rq, rq); 7642 init_rt_rq(rt_rq, rq);
9406 rt_rq->tg = tg; 7643 rt_rq->tg = tg;
9407 rt_rq->rt_se = rt_se;
9408 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7644 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9409 if (add) 7645 if (add)
9410 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7646 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9435,9 +7671,6 @@ void __init sched_init(void)
9435#ifdef CONFIG_RT_GROUP_SCHED 7671#ifdef CONFIG_RT_GROUP_SCHED
9436 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7672 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9437#endif 7673#endif
9438#ifdef CONFIG_USER_SCHED
9439 alloc_size *= 2;
9440#endif
9441#ifdef CONFIG_CPUMASK_OFFSTACK 7674#ifdef CONFIG_CPUMASK_OFFSTACK
9442 alloc_size += num_possible_cpus() * cpumask_size(); 7675 alloc_size += num_possible_cpus() * cpumask_size();
9443#endif 7676#endif
@@ -9451,13 +7684,6 @@ void __init sched_init(void)
9451 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7684 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9452 ptr += nr_cpu_ids * sizeof(void **); 7685 ptr += nr_cpu_ids * sizeof(void **);
9453 7686
9454#ifdef CONFIG_USER_SCHED
9455 root_task_group.se = (struct sched_entity **)ptr;
9456 ptr += nr_cpu_ids * sizeof(void **);
9457
9458 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9459 ptr += nr_cpu_ids * sizeof(void **);
9460#endif /* CONFIG_USER_SCHED */
9461#endif /* CONFIG_FAIR_GROUP_SCHED */ 7687#endif /* CONFIG_FAIR_GROUP_SCHED */
9462#ifdef CONFIG_RT_GROUP_SCHED 7688#ifdef CONFIG_RT_GROUP_SCHED
9463 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7689 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9466,13 +7692,6 @@ void __init sched_init(void)
9466 init_task_group.rt_rq = (struct rt_rq **)ptr; 7692 init_task_group.rt_rq = (struct rt_rq **)ptr;
9467 ptr += nr_cpu_ids * sizeof(void **); 7693 ptr += nr_cpu_ids * sizeof(void **);
9468 7694
9469#ifdef CONFIG_USER_SCHED
9470 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9471 ptr += nr_cpu_ids * sizeof(void **);
9472
9473 root_task_group.rt_rq = (struct rt_rq **)ptr;
9474 ptr += nr_cpu_ids * sizeof(void **);
9475#endif /* CONFIG_USER_SCHED */
9476#endif /* CONFIG_RT_GROUP_SCHED */ 7695#endif /* CONFIG_RT_GROUP_SCHED */
9477#ifdef CONFIG_CPUMASK_OFFSTACK 7696#ifdef CONFIG_CPUMASK_OFFSTACK
9478 for_each_possible_cpu(i) { 7697 for_each_possible_cpu(i) {
@@ -9492,22 +7711,13 @@ void __init sched_init(void)
9492#ifdef CONFIG_RT_GROUP_SCHED 7711#ifdef CONFIG_RT_GROUP_SCHED
9493 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7712 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9494 global_rt_period(), global_rt_runtime()); 7713 global_rt_period(), global_rt_runtime());
9495#ifdef CONFIG_USER_SCHED
9496 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9497 global_rt_period(), RUNTIME_INF);
9498#endif /* CONFIG_USER_SCHED */
9499#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9500 7715
9501#ifdef CONFIG_GROUP_SCHED 7716#ifdef CONFIG_CGROUP_SCHED
9502 list_add(&init_task_group.list, &task_groups); 7717 list_add(&init_task_group.list, &task_groups);
9503 INIT_LIST_HEAD(&init_task_group.children); 7718 INIT_LIST_HEAD(&init_task_group.children);
9504 7719
9505#ifdef CONFIG_USER_SCHED 7720#endif /* CONFIG_CGROUP_SCHED */
9506 INIT_LIST_HEAD(&root_task_group.children);
9507 init_task_group.parent = &root_task_group;
9508 list_add(&init_task_group.siblings, &root_task_group.children);
9509#endif /* CONFIG_USER_SCHED */
9510#endif /* CONFIG_GROUP_SCHED */
9511 7721
9512#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7722#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9513 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7723 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9517,7 +7727,7 @@ void __init sched_init(void)
9517 struct rq *rq; 7727 struct rq *rq;
9518 7728
9519 rq = cpu_rq(i); 7729 rq = cpu_rq(i);
9520 spin_lock_init(&rq->lock); 7730 raw_spin_lock_init(&rq->lock);
9521 rq->nr_running = 0; 7731 rq->nr_running = 0;
9522 rq->calc_load_active = 0; 7732 rq->calc_load_active = 0;
9523 rq->calc_load_update = jiffies + LOAD_FREQ; 7733 rq->calc_load_update = jiffies + LOAD_FREQ;
@@ -9547,25 +7757,6 @@ void __init sched_init(void)
9547 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7757 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9548 */ 7758 */
9549 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7759 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9550#elif defined CONFIG_USER_SCHED
9551 root_task_group.shares = NICE_0_LOAD;
9552 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9553 /*
9554 * In case of task-groups formed thr' the user id of tasks,
9555 * init_task_group represents tasks belonging to root user.
9556 * Hence it forms a sibling of all subsequent groups formed.
9557 * In this case, init_task_group gets only a fraction of overall
9558 * system cpu resource, based on the weight assigned to root
9559 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9560 * by letting tasks of init_task_group sit in a separate cfs_rq
9561 * (init_tg_cfs_rq) and having one entity represent this group of
9562 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9563 */
9564 init_tg_cfs_entry(&init_task_group,
9565 &per_cpu(init_tg_cfs_rq, i),
9566 &per_cpu(init_sched_entity, i), i, 1,
9567 root_task_group.se[i]);
9568
9569#endif 7760#endif
9570#endif /* CONFIG_FAIR_GROUP_SCHED */ 7761#endif /* CONFIG_FAIR_GROUP_SCHED */
9571 7762
@@ -9574,12 +7765,6 @@ void __init sched_init(void)
9574 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7765 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9575#ifdef CONFIG_CGROUP_SCHED 7766#ifdef CONFIG_CGROUP_SCHED
9576 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7767 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9577#elif defined CONFIG_USER_SCHED
9578 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9579 init_tg_rt_entry(&init_task_group,
9580 &per_cpu(init_rt_rq, i),
9581 &per_cpu(init_sched_rt_entity, i), i, 1,
9582 root_task_group.rt_se[i]);
9583#endif 7768#endif
9584#endif 7769#endif
9585 7770
@@ -9615,7 +7800,7 @@ void __init sched_init(void)
9615#endif 7800#endif
9616 7801
9617#ifdef CONFIG_RT_MUTEXES 7802#ifdef CONFIG_RT_MUTEXES
9618 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); 7803 plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
9619#endif 7804#endif
9620 7805
9621 /* 7806 /*
@@ -9659,12 +7844,12 @@ void __init sched_init(void)
9659#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 7844#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9660static inline int preempt_count_equals(int preempt_offset) 7845static inline int preempt_count_equals(int preempt_offset)
9661{ 7846{
9662 int nested = preempt_count() & ~PREEMPT_ACTIVE; 7847 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
9663 7848
9664 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7849 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9665} 7850}
9666 7851
9667void __might_sleep(char *file, int line, int preempt_offset) 7852void __might_sleep(const char *file, int line, int preempt_offset)
9668{ 7853{
9669#ifdef in_atomic 7854#ifdef in_atomic
9670 static unsigned long prev_jiffy; /* ratelimiting */ 7855 static unsigned long prev_jiffy; /* ratelimiting */
@@ -9740,13 +7925,13 @@ void normalize_rt_tasks(void)
9740 continue; 7925 continue;
9741 } 7926 }
9742 7927
9743 spin_lock(&p->pi_lock); 7928 raw_spin_lock(&p->pi_lock);
9744 rq = __task_rq_lock(p); 7929 rq = __task_rq_lock(p);
9745 7930
9746 normalize_task(rq, p); 7931 normalize_task(rq, p);
9747 7932
9748 __task_rq_unlock(rq); 7933 __task_rq_unlock(rq);
9749 spin_unlock(&p->pi_lock); 7934 raw_spin_unlock(&p->pi_lock);
9750 } while_each_thread(g, p); 7935 } while_each_thread(g, p);
9751 7936
9752 read_unlock_irqrestore(&tasklist_lock, flags); 7937 read_unlock_irqrestore(&tasklist_lock, flags);
@@ -9842,13 +8027,15 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9842 se = kzalloc_node(sizeof(struct sched_entity), 8027 se = kzalloc_node(sizeof(struct sched_entity),
9843 GFP_KERNEL, cpu_to_node(i)); 8028 GFP_KERNEL, cpu_to_node(i));
9844 if (!se) 8029 if (!se)
9845 goto err; 8030 goto err_free_rq;
9846 8031
9847 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8032 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
9848 } 8033 }
9849 8034
9850 return 1; 8035 return 1;
9851 8036
8037 err_free_rq:
8038 kfree(cfs_rq);
9852 err: 8039 err:
9853 return 0; 8040 return 0;
9854} 8041}
@@ -9930,13 +8117,15 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
9930 rt_se = kzalloc_node(sizeof(struct sched_rt_entity), 8117 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
9931 GFP_KERNEL, cpu_to_node(i)); 8118 GFP_KERNEL, cpu_to_node(i));
9932 if (!rt_se) 8119 if (!rt_se)
9933 goto err; 8120 goto err_free_rq;
9934 8121
9935 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8122 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
9936 } 8123 }
9937 8124
9938 return 1; 8125 return 1;
9939 8126
8127 err_free_rq:
8128 kfree(rt_rq);
9940 err: 8129 err:
9941 return 0; 8130 return 0;
9942} 8131}
@@ -9971,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9971} 8160}
9972#endif /* CONFIG_RT_GROUP_SCHED */ 8161#endif /* CONFIG_RT_GROUP_SCHED */
9973 8162
9974#ifdef CONFIG_GROUP_SCHED 8163#ifdef CONFIG_CGROUP_SCHED
9975static void free_sched_group(struct task_group *tg) 8164static void free_sched_group(struct task_group *tg)
9976{ 8165{
9977 free_fair_sched_group(tg); 8166 free_fair_sched_group(tg);
@@ -10070,17 +8259,17 @@ void sched_move_task(struct task_struct *tsk)
10070 8259
10071#ifdef CONFIG_FAIR_GROUP_SCHED 8260#ifdef CONFIG_FAIR_GROUP_SCHED
10072 if (tsk->sched_class->moved_group) 8261 if (tsk->sched_class->moved_group)
10073 tsk->sched_class->moved_group(tsk); 8262 tsk->sched_class->moved_group(tsk, on_rq);
10074#endif 8263#endif
10075 8264
10076 if (unlikely(running)) 8265 if (unlikely(running))
10077 tsk->sched_class->set_curr_task(rq); 8266 tsk->sched_class->set_curr_task(rq);
10078 if (on_rq) 8267 if (on_rq)
10079 enqueue_task(rq, tsk, 0); 8268 enqueue_task(rq, tsk, 0, false);
10080 8269
10081 task_rq_unlock(rq, &flags); 8270 task_rq_unlock(rq, &flags);
10082} 8271}
10083#endif /* CONFIG_GROUP_SCHED */ 8272#endif /* CONFIG_CGROUP_SCHED */
10084 8273
10085#ifdef CONFIG_FAIR_GROUP_SCHED 8274#ifdef CONFIG_FAIR_GROUP_SCHED
10086static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8275static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10105,9 +8294,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
10105 struct rq *rq = cfs_rq->rq; 8294 struct rq *rq = cfs_rq->rq;
10106 unsigned long flags; 8295 unsigned long flags;
10107 8296
10108 spin_lock_irqsave(&rq->lock, flags); 8297 raw_spin_lock_irqsave(&rq->lock, flags);
10109 __set_se_shares(se, shares); 8298 __set_se_shares(se, shares);
10110 spin_unlock_irqrestore(&rq->lock, flags); 8299 raw_spin_unlock_irqrestore(&rq->lock, flags);
10111} 8300}
10112 8301
10113static DEFINE_MUTEX(shares_mutex); 8302static DEFINE_MUTEX(shares_mutex);
@@ -10222,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10222 runtime = d->rt_runtime; 8411 runtime = d->rt_runtime;
10223 } 8412 }
10224 8413
10225#ifdef CONFIG_USER_SCHED
10226 if (tg == &root_task_group) {
10227 period = global_rt_period();
10228 runtime = global_rt_runtime();
10229 }
10230#endif
10231
10232 /* 8414 /*
10233 * Cannot have more runtime than the period. 8415 * Cannot have more runtime than the period.
10234 */ 8416 */
@@ -10292,18 +8474,18 @@ static int tg_set_bandwidth(struct task_group *tg,
10292 if (err) 8474 if (err)
10293 goto unlock; 8475 goto unlock;
10294 8476
10295 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8477 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10296 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8478 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
10297 tg->rt_bandwidth.rt_runtime = rt_runtime; 8479 tg->rt_bandwidth.rt_runtime = rt_runtime;
10298 8480
10299 for_each_possible_cpu(i) { 8481 for_each_possible_cpu(i) {
10300 struct rt_rq *rt_rq = tg->rt_rq[i]; 8482 struct rt_rq *rt_rq = tg->rt_rq[i];
10301 8483
10302 spin_lock(&rt_rq->rt_runtime_lock); 8484 raw_spin_lock(&rt_rq->rt_runtime_lock);
10303 rt_rq->rt_runtime = rt_runtime; 8485 rt_rq->rt_runtime = rt_runtime;
10304 spin_unlock(&rt_rq->rt_runtime_lock); 8486 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10305 } 8487 }
10306 spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8488 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
10307 unlock: 8489 unlock:
10308 read_unlock(&tasklist_lock); 8490 read_unlock(&tasklist_lock);
10309 mutex_unlock(&rt_constraints_mutex); 8491 mutex_unlock(&rt_constraints_mutex);
@@ -10408,15 +8590,15 @@ static int sched_rt_global_constraints(void)
10408 if (sysctl_sched_rt_runtime == 0) 8590 if (sysctl_sched_rt_runtime == 0)
10409 return -EBUSY; 8591 return -EBUSY;
10410 8592
10411 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 8593 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
10412 for_each_possible_cpu(i) { 8594 for_each_possible_cpu(i) {
10413 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 8595 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
10414 8596
10415 spin_lock(&rt_rq->rt_runtime_lock); 8597 raw_spin_lock(&rt_rq->rt_runtime_lock);
10416 rt_rq->rt_runtime = global_rt_runtime(); 8598 rt_rq->rt_runtime = global_rt_runtime();
10417 spin_unlock(&rt_rq->rt_runtime_lock); 8599 raw_spin_unlock(&rt_rq->rt_runtime_lock);
10418 } 8600 }
10419 spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); 8601 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
10420 8602
10421 return 0; 8603 return 0;
10422} 8604}
@@ -10707,9 +8889,9 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
10707 /* 8889 /*
10708 * Take rq->lock to make 64-bit read safe on 32-bit platforms. 8890 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
10709 */ 8891 */
10710 spin_lock_irq(&cpu_rq(cpu)->lock); 8892 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10711 data = *cpuusage; 8893 data = *cpuusage;
10712 spin_unlock_irq(&cpu_rq(cpu)->lock); 8894 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10713#else 8895#else
10714 data = *cpuusage; 8896 data = *cpuusage;
10715#endif 8897#endif
@@ -10725,9 +8907,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
10725 /* 8907 /*
10726 * Take rq->lock to make 64-bit write safe on 32-bit platforms. 8908 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
10727 */ 8909 */
10728 spin_lock_irq(&cpu_rq(cpu)->lock); 8910 raw_spin_lock_irq(&cpu_rq(cpu)->lock);
10729 *cpuusage = val; 8911 *cpuusage = val;
10730 spin_unlock_irq(&cpu_rq(cpu)->lock); 8912 raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
10731#else 8913#else
10732 *cpuusage = val; 8914 *cpuusage = val;
10733#endif 8915#endif
@@ -10848,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10848} 9030}
10849 9031
10850/* 9032/*
9033 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9034 * in cputime_t units. As a result, cpuacct_update_stats calls
9035 * percpu_counter_add with values large enough to always overflow the
9036 * per cpu batch limit causing bad SMP scalability.
9037 *
9038 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9039 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9040 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9041 */
9042#ifdef CONFIG_SMP
9043#define CPUACCT_BATCH \
9044 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9045#else
9046#define CPUACCT_BATCH 0
9047#endif
9048
9049/*
10851 * Charge the system/user time to the task's accounting group. 9050 * Charge the system/user time to the task's accounting group.
10852 */ 9051 */
10853static void cpuacct_update_stats(struct task_struct *tsk, 9052static void cpuacct_update_stats(struct task_struct *tsk,
10854 enum cpuacct_stat_index idx, cputime_t val) 9053 enum cpuacct_stat_index idx, cputime_t val)
10855{ 9054{
10856 struct cpuacct *ca; 9055 struct cpuacct *ca;
9056 int batch = CPUACCT_BATCH;
10857 9057
10858 if (unlikely(!cpuacct_subsys.active)) 9058 if (unlikely(!cpuacct_subsys.active))
10859 return; 9059 return;
@@ -10862,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10862 ca = task_ca(tsk); 9062 ca = task_ca(tsk);
10863 9063
10864 do { 9064 do {
10865 percpu_counter_add(&ca->cpustat[idx], val); 9065 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10866 ca = ca->parent; 9066 ca = ca->parent;
10867 } while (ca); 9067 } while (ca);
10868 rcu_read_unlock(); 9068 rcu_read_unlock();
@@ -10961,9 +9161,9 @@ void synchronize_sched_expedited(void)
10961 init_completion(&req->done); 9161 init_completion(&req->done);
10962 req->task = NULL; 9162 req->task = NULL;
10963 req->dest_cpu = RCU_MIGRATION_NEED_QS; 9163 req->dest_cpu = RCU_MIGRATION_NEED_QS;
10964 spin_lock_irqsave(&rq->lock, flags); 9164 raw_spin_lock_irqsave(&rq->lock, flags);
10965 list_add(&req->list, &rq->migration_queue); 9165 list_add(&req->list, &rq->migration_queue);
10966 spin_unlock_irqrestore(&rq->lock, flags); 9166 raw_spin_unlock_irqrestore(&rq->lock, flags);
10967 wake_up_process(rq->migration_thread); 9167 wake_up_process(rq->migration_thread);
10968 } 9168 }
10969 for_each_online_cpu(cpu) { 9169 for_each_online_cpu(cpu) {
@@ -10971,11 +9171,11 @@ void synchronize_sched_expedited(void)
10971 req = &per_cpu(rcu_migration_req, cpu); 9171 req = &per_cpu(rcu_migration_req, cpu);
10972 rq = cpu_rq(cpu); 9172 rq = cpu_rq(cpu);
10973 wait_for_completion(&req->done); 9173 wait_for_completion(&req->done);
10974 spin_lock_irqsave(&rq->lock, flags); 9174 raw_spin_lock_irqsave(&rq->lock, flags);
10975 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC)) 9175 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
10976 need_full_sync = 1; 9176 need_full_sync = 1;
10977 req->dest_cpu = RCU_MIGRATION_IDLE; 9177 req->dest_cpu = RCU_MIGRATION_IDLE;
10978 spin_unlock_irqrestore(&rq->lock, flags); 9178 raw_spin_unlock_irqrestore(&rq->lock, flags);
10979 } 9179 }
10980 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 9180 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
10981 synchronize_sched_expedited_count++; 9181 synchronize_sched_expedited_count++;
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 479ce5682d7c..5b496132c28a 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -236,6 +236,18 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
236} 236}
237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 237EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
238 238
239unsigned long long cpu_clock(int cpu)
240{
241 unsigned long long clock;
242 unsigned long flags;
243
244 local_irq_save(flags);
245 clock = sched_clock_cpu(cpu);
246 local_irq_restore(flags);
247
248 return clock;
249}
250
239#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ 251#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
240 252
241void sched_clock_init(void) 253void sched_clock_init(void)
@@ -251,17 +263,12 @@ u64 sched_clock_cpu(int cpu)
251 return sched_clock(); 263 return sched_clock();
252} 264}
253 265
254#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
255 266
256unsigned long long cpu_clock(int cpu) 267unsigned long long cpu_clock(int cpu)
257{ 268{
258 unsigned long long clock; 269 return sched_clock_cpu(cpu);
259 unsigned long flags; 270}
260 271
261 local_irq_save(flags); 272#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
262 clock = sched_clock_cpu(cpu);
263 local_irq_restore(flags);
264 273
265 return clock;
266}
267EXPORT_SYMBOL_GPL(cpu_clock); 274EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 0f052fc674d5..eeb3506c4834 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for_each_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 51
54/** 52/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
@@ -135,26 +133,26 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
135 if (likely(newpri != CPUPRI_INVALID)) { 133 if (likely(newpri != CPUPRI_INVALID)) {
136 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 134 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
137 135
138 spin_lock_irqsave(&vec->lock, flags); 136 raw_spin_lock_irqsave(&vec->lock, flags);
139 137
140 cpumask_set_cpu(cpu, vec->mask); 138 cpumask_set_cpu(cpu, vec->mask);
141 vec->count++; 139 vec->count++;
142 if (vec->count == 1) 140 if (vec->count == 1)
143 set_bit(newpri, cp->pri_active); 141 set_bit(newpri, cp->pri_active);
144 142
145 spin_unlock_irqrestore(&vec->lock, flags); 143 raw_spin_unlock_irqrestore(&vec->lock, flags);
146 } 144 }
147 if (likely(oldpri != CPUPRI_INVALID)) { 145 if (likely(oldpri != CPUPRI_INVALID)) {
148 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 146 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
149 147
150 spin_lock_irqsave(&vec->lock, flags); 148 raw_spin_lock_irqsave(&vec->lock, flags);
151 149
152 vec->count--; 150 vec->count--;
153 if (!vec->count) 151 if (!vec->count)
154 clear_bit(oldpri, cp->pri_active); 152 clear_bit(oldpri, cp->pri_active);
155 cpumask_clear_cpu(cpu, vec->mask); 153 cpumask_clear_cpu(cpu, vec->mask);
156 154
157 spin_unlock_irqrestore(&vec->lock, flags); 155 raw_spin_unlock_irqrestore(&vec->lock, flags);
158 } 156 }
159 157
160 *currpri = newpri; 158 *currpri = newpri;
@@ -180,7 +178,7 @@ int cpupri_init(struct cpupri *cp, bool bootmem)
180 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 178 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
181 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 179 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
182 180
183 spin_lock_init(&vec->lock); 181 raw_spin_lock_init(&vec->lock);
184 vec->count = 0; 182 vec->count = 0;
185 if (!zalloc_cpumask_var(&vec->mask, gfp)) 183 if (!zalloc_cpumask_var(&vec->mask, gfp))
186 goto cleanup; 184 goto cleanup;
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9a7e859b8fbf..7cb5bb6b95be 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -12,7 +12,7 @@
12/* values 2-101 are RT priorities 0-99 */ 12/* values 2-101 are RT priorities 0-99 */
13 13
14struct cpupri_vec { 14struct cpupri_vec {
15 spinlock_t lock; 15 raw_spinlock_t lock;
16 int count; 16 int count;
17 cpumask_var_t mask; 17 cpumask_var_t mask;
18}; 18};
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6988cf08f705..67f95aada4b9 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -184,7 +184,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 184 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
185 SPLIT_NS(cfs_rq->exec_clock)); 185 SPLIT_NS(cfs_rq->exec_clock));
186 186
187 spin_lock_irqsave(&rq->lock, flags); 187 raw_spin_lock_irqsave(&rq->lock, flags);
188 if (cfs_rq->rb_leftmost) 188 if (cfs_rq->rb_leftmost)
189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime; 189 MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
190 last = __pick_last_entity(cfs_rq); 190 last = __pick_last_entity(cfs_rq);
@@ -192,7 +192,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
192 max_vruntime = last->vruntime; 192 max_vruntime = last->vruntime;
193 min_vruntime = cfs_rq->min_vruntime; 193 min_vruntime = cfs_rq->min_vruntime;
194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; 194 rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime;
195 spin_unlock_irqrestore(&rq->lock, flags); 195 raw_spin_unlock_irqrestore(&rq->lock, flags);
196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", 196 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime",
197 SPLIT_NS(MIN_vruntime)); 197 SPLIT_NS(MIN_vruntime));
198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", 198 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime",
@@ -309,6 +309,12 @@ static void print_cpu(struct seq_file *m, int cpu)
309 print_rq(m, rq, cpu); 309 print_rq(m, rq, cpu);
310} 310}
311 311
312static const char *sched_tunable_scaling_names[] = {
313 "none",
314 "logaritmic",
315 "linear"
316};
317
312static int sched_debug_show(struct seq_file *m, void *v) 318static int sched_debug_show(struct seq_file *m, void *v)
313{ 319{
314 u64 now = ktime_to_ns(ktime_get()); 320 u64 now = ktime_to_ns(ktime_get());
@@ -334,6 +340,10 @@ static int sched_debug_show(struct seq_file *m, void *v)
334#undef PN 340#undef PN
335#undef P 341#undef P
336 342
343 SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
344 sysctl_sched_tunable_scaling,
345 sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
346
337 for_each_online_cpu(cpu) 347 for_each_online_cpu(cpu)
338 print_cpu(m, cpu); 348 print_cpu(m, cpu);
339 349
@@ -399,7 +409,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
399 PN(se.sum_exec_runtime); 409 PN(se.sum_exec_runtime);
400 PN(se.avg_overlap); 410 PN(se.avg_overlap);
401 PN(se.avg_wakeup); 411 PN(se.avg_wakeup);
402 PN(se.avg_running);
403 412
404 nr_switches = p->nvcsw + p->nivcsw; 413 nr_switches = p->nvcsw + p->nivcsw;
405 414
@@ -423,7 +432,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
423 P(se.nr_failed_migrations_running); 432 P(se.nr_failed_migrations_running);
424 P(se.nr_failed_migrations_hot); 433 P(se.nr_failed_migrations_hot);
425 P(se.nr_forced_migrations); 434 P(se.nr_forced_migrations);
426 P(se.nr_forced2_migrations);
427 P(se.nr_wakeups); 435 P(se.nr_wakeups);
428 P(se.nr_wakeups_sync); 436 P(se.nr_wakeups_sync);
429 P(se.nr_wakeups_migrate); 437 P(se.nr_wakeups_migrate);
@@ -499,7 +507,6 @@ void proc_sched_set_task(struct task_struct *p)
499 p->se.nr_failed_migrations_running = 0; 507 p->se.nr_failed_migrations_running = 0;
500 p->se.nr_failed_migrations_hot = 0; 508 p->se.nr_failed_migrations_hot = 0;
501 p->se.nr_forced_migrations = 0; 509 p->se.nr_forced_migrations = 0;
502 p->se.nr_forced2_migrations = 0;
503 p->se.nr_wakeups = 0; 510 p->se.nr_wakeups = 0;
504 p->se.nr_wakeups_sync = 0; 511 p->se.nr_wakeups_sync = 0;
505 p->se.nr_wakeups_migrate = 0; 512 p->se.nr_wakeups_migrate = 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f61837ad336d..3e1fd96c6cf9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -21,6 +21,7 @@
21 */ 21 */
22 22
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h>
24 25
25/* 26/*
26 * Targeted preemption latency for CPU-bound tasks: 27 * Targeted preemption latency for CPU-bound tasks:
@@ -35,12 +36,26 @@
35 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
36 */ 37 */
37unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 5000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL;
40
41/*
42 * The initial- and re-scaling of tunables is configurable
43 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
44 *
45 * Options are:
46 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
47 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
48 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
49 */
50enum sched_tunable_scaling sysctl_sched_tunable_scaling
51 = SCHED_TUNABLESCALING_LOG;
38 52
39/* 53/*
40 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
41 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
42 */ 56 */
43unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 1000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL;
44 59
45/* 60/*
46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
@@ -70,6 +85,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
70 * have immediate wakeup/sleep latencies. 85 * have immediate wakeup/sleep latencies.
71 */ 86 */
72unsigned int sysctl_sched_wakeup_granularity = 1000000UL; 87unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
88unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
73 89
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 90const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 91
@@ -383,11 +399,12 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
383 */ 399 */
384 400
385#ifdef CONFIG_SCHED_DEBUG 401#ifdef CONFIG_SCHED_DEBUG
386int sched_nr_latency_handler(struct ctl_table *table, int write, 402int sched_proc_update_handler(struct ctl_table *table, int write,
387 void __user *buffer, size_t *lenp, 403 void __user *buffer, size_t *lenp,
388 loff_t *ppos) 404 loff_t *ppos)
389{ 405{
390 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 406 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
407 int factor = get_update_sysctl_factor();
391 408
392 if (ret || !write) 409 if (ret || !write)
393 return ret; 410 return ret;
@@ -395,6 +412,14 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
395 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, 412 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
396 sysctl_sched_min_granularity); 413 sysctl_sched_min_granularity);
397 414
415#define WRT_SYSCTL(name) \
416 (normalized_sysctl_##name = sysctl_##name / (factor))
417 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL
422
398 return 0; 423 return 0;
399} 424}
400#endif 425#endif
@@ -485,6 +510,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
485 curr->sum_exec_runtime += delta_exec; 510 curr->sum_exec_runtime += delta_exec;
486 schedstat_add(cfs_rq, exec_clock, delta_exec); 511 schedstat_add(cfs_rq, exec_clock, delta_exec);
487 delta_exec_weighted = calc_delta_fair(delta_exec, curr); 512 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
513
488 curr->vruntime += delta_exec_weighted; 514 curr->vruntime += delta_exec_weighted;
489 update_min_vruntime(cfs_rq); 515 update_min_vruntime(cfs_rq);
490} 516}
@@ -740,16 +766,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
740 se->vruntime = vruntime; 766 se->vruntime = vruntime;
741} 767}
742 768
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
743static void 772static void
744enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) 773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
745{ 774{
746 /* 775 /*
776 * Update the normalized vruntime before updating min_vruntime
777 * through callig update_curr().
778 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE))
780 se->vruntime += cfs_rq->min_vruntime;
781
782 /*
747 * Update run-time statistics of the 'current'. 783 * Update run-time statistics of the 'current'.
748 */ 784 */
749 update_curr(cfs_rq); 785 update_curr(cfs_rq);
750 account_entity_enqueue(cfs_rq, se); 786 account_entity_enqueue(cfs_rq, se);
751 787
752 if (wakeup) { 788 if (flags & ENQUEUE_WAKEUP) {
753 place_entity(cfs_rq, se, 0); 789 place_entity(cfs_rq, se, 0);
754 enqueue_sleeper(cfs_rq, se); 790 enqueue_sleeper(cfs_rq, se);
755 } 791 }
@@ -803,6 +839,14 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
803 __dequeue_entity(cfs_rq, se); 839 __dequeue_entity(cfs_rq, se);
804 account_entity_dequeue(cfs_rq, se); 840 account_entity_dequeue(cfs_rq, se);
805 update_min_vruntime(cfs_rq); 841 update_min_vruntime(cfs_rq);
842
843 /*
844 * Normalize the entity after updating the min_vruntime because the
845 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position.
847 */
848 if (!sleep)
849 se->vruntime -= cfs_rq->min_vruntime;
806} 850}
807 851
808/* 852/*
@@ -1009,17 +1053,24 @@ static inline void hrtick_update(struct rq *rq)
1009 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1010 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1011 */ 1055 */
1012static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1013{ 1058{
1014 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1015 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1016 1067
1017 for_each_sched_entity(se) { 1068 for_each_sched_entity(se) {
1018 if (se->on_rq) 1069 if (se->on_rq)
1019 break; 1070 break;
1020 cfs_rq = cfs_rq_of(se); 1071 cfs_rq = cfs_rq_of(se);
1021 enqueue_entity(cfs_rq, se, wakeup); 1072 enqueue_entity(cfs_rq, se, flags);
1022 wakeup = 1; 1073 flags = ENQUEUE_WAKEUP;
1023 } 1074 }
1024 1075
1025 hrtick_update(rq); 1076 hrtick_update(rq);
@@ -1095,6 +1146,14 @@ static void yield_task_fair(struct rq *rq)
1095 1146
1096#ifdef CONFIG_SMP 1147#ifdef CONFIG_SMP
1097 1148
1149static void task_waking_fair(struct rq *rq, struct task_struct *p)
1150{
1151 struct sched_entity *se = &p->se;
1152 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1153
1154 se->vruntime -= cfs_rq->min_vruntime;
1155}
1156
1098#ifdef CONFIG_FAIR_GROUP_SCHED 1157#ifdef CONFIG_FAIR_GROUP_SCHED
1099/* 1158/*
1100 * effective_load() calculates the load change as seen from the root_task_group 1159 * effective_load() calculates the load change as seen from the root_task_group
@@ -1403,8 +1462,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1403 new_cpu = prev_cpu; 1462 new_cpu = prev_cpu;
1404 } 1463 }
1405 1464
1406 rcu_read_lock();
1407 for_each_domain(cpu, tmp) { 1465 for_each_domain(cpu, tmp) {
1466 if (!(tmp->flags & SD_LOAD_BALANCE))
1467 continue;
1468
1408 /* 1469 /*
1409 * If power savings logic is enabled for a domain, see if we 1470 * If power savings logic is enabled for a domain, see if we
1410 * are not overloaded, if so, don't balance wider. 1471 * are not overloaded, if so, don't balance wider.
@@ -1448,7 +1509,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1448 * If there's an idle sibling in this domain, make that 1509 * If there's an idle sibling in this domain, make that
1449 * the wake_affine target instead of the current cpu. 1510 * the wake_affine target instead of the current cpu.
1450 */ 1511 */
1451 if (tmp->flags & SD_PREFER_SIBLING) 1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1452 target = select_idle_sibling(p, tmp, target); 1513 target = select_idle_sibling(p, tmp, target);
1453 1514
1454 if (target >= 0) { 1515 if (target >= 0) {
@@ -1484,10 +1545,8 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1484 update_shares(tmp); 1545 update_shares(tmp);
1485 } 1546 }
1486 1547
1487 if (affine_sd && wake_affine(affine_sd, p, sync)) { 1548 if (affine_sd && wake_affine(affine_sd, p, sync))
1488 new_cpu = cpu; 1549 return cpu;
1489 goto out;
1490 }
1491 1550
1492 while (sd) { 1551 while (sd) {
1493 int load_idx = sd->forkexec_idx; 1552 int load_idx = sd->forkexec_idx;
@@ -1528,8 +1587,6 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1528 /* while loop will break here if sd == NULL */ 1587 /* while loop will break here if sd == NULL */
1529 } 1588 }
1530 1589
1531out:
1532 rcu_read_unlock();
1533 return new_cpu; 1590 return new_cpu;
1534} 1591}
1535#endif /* CONFIG_SMP */ 1592#endif /* CONFIG_SMP */
@@ -1651,12 +1708,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1651 int sync = wake_flags & WF_SYNC; 1708 int sync = wake_flags & WF_SYNC;
1652 int scale = cfs_rq->nr_running >= sched_nr_latency; 1709 int scale = cfs_rq->nr_running >= sched_nr_latency;
1653 1710
1654 update_curr(cfs_rq); 1711 if (unlikely(rt_prio(p->prio)))
1655 1712 goto preempt;
1656 if (unlikely(rt_prio(p->prio))) {
1657 resched_task(curr);
1658 return;
1659 }
1660 1713
1661 if (unlikely(p->sched_class != &fair_sched_class)) 1714 if (unlikely(p->sched_class != &fair_sched_class))
1662 return; 1715 return;
@@ -1682,50 +1735,44 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1682 return; 1735 return;
1683 1736
1684 /* Idle tasks are by definition preempted by everybody. */ 1737 /* Idle tasks are by definition preempted by everybody. */
1685 if (unlikely(curr->policy == SCHED_IDLE)) { 1738 if (unlikely(curr->policy == SCHED_IDLE))
1686 resched_task(curr); 1739 goto preempt;
1687 return;
1688 }
1689 1740
1690 if ((sched_feat(WAKEUP_SYNC) && sync) || 1741 if (sched_feat(WAKEUP_SYNC) && sync)
1691 (sched_feat(WAKEUP_OVERLAP) && 1742 goto preempt;
1692 (se->avg_overlap < sysctl_sched_migration_cost &&
1693 pse->avg_overlap < sysctl_sched_migration_cost))) {
1694 resched_task(curr);
1695 return;
1696 }
1697 1743
1698 if (sched_feat(WAKEUP_RUNNING)) { 1744 if (sched_feat(WAKEUP_OVERLAP) &&
1699 if (pse->avg_running < se->avg_running) { 1745 se->avg_overlap < sysctl_sched_migration_cost &&
1700 set_next_buddy(pse); 1746 pse->avg_overlap < sysctl_sched_migration_cost)
1701 resched_task(curr); 1747 goto preempt;
1702 return;
1703 }
1704 }
1705 1748
1706 if (!sched_feat(WAKEUP_PREEMPT)) 1749 if (!sched_feat(WAKEUP_PREEMPT))
1707 return; 1750 return;
1708 1751
1752 update_curr(cfs_rq);
1709 find_matching_se(&se, &pse); 1753 find_matching_se(&se, &pse);
1710
1711 BUG_ON(!pse); 1754 BUG_ON(!pse);
1755 if (wakeup_preempt_entity(se, pse) == 1)
1756 goto preempt;
1712 1757
1713 if (wakeup_preempt_entity(se, pse) == 1) { 1758 return;
1714 resched_task(curr); 1759
1715 /* 1760preempt:
1716 * Only set the backward buddy when the current task is still 1761 resched_task(curr);
1717 * on the rq. This can happen when a wakeup gets interleaved 1762 /*
1718 * with schedule on the ->pre_schedule() or idle_balance() 1763 * Only set the backward buddy when the current task is still
1719 * point, either of which can * drop the rq lock. 1764 * on the rq. This can happen when a wakeup gets interleaved
1720 * 1765 * with schedule on the ->pre_schedule() or idle_balance()
1721 * Also, during early boot the idle thread is in the fair class, 1766 * point, either of which can * drop the rq lock.
1722 * for obvious reasons its a bad idea to schedule back to it. 1767 *
1723 */ 1768 * Also, during early boot the idle thread is in the fair class,
1724 if (unlikely(!se->on_rq || curr == rq->idle)) 1769 * for obvious reasons its a bad idea to schedule back to it.
1725 return; 1770 */
1726 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) 1771 if (unlikely(!se->on_rq || curr == rq->idle))
1727 set_last_buddy(se); 1772 return;
1728 } 1773
1774 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1775 set_last_buddy(se);
1729} 1776}
1730 1777
1731static struct task_struct *pick_next_task_fair(struct rq *rq) 1778static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1769,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1769 */ 1816 */
1770 1817
1771/* 1818/*
1772 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1773 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1774 * dequeued so the iterator has to be dequeue-safe. Here we
1775 * achieve that by always pre-iterating before returning
1776 * the current task:
1777 */ 1821 */
1778static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1779__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1780{ 1824{
1781 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1782 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1783 1830
1784 if (next == &cfs_rq->tasks) 1831/*
1785 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1786 1851
1787 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1788 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1789 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1790 1856
1791 return p; 1857 /*
1792} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1793 1862
1794static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1795{ 1864 if (!tsk_cache_hot ||
1796 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1797 1874
1798 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1799} 1880}
1800 1881
1801static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1802{ 1892{
1803 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1804 1903
1805 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1806} 1916}
1807 1917
1808static unsigned long 1918static unsigned long
1809__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1810 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1811 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1812 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1813{ 1923{
1814 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1815 1927
1816 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1817 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1818 cfs_rq_iterator.arg = cfs_rq;
1819 1930
1820 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1821 max_load_move, sd, idle, all_pinned, 1932
1822 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1823} 1977}
1824 1978
1825#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1851,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1851 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1852 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1853 2007
1854 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1855 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1856 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1857 2011
1858 if (!moved_load) 2012 if (!moved_load)
1859 continue; 2013 continue;
@@ -1876,35 +2030,1529 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1876 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1877 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1878{ 2032{
1879 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1880 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1881 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1882} 2036}
1883#endif 2037#endif
1884 2038
1885static int 2039/*
1886move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1887 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
2050{
2051 unsigned long total_load_moved = 0, load_moved;
2052 int this_best_prio = this_rq->curr->prio;
2053
2054 do {
2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
2058
2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
2062 /*
2063 * NEWIDLE balancing is a source of latency, so preemptible
2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
2066 */
2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
2068 break;
2069
2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100 unsigned long busiest_group_capacity;
2101
2102 int group_imb; /* Is there imbalance in this sd */
2103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2104 int power_savings_balance; /* Is powersave balance needed for this sd */
2105 struct sched_group *group_min; /* Least loaded group in sd */
2106 struct sched_group *group_leader; /* Group which relieves group_min */
2107 unsigned long min_load_per_task; /* load_per_task in group_min */
2108 unsigned long leader_nr_running; /* Nr running of group_leader */
2109 unsigned long min_nr_running; /* Nr running of group_min */
2110#endif
2111};
2112
2113/*
2114 * sg_lb_stats - stats of a sched_group required for load_balancing
2115 */
2116struct sg_lb_stats {
2117 unsigned long avg_load; /*Avg load across the CPUs of the group */
2118 unsigned long group_load; /* Total load over the CPUs of the group */
2119 unsigned long sum_nr_running; /* Nr tasks running in the group */
2120 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2121 unsigned long group_capacity;
2122 int group_imb; /* Is there an imbalance in the group ? */
2123};
2124
2125/**
2126 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2127 * @group: The group whose first cpu is to be returned.
2128 */
2129static inline unsigned int group_first_cpu(struct sched_group *group)
2130{
2131 return cpumask_first(sched_group_cpus(group));
2132}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
2137 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
2155 }
2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
2204 * If the local group is idle or completely loaded
2205 * no need to do power savings balance at this domain
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
2222 * This is the group from where we need to pick up the load
2223 * for saving power
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
2235 * Calculate the group which is almost near its
2236 * capacity but still has some space to pick up some load
2237 * from other group and save more power
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
2260 * sched_domain, so that it's CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
2288static inline void update_sd_power_savings_stats(struct sched_group *group,
2289 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2290{
2291 return;
2292}
2293
2294static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2295 int this_cpu, unsigned long *imbalance)
2296{
2297 return 0;
2298}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
2458 * First idle cpu or the first cpu(busiest) in this sched group
2459 * is eligible for doing load balancing at this and above
2460 * domains. In the newly idle case, we will allow all the cpu's
2461 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
2492
2493/**
2494 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
2535 * In case the child domain prefers tasks go to siblings
2536 * first, lower the group capacity to one so that we'll try
2537 * and move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
2598 * OK, we don't have enough imbalance to justify moving tasks,
2599 * however we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
2653 * max load less than avg load(as we skip the groups at or below
2654 * its cpu_power, while calculating max_load..)
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
2692 * there is no gaurantee that any tasks will be moved so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
1888{ 2731{
1889 struct cfs_rq *busy_cfs_rq; 2732 struct sd_lb_stats sds;
1890 struct rq_iterator cfs_rq_iterator; 2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
2737 * Compute the various statistics relavent for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
2748 * 4) This group is more busy than the avg busieness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
1891 2813
1892 cfs_rq_iterator.start = load_balance_start_fair; 2814 /*
1893 cfs_rq_iterator.next = load_balance_next_fair; 2815 * For the load comparisons with the other cpu's, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
1894 2821
1895 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
2830
2831/*
2832 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2833 * so long as it is large enough.
2834 */
2835#define MAX_PINNED_INTERVAL 512
2836
2837/* Working cpumask for load_balance and load_balance_newidle. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
/*
 * need_active_balance - decide whether load_balance() should kick the
 * migration thread on the busiest cpu after move_tasks() failed.
 * Returns non-zero when active balancing should be started.
 */
static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
{
	if (idle == CPU_NEWLY_IDLE) {
		/*
		 * The only task running in a non-idle cpu can be moved to this
		 * cpu in an attempt to completely freeup the other CPU
		 * package.
		 *
		 * The package power saving logic comes from
		 * find_busiest_group(). If there are no imbalance, then
		 * f_b_g() will return NULL. However when sched_mc={1,2} then
		 * f_b_g() will select a group from which a running task may be
		 * pulled to this cpu in order to make the other package idle.
		 * If there is no opportunity to make a package idle and if
		 * there are no imbalance, then f_b_g() will return NULL and no
		 * action will be taken in load_balance_newidle().
		 *
		 * Under normal task pull operation due to imbalance, there
		 * will be more than one task in the source run queue and
		 * move_tasks() will succeed.  ld_moved will be true and this
		 * active balance code will not be triggered.
		 */
		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
			return 0;

		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
			return 0;
	}

	/* Only escalate after repeated ordinary balance failures. */
	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891 * When power savings policy is enabled for the parent domain, idle
2892 * sibling can pick up load irrespective of busy siblings. In this case,
2893 * let the state of idle sibling percolate up as CPU_IDLE, instead of
2894 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
2901
2902redo:
2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
2906
2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
2927 /*
2928 * Attempt to move tasks. If find_busiest_group has found
2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
2932 */
2933 local_irq_save(flags);
2934 double_rq_lock(this_rq, busiest);
2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
2953 }
2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962 /* don't kick the migration_thread, if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107 * This condition is "impossible", if it occurs
3108 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
/* Idle load balancing (ilb) bookkeeping for NO_HZ. */
static struct {
	atomic_t load_balancer;		/* cpu id of ilb owner, -1 if none */
	cpumask_var_t cpu_mask;		/* cpus whose tick is stopped */
	cpumask_var_t ilb_grp_nohz_mask; /* scratch mask for is_semi_idle_group() */
} nohz ____cacheline_aligned = {
	.load_balancer = ATOMIC_INIT(-1),
};

/* Return the cpu currently acting as idle load balancer, or -1 if none. */
int get_nohz_load_balancer(void)
{
	return atomic_read(&nohz.load_balancer);
}
3150
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
 * lowest_flag_domain - Return lowest sched_domain containing flag.
 * @cpu:	The cpu whose lowest level of sched domain is to
 *		be returned.
 * @flag:	The flag to check for the lowest sched_domain
 *		for the given cpu.
 *
 * Returns the lowest sched_domain of a cpu which contains the given flag,
 * or NULL if no domain in the hierarchy has it.
 */
static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
{
	struct sched_domain *sd;

	for_each_domain(cpu, sd)
		if (sd && (sd->flags & flag))
			break;

	return sd;
}

/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu:	The cpu whose domains we're iterating over.
 * @sd:		variable holding the value of the power_savings_sd
 *		for cpu.
 * @flag:	The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
 * set, starting from the lowest sched_domain to the highest.
 * Stops as soon as a parent domain without the flag is reached.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for (sd = lowest_flag_domain(cpu, flag); \
		(sd && (sd->flags & flag)); sd = sd->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
3192 * We define a sched_group to be semi idle if it has atleast one idle-CPU
3193 * and atleast one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
3202 * A sched_group is semi-idle when it has atleast one busy cpu
3203 * and atleast one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
3217 * Returns: Returns the id of the idle load balancer if it exists,
3218 * Else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
3223 * when there are other idle cpu's which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239 * idle CPU. Don't walk the sched_domain hierarchy in such cases
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, tick is not stopped. And this tick will be used
3275 * for idle load balancing. ilb owner will still be part of
3276 * nohz.cpu_mask..
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner. And will be the owner till that cpu becomes busy
3280 * or if all cpus in the system stop their ticks at which point
3281 * there is no need for ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner, during the
3284 * next busy scheduler_tick()
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
1906 return 0; 3348 return 0;
1907} 3349}
3350#endif
3351
/* Serializes balancing of domains flagged SD_SERIALIZE. */
static DEFINE_SPINLOCK(balancing);

/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;

	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		interval = sd->balance_interval;
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);
		if (unlikely(!interval))
			interval = 1;
		if (interval > HZ*NR_CPUS/10)
			interval = HZ*NR_CPUS/10;

		need_serialize = sd->flags & SD_SERIALIZE;

		if (need_serialize) {
			/* Skip this domain if another cpu holds the lock. */
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * We've pulled tasks over so either we're no
				 * longer idle, or one of our SMT siblings is
				 * not idle.
				 */
				idle = CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!balance)
			break;
	}

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}
3429
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525 * someone else, then no need raise the SCHED_SOFTIRQ
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
3536
/* sched_class::rq_online hook: refresh tunables when a runqueue comes up. */
static void rq_online_fair(struct rq *rq)
{
	update_sysctl();
}

/* sched_class::rq_offline hook: refresh tunables when a runqueue goes down. */
static void rq_offline_fair(struct rq *rq)
{
	update_sysctl();
}
3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1908#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1909 3557
1910/* 3558/*
@@ -1922,28 +3570,30 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1922} 3570}
1923 3571
1924/* 3572/*
1925 * Share the fairness runtime between parent and child, thus the 3573 * called on fork with the child task as argument from the parent's context
1926 * total amount of pressure for CPU stays equal - new tasks 3574 * - child not yet on the tasklist
1927 * get a chance to run but frequent forkers are not allowed to 3575 * - preemption disabled
1928 * monopolize the CPU. Note: the parent runqueue is locked,
1929 * the child is not running yet.
1930 */ 3576 */
1931static void task_new_fair(struct rq *rq, struct task_struct *p) 3577static void task_fork_fair(struct task_struct *p)
1932{ 3578{
1933 struct cfs_rq *cfs_rq = task_cfs_rq(p); 3579 struct cfs_rq *cfs_rq = task_cfs_rq(current);
1934 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 3580 struct sched_entity *se = &p->se, *curr = cfs_rq->curr;
1935 int this_cpu = smp_processor_id(); 3581 int this_cpu = smp_processor_id();
3582 struct rq *rq = this_rq();
3583 unsigned long flags;
1936 3584
1937 sched_info_queued(p); 3585 raw_spin_lock_irqsave(&rq->lock, flags);
3586
3587 if (unlikely(task_cpu(p) != this_cpu))
3588 __set_task_cpu(p, this_cpu);
1938 3589
1939 update_curr(cfs_rq); 3590 update_curr(cfs_rq);
3591
1940 if (curr) 3592 if (curr)
1941 se->vruntime = curr->vruntime; 3593 se->vruntime = curr->vruntime;
1942 place_entity(cfs_rq, se, 1); 3594 place_entity(cfs_rq, se, 1);
1943 3595
1944 /* 'curr' will be NULL if the child belongs to a different group */ 3596 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
1945 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1946 curr && entity_before(curr, se)) {
1947 /* 3597 /*
1948 * Upon rescheduling, sched_class::put_prev_task() will place 3598 * Upon rescheduling, sched_class::put_prev_task() will place
1949 * 'current' within the tree based on its new key value. 3599 * 'current' within the tree based on its new key value.
@@ -1952,7 +3602,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1952 resched_task(rq->curr); 3602 resched_task(rq->curr);
1953 } 3603 }
1954 3604
1955 enqueue_task_fair(rq, p, 0); 3605 se->vruntime -= cfs_rq->min_vruntime;
3606
3607 raw_spin_unlock_irqrestore(&rq->lock, flags);
1956} 3608}
1957 3609
1958/* 3610/*
@@ -2005,30 +3657,27 @@ static void set_curr_task_fair(struct rq *rq)
2005} 3657}
2006 3658
2007#ifdef CONFIG_FAIR_GROUP_SCHED 3659#ifdef CONFIG_FAIR_GROUP_SCHED
2008static void moved_group_fair(struct task_struct *p) 3660static void moved_group_fair(struct task_struct *p, int on_rq)
2009{ 3661{
2010 struct cfs_rq *cfs_rq = task_cfs_rq(p); 3662 struct cfs_rq *cfs_rq = task_cfs_rq(p);
2011 3663
2012 update_curr(cfs_rq); 3664 update_curr(cfs_rq);
2013 place_entity(cfs_rq, &p->se, 1); 3665 if (!on_rq)
3666 place_entity(cfs_rq, &p->se, 1);
2014} 3667}
2015#endif 3668#endif
2016 3669
2017unsigned int get_rr_interval_fair(struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2018{ 3671{
2019 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
2020 unsigned long flags;
2021 struct rq *rq;
2022 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
2023 3674
2024 /* 3675 /*
2025 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise 3676 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
2026 * idle runqueue: 3677 * idle runqueue:
2027 */ 3678 */
2028 rq = task_rq_lock(task, &flags);
2029 if (rq->cfs.load.weight) 3679 if (rq->cfs.load.weight)
2030 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); 3680 rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
2031 task_rq_unlock(rq, &flags);
2032 3681
2033 return rr_interval; 3682 return rr_interval;
2034} 3683}
@@ -2050,13 +3699,15 @@ static const struct sched_class fair_sched_class = {
2050#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
2051 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
2052 3701
2053 .load_balance = load_balance_fair, 3702 .rq_online = rq_online_fair,
2054 .move_one_task = move_one_task_fair, 3703 .rq_offline = rq_offline_fair,
3704
3705 .task_waking = task_waking_fair,
2055#endif 3706#endif
2056 3707
2057 .set_curr_task = set_curr_task_fair, 3708 .set_curr_task = set_curr_task_fair,
2058 .task_tick = task_tick_fair, 3709 .task_tick = task_tick_fair,
2059 .task_new = task_new_fair, 3710 .task_fork = task_fork_fair,
2060 3711
2061 .prio_changed = prio_changed_fair, 3712 .prio_changed = prio_changed_fair,
2062 .switched_to = switched_to_fair, 3713 .switched_to = switched_to_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 0d94083582c7..d5059fd761d9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -54,11 +54,6 @@ SCHED_FEAT(WAKEUP_SYNC, 0)
54SCHED_FEAT(WAKEUP_OVERLAP, 0) 54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55 55
56/* 56/*
57 * Wakeup preemption towards tasks that run short
58 */
59SCHED_FEAT(WAKEUP_RUNNING, 0)
60
61/*
62 * Use the SYNC wakeup hint, pipes and the likes use this to indicate 57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
63 * the remote end is likely to consume the data we just wrote, and 58 * the remote end is likely to consume the data we just wrote, and
64 * therefore has cache benefit from being placed on the same cpu, see 59 * therefore has cache benefit from being placed on the same cpu, see
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index b133a28fcde3..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -34,34 +34,16 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep)
36{ 36{
37 spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
39 dump_stack(); 39 dump_stack();
40 spin_lock_irq(&rq->lock); 40 raw_spin_lock_irq(&rq->lock);
41} 41}
42 42
43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 43static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 5c5fef378415..bf3e38fdbe6d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -327,7 +333,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
327 333
328 weight = cpumask_weight(rd->span); 334 weight = cpumask_weight(rd->span);
329 335
330 spin_lock(&rt_b->rt_runtime_lock); 336 raw_spin_lock(&rt_b->rt_runtime_lock);
331 rt_period = ktime_to_ns(rt_b->rt_period); 337 rt_period = ktime_to_ns(rt_b->rt_period);
332 for_each_cpu(i, rd->span) { 338 for_each_cpu(i, rd->span) {
333 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 339 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
@@ -336,7 +342,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
336 if (iter == rt_rq) 342 if (iter == rt_rq)
337 continue; 343 continue;
338 344
339 spin_lock(&iter->rt_runtime_lock); 345 raw_spin_lock(&iter->rt_runtime_lock);
340 /* 346 /*
341 * Either all rqs have inf runtime and there's nothing to steal 347 * Either all rqs have inf runtime and there's nothing to steal
342 * or __disable_runtime() below sets a specific rq to inf to 348 * or __disable_runtime() below sets a specific rq to inf to
@@ -358,14 +364,14 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
358 rt_rq->rt_runtime += diff; 364 rt_rq->rt_runtime += diff;
359 more = 1; 365 more = 1;
360 if (rt_rq->rt_runtime == rt_period) { 366 if (rt_rq->rt_runtime == rt_period) {
361 spin_unlock(&iter->rt_runtime_lock); 367 raw_spin_unlock(&iter->rt_runtime_lock);
362 break; 368 break;
363 } 369 }
364 } 370 }
365next: 371next:
366 spin_unlock(&iter->rt_runtime_lock); 372 raw_spin_unlock(&iter->rt_runtime_lock);
367 } 373 }
368 spin_unlock(&rt_b->rt_runtime_lock); 374 raw_spin_unlock(&rt_b->rt_runtime_lock);
369 375
370 return more; 376 return more;
371} 377}
@@ -386,8 +392,8 @@ static void __disable_runtime(struct rq *rq)
386 s64 want; 392 s64 want;
387 int i; 393 int i;
388 394
389 spin_lock(&rt_b->rt_runtime_lock); 395 raw_spin_lock(&rt_b->rt_runtime_lock);
390 spin_lock(&rt_rq->rt_runtime_lock); 396 raw_spin_lock(&rt_rq->rt_runtime_lock);
391 /* 397 /*
392 * Either we're all inf and nobody needs to borrow, or we're 398 * Either we're all inf and nobody needs to borrow, or we're
393 * already disabled and thus have nothing to do, or we have 399 * already disabled and thus have nothing to do, or we have
@@ -396,7 +402,7 @@ static void __disable_runtime(struct rq *rq)
396 if (rt_rq->rt_runtime == RUNTIME_INF || 402 if (rt_rq->rt_runtime == RUNTIME_INF ||
397 rt_rq->rt_runtime == rt_b->rt_runtime) 403 rt_rq->rt_runtime == rt_b->rt_runtime)
398 goto balanced; 404 goto balanced;
399 spin_unlock(&rt_rq->rt_runtime_lock); 405 raw_spin_unlock(&rt_rq->rt_runtime_lock);
400 406
401 /* 407 /*
402 * Calculate the difference between what we started out with 408 * Calculate the difference between what we started out with
@@ -418,7 +424,7 @@ static void __disable_runtime(struct rq *rq)
418 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) 424 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
419 continue; 425 continue;
420 426
421 spin_lock(&iter->rt_runtime_lock); 427 raw_spin_lock(&iter->rt_runtime_lock);
422 if (want > 0) { 428 if (want > 0) {
423 diff = min_t(s64, iter->rt_runtime, want); 429 diff = min_t(s64, iter->rt_runtime, want);
424 iter->rt_runtime -= diff; 430 iter->rt_runtime -= diff;
@@ -427,13 +433,13 @@ static void __disable_runtime(struct rq *rq)
427 iter->rt_runtime -= want; 433 iter->rt_runtime -= want;
428 want -= want; 434 want -= want;
429 } 435 }
430 spin_unlock(&iter->rt_runtime_lock); 436 raw_spin_unlock(&iter->rt_runtime_lock);
431 437
432 if (!want) 438 if (!want)
433 break; 439 break;
434 } 440 }
435 441
436 spin_lock(&rt_rq->rt_runtime_lock); 442 raw_spin_lock(&rt_rq->rt_runtime_lock);
437 /* 443 /*
438 * We cannot be left wanting - that would mean some runtime 444 * We cannot be left wanting - that would mean some runtime
439 * leaked out of the system. 445 * leaked out of the system.
@@ -445,8 +451,8 @@ balanced:
445 * runtime - in which case borrowing doesn't make sense. 451 * runtime - in which case borrowing doesn't make sense.
446 */ 452 */
447 rt_rq->rt_runtime = RUNTIME_INF; 453 rt_rq->rt_runtime = RUNTIME_INF;
448 spin_unlock(&rt_rq->rt_runtime_lock); 454 raw_spin_unlock(&rt_rq->rt_runtime_lock);
449 spin_unlock(&rt_b->rt_runtime_lock); 455 raw_spin_unlock(&rt_b->rt_runtime_lock);
450 } 456 }
451} 457}
452 458
@@ -454,9 +460,9 @@ static void disable_runtime(struct rq *rq)
454{ 460{
455 unsigned long flags; 461 unsigned long flags;
456 462
457 spin_lock_irqsave(&rq->lock, flags); 463 raw_spin_lock_irqsave(&rq->lock, flags);
458 __disable_runtime(rq); 464 __disable_runtime(rq);
459 spin_unlock_irqrestore(&rq->lock, flags); 465 raw_spin_unlock_irqrestore(&rq->lock, flags);
460} 466}
461 467
462static void __enable_runtime(struct rq *rq) 468static void __enable_runtime(struct rq *rq)
@@ -472,13 +478,13 @@ static void __enable_runtime(struct rq *rq)
472 for_each_leaf_rt_rq(rt_rq, rq) { 478 for_each_leaf_rt_rq(rt_rq, rq) {
473 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 479 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
474 480
475 spin_lock(&rt_b->rt_runtime_lock); 481 raw_spin_lock(&rt_b->rt_runtime_lock);
476 spin_lock(&rt_rq->rt_runtime_lock); 482 raw_spin_lock(&rt_rq->rt_runtime_lock);
477 rt_rq->rt_runtime = rt_b->rt_runtime; 483 rt_rq->rt_runtime = rt_b->rt_runtime;
478 rt_rq->rt_time = 0; 484 rt_rq->rt_time = 0;
479 rt_rq->rt_throttled = 0; 485 rt_rq->rt_throttled = 0;
480 spin_unlock(&rt_rq->rt_runtime_lock); 486 raw_spin_unlock(&rt_rq->rt_runtime_lock);
481 spin_unlock(&rt_b->rt_runtime_lock); 487 raw_spin_unlock(&rt_b->rt_runtime_lock);
482 } 488 }
483} 489}
484 490
@@ -486,9 +492,9 @@ static void enable_runtime(struct rq *rq)
486{ 492{
487 unsigned long flags; 493 unsigned long flags;
488 494
489 spin_lock_irqsave(&rq->lock, flags); 495 raw_spin_lock_irqsave(&rq->lock, flags);
490 __enable_runtime(rq); 496 __enable_runtime(rq);
491 spin_unlock_irqrestore(&rq->lock, flags); 497 raw_spin_unlock_irqrestore(&rq->lock, flags);
492} 498}
493 499
494static int balance_runtime(struct rt_rq *rt_rq) 500static int balance_runtime(struct rt_rq *rt_rq)
@@ -496,9 +502,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
496 int more = 0; 502 int more = 0;
497 503
498 if (rt_rq->rt_time > rt_rq->rt_runtime) { 504 if (rt_rq->rt_time > rt_rq->rt_runtime) {
499 spin_unlock(&rt_rq->rt_runtime_lock); 505 raw_spin_unlock(&rt_rq->rt_runtime_lock);
500 more = do_balance_runtime(rt_rq); 506 more = do_balance_runtime(rt_rq);
501 spin_lock(&rt_rq->rt_runtime_lock); 507 raw_spin_lock(&rt_rq->rt_runtime_lock);
502 } 508 }
503 509
504 return more; 510 return more;
@@ -524,11 +530,11 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
524 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); 530 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
525 struct rq *rq = rq_of_rt_rq(rt_rq); 531 struct rq *rq = rq_of_rt_rq(rt_rq);
526 532
527 spin_lock(&rq->lock); 533 raw_spin_lock(&rq->lock);
528 if (rt_rq->rt_time) { 534 if (rt_rq->rt_time) {
529 u64 runtime; 535 u64 runtime;
530 536
531 spin_lock(&rt_rq->rt_runtime_lock); 537 raw_spin_lock(&rt_rq->rt_runtime_lock);
532 if (rt_rq->rt_throttled) 538 if (rt_rq->rt_throttled)
533 balance_runtime(rt_rq); 539 balance_runtime(rt_rq);
534 runtime = rt_rq->rt_runtime; 540 runtime = rt_rq->rt_runtime;
@@ -539,13 +545,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
539 } 545 }
540 if (rt_rq->rt_time || rt_rq->rt_nr_running) 546 if (rt_rq->rt_time || rt_rq->rt_nr_running)
541 idle = 0; 547 idle = 0;
542 spin_unlock(&rt_rq->rt_runtime_lock); 548 raw_spin_unlock(&rt_rq->rt_runtime_lock);
543 } else if (rt_rq->rt_nr_running) 549 } else if (rt_rq->rt_nr_running)
544 idle = 0; 550 idle = 0;
545 551
546 if (enqueue) 552 if (enqueue)
547 sched_rt_rq_enqueue(rt_rq); 553 sched_rt_rq_enqueue(rt_rq);
548 spin_unlock(&rq->lock); 554 raw_spin_unlock(&rq->lock);
549 } 555 }
550 556
551 return idle; 557 return idle;
@@ -624,11 +630,11 @@ static void update_curr_rt(struct rq *rq)
624 rt_rq = rt_rq_of_se(rt_se); 630 rt_rq = rt_rq_of_se(rt_se);
625 631
626 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { 632 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
627 spin_lock(&rt_rq->rt_runtime_lock); 633 raw_spin_lock(&rt_rq->rt_runtime_lock);
628 rt_rq->rt_time += delta_exec; 634 rt_rq->rt_time += delta_exec;
629 if (sched_rt_runtime_exceeded(rt_rq)) 635 if (sched_rt_runtime_exceeded(rt_rq))
630 resched_task(curr); 636 resched_task(curr);
631 spin_unlock(&rt_rq->rt_runtime_lock); 637 raw_spin_unlock(&rt_rq->rt_runtime_lock);
632 } 638 }
633 } 639 }
634} 640}
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1246,7 +1256,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1246 task_running(rq, task) || 1256 task_running(rq, task) ||
1247 !task->se.on_rq)) { 1257 !task->se.on_rq)) {
1248 1258
1249 spin_unlock(&lowest_rq->lock); 1259 raw_spin_unlock(&lowest_rq->lock);
1250 lowest_rq = NULL; 1260 lowest_rq = NULL;
1251 break; 1261 break;
1252 } 1262 }
@@ -1472,7 +1482,7 @@ static void post_schedule_rt(struct rq *rq)
1472 * If we are not running and we are not going to reschedule soon, we should 1482 * If we are not running and we are not going to reschedule soon, we should
1473 * try to push tasks away now 1483 * try to push tasks away now
1474 */ 1484 */
1475static void task_wake_up_rt(struct rq *rq, struct task_struct *p) 1485static void task_woken_rt(struct rq *rq, struct task_struct *p)
1476{ 1486{
1477 if (!task_running(rq, p) && 1487 if (!task_running(rq, p) &&
1478 !test_tsk_need_resched(rq->curr) && 1488 !test_tsk_need_resched(rq->curr) &&
@@ -1481,24 +1491,6 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1491 push_rt_tasks(rq);
1482} 1492}
1483 1493
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1494static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1495 const struct cpumask *new_mask)
1504{ 1496{
@@ -1721,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1713 dequeue_pushable_task(rq, p);
1722} 1714}
1723 1715
1724unsigned int get_rr_interval_rt(struct task_struct *task) 1716static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1717{
1726 /* 1718 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1719 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,14 +1738,12 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1738#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1739 .select_task_rq = select_task_rq_rt,
1748 1740
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1741 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1742 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1743 .rq_offline = rq_offline_rt,
1754 .pre_schedule = pre_schedule_rt, 1744 .pre_schedule = pre_schedule_rt,
1755 .post_schedule = post_schedule_rt, 1745 .post_schedule = post_schedule_rt,
1756 .task_wake_up = task_wake_up_rt, 1746 .task_woken = task_woken_rt,
1757 .switched_from = switched_from_rt, 1747 .switched_from = switched_from_rt,
1758#endif 1748#endif
1759 1749
diff --git a/kernel/signal.c b/kernel/signal.c
index 6b982f2cf524..934ae5e687b9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -218,13 +218,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
218 struct user_struct *user; 218 struct user_struct *user;
219 219
220 /* 220 /*
221 * We won't get problems with the target's UID changing under us 221 * Protect access to @t credentials. This can go away when all
222 * because changing it requires RCU be used, and if t != current, the 222 * callers hold rcu read lock.
223 * caller must be holding the RCU readlock (by way of a spinlock) and
224 * we use RCU protection here
225 */ 223 */
224 rcu_read_lock();
226 user = get_uid(__task_cred(t)->user); 225 user = get_uid(__task_cred(t)->user);
227 atomic_inc(&user->sigpending); 226 atomic_inc(&user->sigpending);
227 rcu_read_unlock();
228 228
229 if (override_rlimit || 229 if (override_rlimit ||
230 atomic_read(&user->sigpending) <= 230 atomic_read(&user->sigpending) <=
@@ -423,7 +423,7 @@ still_pending:
423 */ 423 */
424 info->si_signo = sig; 424 info->si_signo = sig;
425 info->si_errno = 0; 425 info->si_errno = 0;
426 info->si_code = 0; 426 info->si_code = SI_USER;
427 info->si_pid = 0; 427 info->si_pid = 0;
428 info->si_uid = 0; 428 info->si_uid = 0;
429 } 429 }
@@ -607,6 +607,17 @@ static int rm_from_queue(unsigned long mask, struct sigpending *s)
607 return 1; 607 return 1;
608} 608}
609 609
610static inline int is_si_special(const struct siginfo *info)
611{
612 return info <= SEND_SIG_FORCED;
613}
614
615static inline bool si_fromuser(const struct siginfo *info)
616{
617 return info == SEND_SIG_NOINFO ||
618 (!is_si_special(info) && SI_FROMUSER(info));
619}
620
610/* 621/*
611 * Bad permissions for sending the signal 622 * Bad permissions for sending the signal
612 * - the caller must hold at least the RCU read lock 623 * - the caller must hold at least the RCU read lock
@@ -621,7 +632,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
621 if (!valid_signal(sig)) 632 if (!valid_signal(sig))
622 return -EINVAL; 633 return -EINVAL;
623 634
624 if (info != SEND_SIG_NOINFO && (is_si_special(info) || SI_FROMKERNEL(info))) 635 if (!si_fromuser(info))
625 return 0; 636 return 0;
626 637
627 error = audit_signal_info(sig, t); /* Let audit system see the signal */ 638 error = audit_signal_info(sig, t); /* Let audit system see the signal */
@@ -949,9 +960,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
949 int from_ancestor_ns = 0; 960 int from_ancestor_ns = 0;
950 961
951#ifdef CONFIG_PID_NS 962#ifdef CONFIG_PID_NS
952 if (!is_si_special(info) && SI_FROMUSER(info) && 963 from_ancestor_ns = si_fromuser(info) &&
953 task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0) 964 !task_pid_nr_ns(current, task_active_pid_ns(t));
954 from_ancestor_ns = 1;
955#endif 965#endif
956 966
957 return __send_signal(sig, info, t, group, from_ancestor_ns); 967 return __send_signal(sig, info, t, group, from_ancestor_ns);
@@ -969,7 +979,8 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
969 for (i = 0; i < 16; i++) { 979 for (i = 0; i < 16; i++) {
970 unsigned char insn; 980 unsigned char insn;
971 981
972 __get_user(insn, (unsigned char *)(regs->ip + i)); 982 if (get_user(insn, (unsigned char *)(regs->ip + i)))
983 break;
973 printk("%02x ", insn); 984 printk("%02x ", insn);
974 } 985 }
975 } 986 }
@@ -1052,12 +1063,6 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1052 return ret; 1063 return ret;
1053} 1064}
1054 1065
1055void
1056force_sig_specific(int sig, struct task_struct *t)
1057{
1058 force_sig_info(sig, SEND_SIG_FORCED, t);
1059}
1060
1061/* 1066/*
1062 * Nuke all other threads in the group. 1067 * Nuke all other threads in the group.
1063 */ 1068 */
@@ -1175,19 +1180,19 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1175 int ret = -EINVAL; 1180 int ret = -EINVAL;
1176 struct task_struct *p; 1181 struct task_struct *p;
1177 const struct cred *pcred; 1182 const struct cred *pcred;
1183 unsigned long flags;
1178 1184
1179 if (!valid_signal(sig)) 1185 if (!valid_signal(sig))
1180 return ret; 1186 return ret;
1181 1187
1182 read_lock(&tasklist_lock); 1188 rcu_read_lock();
1183 p = pid_task(pid, PIDTYPE_PID); 1189 p = pid_task(pid, PIDTYPE_PID);
1184 if (!p) { 1190 if (!p) {
1185 ret = -ESRCH; 1191 ret = -ESRCH;
1186 goto out_unlock; 1192 goto out_unlock;
1187 } 1193 }
1188 pcred = __task_cred(p); 1194 pcred = __task_cred(p);
1189 if ((info == SEND_SIG_NOINFO || 1195 if (si_fromuser(info) &&
1190 (!is_si_special(info) && SI_FROMUSER(info))) &&
1191 euid != pcred->suid && euid != pcred->uid && 1196 euid != pcred->suid && euid != pcred->uid &&
1192 uid != pcred->suid && uid != pcred->uid) { 1197 uid != pcred->suid && uid != pcred->uid) {
1193 ret = -EPERM; 1198 ret = -EPERM;
@@ -1196,14 +1201,16 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1196 ret = security_task_kill(p, info, sig, secid); 1201 ret = security_task_kill(p, info, sig, secid);
1197 if (ret) 1202 if (ret)
1198 goto out_unlock; 1203 goto out_unlock;
1199 if (sig && p->sighand) { 1204
1200 unsigned long flags; 1205 if (sig) {
1201 spin_lock_irqsave(&p->sighand->siglock, flags); 1206 if (lock_task_sighand(p, &flags)) {
1202 ret = __send_signal(sig, info, p, 1, 0); 1207 ret = __send_signal(sig, info, p, 1, 0);
1203 spin_unlock_irqrestore(&p->sighand->siglock, flags); 1208 unlock_task_sighand(p, &flags);
1209 } else
1210 ret = -ESRCH;
1204 } 1211 }
1205out_unlock: 1212out_unlock:
1206 read_unlock(&tasklist_lock); 1213 rcu_read_unlock();
1207 return ret; 1214 return ret;
1208} 1215}
1209EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1216EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
@@ -1837,11 +1844,6 @@ relock:
1837 1844
1838 for (;;) { 1845 for (;;) {
1839 struct k_sigaction *ka; 1846 struct k_sigaction *ka;
1840
1841 if (unlikely(signal->group_stop_count > 0) &&
1842 do_signal_stop(0))
1843 goto relock;
1844
1845 /* 1847 /*
1846 * Tracing can induce an artifical signal and choose sigaction. 1848 * Tracing can induce an artifical signal and choose sigaction.
1847 * The return value in @signr determines the default action, 1849 * The return value in @signr determines the default action,
@@ -1853,6 +1855,10 @@ relock:
1853 if (unlikely(signr != 0)) 1855 if (unlikely(signr != 0))
1854 ka = return_ka; 1856 ka = return_ka;
1855 else { 1857 else {
1858 if (unlikely(signal->group_stop_count > 0) &&
1859 do_signal_stop(0))
1860 goto relock;
1861
1856 signr = dequeue_signal(current, &current->blocked, 1862 signr = dequeue_signal(current, &current->blocked,
1857 info); 1863 info);
1858 1864
diff --git a/kernel/smp.c b/kernel/smp.c
index a8c76069cf50..9867b6bfefce 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -12,15 +12,13 @@
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/cpu.h> 13#include <linux/cpu.h>
14 14
15static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
16
17static struct { 15static struct {
18 struct list_head queue; 16 struct list_head queue;
19 spinlock_t lock; 17 raw_spinlock_t lock;
20} call_function __cacheline_aligned_in_smp = 18} call_function __cacheline_aligned_in_smp =
21 { 19 {
22 .queue = LIST_HEAD_INIT(call_function.queue), 20 .queue = LIST_HEAD_INIT(call_function.queue),
23 .lock = __SPIN_LOCK_UNLOCKED(call_function.lock), 21 .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock),
24 }; 22 };
25 23
26enum { 24enum {
@@ -33,12 +31,14 @@ struct call_function_data {
33 cpumask_var_t cpumask; 31 cpumask_var_t cpumask;
34}; 32};
35 33
34static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
35
36struct call_single_queue { 36struct call_single_queue {
37 struct list_head list; 37 struct list_head list;
38 spinlock_t lock; 38 raw_spinlock_t lock;
39}; 39};
40 40
41static DEFINE_PER_CPU(struct call_function_data, cfd_data); 41static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue);
42 42
43static int 43static int
44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) 44hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
@@ -80,7 +80,7 @@ static int __cpuinit init_call_single_data(void)
80 for_each_possible_cpu(i) { 80 for_each_possible_cpu(i) {
81 struct call_single_queue *q = &per_cpu(call_single_queue, i); 81 struct call_single_queue *q = &per_cpu(call_single_queue, i);
82 82
83 spin_lock_init(&q->lock); 83 raw_spin_lock_init(&q->lock);
84 INIT_LIST_HEAD(&q->list); 84 INIT_LIST_HEAD(&q->list);
85 } 85 }
86 86
@@ -141,10 +141,10 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
141 unsigned long flags; 141 unsigned long flags;
142 int ipi; 142 int ipi;
143 143
144 spin_lock_irqsave(&dst->lock, flags); 144 raw_spin_lock_irqsave(&dst->lock, flags);
145 ipi = list_empty(&dst->list); 145 ipi = list_empty(&dst->list);
146 list_add_tail(&data->list, &dst->list); 146 list_add_tail(&data->list, &dst->list);
147 spin_unlock_irqrestore(&dst->lock, flags); 147 raw_spin_unlock_irqrestore(&dst->lock, flags);
148 148
149 /* 149 /*
150 * The list addition should be visible before sending the IPI 150 * The list addition should be visible before sending the IPI
@@ -171,7 +171,7 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
171void generic_smp_call_function_interrupt(void) 171void generic_smp_call_function_interrupt(void)
172{ 172{
173 struct call_function_data *data; 173 struct call_function_data *data;
174 int cpu = get_cpu(); 174 int cpu = smp_processor_id();
175 175
176 /* 176 /*
177 * Shouldn't receive this interrupt on a cpu that is not yet online. 177 * Shouldn't receive this interrupt on a cpu that is not yet online.
@@ -201,9 +201,9 @@ void generic_smp_call_function_interrupt(void)
201 refs = atomic_dec_return(&data->refs); 201 refs = atomic_dec_return(&data->refs);
202 WARN_ON(refs < 0); 202 WARN_ON(refs < 0);
203 if (!refs) { 203 if (!refs) {
204 spin_lock(&call_function.lock); 204 raw_spin_lock(&call_function.lock);
205 list_del_rcu(&data->csd.list); 205 list_del_rcu(&data->csd.list);
206 spin_unlock(&call_function.lock); 206 raw_spin_unlock(&call_function.lock);
207 } 207 }
208 208
209 if (refs) 209 if (refs)
@@ -212,7 +212,6 @@ void generic_smp_call_function_interrupt(void)
212 csd_unlock(&data->csd); 212 csd_unlock(&data->csd);
213 } 213 }
214 214
215 put_cpu();
216} 215}
217 216
218/* 217/*
@@ -230,9 +229,9 @@ void generic_smp_call_function_single_interrupt(void)
230 */ 229 */
231 WARN_ON_ONCE(!cpu_online(smp_processor_id())); 230 WARN_ON_ONCE(!cpu_online(smp_processor_id()));
232 231
233 spin_lock(&q->lock); 232 raw_spin_lock(&q->lock);
234 list_replace_init(&q->list, &list); 233 list_replace_init(&q->list, &list);
235 spin_unlock(&q->lock); 234 raw_spin_unlock(&q->lock);
236 235
237 while (!list_empty(&list)) { 236 while (!list_empty(&list)) {
238 struct call_single_data *data; 237 struct call_single_data *data;
@@ -257,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void)
257 } 256 }
258} 257}
259 258
260static DEFINE_PER_CPU(struct call_single_data, csd_data); 259static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
261 260
262/* 261/*
263 * smp_call_function_single - Run a function on a specific CPU 262 * smp_call_function_single - Run a function on a specific CPU
@@ -348,7 +347,7 @@ int smp_call_function_any(const struct cpumask *mask,
348 goto call; 347 goto call;
349 348
350 /* Try for same node. */ 349 /* Try for same node. */
351 nodemask = cpumask_of_node(cpu); 350 nodemask = cpumask_of_node(cpu_to_node(cpu));
352 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; 351 for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids;
353 cpu = cpumask_next_and(cpu, nodemask, mask)) { 352 cpu = cpumask_next_and(cpu, nodemask, mask)) {
354 if (cpu_online(cpu)) 353 if (cpu_online(cpu))
@@ -449,14 +448,14 @@ void smp_call_function_many(const struct cpumask *mask,
449 cpumask_clear_cpu(this_cpu, data->cpumask); 448 cpumask_clear_cpu(this_cpu, data->cpumask);
450 atomic_set(&data->refs, cpumask_weight(data->cpumask)); 449 atomic_set(&data->refs, cpumask_weight(data->cpumask));
451 450
452 spin_lock_irqsave(&call_function.lock, flags); 451 raw_spin_lock_irqsave(&call_function.lock, flags);
453 /* 452 /*
454 * Place entry at the _HEAD_ of the list, so that any cpu still 453 * Place entry at the _HEAD_ of the list, so that any cpu still
455 * observing the entry in generic_smp_call_function_interrupt() 454 * observing the entry in generic_smp_call_function_interrupt()
456 * will not miss any other list entries: 455 * will not miss any other list entries:
457 */ 456 */
458 list_add_rcu(&data->csd.list, &call_function.queue); 457 list_add_rcu(&data->csd.list, &call_function.queue);
459 spin_unlock_irqrestore(&call_function.lock, flags); 458 raw_spin_unlock_irqrestore(&call_function.lock, flags);
460 459
461 /* 460 /*
462 * Make the list addition visible before sending the ipi. 461 * Make the list addition visible before sending the ipi.
@@ -501,20 +500,20 @@ EXPORT_SYMBOL(smp_call_function);
501 500
502void ipi_call_lock(void) 501void ipi_call_lock(void)
503{ 502{
504 spin_lock(&call_function.lock); 503 raw_spin_lock(&call_function.lock);
505} 504}
506 505
507void ipi_call_unlock(void) 506void ipi_call_unlock(void)
508{ 507{
509 spin_unlock(&call_function.lock); 508 raw_spin_unlock(&call_function.lock);
510} 509}
511 510
512void ipi_call_lock_irq(void) 511void ipi_call_lock_irq(void)
513{ 512{
514 spin_lock_irq(&call_function.lock); 513 raw_spin_lock_irq(&call_function.lock);
515} 514}
516 515
517void ipi_call_unlock_irq(void) 516void ipi_call_unlock_irq(void)
518{ 517{
519 spin_unlock_irq(&call_function.lock); 518 raw_spin_unlock_irq(&call_function.lock);
520} 519}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 21939d9e830e..7c1a67ef0274 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
@@ -697,7 +692,7 @@ void __init softirq_init(void)
697 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 692 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
698} 693}
699 694
700static int ksoftirqd(void * __bind_cpu) 695static int run_ksoftirqd(void * __bind_cpu)
701{ 696{
702 set_current_state(TASK_INTERRUPTIBLE); 697 set_current_state(TASK_INTERRUPTIBLE);
703 698
@@ -810,7 +805,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
810 switch (action) { 805 switch (action) {
811 case CPU_UP_PREPARE: 806 case CPU_UP_PREPARE:
812 case CPU_UP_PREPARE_FROZEN: 807 case CPU_UP_PREPARE_FROZEN:
813 p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
814 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
815 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
816 return NOTIFY_BAD; 811 return NOTIFY_BAD;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 81324d12eb35..0d4c7898ab80 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -22,9 +22,10 @@
22 22
23static DEFINE_SPINLOCK(print_lock); 23static DEFINE_SPINLOCK(print_lock);
24 24
25static DEFINE_PER_CPU(unsigned long, touch_timestamp); 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, print_timestamp); 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -70,22 +71,28 @@ static void __touch_softlockup_watchdog(void)
70{ 71{
71 int this_cpu = raw_smp_processor_id(); 72 int this_cpu = raw_smp_processor_id();
72 73
73 __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); 74 __raw_get_cpu_var(softlockup_touch_ts) = get_timestamp(this_cpu);
74} 75}
75 76
76void touch_softlockup_watchdog(void) 77void touch_softlockup_watchdog(void)
77{ 78{
78 __raw_get_cpu_var(touch_timestamp) = 0; 79 __raw_get_cpu_var(softlockup_touch_ts) = 0;
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
85 92
86 /* Cause each CPU to re-update its timestamp rather than complain */ 93 /* Cause each CPU to re-update its timestamp rather than complain */
87 for_each_online_cpu(cpu) 94 for_each_online_cpu(cpu)
88 per_cpu(touch_timestamp, cpu) = 0; 95 per_cpu(softlockup_touch_ts, cpu) = 0;
89} 96}
90EXPORT_SYMBOL(touch_all_softlockup_watchdogs); 97EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
91 98
@@ -104,28 +111,36 @@ int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
104void softlockup_tick(void) 111void softlockup_tick(void)
105{ 112{
106 int this_cpu = smp_processor_id(); 113 int this_cpu = smp_processor_id();
107 unsigned long touch_timestamp = per_cpu(touch_timestamp, this_cpu); 114 unsigned long touch_ts = per_cpu(softlockup_touch_ts, this_cpu);
108 unsigned long print_timestamp; 115 unsigned long print_ts;
109 struct pt_regs *regs = get_irq_regs(); 116 struct pt_regs *regs = get_irq_regs();
110 unsigned long now; 117 unsigned long now;
111 118
112 /* Is detection switched off? */ 119 /* Is detection switched off? */
113 if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) { 120 if (!per_cpu(softlockup_watchdog, this_cpu) || softlockup_thresh <= 0) {
114 /* Be sure we don't false trigger if switched back on */ 121 /* Be sure we don't false trigger if switched back on */
115 if (touch_timestamp) 122 if (touch_ts)
116 per_cpu(touch_timestamp, this_cpu) = 0; 123 per_cpu(softlockup_touch_ts, this_cpu) = 0;
117 return; 124 return;
118 } 125 }
119 126
120 if (touch_timestamp == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
124 139
125 print_timestamp = per_cpu(print_timestamp, this_cpu); 140 print_ts = per_cpu(softlockup_print_ts, this_cpu);
126 141
127 /* report at most once a second */ 142 /* report at most once a second */
128 if (print_timestamp == touch_timestamp || did_panic) 143 if (print_ts == touch_ts || did_panic)
129 return; 144 return;
130 145
131 /* do not print during early bootup: */ 146 /* do not print during early bootup: */
@@ -140,18 +155,18 @@ void softlockup_tick(void)
140 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
141 * threshold timespan. 156 * threshold timespan.
142 */ 157 */
143 if (now > touch_timestamp + softlockup_thresh/2) 158 if (now > touch_ts + softlockup_thresh/2)
144 wake_up_process(per_cpu(watchdog_task, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
145 160
146 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
147 if (now <= (touch_timestamp + softlockup_thresh)) 162 if (now <= (touch_ts + softlockup_thresh))
148 return; 163 return;
149 164
150 per_cpu(print_timestamp, this_cpu) = touch_timestamp; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
151 166
152 spin_lock(&print_lock); 167 spin_lock(&print_lock);
153 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 168 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
154 this_cpu, now - touch_timestamp, 169 this_cpu, now - touch_ts,
155 current->comm, task_pid_nr(current)); 170 current->comm, task_pid_nr(current));
156 print_modules(); 171 print_modules();
157 print_irqtrace_events(current); 172 print_irqtrace_events(current);
@@ -209,32 +224,32 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
209 switch (action) { 224 switch (action) {
210 case CPU_UP_PREPARE: 225 case CPU_UP_PREPARE:
211 case CPU_UP_PREPARE_FROZEN: 226 case CPU_UP_PREPARE_FROZEN:
212 BUG_ON(per_cpu(watchdog_task, hotcpu)); 227 BUG_ON(per_cpu(softlockup_watchdog, hotcpu));
213 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu); 228 p = kthread_create(watchdog, hcpu, "watchdog/%d", hotcpu);
214 if (IS_ERR(p)) { 229 if (IS_ERR(p)) {
215 printk(KERN_ERR "watchdog for %i failed\n", hotcpu); 230 printk(KERN_ERR "watchdog for %i failed\n", hotcpu);
216 return NOTIFY_BAD; 231 return NOTIFY_BAD;
217 } 232 }
218 per_cpu(touch_timestamp, hotcpu) = 0; 233 per_cpu(softlockup_touch_ts, hotcpu) = 0;
219 per_cpu(watchdog_task, hotcpu) = p; 234 per_cpu(softlockup_watchdog, hotcpu) = p;
220 kthread_bind(p, hotcpu); 235 kthread_bind(p, hotcpu);
221 break; 236 break;
222 case CPU_ONLINE: 237 case CPU_ONLINE:
223 case CPU_ONLINE_FROZEN: 238 case CPU_ONLINE_FROZEN:
224 wake_up_process(per_cpu(watchdog_task, hotcpu)); 239 wake_up_process(per_cpu(softlockup_watchdog, hotcpu));
225 break; 240 break;
226#ifdef CONFIG_HOTPLUG_CPU 241#ifdef CONFIG_HOTPLUG_CPU
227 case CPU_UP_CANCELED: 242 case CPU_UP_CANCELED:
228 case CPU_UP_CANCELED_FROZEN: 243 case CPU_UP_CANCELED_FROZEN:
229 if (!per_cpu(watchdog_task, hotcpu)) 244 if (!per_cpu(softlockup_watchdog, hotcpu))
230 break; 245 break;
231 /* Unbind so it can run. Fall thru. */ 246 /* Unbind so it can run. Fall thru. */
232 kthread_bind(per_cpu(watchdog_task, hotcpu), 247 kthread_bind(per_cpu(softlockup_watchdog, hotcpu),
233 cpumask_any(cpu_online_mask)); 248 cpumask_any(cpu_online_mask));
234 case CPU_DEAD: 249 case CPU_DEAD:
235 case CPU_DEAD_FROZEN: 250 case CPU_DEAD_FROZEN:
236 p = per_cpu(watchdog_task, hotcpu); 251 p = per_cpu(softlockup_watchdog, hotcpu);
237 per_cpu(watchdog_task, hotcpu) = NULL; 252 per_cpu(softlockup_watchdog, hotcpu) = NULL;
238 kthread_stop(p); 253 kthread_stop(p);
239 break; 254 break;
240#endif /* CONFIG_HOTPLUG_CPU */ 255#endif /* CONFIG_HOTPLUG_CPU */
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 41e042219ff6..be6517fb9c14 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -32,6 +32,8 @@
32 * include/linux/spinlock_api_smp.h 32 * include/linux/spinlock_api_smp.h
33 */ 33 */
34#else 34#else
35#define raw_read_can_lock(l) read_can_lock(l)
36#define raw_write_can_lock(l) write_can_lock(l)
35/* 37/*
36 * We build the __lock_function inlines here. They are too large for 38 * We build the __lock_function inlines here. They are too large for
37 * inlining all over the place, but here is only one user per function 39 * inlining all over the place, but here is only one user per function
@@ -42,49 +44,49 @@
42 * towards that other CPU that it should break the lock ASAP. 44 * towards that other CPU that it should break the lock ASAP.
43 */ 45 */
44#define BUILD_LOCK_OPS(op, locktype) \ 46#define BUILD_LOCK_OPS(op, locktype) \
45void __lockfunc __##op##_lock(locktype##_t *lock) \ 47void __lockfunc __raw_##op##_lock(locktype##_t *lock) \
46{ \ 48{ \
47 for (;;) { \ 49 for (;;) { \
48 preempt_disable(); \ 50 preempt_disable(); \
49 if (likely(_raw_##op##_trylock(lock))) \ 51 if (likely(do_raw_##op##_trylock(lock))) \
50 break; \ 52 break; \
51 preempt_enable(); \ 53 preempt_enable(); \
52 \ 54 \
53 if (!(lock)->break_lock) \ 55 if (!(lock)->break_lock) \
54 (lock)->break_lock = 1; \ 56 (lock)->break_lock = 1; \
55 while (!op##_can_lock(lock) && (lock)->break_lock) \ 57 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
56 _raw_##op##_relax(&lock->raw_lock); \ 58 arch_##op##_relax(&lock->raw_lock); \
57 } \ 59 } \
58 (lock)->break_lock = 0; \ 60 (lock)->break_lock = 0; \
59} \ 61} \
60 \ 62 \
61unsigned long __lockfunc __##op##_lock_irqsave(locktype##_t *lock) \ 63unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \
62{ \ 64{ \
63 unsigned long flags; \ 65 unsigned long flags; \
64 \ 66 \
65 for (;;) { \ 67 for (;;) { \
66 preempt_disable(); \ 68 preempt_disable(); \
67 local_irq_save(flags); \ 69 local_irq_save(flags); \
68 if (likely(_raw_##op##_trylock(lock))) \ 70 if (likely(do_raw_##op##_trylock(lock))) \
69 break; \ 71 break; \
70 local_irq_restore(flags); \ 72 local_irq_restore(flags); \
71 preempt_enable(); \ 73 preempt_enable(); \
72 \ 74 \
73 if (!(lock)->break_lock) \ 75 if (!(lock)->break_lock) \
74 (lock)->break_lock = 1; \ 76 (lock)->break_lock = 1; \
75 while (!op##_can_lock(lock) && (lock)->break_lock) \ 77 while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\
76 _raw_##op##_relax(&lock->raw_lock); \ 78 arch_##op##_relax(&lock->raw_lock); \
77 } \ 79 } \
78 (lock)->break_lock = 0; \ 80 (lock)->break_lock = 0; \
79 return flags; \ 81 return flags; \
80} \ 82} \
81 \ 83 \
82void __lockfunc __##op##_lock_irq(locktype##_t *lock) \ 84void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \
83{ \ 85{ \
84 _##op##_lock_irqsave(lock); \ 86 _raw_##op##_lock_irqsave(lock); \
85} \ 87} \
86 \ 88 \
87void __lockfunc __##op##_lock_bh(locktype##_t *lock) \ 89void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \
88{ \ 90{ \
89 unsigned long flags; \ 91 unsigned long flags; \
90 \ 92 \
@@ -93,7 +95,7 @@ void __lockfunc __##op##_lock_bh(locktype##_t *lock) \
93 /* irq-disabling. We use the generic preemption-aware */ \ 95 /* irq-disabling. We use the generic preemption-aware */ \
94 /* function: */ \ 96 /* function: */ \
95 /**/ \ 97 /**/ \
96 flags = _##op##_lock_irqsave(lock); \ 98 flags = _raw_##op##_lock_irqsave(lock); \
97 local_bh_disable(); \ 99 local_bh_disable(); \
98 local_irq_restore(flags); \ 100 local_irq_restore(flags); \
99} \ 101} \
@@ -107,269 +109,269 @@ void __lockfunc __##op##_lock_bh(locktype##_t *lock) \
107 * __[spin|read|write]_lock_irqsave() 109 * __[spin|read|write]_lock_irqsave()
108 * __[spin|read|write]_lock_bh() 110 * __[spin|read|write]_lock_bh()
109 */ 111 */
110BUILD_LOCK_OPS(spin, spinlock); 112BUILD_LOCK_OPS(spin, raw_spinlock);
111BUILD_LOCK_OPS(read, rwlock); 113BUILD_LOCK_OPS(read, rwlock);
112BUILD_LOCK_OPS(write, rwlock); 114BUILD_LOCK_OPS(write, rwlock);
113 115
114#endif 116#endif
115 117
116#ifdef CONFIG_DEBUG_LOCK_ALLOC 118#ifndef CONFIG_INLINE_SPIN_TRYLOCK
117 119int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
118void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
119{ 120{
120 preempt_disable(); 121 return __raw_spin_trylock(lock);
121 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
122 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
123} 122}
124EXPORT_SYMBOL(_spin_lock_nested); 123EXPORT_SYMBOL(_raw_spin_trylock);
124#endif
125 125
126unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, 126#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH
127 int subclass) 127int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
128{ 128{
129 unsigned long flags; 129 return __raw_spin_trylock_bh(lock);
130
131 local_irq_save(flags);
132 preempt_disable();
133 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
134 LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
135 _raw_spin_lock_flags, &flags);
136 return flags;
137} 130}
138EXPORT_SYMBOL(_spin_lock_irqsave_nested); 131EXPORT_SYMBOL(_raw_spin_trylock_bh);
132#endif
139 133
140void __lockfunc _spin_lock_nest_lock(spinlock_t *lock, 134#ifndef CONFIG_INLINE_SPIN_LOCK
141 struct lockdep_map *nest_lock) 135void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
142{ 136{
143 preempt_disable(); 137 __raw_spin_lock(lock);
144 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
145 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
146} 138}
147EXPORT_SYMBOL(_spin_lock_nest_lock); 139EXPORT_SYMBOL(_raw_spin_lock);
148
149#endif 140#endif
150 141
151#ifndef CONFIG_INLINE_SPIN_TRYLOCK 142#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE
152int __lockfunc _spin_trylock(spinlock_t *lock) 143unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
153{ 144{
154 return __spin_trylock(lock); 145 return __raw_spin_lock_irqsave(lock);
155} 146}
156EXPORT_SYMBOL(_spin_trylock); 147EXPORT_SYMBOL(_raw_spin_lock_irqsave);
157#endif 148#endif
158 149
159#ifndef CONFIG_INLINE_READ_TRYLOCK 150#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ
160int __lockfunc _read_trylock(rwlock_t *lock) 151void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
161{ 152{
162 return __read_trylock(lock); 153 __raw_spin_lock_irq(lock);
163} 154}
164EXPORT_SYMBOL(_read_trylock); 155EXPORT_SYMBOL(_raw_spin_lock_irq);
165#endif 156#endif
166 157
167#ifndef CONFIG_INLINE_WRITE_TRYLOCK 158#ifndef CONFIG_INLINE_SPIN_LOCK_BH
168int __lockfunc _write_trylock(rwlock_t *lock) 159void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
169{ 160{
170 return __write_trylock(lock); 161 __raw_spin_lock_bh(lock);
171} 162}
172EXPORT_SYMBOL(_write_trylock); 163EXPORT_SYMBOL(_raw_spin_lock_bh);
173#endif 164#endif
174 165
175#ifndef CONFIG_INLINE_READ_LOCK 166#ifndef CONFIG_INLINE_SPIN_UNLOCK
176void __lockfunc _read_lock(rwlock_t *lock) 167void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
177{ 168{
178 __read_lock(lock); 169 __raw_spin_unlock(lock);
179} 170}
180EXPORT_SYMBOL(_read_lock); 171EXPORT_SYMBOL(_raw_spin_unlock);
181#endif 172#endif
182 173
183#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE 174#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE
184unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock) 175void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
185{ 176{
186 return __spin_lock_irqsave(lock); 177 __raw_spin_unlock_irqrestore(lock, flags);
187} 178}
188EXPORT_SYMBOL(_spin_lock_irqsave); 179EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
189#endif 180#endif
190 181
191#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ 182#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ
192void __lockfunc _spin_lock_irq(spinlock_t *lock) 183void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
193{ 184{
194 __spin_lock_irq(lock); 185 __raw_spin_unlock_irq(lock);
195} 186}
196EXPORT_SYMBOL(_spin_lock_irq); 187EXPORT_SYMBOL(_raw_spin_unlock_irq);
197#endif 188#endif
198 189
199#ifndef CONFIG_INLINE_SPIN_LOCK_BH 190#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH
200void __lockfunc _spin_lock_bh(spinlock_t *lock) 191void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
201{ 192{
202 __spin_lock_bh(lock); 193 __raw_spin_unlock_bh(lock);
203} 194}
204EXPORT_SYMBOL(_spin_lock_bh); 195EXPORT_SYMBOL(_raw_spin_unlock_bh);
205#endif 196#endif
206 197
207#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE 198#ifndef CONFIG_INLINE_READ_TRYLOCK
208unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock) 199int __lockfunc _raw_read_trylock(rwlock_t *lock)
209{ 200{
210 return __read_lock_irqsave(lock); 201 return __raw_read_trylock(lock);
211} 202}
212EXPORT_SYMBOL(_read_lock_irqsave); 203EXPORT_SYMBOL(_raw_read_trylock);
213#endif 204#endif
214 205
215#ifndef CONFIG_INLINE_READ_LOCK_IRQ 206#ifndef CONFIG_INLINE_READ_LOCK
216void __lockfunc _read_lock_irq(rwlock_t *lock) 207void __lockfunc _raw_read_lock(rwlock_t *lock)
217{ 208{
218 __read_lock_irq(lock); 209 __raw_read_lock(lock);
219} 210}
220EXPORT_SYMBOL(_read_lock_irq); 211EXPORT_SYMBOL(_raw_read_lock);
221#endif 212#endif
222 213
223#ifndef CONFIG_INLINE_READ_LOCK_BH 214#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE
224void __lockfunc _read_lock_bh(rwlock_t *lock) 215unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock)
225{ 216{
226 __read_lock_bh(lock); 217 return __raw_read_lock_irqsave(lock);
227} 218}
228EXPORT_SYMBOL(_read_lock_bh); 219EXPORT_SYMBOL(_raw_read_lock_irqsave);
229#endif 220#endif
230 221
231#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE 222#ifndef CONFIG_INLINE_READ_LOCK_IRQ
232unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock) 223void __lockfunc _raw_read_lock_irq(rwlock_t *lock)
233{ 224{
234 return __write_lock_irqsave(lock); 225 __raw_read_lock_irq(lock);
235} 226}
236EXPORT_SYMBOL(_write_lock_irqsave); 227EXPORT_SYMBOL(_raw_read_lock_irq);
237#endif 228#endif
238 229
239#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ 230#ifndef CONFIG_INLINE_READ_LOCK_BH
240void __lockfunc _write_lock_irq(rwlock_t *lock) 231void __lockfunc _raw_read_lock_bh(rwlock_t *lock)
241{ 232{
242 __write_lock_irq(lock); 233 __raw_read_lock_bh(lock);
243} 234}
244EXPORT_SYMBOL(_write_lock_irq); 235EXPORT_SYMBOL(_raw_read_lock_bh);
245#endif 236#endif
246 237
247#ifndef CONFIG_INLINE_WRITE_LOCK_BH 238#ifndef CONFIG_INLINE_READ_UNLOCK
248void __lockfunc _write_lock_bh(rwlock_t *lock) 239void __lockfunc _raw_read_unlock(rwlock_t *lock)
249{ 240{
250 __write_lock_bh(lock); 241 __raw_read_unlock(lock);
251} 242}
252EXPORT_SYMBOL(_write_lock_bh); 243EXPORT_SYMBOL(_raw_read_unlock);
253#endif 244#endif
254 245
255#ifndef CONFIG_INLINE_SPIN_LOCK 246#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE
256void __lockfunc _spin_lock(spinlock_t *lock) 247void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
257{ 248{
258 __spin_lock(lock); 249 __raw_read_unlock_irqrestore(lock, flags);
259} 250}
260EXPORT_SYMBOL(_spin_lock); 251EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
261#endif 252#endif
262 253
263#ifndef CONFIG_INLINE_WRITE_LOCK 254#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ
264void __lockfunc _write_lock(rwlock_t *lock) 255void __lockfunc _raw_read_unlock_irq(rwlock_t *lock)
265{ 256{
266 __write_lock(lock); 257 __raw_read_unlock_irq(lock);
267} 258}
268EXPORT_SYMBOL(_write_lock); 259EXPORT_SYMBOL(_raw_read_unlock_irq);
269#endif 260#endif
270 261
271#ifndef CONFIG_INLINE_SPIN_UNLOCK 262#ifndef CONFIG_INLINE_READ_UNLOCK_BH
272void __lockfunc _spin_unlock(spinlock_t *lock) 263void __lockfunc _raw_read_unlock_bh(rwlock_t *lock)
273{ 264{
274 __spin_unlock(lock); 265 __raw_read_unlock_bh(lock);
275} 266}
276EXPORT_SYMBOL(_spin_unlock); 267EXPORT_SYMBOL(_raw_read_unlock_bh);
277#endif 268#endif
278 269
279#ifndef CONFIG_INLINE_WRITE_UNLOCK 270#ifndef CONFIG_INLINE_WRITE_TRYLOCK
280void __lockfunc _write_unlock(rwlock_t *lock) 271int __lockfunc _raw_write_trylock(rwlock_t *lock)
281{ 272{
282 __write_unlock(lock); 273 return __raw_write_trylock(lock);
283} 274}
284EXPORT_SYMBOL(_write_unlock); 275EXPORT_SYMBOL(_raw_write_trylock);
285#endif 276#endif
286 277
287#ifndef CONFIG_INLINE_READ_UNLOCK 278#ifndef CONFIG_INLINE_WRITE_LOCK
288void __lockfunc _read_unlock(rwlock_t *lock) 279void __lockfunc _raw_write_lock(rwlock_t *lock)
289{ 280{
290 __read_unlock(lock); 281 __raw_write_lock(lock);
291} 282}
292EXPORT_SYMBOL(_read_unlock); 283EXPORT_SYMBOL(_raw_write_lock);
293#endif 284#endif
294 285
295#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE 286#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE
296void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 287unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock)
297{ 288{
298 __spin_unlock_irqrestore(lock, flags); 289 return __raw_write_lock_irqsave(lock);
299} 290}
300EXPORT_SYMBOL(_spin_unlock_irqrestore); 291EXPORT_SYMBOL(_raw_write_lock_irqsave);
301#endif 292#endif
302 293
303#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ 294#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ
304void __lockfunc _spin_unlock_irq(spinlock_t *lock) 295void __lockfunc _raw_write_lock_irq(rwlock_t *lock)
305{ 296{
306 __spin_unlock_irq(lock); 297 __raw_write_lock_irq(lock);
307} 298}
308EXPORT_SYMBOL(_spin_unlock_irq); 299EXPORT_SYMBOL(_raw_write_lock_irq);
309#endif 300#endif
310 301
311#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH 302#ifndef CONFIG_INLINE_WRITE_LOCK_BH
312void __lockfunc _spin_unlock_bh(spinlock_t *lock) 303void __lockfunc _raw_write_lock_bh(rwlock_t *lock)
313{ 304{
314 __spin_unlock_bh(lock); 305 __raw_write_lock_bh(lock);
315} 306}
316EXPORT_SYMBOL(_spin_unlock_bh); 307EXPORT_SYMBOL(_raw_write_lock_bh);
317#endif 308#endif
318 309
319#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE 310#ifndef CONFIG_INLINE_WRITE_UNLOCK
320void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 311void __lockfunc _raw_write_unlock(rwlock_t *lock)
321{ 312{
322 __read_unlock_irqrestore(lock, flags); 313 __raw_write_unlock(lock);
323} 314}
324EXPORT_SYMBOL(_read_unlock_irqrestore); 315EXPORT_SYMBOL(_raw_write_unlock);
325#endif 316#endif
326 317
327#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ 318#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE
328void __lockfunc _read_unlock_irq(rwlock_t *lock) 319void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
329{ 320{
330 __read_unlock_irq(lock); 321 __raw_write_unlock_irqrestore(lock, flags);
331} 322}
332EXPORT_SYMBOL(_read_unlock_irq); 323EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
333#endif 324#endif
334 325
335#ifndef CONFIG_INLINE_READ_UNLOCK_BH 326#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ
336void __lockfunc _read_unlock_bh(rwlock_t *lock) 327void __lockfunc _raw_write_unlock_irq(rwlock_t *lock)
337{ 328{
338 __read_unlock_bh(lock); 329 __raw_write_unlock_irq(lock);
339} 330}
340EXPORT_SYMBOL(_read_unlock_bh); 331EXPORT_SYMBOL(_raw_write_unlock_irq);
341#endif 332#endif
342 333
343#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE 334#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH
344void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 335void __lockfunc _raw_write_unlock_bh(rwlock_t *lock)
345{ 336{
346 __write_unlock_irqrestore(lock, flags); 337 __raw_write_unlock_bh(lock);
347} 338}
348EXPORT_SYMBOL(_write_unlock_irqrestore); 339EXPORT_SYMBOL(_raw_write_unlock_bh);
349#endif 340#endif
350 341
351#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ 342#ifdef CONFIG_DEBUG_LOCK_ALLOC
352void __lockfunc _write_unlock_irq(rwlock_t *lock) 343
344void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass)
353{ 345{
354 __write_unlock_irq(lock); 346 preempt_disable();
347 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
348 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
355} 349}
356EXPORT_SYMBOL(_write_unlock_irq); 350EXPORT_SYMBOL(_raw_spin_lock_nested);
357#endif
358 351
359#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH 352unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock,
360void __lockfunc _write_unlock_bh(rwlock_t *lock) 353 int subclass)
361{ 354{
362 __write_unlock_bh(lock); 355 unsigned long flags;
356
357 local_irq_save(flags);
358 preempt_disable();
359 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
360 LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock,
361 do_raw_spin_lock_flags, &flags);
362 return flags;
363} 363}
364EXPORT_SYMBOL(_write_unlock_bh); 364EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested);
365#endif
366 365
367#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH 366void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock,
368int __lockfunc _spin_trylock_bh(spinlock_t *lock) 367 struct lockdep_map *nest_lock)
369{ 368{
370 return __spin_trylock_bh(lock); 369 preempt_disable();
370 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
371 LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
371} 372}
372EXPORT_SYMBOL(_spin_trylock_bh); 373EXPORT_SYMBOL(_raw_spin_lock_nest_lock);
374
373#endif 375#endif
374 376
375notrace int in_lock_functions(unsigned long addr) 377notrace int in_lock_functions(unsigned long addr)
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 818d7d9aa03c..bde4295774c8 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,6 +34,30 @@
34#include <linux/smp.h> 34#include <linux/smp.h>
35#include <linux/srcu.h> 35#include <linux/srcu.h>
36 36
37static int init_srcu_struct_fields(struct srcu_struct *sp)
38{
39 sp->completed = 0;
40 mutex_init(&sp->mutex);
41 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
42 return sp->per_cpu_ref ? 0 : -ENOMEM;
43}
44
45#ifdef CONFIG_DEBUG_LOCK_ALLOC
46
47int __init_srcu_struct(struct srcu_struct *sp, const char *name,
48 struct lock_class_key *key)
49{
50#ifdef CONFIG_DEBUG_LOCK_ALLOC
51 /* Don't re-initialize a lock while it is held. */
52 debug_check_no_locks_freed((void *)sp, sizeof(*sp));
53 lockdep_init_map(&sp->dep_map, name, key, 0);
54#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
55 return init_srcu_struct_fields(sp);
56}
57EXPORT_SYMBOL_GPL(__init_srcu_struct);
58
59#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
60
37/** 61/**
38 * init_srcu_struct - initialize a sleep-RCU structure 62 * init_srcu_struct - initialize a sleep-RCU structure
39 * @sp: structure to initialize. 63 * @sp: structure to initialize.
@@ -44,13 +68,12 @@
44 */ 68 */
45int init_srcu_struct(struct srcu_struct *sp) 69int init_srcu_struct(struct srcu_struct *sp)
46{ 70{
47 sp->completed = 0; 71 return init_srcu_struct_fields(sp);
48 mutex_init(&sp->mutex);
49 sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array);
50 return (sp->per_cpu_ref ? 0 : -ENOMEM);
51} 72}
52EXPORT_SYMBOL_GPL(init_srcu_struct); 73EXPORT_SYMBOL_GPL(init_srcu_struct);
53 74
75#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
76
54/* 77/*
55 * srcu_readers_active_idx -- returns approximate number of readers 78 * srcu_readers_active_idx -- returns approximate number of readers
56 * active on the specified rank of per-CPU counters. 79 * active on the specified rank of per-CPU counters.
@@ -100,15 +123,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp)
100} 123}
101EXPORT_SYMBOL_GPL(cleanup_srcu_struct); 124EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
102 125
103/** 126/*
104 * srcu_read_lock - register a new reader for an SRCU-protected structure.
105 * @sp: srcu_struct in which to register the new reader.
106 *
107 * Counts the new reader in the appropriate per-CPU element of the 127 * Counts the new reader in the appropriate per-CPU element of the
108 * srcu_struct. Must be called from process context. 128 * srcu_struct. Must be called from process context.
109 * Returns an index that must be passed to the matching srcu_read_unlock(). 129 * Returns an index that must be passed to the matching srcu_read_unlock().
110 */ 130 */
111int srcu_read_lock(struct srcu_struct *sp) 131int __srcu_read_lock(struct srcu_struct *sp)
112{ 132{
113 int idx; 133 int idx;
114 134
@@ -120,31 +140,27 @@ int srcu_read_lock(struct srcu_struct *sp)
120 preempt_enable(); 140 preempt_enable();
121 return idx; 141 return idx;
122} 142}
123EXPORT_SYMBOL_GPL(srcu_read_lock); 143EXPORT_SYMBOL_GPL(__srcu_read_lock);
124 144
125/** 145/*
126 * srcu_read_unlock - unregister a old reader from an SRCU-protected structure.
127 * @sp: srcu_struct in which to unregister the old reader.
128 * @idx: return value from corresponding srcu_read_lock().
129 *
130 * Removes the count for the old reader from the appropriate per-CPU 146 * Removes the count for the old reader from the appropriate per-CPU
131 * element of the srcu_struct. Note that this may well be a different 147 * element of the srcu_struct. Note that this may well be a different
132 * CPU than that which was incremented by the corresponding srcu_read_lock(). 148 * CPU than that which was incremented by the corresponding srcu_read_lock().
133 * Must be called from process context. 149 * Must be called from process context.
134 */ 150 */
135void srcu_read_unlock(struct srcu_struct *sp, int idx) 151void __srcu_read_unlock(struct srcu_struct *sp, int idx)
136{ 152{
137 preempt_disable(); 153 preempt_disable();
138 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 154 srcu_barrier(); /* ensure compiler won't misorder critical section. */
139 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 155 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
140 preempt_enable(); 156 preempt_enable();
141} 157}
142EXPORT_SYMBOL_GPL(srcu_read_unlock); 158EXPORT_SYMBOL_GPL(__srcu_read_unlock);
143 159
144/* 160/*
145 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 161 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
146 */ 162 */
147void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 163static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
148{ 164{
149 int idx; 165 int idx;
150 166
diff --git a/kernel/sys.c b/kernel/sys.c
index 9968c5fb55b9..877fe4f8e05e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,7 +8,6 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
11#include <linux/smp_lock.h>
12#include <linux/notifier.h> 11#include <linux/notifier.h>
13#include <linux/reboot.h> 12#include <linux/reboot.h>
14#include <linux/prctl.h> 13#include <linux/prctl.h>
@@ -163,6 +162,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
163 if (niceval > 19) 162 if (niceval > 19)
164 niceval = 19; 163 niceval = 19;
165 164
165 rcu_read_lock();
166 read_lock(&tasklist_lock); 166 read_lock(&tasklist_lock);
167 switch (which) { 167 switch (which) {
168 case PRIO_PROCESS: 168 case PRIO_PROCESS:
@@ -190,16 +190,17 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
190 !(user = find_user(who))) 190 !(user = find_user(who)))
191 goto out_unlock; /* No processes for this user */ 191 goto out_unlock; /* No processes for this user */
192 192
193 do_each_thread(g, p) 193 do_each_thread(g, p) {
194 if (__task_cred(p)->uid == who) 194 if (__task_cred(p)->uid == who)
195 error = set_one_prio(p, niceval, error); 195 error = set_one_prio(p, niceval, error);
196 while_each_thread(g, p); 196 } while_each_thread(g, p);
197 if (who != cred->uid) 197 if (who != cred->uid)
198 free_uid(user); /* For find_user() */ 198 free_uid(user); /* For find_user() */
199 break; 199 break;
200 } 200 }
201out_unlock: 201out_unlock:
202 read_unlock(&tasklist_lock); 202 read_unlock(&tasklist_lock);
203 rcu_read_unlock();
203out: 204out:
204 return error; 205 return error;
205} 206}
@@ -221,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
221 if (which > PRIO_USER || which < PRIO_PROCESS) 222 if (which > PRIO_USER || which < PRIO_PROCESS)
222 return -EINVAL; 223 return -EINVAL;
223 224
225 rcu_read_lock();
224 read_lock(&tasklist_lock); 226 read_lock(&tasklist_lock);
225 switch (which) { 227 switch (which) {
226 case PRIO_PROCESS: 228 case PRIO_PROCESS:
@@ -253,19 +255,20 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
253 !(user = find_user(who))) 255 !(user = find_user(who)))
254 goto out_unlock; /* No processes for this user */ 256 goto out_unlock; /* No processes for this user */
255 257
256 do_each_thread(g, p) 258 do_each_thread(g, p) {
257 if (__task_cred(p)->uid == who) { 259 if (__task_cred(p)->uid == who) {
258 niceval = 20 - task_nice(p); 260 niceval = 20 - task_nice(p);
259 if (niceval > retval) 261 if (niceval > retval)
260 retval = niceval; 262 retval = niceval;
261 } 263 }
262 while_each_thread(g, p); 264 } while_each_thread(g, p);
263 if (who != cred->uid) 265 if (who != cred->uid)
264 free_uid(user); /* for find_user() */ 266 free_uid(user); /* for find_user() */
265 break; 267 break;
266 } 268 }
267out_unlock: 269out_unlock:
268 read_unlock(&tasklist_lock); 270 read_unlock(&tasklist_lock);
271 rcu_read_unlock();
269 272
270 return retval; 273 return retval;
271} 274}
@@ -349,6 +352,9 @@ void kernel_power_off(void)
349 machine_power_off(); 352 machine_power_off();
350} 353}
351EXPORT_SYMBOL_GPL(kernel_power_off); 354EXPORT_SYMBOL_GPL(kernel_power_off);
355
356static DEFINE_MUTEX(reboot_mutex);
357
352/* 358/*
353 * Reboot system call: for obvious reasons only root may call it, 359 * Reboot system call: for obvious reasons only root may call it,
354 * and even root needs to set up some magic numbers in the registers 360 * and even root needs to set up some magic numbers in the registers
@@ -381,7 +387,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
381 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) 387 if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off)
382 cmd = LINUX_REBOOT_CMD_HALT; 388 cmd = LINUX_REBOOT_CMD_HALT;
383 389
384 lock_kernel(); 390 mutex_lock(&reboot_mutex);
385 switch (cmd) { 391 switch (cmd) {
386 case LINUX_REBOOT_CMD_RESTART: 392 case LINUX_REBOOT_CMD_RESTART:
387 kernel_restart(NULL); 393 kernel_restart(NULL);
@@ -397,20 +403,18 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 403
398 case LINUX_REBOOT_CMD_HALT: 404 case LINUX_REBOOT_CMD_HALT:
399 kernel_halt(); 405 kernel_halt();
400 unlock_kernel();
401 do_exit(0); 406 do_exit(0);
402 panic("cannot halt"); 407 panic("cannot halt");
403 408
404 case LINUX_REBOOT_CMD_POWER_OFF: 409 case LINUX_REBOOT_CMD_POWER_OFF:
405 kernel_power_off(); 410 kernel_power_off();
406 unlock_kernel();
407 do_exit(0); 411 do_exit(0);
408 break; 412 break;
409 413
410 case LINUX_REBOOT_CMD_RESTART2: 414 case LINUX_REBOOT_CMD_RESTART2:
411 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { 415 if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) {
412 unlock_kernel(); 416 ret = -EFAULT;
413 return -EFAULT; 417 break;
414 } 418 }
415 buffer[sizeof(buffer) - 1] = '\0'; 419 buffer[sizeof(buffer) - 1] = '\0';
416 420
@@ -433,7 +437,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
433 ret = -EINVAL; 437 ret = -EINVAL;
434 break; 438 break;
435 } 439 }
436 unlock_kernel(); 440 mutex_unlock(&reboot_mutex);
437 return ret; 441 return ret;
438} 442}
439 443
@@ -567,11 +571,6 @@ static int set_user(struct cred *new)
567 if (!new_user) 571 if (!new_user)
568 return -EAGAIN; 572 return -EAGAIN;
569 573
570 if (!task_can_switch_user(new_user, current)) {
571 free_uid(new_user);
572 return -EINVAL;
573 }
574
575 if (atomic_read(&new_user->processes) >= 574 if (atomic_read(&new_user->processes) >=
576 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 575 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
577 new_user != INIT_USER) { 576 new_user != INIT_USER) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9327a26765c5..8a68b2448468 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -244,6 +244,10 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ 244static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
245static int min_wakeup_granularity_ns; /* 0 usecs */ 245static int min_wakeup_granularity_ns; /* 0 usecs */
246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 246static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
247static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
248static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
249static int min_sched_shares_ratelimit = 100000; /* 100 usec */
250static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
247#endif 251#endif
248 252
249static struct ctl_table kern_table[] = { 253static struct ctl_table kern_table[] = {
@@ -260,7 +264,7 @@ static struct ctl_table kern_table[] = {
260 .data = &sysctl_sched_min_granularity, 264 .data = &sysctl_sched_min_granularity,
261 .maxlen = sizeof(unsigned int), 265 .maxlen = sizeof(unsigned int),
262 .mode = 0644, 266 .mode = 0644,
263 .proc_handler = sched_nr_latency_handler, 267 .proc_handler = sched_proc_update_handler,
264 .extra1 = &min_sched_granularity_ns, 268 .extra1 = &min_sched_granularity_ns,
265 .extra2 = &max_sched_granularity_ns, 269 .extra2 = &max_sched_granularity_ns,
266 }, 270 },
@@ -269,7 +273,7 @@ static struct ctl_table kern_table[] = {
269 .data = &sysctl_sched_latency, 273 .data = &sysctl_sched_latency,
270 .maxlen = sizeof(unsigned int), 274 .maxlen = sizeof(unsigned int),
271 .mode = 0644, 275 .mode = 0644,
272 .proc_handler = sched_nr_latency_handler, 276 .proc_handler = sched_proc_update_handler,
273 .extra1 = &min_sched_granularity_ns, 277 .extra1 = &min_sched_granularity_ns,
274 .extra2 = &max_sched_granularity_ns, 278 .extra2 = &max_sched_granularity_ns,
275 }, 279 },
@@ -278,7 +282,7 @@ static struct ctl_table kern_table[] = {
278 .data = &sysctl_sched_wakeup_granularity, 282 .data = &sysctl_sched_wakeup_granularity,
279 .maxlen = sizeof(unsigned int), 283 .maxlen = sizeof(unsigned int),
280 .mode = 0644, 284 .mode = 0644,
281 .proc_handler = proc_dointvec_minmax, 285 .proc_handler = sched_proc_update_handler,
282 .extra1 = &min_wakeup_granularity_ns, 286 .extra1 = &min_wakeup_granularity_ns,
283 .extra2 = &max_wakeup_granularity_ns, 287 .extra2 = &max_wakeup_granularity_ns,
284 }, 288 },
@@ -287,7 +291,18 @@ static struct ctl_table kern_table[] = {
287 .data = &sysctl_sched_shares_ratelimit, 291 .data = &sysctl_sched_shares_ratelimit,
288 .maxlen = sizeof(unsigned int), 292 .maxlen = sizeof(unsigned int),
289 .mode = 0644, 293 .mode = 0644,
290 .proc_handler = proc_dointvec, 294 .proc_handler = sched_proc_update_handler,
295 .extra1 = &min_sched_shares_ratelimit,
296 .extra2 = &max_sched_shares_ratelimit,
297 },
298 {
299 .procname = "sched_tunable_scaling",
300 .data = &sysctl_sched_tunable_scaling,
301 .maxlen = sizeof(enum sched_tunable_scaling),
302 .mode = 0644,
303 .proc_handler = sched_proc_update_handler,
304 .extra1 = &min_sched_tunable_scaling,
305 .extra2 = &max_sched_tunable_scaling,
291 }, 306 },
292 { 307 {
293 .procname = "sched_shares_thresh", 308 .procname = "sched_shares_thresh",
@@ -298,13 +313,6 @@ static struct ctl_table kern_table[] = {
298 .extra1 = &zero, 313 .extra1 = &zero,
299 }, 314 },
300 { 315 {
301 .procname = "sched_features",
302 .data = &sysctl_sched_features,
303 .maxlen = sizeof(unsigned int),
304 .mode = 0644,
305 .proc_handler = proc_dointvec,
306 },
307 {
308 .procname = "sched_migration_cost", 316 .procname = "sched_migration_cost",
309 .data = &sysctl_sched_migration_cost, 317 .data = &sysctl_sched_migration_cost,
310 .maxlen = sizeof(unsigned int), 318 .maxlen = sizeof(unsigned int),
@@ -1043,7 +1051,7 @@ static struct ctl_table vm_table[] = {
1043 .extra2 = &one_hundred, 1051 .extra2 = &one_hundred,
1044 }, 1052 },
1045#ifdef CONFIG_HUGETLB_PAGE 1053#ifdef CONFIG_HUGETLB_PAGE
1046 { 1054 {
1047 .procname = "nr_hugepages", 1055 .procname = "nr_hugepages",
1048 .data = NULL, 1056 .data = NULL,
1049 .maxlen = sizeof(unsigned long), 1057 .maxlen = sizeof(unsigned long),
@@ -1051,7 +1059,18 @@ static struct ctl_table vm_table[] = {
1051 .proc_handler = hugetlb_sysctl_handler, 1059 .proc_handler = hugetlb_sysctl_handler,
1052 .extra1 = (void *)&hugetlb_zero, 1060 .extra1 = (void *)&hugetlb_zero,
1053 .extra2 = (void *)&hugetlb_infinity, 1061 .extra2 = (void *)&hugetlb_infinity,
1054 }, 1062 },
1063#ifdef CONFIG_NUMA
1064 {
1065 .procname = "nr_hugepages_mempolicy",
1066 .data = NULL,
1067 .maxlen = sizeof(unsigned long),
1068 .mode = 0644,
1069 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1070 .extra1 = (void *)&hugetlb_zero,
1071 .extra2 = (void *)&hugetlb_infinity,
1072 },
1073#endif
1055 { 1074 {
1056 .procname = "hugetlb_shm_group", 1075 .procname = "hugetlb_shm_group",
1057 .data = &sysctl_hugetlb_shm_group, 1076 .data = &sysctl_hugetlb_shm_group,
@@ -1112,7 +1131,8 @@ static struct ctl_table vm_table[] = {
1112 .data = &sysctl_max_map_count, 1131 .data = &sysctl_max_map_count,
1113 .maxlen = sizeof(sysctl_max_map_count), 1132 .maxlen = sizeof(sysctl_max_map_count),
1114 .mode = 0644, 1133 .mode = 0644,
1115 .proc_handler = proc_dointvec 1134 .proc_handler = proc_dointvec_minmax,
1135 .extra1 = &zero,
1116 }, 1136 },
1117#else 1137#else
1118 { 1138 {
@@ -1194,6 +1214,7 @@ static struct ctl_table vm_table[] = {
1194 .proc_handler = proc_dointvec_jiffies, 1214 .proc_handler = proc_dointvec_jiffies,
1195 }, 1215 },
1196#endif 1216#endif
1217#ifdef CONFIG_MMU
1197 { 1218 {
1198 .procname = "mmap_min_addr", 1219 .procname = "mmap_min_addr",
1199 .data = &dac_mmap_min_addr, 1220 .data = &dac_mmap_min_addr,
@@ -1201,6 +1222,7 @@ static struct ctl_table vm_table[] = {
1201 .mode = 0644, 1222 .mode = 0644,
1202 .proc_handler = mmap_min_addr_handler, 1223 .proc_handler = mmap_min_addr_handler,
1203 }, 1224 },
1225#endif
1204#ifdef CONFIG_NUMA 1226#ifdef CONFIG_NUMA
1205 { 1227 {
1206 .procname = "numa_zonelist_order", 1228 .procname = "numa_zonelist_order",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b75dbf40f573..8f5d16e0707a 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1399,6 +1399,13 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1399{ 1399{
1400 int i; 1400 int i;
1401 1401
1402 /*
1403 * CTL_KERN/KERN_VERSION is used by older glibc and cannot
1404 * ever go away.
1405 */
1406 if (name[0] == CTL_KERN && name[1] == KERN_VERSION)
1407 return;
1408
1402 if (printk_ratelimit()) { 1409 if (printk_ratelimit()) {
1403 printk(KERN_INFO 1410 printk(KERN_INFO
1404 "warning: process `%s' used the deprecated sysctl " 1411 "warning: process `%s' used the deprecated sysctl "
@@ -1410,6 +1417,35 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
1410 return; 1417 return;
1411} 1418}
1412 1419
1420#define WARN_ONCE_HASH_BITS 8
1421#define WARN_ONCE_HASH_SIZE (1<<WARN_ONCE_HASH_BITS)
1422
1423static DECLARE_BITMAP(warn_once_bitmap, WARN_ONCE_HASH_SIZE);
1424
1425#define FNV32_OFFSET 2166136261U
1426#define FNV32_PRIME 0x01000193
1427
1428/*
1429 * Print each legacy sysctl (approximately) only once.
1430 * To avoid making the tables non-const use a external
1431 * hash-table instead.
1432 * Worst case hash collision: 6, but very rarely.
1433 * NOTE! We don't use the SMP-safe bit tests. We simply
1434 * don't care enough.
1435 */
1436static void warn_on_bintable(const int *name, int nlen)
1437{
1438 int i;
1439 u32 hash = FNV32_OFFSET;
1440
1441 for (i = 0; i < nlen; i++)
1442 hash = (hash ^ name[i]) * FNV32_PRIME;
1443 hash %= WARN_ONCE_HASH_SIZE;
1444 if (__test_and_set_bit(hash, warn_once_bitmap))
1445 return;
1446 deprecated_sysctl_warning(name, nlen);
1447}
1448
1413static ssize_t do_sysctl(int __user *args_name, int nlen, 1449static ssize_t do_sysctl(int __user *args_name, int nlen,
1414 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1450 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1415{ 1451{
@@ -1424,7 +1460,7 @@ static ssize_t do_sysctl(int __user *args_name, int nlen,
1424 if (get_user(name[i], args_name + i)) 1460 if (get_user(name[i], args_name + i))
1425 return -EFAULT; 1461 return -EFAULT;
1426 1462
1427 deprecated_sysctl_warning(name, nlen); 1463 warn_on_bintable(name, nlen);
1428 1464
1429 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); 1465 return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen);
1430} 1466}
diff --git a/kernel/time.c b/kernel/time.c
index c6324d96009e..804798005d19 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -136,6 +136,7 @@ static inline void warp_clock(void)
136 write_seqlock_irq(&xtime_lock); 136 write_seqlock_irq(&xtime_lock);
137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 138 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
139 update_xtime_cache(0);
139 write_sequnlock_irq(&xtime_lock); 140 write_sequnlock_irq(&xtime_lock);
140 clock_was_set(); 141 clock_was_set();
141} 142}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 20a8920029ee..d7395fdfb9f3 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -30,7 +30,7 @@ static LIST_HEAD(clockevents_released);
30static RAW_NOTIFIER_HEAD(clockevents_chain); 30static RAW_NOTIFIER_HEAD(clockevents_chain);
31 31
32/* Protection for the above */ 32/* Protection for the above */
33static DEFINE_SPINLOCK(clockevents_lock); 33static DEFINE_RAW_SPINLOCK(clockevents_lock);
34 34
35/** 35/**
36 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds 36 * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds
@@ -141,9 +141,9 @@ int clockevents_register_notifier(struct notifier_block *nb)
141 unsigned long flags; 141 unsigned long flags;
142 int ret; 142 int ret;
143 143
144 spin_lock_irqsave(&clockevents_lock, flags); 144 raw_spin_lock_irqsave(&clockevents_lock, flags);
145 ret = raw_notifier_chain_register(&clockevents_chain, nb); 145 ret = raw_notifier_chain_register(&clockevents_chain, nb);
146 spin_unlock_irqrestore(&clockevents_lock, flags); 146 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
147 147
148 return ret; 148 return ret;
149} 149}
@@ -185,13 +185,13 @@ void clockevents_register_device(struct clock_event_device *dev)
185 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 185 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
186 BUG_ON(!dev->cpumask); 186 BUG_ON(!dev->cpumask);
187 187
188 spin_lock_irqsave(&clockevents_lock, flags); 188 raw_spin_lock_irqsave(&clockevents_lock, flags);
189 189
190 list_add(&dev->list, &clockevent_devices); 190 list_add(&dev->list, &clockevent_devices);
191 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev); 191 clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
192 clockevents_notify_released(); 192 clockevents_notify_released();
193 193
194 spin_unlock_irqrestore(&clockevents_lock, flags); 194 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
195} 195}
196EXPORT_SYMBOL_GPL(clockevents_register_device); 196EXPORT_SYMBOL_GPL(clockevents_register_device);
197 197
@@ -238,10 +238,11 @@ void clockevents_exchange_device(struct clock_event_device *old,
238 */ 238 */
239void clockevents_notify(unsigned long reason, void *arg) 239void clockevents_notify(unsigned long reason, void *arg)
240{ 240{
241 struct list_head *node, *tmp; 241 struct clock_event_device *dev, *tmp;
242 unsigned long flags; 242 unsigned long flags;
243 int cpu;
243 244
244 spin_lock_irqsave(&clockevents_lock, flags); 245 raw_spin_lock_irqsave(&clockevents_lock, flags);
245 clockevents_do_notify(reason, arg); 246 clockevents_do_notify(reason, arg);
246 247
247 switch (reason) { 248 switch (reason) {
@@ -250,13 +251,25 @@ void clockevents_notify(unsigned long reason, void *arg)
250 * Unregister the clock event devices which were 251 * Unregister the clock event devices which were
251 * released from the users in the notify chain. 252 * released from the users in the notify chain.
252 */ 253 */
253 list_for_each_safe(node, tmp, &clockevents_released) 254 list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
254 list_del(node); 255 list_del(&dev->list);
256 /*
257 * Now check whether the CPU has left unused per cpu devices
258 */
259 cpu = *((int *)arg);
260 list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
261 if (cpumask_test_cpu(cpu, dev->cpumask) &&
262 cpumask_weight(dev->cpumask) == 1 &&
263 !tick_is_broadcast_device(dev)) {
264 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
265 list_del(&dev->list);
266 }
267 }
255 break; 268 break;
256 default: 269 default:
257 break; 270 break;
258 } 271 }
259 spin_unlock_irqrestore(&clockevents_lock, flags); 272 raw_spin_unlock_irqrestore(&clockevents_lock, flags);
260} 273}
261EXPORT_SYMBOL_GPL(clockevents_notify); 274EXPORT_SYMBOL_GPL(clockevents_notify);
262#endif 275#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d422c7b2236b..1f663d23e85e 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void)
343{ 343{
344 unsigned long flags; 344 unsigned long flags;
345 345
346 spin_lock_irqsave(&watchdog_lock, flags); 346 /*
347 * We use trylock here to avoid a potential dead lock when
348 * kgdb calls this code after the kernel has been stopped with
349 * watchdog_lock held. When watchdog_lock is held we just
350 * return and accept, that the watchdog might trigger and mark
351 * the monitored clock source (usually TSC) unstable.
352 *
353 * This does not affect the other caller clocksource_resume()
354 * because at this point the kernel is UP, interrupts are
355 * disabled and nothing can hold watchdog_lock.
356 */
357 if (!spin_trylock_irqsave(&watchdog_lock, flags))
358 return;
347 clocksource_reset_watchdog(); 359 clocksource_reset_watchdog();
348 spin_unlock_irqrestore(&watchdog_lock, flags); 360 spin_unlock_irqrestore(&watchdog_lock, flags);
349} 361}
@@ -441,6 +453,18 @@ static inline int clocksource_watchdog_kthread(void *data) { return 0; }
441#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */ 453#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
442 454
443/** 455/**
456 * clocksource_suspend - suspend the clocksource(s)
457 */
458void clocksource_suspend(void)
459{
460 struct clocksource *cs;
461
462 list_for_each_entry_reverse(cs, &clocksource_list, list)
463 if (cs->suspend)
464 cs->suspend(cs);
465}
466
467/**
444 * clocksource_resume - resume the clocksource(s) 468 * clocksource_resume - resume the clocksource(s)
445 */ 469 */
446void clocksource_resume(void) 470void clocksource_resume(void)
@@ -449,7 +473,7 @@ void clocksource_resume(void)
449 473
450 list_for_each_entry(cs, &clocksource_list, list) 474 list_for_each_entry(cs, &clocksource_list, list)
451 if (cs->resume) 475 if (cs->resume)
452 cs->resume(); 476 cs->resume(cs);
453 477
454 clocksource_resume_watchdog(); 478 clocksource_resume_watchdog();
455} 479}
@@ -458,8 +482,8 @@ void clocksource_resume(void)
458 * clocksource_touch_watchdog - Update watchdog 482 * clocksource_touch_watchdog - Update watchdog
459 * 483 *
460 * Update the watchdog after exception contexts such as kgdb so as not 484 * Update the watchdog after exception contexts such as kgdb so as not
461 * to incorrectly trip the watchdog. 485 * to incorrectly trip the watchdog. This might fail when the kernel
462 * 486 * was stopped in code which holds watchdog_lock.
463 */ 487 */
464void clocksource_touch_watchdog(void) 488void clocksource_touch_watchdog(void)
465{ 489{
@@ -677,7 +701,7 @@ sysfs_show_current_clocksources(struct sys_device *dev,
677 * @count: length of buffer 701 * @count: length of buffer
678 * 702 *
679 * Takes input from sysfs interface for manually overriding the default 703 * Takes input from sysfs interface for manually overriding the default
680 * clocksource selction. 704 * clocksource selection.
681 */ 705 */
682static ssize_t sysfs_override_clocksource(struct sys_device *dev, 706static ssize_t sysfs_override_clocksource(struct sys_device *dev,
683 struct sysdev_attribute *attr, 707 struct sysdev_attribute *attr,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4800f933910e..7c0f180d6e9d 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -58,10 +58,10 @@ static s64 time_offset;
58static long time_constant = 2; 58static long time_constant = 2;
59 59
60/* maximum error (usecs): */ 60/* maximum error (usecs): */
61long time_maxerror = NTP_PHASE_LIMIT; 61static long time_maxerror = NTP_PHASE_LIMIT;
62 62
63/* estimated error (usecs): */ 63/* estimated error (usecs): */
64long time_esterror = NTP_PHASE_LIMIT; 64static long time_esterror = NTP_PHASE_LIMIT;
65 65
66/* frequency offset (scaled nsecs/secs): */ 66/* frequency offset (scaled nsecs/secs): */
67static s64 time_freq; 67static s64 time_freq;
@@ -142,11 +142,11 @@ static void ntp_update_offset(long offset)
142 * Select how the frequency is to be controlled 142 * Select how the frequency is to be controlled
143 * and in which mode (PLL or FLL). 143 * and in which mode (PLL or FLL).
144 */ 144 */
145 secs = xtime.tv_sec - time_reftime; 145 secs = get_seconds() - time_reftime;
146 if (unlikely(time_status & STA_FREQHOLD)) 146 if (unlikely(time_status & STA_FREQHOLD))
147 secs = 0; 147 secs = 0;
148 148
149 time_reftime = xtime.tv_sec; 149 time_reftime = get_seconds();
150 150
151 offset64 = offset; 151 offset64 = offset;
152 freq_adj = (offset64 * secs) << 152 freq_adj = (offset64 * secs) <<
@@ -368,7 +368,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
368 * reference time to current time. 368 * reference time to current time.
369 */ 369 */
370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) 370 if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
371 time_reftime = xtime.tv_sec; 371 time_reftime = get_seconds();
372 372
373 /* only set allowed bits */ 373 /* only set allowed bits */
374 time_status &= STA_RONLY; 374 time_status &= STA_RONLY;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index c2ec25087a35..b3bafd5fc66d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -31,7 +31,7 @@ static struct tick_device tick_broadcast_device;
31/* FIXME: Use cpumask_var_t. */ 31/* FIXME: Use cpumask_var_t. */
32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS); 32static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
33static DECLARE_BITMAP(tmpmask, NR_CPUS); 33static DECLARE_BITMAP(tmpmask, NR_CPUS);
34static DEFINE_SPINLOCK(tick_broadcast_lock); 34static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
35static int tick_broadcast_force; 35static int tick_broadcast_force;
36 36
37#ifdef CONFIG_TICK_ONESHOT 37#ifdef CONFIG_TICK_ONESHOT
@@ -96,7 +96,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
96 unsigned long flags; 96 unsigned long flags;
97 int ret = 0; 97 int ret = 0;
98 98
99 spin_lock_irqsave(&tick_broadcast_lock, flags); 99 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
100 100
101 /* 101 /*
102 * Devices might be registered with both periodic and oneshot 102 * Devices might be registered with both periodic and oneshot
@@ -122,7 +122,7 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
122 tick_broadcast_clear_oneshot(cpu); 122 tick_broadcast_clear_oneshot(cpu);
123 } 123 }
124 } 124 }
125 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 125 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
126 return ret; 126 return ret;
127} 127}
128 128
@@ -161,13 +161,13 @@ static void tick_do_broadcast(struct cpumask *mask)
161 */ 161 */
162static void tick_do_periodic_broadcast(void) 162static void tick_do_periodic_broadcast(void)
163{ 163{
164 spin_lock(&tick_broadcast_lock); 164 raw_spin_lock(&tick_broadcast_lock);
165 165
166 cpumask_and(to_cpumask(tmpmask), 166 cpumask_and(to_cpumask(tmpmask),
167 cpu_online_mask, tick_get_broadcast_mask()); 167 cpu_online_mask, tick_get_broadcast_mask());
168 tick_do_broadcast(to_cpumask(tmpmask)); 168 tick_do_broadcast(to_cpumask(tmpmask));
169 169
170 spin_unlock(&tick_broadcast_lock); 170 raw_spin_unlock(&tick_broadcast_lock);
171} 171}
172 172
173/* 173/*
@@ -212,7 +212,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
212 unsigned long flags; 212 unsigned long flags;
213 int cpu, bc_stopped; 213 int cpu, bc_stopped;
214 214
215 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
216 216
217 cpu = smp_processor_id(); 217 cpu = smp_processor_id();
218 td = &per_cpu(tick_cpu_device, cpu); 218 td = &per_cpu(tick_cpu_device, cpu);
@@ -263,7 +263,7 @@ static void tick_do_broadcast_on_off(unsigned long *reason)
263 tick_broadcast_setup_oneshot(bc); 263 tick_broadcast_setup_oneshot(bc);
264 } 264 }
265out: 265out:
266 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 266 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
267} 267}
268 268
269/* 269/*
@@ -299,7 +299,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
299 unsigned long flags; 299 unsigned long flags;
300 unsigned int cpu = *cpup; 300 unsigned int cpu = *cpup;
301 301
302 spin_lock_irqsave(&tick_broadcast_lock, flags); 302 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
303 303
304 bc = tick_broadcast_device.evtdev; 304 bc = tick_broadcast_device.evtdev;
305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask()); 305 cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
@@ -309,7 +309,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
309 clockevents_shutdown(bc); 309 clockevents_shutdown(bc);
310 } 310 }
311 311
312 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 312 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
313} 313}
314 314
315void tick_suspend_broadcast(void) 315void tick_suspend_broadcast(void)
@@ -317,13 +317,13 @@ void tick_suspend_broadcast(void)
317 struct clock_event_device *bc; 317 struct clock_event_device *bc;
318 unsigned long flags; 318 unsigned long flags;
319 319
320 spin_lock_irqsave(&tick_broadcast_lock, flags); 320 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
321 321
322 bc = tick_broadcast_device.evtdev; 322 bc = tick_broadcast_device.evtdev;
323 if (bc) 323 if (bc)
324 clockevents_shutdown(bc); 324 clockevents_shutdown(bc);
325 325
326 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 326 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
327} 327}
328 328
329int tick_resume_broadcast(void) 329int tick_resume_broadcast(void)
@@ -332,7 +332,7 @@ int tick_resume_broadcast(void)
332 unsigned long flags; 332 unsigned long flags;
333 int broadcast = 0; 333 int broadcast = 0;
334 334
335 spin_lock_irqsave(&tick_broadcast_lock, flags); 335 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
336 336
337 bc = tick_broadcast_device.evtdev; 337 bc = tick_broadcast_device.evtdev;
338 338
@@ -351,7 +351,7 @@ int tick_resume_broadcast(void)
351 break; 351 break;
352 } 352 }
353 } 353 }
354 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 354 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
355 355
356 return broadcast; 356 return broadcast;
357} 357}
@@ -405,7 +405,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
405 ktime_t now, next_event; 405 ktime_t now, next_event;
406 int cpu; 406 int cpu;
407 407
408 spin_lock(&tick_broadcast_lock); 408 raw_spin_lock(&tick_broadcast_lock);
409again: 409again:
410 dev->next_event.tv64 = KTIME_MAX; 410 dev->next_event.tv64 = KTIME_MAX;
411 next_event.tv64 = KTIME_MAX; 411 next_event.tv64 = KTIME_MAX;
@@ -443,7 +443,7 @@ again:
443 if (tick_broadcast_set_event(next_event, 0)) 443 if (tick_broadcast_set_event(next_event, 0))
444 goto again; 444 goto again;
445 } 445 }
446 spin_unlock(&tick_broadcast_lock); 446 raw_spin_unlock(&tick_broadcast_lock);
447} 447}
448 448
449/* 449/*
@@ -457,7 +457,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
457 unsigned long flags; 457 unsigned long flags;
458 int cpu; 458 int cpu;
459 459
460 spin_lock_irqsave(&tick_broadcast_lock, flags); 460 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
461 461
462 /* 462 /*
463 * Periodic mode does not care about the enter/exit of power 463 * Periodic mode does not care about the enter/exit of power
@@ -492,7 +492,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
492 } 492 }
493 493
494out: 494out:
495 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 495 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
496} 496}
497 497
498/* 498/*
@@ -563,13 +563,13 @@ void tick_broadcast_switch_to_oneshot(void)
563 struct clock_event_device *bc; 563 struct clock_event_device *bc;
564 unsigned long flags; 564 unsigned long flags;
565 565
566 spin_lock_irqsave(&tick_broadcast_lock, flags); 566 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
567 567
568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; 568 tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT;
569 bc = tick_broadcast_device.evtdev; 569 bc = tick_broadcast_device.evtdev;
570 if (bc) 570 if (bc)
571 tick_broadcast_setup_oneshot(bc); 571 tick_broadcast_setup_oneshot(bc);
572 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 572 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
573} 573}
574 574
575 575
@@ -581,7 +581,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
581 unsigned long flags; 581 unsigned long flags;
582 unsigned int cpu = *cpup; 582 unsigned int cpu = *cpup;
583 583
584 spin_lock_irqsave(&tick_broadcast_lock, flags); 584 raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
585 585
586 /* 586 /*
587 * Clear the broadcast mask flag for the dead cpu, but do not 587 * Clear the broadcast mask flag for the dead cpu, but do not
@@ -589,7 +589,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
589 */ 589 */
590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); 590 cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask());
591 591
592 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 592 raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
593} 593}
594 594
595/* 595/*
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 83c4417b6a3c..b6b898d2eeef 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37static DEFINE_RAW_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
40 * Debugging: see timer_list.c 40 * Debugging: see timer_list.c
@@ -209,7 +209,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
209 int cpu, ret = NOTIFY_OK; 209 int cpu, ret = NOTIFY_OK;
210 unsigned long flags; 210 unsigned long flags;
211 211
212 spin_lock_irqsave(&tick_device_lock, flags); 212 raw_spin_lock_irqsave(&tick_device_lock, flags);
213 213
214 cpu = smp_processor_id(); 214 cpu = smp_processor_id();
215 if (!cpumask_test_cpu(cpu, newdev->cpumask)) 215 if (!cpumask_test_cpu(cpu, newdev->cpumask))
@@ -268,7 +268,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 268 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
269 tick_oneshot_notify(); 269 tick_oneshot_notify();
270 270
271 spin_unlock_irqrestore(&tick_device_lock, flags); 271 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
272 return NOTIFY_STOP; 272 return NOTIFY_STOP;
273 273
274out_bc: 274out_bc:
@@ -278,7 +278,7 @@ out_bc:
278 if (tick_check_broadcast_device(newdev)) 278 if (tick_check_broadcast_device(newdev))
279 ret = NOTIFY_STOP; 279 ret = NOTIFY_STOP;
280 280
281 spin_unlock_irqrestore(&tick_device_lock, flags); 281 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
282 282
283 return ret; 283 return ret;
284} 284}
@@ -311,7 +311,7 @@ static void tick_shutdown(unsigned int *cpup)
311 struct clock_event_device *dev = td->evtdev; 311 struct clock_event_device *dev = td->evtdev;
312 unsigned long flags; 312 unsigned long flags;
313 313
314 spin_lock_irqsave(&tick_device_lock, flags); 314 raw_spin_lock_irqsave(&tick_device_lock, flags);
315 td->mode = TICKDEV_MODE_PERIODIC; 315 td->mode = TICKDEV_MODE_PERIODIC;
316 if (dev) { 316 if (dev) {
317 /* 317 /*
@@ -322,7 +322,7 @@ static void tick_shutdown(unsigned int *cpup)
322 clockevents_exchange_device(dev, NULL); 322 clockevents_exchange_device(dev, NULL);
323 td->evtdev = NULL; 323 td->evtdev = NULL;
324 } 324 }
325 spin_unlock_irqrestore(&tick_device_lock, flags); 325 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
326} 326}
327 327
328static void tick_suspend(void) 328static void tick_suspend(void)
@@ -330,9 +330,9 @@ static void tick_suspend(void)
330 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 330 struct tick_device *td = &__get_cpu_var(tick_cpu_device);
331 unsigned long flags; 331 unsigned long flags;
332 332
333 spin_lock_irqsave(&tick_device_lock, flags); 333 raw_spin_lock_irqsave(&tick_device_lock, flags);
334 clockevents_shutdown(td->evtdev); 334 clockevents_shutdown(td->evtdev);
335 spin_unlock_irqrestore(&tick_device_lock, flags); 335 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
336} 336}
337 337
338static void tick_resume(void) 338static void tick_resume(void)
@@ -341,7 +341,7 @@ static void tick_resume(void)
341 unsigned long flags; 341 unsigned long flags;
342 int broadcast = tick_resume_broadcast(); 342 int broadcast = tick_resume_broadcast();
343 343
344 spin_lock_irqsave(&tick_device_lock, flags); 344 raw_spin_lock_irqsave(&tick_device_lock, flags);
345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 345 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
346 346
347 if (!broadcast) { 347 if (!broadcast) {
@@ -350,7 +350,7 @@ static void tick_resume(void)
350 else 350 else
351 tick_resume_oneshot(); 351 tick_resume_oneshot();
352 } 352 }
353 spin_unlock_irqrestore(&tick_device_lock, flags); 353 raw_spin_unlock_irqrestore(&tick_device_lock, flags);
354} 354}
355 355
356/* 356/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b1c05bf75ee0..290eefbc1f60 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -6,7 +6,6 @@
6#define TICK_DO_TIMER_BOOT -2 6#define TICK_DO_TIMER_BOOT -2
7 7
8DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
9extern spinlock_t tick_device_lock;
10extern ktime_t tick_next_period; 9extern ktime_t tick_next_period;
11extern ktime_t tick_period; 10extern ktime_t tick_period;
12extern int tick_do_timer_cpu __read_mostly; 11extern int tick_do_timer_cpu __read_mostly;
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 96ff643a5a59..12f5c55090be 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -89,7 +89,7 @@ int timecompare_offset(struct timecompare *sync,
89 * source time 89 * source time
90 */ 90 */
91 sample.offset = 91 sample.offset =
92 ktime_to_ns(ktime_add(end, start)) / 2 - 92 (ktime_to_ns(end) + ktime_to_ns(start)) / 2 -
93 ts; 93 ts;
94 94
95 /* simple insertion sort based on duration */ 95 /* simple insertion sort based on duration */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index af4135f05825..16736379a9ca 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,6 +165,13 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
168/* must hold xtime_lock */ 175/* must hold xtime_lock */
169void timekeeping_leap_insert(int leapsecond) 176void timekeeping_leap_insert(int leapsecond)
170{ 177{
@@ -325,6 +332,8 @@ int do_settimeofday(struct timespec *tv)
325 332
326 xtime = *tv; 333 xtime = *tv;
327 334
335 update_xtime_cache(0);
336
328 timekeeper.ntp_error = 0; 337 timekeeper.ntp_error = 0;
329 ntp_clear(); 338 ntp_clear();
330 339
@@ -550,6 +559,7 @@ void __init timekeeping_init(void)
550 } 559 }
551 set_normalized_timespec(&wall_to_monotonic, 560 set_normalized_timespec(&wall_to_monotonic,
552 -boot.tv_sec, -boot.tv_nsec); 561 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
553 total_sleep_time.tv_sec = 0; 563 total_sleep_time.tv_sec = 0;
554 total_sleep_time.tv_nsec = 0; 564 total_sleep_time.tv_nsec = 0;
555 write_sequnlock_irqrestore(&xtime_lock, flags); 565 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -583,6 +593,7 @@ static int timekeeping_resume(struct sys_device *dev)
583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
584 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 594 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
585 } 595 }
596 update_xtime_cache(0);
586 /* re-base the last cycle value */ 597 /* re-base the last cycle value */
587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
588 timekeeper.ntp_error = 0; 599 timekeeper.ntp_error = 0;
@@ -611,6 +622,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
611 write_sequnlock_irqrestore(&xtime_lock, flags); 622 write_sequnlock_irqrestore(&xtime_lock, flags);
612 623
613 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); 624 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
625 clocksource_suspend();
614 626
615 return 0; 627 return 0;
616} 628}
@@ -722,6 +734,7 @@ static void timekeeping_adjust(s64 offset)
722 timekeeper.ntp_error_shift; 734 timekeeper.ntp_error_shift;
723} 735}
724 736
737
725/** 738/**
726 * logarithmic_accumulation - shifted accumulation of cycles 739 * logarithmic_accumulation - shifted accumulation of cycles
727 * 740 *
@@ -765,6 +778,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
765 return offset; 778 return offset;
766} 779}
767 780
781
768/** 782/**
769 * update_wall_time - Uses the current clocksource to increment the wall time 783 * update_wall_time - Uses the current clocksource to increment the wall time
770 * 784 *
@@ -774,6 +788,7 @@ void update_wall_time(void)
774{ 788{
775 struct clocksource *clock; 789 struct clocksource *clock;
776 cycle_t offset; 790 cycle_t offset;
791 u64 nsecs;
777 int shift = 0, maxshift; 792 int shift = 0, maxshift;
778 793
779 /* Make sure we're fully resumed: */ 794 /* Make sure we're fully resumed: */
@@ -839,6 +854,9 @@ void update_wall_time(void)
839 timekeeper.ntp_error += timekeeper.xtime_nsec << 854 timekeeper.ntp_error += timekeeper.xtime_nsec <<
840 timekeeper.ntp_error_shift; 855 timekeeper.ntp_error_shift;
841 856
857 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift);
858 update_xtime_cache(nsecs);
859
842 /* check to see if there is a new clocksource to use */ 860 /* check to see if there is a new clocksource to use */
843 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 861 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
844} 862}
@@ -863,6 +881,7 @@ void getboottime(struct timespec *ts)
863 881
864 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 882 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
865} 883}
884EXPORT_SYMBOL_GPL(getboottime);
866 885
867/** 886/**
868 * monotonic_to_bootbased - Convert the monotonic time to boot based. 887 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -872,16 +891,17 @@ void monotonic_to_bootbased(struct timespec *ts)
872{ 891{
873 *ts = timespec_add_safe(*ts, total_sleep_time); 892 *ts = timespec_add_safe(*ts, total_sleep_time);
874} 893}
894EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
875 895
876unsigned long get_seconds(void) 896unsigned long get_seconds(void)
877{ 897{
878 return xtime.tv_sec; 898 return xtime_cache.tv_sec;
879} 899}
880EXPORT_SYMBOL(get_seconds); 900EXPORT_SYMBOL(get_seconds);
881 901
882struct timespec __current_kernel_time(void) 902struct timespec __current_kernel_time(void)
883{ 903{
884 return xtime; 904 return xtime_cache;
885} 905}
886 906
887struct timespec current_kernel_time(void) 907struct timespec current_kernel_time(void)
@@ -891,7 +911,8 @@ struct timespec current_kernel_time(void)
891 911
892 do { 912 do {
893 seq = read_seqbegin(&xtime_lock); 913 seq = read_seqbegin(&xtime_lock);
894 now = xtime; 914
915 now = xtime_cache;
895 } while (read_seqretry(&xtime_lock, seq)); 916 } while (read_seqretry(&xtime_lock, seq));
896 917
897 return now; 918 return now;
@@ -905,7 +926,8 @@ struct timespec get_monotonic_coarse(void)
905 926
906 do { 927 do {
907 seq = read_seqbegin(&xtime_lock); 928 seq = read_seqbegin(&xtime_lock);
908 now = xtime; 929
930 now = xtime_cache;
909 mono = wall_to_monotonic; 931 mono = wall_to_monotonic;
910 } while (read_seqretry(&xtime_lock, seq)); 932 } while (read_seqretry(&xtime_lock, seq));
911 933
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 665c76edbf17..bdfb8dd1050c 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -84,7 +84,7 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base,
84 84
85next_one: 85next_one:
86 i = 0; 86 i = 0;
87 spin_lock_irqsave(&base->cpu_base->lock, flags); 87 raw_spin_lock_irqsave(&base->cpu_base->lock, flags);
88 88
89 curr = base->first; 89 curr = base->first;
90 /* 90 /*
@@ -100,13 +100,13 @@ next_one:
100 100
101 timer = rb_entry(curr, struct hrtimer, node); 101 timer = rb_entry(curr, struct hrtimer, node);
102 tmp = *timer; 102 tmp = *timer;
103 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
104 104
105 print_timer(m, timer, &tmp, i, now); 105 print_timer(m, timer, &tmp, i, now);
106 next++; 106 next++;
107 goto next_one; 107 goto next_one;
108 } 108 }
109 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 109 raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags);
110} 110}
111 111
112static void 112static void
@@ -150,6 +150,9 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
150 P_ns(expires_next); 150 P_ns(expires_next);
151 P(hres_active); 151 P(hres_active);
152 P(nr_events); 152 P(nr_events);
153 P(nr_retries);
154 P(nr_hangs);
155 P_ns(max_hang_time);
153#endif 156#endif
154#undef P 157#undef P
155#undef P_ns 158#undef P_ns
@@ -234,10 +237,10 @@ static void timer_list_show_tickdevices(struct seq_file *m)
234#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 237#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
235 print_tickdevice(m, tick_get_broadcast_device(), -1); 238 print_tickdevice(m, tick_get_broadcast_device(), -1);
236 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 239 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
237 tick_get_broadcast_mask()->bits[0]); 240 cpumask_bits(tick_get_broadcast_mask())[0]);
238#ifdef CONFIG_TICK_ONESHOT 241#ifdef CONFIG_TICK_ONESHOT
239 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", 242 SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
240 tick_get_broadcast_oneshot_mask()->bits[0]); 243 cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
241#endif 244#endif
242 SEQ_printf(m, "\n"); 245 SEQ_printf(m, "\n");
243#endif 246#endif
@@ -254,7 +257,7 @@ static int timer_list_show(struct seq_file *m, void *v)
254 u64 now = ktime_to_ns(ktime_get()); 257 u64 now = ktime_to_ns(ktime_get());
255 int cpu; 258 int cpu;
256 259
257 SEQ_printf(m, "Timer List Version: v0.4\n"); 260 SEQ_printf(m, "Timer List Version: v0.5\n");
258 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
259 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
260 263
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index ee5681f8d7ec..2f3b585b8d7d 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -86,7 +86,7 @@ static DEFINE_SPINLOCK(table_lock);
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
88 */ 88 */
89static DEFINE_PER_CPU(spinlock_t, lookup_lock); 89static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock);
90 90
91/* 91/*
92 * Mutex to serialize state changes with show-stats activities: 92 * Mutex to serialize state changes with show-stats activities:
@@ -238,14 +238,14 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
238 /* 238 /*
239 * It doesnt matter which lock we take: 239 * It doesnt matter which lock we take:
240 */ 240 */
241 spinlock_t *lock; 241 raw_spinlock_t *lock;
242 struct entry *entry, input; 242 struct entry *entry, input;
243 unsigned long flags; 243 unsigned long flags;
244 244
245 if (likely(!timer_stats_active)) 245 if (likely(!timer_stats_active))
246 return; 246 return;
247 247
248 lock = &per_cpu(lookup_lock, raw_smp_processor_id()); 248 lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id());
249 249
250 input.timer = timer; 250 input.timer = timer;
251 input.start_func = startf; 251 input.start_func = startf;
@@ -253,7 +253,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
253 input.pid = pid; 253 input.pid = pid;
254 input.timer_flag = timer_flag; 254 input.timer_flag = timer_flag;
255 255
256 spin_lock_irqsave(lock, flags); 256 raw_spin_lock_irqsave(lock, flags);
257 if (!timer_stats_active) 257 if (!timer_stats_active)
258 goto out_unlock; 258 goto out_unlock;
259 259
@@ -264,7 +264,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
264 atomic_inc(&overflow_count); 264 atomic_inc(&overflow_count);
265 265
266 out_unlock: 266 out_unlock:
267 spin_unlock_irqrestore(lock, flags); 267 raw_spin_unlock_irqrestore(lock, flags);
268} 268}
269 269
270static void print_name_offset(struct seq_file *m, unsigned long addr) 270static void print_name_offset(struct seq_file *m, unsigned long addr)
@@ -348,9 +348,11 @@ static void sync_access(void)
348 int cpu; 348 int cpu;
349 349
350 for_each_online_cpu(cpu) { 350 for_each_online_cpu(cpu) {
351 spin_lock_irqsave(&per_cpu(lookup_lock, cpu), flags); 351 raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu);
352
353 raw_spin_lock_irqsave(lock, flags);
352 /* nothing */ 354 /* nothing */
353 spin_unlock_irqrestore(&per_cpu(lookup_lock, cpu), flags); 355 raw_spin_unlock_irqrestore(lock, flags);
354 } 356 }
355} 357}
356 358
@@ -408,7 +410,7 @@ void __init init_timer_stats(void)
408 int cpu; 410 int cpu;
409 411
410 for_each_possible_cpu(cpu) 412 for_each_possible_cpu(cpu)
411 spin_lock_init(&per_cpu(lookup_lock, cpu)); 413 raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu));
412} 414}
413 415
414static int __init init_tstats_procfs(void) 416static int __init init_tstats_procfs(void)
diff --git a/kernel/timer.c b/kernel/timer.c
index 5db5a8d26811..c61a7949387f 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -656,8 +656,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
656 656
657 debug_activate(timer, expires); 657 debug_activate(timer, expires);
658 658
659 new_base = __get_cpu_var(tvec_bases);
660
661 cpu = smp_processor_id(); 659 cpu = smp_processor_id();
662 660
663#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) 661#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
@@ -1200,6 +1198,7 @@ void update_process_times(int user_tick)
1200 run_local_timers(); 1198 run_local_timers();
1201 rcu_check_callbacks(cpu, user_tick); 1199 rcu_check_callbacks(cpu, user_tick);
1202 printk_tick(); 1200 printk_tick();
1201 perf_event_do_pending();
1203 scheduler_tick(); 1202 scheduler_tick();
1204 run_posix_cpu_timers(p); 1203 run_posix_cpu_timers(p);
1205} 1204}
@@ -1211,8 +1210,6 @@ static void run_timer_softirq(struct softirq_action *h)
1211{ 1210{
1212 struct tvec_base *base = __get_cpu_var(tvec_bases); 1211 struct tvec_base *base = __get_cpu_var(tvec_bases);
1213 1212
1214 perf_event_do_pending();
1215
1216 hrtimer_run_pending(); 1213 hrtimer_run_pending();
1217 1214
1218 if (time_after_eq(jiffies, base->timer_jiffies)) 1215 if (time_after_eq(jiffies, base->timer_jiffies))
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d006554888dc..13e13d428cd3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -12,39 +12,37 @@ config NOP_TRACER
12config HAVE_FTRACE_NMI_ENTER 12config HAVE_FTRACE_NMI_ENTER
13 bool 13 bool
14 help 14 help
15 See Documentation/trace/ftrace-implementation.txt 15 See Documentation/trace/ftrace-design.txt
16 16
17config HAVE_FUNCTION_TRACER 17config HAVE_FUNCTION_TRACER
18 bool 18 bool
19 help 19 help
20 See Documentation/trace/ftrace-implementation.txt 20 See Documentation/trace/ftrace-design.txt
21 21
22config HAVE_FUNCTION_GRAPH_TRACER 22config HAVE_FUNCTION_GRAPH_TRACER
23 bool 23 bool
24 help 24 help
25 See Documentation/trace/ftrace-implementation.txt 25 See Documentation/trace/ftrace-design.txt
26 26
27config HAVE_FUNCTION_GRAPH_FP_TEST 27config HAVE_FUNCTION_GRAPH_FP_TEST
28 bool 28 bool
29 help 29 help
30 An arch may pass in a unique value (frame pointer) to both the 30 See Documentation/trace/ftrace-design.txt
31 entering and exiting of a function. On exit, the value is compared
32 and if it does not match, then it will panic the kernel.
33 31
34config HAVE_FUNCTION_TRACE_MCOUNT_TEST 32config HAVE_FUNCTION_TRACE_MCOUNT_TEST
35 bool 33 bool
36 help 34 help
37 See Documentation/trace/ftrace-implementation.txt 35 See Documentation/trace/ftrace-design.txt
38 36
39config HAVE_DYNAMIC_FTRACE 37config HAVE_DYNAMIC_FTRACE
40 bool 38 bool
41 help 39 help
42 See Documentation/trace/ftrace-implementation.txt 40 See Documentation/trace/ftrace-design.txt
43 41
44config HAVE_FTRACE_MCOUNT_RECORD 42config HAVE_FTRACE_MCOUNT_RECORD
45 bool 43 bool
46 help 44 help
47 See Documentation/trace/ftrace-implementation.txt 45 See Documentation/trace/ftrace-design.txt
48 46
49config HAVE_HW_BRANCH_TRACER 47config HAVE_HW_BRANCH_TRACER
50 bool 48 bool
@@ -52,7 +50,7 @@ config HAVE_HW_BRANCH_TRACER
52config HAVE_SYSCALL_TRACEPOINTS 50config HAVE_SYSCALL_TRACEPOINTS
53 bool 51 bool
54 help 52 help
55 See Documentation/trace/ftrace-implementation.txt 53 See Documentation/trace/ftrace-design.txt
56 54
57config TRACER_MAX_TRACE 55config TRACER_MAX_TRACE
58 bool 56 bool
@@ -83,7 +81,7 @@ config RING_BUFFER_ALLOW_SWAP
83# This allows those options to appear when no other tracer is selected. But the 81# This allows those options to appear when no other tracer is selected. But the
84# options do not appear when something else selects it. We need the two options 82# options do not appear when something else selects it. We need the two options
85# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the 83# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
86# hidding of the automatic options. 84# hiding of the automatic options.
87 85
88config TRACING 86config TRACING
89 bool 87 bool
@@ -119,7 +117,7 @@ menuconfig FTRACE
119 bool "Tracers" 117 bool "Tracers"
120 default y if DEBUG_KERNEL 118 default y if DEBUG_KERNEL
121 help 119 help
122 Enable the kernel tracing infrastructure. 120 Enable the kernel tracing infrastructure.
123 121
124if FTRACE 122if FTRACE
125 123
@@ -133,7 +131,7 @@ config FUNCTION_TRACER
133 help 131 help
134 Enable the kernel to trace every kernel function. This is done 132 Enable the kernel to trace every kernel function. This is done
135 by using a compiler feature to insert a small, 5-byte No-Operation 133 by using a compiler feature to insert a small, 5-byte No-Operation
136 instruction to the beginning of every kernel function, which NOP 134 instruction at the beginning of every kernel function, which NOP
137 sequence is then dynamically patched into a tracer call when 135 sequence is then dynamically patched into a tracer call when
138 tracing is enabled by the administrator. If it's runtime disabled 136 tracing is enabled by the administrator. If it's runtime disabled
139 (the bootup default), then the overhead of the instructions is very 137 (the bootup default), then the overhead of the instructions is very
@@ -150,7 +148,7 @@ config FUNCTION_GRAPH_TRACER
150 and its entry. 148 and its entry.
151 Its first purpose is to trace the duration of functions and 149 Its first purpose is to trace the duration of functions and
152 draw a call graph for each thread with some information like 150 draw a call graph for each thread with some information like
153 the return value. This is done by setting the current return 151 the return value. This is done by setting the current return
154 address on the current task structure into a stack of calls. 152 address on the current task structure into a stack of calls.
155 153
156 154
@@ -173,7 +171,7 @@ config IRQSOFF_TRACER
173 171
174 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 172 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
175 173
176 (Note that kernel size and overhead increases with this option 174 (Note that kernel size and overhead increase with this option
177 enabled. This option and the preempt-off timing option can be 175 enabled. This option and the preempt-off timing option can be
178 used together or separately.) 176 used together or separately.)
179 177
@@ -186,7 +184,7 @@ config PREEMPT_TRACER
186 select TRACER_MAX_TRACE 184 select TRACER_MAX_TRACE
187 select RING_BUFFER_ALLOW_SWAP 185 select RING_BUFFER_ALLOW_SWAP
188 help 186 help
189 This option measures the time spent in preemption off critical 187 This option measures the time spent in preemption-off critical
190 sections, with microsecond accuracy. 188 sections, with microsecond accuracy.
191 189
192 The default measurement method is a maximum search, which is 190 The default measurement method is a maximum search, which is
@@ -195,7 +193,7 @@ config PREEMPT_TRACER
195 193
196 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency 194 echo 0 > /sys/kernel/debug/tracing/tracing_max_latency
197 195
198 (Note that kernel size and overhead increases with this option 196 (Note that kernel size and overhead increase with this option
199 enabled. This option and the irqs-off timing option can be 197 enabled. This option and the irqs-off timing option can be
200 used together or separately.) 198 used together or separately.)
201 199
@@ -222,7 +220,7 @@ config ENABLE_DEFAULT_TRACERS
222 depends on !GENERIC_TRACER 220 depends on !GENERIC_TRACER
223 select TRACING 221 select TRACING
224 help 222 help
225 This tracer hooks to various trace points in the kernel 223 This tracer hooks to various trace points in the kernel,
226 allowing the user to pick and choose which trace point they 224 allowing the user to pick and choose which trace point they
227 want to trace. It also includes the sched_switch tracer plugin. 225 want to trace. It also includes the sched_switch tracer plugin.
228 226
@@ -265,19 +263,19 @@ choice
265 The likely/unlikely profiler only looks at the conditions that 263 The likely/unlikely profiler only looks at the conditions that
266 are annotated with a likely or unlikely macro. 264 are annotated with a likely or unlikely macro.
267 265
268 The "all branch" profiler will profile every if statement in the 266 The "all branch" profiler will profile every if-statement in the
269 kernel. This profiler will also enable the likely/unlikely 267 kernel. This profiler will also enable the likely/unlikely
270 profiler as well. 268 profiler.
271 269
272 Either of the above profilers add a bit of overhead to the system. 270 Either of the above profilers adds a bit of overhead to the system.
273 If unsure choose "No branch profiling". 271 If unsure, choose "No branch profiling".
274 272
275config BRANCH_PROFILE_NONE 273config BRANCH_PROFILE_NONE
276 bool "No branch profiling" 274 bool "No branch profiling"
277 help 275 help
278 No branch profiling. Branch profiling adds a bit of overhead. 276 No branch profiling. Branch profiling adds a bit of overhead.
279 Only enable it if you want to analyse the branching behavior. 277 Only enable it if you want to analyse the branching behavior.
280 Otherwise keep it disabled. 278 Otherwise keep it disabled.
281 279
282config PROFILE_ANNOTATED_BRANCHES 280config PROFILE_ANNOTATED_BRANCHES
283 bool "Trace likely/unlikely profiler" 281 bool "Trace likely/unlikely profiler"
@@ -288,7 +286,7 @@ config PROFILE_ANNOTATED_BRANCHES
288 286
289 /sys/kernel/debug/tracing/profile_annotated_branch 287 /sys/kernel/debug/tracing/profile_annotated_branch
290 288
291 Note: this will add a significant overhead, only turn this 289 Note: this will add a significant overhead; only turn this
292 on if you need to profile the system's use of these macros. 290 on if you need to profile the system's use of these macros.
293 291
294config PROFILE_ALL_BRANCHES 292config PROFILE_ALL_BRANCHES
@@ -305,7 +303,7 @@ config PROFILE_ALL_BRANCHES
305 303
306 This configuration, when enabled, will impose a great overhead 304 This configuration, when enabled, will impose a great overhead
307 on the system. This should only be enabled when the system 305 on the system. This should only be enabled when the system
308 is to be analyzed 306 is to be analyzed in much detail.
309endchoice 307endchoice
310 308
311config TRACING_BRANCHES 309config TRACING_BRANCHES
@@ -330,15 +328,6 @@ config BRANCH_TRACER
330 328
331 Say N if unsure. 329 Say N if unsure.
332 330
333config POWER_TRACER
334 bool "Trace power consumption behavior"
335 depends on X86
336 select GENERIC_TRACER
337 help
338 This tracer helps developers to analyze and optimize the kernels
339 power management decisions, specifically the C-state and P-state
340 behavior.
341
342config KSYM_TRACER 331config KSYM_TRACER
343 bool "Trace read and write access on kernel memory locations" 332 bool "Trace read and write access on kernel memory locations"
344 depends on HAVE_HW_BREAKPOINT 333 depends on HAVE_HW_BREAKPOINT
@@ -391,14 +380,14 @@ config HW_BRANCH_TRACER
391 select GENERIC_TRACER 380 select GENERIC_TRACER
392 help 381 help
393 This tracer records all branches on the system in a circular 382 This tracer records all branches on the system in a circular
394 buffer giving access to the last N branches for each cpu. 383 buffer, giving access to the last N branches for each cpu.
395 384
396config KMEMTRACE 385config KMEMTRACE
397 bool "Trace SLAB allocations" 386 bool "Trace SLAB allocations"
398 select GENERIC_TRACER 387 select GENERIC_TRACER
399 help 388 help
400 kmemtrace provides tracing for slab allocator functions, such as 389 kmemtrace provides tracing for slab allocator functions, such as
401 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected 390 kmalloc, kfree, kmem_cache_alloc, kmem_cache_free, etc. Collected
402 data is then fed to the userspace application in order to analyse 391 data is then fed to the userspace application in order to analyse
403 allocation hotspots, internal fragmentation and so on, making it 392 allocation hotspots, internal fragmentation and so on, making it
404 possible to see how well an allocator performs, as well as debug 393 possible to see how well an allocator performs, as well as debug
@@ -417,15 +406,15 @@ config WORKQUEUE_TRACER
417 bool "Trace workqueues" 406 bool "Trace workqueues"
418 select GENERIC_TRACER 407 select GENERIC_TRACER
419 help 408 help
420 The workqueue tracer provides some statistical informations 409 The workqueue tracer provides some statistical information
421 about each cpu workqueue thread such as the number of the 410 about each cpu workqueue thread such as the number of the
422 works inserted and executed since their creation. It can help 411 works inserted and executed since their creation. It can help
423 to evaluate the amount of work each of them have to perform. 412 to evaluate the amount of work each of them has to perform.
424 For example it can help a developer to decide whether he should 413 For example it can help a developer to decide whether he should
425 choose a per cpu workqueue instead of a singlethreaded one. 414 choose a per-cpu workqueue instead of a singlethreaded one.
426 415
427config BLK_DEV_IO_TRACE 416config BLK_DEV_IO_TRACE
428 bool "Support for tracing block io actions" 417 bool "Support for tracing block IO actions"
429 depends on SYSFS 418 depends on SYSFS
430 depends on BLOCK 419 depends on BLOCK
431 select RELAY 420 select RELAY
@@ -451,20 +440,20 @@ config BLK_DEV_IO_TRACE
451 440
452config KPROBE_EVENT 441config KPROBE_EVENT
453 depends on KPROBES 442 depends on KPROBES
454 depends on X86 443 depends on HAVE_REGS_AND_STACK_ACCESS_API
455 bool "Enable kprobes-based dynamic events" 444 bool "Enable kprobes-based dynamic events"
456 select TRACING 445 select TRACING
457 default y 446 default y
458 help 447 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly 448 This allows the user to add tracing events (similar to tracepoints)
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt 449 on the fly via the ftrace interface. See
461 for more details. 450 Documentation/trace/kprobetrace.txt for more details.
462 451
463 Those events can be inserted wherever kprobes can probe, and record 452 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values. 453 various register and memory values.
465 454
466 This option is also required by perf-probe subcommand of perf tools. If 455 This option is also required by perf-probe subcommand of perf tools.
467 you want to use perf tools, this option is strongly recommended. 456 If you want to use perf tools, this option is strongly recommended.
468 457
469config DYNAMIC_FTRACE 458config DYNAMIC_FTRACE
470 bool "enable/disable ftrace tracepoints dynamically" 459 bool "enable/disable ftrace tracepoints dynamically"
@@ -472,32 +461,32 @@ config DYNAMIC_FTRACE
472 depends on HAVE_DYNAMIC_FTRACE 461 depends on HAVE_DYNAMIC_FTRACE
473 default y 462 default y
474 help 463 help
475 This option will modify all the calls to ftrace dynamically 464 This option will modify all the calls to ftrace dynamically
476 (will patch them out of the binary image and replaces them 465 (will patch them out of the binary image and replace them
477 with a No-Op instruction) as they are called. A table is 466 with a No-Op instruction) as they are called. A table is
478 created to dynamically enable them again. 467 created to dynamically enable them again.
479 468
480 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise 469 This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
481 has native performance as long as no tracing is active. 470 otherwise has native performance as long as no tracing is active.
482 471
483 The changes to the code are done by a kernel thread that 472 The changes to the code are done by a kernel thread that
484 wakes up once a second and checks to see if any ftrace calls 473 wakes up once a second and checks to see if any ftrace calls
485 were made. If so, it runs stop_machine (stops all CPUS) 474 were made. If so, it runs stop_machine (stops all CPUS)
486 and modifies the code to jump over the call to ftrace. 475 and modifies the code to jump over the call to ftrace.
487 476
488config FUNCTION_PROFILER 477config FUNCTION_PROFILER
489 bool "Kernel function profiler" 478 bool "Kernel function profiler"
490 depends on FUNCTION_TRACER 479 depends on FUNCTION_TRACER
491 default n 480 default n
492 help 481 help
493 This option enables the kernel function profiler. A file is created 482 This option enables the kernel function profiler. A file is created
494 in debugfs called function_profile_enabled which defaults to zero. 483 in debugfs called function_profile_enabled which defaults to zero.
495 When a 1 is echoed into this file profiling begins, and when a 484 When a 1 is echoed into this file profiling begins, and when a
496 zero is entered, profiling stops. A file in the trace_stats 485 zero is entered, profiling stops. A "functions" file is created in
497 directory called functions, that show the list of functions that 486 the trace_stats directory; this file shows the list of functions that
498 have been hit and their counters. 487 have been hit and their counters.
499 488
500 If in doubt, say N 489 If in doubt, say N.
501 490
502config FTRACE_MCOUNT_RECORD 491config FTRACE_MCOUNT_RECORD
503 def_bool y 492 def_bool y
@@ -556,8 +545,8 @@ config RING_BUFFER_BENCHMARK
556 tristate "Ring buffer benchmark stress tester" 545 tristate "Ring buffer benchmark stress tester"
557 depends on RING_BUFFER 546 depends on RING_BUFFER
558 help 547 help
559 This option creates a test to stress the ring buffer and bench mark it. 548 This option creates a test to stress the ring buffer and benchmark it.
560 It creates its own ring buffer such that it will not interfer with 549 It creates its own ring buffer such that it will not interfere with
561 any other users of the ring buffer (such as ftrace). It then creates 550 any other users of the ring buffer (such as ftrace). It then creates
562 a producer and consumer that will run for 10 seconds and sleep for 551 a producer and consumer that will run for 10 seconds and sleep for
563 10 seconds. Each interval it will print out the number of events 552 10 seconds. Each interval it will print out the number of events
@@ -566,7 +555,7 @@ config RING_BUFFER_BENCHMARK
566 It does not disable interrupts or raise its priority, so it may be 555 It does not disable interrupts or raise its priority, so it may be
567 affected by processes that are running. 556 affected by processes that are running.
568 557
569 If unsure, say N 558 If unsure, say N.
570 559
571endif # FTRACE 560endif # FTRACE
572 561
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd89ec77..d00c6fe23f54 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,7 +51,9 @@ endif
51obj-$(CONFIG_EVENT_TRACING) += trace_events.o 51obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
56endif
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 59obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e51a1bcb7bed..83783579378f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,7 +22,6 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/kprobes.h>
26#include <linux/ftrace.h> 25#include <linux/ftrace.h>
27#include <linux/sysctl.h> 26#include <linux/sysctl.h>
28#include <linux/ctype.h> 27#include <linux/ctype.h>
@@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records;
898 } \ 897 } \
899 } 898 }
900 899
901#ifdef CONFIG_KPROBES
902
903static int frozen_record_count;
904
905static inline void freeze_record(struct dyn_ftrace *rec)
906{
907 if (!(rec->flags & FTRACE_FL_FROZEN)) {
908 rec->flags |= FTRACE_FL_FROZEN;
909 frozen_record_count++;
910 }
911}
912
913static inline void unfreeze_record(struct dyn_ftrace *rec)
914{
915 if (rec->flags & FTRACE_FL_FROZEN) {
916 rec->flags &= ~FTRACE_FL_FROZEN;
917 frozen_record_count--;
918 }
919}
920
921static inline int record_frozen(struct dyn_ftrace *rec)
922{
923 return rec->flags & FTRACE_FL_FROZEN;
924}
925#else
926# define freeze_record(rec) ({ 0; })
927# define unfreeze_record(rec) ({ 0; })
928# define record_frozen(rec) ({ 0; })
929#endif /* CONFIG_KPROBES */
930
931static void ftrace_free_rec(struct dyn_ftrace *rec) 900static void ftrace_free_rec(struct dyn_ftrace *rec)
932{ 901{
933 rec->freelist = ftrace_free_records; 902 rec->freelist = ftrace_free_records;
@@ -1025,6 +994,21 @@ static void ftrace_bug(int failed, unsigned long ip)
1025} 994}
1026 995
1027 996
997/* Return 1 if the address range is reserved for ftrace */
998int ftrace_text_reserved(void *start, void *end)
999{
1000 struct dyn_ftrace *rec;
1001 struct ftrace_page *pg;
1002
1003 do_for_each_ftrace_rec(pg, rec) {
1004 if (rec->ip <= (unsigned long)end &&
1005 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1006 return 1;
1007 } while_for_each_ftrace_rec();
1008 return 0;
1009}
1010
1011
1028static int 1012static int
1029__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1013__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1030{ 1014{
@@ -1076,14 +1060,6 @@ static void ftrace_replace_code(int enable)
1076 !(rec->flags & FTRACE_FL_CONVERTED)) 1060 !(rec->flags & FTRACE_FL_CONVERTED))
1077 continue; 1061 continue;
1078 1062
1079 /* ignore updates to this record's mcount site */
1080 if (get_kprobe((void *)rec->ip)) {
1081 freeze_record(rec);
1082 continue;
1083 } else {
1084 unfreeze_record(rec);
1085 }
1086
1087 failed = __ftrace_replace_code(rec, enable); 1063 failed = __ftrace_replace_code(rec, enable);
1088 if (failed) { 1064 if (failed) {
1089 rec->flags |= FTRACE_FL_FAILED; 1065 rec->flags |= FTRACE_FL_FAILED;
@@ -1690,7 +1666,7 @@ ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1690static int ftrace_match(char *str, char *regex, int len, int type) 1666static int ftrace_match(char *str, char *regex, int len, int type)
1691{ 1667{
1692 int matched = 0; 1668 int matched = 0;
1693 char *ptr; 1669 int slen;
1694 1670
1695 switch (type) { 1671 switch (type) {
1696 case MATCH_FULL: 1672 case MATCH_FULL:
@@ -1706,8 +1682,8 @@ static int ftrace_match(char *str, char *regex, int len, int type)
1706 matched = 1; 1682 matched = 1;
1707 break; 1683 break;
1708 case MATCH_END_ONLY: 1684 case MATCH_END_ONLY:
1709 ptr = strstr(str, regex); 1685 slen = strlen(str);
1710 if (ptr && (ptr[len] == 0)) 1686 if (slen >= len && memcmp(str + slen - len, regex, len) == 0)
1711 matched = 1; 1687 matched = 1;
1712 break; 1688 break;
1713 } 1689 }
@@ -1724,7 +1700,7 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
1724 return ftrace_match(str, regex, len, type); 1700 return ftrace_match(str, regex, len, type);
1725} 1701}
1726 1702
1727static void ftrace_match_records(char *buff, int len, int enable) 1703static int ftrace_match_records(char *buff, int len, int enable)
1728{ 1704{
1729 unsigned int search_len; 1705 unsigned int search_len;
1730 struct ftrace_page *pg; 1706 struct ftrace_page *pg;
@@ -1733,6 +1709,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1733 char *search; 1709 char *search;
1734 int type; 1710 int type;
1735 int not; 1711 int not;
1712 int found = 0;
1736 1713
1737 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1714 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1738 type = filter_parse_regex(buff, len, &search, &not); 1715 type = filter_parse_regex(buff, len, &search, &not);
@@ -1750,6 +1727,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
1750 rec->flags &= ~flag; 1727 rec->flags &= ~flag;
1751 else 1728 else
1752 rec->flags |= flag; 1729 rec->flags |= flag;
1730 found = 1;
1753 } 1731 }
1754 /* 1732 /*
1755 * Only enable filtering if we have a function that 1733 * Only enable filtering if we have a function that
@@ -1759,6 +1737,8 @@ static void ftrace_match_records(char *buff, int len, int enable)
1759 ftrace_filtered = 1; 1737 ftrace_filtered = 1;
1760 } while_for_each_ftrace_rec(); 1738 } while_for_each_ftrace_rec();
1761 mutex_unlock(&ftrace_lock); 1739 mutex_unlock(&ftrace_lock);
1740
1741 return found;
1762} 1742}
1763 1743
1764static int 1744static int
@@ -1780,7 +1760,7 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
1780 return 1; 1760 return 1;
1781} 1761}
1782 1762
1783static void ftrace_match_module_records(char *buff, char *mod, int enable) 1763static int ftrace_match_module_records(char *buff, char *mod, int enable)
1784{ 1764{
1785 unsigned search_len = 0; 1765 unsigned search_len = 0;
1786 struct ftrace_page *pg; 1766 struct ftrace_page *pg;
@@ -1789,6 +1769,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1789 char *search = buff; 1769 char *search = buff;
1790 unsigned long flag; 1770 unsigned long flag;
1791 int not = 0; 1771 int not = 0;
1772 int found = 0;
1792 1773
1793 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1774 flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1794 1775
@@ -1819,12 +1800,15 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
1819 rec->flags &= ~flag; 1800 rec->flags &= ~flag;
1820 else 1801 else
1821 rec->flags |= flag; 1802 rec->flags |= flag;
1803 found = 1;
1822 } 1804 }
1823 if (enable && (rec->flags & FTRACE_FL_FILTER)) 1805 if (enable && (rec->flags & FTRACE_FL_FILTER))
1824 ftrace_filtered = 1; 1806 ftrace_filtered = 1;
1825 1807
1826 } while_for_each_ftrace_rec(); 1808 } while_for_each_ftrace_rec();
1827 mutex_unlock(&ftrace_lock); 1809 mutex_unlock(&ftrace_lock);
1810
1811 return found;
1828} 1812}
1829 1813
1830/* 1814/*
@@ -1853,8 +1837,9 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
1853 if (!strlen(mod)) 1837 if (!strlen(mod))
1854 return -EINVAL; 1838 return -EINVAL;
1855 1839
1856 ftrace_match_module_records(func, mod, enable); 1840 if (ftrace_match_module_records(func, mod, enable))
1857 return 0; 1841 return 0;
1842 return -EINVAL;
1858} 1843}
1859 1844
1860static struct ftrace_func_command ftrace_mod_cmd = { 1845static struct ftrace_func_command ftrace_mod_cmd = {
@@ -2151,8 +2136,9 @@ static int ftrace_process_regex(char *buff, int len, int enable)
2151 func = strsep(&next, ":"); 2136 func = strsep(&next, ":");
2152 2137
2153 if (!next) { 2138 if (!next) {
2154 ftrace_match_records(func, len, enable); 2139 if (ftrace_match_records(func, len, enable))
2155 return 0; 2140 return 0;
2141 return ret;
2156 } 2142 }
2157 2143
2158 /* command found */ 2144 /* command found */
@@ -2198,10 +2184,9 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2198 !trace_parser_cont(parser)) { 2184 !trace_parser_cont(parser)) {
2199 ret = ftrace_process_regex(parser->buffer, 2185 ret = ftrace_process_regex(parser->buffer,
2200 parser->idx, enable); 2186 parser->idx, enable);
2187 trace_parser_clear(parser);
2201 if (ret) 2188 if (ret)
2202 goto out_unlock; 2189 goto out_unlock;
2203
2204 trace_parser_clear(parser);
2205 } 2190 }
2206 2191
2207 ret = read; 2192 ret = read;
@@ -2417,6 +2402,7 @@ static const struct file_operations ftrace_notrace_fops = {
2417static DEFINE_MUTEX(graph_lock); 2402static DEFINE_MUTEX(graph_lock);
2418 2403
2419int ftrace_graph_count; 2404int ftrace_graph_count;
2405int ftrace_graph_filter_enabled;
2420unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; 2406unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly;
2421 2407
2422static void * 2408static void *
@@ -2439,7 +2425,7 @@ static void *g_start(struct seq_file *m, loff_t *pos)
2439 mutex_lock(&graph_lock); 2425 mutex_lock(&graph_lock);
2440 2426
2441 /* Nothing, tell g_show to print all functions are enabled */ 2427 /* Nothing, tell g_show to print all functions are enabled */
2442 if (!ftrace_graph_count && !*pos) 2428 if (!ftrace_graph_filter_enabled && !*pos)
2443 return (void *)1; 2429 return (void *)1;
2444 2430
2445 return __g_next(m, pos); 2431 return __g_next(m, pos);
@@ -2485,6 +2471,7 @@ ftrace_graph_open(struct inode *inode, struct file *file)
2485 mutex_lock(&graph_lock); 2471 mutex_lock(&graph_lock);
2486 if ((file->f_mode & FMODE_WRITE) && 2472 if ((file->f_mode & FMODE_WRITE) &&
2487 (file->f_flags & O_TRUNC)) { 2473 (file->f_flags & O_TRUNC)) {
2474 ftrace_graph_filter_enabled = 0;
2488 ftrace_graph_count = 0; 2475 ftrace_graph_count = 0;
2489 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); 2476 memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs));
2490 } 2477 }
@@ -2510,7 +2497,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2510 struct dyn_ftrace *rec; 2497 struct dyn_ftrace *rec;
2511 struct ftrace_page *pg; 2498 struct ftrace_page *pg;
2512 int search_len; 2499 int search_len;
2513 int found = 0; 2500 int fail = 1;
2514 int type, not; 2501 int type, not;
2515 char *search; 2502 char *search;
2516 bool exists; 2503 bool exists;
@@ -2521,38 +2508,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
2521 2508
2522 /* decode regex */ 2509 /* decode regex */
2523 type = filter_parse_regex(buffer, strlen(buffer), &search, &not); 2510 type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
2524 if (not) 2511 if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
2525 return -EINVAL; 2512 return -EBUSY;
2526 2513
2527 search_len = strlen(search); 2514 search_len = strlen(search);
2528 2515
2529 mutex_lock(&ftrace_lock); 2516 mutex_lock(&ftrace_lock);
2530 do_for_each_ftrace_rec(pg, rec) { 2517 do_for_each_ftrace_rec(pg, rec) {
2531 2518
2532 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2533 break;
2534
2535 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) 2519 if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
2536 continue; 2520 continue;
2537 2521
2538 if (ftrace_match_record(rec, search, search_len, type)) { 2522 if (ftrace_match_record(rec, search, search_len, type)) {
2539 /* ensure it is not already in the array */ 2523 /* if it is in the array */
2540 exists = false; 2524 exists = false;
2541 for (i = 0; i < *idx; i++) 2525 for (i = 0; i < *idx; i++) {
2542 if (array[i] == rec->ip) { 2526 if (array[i] == rec->ip) {
2543 exists = true; 2527 exists = true;
2544 break; 2528 break;
2545 } 2529 }
2546 if (!exists) { 2530 }
2547 array[(*idx)++] = rec->ip; 2531
2548 found = 1; 2532 if (!not) {
2533 fail = 0;
2534 if (!exists) {
2535 array[(*idx)++] = rec->ip;
2536 if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
2537 goto out;
2538 }
2539 } else {
2540 if (exists) {
2541 array[i] = array[--(*idx)];
2542 array[*idx] = 0;
2543 fail = 0;
2544 }
2549 } 2545 }
2550 } 2546 }
2551 } while_for_each_ftrace_rec(); 2547 } while_for_each_ftrace_rec();
2552 2548out:
2553 mutex_unlock(&ftrace_lock); 2549 mutex_unlock(&ftrace_lock);
2554 2550
2555 return found ? 0 : -EINVAL; 2551 if (fail)
2552 return -EINVAL;
2553
2554 ftrace_graph_filter_enabled = 1;
2555 return 0;
2556} 2556}
2557 2557
2558static ssize_t 2558static ssize_t
@@ -2562,16 +2562,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
2562 struct trace_parser parser; 2562 struct trace_parser parser;
2563 ssize_t read, ret; 2563 ssize_t read, ret;
2564 2564
2565 if (!cnt || cnt < 0) 2565 if (!cnt)
2566 return 0; 2566 return 0;
2567 2567
2568 mutex_lock(&graph_lock); 2568 mutex_lock(&graph_lock);
2569 2569
2570 if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) {
2571 ret = -EBUSY;
2572 goto out_unlock;
2573 }
2574
2575 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { 2570 if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) {
2576 ret = -ENOMEM; 2571 ret = -ENOMEM;
2577 goto out_unlock; 2572 goto out_unlock;
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index e06c6e3d56a3..9f4f565b01e6 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -14,7 +14,5 @@
14#define CREATE_TRACE_POINTS 14#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16 16
17EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
18EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
19EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); 17EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
20 18
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a1ca4956ab5e..8c1b2d290718 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -423,7 +423,7 @@ struct ring_buffer_per_cpu {
423 int cpu; 423 int cpu;
424 struct ring_buffer *buffer; 424 struct ring_buffer *buffer;
425 spinlock_t reader_lock; /* serialize readers */ 425 spinlock_t reader_lock; /* serialize readers */
426 raw_spinlock_t lock; 426 arch_spinlock_t lock;
427 struct lock_class_key lock_key; 427 struct lock_class_key lock_key;
428 struct list_head *pages; 428 struct list_head *pages;
429 struct buffer_page *head_page; /* read from head */ 429 struct buffer_page *head_page; /* read from head */
@@ -464,6 +464,8 @@ struct ring_buffer_iter {
464 struct ring_buffer_per_cpu *cpu_buffer; 464 struct ring_buffer_per_cpu *cpu_buffer;
465 unsigned long head; 465 unsigned long head;
466 struct buffer_page *head_page; 466 struct buffer_page *head_page;
467 struct buffer_page *cache_reader_page;
468 unsigned long cache_read;
467 u64 read_stamp; 469 u64 read_stamp;
468}; 470};
469 471
@@ -998,7 +1000,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
998 cpu_buffer->buffer = buffer; 1000 cpu_buffer->buffer = buffer;
999 spin_lock_init(&cpu_buffer->reader_lock); 1001 spin_lock_init(&cpu_buffer->reader_lock);
1000 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1002 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1001 cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1003 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1002 1004
1003 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), 1005 bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
1004 GFP_KERNEL, cpu_to_node(cpu)); 1006 GFP_KERNEL, cpu_to_node(cpu));
@@ -1193,9 +1195,6 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1193 struct list_head *p; 1195 struct list_head *p;
1194 unsigned i; 1196 unsigned i;
1195 1197
1196 atomic_inc(&cpu_buffer->record_disabled);
1197 synchronize_sched();
1198
1199 spin_lock_irq(&cpu_buffer->reader_lock); 1198 spin_lock_irq(&cpu_buffer->reader_lock);
1200 rb_head_page_deactivate(cpu_buffer); 1199 rb_head_page_deactivate(cpu_buffer);
1201 1200
@@ -1211,12 +1210,9 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1211 return; 1210 return;
1212 1211
1213 rb_reset_cpu(cpu_buffer); 1212 rb_reset_cpu(cpu_buffer);
1214 spin_unlock_irq(&cpu_buffer->reader_lock);
1215
1216 rb_check_pages(cpu_buffer); 1213 rb_check_pages(cpu_buffer);
1217 1214
1218 atomic_dec(&cpu_buffer->record_disabled); 1215 spin_unlock_irq(&cpu_buffer->reader_lock);
1219
1220} 1216}
1221 1217
1222static void 1218static void
@@ -1227,9 +1223,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1227 struct list_head *p; 1223 struct list_head *p;
1228 unsigned i; 1224 unsigned i;
1229 1225
1230 atomic_inc(&cpu_buffer->record_disabled);
1231 synchronize_sched();
1232
1233 spin_lock_irq(&cpu_buffer->reader_lock); 1226 spin_lock_irq(&cpu_buffer->reader_lock);
1234 rb_head_page_deactivate(cpu_buffer); 1227 rb_head_page_deactivate(cpu_buffer);
1235 1228
@@ -1242,11 +1235,9 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1242 list_add_tail(&bpage->list, cpu_buffer->pages); 1235 list_add_tail(&bpage->list, cpu_buffer->pages);
1243 } 1236 }
1244 rb_reset_cpu(cpu_buffer); 1237 rb_reset_cpu(cpu_buffer);
1245 spin_unlock_irq(&cpu_buffer->reader_lock);
1246
1247 rb_check_pages(cpu_buffer); 1238 rb_check_pages(cpu_buffer);
1248 1239
1249 atomic_dec(&cpu_buffer->record_disabled); 1240 spin_unlock_irq(&cpu_buffer->reader_lock);
1250} 1241}
1251 1242
1252/** 1243/**
@@ -1254,11 +1245,6 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1254 * @buffer: the buffer to resize. 1245 * @buffer: the buffer to resize.
1255 * @size: the new size. 1246 * @size: the new size.
1256 * 1247 *
1257 * The tracer is responsible for making sure that the buffer is
1258 * not being used while changing the size.
1259 * Note: We may be able to change the above requirement by using
1260 * RCU synchronizations.
1261 *
1262 * Minimum size is 2 * BUF_PAGE_SIZE. 1248 * Minimum size is 2 * BUF_PAGE_SIZE.
1263 * 1249 *
1264 * Returns -1 on failure. 1250 * Returns -1 on failure.
@@ -1290,6 +1276,11 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1290 if (size == buffer_size) 1276 if (size == buffer_size)
1291 return size; 1277 return size;
1292 1278
1279 atomic_inc(&buffer->record_disabled);
1280
1281 /* Make sure all writers are done with this buffer. */
1282 synchronize_sched();
1283
1293 mutex_lock(&buffer->mutex); 1284 mutex_lock(&buffer->mutex);
1294 get_online_cpus(); 1285 get_online_cpus();
1295 1286
@@ -1352,6 +1343,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1352 put_online_cpus(); 1343 put_online_cpus();
1353 mutex_unlock(&buffer->mutex); 1344 mutex_unlock(&buffer->mutex);
1354 1345
1346 atomic_dec(&buffer->record_disabled);
1347
1355 return size; 1348 return size;
1356 1349
1357 free_pages: 1350 free_pages:
@@ -1361,6 +1354,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1361 } 1354 }
1362 put_online_cpus(); 1355 put_online_cpus();
1363 mutex_unlock(&buffer->mutex); 1356 mutex_unlock(&buffer->mutex);
1357 atomic_dec(&buffer->record_disabled);
1364 return -ENOMEM; 1358 return -ENOMEM;
1365 1359
1366 /* 1360 /*
@@ -1370,6 +1364,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
1370 out_fail: 1364 out_fail:
1371 put_online_cpus(); 1365 put_online_cpus();
1372 mutex_unlock(&buffer->mutex); 1366 mutex_unlock(&buffer->mutex);
1367 atomic_dec(&buffer->record_disabled);
1373 return -1; 1368 return -1;
1374} 1369}
1375EXPORT_SYMBOL_GPL(ring_buffer_resize); 1370EXPORT_SYMBOL_GPL(ring_buffer_resize);
@@ -2723,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
2723 iter->read_stamp = cpu_buffer->read_stamp; 2718 iter->read_stamp = cpu_buffer->read_stamp;
2724 else 2719 else
2725 iter->read_stamp = iter->head_page->page->time_stamp; 2720 iter->read_stamp = iter->head_page->page->time_stamp;
2721 iter->cache_reader_page = cpu_buffer->reader_page;
2722 iter->cache_read = cpu_buffer->read;
2726} 2723}
2727 2724
2728/** 2725/**
@@ -2834,7 +2831,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2834 int ret; 2831 int ret;
2835 2832
2836 local_irq_save(flags); 2833 local_irq_save(flags);
2837 __raw_spin_lock(&cpu_buffer->lock); 2834 arch_spin_lock(&cpu_buffer->lock);
2838 2835
2839 again: 2836 again:
2840 /* 2837 /*
@@ -2876,7 +2873,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2876 * Splice the empty reader page into the list around the head. 2873 * Splice the empty reader page into the list around the head.
2877 */ 2874 */
2878 reader = rb_set_head_page(cpu_buffer); 2875 reader = rb_set_head_page(cpu_buffer);
2879 cpu_buffer->reader_page->list.next = reader->list.next; 2876 cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next);
2880 cpu_buffer->reader_page->list.prev = reader->list.prev; 2877 cpu_buffer->reader_page->list.prev = reader->list.prev;
2881 2878
2882 /* 2879 /*
@@ -2913,7 +2910,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2913 * 2910 *
2914 * Now make the new head point back to the reader page. 2911 * Now make the new head point back to the reader page.
2915 */ 2912 */
2916 reader->list.next->prev = &cpu_buffer->reader_page->list; 2913 rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list;
2917 rb_inc_page(cpu_buffer, &cpu_buffer->head_page); 2914 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
2918 2915
2919 /* Finally update the reader page to the new head */ 2916 /* Finally update the reader page to the new head */
@@ -2923,7 +2920,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2923 goto again; 2920 goto again;
2924 2921
2925 out: 2922 out:
2926 __raw_spin_unlock(&cpu_buffer->lock); 2923 arch_spin_unlock(&cpu_buffer->lock);
2927 local_irq_restore(flags); 2924 local_irq_restore(flags);
2928 2925
2929 return reader; 2926 return reader;
@@ -3067,13 +3064,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3067 struct ring_buffer_event *event; 3064 struct ring_buffer_event *event;
3068 int nr_loops = 0; 3065 int nr_loops = 0;
3069 3066
3070 if (ring_buffer_iter_empty(iter))
3071 return NULL;
3072
3073 cpu_buffer = iter->cpu_buffer; 3067 cpu_buffer = iter->cpu_buffer;
3074 buffer = cpu_buffer->buffer; 3068 buffer = cpu_buffer->buffer;
3075 3069
3070 /*
3071 * Check if someone performed a consuming read to
3072 * the buffer. A consuming read invalidates the iterator
3073 * and we need to reset the iterator in this case.
3074 */
3075 if (unlikely(iter->cache_read != cpu_buffer->read ||
3076 iter->cache_reader_page != cpu_buffer->reader_page))
3077 rb_iter_reset(iter);
3078
3076 again: 3079 again:
3080 if (ring_buffer_iter_empty(iter))
3081 return NULL;
3082
3077 /* 3083 /*
3078 * We repeat when a timestamp is encountered. 3084 * We repeat when a timestamp is encountered.
3079 * We can get multiple timestamps by nested interrupts or also 3085 * We can get multiple timestamps by nested interrupts or also
@@ -3088,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3088 if (rb_per_cpu_empty(cpu_buffer)) 3094 if (rb_per_cpu_empty(cpu_buffer))
3089 return NULL; 3095 return NULL;
3090 3096
3097 if (iter->head >= local_read(&iter->head_page->page->commit)) {
3098 rb_inc_iter(iter);
3099 goto again;
3100 }
3101
3091 event = rb_iter_head_event(iter); 3102 event = rb_iter_head_event(iter);
3092 3103
3093 switch (event->type_len) { 3104 switch (event->type_len) {
@@ -3286,9 +3297,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3286 synchronize_sched(); 3297 synchronize_sched();
3287 3298
3288 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3299 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3289 __raw_spin_lock(&cpu_buffer->lock); 3300 arch_spin_lock(&cpu_buffer->lock);
3290 rb_iter_reset(iter); 3301 rb_iter_reset(iter);
3291 __raw_spin_unlock(&cpu_buffer->lock); 3302 arch_spin_unlock(&cpu_buffer->lock);
3292 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3303 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3293 3304
3294 return iter; 3305 return iter;
@@ -3408,11 +3419,11 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3408 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3419 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3409 goto out; 3420 goto out;
3410 3421
3411 __raw_spin_lock(&cpu_buffer->lock); 3422 arch_spin_lock(&cpu_buffer->lock);
3412 3423
3413 rb_reset_cpu(cpu_buffer); 3424 rb_reset_cpu(cpu_buffer);
3414 3425
3415 __raw_spin_unlock(&cpu_buffer->lock); 3426 arch_spin_unlock(&cpu_buffer->lock);
3416 3427
3417 out: 3428 out:
3418 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3429 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 874f2893cff0..032c57ca6502 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -12,7 +12,7 @@
12 * Copyright (C) 2004 William Lee Irwin III 12 * Copyright (C) 2004 William Lee Irwin III
13 */ 13 */
14#include <linux/ring_buffer.h> 14#include <linux/ring_buffer.h>
15#include <linux/utsrelease.h> 15#include <generated/utsrelease.h>
16#include <linux/stacktrace.h> 16#include <linux/stacktrace.h>
17#include <linux/writeback.h> 17#include <linux/writeback.h>
18#include <linux/kallsyms.h> 18#include <linux/kallsyms.h>
@@ -32,6 +32,7 @@
32#include <linux/splice.h> 32#include <linux/splice.h>
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h>
35#include <linux/ctype.h> 36#include <linux/ctype.h>
36#include <linux/init.h> 37#include <linux/init.h>
37#include <linux/poll.h> 38#include <linux/poll.h>
@@ -86,25 +87,22 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
86 */ 87 */
87static int tracing_disabled = 1; 88static int tracing_disabled = 1;
88 89
89DEFINE_PER_CPU(local_t, ftrace_cpu_disabled); 90DEFINE_PER_CPU(int, ftrace_cpu_disabled);
90 91
91static inline void ftrace_disable_cpu(void) 92static inline void ftrace_disable_cpu(void)
92{ 93{
93 preempt_disable(); 94 preempt_disable();
94 local_inc(&__get_cpu_var(ftrace_cpu_disabled)); 95 __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled));
95} 96}
96 97
97static inline void ftrace_enable_cpu(void) 98static inline void ftrace_enable_cpu(void)
98{ 99{
99 local_dec(&__get_cpu_var(ftrace_cpu_disabled)); 100 __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled));
100 preempt_enable(); 101 preempt_enable();
101} 102}
102 103
103static cpumask_var_t __read_mostly tracing_buffer_mask; 104static cpumask_var_t __read_mostly tracing_buffer_mask;
104 105
105/* Define which cpu buffers are currently read in trace_pipe */
106static cpumask_var_t tracing_reader_cpumask;
107
108#define for_each_tracing_cpu(cpu) \ 106#define for_each_tracing_cpu(cpu) \
109 for_each_cpu(cpu, tracing_buffer_mask) 107 for_each_cpu(cpu, tracing_buffer_mask)
110 108
@@ -203,7 +201,7 @@ cycle_t ftrace_now(int cpu)
203 */ 201 */
204static struct trace_array max_tr; 202static struct trace_array max_tr;
205 203
206static DEFINE_PER_CPU(struct trace_array_cpu, max_data); 204static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
207 205
208/* tracer_enabled is used to toggle activation of a tracer */ 206/* tracer_enabled is used to toggle activation of a tracer */
209static int tracer_enabled = 1; 207static int tracer_enabled = 1;
@@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly;
243 241
244/* 242/*
245 * trace_types_lock is used to protect the trace_types list. 243 * trace_types_lock is used to protect the trace_types list.
246 * This lock is also used to keep user access serialized.
247 * Accesses from userspace will grab this lock while userspace
248 * activities happen inside the kernel.
249 */ 244 */
250static DEFINE_MUTEX(trace_types_lock); 245static DEFINE_MUTEX(trace_types_lock);
251 246
247/*
248 * serialize the access of the ring buffer
249 *
250 * ring buffer serializes readers, but it is low level protection.
251 * The validity of the events (which returns by ring_buffer_peek() ..etc)
252 * are not protected by ring buffer.
253 *
254 * The content of events may become garbage if we allow other process consumes
255 * these events concurrently:
256 * A) the page of the consumed events may become a normal page
257 * (not reader page) in ring buffer, and this page will be rewrited
258 * by events producer.
259 * B) The page of the consumed events may become a page for splice_read,
260 * and this page will be returned to system.
261 *
262 * These primitives allow multi process access to different cpu ring buffer
263 * concurrently.
264 *
265 * These primitives don't distinguish read-only and read-consume access.
266 * Multi read-only access are also serialized.
267 */
268
269#ifdef CONFIG_SMP
270static DECLARE_RWSEM(all_cpu_access_lock);
271static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
272
273static inline void trace_access_lock(int cpu)
274{
275 if (cpu == TRACE_PIPE_ALL_CPU) {
276 /* gain it for accessing the whole ring buffer. */
277 down_write(&all_cpu_access_lock);
278 } else {
279 /* gain it for accessing a cpu ring buffer. */
280
281 /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
282 down_read(&all_cpu_access_lock);
283
284 /* Secondly block other access to this @cpu ring buffer. */
285 mutex_lock(&per_cpu(cpu_access_lock, cpu));
286 }
287}
288
289static inline void trace_access_unlock(int cpu)
290{
291 if (cpu == TRACE_PIPE_ALL_CPU) {
292 up_write(&all_cpu_access_lock);
293 } else {
294 mutex_unlock(&per_cpu(cpu_access_lock, cpu));
295 up_read(&all_cpu_access_lock);
296 }
297}
298
299static inline void trace_access_lock_init(void)
300{
301 int cpu;
302
303 for_each_possible_cpu(cpu)
304 mutex_init(&per_cpu(cpu_access_lock, cpu));
305}
306
307#else
308
309static DEFINE_MUTEX(access_lock);
310
311static inline void trace_access_lock(int cpu)
312{
313 (void)cpu;
314 mutex_lock(&access_lock);
315}
316
317static inline void trace_access_unlock(int cpu)
318{
319 (void)cpu;
320 mutex_unlock(&access_lock);
321}
322
323static inline void trace_access_lock_init(void)
324{
325}
326
327#endif
328
252/* trace_wait is a waitqueue for tasks blocked on trace_poll */ 329/* trace_wait is a waitqueue for tasks blocked on trace_poll */
253static DECLARE_WAIT_QUEUE_HEAD(trace_wait); 330static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
254 331
@@ -313,7 +390,6 @@ static const char *trace_options[] = {
313 "bin", 390 "bin",
314 "block", 391 "block",
315 "stacktrace", 392 "stacktrace",
316 "sched-tree",
317 "trace_printk", 393 "trace_printk",
318 "ftrace_preempt", 394 "ftrace_preempt",
319 "branch", 395 "branch",
@@ -493,15 +569,15 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
493 * protected by per_cpu spinlocks. But the action of the swap 569 * protected by per_cpu spinlocks. But the action of the swap
494 * needs its own lock. 570 * needs its own lock.
495 * 571 *
496 * This is defined as a raw_spinlock_t in order to help 572 * This is defined as a arch_spinlock_t in order to help
497 * with performance when lockdep debugging is enabled. 573 * with performance when lockdep debugging is enabled.
498 * 574 *
499 * It is also used in other places outside the update_max_tr 575 * It is also used in other places outside the update_max_tr
500 * so it needs to be defined outside of the 576 * so it needs to be defined outside of the
501 * CONFIG_TRACER_MAX_TRACE. 577 * CONFIG_TRACER_MAX_TRACE.
502 */ 578 */
503static raw_spinlock_t ftrace_max_lock = 579static arch_spinlock_t ftrace_max_lock =
504 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 580 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
505 581
506#ifdef CONFIG_TRACER_MAX_TRACE 582#ifdef CONFIG_TRACER_MAX_TRACE
507unsigned long __read_mostly tracing_max_latency; 583unsigned long __read_mostly tracing_max_latency;
@@ -555,13 +631,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
555 return; 631 return;
556 632
557 WARN_ON_ONCE(!irqs_disabled()); 633 WARN_ON_ONCE(!irqs_disabled());
558 __raw_spin_lock(&ftrace_max_lock); 634 arch_spin_lock(&ftrace_max_lock);
559 635
560 tr->buffer = max_tr.buffer; 636 tr->buffer = max_tr.buffer;
561 max_tr.buffer = buf; 637 max_tr.buffer = buf;
562 638
563 __update_max_tr(tr, tsk, cpu); 639 __update_max_tr(tr, tsk, cpu);
564 __raw_spin_unlock(&ftrace_max_lock); 640 arch_spin_unlock(&ftrace_max_lock);
565} 641}
566 642
567/** 643/**
@@ -581,7 +657,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
581 return; 657 return;
582 658
583 WARN_ON_ONCE(!irqs_disabled()); 659 WARN_ON_ONCE(!irqs_disabled());
584 __raw_spin_lock(&ftrace_max_lock); 660 arch_spin_lock(&ftrace_max_lock);
585 661
586 ftrace_disable_cpu(); 662 ftrace_disable_cpu();
587 663
@@ -603,7 +679,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
603 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); 679 WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY);
604 680
605 __update_max_tr(tr, tsk, cpu); 681 __update_max_tr(tr, tsk, cpu);
606 __raw_spin_unlock(&ftrace_max_lock); 682 arch_spin_unlock(&ftrace_max_lock);
607} 683}
608#endif /* CONFIG_TRACER_MAX_TRACE */ 684#endif /* CONFIG_TRACER_MAX_TRACE */
609 685
@@ -802,7 +878,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
802static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; 878static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
803static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; 879static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
804static int cmdline_idx; 880static int cmdline_idx;
805static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED; 881static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
806 882
807/* temporary disable recording */ 883/* temporary disable recording */
808static atomic_t trace_record_cmdline_disabled __read_mostly; 884static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -915,7 +991,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
915 * nor do we want to disable interrupts, 991 * nor do we want to disable interrupts,
916 * so if we miss here, then better luck next time. 992 * so if we miss here, then better luck next time.
917 */ 993 */
918 if (!__raw_spin_trylock(&trace_cmdline_lock)) 994 if (!arch_spin_trylock(&trace_cmdline_lock))
919 return; 995 return;
920 996
921 idx = map_pid_to_cmdline[tsk->pid]; 997 idx = map_pid_to_cmdline[tsk->pid];
@@ -940,7 +1016,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
940 1016
941 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); 1017 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
942 1018
943 __raw_spin_unlock(&trace_cmdline_lock); 1019 arch_spin_unlock(&trace_cmdline_lock);
944} 1020}
945 1021
946void trace_find_cmdline(int pid, char comm[]) 1022void trace_find_cmdline(int pid, char comm[])
@@ -952,20 +1028,25 @@ void trace_find_cmdline(int pid, char comm[])
952 return; 1028 return;
953 } 1029 }
954 1030
1031 if (WARN_ON_ONCE(pid < 0)) {
1032 strcpy(comm, "<XXX>");
1033 return;
1034 }
1035
955 if (pid > PID_MAX_DEFAULT) { 1036 if (pid > PID_MAX_DEFAULT) {
956 strcpy(comm, "<...>"); 1037 strcpy(comm, "<...>");
957 return; 1038 return;
958 } 1039 }
959 1040
960 preempt_disable(); 1041 preempt_disable();
961 __raw_spin_lock(&trace_cmdline_lock); 1042 arch_spin_lock(&trace_cmdline_lock);
962 map = map_pid_to_cmdline[pid]; 1043 map = map_pid_to_cmdline[pid];
963 if (map != NO_CMDLINE_MAP) 1044 if (map != NO_CMDLINE_MAP)
964 strcpy(comm, saved_cmdlines[map]); 1045 strcpy(comm, saved_cmdlines[map]);
965 else 1046 else
966 strcpy(comm, "<...>"); 1047 strcpy(comm, "<...>");
967 1048
968 __raw_spin_unlock(&trace_cmdline_lock); 1049 arch_spin_unlock(&trace_cmdline_lock);
969 preempt_enable(); 1050 preempt_enable();
970} 1051}
971 1052
@@ -1085,7 +1166,7 @@ trace_function(struct trace_array *tr,
1085 struct ftrace_entry *entry; 1166 struct ftrace_entry *entry;
1086 1167
1087 /* If we are reading the ring buffer, don't trace */ 1168 /* If we are reading the ring buffer, don't trace */
1088 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 1169 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
1089 return; 1170 return;
1090 1171
1091 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), 1172 event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry),
@@ -1151,6 +1232,22 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
1151 __ftrace_trace_stack(tr->buffer, flags, skip, pc); 1232 __ftrace_trace_stack(tr->buffer, flags, skip, pc);
1152} 1233}
1153 1234
1235/**
1236 * trace_dump_stack - record a stack back trace in the trace buffer
1237 */
1238void trace_dump_stack(void)
1239{
1240 unsigned long flags;
1241
1242 if (tracing_disabled || tracing_selftest_running)
1243 return;
1244
1245 local_save_flags(flags);
1246
1247 /* skipping 3 traces, seems to get us at the caller of this function */
1248 __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count());
1249}
1250
1154void 1251void
1155ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) 1252ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
1156{ 1253{
@@ -1251,8 +1348,8 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1251 */ 1348 */
1252int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) 1349int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1253{ 1350{
1254 static raw_spinlock_t trace_buf_lock = 1351 static arch_spinlock_t trace_buf_lock =
1255 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 1352 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1256 static u32 trace_buf[TRACE_BUF_SIZE]; 1353 static u32 trace_buf[TRACE_BUF_SIZE];
1257 1354
1258 struct ftrace_event_call *call = &event_bprint; 1355 struct ftrace_event_call *call = &event_bprint;
@@ -1283,7 +1380,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1283 1380
1284 /* Lockdep uses trace_printk for lock tracing */ 1381 /* Lockdep uses trace_printk for lock tracing */
1285 local_irq_save(flags); 1382 local_irq_save(flags);
1286 __raw_spin_lock(&trace_buf_lock); 1383 arch_spin_lock(&trace_buf_lock);
1287 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args); 1384 len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1288 1385
1289 if (len > TRACE_BUF_SIZE || len < 0) 1386 if (len > TRACE_BUF_SIZE || len < 0)
@@ -1300,11 +1397,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
1300 entry->fmt = fmt; 1397 entry->fmt = fmt;
1301 1398
1302 memcpy(entry->buf, trace_buf, sizeof(u32) * len); 1399 memcpy(entry->buf, trace_buf, sizeof(u32) * len);
1303 if (!filter_check_discard(call, entry, buffer, event)) 1400 if (!filter_check_discard(call, entry, buffer, event)) {
1304 ring_buffer_unlock_commit(buffer, event); 1401 ring_buffer_unlock_commit(buffer, event);
1402 ftrace_trace_stack(buffer, flags, 6, pc);
1403 }
1305 1404
1306out_unlock: 1405out_unlock:
1307 __raw_spin_unlock(&trace_buf_lock); 1406 arch_spin_unlock(&trace_buf_lock);
1308 local_irq_restore(flags); 1407 local_irq_restore(flags);
1309 1408
1310out: 1409out:
@@ -1334,7 +1433,7 @@ int trace_array_printk(struct trace_array *tr,
1334int trace_array_vprintk(struct trace_array *tr, 1433int trace_array_vprintk(struct trace_array *tr,
1335 unsigned long ip, const char *fmt, va_list args) 1434 unsigned long ip, const char *fmt, va_list args)
1336{ 1435{
1337 static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; 1436 static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED;
1338 static char trace_buf[TRACE_BUF_SIZE]; 1437 static char trace_buf[TRACE_BUF_SIZE];
1339 1438
1340 struct ftrace_event_call *call = &event_print; 1439 struct ftrace_event_call *call = &event_print;
@@ -1360,12 +1459,8 @@ int trace_array_vprintk(struct trace_array *tr,
1360 1459
1361 pause_graph_tracing(); 1460 pause_graph_tracing();
1362 raw_local_irq_save(irq_flags); 1461 raw_local_irq_save(irq_flags);
1363 __raw_spin_lock(&trace_buf_lock); 1462 arch_spin_lock(&trace_buf_lock);
1364 if (args == NULL) { 1463 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1365 strncpy(trace_buf, fmt, TRACE_BUF_SIZE);
1366 len = strlen(trace_buf);
1367 } else
1368 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
1369 1464
1370 size = sizeof(*entry) + len + 1; 1465 size = sizeof(*entry) + len + 1;
1371 buffer = tr->buffer; 1466 buffer = tr->buffer;
@@ -1378,11 +1473,13 @@ int trace_array_vprintk(struct trace_array *tr,
1378 1473
1379 memcpy(&entry->buf, trace_buf, len); 1474 memcpy(&entry->buf, trace_buf, len);
1380 entry->buf[len] = '\0'; 1475 entry->buf[len] = '\0';
1381 if (!filter_check_discard(call, entry, buffer, event)) 1476 if (!filter_check_discard(call, entry, buffer, event)) {
1382 ring_buffer_unlock_commit(buffer, event); 1477 ring_buffer_unlock_commit(buffer, event);
1478 ftrace_trace_stack(buffer, irq_flags, 6, pc);
1479 }
1383 1480
1384 out_unlock: 1481 out_unlock:
1385 __raw_spin_unlock(&trace_buf_lock); 1482 arch_spin_unlock(&trace_buf_lock);
1386 raw_local_irq_restore(irq_flags); 1483 raw_local_irq_restore(irq_flags);
1387 unpause_graph_tracing(); 1484 unpause_graph_tracing();
1388 out: 1485 out:
@@ -1516,6 +1613,8 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1516 int i = (int)*pos; 1613 int i = (int)*pos;
1517 void *ent; 1614 void *ent;
1518 1615
1616 WARN_ON_ONCE(iter->leftover);
1617
1519 (*pos)++; 1618 (*pos)++;
1520 1619
1521 /* can't go backwards */ 1620 /* can't go backwards */
@@ -1567,12 +1666,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1567} 1666}
1568 1667
1569/* 1668/*
1570 * No necessary locking here. The worst thing which can
1571 * happen is loosing events consumed at the same time
1572 * by a trace_pipe reader.
1573 * Other than that, we don't risk to crash the ring buffer
1574 * because it serializes the readers.
1575 *
1576 * The current tracer is copied to avoid a global locking 1669 * The current tracer is copied to avoid a global locking
1577 * all around. 1670 * all around.
1578 */ 1671 */
@@ -1614,17 +1707,29 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1614 ; 1707 ;
1615 1708
1616 } else { 1709 } else {
1617 l = *pos - 1; 1710 /*
1618 p = s_next(m, p, &l); 1711 * If we overflowed the seq_file before, then we want
1712 * to just reuse the trace_seq buffer again.
1713 */
1714 if (iter->leftover)
1715 p = iter;
1716 else {
1717 l = *pos - 1;
1718 p = s_next(m, p, &l);
1719 }
1619 } 1720 }
1620 1721
1621 trace_event_read_lock(); 1722 trace_event_read_lock();
1723 trace_access_lock(cpu_file);
1622 return p; 1724 return p;
1623} 1725}
1624 1726
1625static void s_stop(struct seq_file *m, void *p) 1727static void s_stop(struct seq_file *m, void *p)
1626{ 1728{
1729 struct trace_iterator *iter = m->private;
1730
1627 atomic_dec(&trace_record_cmdline_disabled); 1731 atomic_dec(&trace_record_cmdline_disabled);
1732 trace_access_unlock(iter->cpu_file);
1628 trace_event_read_unlock(); 1733 trace_event_read_unlock();
1629} 1734}
1630 1735
@@ -1923,6 +2028,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
1923static int s_show(struct seq_file *m, void *v) 2028static int s_show(struct seq_file *m, void *v)
1924{ 2029{
1925 struct trace_iterator *iter = v; 2030 struct trace_iterator *iter = v;
2031 int ret;
1926 2032
1927 if (iter->ent == NULL) { 2033 if (iter->ent == NULL) {
1928 if (iter->tr) { 2034 if (iter->tr) {
@@ -1942,9 +2048,27 @@ static int s_show(struct seq_file *m, void *v)
1942 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2048 if (!(trace_flags & TRACE_ITER_VERBOSE))
1943 print_func_help_header(m); 2049 print_func_help_header(m);
1944 } 2050 }
2051 } else if (iter->leftover) {
2052 /*
2053 * If we filled the seq_file buffer earlier, we
2054 * want to just show it now.
2055 */
2056 ret = trace_print_seq(m, &iter->seq);
2057
2058 /* ret should this time be zero, but you never know */
2059 iter->leftover = ret;
2060
1945 } else { 2061 } else {
1946 print_trace_line(iter); 2062 print_trace_line(iter);
1947 trace_print_seq(m, &iter->seq); 2063 ret = trace_print_seq(m, &iter->seq);
2064 /*
2065 * If we overflow the seq_file buffer, then it will
2066 * ask us for this data again at start up.
2067 * Use that instead.
2068 * ret is 0 if seq_file write succeeded.
2069 * -1 otherwise.
2070 */
2071 iter->leftover = ret;
1948 } 2072 }
1949 2073
1950 return 0; 2074 return 0;
@@ -2254,7 +2378,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2254 mutex_lock(&tracing_cpumask_update_lock); 2378 mutex_lock(&tracing_cpumask_update_lock);
2255 2379
2256 local_irq_disable(); 2380 local_irq_disable();
2257 __raw_spin_lock(&ftrace_max_lock); 2381 arch_spin_lock(&ftrace_max_lock);
2258 for_each_tracing_cpu(cpu) { 2382 for_each_tracing_cpu(cpu) {
2259 /* 2383 /*
2260 * Increase/decrease the disabled counter if we are 2384 * Increase/decrease the disabled counter if we are
@@ -2269,7 +2393,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2269 atomic_dec(&global_trace.data[cpu]->disabled); 2393 atomic_dec(&global_trace.data[cpu]->disabled);
2270 } 2394 }
2271 } 2395 }
2272 __raw_spin_unlock(&ftrace_max_lock); 2396 arch_spin_unlock(&ftrace_max_lock);
2273 local_irq_enable(); 2397 local_irq_enable();
2274 2398
2275 cpumask_copy(tracing_cpumask, tracing_cpumask_new); 2399 cpumask_copy(tracing_cpumask, tracing_cpumask_new);
@@ -2291,67 +2415,49 @@ static const struct file_operations tracing_cpumask_fops = {
2291 .write = tracing_cpumask_write, 2415 .write = tracing_cpumask_write,
2292}; 2416};
2293 2417
2294static ssize_t 2418static int tracing_trace_options_show(struct seq_file *m, void *v)
2295tracing_trace_options_read(struct file *filp, char __user *ubuf,
2296 size_t cnt, loff_t *ppos)
2297{ 2419{
2298 struct tracer_opt *trace_opts; 2420 struct tracer_opt *trace_opts;
2299 u32 tracer_flags; 2421 u32 tracer_flags;
2300 int len = 0;
2301 char *buf;
2302 int r = 0;
2303 int i; 2422 int i;
2304 2423
2305
2306 /* calculate max size */
2307 for (i = 0; trace_options[i]; i++) {
2308 len += strlen(trace_options[i]);
2309 len += 3; /* "no" and newline */
2310 }
2311
2312 mutex_lock(&trace_types_lock); 2424 mutex_lock(&trace_types_lock);
2313 tracer_flags = current_trace->flags->val; 2425 tracer_flags = current_trace->flags->val;
2314 trace_opts = current_trace->flags->opts; 2426 trace_opts = current_trace->flags->opts;
2315 2427
2316 /*
2317 * Increase the size with names of options specific
2318 * of the current tracer.
2319 */
2320 for (i = 0; trace_opts[i].name; i++) {
2321 len += strlen(trace_opts[i].name);
2322 len += 3; /* "no" and newline */
2323 }
2324
2325 /* +1 for \0 */
2326 buf = kmalloc(len + 1, GFP_KERNEL);
2327 if (!buf) {
2328 mutex_unlock(&trace_types_lock);
2329 return -ENOMEM;
2330 }
2331
2332 for (i = 0; trace_options[i]; i++) { 2428 for (i = 0; trace_options[i]; i++) {
2333 if (trace_flags & (1 << i)) 2429 if (trace_flags & (1 << i))
2334 r += sprintf(buf + r, "%s\n", trace_options[i]); 2430 seq_printf(m, "%s\n", trace_options[i]);
2335 else 2431 else
2336 r += sprintf(buf + r, "no%s\n", trace_options[i]); 2432 seq_printf(m, "no%s\n", trace_options[i]);
2337 } 2433 }
2338 2434
2339 for (i = 0; trace_opts[i].name; i++) { 2435 for (i = 0; trace_opts[i].name; i++) {
2340 if (tracer_flags & trace_opts[i].bit) 2436 if (tracer_flags & trace_opts[i].bit)
2341 r += sprintf(buf + r, "%s\n", 2437 seq_printf(m, "%s\n", trace_opts[i].name);
2342 trace_opts[i].name);
2343 else 2438 else
2344 r += sprintf(buf + r, "no%s\n", 2439 seq_printf(m, "no%s\n", trace_opts[i].name);
2345 trace_opts[i].name);
2346 } 2440 }
2347 mutex_unlock(&trace_types_lock); 2441 mutex_unlock(&trace_types_lock);
2348 2442
2349 WARN_ON(r >= len + 1); 2443 return 0;
2444}
2350 2445
2351 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 2446static int __set_tracer_option(struct tracer *trace,
2447 struct tracer_flags *tracer_flags,
2448 struct tracer_opt *opts, int neg)
2449{
2450 int ret;
2352 2451
2353 kfree(buf); 2452 ret = trace->set_flag(tracer_flags->val, opts->bit, !neg);
2354 return r; 2453 if (ret)
2454 return ret;
2455
2456 if (neg)
2457 tracer_flags->val &= ~opts->bit;
2458 else
2459 tracer_flags->val |= opts->bit;
2460 return 0;
2355} 2461}
2356 2462
2357/* Try to assign a tracer specific option */ 2463/* Try to assign a tracer specific option */
@@ -2359,33 +2465,17 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
2359{ 2465{
2360 struct tracer_flags *tracer_flags = trace->flags; 2466 struct tracer_flags *tracer_flags = trace->flags;
2361 struct tracer_opt *opts = NULL; 2467 struct tracer_opt *opts = NULL;
2362 int ret = 0, i = 0; 2468 int i;
2363 int len;
2364 2469
2365 for (i = 0; tracer_flags->opts[i].name; i++) { 2470 for (i = 0; tracer_flags->opts[i].name; i++) {
2366 opts = &tracer_flags->opts[i]; 2471 opts = &tracer_flags->opts[i];
2367 len = strlen(opts->name);
2368 2472
2369 if (strncmp(cmp, opts->name, len) == 0) { 2473 if (strcmp(cmp, opts->name) == 0)
2370 ret = trace->set_flag(tracer_flags->val, 2474 return __set_tracer_option(trace, trace->flags,
2371 opts->bit, !neg); 2475 opts, neg);
2372 break;
2373 }
2374 } 2476 }
2375 /* Not found */
2376 if (!tracer_flags->opts[i].name)
2377 return -EINVAL;
2378 2477
2379 /* Refused to handle */ 2478 return -EINVAL;
2380 if (ret)
2381 return ret;
2382
2383 if (neg)
2384 tracer_flags->val &= ~opts->bit;
2385 else
2386 tracer_flags->val |= opts->bit;
2387
2388 return 0;
2389} 2479}
2390 2480
2391static void set_tracer_flags(unsigned int mask, int enabled) 2481static void set_tracer_flags(unsigned int mask, int enabled)
@@ -2405,7 +2495,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2405 size_t cnt, loff_t *ppos) 2495 size_t cnt, loff_t *ppos)
2406{ 2496{
2407 char buf[64]; 2497 char buf[64];
2408 char *cmp = buf; 2498 char *cmp;
2409 int neg = 0; 2499 int neg = 0;
2410 int ret; 2500 int ret;
2411 int i; 2501 int i;
@@ -2417,16 +2507,15 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2417 return -EFAULT; 2507 return -EFAULT;
2418 2508
2419 buf[cnt] = 0; 2509 buf[cnt] = 0;
2510 cmp = strstrip(buf);
2420 2511
2421 if (strncmp(buf, "no", 2) == 0) { 2512 if (strncmp(cmp, "no", 2) == 0) {
2422 neg = 1; 2513 neg = 1;
2423 cmp += 2; 2514 cmp += 2;
2424 } 2515 }
2425 2516
2426 for (i = 0; trace_options[i]; i++) { 2517 for (i = 0; trace_options[i]; i++) {
2427 int len = strlen(trace_options[i]); 2518 if (strcmp(cmp, trace_options[i]) == 0) {
2428
2429 if (strncmp(cmp, trace_options[i], len) == 0) {
2430 set_tracer_flags(1 << i, !neg); 2519 set_tracer_flags(1 << i, !neg);
2431 break; 2520 break;
2432 } 2521 }
@@ -2446,9 +2535,18 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2446 return cnt; 2535 return cnt;
2447} 2536}
2448 2537
2538static int tracing_trace_options_open(struct inode *inode, struct file *file)
2539{
2540 if (tracing_disabled)
2541 return -ENODEV;
2542 return single_open(file, tracing_trace_options_show, NULL);
2543}
2544
2449static const struct file_operations tracing_iter_fops = { 2545static const struct file_operations tracing_iter_fops = {
2450 .open = tracing_open_generic, 2546 .open = tracing_trace_options_open,
2451 .read = tracing_trace_options_read, 2547 .read = seq_read,
2548 .llseek = seq_lseek,
2549 .release = single_release,
2452 .write = tracing_trace_options_write, 2550 .write = tracing_trace_options_write,
2453}; 2551};
2454 2552
@@ -2822,22 +2920,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
2822 2920
2823 mutex_lock(&trace_types_lock); 2921 mutex_lock(&trace_types_lock);
2824 2922
2825 /* We only allow one reader per cpu */
2826 if (cpu_file == TRACE_PIPE_ALL_CPU) {
2827 if (!cpumask_empty(tracing_reader_cpumask)) {
2828 ret = -EBUSY;
2829 goto out;
2830 }
2831 cpumask_setall(tracing_reader_cpumask);
2832 } else {
2833 if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
2834 cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
2835 else {
2836 ret = -EBUSY;
2837 goto out;
2838 }
2839 }
2840
2841 /* create a buffer to store the information to pass to userspace */ 2923 /* create a buffer to store the information to pass to userspace */
2842 iter = kzalloc(sizeof(*iter), GFP_KERNEL); 2924 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2843 if (!iter) { 2925 if (!iter) {
@@ -2893,10 +2975,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
2893 2975
2894 mutex_lock(&trace_types_lock); 2976 mutex_lock(&trace_types_lock);
2895 2977
2896 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) 2978 if (iter->trace->pipe_close)
2897 cpumask_clear(tracing_reader_cpumask); 2979 iter->trace->pipe_close(iter);
2898 else
2899 cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
2900 2980
2901 mutex_unlock(&trace_types_lock); 2981 mutex_unlock(&trace_types_lock);
2902 2982
@@ -3056,6 +3136,7 @@ waitagain:
3056 iter->pos = -1; 3136 iter->pos = -1;
3057 3137
3058 trace_event_read_lock(); 3138 trace_event_read_lock();
3139 trace_access_lock(iter->cpu_file);
3059 while (find_next_entry_inc(iter) != NULL) { 3140 while (find_next_entry_inc(iter) != NULL) {
3060 enum print_line_t ret; 3141 enum print_line_t ret;
3061 int len = iter->seq.len; 3142 int len = iter->seq.len;
@@ -3072,6 +3153,7 @@ waitagain:
3072 if (iter->seq.len >= cnt) 3153 if (iter->seq.len >= cnt)
3073 break; 3154 break;
3074 } 3155 }
3156 trace_access_unlock(iter->cpu_file);
3075 trace_event_read_unlock(); 3157 trace_event_read_unlock();
3076 3158
3077 /* Now copy what we have to the user */ 3159 /* Now copy what we have to the user */
@@ -3104,7 +3186,7 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
3104 __free_page(spd->pages[idx]); 3186 __free_page(spd->pages[idx]);
3105} 3187}
3106 3188
3107static struct pipe_buf_operations tracing_pipe_buf_ops = { 3189static const struct pipe_buf_operations tracing_pipe_buf_ops = {
3108 .can_merge = 0, 3190 .can_merge = 0,
3109 .map = generic_pipe_buf_map, 3191 .map = generic_pipe_buf_map,
3110 .unmap = generic_pipe_buf_unmap, 3192 .unmap = generic_pipe_buf_unmap,
@@ -3197,6 +3279,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3197 } 3279 }
3198 3280
3199 trace_event_read_lock(); 3281 trace_event_read_lock();
3282 trace_access_lock(iter->cpu_file);
3200 3283
3201 /* Fill as many pages as possible. */ 3284 /* Fill as many pages as possible. */
3202 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3285 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
@@ -3220,6 +3303,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3220 trace_seq_init(&iter->seq); 3303 trace_seq_init(&iter->seq);
3221 } 3304 }
3222 3305
3306 trace_access_unlock(iter->cpu_file);
3223 trace_event_read_unlock(); 3307 trace_event_read_unlock();
3224 mutex_unlock(&iter->mutex); 3308 mutex_unlock(&iter->mutex);
3225 3309
@@ -3320,6 +3404,16 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3320 return cnt; 3404 return cnt;
3321} 3405}
3322 3406
3407static int mark_printk(const char *fmt, ...)
3408{
3409 int ret;
3410 va_list args;
3411 va_start(args, fmt);
3412 ret = trace_vprintk(0, fmt, args);
3413 va_end(args);
3414 return ret;
3415}
3416
3323static ssize_t 3417static ssize_t
3324tracing_mark_write(struct file *filp, const char __user *ubuf, 3418tracing_mark_write(struct file *filp, const char __user *ubuf,
3325 size_t cnt, loff_t *fpos) 3419 size_t cnt, loff_t *fpos)
@@ -3346,28 +3440,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3346 } else 3440 } else
3347 buf[cnt] = '\0'; 3441 buf[cnt] = '\0';
3348 3442
3349 cnt = trace_vprintk(0, buf, NULL); 3443 cnt = mark_printk("%s", buf);
3350 kfree(buf); 3444 kfree(buf);
3351 *fpos += cnt; 3445 *fpos += cnt;
3352 3446
3353 return cnt; 3447 return cnt;
3354} 3448}
3355 3449
3356static ssize_t tracing_clock_read(struct file *filp, char __user *ubuf, 3450static int tracing_clock_show(struct seq_file *m, void *v)
3357 size_t cnt, loff_t *ppos)
3358{ 3451{
3359 char buf[64];
3360 int bufiter = 0;
3361 int i; 3452 int i;
3362 3453
3363 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) 3454 for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
3364 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, 3455 seq_printf(m,
3365 "%s%s%s%s", i ? " " : "", 3456 "%s%s%s%s", i ? " " : "",
3366 i == trace_clock_id ? "[" : "", trace_clocks[i].name, 3457 i == trace_clock_id ? "[" : "", trace_clocks[i].name,
3367 i == trace_clock_id ? "]" : ""); 3458 i == trace_clock_id ? "]" : "");
3368 bufiter += snprintf(buf + bufiter, sizeof(buf) - bufiter, "\n"); 3459 seq_putc(m, '\n');
3369 3460
3370 return simple_read_from_buffer(ubuf, cnt, ppos, buf, bufiter); 3461 return 0;
3371} 3462}
3372 3463
3373static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, 3464static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
@@ -3409,6 +3500,13 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
3409 return cnt; 3500 return cnt;
3410} 3501}
3411 3502
3503static int tracing_clock_open(struct inode *inode, struct file *file)
3504{
3505 if (tracing_disabled)
3506 return -ENODEV;
3507 return single_open(file, tracing_clock_show, NULL);
3508}
3509
3412static const struct file_operations tracing_max_lat_fops = { 3510static const struct file_operations tracing_max_lat_fops = {
3413 .open = tracing_open_generic, 3511 .open = tracing_open_generic,
3414 .read = tracing_max_lat_read, 3512 .read = tracing_max_lat_read,
@@ -3447,8 +3545,10 @@ static const struct file_operations tracing_mark_fops = {
3447}; 3545};
3448 3546
3449static const struct file_operations trace_clock_fops = { 3547static const struct file_operations trace_clock_fops = {
3450 .open = tracing_open_generic, 3548 .open = tracing_clock_open,
3451 .read = tracing_clock_read, 3549 .read = seq_read,
3550 .llseek = seq_lseek,
3551 .release = single_release,
3452 .write = tracing_clock_write, 3552 .write = tracing_clock_write,
3453}; 3553};
3454 3554
@@ -3505,10 +3605,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3505 3605
3506 info->read = 0; 3606 info->read = 0;
3507 3607
3608 trace_access_lock(info->cpu);
3508 ret = ring_buffer_read_page(info->tr->buffer, 3609 ret = ring_buffer_read_page(info->tr->buffer,
3509 &info->spare, 3610 &info->spare,
3510 count, 3611 count,
3511 info->cpu, 0); 3612 info->cpu, 0);
3613 trace_access_unlock(info->cpu);
3512 if (ret < 0) 3614 if (ret < 0)
3513 return 0; 3615 return 0;
3514 3616
@@ -3578,7 +3680,7 @@ static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
3578} 3680}
3579 3681
3580/* Pipe buffer operations for a buffer. */ 3682/* Pipe buffer operations for a buffer. */
3581static struct pipe_buf_operations buffer_pipe_buf_ops = { 3683static const struct pipe_buf_operations buffer_pipe_buf_ops = {
3582 .can_merge = 0, 3684 .can_merge = 0,
3583 .map = generic_pipe_buf_map, 3685 .map = generic_pipe_buf_map,
3584 .unmap = generic_pipe_buf_unmap, 3686 .unmap = generic_pipe_buf_unmap,
@@ -3636,6 +3738,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3636 len &= PAGE_MASK; 3738 len &= PAGE_MASK;
3637 } 3739 }
3638 3740
3741 trace_access_lock(info->cpu);
3639 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3742 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3640 3743
3641 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3744 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
@@ -3683,6 +3786,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3683 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3786 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3684 } 3787 }
3685 3788
3789 trace_access_unlock(info->cpu);
3686 spd.nr_pages = i; 3790 spd.nr_pages = i;
3687 3791
3688 /* did we read anything? */ 3792 /* did we read anything? */
@@ -3909,39 +4013,16 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
3909 if (ret < 0) 4013 if (ret < 0)
3910 return ret; 4014 return ret;
3911 4015
3912 ret = 0; 4016 if (val != 0 && val != 1)
3913 switch (val) { 4017 return -EINVAL;
3914 case 0:
3915 /* do nothing if already cleared */
3916 if (!(topt->flags->val & topt->opt->bit))
3917 break;
3918
3919 mutex_lock(&trace_types_lock);
3920 if (current_trace->set_flag)
3921 ret = current_trace->set_flag(topt->flags->val,
3922 topt->opt->bit, 0);
3923 mutex_unlock(&trace_types_lock);
3924 if (ret)
3925 return ret;
3926 topt->flags->val &= ~topt->opt->bit;
3927 break;
3928 case 1:
3929 /* do nothing if already set */
3930 if (topt->flags->val & topt->opt->bit)
3931 break;
3932 4018
4019 if (!!(topt->flags->val & topt->opt->bit) != val) {
3933 mutex_lock(&trace_types_lock); 4020 mutex_lock(&trace_types_lock);
3934 if (current_trace->set_flag) 4021 ret = __set_tracer_option(current_trace, topt->flags,
3935 ret = current_trace->set_flag(topt->flags->val, 4022 topt->opt, !val);
3936 topt->opt->bit, 1);
3937 mutex_unlock(&trace_types_lock); 4023 mutex_unlock(&trace_types_lock);
3938 if (ret) 4024 if (ret)
3939 return ret; 4025 return ret;
3940 topt->flags->val |= topt->opt->bit;
3941 break;
3942
3943 default:
3944 return -EINVAL;
3945 } 4026 }
3946 4027
3947 *ppos += cnt; 4028 *ppos += cnt;
@@ -4142,6 +4223,8 @@ static __init int tracer_init_debugfs(void)
4142 struct dentry *d_tracer; 4223 struct dentry *d_tracer;
4143 int cpu; 4224 int cpu;
4144 4225
4226 trace_access_lock_init();
4227
4145 d_tracer = tracing_init_dentry(); 4228 d_tracer = tracing_init_dentry();
4146 4229
4147 trace_create_file("tracing_enabled", 0644, d_tracer, 4230 trace_create_file("tracing_enabled", 0644, d_tracer,
@@ -4268,8 +4351,8 @@ trace_printk_seq(struct trace_seq *s)
4268 4351
4269static void __ftrace_dump(bool disable_tracing) 4352static void __ftrace_dump(bool disable_tracing)
4270{ 4353{
4271 static raw_spinlock_t ftrace_dump_lock = 4354 static arch_spinlock_t ftrace_dump_lock =
4272 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 4355 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
4273 /* use static because iter can be a bit big for the stack */ 4356 /* use static because iter can be a bit big for the stack */
4274 static struct trace_iterator iter; 4357 static struct trace_iterator iter;
4275 unsigned int old_userobj; 4358 unsigned int old_userobj;
@@ -4279,7 +4362,7 @@ static void __ftrace_dump(bool disable_tracing)
4279 4362
4280 /* only one dump */ 4363 /* only one dump */
4281 local_irq_save(flags); 4364 local_irq_save(flags);
4282 __raw_spin_lock(&ftrace_dump_lock); 4365 arch_spin_lock(&ftrace_dump_lock);
4283 if (dump_ran) 4366 if (dump_ran)
4284 goto out; 4367 goto out;
4285 4368
@@ -4354,7 +4437,7 @@ static void __ftrace_dump(bool disable_tracing)
4354 } 4437 }
4355 4438
4356 out: 4439 out:
4357 __raw_spin_unlock(&ftrace_dump_lock); 4440 arch_spin_unlock(&ftrace_dump_lock);
4358 local_irq_restore(flags); 4441 local_irq_restore(flags);
4359} 4442}
4360 4443
@@ -4376,9 +4459,6 @@ __init static int tracer_alloc_buffers(void)
4376 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) 4459 if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
4377 goto out_free_buffer_mask; 4460 goto out_free_buffer_mask;
4378 4461
4379 if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
4380 goto out_free_tracing_cpumask;
4381
4382 /* To save memory, keep the ring buffer size to its minimum */ 4462 /* To save memory, keep the ring buffer size to its minimum */
4383 if (ring_buffer_expanded) 4463 if (ring_buffer_expanded)
4384 ring_buf_size = trace_buf_size; 4464 ring_buf_size = trace_buf_size;
@@ -4415,7 +4495,7 @@ __init static int tracer_alloc_buffers(void)
4415 /* Allocate the first page for all buffers */ 4495 /* Allocate the first page for all buffers */
4416 for_each_tracing_cpu(i) { 4496 for_each_tracing_cpu(i) {
4417 global_trace.data[i] = &per_cpu(global_trace_cpu, i); 4497 global_trace.data[i] = &per_cpu(global_trace_cpu, i);
4418 max_tr.data[i] = &per_cpu(max_data, i); 4498 max_tr.data[i] = &per_cpu(max_tr_data, i);
4419 } 4499 }
4420 4500
4421 trace_init_cmdlines(); 4501 trace_init_cmdlines();
@@ -4436,8 +4516,6 @@ __init static int tracer_alloc_buffers(void)
4436 return 0; 4516 return 0;
4437 4517
4438out_free_cpumask: 4518out_free_cpumask:
4439 free_cpumask_var(tracing_reader_cpumask);
4440out_free_tracing_cpumask:
4441 free_cpumask_var(tracing_cpumask); 4519 free_cpumask_var(tracing_cpumask);
4442out_free_buffer_mask: 4520out_free_buffer_mask:
4443 free_cpumask_var(tracing_buffer_mask); 4521 free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1d7f4830a80d..fd05bcaf91b0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,6 +272,7 @@ struct tracer_flags {
272 * @pipe_open: called when the trace_pipe file is opened 272 * @pipe_open: called when the trace_pipe file is opened
273 * @wait_pipe: override how the user waits for traces on trace_pipe 273 * @wait_pipe: override how the user waits for traces on trace_pipe
274 * @close: called when the trace file is released 274 * @close: called when the trace file is released
275 * @pipe_close: called when the trace_pipe file is released
275 * @read: override the default read callback on trace_pipe 276 * @read: override the default read callback on trace_pipe
276 * @splice_read: override the default splice_read callback on trace_pipe 277 * @splice_read: override the default splice_read callback on trace_pipe
277 * @selftest: selftest to run on boot (see trace_selftest.c) 278 * @selftest: selftest to run on boot (see trace_selftest.c)
@@ -290,6 +291,7 @@ struct tracer {
290 void (*pipe_open)(struct trace_iterator *iter); 291 void (*pipe_open)(struct trace_iterator *iter);
291 void (*wait_pipe)(struct trace_iterator *iter); 292 void (*wait_pipe)(struct trace_iterator *iter);
292 void (*close)(struct trace_iterator *iter); 293 void (*close)(struct trace_iterator *iter);
294 void (*pipe_close)(struct trace_iterator *iter);
293 ssize_t (*read)(struct trace_iterator *iter, 295 ssize_t (*read)(struct trace_iterator *iter,
294 struct file *filp, char __user *ubuf, 296 struct file *filp, char __user *ubuf,
295 size_t cnt, loff_t *ppos); 297 size_t cnt, loff_t *ppos);
@@ -441,7 +443,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
441 443
442extern int ring_buffer_expanded; 444extern int ring_buffer_expanded;
443extern bool tracing_selftest_disabled; 445extern bool tracing_selftest_disabled;
444DECLARE_PER_CPU(local_t, ftrace_cpu_disabled); 446DECLARE_PER_CPU(int, ftrace_cpu_disabled);
445 447
446#ifdef CONFIG_FTRACE_STARTUP_TEST 448#ifdef CONFIG_FTRACE_STARTUP_TEST
447extern int trace_selftest_startup_function(struct tracer *trace, 449extern int trace_selftest_startup_function(struct tracer *trace,
@@ -495,6 +497,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
495#ifdef CONFIG_DYNAMIC_FTRACE 497#ifdef CONFIG_DYNAMIC_FTRACE
496/* TODO: make this variable */ 498/* TODO: make this variable */
497#define FTRACE_GRAPH_MAX_FUNCS 32 499#define FTRACE_GRAPH_MAX_FUNCS 32
500extern int ftrace_graph_filter_enabled;
498extern int ftrace_graph_count; 501extern int ftrace_graph_count;
499extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; 502extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS];
500 503
@@ -502,7 +505,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
502{ 505{
503 int i; 506 int i;
504 507
505 if (!ftrace_graph_count || test_tsk_trace_graph(current)) 508 if (!ftrace_graph_filter_enabled)
506 return 1; 509 return 1;
507 510
508 for (i = 0; i < ftrace_graph_count; i++) { 511 for (i = 0; i < ftrace_graph_count; i++) {
@@ -595,18 +598,17 @@ enum trace_iterator_flags {
595 TRACE_ITER_BIN = 0x40, 598 TRACE_ITER_BIN = 0x40,
596 TRACE_ITER_BLOCK = 0x80, 599 TRACE_ITER_BLOCK = 0x80,
597 TRACE_ITER_STACKTRACE = 0x100, 600 TRACE_ITER_STACKTRACE = 0x100,
598 TRACE_ITER_SCHED_TREE = 0x200, 601 TRACE_ITER_PRINTK = 0x200,
599 TRACE_ITER_PRINTK = 0x400, 602 TRACE_ITER_PREEMPTONLY = 0x400,
600 TRACE_ITER_PREEMPTONLY = 0x800, 603 TRACE_ITER_BRANCH = 0x800,
601 TRACE_ITER_BRANCH = 0x1000, 604 TRACE_ITER_ANNOTATE = 0x1000,
602 TRACE_ITER_ANNOTATE = 0x2000, 605 TRACE_ITER_USERSTACKTRACE = 0x2000,
603 TRACE_ITER_USERSTACKTRACE = 0x4000, 606 TRACE_ITER_SYM_USEROBJ = 0x4000,
604 TRACE_ITER_SYM_USEROBJ = 0x8000, 607 TRACE_ITER_PRINTK_MSGONLY = 0x8000,
605 TRACE_ITER_PRINTK_MSGONLY = 0x10000, 608 TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */
606 TRACE_ITER_CONTEXT_INFO = 0x20000, /* Print pid/cpu/time */ 609 TRACE_ITER_LATENCY_FMT = 0x20000,
607 TRACE_ITER_LATENCY_FMT = 0x40000, 610 TRACE_ITER_SLEEP_TIME = 0x40000,
608 TRACE_ITER_SLEEP_TIME = 0x80000, 611 TRACE_ITER_GRAPH_TIME = 0x80000,
609 TRACE_ITER_GRAPH_TIME = 0x100000,
610}; 612};
611 613
612/* 614/*
@@ -790,7 +792,8 @@ extern const char *__stop___trace_bprintk_fmt[];
790 792
791#undef FTRACE_ENTRY 793#undef FTRACE_ENTRY
792#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ 794#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \
793 extern struct ftrace_event_call event_##call; 795 extern struct ftrace_event_call \
796 __attribute__((__aligned__(4))) event_##call;
794#undef FTRACE_ENTRY_DUP 797#undef FTRACE_ENTRY_DUP
795#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ 798#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \
796 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) 799 FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4a194f08f88c..b9bc4d470177 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
307 return -1; 307 return -1;
308 if (percent_a > percent_b) 308 if (percent_a > percent_b)
309 return 1; 309 return 1;
310 else 310
311 return 0; 311 if (a->incorrect < b->incorrect)
312 return -1;
313 if (a->incorrect > b->incorrect)
314 return 1;
315
316 /*
317 * Since the above shows worse (incorrect) cases
318 * first, we continue that by showing best (correct)
319 * cases last.
320 */
321 if (a->correct > b->correct)
322 return -1;
323 if (a->correct < b->correct)
324 return 1;
325
326 return 0;
312} 327}
313 328
314static struct tracer_stat annotated_branch_stats = { 329static struct tracer_stat annotated_branch_stats = {
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 878c03f386ba..84a3a7ba072a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -71,10 +71,10 @@ u64 notrace trace_clock(void)
71/* keep prev_time and lock in the same cacheline. */ 71/* keep prev_time and lock in the same cacheline. */
72static struct { 72static struct {
73 u64 prev_time; 73 u64 prev_time;
74 raw_spinlock_t lock; 74 arch_spinlock_t lock;
75} trace_clock_struct ____cacheline_aligned_in_smp = 75} trace_clock_struct ____cacheline_aligned_in_smp =
76 { 76 {
77 .lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, 77 .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED,
78 }; 78 };
79 79
80u64 notrace trace_clock_global(void) 80u64 notrace trace_clock_global(void)
@@ -94,7 +94,7 @@ u64 notrace trace_clock_global(void)
94 if (unlikely(in_nmi())) 94 if (unlikely(in_nmi()))
95 goto out; 95 goto out;
96 96
97 __raw_spin_lock(&trace_clock_struct.lock); 97 arch_spin_lock(&trace_clock_struct.lock);
98 98
99 /* 99 /*
100 * TODO: if this happens often then maybe we should reset 100 * TODO: if this happens often then maybe we should reset
@@ -106,7 +106,7 @@ u64 notrace trace_clock_global(void)
106 106
107 trace_clock_struct.prev_time = now; 107 trace_clock_struct.prev_time = now;
108 108
109 __raw_spin_unlock(&trace_clock_struct.lock); 109 arch_spin_unlock(&trace_clock_struct.lock);
110 110
111 out: 111 out:
112 raw_local_irq_restore(flags); 112 raw_local_irq_restore(flags);
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index d9c60f80aa0d..f0d693005075 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h>
9#include "trace.h" 10#include "trace.h"
10 11
11 12
12char *perf_trace_buf; 13static char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf); 14static char *perf_trace_buf_nmi;
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17 15
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
19 17
@@ -25,7 +23,7 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
25 char *buf; 23 char *buf;
26 int ret = -ENOMEM; 24 int ret = -ENOMEM;
27 25
28 if (atomic_inc_return(&event->profile_count)) 26 if (event->profile_count++ > 0)
29 return 0; 27 return 0;
30 28
31 if (!total_profile_count) { 29 if (!total_profile_count) {
@@ -56,7 +54,7 @@ fail_buf_nmi:
56 perf_trace_buf = NULL; 54 perf_trace_buf = NULL;
57 } 55 }
58fail_buf: 56fail_buf:
59 atomic_dec(&event->profile_count); 57 event->profile_count--;
60 58
61 return ret; 59 return ret;
62} 60}
@@ -83,7 +81,7 @@ static void ftrace_profile_disable_event(struct ftrace_event_call *event)
83{ 81{
84 char *buf, *nmi_buf; 82 char *buf, *nmi_buf;
85 83
86 if (!atomic_add_negative(-1, &event->profile_count)) 84 if (--event->profile_count > 0)
87 return; 85 return;
88 86
89 event->profile_disable(event); 87 event->profile_disable(event);
@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
120 } 118 }
121 mutex_unlock(&event_mutex); 119 mutex_unlock(&event_mutex);
122} 120}
121
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags)
124{
125 struct trace_entry *entry;
126 char *trace_buf, *raw_data;
127 int pc, cpu;
128
129 pc = preempt_count();
130
131 /* Protect the per cpu buffer, begin the rcu read side */
132 local_irq_save(*irq_flags);
133
134 *rctxp = perf_swevent_get_recursion_context();
135 if (*rctxp < 0)
136 goto err_recursion;
137
138 cpu = smp_processor_id();
139
140 if (in_nmi())
141 trace_buf = rcu_dereference(perf_trace_buf_nmi);
142 else
143 trace_buf = rcu_dereference(perf_trace_buf);
144
145 if (!trace_buf)
146 goto err;
147
148 raw_data = per_cpu_ptr(trace_buf, cpu);
149
150 /* zero the dead bytes from align to not leak stack to user */
151 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
152
153 entry = (struct trace_entry *)raw_data;
154 tracing_generic_entry_update(entry, *irq_flags, pc);
155 entry->type = type;
156
157 return raw_data;
158err:
159 perf_swevent_put_recursion_context(*rctxp);
160err_recursion:
161 local_irq_restore(*irq_flags);
162 return NULL;
163}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1d18315dc836..3f972ad98d04 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
60 return 0; 60 return 0;
61 61
62err: 62err:
63 if (field) { 63 if (field)
64 kfree(field->name); 64 kfree(field->name);
65 kfree(field->type);
66 }
67 kfree(field); 65 kfree(field);
68 66
69 return -ENOMEM; 67 return -ENOMEM;
@@ -78,7 +76,7 @@ EXPORT_SYMBOL_GPL(trace_define_field);
78 if (ret) \ 76 if (ret) \
79 return ret; 77 return ret;
80 78
81int trace_define_common_fields(struct ftrace_event_call *call) 79static int trace_define_common_fields(struct ftrace_event_call *call)
82{ 80{
83 int ret; 81 int ret;
84 struct trace_entry ent; 82 struct trace_entry ent;
@@ -91,7 +89,6 @@ int trace_define_common_fields(struct ftrace_event_call *call)
91 89
92 return ret; 90 return ret;
93} 91}
94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 92
96void trace_destroy_fields(struct ftrace_event_call *call) 93void trace_destroy_fields(struct ftrace_event_call *call)
97{ 94{
@@ -105,9 +102,25 @@ void trace_destroy_fields(struct ftrace_event_call *call)
105 } 102 }
106} 103}
107 104
108static void ftrace_event_enable_disable(struct ftrace_event_call *call, 105int trace_event_raw_init(struct ftrace_event_call *call)
106{
107 int id;
108
109 id = register_ftrace_event(call->event);
110 if (!id)
111 return -ENODEV;
112 call->id = id;
113 INIT_LIST_HEAD(&call->fields);
114
115 return 0;
116}
117EXPORT_SYMBOL_GPL(trace_event_raw_init);
118
119static int ftrace_event_enable_disable(struct ftrace_event_call *call,
109 int enable) 120 int enable)
110{ 121{
122 int ret = 0;
123
111 switch (enable) { 124 switch (enable) {
112 case 0: 125 case 0:
113 if (call->enabled) { 126 if (call->enabled) {
@@ -118,12 +131,20 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
118 break; 131 break;
119 case 1: 132 case 1:
120 if (!call->enabled) { 133 if (!call->enabled) {
121 call->enabled = 1;
122 tracing_start_cmdline_record(); 134 tracing_start_cmdline_record();
123 call->regfunc(call); 135 ret = call->regfunc(call);
136 if (ret) {
137 tracing_stop_cmdline_record();
138 pr_info("event trace: Could not enable event "
139 "%s\n", call->name);
140 break;
141 }
142 call->enabled = 1;
124 } 143 }
125 break; 144 break;
126 } 145 }
146
147 return ret;
127} 148}
128 149
129static void ftrace_clear_events(void) 150static void ftrace_clear_events(void)
@@ -402,7 +423,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
402 case 0: 423 case 0:
403 case 1: 424 case 1:
404 mutex_lock(&event_mutex); 425 mutex_lock(&event_mutex);
405 ftrace_event_enable_disable(call, val); 426 ret = ftrace_event_enable_disable(call, val);
406 mutex_unlock(&event_mutex); 427 mutex_unlock(&event_mutex);
407 break; 428 break;
408 429
@@ -412,7 +433,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
412 433
413 *ppos += cnt; 434 *ppos += cnt;
414 435
415 return cnt; 436 return ret ? ret : cnt;
416} 437}
417 438
418static ssize_t 439static ssize_t
@@ -497,41 +518,16 @@ out:
497 return ret; 518 return ret;
498} 519}
499 520
500extern char *__bad_type_size(void);
501
502#undef FIELD
503#define FIELD(type, name) \
504 sizeof(type) != sizeof(field.name) ? __bad_type_size() : \
505 #type, "common_" #name, offsetof(typeof(field), name), \
506 sizeof(field.name), is_signed_type(type)
507
508static int trace_write_header(struct trace_seq *s)
509{
510 struct trace_entry field;
511
512 /* struct trace_entry */
513 return trace_seq_printf(s,
514 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
515 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
516 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
517 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
518 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n"
519 "\n",
520 FIELD(unsigned short, type),
521 FIELD(unsigned char, flags),
522 FIELD(unsigned char, preempt_count),
523 FIELD(int, pid),
524 FIELD(int, lock_depth));
525}
526
527static ssize_t 521static ssize_t
528event_format_read(struct file *filp, char __user *ubuf, size_t cnt, 522event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
529 loff_t *ppos) 523 loff_t *ppos)
530{ 524{
531 struct ftrace_event_call *call = filp->private_data; 525 struct ftrace_event_call *call = filp->private_data;
526 struct ftrace_event_field *field;
532 struct trace_seq *s; 527 struct trace_seq *s;
528 int common_field_count = 5;
533 char *buf; 529 char *buf;
534 int r; 530 int r = 0;
535 531
536 if (*ppos) 532 if (*ppos)
537 return 0; 533 return 0;
@@ -542,14 +538,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
542 538
543 trace_seq_init(s); 539 trace_seq_init(s);
544 540
545 /* If any of the first writes fail, so will the show_format. */
546
547 trace_seq_printf(s, "name: %s\n", call->name); 541 trace_seq_printf(s, "name: %s\n", call->name);
548 trace_seq_printf(s, "ID: %d\n", call->id); 542 trace_seq_printf(s, "ID: %d\n", call->id);
549 trace_seq_printf(s, "format:\n"); 543 trace_seq_printf(s, "format:\n");
550 trace_write_header(s);
551 544
552 r = call->show_format(call, s); 545 list_for_each_entry_reverse(field, &call->fields, link) {
546 /*
547 * Smartly shows the array type(except dynamic array).
548 * Normal:
549 * field:TYPE VAR
550 * If TYPE := TYPE[LEN], it is shown:
551 * field:TYPE VAR[LEN]
552 */
553 const char *array_descriptor = strchr(field->type, '[');
554
555 if (!strncmp(field->type, "__data_loc", 10))
556 array_descriptor = NULL;
557
558 if (!array_descriptor) {
559 r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;"
560 "\tsize:%u;\tsigned:%d;\n",
561 field->type, field->name, field->offset,
562 field->size, !!field->is_signed);
563 } else {
564 r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;"
565 "\tsize:%u;\tsigned:%d;\n",
566 (int)(array_descriptor - field->type),
567 field->type, field->name,
568 array_descriptor, field->offset,
569 field->size, !!field->is_signed);
570 }
571
572 if (--common_field_count == 0)
573 r = trace_seq_printf(s, "\n");
574
575 if (!r)
576 break;
577 }
578
579 if (r)
580 r = trace_seq_printf(s, "\nprint fmt: %s\n",
581 call->print_fmt);
582
553 if (!r) { 583 if (!r) {
554 /* 584 /*
555 * ug! The format output is bigger than a PAGE!! 585 * ug! The format output is bigger than a PAGE!!
@@ -913,7 +943,9 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
913 id); 943 id);
914 944
915 if (call->define_fields) { 945 if (call->define_fields) {
916 ret = call->define_fields(call); 946 ret = trace_define_common_fields(call);
947 if (!ret)
948 ret = call->define_fields(call);
917 if (ret < 0) { 949 if (ret < 0) {
918 pr_warning("Could not initialize trace point" 950 pr_warning("Could not initialize trace point"
919 " events/%s\n", call->name); 951 " events/%s\n", call->name);
@@ -923,10 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
923 filter); 955 filter);
924 } 956 }
925 957
926 /* A trace may not want to export its format */
927 if (!call->show_format)
928 return 0;
929
930 trace_create_file("format", 0444, call->dir, call, 958 trace_create_file("format", 0444, call->dir, call,
931 format); 959 format);
932 960
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 50504cb228de..4615f62a04f1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -211,8 +211,9 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
211{ 211{
212 char **addr = (char **)(event + pred->offset); 212 char **addr = (char **)(event + pred->offset);
213 int cmp, match; 213 int cmp, match;
214 int len = strlen(*addr) + 1; /* including tailing '\0' */
214 215
215 cmp = pred->regex.match(*addr, &pred->regex, pred->regex.field_len); 216 cmp = pred->regex.match(*addr, &pred->regex, len);
216 217
217 match = cmp ^ pred->not; 218 match = cmp ^ pred->not;
218 219
@@ -251,7 +252,18 @@ static int filter_pred_none(struct filter_pred *pred, void *event,
251 return 0; 252 return 0;
252} 253}
253 254
254/* Basic regex callbacks */ 255/*
256 * regex_match_foo - Basic regex callbacks
257 *
258 * @str: the string to be searched
259 * @r: the regex structure containing the pattern string
260 * @len: the length of the string to be searched (including '\0')
261 *
262 * Note:
263 * - @str might not be NULL-terminated if it's of type DYN_STRING
264 * or STATIC_STRING
265 */
266
255static int regex_match_full(char *str, struct regex *r, int len) 267static int regex_match_full(char *str, struct regex *r, int len)
256{ 268{
257 if (strncmp(str, r->pattern, len) == 0) 269 if (strncmp(str, r->pattern, len) == 0)
@@ -261,23 +273,24 @@ static int regex_match_full(char *str, struct regex *r, int len)
261 273
262static int regex_match_front(char *str, struct regex *r, int len) 274static int regex_match_front(char *str, struct regex *r, int len)
263{ 275{
264 if (strncmp(str, r->pattern, len) == 0) 276 if (strncmp(str, r->pattern, r->len) == 0)
265 return 1; 277 return 1;
266 return 0; 278 return 0;
267} 279}
268 280
269static int regex_match_middle(char *str, struct regex *r, int len) 281static int regex_match_middle(char *str, struct regex *r, int len)
270{ 282{
271 if (strstr(str, r->pattern)) 283 if (strnstr(str, r->pattern, len))
272 return 1; 284 return 1;
273 return 0; 285 return 0;
274} 286}
275 287
276static int regex_match_end(char *str, struct regex *r, int len) 288static int regex_match_end(char *str, struct regex *r, int len)
277{ 289{
278 char *ptr = strstr(str, r->pattern); 290 int strlen = len - 1;
279 291
280 if (ptr && (ptr[r->len] == 0)) 292 if (strlen >= r->len &&
293 memcmp(str + strlen - r->len, r->pattern, r->len) == 0)
281 return 1; 294 return 1;
282 return 0; 295 return 0;
283} 296}
@@ -781,10 +794,8 @@ static int filter_add_pred(struct filter_parse_state *ps,
781 pred->regex.field_len = field->size; 794 pred->regex.field_len = field->size;
782 } else if (field->filter_type == FILTER_DYN_STRING) 795 } else if (field->filter_type == FILTER_DYN_STRING)
783 fn = filter_pred_strloc; 796 fn = filter_pred_strloc;
784 else { 797 else
785 fn = filter_pred_pchar; 798 fn = filter_pred_pchar;
786 pred->regex.field_len = strlen(pred->regex.pattern);
787 }
788 } else { 799 } else {
789 if (field->is_signed) 800 if (field->is_signed)
790 ret = strict_strtoll(pred->regex.pattern, 0, &val); 801 ret = strict_strtoll(pred->regex.pattern, 0, &val);
@@ -1360,7 +1371,7 @@ out_unlock:
1360 return err; 1371 return err;
1361} 1372}
1362 1373
1363#ifdef CONFIG_EVENT_PROFILE 1374#ifdef CONFIG_PERF_EVENTS
1364 1375
1365void ftrace_profile_free_filter(struct perf_event *event) 1376void ftrace_profile_free_filter(struct perf_event *event)
1366{ 1377{
@@ -1428,5 +1439,5 @@ out_unlock:
1428 return err; 1439 return err;
1429} 1440}
1430 1441
1431#endif /* CONFIG_EVENT_PROFILE */ 1442#endif /* CONFIG_PERF_EVENTS */
1432 1443
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index dff8c84ddf17..e091f64ba6ce 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \
62 62
63#include "trace_entries.h" 63#include "trace_entries.h"
64 64
65
66#undef __field
67#define __field(type, item) \
68 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
69 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
70 offsetof(typeof(field), item), \
71 sizeof(field.item), is_signed_type(type)); \
72 if (!ret) \
73 return 0;
74
75#undef __field_desc
76#define __field_desc(type, container, item) \
77 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
78 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
79 offsetof(typeof(field), container.item), \
80 sizeof(field.container.item), \
81 is_signed_type(type)); \
82 if (!ret) \
83 return 0;
84
85#undef __array
86#define __array(type, item, len) \
87 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
88 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
89 offsetof(typeof(field), item), \
90 sizeof(field.item), is_signed_type(type)); \
91 if (!ret) \
92 return 0;
93
94#undef __array_desc
95#define __array_desc(type, container, item, len) \
96 ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \
97 "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \
98 offsetof(typeof(field), container.item), \
99 sizeof(field.container.item), \
100 is_signed_type(type)); \
101 if (!ret) \
102 return 0;
103
104#undef __dynamic_array
105#define __dynamic_array(type, item) \
106 ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \
107 "offset:%zu;\tsize:0;\tsigned:%u;\n", \
108 offsetof(typeof(field), item), \
109 is_signed_type(type)); \
110 if (!ret) \
111 return 0;
112
113#undef F_printk
114#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
115
116#undef __entry
117#define __entry REC
118
119#undef FTRACE_ENTRY
120#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
121static int \
122ftrace_format_##name(struct ftrace_event_call *unused, \
123 struct trace_seq *s) \
124{ \
125 struct struct_name field __attribute__((unused)); \
126 int ret = 0; \
127 \
128 tstruct; \
129 \
130 trace_seq_printf(s, "\nprint fmt: " print); \
131 \
132 return ret; \
133}
134
135#include "trace_entries.h"
136
137#undef __field 65#undef __field
138#define __field(type, item) \ 66#define __field(type, item) \
139 ret = trace_define_field(event_call, #type, #item, \ 67 ret = trace_define_field(event_call, #type, #item, \
@@ -158,7 +86,8 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
158 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ 86 BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \
159 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 87 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
160 offsetof(typeof(field), item), \ 88 offsetof(typeof(field), item), \
161 sizeof(field.item), 0, FILTER_OTHER); \ 89 sizeof(field.item), \
90 is_signed_type(type), FILTER_OTHER); \
162 if (ret) \ 91 if (ret) \
163 return ret; 92 return ret;
164 93
@@ -168,13 +97,18 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
168 ret = trace_define_field(event_call, #type "[" #len "]", #item, \ 97 ret = trace_define_field(event_call, #type "[" #len "]", #item, \
169 offsetof(typeof(field), \ 98 offsetof(typeof(field), \
170 container.item), \ 99 container.item), \
171 sizeof(field.container.item), 0, \ 100 sizeof(field.container.item), \
172 FILTER_OTHER); \ 101 is_signed_type(type), FILTER_OTHER); \
173 if (ret) \ 102 if (ret) \
174 return ret; 103 return ret;
175 104
176#undef __dynamic_array 105#undef __dynamic_array
177#define __dynamic_array(type, item) 106#define __dynamic_array(type, item) \
107 ret = trace_define_field(event_call, #type, #item, \
108 offsetof(typeof(field), item), \
109 0, is_signed_type(type), FILTER_OTHER);\
110 if (ret) \
111 return ret;
178 112
179#undef FTRACE_ENTRY 113#undef FTRACE_ENTRY
180#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ 114#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
@@ -184,10 +118,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
184 struct struct_name field; \ 118 struct struct_name field; \
185 int ret; \ 119 int ret; \
186 \ 120 \
187 ret = trace_define_common_fields(event_call); \
188 if (ret) \
189 return ret; \
190 \
191 tstruct; \ 121 tstruct; \
192 \ 122 \
193 return ret; \ 123 return ret; \
@@ -201,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
201 return 0; 131 return 0;
202} 132}
203 133
134#undef __entry
135#define __entry REC
136
204#undef __field 137#undef __field
205#define __field(type, item) 138#define __field(type, item)
206 139
@@ -216,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
216#undef __dynamic_array 149#undef __dynamic_array
217#define __dynamic_array(type, item) 150#define __dynamic_array(type, item)
218 151
152#undef F_printk
153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154
219#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
220#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
221 \ 157 \
@@ -226,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
226 .id = type, \ 162 .id = type, \
227 .system = __stringify(TRACE_SYSTEM), \ 163 .system = __stringify(TRACE_SYSTEM), \
228 .raw_init = ftrace_raw_init_event, \ 164 .raw_init = ftrace_raw_init_event, \
229 .show_format = ftrace_format_##call, \ 165 .print_fmt = print, \
230 .define_fields = ftrace_define_fields_##call, \ 166 .define_fields = ftrace_define_fields_##call, \
231}; \ 167}; \
232 168
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 45e6c01b2e4d..e998a824e9db 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,9 +14,21 @@
14#include "trace.h" 14#include "trace.h"
15#include "trace_output.h" 15#include "trace_output.h"
16 16
17struct fgraph_data { 17struct fgraph_cpu_data {
18 pid_t last_pid; 18 pid_t last_pid;
19 int depth; 19 int depth;
20 int ignore;
21 unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH];
22};
23
24struct fgraph_data {
25 struct fgraph_cpu_data *cpu_data;
26
27 /* Place to preserve last processed entry. */
28 struct ftrace_graph_ent_entry ent;
29 struct ftrace_graph_ret_entry ret;
30 int failed;
31 int cpu;
20}; 32};
21 33
22#define TRACE_GRAPH_INDENT 2 34#define TRACE_GRAPH_INDENT 2
@@ -176,7 +188,7 @@ static int __trace_graph_entry(struct trace_array *tr,
176 struct ring_buffer *buffer = tr->buffer; 188 struct ring_buffer *buffer = tr->buffer;
177 struct ftrace_graph_ent_entry *entry; 189 struct ftrace_graph_ent_entry *entry;
178 190
179 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 191 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
180 return 0; 192 return 0;
181 193
182 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, 194 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT,
@@ -201,13 +213,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
201 int cpu; 213 int cpu;
202 int pc; 214 int pc;
203 215
204 if (unlikely(!tr))
205 return 0;
206
207 if (!ftrace_trace_task(current)) 216 if (!ftrace_trace_task(current))
208 return 0; 217 return 0;
209 218
210 if (!ftrace_graph_addr(trace->func)) 219 /* trace it when it is-nested-in or is a function enabled. */
220 if (!(trace->depth || ftrace_graph_addr(trace->func)))
211 return 0; 221 return 0;
212 222
213 local_irq_save(flags); 223 local_irq_save(flags);
@@ -220,9 +230,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
220 } else { 230 } else {
221 ret = 0; 231 ret = 0;
222 } 232 }
223 /* Only do the atomic if it is not already set */
224 if (!test_tsk_trace_graph(current))
225 set_tsk_trace_graph(current);
226 233
227 atomic_dec(&data->disabled); 234 atomic_dec(&data->disabled);
228 local_irq_restore(flags); 235 local_irq_restore(flags);
@@ -240,7 +247,7 @@ static void __trace_graph_return(struct trace_array *tr,
240 struct ring_buffer *buffer = tr->buffer; 247 struct ring_buffer *buffer = tr->buffer;
241 struct ftrace_graph_ret_entry *entry; 248 struct ftrace_graph_ret_entry *entry;
242 249
243 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled)))) 250 if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled))))
244 return; 251 return;
245 252
246 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, 253 event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET,
@@ -270,17 +277,24 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
270 pc = preempt_count(); 277 pc = preempt_count();
271 __trace_graph_return(tr, trace, flags, pc); 278 __trace_graph_return(tr, trace, flags, pc);
272 } 279 }
273 if (!trace->depth)
274 clear_tsk_trace_graph(current);
275 atomic_dec(&data->disabled); 280 atomic_dec(&data->disabled);
276 local_irq_restore(flags); 281 local_irq_restore(flags);
277} 282}
278 283
284void set_graph_array(struct trace_array *tr)
285{
286 graph_array = tr;
287
288 /* Make graph_array visible before we start tracing */
289
290 smp_mb();
291}
292
279static int graph_trace_init(struct trace_array *tr) 293static int graph_trace_init(struct trace_array *tr)
280{ 294{
281 int ret; 295 int ret;
282 296
283 graph_array = tr; 297 set_graph_array(tr);
284 ret = register_ftrace_graph(&trace_graph_return, 298 ret = register_ftrace_graph(&trace_graph_return,
285 &trace_graph_entry); 299 &trace_graph_entry);
286 if (ret) 300 if (ret)
@@ -290,11 +304,6 @@ static int graph_trace_init(struct trace_array *tr)
290 return 0; 304 return 0;
291} 305}
292 306
293void set_graph_array(struct trace_array *tr)
294{
295 graph_array = tr;
296}
297
298static void graph_trace_reset(struct trace_array *tr) 307static void graph_trace_reset(struct trace_array *tr)
299{ 308{
300 tracing_stop_cmdline_record(); 309 tracing_stop_cmdline_record();
@@ -384,7 +393,7 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
384 if (!data) 393 if (!data)
385 return TRACE_TYPE_HANDLED; 394 return TRACE_TYPE_HANDLED;
386 395
387 last_pid = &(per_cpu_ptr(data, cpu)->last_pid); 396 last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
388 397
389 if (*last_pid == pid) 398 if (*last_pid == pid)
390 return TRACE_TYPE_HANDLED; 399 return TRACE_TYPE_HANDLED;
@@ -435,26 +444,49 @@ static struct ftrace_graph_ret_entry *
435get_return_for_leaf(struct trace_iterator *iter, 444get_return_for_leaf(struct trace_iterator *iter,
436 struct ftrace_graph_ent_entry *curr) 445 struct ftrace_graph_ent_entry *curr)
437{ 446{
438 struct ring_buffer_iter *ring_iter; 447 struct fgraph_data *data = iter->private;
448 struct ring_buffer_iter *ring_iter = NULL;
439 struct ring_buffer_event *event; 449 struct ring_buffer_event *event;
440 struct ftrace_graph_ret_entry *next; 450 struct ftrace_graph_ret_entry *next;
441 451
442 ring_iter = iter->buffer_iter[iter->cpu]; 452 /*
453 * If the previous output failed to write to the seq buffer,
454 * then we just reuse the data from before.
455 */
456 if (data && data->failed) {
457 curr = &data->ent;
458 next = &data->ret;
459 } else {
443 460
444 /* First peek to compare current entry and the next one */ 461 ring_iter = iter->buffer_iter[iter->cpu];
445 if (ring_iter) 462
446 event = ring_buffer_iter_peek(ring_iter, NULL); 463 /* First peek to compare current entry and the next one */
447 else { 464 if (ring_iter)
448 /* We need to consume the current entry to see the next one */ 465 event = ring_buffer_iter_peek(ring_iter, NULL);
449 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 466 else {
450 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 467 /*
451 NULL); 468 * We need to consume the current entry to see
452 } 469 * the next one.
470 */
471 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
472 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
473 NULL);
474 }
453 475
454 if (!event) 476 if (!event)
455 return NULL; 477 return NULL;
456 478
457 next = ring_buffer_event_data(event); 479 next = ring_buffer_event_data(event);
480
481 if (data) {
482 /*
483 * Save current and next entries for later reference
484 * if the output fails.
485 */
486 data->ent = *curr;
487 data->ret = *next;
488 }
489 }
458 490
459 if (next->ent.type != TRACE_GRAPH_RET) 491 if (next->ent.type != TRACE_GRAPH_RET)
460 return NULL; 492 return NULL;
@@ -639,15 +671,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
639 duration = graph_ret->rettime - graph_ret->calltime; 671 duration = graph_ret->rettime - graph_ret->calltime;
640 672
641 if (data) { 673 if (data) {
674 struct fgraph_cpu_data *cpu_data;
642 int cpu = iter->cpu; 675 int cpu = iter->cpu;
643 int *depth = &(per_cpu_ptr(data, cpu)->depth); 676
677 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
644 678
645 /* 679 /*
646 * Comments display at + 1 to depth. Since 680 * Comments display at + 1 to depth. Since
647 * this is a leaf function, keep the comments 681 * this is a leaf function, keep the comments
648 * equal to this depth. 682 * equal to this depth.
649 */ 683 */
650 *depth = call->depth - 1; 684 cpu_data->depth = call->depth - 1;
685
686 /* No need to keep this function around for this depth */
687 if (call->depth < FTRACE_RETFUNC_DEPTH)
688 cpu_data->enter_funcs[call->depth] = 0;
651 } 689 }
652 690
653 /* Overhead */ 691 /* Overhead */
@@ -687,10 +725,15 @@ print_graph_entry_nested(struct trace_iterator *iter,
687 int i; 725 int i;
688 726
689 if (data) { 727 if (data) {
728 struct fgraph_cpu_data *cpu_data;
690 int cpu = iter->cpu; 729 int cpu = iter->cpu;
691 int *depth = &(per_cpu_ptr(data, cpu)->depth);
692 730
693 *depth = call->depth; 731 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
732 cpu_data->depth = call->depth;
733
734 /* Save this function pointer to see if the exit matches */
735 if (call->depth < FTRACE_RETFUNC_DEPTH)
736 cpu_data->enter_funcs[call->depth] = call->func;
694 } 737 }
695 738
696 /* No overhead */ 739 /* No overhead */
@@ -782,19 +825,34 @@ static enum print_line_t
782print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 825print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
783 struct trace_iterator *iter) 826 struct trace_iterator *iter)
784{ 827{
785 int cpu = iter->cpu; 828 struct fgraph_data *data = iter->private;
786 struct ftrace_graph_ent *call = &field->graph_ent; 829 struct ftrace_graph_ent *call = &field->graph_ent;
787 struct ftrace_graph_ret_entry *leaf_ret; 830 struct ftrace_graph_ret_entry *leaf_ret;
831 static enum print_line_t ret;
832 int cpu = iter->cpu;
788 833
789 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 834 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
790 return TRACE_TYPE_PARTIAL_LINE; 835 return TRACE_TYPE_PARTIAL_LINE;
791 836
792 leaf_ret = get_return_for_leaf(iter, field); 837 leaf_ret = get_return_for_leaf(iter, field);
793 if (leaf_ret) 838 if (leaf_ret)
794 return print_graph_entry_leaf(iter, field, leaf_ret, s); 839 ret = print_graph_entry_leaf(iter, field, leaf_ret, s);
795 else 840 else
796 return print_graph_entry_nested(iter, field, s, cpu); 841 ret = print_graph_entry_nested(iter, field, s, cpu);
842
843 if (data) {
844 /*
845 * If we failed to write our output, then we need to make
846 * note of it. Because we already consumed our entry.
847 */
848 if (s->full) {
849 data->failed = 1;
850 data->cpu = cpu;
851 } else
852 data->failed = 0;
853 }
797 854
855 return ret;
798} 856}
799 857
800static enum print_line_t 858static enum print_line_t
@@ -805,19 +863,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
805 struct fgraph_data *data = iter->private; 863 struct fgraph_data *data = iter->private;
806 pid_t pid = ent->pid; 864 pid_t pid = ent->pid;
807 int cpu = iter->cpu; 865 int cpu = iter->cpu;
866 int func_match = 1;
808 int ret; 867 int ret;
809 int i; 868 int i;
810 869
811 if (data) { 870 if (data) {
871 struct fgraph_cpu_data *cpu_data;
812 int cpu = iter->cpu; 872 int cpu = iter->cpu;
813 int *depth = &(per_cpu_ptr(data, cpu)->depth); 873
874 cpu_data = per_cpu_ptr(data->cpu_data, cpu);
814 875
815 /* 876 /*
816 * Comments display at + 1 to depth. This is the 877 * Comments display at + 1 to depth. This is the
817 * return from a function, we now want the comments 878 * return from a function, we now want the comments
818 * to display at the same level of the bracket. 879 * to display at the same level of the bracket.
819 */ 880 */
820 *depth = trace->depth - 1; 881 cpu_data->depth = trace->depth - 1;
882
883 if (trace->depth < FTRACE_RETFUNC_DEPTH) {
884 if (cpu_data->enter_funcs[trace->depth] != trace->func)
885 func_match = 0;
886 cpu_data->enter_funcs[trace->depth] = 0;
887 }
821 } 888 }
822 889
823 if (print_graph_prologue(iter, s, 0, 0)) 890 if (print_graph_prologue(iter, s, 0, 0))
@@ -842,9 +909,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
842 return TRACE_TYPE_PARTIAL_LINE; 909 return TRACE_TYPE_PARTIAL_LINE;
843 } 910 }
844 911
845 ret = trace_seq_printf(s, "}\n"); 912 /*
846 if (!ret) 913 * If the return function does not have a matching entry,
847 return TRACE_TYPE_PARTIAL_LINE; 914 * then the entry was lost. Instead of just printing
915 * the '}' and letting the user guess what function this
916 * belongs to, write out the function name.
917 */
918 if (func_match) {
919 ret = trace_seq_printf(s, "}\n");
920 if (!ret)
921 return TRACE_TYPE_PARTIAL_LINE;
922 } else {
923 ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func);
924 if (!ret)
925 return TRACE_TYPE_PARTIAL_LINE;
926 }
848 927
849 /* Overrun */ 928 /* Overrun */
850 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 929 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) {
@@ -873,7 +952,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
873 int i; 952 int i;
874 953
875 if (data) 954 if (data)
876 depth = per_cpu_ptr(data, iter->cpu)->depth; 955 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
877 956
878 if (print_graph_prologue(iter, s, 0, 0)) 957 if (print_graph_prologue(iter, s, 0, 0))
879 return TRACE_TYPE_PARTIAL_LINE; 958 return TRACE_TYPE_PARTIAL_LINE;
@@ -941,8 +1020,33 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
941enum print_line_t 1020enum print_line_t
942print_graph_function(struct trace_iterator *iter) 1021print_graph_function(struct trace_iterator *iter)
943{ 1022{
1023 struct ftrace_graph_ent_entry *field;
1024 struct fgraph_data *data = iter->private;
944 struct trace_entry *entry = iter->ent; 1025 struct trace_entry *entry = iter->ent;
945 struct trace_seq *s = &iter->seq; 1026 struct trace_seq *s = &iter->seq;
1027 int cpu = iter->cpu;
1028 int ret;
1029
1030 if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
1031 per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
1032 return TRACE_TYPE_HANDLED;
1033 }
1034
1035 /*
1036 * If the last output failed, there's a possibility we need
1037 * to print out the missing entry which would never go out.
1038 */
1039 if (data && data->failed) {
1040 field = &data->ent;
1041 iter->cpu = data->cpu;
1042 ret = print_graph_entry(field, s, iter);
1043 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1044 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1045 ret = TRACE_TYPE_NO_CONSUME;
1046 }
1047 iter->cpu = cpu;
1048 return ret;
1049 }
946 1050
947 switch (entry->type) { 1051 switch (entry->type) {
948 case TRACE_GRAPH_ENT: { 1052 case TRACE_GRAPH_ENT: {
@@ -952,7 +1056,7 @@ print_graph_function(struct trace_iterator *iter)
952 * sizeof(struct ftrace_graph_ent_entry) is very small, 1056 * sizeof(struct ftrace_graph_ent_entry) is very small,
953 * it can be safely saved at the stack. 1057 * it can be safely saved at the stack.
954 */ 1058 */
955 struct ftrace_graph_ent_entry *field, saved; 1059 struct ftrace_graph_ent_entry saved;
956 trace_assign_type(field, entry); 1060 trace_assign_type(field, entry);
957 saved = *field; 1061 saved = *field;
958 return print_graph_entry(&saved, s, iter); 1062 return print_graph_entry(&saved, s, iter);
@@ -1030,31 +1134,54 @@ static void print_graph_headers(struct seq_file *s)
1030static void graph_trace_open(struct trace_iterator *iter) 1134static void graph_trace_open(struct trace_iterator *iter)
1031{ 1135{
1032 /* pid and depth on the last trace processed */ 1136 /* pid and depth on the last trace processed */
1033 struct fgraph_data *data = alloc_percpu(struct fgraph_data); 1137 struct fgraph_data *data;
1034 int cpu; 1138 int cpu;
1035 1139
1140 iter->private = NULL;
1141
1142 data = kzalloc(sizeof(*data), GFP_KERNEL);
1036 if (!data) 1143 if (!data)
1037 pr_warning("function graph tracer: not enough memory\n"); 1144 goto out_err;
1038 else 1145
1039 for_each_possible_cpu(cpu) { 1146 data->cpu_data = alloc_percpu(struct fgraph_cpu_data);
1040 pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid); 1147 if (!data->cpu_data)
1041 int *depth = &(per_cpu_ptr(data, cpu)->depth); 1148 goto out_err_free;
1042 *pid = -1; 1149
1043 *depth = 0; 1150 for_each_possible_cpu(cpu) {
1044 } 1151 pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid);
1152 int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth);
1153 int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore);
1154 *pid = -1;
1155 *depth = 0;
1156 *ignore = 0;
1157 }
1045 1158
1046 iter->private = data; 1159 iter->private = data;
1160
1161 return;
1162
1163 out_err_free:
1164 kfree(data);
1165 out_err:
1166 pr_warning("function graph tracer: not enough memory\n");
1047} 1167}
1048 1168
1049static void graph_trace_close(struct trace_iterator *iter) 1169static void graph_trace_close(struct trace_iterator *iter)
1050{ 1170{
1051 free_percpu(iter->private); 1171 struct fgraph_data *data = iter->private;
1172
1173 if (data) {
1174 free_percpu(data->cpu_data);
1175 kfree(data);
1176 }
1052} 1177}
1053 1178
1054static struct tracer graph_trace __read_mostly = { 1179static struct tracer graph_trace __read_mostly = {
1055 .name = "function_graph", 1180 .name = "function_graph",
1056 .open = graph_trace_open, 1181 .open = graph_trace_open,
1182 .pipe_open = graph_trace_open,
1057 .close = graph_trace_close, 1183 .close = graph_trace_close,
1184 .pipe_close = graph_trace_close,
1058 .wait_pipe = poll_wait_pipe, 1185 .wait_pipe = poll_wait_pipe,
1059 .init = graph_trace_init, 1186 .init = graph_trace_init,
1060 .reset = graph_trace_reset, 1187 .reset = graph_trace_reset,
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 69543a905cd5..7b97000745f5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -20,10 +20,10 @@
20 20
21#define BTS_BUFFER_SIZE (1 << 13) 21#define BTS_BUFFER_SIZE (1 << 13)
22 22
23static DEFINE_PER_CPU(struct bts_tracer *, tracer); 23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); 24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25 25
26#define this_tracer per_cpu(tracer, smp_processor_id()) 26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27 27
28static int trace_hw_branches_enabled __read_mostly; 28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly; 29static int trace_hw_branches_suspended __read_mostly;
@@ -32,12 +32,13 @@ static struct trace_array *hw_branch_trace __read_mostly;
32 32
33static void bts_trace_init_cpu(int cpu) 33static void bts_trace_init_cpu(int cpu)
34{ 34{
35 per_cpu(tracer, cpu) = 35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, 36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 NULL, (size_t)-1, BTS_KERNEL); 37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
38 39
39 if (IS_ERR(per_cpu(tracer, cpu))) 40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
40 per_cpu(tracer, cpu) = NULL; 41 per_cpu(hwb_tracer, cpu) = NULL;
41} 42}
42 43
43static int bts_trace_init(struct trace_array *tr) 44static int bts_trace_init(struct trace_array *tr)
@@ -51,7 +52,7 @@ static int bts_trace_init(struct trace_array *tr)
51 for_each_online_cpu(cpu) { 52 for_each_online_cpu(cpu) {
52 bts_trace_init_cpu(cpu); 53 bts_trace_init_cpu(cpu);
53 54
54 if (likely(per_cpu(tracer, cpu))) 55 if (likely(per_cpu(hwb_tracer, cpu)))
55 trace_hw_branches_enabled = 1; 56 trace_hw_branches_enabled = 1;
56 } 57 }
57 trace_hw_branches_suspended = 0; 58 trace_hw_branches_suspended = 0;
@@ -67,9 +68,9 @@ static void bts_trace_reset(struct trace_array *tr)
67 68
68 get_online_cpus(); 69 get_online_cpus();
69 for_each_online_cpu(cpu) { 70 for_each_online_cpu(cpu) {
70 if (likely(per_cpu(tracer, cpu))) { 71 if (likely(per_cpu(hwb_tracer, cpu))) {
71 ds_release_bts(per_cpu(tracer, cpu)); 72 ds_release_bts(per_cpu(hwb_tracer, cpu));
72 per_cpu(tracer, cpu) = NULL; 73 per_cpu(hwb_tracer, cpu) = NULL;
73 } 74 }
74 } 75 }
75 trace_hw_branches_enabled = 0; 76 trace_hw_branches_enabled = 0;
@@ -83,8 +84,8 @@ static void bts_trace_start(struct trace_array *tr)
83 84
84 get_online_cpus(); 85 get_online_cpus();
85 for_each_online_cpu(cpu) 86 for_each_online_cpu(cpu)
86 if (likely(per_cpu(tracer, cpu))) 87 if (likely(per_cpu(hwb_tracer, cpu)))
87 ds_resume_bts(per_cpu(tracer, cpu)); 88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
88 trace_hw_branches_suspended = 0; 89 trace_hw_branches_suspended = 0;
89 put_online_cpus(); 90 put_online_cpus();
90} 91}
@@ -95,8 +96,8 @@ static void bts_trace_stop(struct trace_array *tr)
95 96
96 get_online_cpus(); 97 get_online_cpus();
97 for_each_online_cpu(cpu) 98 for_each_online_cpu(cpu)
98 if (likely(per_cpu(tracer, cpu))) 99 if (likely(per_cpu(hwb_tracer, cpu)))
99 ds_suspend_bts(per_cpu(tracer, cpu)); 100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
100 trace_hw_branches_suspended = 1; 101 trace_hw_branches_suspended = 1;
101 put_online_cpus(); 102 put_online_cpus();
102} 103}
@@ -114,16 +115,16 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
114 bts_trace_init_cpu(cpu); 115 bts_trace_init_cpu(cpu);
115 116
116 if (trace_hw_branches_suspended && 117 if (trace_hw_branches_suspended &&
117 likely(per_cpu(tracer, cpu))) 118 likely(per_cpu(hwb_tracer, cpu)))
118 ds_suspend_bts(per_cpu(tracer, cpu)); 119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
119 } 120 }
120 break; 121 break;
121 122
122 case CPU_DOWN_PREPARE: 123 case CPU_DOWN_PREPARE:
123 /* The notification is sent with interrupts enabled. */ 124 /* The notification is sent with interrupts enabled. */
124 if (likely(per_cpu(tracer, cpu))) { 125 if (likely(per_cpu(hwb_tracer, cpu))) {
125 ds_release_bts(per_cpu(tracer, cpu)); 126 ds_release_bts(per_cpu(hwb_tracer, cpu));
126 per_cpu(tracer, cpu) = NULL; 127 per_cpu(hwb_tracer, cpu) = NULL;
127 } 128 }
128 } 129 }
129 130
@@ -258,8 +259,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
258 259
259 get_online_cpus(); 260 get_online_cpus();
260 for_each_online_cpu(cpu) 261 for_each_online_cpu(cpu)
261 if (likely(per_cpu(tracer, cpu))) 262 if (likely(per_cpu(hwb_tracer, cpu)))
262 ds_suspend_bts(per_cpu(tracer, cpu)); 263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
263 /* 264 /*
264 * We need to collect the trace on the respective cpu since ftrace 265 * We need to collect the trace on the respective cpu since ftrace
265 * implicitly adds the record for the current cpu. 266 * implicitly adds the record for the current cpu.
@@ -268,8 +269,8 @@ static void trace_bts_prepare(struct trace_iterator *iter)
268 on_each_cpu(trace_bts_cpu, iter->tr, 1); 269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
269 270
270 for_each_online_cpu(cpu) 271 for_each_online_cpu(cpu)
271 if (likely(per_cpu(tracer, cpu))) 272 if (likely(per_cpu(hwb_tracer, cpu)))
272 ds_resume_bts(per_cpu(tracer, cpu)); 273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
273 put_online_cpus(); 274 put_online_cpus();
274} 275}
275 276
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 3aa7eaa2114c..2974bc7538c7 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -151,6 +151,8 @@ check_critical_timing(struct trace_array *tr,
151 goto out_unlock; 151 goto out_unlock;
152 152
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc);
154 156
155 if (data->critical_sequence != max_sequence) 157 if (data->critical_sequence != max_sequence)
156 goto out_unlock; 158 goto out_unlock;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index aff5f80b59b8..505c92273b1a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
91 return retval; 91 return retval;
92} 92}
93 93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy) 95 void *dummy)
101{ 96{
@@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{ 226{
232 int ret = -EINVAL; 227 int ret = -EINVAL;
233 228
234 if (ff->func == fetch_argument) 229 if (ff->func == fetch_register) {
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name; 230 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data)); 231 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name); 232 ret = snprintf(buf, n, "%%%s", name);
@@ -282,6 +275,18 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
282static int kretprobe_dispatcher(struct kretprobe_instance *ri, 275static int kretprobe_dispatcher(struct kretprobe_instance *ri,
283 struct pt_regs *regs); 276 struct pt_regs *regs);
284 277
278/* Check the name is good for event/group */
279static int check_event_name(const char *name)
280{
281 if (!isalpha(*name) && *name != '_')
282 return 0;
283 while (*++name != '\0') {
284 if (!isalpha(*name) && !isdigit(*name) && *name != '_')
285 return 0;
286 }
287 return 1;
288}
289
285/* 290/*
286 * Allocate new trace_probe and initialize it (including kprobes). 291 * Allocate new trace_probe and initialize it (including kprobes).
287 */ 292 */
@@ -293,10 +298,11 @@ static struct trace_probe *alloc_trace_probe(const char *group,
293 int nargs, int is_return) 298 int nargs, int is_return)
294{ 299{
295 struct trace_probe *tp; 300 struct trace_probe *tp;
301 int ret = -ENOMEM;
296 302
297 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); 303 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
298 if (!tp) 304 if (!tp)
299 return ERR_PTR(-ENOMEM); 305 return ERR_PTR(ret);
300 306
301 if (symbol) { 307 if (symbol) {
302 tp->symbol = kstrdup(symbol, GFP_KERNEL); 308 tp->symbol = kstrdup(symbol, GFP_KERNEL);
@@ -312,14 +318,20 @@ static struct trace_probe *alloc_trace_probe(const char *group,
312 else 318 else
313 tp->rp.kp.pre_handler = kprobe_dispatcher; 319 tp->rp.kp.pre_handler = kprobe_dispatcher;
314 320
315 if (!event) 321 if (!event || !check_event_name(event)) {
322 ret = -EINVAL;
316 goto error; 323 goto error;
324 }
325
317 tp->call.name = kstrdup(event, GFP_KERNEL); 326 tp->call.name = kstrdup(event, GFP_KERNEL);
318 if (!tp->call.name) 327 if (!tp->call.name)
319 goto error; 328 goto error;
320 329
321 if (!group) 330 if (!group || !check_event_name(group)) {
331 ret = -EINVAL;
322 goto error; 332 goto error;
333 }
334
323 tp->call.system = kstrdup(group, GFP_KERNEL); 335 tp->call.system = kstrdup(group, GFP_KERNEL);
324 if (!tp->call.system) 336 if (!tp->call.system)
325 goto error; 337 goto error;
@@ -330,7 +342,7 @@ error:
330 kfree(tp->call.name); 342 kfree(tp->call.name);
331 kfree(tp->symbol); 343 kfree(tp->symbol);
332 kfree(tp); 344 kfree(tp);
333 return ERR_PTR(-ENOMEM); 345 return ERR_PTR(ret);
334} 346}
335 347
336static void free_probe_arg(struct probe_arg *arg) 348static void free_probe_arg(struct probe_arg *arg)
@@ -470,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
470 } 482 }
471 } else 483 } else
472 ret = -EINVAL; 484 ret = -EINVAL;
473 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
474 ret = strict_strtoul(arg + 3, 10, &param);
475 if (ret || param > PARAM_MAX_ARGS)
476 ret = -EINVAL;
477 else {
478 ff->func = fetch_argument;
479 ff->data = (void *)param;
480 }
481 } else 485 } else
482 ret = -EINVAL; 486 ret = -EINVAL;
483 return ret; 487 return ret;
@@ -592,7 +596,6 @@ static int create_trace_probe(int argc, char **argv)
592 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 596 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
593 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 597 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
594 * Fetch args: 598 * Fetch args:
595 * $argN : fetch Nth of function argument. (N:0-)
596 * $retval : fetch return value 599 * $retval : fetch return value
597 * $stack : fetch stack address 600 * $stack : fetch stack address
598 * $stackN : fetch Nth of stack (N:0-) 601 * $stackN : fetch Nth of stack (N:0-)
@@ -606,23 +609,22 @@ static int create_trace_probe(int argc, char **argv)
606 */ 609 */
607 struct trace_probe *tp; 610 struct trace_probe *tp;
608 int i, ret = 0; 611 int i, ret = 0;
609 int is_return = 0; 612 int is_return = 0, is_delete = 0;
610 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 613 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
611 unsigned long offset = 0; 614 unsigned long offset = 0;
612 void *addr = NULL; 615 void *addr = NULL;
613 char buf[MAX_EVENT_NAME_LEN]; 616 char buf[MAX_EVENT_NAME_LEN];
614 617
615 if (argc < 2) { 618 /* argc must be >= 1 */
616 pr_info("Probe point is not specified.\n");
617 return -EINVAL;
618 }
619
620 if (argv[0][0] == 'p') 619 if (argv[0][0] == 'p')
621 is_return = 0; 620 is_return = 0;
622 else if (argv[0][0] == 'r') 621 else if (argv[0][0] == 'r')
623 is_return = 1; 622 is_return = 1;
623 else if (argv[0][0] == '-')
624 is_delete = 1;
624 else { 625 else {
625 pr_info("Probe definition must be started with 'p' or 'r'.\n"); 626 pr_info("Probe definition must be started with 'p', 'r' or"
627 " '-'.\n");
626 return -EINVAL; 628 return -EINVAL;
627 } 629 }
628 630
@@ -633,23 +635,45 @@ static int create_trace_probe(int argc, char **argv)
633 event = strchr(group, '/') + 1; 635 event = strchr(group, '/') + 1;
634 event[-1] = '\0'; 636 event[-1] = '\0';
635 if (strlen(group) == 0) { 637 if (strlen(group) == 0) {
636 pr_info("Group name is not specifiled\n"); 638 pr_info("Group name is not specified\n");
637 return -EINVAL; 639 return -EINVAL;
638 } 640 }
639 } 641 }
640 if (strlen(event) == 0) { 642 if (strlen(event) == 0) {
641 pr_info("Event name is not specifiled\n"); 643 pr_info("Event name is not specified\n");
642 return -EINVAL; 644 return -EINVAL;
643 } 645 }
644 } 646 }
647 if (!group)
648 group = KPROBE_EVENT_SYSTEM;
645 649
650 if (is_delete) {
651 if (!event) {
652 pr_info("Delete command needs an event name.\n");
653 return -EINVAL;
654 }
655 tp = find_probe_event(event, group);
656 if (!tp) {
657 pr_info("Event %s/%s doesn't exist.\n", group, event);
658 return -ENOENT;
659 }
660 /* delete an event */
661 unregister_trace_probe(tp);
662 free_trace_probe(tp);
663 return 0;
664 }
665
666 if (argc < 2) {
667 pr_info("Probe point is not specified.\n");
668 return -EINVAL;
669 }
646 if (isdigit(argv[1][0])) { 670 if (isdigit(argv[1][0])) {
647 if (is_return) { 671 if (is_return) {
648 pr_info("Return probe point must be a symbol.\n"); 672 pr_info("Return probe point must be a symbol.\n");
649 return -EINVAL; 673 return -EINVAL;
650 } 674 }
651 /* an address specified */ 675 /* an address specified */
652 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 676 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
653 if (ret) { 677 if (ret) {
654 pr_info("Failed to parse address.\n"); 678 pr_info("Failed to parse address.\n");
655 return ret; 679 return ret;
@@ -671,15 +695,13 @@ static int create_trace_probe(int argc, char **argv)
671 argc -= 2; argv += 2; 695 argc -= 2; argv += 2;
672 696
673 /* setup a probe */ 697 /* setup a probe */
674 if (!group)
675 group = KPROBE_EVENT_SYSTEM;
676 if (!event) { 698 if (!event) {
677 /* Make a new event name */ 699 /* Make a new event name */
678 if (symbol) 700 if (symbol)
679 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", 701 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
680 is_return ? 'r' : 'p', symbol, offset); 702 is_return ? 'r' : 'p', symbol, offset);
681 else 703 else
682 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", 704 snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
683 is_return ? 'r' : 'p', addr); 705 is_return ? 'r' : 'p', addr);
684 event = buf; 706 event = buf;
685 } 707 }
@@ -920,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
920}; 942};
921 943
922/* Kprobe handler */ 944/* Kprobe handler */
923static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
924{ 946{
925 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
926 struct kprobe_trace_entry *entry; 948 struct kprobe_trace_entry *entry;
@@ -940,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
940 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
941 irq_flags, pc); 963 irq_flags, pc);
942 if (!event) 964 if (!event)
943 return 0; 965 return;
944 966
945 entry = ring_buffer_event_data(event); 967 entry = ring_buffer_event_data(event);
946 entry->nargs = tp->nr_args; 968 entry->nargs = tp->nr_args;
@@ -950,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
950 972
951 if (!filter_current_check_discard(buffer, call, entry, event)) 973 if (!filter_current_check_discard(buffer, call, entry, event))
952 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
953 return 0;
954} 975}
955 976
956/* Kretprobe handler */ 977/* Kretprobe handler */
957static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, 978static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
958 struct pt_regs *regs) 979 struct pt_regs *regs)
959{ 980{
960 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -973,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
973 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
974 irq_flags, pc); 995 irq_flags, pc);
975 if (!event) 996 if (!event)
976 return 0; 997 return;
977 998
978 entry = ring_buffer_event_data(event); 999 entry = ring_buffer_event_data(event);
979 entry->nargs = tp->nr_args; 1000 entry->nargs = tp->nr_args;
@@ -984,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
984 1005
985 if (!filter_current_check_discard(buffer, call, entry, event)) 1006 if (!filter_current_check_discard(buffer, call, entry, event))
986 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
987
988 return 0;
989} 1008}
990 1009
991/* Event entry printers */ 1010/* Event entry printers */
@@ -1113,10 +1132,6 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1113 struct kprobe_trace_entry field; 1132 struct kprobe_trace_entry field;
1114 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1133 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1115 1134
1116 ret = trace_define_common_fields(event_call);
1117 if (!ret)
1118 return ret;
1119
1120 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1135 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1121 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); 1136 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1122 /* Set argument names as fields */ 1137 /* Set argument names as fields */
@@ -1131,10 +1146,6 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1131 struct kretprobe_trace_entry field; 1146 struct kretprobe_trace_entry field;
1132 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1147 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1133 1148
1134 ret = trace_define_common_fields(event_call);
1135 if (!ret)
1136 return ret;
1137
1138 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1149 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1139 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1150 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1140 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); 1151 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
@@ -1144,212 +1155,123 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1144 return 0; 1155 return 0;
1145} 1156}
1146 1157
1147static int __probe_event_show_format(struct trace_seq *s, 1158static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1148 struct trace_probe *tp, const char *fmt,
1149 const char *arg)
1150{ 1159{
1151 int i; 1160 int i;
1161 int pos = 0;
1152 1162
1153 /* Show format */ 1163 const char *fmt, *arg;
1154 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1155 return 0;
1156 1164
1157 for (i = 0; i < tp->nr_args; i++) 1165 if (!probe_is_return(tp)) {
1158 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) 1166 fmt = "(%lx)";
1159 return 0; 1167 arg = "REC->" FIELD_STRING_IP;
1168 } else {
1169 fmt = "(%lx <- %lx)";
1170 arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
1171 }
1160 1172
1161 if (!trace_seq_printf(s, "\", %s", arg)) 1173 /* When len=0, we just calculate the needed length */
1162 return 0; 1174#define LEN_OR_ZERO (len ? len - pos : 0)
1163 1175
1164 for (i = 0; i < tp->nr_args; i++) 1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1165 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1166 return 0;
1167 1177
1168 return trace_seq_puts(s, "\n"); 1178 for (i = 0; i < tp->nr_args; i++) {
1169} 1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx",
1170 1180 tp->args[i].name);
1171#undef SHOW_FIELD 1181 }
1172#define SHOW_FIELD(type, item, name) \
1173 do { \
1174 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
1175 "offset:%u;\tsize:%u;\n", name, \
1176 (unsigned int)offsetof(typeof(field), item),\
1177 (unsigned int)sizeof(type)); \
1178 if (!ret) \
1179 return 0; \
1180 } while (0)
1181 1182
1182static int kprobe_event_show_format(struct ftrace_event_call *call, 1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
1183 struct trace_seq *s)
1184{
1185 struct kprobe_trace_entry field __attribute__((unused));
1186 int ret, i;
1187 struct trace_probe *tp = (struct trace_probe *)call->data;
1188 1184
1189 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); 1185 for (i = 0; i < tp->nr_args; i++) {
1190 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1186 pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
1187 tp->args[i].name);
1188 }
1191 1189
1192 /* Show fields */ 1190#undef LEN_OR_ZERO
1193 for (i = 0; i < tp->nr_args; i++)
1194 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1195 trace_seq_puts(s, "\n");
1196 1191
1197 return __probe_event_show_format(s, tp, "(%lx)", 1192 /* return the length of print_fmt */
1198 "REC->" FIELD_STRING_IP); 1193 return pos;
1199} 1194}
1200 1195
1201static int kretprobe_event_show_format(struct ftrace_event_call *call, 1196static int set_print_fmt(struct trace_probe *tp)
1202 struct trace_seq *s)
1203{ 1197{
1204 struct kretprobe_trace_entry field __attribute__((unused)); 1198 int len;
1205 int ret, i; 1199 char *print_fmt;
1206 struct trace_probe *tp = (struct trace_probe *)call->data;
1207 1200
1208 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); 1201 /* First: called with 0 length to calculate the needed length */
1209 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); 1202 len = __set_print_fmt(tp, NULL, 0);
1210 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); 1203 print_fmt = kmalloc(len + 1, GFP_KERNEL);
1204 if (!print_fmt)
1205 return -ENOMEM;
1211 1206
1212 /* Show fields */ 1207 /* Second: actually write the @print_fmt */
1213 for (i = 0; i < tp->nr_args; i++) 1208 __set_print_fmt(tp, print_fmt, len + 1);
1214 SHOW_FIELD(unsigned long, args[i], tp->args[i].name); 1209 tp->call.print_fmt = print_fmt;
1215 trace_seq_puts(s, "\n");
1216 1210
1217 return __probe_event_show_format(s, tp, "(%lx <- %lx)", 1211 return 0;
1218 "REC->" FIELD_STRING_FUNC
1219 ", REC->" FIELD_STRING_RETIP);
1220} 1212}
1221 1213
1222#ifdef CONFIG_EVENT_PROFILE 1214#ifdef CONFIG_PERF_EVENTS
1223 1215
1224/* Kprobe profile handler */ 1216/* Kprobe profile handler */
1225static __kprobes int kprobe_profile_func(struct kprobe *kp, 1217static __kprobes void kprobe_profile_func(struct kprobe *kp,
1226 struct pt_regs *regs) 1218 struct pt_regs *regs)
1227{ 1219{
1228 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1229 struct ftrace_event_call *call = &tp->call; 1221 struct ftrace_event_call *call = &tp->call;
1230 struct kprobe_trace_entry *entry; 1222 struct kprobe_trace_entry *entry;
1231 struct trace_entry *ent; 1223 int size, __size, i;
1232 int size, __size, i, pc, __cpu;
1233 unsigned long irq_flags; 1224 unsigned long irq_flags;
1234 char *trace_buf;
1235 char *raw_data;
1236 int rctx; 1225 int rctx;
1237 1226
1238 pc = preempt_count();
1239 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1240 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1228 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1241 size -= sizeof(u32); 1229 size -= sizeof(u32);
1242 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1230 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1243 "profile buffer not large enough")) 1231 "profile buffer not large enough"))
1244 return 0; 1232 return;
1245
1246 /*
1247 * Protect the non nmi buffer
1248 * This also protects the rcu read side
1249 */
1250 local_irq_save(irq_flags);
1251
1252 rctx = perf_swevent_get_recursion_context();
1253 if (rctx < 0)
1254 goto end_recursion;
1255 1233
1256 __cpu = smp_processor_id(); 1234 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1257 1235 if (!entry)
1258 if (in_nmi()) 1236 return;
1259 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1260 else
1261 trace_buf = rcu_dereference(perf_trace_buf);
1262
1263 if (!trace_buf)
1264 goto end;
1265
1266 raw_data = per_cpu_ptr(trace_buf, __cpu);
1267
1268 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1269 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1270 entry = (struct kprobe_trace_entry *)raw_data;
1271 ent = &entry->ent;
1272 1237
1273 tracing_generic_entry_update(ent, irq_flags, pc);
1274 ent->type = call->id;
1275 entry->nargs = tp->nr_args; 1238 entry->nargs = tp->nr_args;
1276 entry->ip = (unsigned long)kp->addr; 1239 entry->ip = (unsigned long)kp->addr;
1277 for (i = 0; i < tp->nr_args; i++) 1240 for (i = 0; i < tp->nr_args; i++)
1278 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1279 perf_tp_event(call->id, entry->ip, 1, entry, size);
1280
1281end:
1282 perf_swevent_put_recursion_context(rctx);
1283end_recursion:
1284 local_irq_restore(irq_flags);
1285 1242
1286 return 0; 1243 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
1287} 1244}
1288 1245
1289/* Kretprobe profile handler */ 1246/* Kretprobe profile handler */
1290static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, 1247static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1291 struct pt_regs *regs) 1248 struct pt_regs *regs)
1292{ 1249{
1293 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1294 struct ftrace_event_call *call = &tp->call; 1251 struct ftrace_event_call *call = &tp->call;
1295 struct kretprobe_trace_entry *entry; 1252 struct kretprobe_trace_entry *entry;
1296 struct trace_entry *ent; 1253 int size, __size, i;
1297 int size, __size, i, pc, __cpu;
1298 unsigned long irq_flags; 1254 unsigned long irq_flags;
1299 char *trace_buf;
1300 char *raw_data;
1301 int rctx; 1255 int rctx;
1302 1256
1303 pc = preempt_count();
1304 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1305 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1258 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1306 size -= sizeof(u32); 1259 size -= sizeof(u32);
1307 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1260 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1308 "profile buffer not large enough")) 1261 "profile buffer not large enough"))
1309 return 0; 1262 return;
1310
1311 /*
1312 * Protect the non nmi buffer
1313 * This also protects the rcu read side
1314 */
1315 local_irq_save(irq_flags);
1316
1317 rctx = perf_swevent_get_recursion_context();
1318 if (rctx < 0)
1319 goto end_recursion;
1320 1263
1321 __cpu = smp_processor_id(); 1264 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1322 1265 if (!entry)
1323 if (in_nmi()) 1266 return;
1324 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1325 else
1326 trace_buf = rcu_dereference(perf_trace_buf);
1327
1328 if (!trace_buf)
1329 goto end;
1330
1331 raw_data = per_cpu_ptr(trace_buf, __cpu);
1332
1333 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1334 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1335 entry = (struct kretprobe_trace_entry *)raw_data;
1336 ent = &entry->ent;
1337 1267
1338 tracing_generic_entry_update(ent, irq_flags, pc);
1339 ent->type = call->id;
1340 entry->nargs = tp->nr_args; 1268 entry->nargs = tp->nr_args;
1341 entry->func = (unsigned long)tp->rp.kp.addr; 1269 entry->func = (unsigned long)tp->rp.kp.addr;
1342 entry->ret_ip = (unsigned long)ri->ret_addr; 1270 entry->ret_ip = (unsigned long)ri->ret_addr;
1343 for (i = 0; i < tp->nr_args; i++) 1271 for (i = 0; i < tp->nr_args; i++)
1344 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1345 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1346
1347end:
1348 perf_swevent_put_recursion_context(rctx);
1349end_recursion:
1350 local_irq_restore(irq_flags);
1351 1273
1352 return 0; 1274 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
1353} 1275}
1354 1276
1355static int probe_profile_enable(struct ftrace_event_call *call) 1277static int probe_profile_enable(struct ftrace_event_call *call)
@@ -1377,7 +1299,7 @@ static void probe_profile_disable(struct ftrace_event_call *call)
1377 disable_kprobe(&tp->rp.kp); 1299 disable_kprobe(&tp->rp.kp);
1378 } 1300 }
1379} 1301}
1380#endif /* CONFIG_EVENT_PROFILE */ 1302#endif /* CONFIG_PERF_EVENTS */
1381 1303
1382 1304
1383static __kprobes 1305static __kprobes
@@ -1387,10 +1309,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1387 1309
1388 if (tp->flags & TP_FLAG_TRACE) 1310 if (tp->flags & TP_FLAG_TRACE)
1389 kprobe_trace_func(kp, regs); 1311 kprobe_trace_func(kp, regs);
1390#ifdef CONFIG_EVENT_PROFILE 1312#ifdef CONFIG_PERF_EVENTS
1391 if (tp->flags & TP_FLAG_PROFILE) 1313 if (tp->flags & TP_FLAG_PROFILE)
1392 kprobe_profile_func(kp, regs); 1314 kprobe_profile_func(kp, regs);
1393#endif /* CONFIG_EVENT_PROFILE */ 1315#endif
1394 return 0; /* We don't tweek kernel, so just return 0 */ 1316 return 0; /* We don't tweek kernel, so just return 0 */
1395} 1317}
1396 1318
@@ -1401,10 +1323,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1401 1323
1402 if (tp->flags & TP_FLAG_TRACE) 1324 if (tp->flags & TP_FLAG_TRACE)
1403 kretprobe_trace_func(ri, regs); 1325 kretprobe_trace_func(ri, regs);
1404#ifdef CONFIG_EVENT_PROFILE 1326#ifdef CONFIG_PERF_EVENTS
1405 if (tp->flags & TP_FLAG_PROFILE) 1327 if (tp->flags & TP_FLAG_PROFILE)
1406 kretprobe_profile_func(ri, regs); 1328 kretprobe_profile_func(ri, regs);
1407#endif /* CONFIG_EVENT_PROFILE */ 1329#endif
1408 return 0; /* We don't tweek kernel, so just return 0 */ 1330 return 0; /* We don't tweek kernel, so just return 0 */
1409} 1331}
1410 1332
@@ -1417,24 +1339,25 @@ static int register_probe_event(struct trace_probe *tp)
1417 if (probe_is_return(tp)) { 1339 if (probe_is_return(tp)) {
1418 tp->event.trace = print_kretprobe_event; 1340 tp->event.trace = print_kretprobe_event;
1419 call->raw_init = probe_event_raw_init; 1341 call->raw_init = probe_event_raw_init;
1420 call->show_format = kretprobe_event_show_format;
1421 call->define_fields = kretprobe_event_define_fields; 1342 call->define_fields = kretprobe_event_define_fields;
1422 } else { 1343 } else {
1423 tp->event.trace = print_kprobe_event; 1344 tp->event.trace = print_kprobe_event;
1424 call->raw_init = probe_event_raw_init; 1345 call->raw_init = probe_event_raw_init;
1425 call->show_format = kprobe_event_show_format;
1426 call->define_fields = kprobe_event_define_fields; 1346 call->define_fields = kprobe_event_define_fields;
1427 } 1347 }
1348 if (set_print_fmt(tp) < 0)
1349 return -ENOMEM;
1428 call->event = &tp->event; 1350 call->event = &tp->event;
1429 call->id = register_ftrace_event(&tp->event); 1351 call->id = register_ftrace_event(&tp->event);
1430 if (!call->id) 1352 if (!call->id) {
1353 kfree(call->print_fmt);
1431 return -ENODEV; 1354 return -ENODEV;
1355 }
1432 call->enabled = 0; 1356 call->enabled = 0;
1433 call->regfunc = probe_event_enable; 1357 call->regfunc = probe_event_enable;
1434 call->unregfunc = probe_event_disable; 1358 call->unregfunc = probe_event_disable;
1435 1359
1436#ifdef CONFIG_EVENT_PROFILE 1360#ifdef CONFIG_PERF_EVENTS
1437 atomic_set(&call->profile_count, -1);
1438 call->profile_enable = probe_profile_enable; 1361 call->profile_enable = probe_profile_enable;
1439 call->profile_disable = probe_profile_disable; 1362 call->profile_disable = probe_profile_disable;
1440#endif 1363#endif
@@ -1442,6 +1365,7 @@ static int register_probe_event(struct trace_probe *tp)
1442 ret = trace_add_event_call(call); 1365 ret = trace_add_event_call(call);
1443 if (ret) { 1366 if (ret) {
1444 pr_info("Failed to register kprobe event: %s\n", call->name); 1367 pr_info("Failed to register kprobe event: %s\n", call->name);
1368 kfree(call->print_fmt);
1445 unregister_ftrace_event(&tp->event); 1369 unregister_ftrace_event(&tp->event);
1446 } 1370 }
1447 return ret; 1371 return ret;
@@ -1451,6 +1375,7 @@ static void unregister_probe_event(struct trace_probe *tp)
1451{ 1375{
1452 /* tp->event is unregistered in trace_remove_event_call() */ 1376 /* tp->event is unregistered in trace_remove_event_call() */
1453 trace_remove_event_call(&tp->call); 1377 trace_remove_event_call(&tp->call);
1378 kfree(tp->call.print_fmt);
1454} 1379}
1455 1380
1456/* Make a debugfs interface for controling probe points */ 1381/* Make a debugfs interface for controling probe points */
@@ -1493,28 +1418,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1493 1418
1494static __init int kprobe_trace_self_tests_init(void) 1419static __init int kprobe_trace_self_tests_init(void)
1495{ 1420{
1496 int ret; 1421 int ret, warn = 0;
1497 int (*target)(int, int, int, int, int, int); 1422 int (*target)(int, int, int, int, int, int);
1423 struct trace_probe *tp;
1498 1424
1499 target = kprobe_trace_selftest_target; 1425 target = kprobe_trace_selftest_target;
1500 1426
1501 pr_info("Testing kprobe tracing: "); 1427 pr_info("Testing kprobe tracing: ");
1502 1428
1503 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1429 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1504 "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); 1430 "$stack $stack0 +0($stack)");
1505 if (WARN_ON_ONCE(ret)) 1431 if (WARN_ON_ONCE(ret)) {
1506 pr_warning("error enabling function entry\n"); 1432 pr_warning("error on probing function entry.\n");
1433 warn++;
1434 } else {
1435 /* Enable trace point */
1436 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
1437 if (WARN_ON_ONCE(tp == NULL)) {
1438 pr_warning("error on getting new probe.\n");
1439 warn++;
1440 } else
1441 probe_event_enable(&tp->call);
1442 }
1507 1443
1508 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1444 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1509 "$retval"); 1445 "$retval");
1510 if (WARN_ON_ONCE(ret)) 1446 if (WARN_ON_ONCE(ret)) {
1511 pr_warning("error enabling function return\n"); 1447 pr_warning("error on probing function return.\n");
1448 warn++;
1449 } else {
1450 /* Enable trace point */
1451 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
1452 if (WARN_ON_ONCE(tp == NULL)) {
1453 pr_warning("error on getting new probe.\n");
1454 warn++;
1455 } else
1456 probe_event_enable(&tp->call);
1457 }
1458
1459 if (warn)
1460 goto end;
1512 1461
1513 ret = target(1, 2, 3, 4, 5, 6); 1462 ret = target(1, 2, 3, 4, 5, 6);
1514 1463
1515 cleanup_all_probes(); 1464 ret = command_trace_probe("-:testprobe");
1465 if (WARN_ON_ONCE(ret)) {
1466 pr_warning("error on deleting a probe.\n");
1467 warn++;
1468 }
1469
1470 ret = command_trace_probe("-:testprobe2");
1471 if (WARN_ON_ONCE(ret)) {
1472 pr_warning("error on deleting a probe.\n");
1473 warn++;
1474 }
1516 1475
1517 pr_cont("OK\n"); 1476end:
1477 cleanup_all_probes();
1478 if (warn)
1479 pr_cont("NG: Some tests are failed. Please check them.\n");
1480 else
1481 pr_cont("OK\n");
1518 return 0; 1482 return 0;
1519} 1483}
1520 1484
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index ddfa0fd43bc0..94103cdcf9d8 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -26,12 +26,13 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27 27
28#include "trace_output.h" 28#include "trace_output.h"
29#include "trace_stat.h"
30#include "trace.h" 29#include "trace.h"
31 30
32#include <linux/hw_breakpoint.h> 31#include <linux/hw_breakpoint.h>
33#include <asm/hw_breakpoint.h> 32#include <asm/hw_breakpoint.h>
34 33
34#include <asm/atomic.h>
35
35/* 36/*
36 * For now, let us restrict the no. of symbols traced simultaneously to number 37 * For now, let us restrict the no. of symbols traced simultaneously to number
37 * of available hardware breakpoint registers. 38 * of available hardware breakpoint registers.
@@ -44,7 +45,7 @@ struct trace_ksym {
44 struct perf_event **ksym_hbp; 45 struct perf_event **ksym_hbp;
45 struct perf_event_attr attr; 46 struct perf_event_attr attr;
46#ifdef CONFIG_PROFILE_KSYM_TRACER 47#ifdef CONFIG_PROFILE_KSYM_TRACER
47 unsigned long counter; 48 atomic64_t counter;
48#endif 49#endif
49 struct hlist_node ksym_hlist; 50 struct hlist_node ksym_hlist;
50}; 51};
@@ -69,9 +70,8 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
69 70
70 rcu_read_lock(); 71 rcu_read_lock();
71 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) { 72 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
72 if ((entry->attr.bp_addr == hbp_hit_addr) && 73 if (entry->attr.bp_addr == hbp_hit_addr) {
73 (entry->counter <= MAX_UL_INT)) { 74 atomic64_inc(&entry->counter);
74 entry->counter++;
75 break; 75 break;
76 } 76 }
77 } 77 }
@@ -79,11 +79,12 @@ void ksym_collect_stats(unsigned long hbp_hit_addr)
79} 79}
80#endif /* CONFIG_PROFILE_KSYM_TRACER */ 80#endif /* CONFIG_PROFILE_KSYM_TRACER */
81 81
82void ksym_hbp_handler(struct perf_event *hbp, void *data) 82void ksym_hbp_handler(struct perf_event *hbp, int nmi,
83 struct perf_sample_data *data,
84 struct pt_regs *regs)
83{ 85{
84 struct ring_buffer_event *event; 86 struct ring_buffer_event *event;
85 struct ksym_trace_entry *entry; 87 struct ksym_trace_entry *entry;
86 struct pt_regs *regs = data;
87 struct ring_buffer *buffer; 88 struct ring_buffer *buffer;
88 int pc; 89 int pc;
89 90
@@ -196,7 +197,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
196 entry->attr.bp_addr = addr; 197 entry->attr.bp_addr = addr;
197 entry->attr.bp_len = HW_BREAKPOINT_LEN_4; 198 entry->attr.bp_len = HW_BREAKPOINT_LEN_4;
198 199
199 ret = -EAGAIN;
200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr, 200 entry->ksym_hbp = register_wide_hw_breakpoint(&entry->attr,
201 ksym_hbp_handler); 201 ksym_hbp_handler);
202 202
@@ -235,7 +235,8 @@ static ssize_t ksym_trace_filter_read(struct file *filp, char __user *ubuf,
235 mutex_lock(&ksym_tracer_mutex); 235 mutex_lock(&ksym_tracer_mutex);
236 236
237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) { 237 hlist_for_each_entry(entry, node, &ksym_filter_head, ksym_hlist) {
238 ret = trace_seq_printf(s, "%pS:", (void *)entry->attr.bp_addr); 238 ret = trace_seq_printf(s, "%pS:",
239 (void *)(unsigned long)entry->attr.bp_addr);
239 if (entry->attr.bp_type == HW_BREAKPOINT_R) 240 if (entry->attr.bp_type == HW_BREAKPOINT_R)
240 ret = trace_seq_puts(s, "r--\n"); 241 ret = trace_seq_puts(s, "r--\n");
241 else if (entry->attr.bp_type == HW_BREAKPOINT_W) 242 else if (entry->attr.bp_type == HW_BREAKPOINT_W)
@@ -277,21 +278,20 @@ static ssize_t ksym_trace_filter_write(struct file *file,
277{ 278{
278 struct trace_ksym *entry; 279 struct trace_ksym *entry;
279 struct hlist_node *node; 280 struct hlist_node *node;
280 char *input_string, *ksymname = NULL; 281 char *buf, *input_string, *ksymname = NULL;
281 unsigned long ksym_addr = 0; 282 unsigned long ksym_addr = 0;
282 int ret, op, changed = 0; 283 int ret, op, changed = 0;
283 284
284 input_string = kzalloc(count + 1, GFP_KERNEL); 285 buf = kzalloc(count + 1, GFP_KERNEL);
285 if (!input_string) 286 if (!buf)
286 return -ENOMEM; 287 return -ENOMEM;
287 288
288 if (copy_from_user(input_string, buffer, count)) { 289 ret = -EFAULT;
289 kfree(input_string); 290 if (copy_from_user(buf, buffer, count))
290 return -EFAULT; 291 goto out;
291 }
292 input_string[count] = '\0';
293 292
294 strstrip(input_string); 293 buf[count] = '\0';
294 input_string = strstrip(buf);
295 295
296 /* 296 /*
297 * Clear all breakpoints if: 297 * Clear all breakpoints if:
@@ -302,15 +302,13 @@ static ssize_t ksym_trace_filter_write(struct file *file,
302 if (!input_string[0] || !strcmp(input_string, "0") || 302 if (!input_string[0] || !strcmp(input_string, "0") ||
303 !strcmp(input_string, "*:---")) { 303 !strcmp(input_string, "*:---")) {
304 __ksym_trace_reset(); 304 __ksym_trace_reset();
305 kfree(input_string); 305 ret = 0;
306 return count; 306 goto out;
307 } 307 }
308 308
309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr); 309 ret = op = parse_ksym_trace_str(input_string, &ksymname, &ksym_addr);
310 if (ret < 0) { 310 if (ret < 0)
311 kfree(input_string); 311 goto out;
312 return ret;
313 }
314 312
315 mutex_lock(&ksym_tracer_mutex); 313 mutex_lock(&ksym_tracer_mutex);
316 314
@@ -321,7 +319,7 @@ static ssize_t ksym_trace_filter_write(struct file *file,
321 if (entry->attr.bp_type != op) 319 if (entry->attr.bp_type != op)
322 changed = 1; 320 changed = 1;
323 else 321 else
324 goto out; 322 goto out_unlock;
325 break; 323 break;
326 } 324 }
327 } 325 }
@@ -336,28 +334,24 @@ static ssize_t ksym_trace_filter_write(struct file *file,
336 if (IS_ERR(entry->ksym_hbp)) 334 if (IS_ERR(entry->ksym_hbp))
337 ret = PTR_ERR(entry->ksym_hbp); 335 ret = PTR_ERR(entry->ksym_hbp);
338 else 336 else
339 goto out; 337 goto out_unlock;
340 } 338 }
341 /* Error or "symbol:---" case: drop it */ 339 /* Error or "symbol:---" case: drop it */
342 ksym_filter_entry_count--; 340 ksym_filter_entry_count--;
343 hlist_del_rcu(&(entry->ksym_hlist)); 341 hlist_del_rcu(&(entry->ksym_hlist));
344 synchronize_rcu(); 342 synchronize_rcu();
345 kfree(entry); 343 kfree(entry);
346 goto out; 344 goto out_unlock;
347 } else { 345 } else {
348 /* Check for malformed request: (4) */ 346 /* Check for malformed request: (4) */
349 if (op == 0) 347 if (op)
350 goto out; 348 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
351 ret = process_new_ksym_entry(ksymname, op, ksym_addr);
352 } 349 }
353out: 350out_unlock:
354 mutex_unlock(&ksym_tracer_mutex); 351 mutex_unlock(&ksym_tracer_mutex);
355 352out:
356 kfree(input_string); 353 kfree(buf);
357 354 return !ret ? count : ret;
358 if (!ret)
359 ret = count;
360 return ret;
361} 355}
362 356
363static const struct file_operations ksym_tracing_fops = { 357static const struct file_operations ksym_tracing_fops = {
@@ -449,102 +443,77 @@ struct tracer ksym_tracer __read_mostly =
449 .print_line = ksym_trace_output 443 .print_line = ksym_trace_output
450}; 444};
451 445
452__init static int init_ksym_trace(void)
453{
454 struct dentry *d_tracer;
455 struct dentry *entry;
456
457 d_tracer = tracing_init_dentry();
458 ksym_filter_entry_count = 0;
459
460 entry = debugfs_create_file("ksym_trace_filter", 0644, d_tracer,
461 NULL, &ksym_tracing_fops);
462 if (!entry)
463 pr_warning("Could not create debugfs "
464 "'ksym_trace_filter' file\n");
465
466 return register_tracer(&ksym_tracer);
467}
468device_initcall(init_ksym_trace);
469
470
471#ifdef CONFIG_PROFILE_KSYM_TRACER 446#ifdef CONFIG_PROFILE_KSYM_TRACER
472static int ksym_tracer_stat_headers(struct seq_file *m) 447static int ksym_profile_show(struct seq_file *m, void *v)
473{ 448{
449 struct hlist_node *node;
450 struct trace_ksym *entry;
451 int access_type = 0;
452 char fn_name[KSYM_NAME_LEN];
453
474 seq_puts(m, " Access Type "); 454 seq_puts(m, " Access Type ");
475 seq_puts(m, " Symbol Counter\n"); 455 seq_puts(m, " Symbol Counter\n");
476 seq_puts(m, " ----------- "); 456 seq_puts(m, " ----------- ");
477 seq_puts(m, " ------ -------\n"); 457 seq_puts(m, " ------ -------\n");
478 return 0;
479}
480 458
481static int ksym_tracer_stat_show(struct seq_file *m, void *v) 459 rcu_read_lock();
482{ 460 hlist_for_each_entry_rcu(entry, node, &ksym_filter_head, ksym_hlist) {
483 struct hlist_node *stat = v;
484 struct trace_ksym *entry;
485 int access_type = 0;
486 char fn_name[KSYM_NAME_LEN];
487 461
488 entry = hlist_entry(stat, struct trace_ksym, ksym_hlist); 462 access_type = entry->attr.bp_type;
489 463
490 access_type = entry->attr.bp_type; 464 switch (access_type) {
465 case HW_BREAKPOINT_R:
466 seq_puts(m, " R ");
467 break;
468 case HW_BREAKPOINT_W:
469 seq_puts(m, " W ");
470 break;
471 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
472 seq_puts(m, " RW ");
473 break;
474 default:
475 seq_puts(m, " NA ");
476 }
491 477
492 switch (access_type) { 478 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
493 case HW_BREAKPOINT_R: 479 seq_printf(m, " %-36s", fn_name);
494 seq_puts(m, " R "); 480 else
495 break; 481 seq_printf(m, " %-36s", "<NA>");
496 case HW_BREAKPOINT_W: 482 seq_printf(m, " %15llu\n",
497 seq_puts(m, " W "); 483 (unsigned long long)atomic64_read(&entry->counter));
498 break;
499 case HW_BREAKPOINT_R | HW_BREAKPOINT_W:
500 seq_puts(m, " RW ");
501 break;
502 default:
503 seq_puts(m, " NA ");
504 } 484 }
505 485 rcu_read_unlock();
506 if (lookup_symbol_name(entry->attr.bp_addr, fn_name) >= 0)
507 seq_printf(m, " %-36s", fn_name);
508 else
509 seq_printf(m, " %-36s", "<NA>");
510 seq_printf(m, " %15lu\n", entry->counter);
511 486
512 return 0; 487 return 0;
513} 488}
514 489
515static void *ksym_tracer_stat_start(struct tracer_stat *trace) 490static int ksym_profile_open(struct inode *node, struct file *file)
516{ 491{
517 return ksym_filter_head.first; 492 return single_open(file, ksym_profile_show, NULL);
518} 493}
519 494
520static void * 495static const struct file_operations ksym_profile_fops = {
521ksym_tracer_stat_next(void *v, int idx) 496 .open = ksym_profile_open,
522{ 497 .read = seq_read,
523 struct hlist_node *stat = v; 498 .llseek = seq_lseek,
524 499 .release = single_release,
525 return stat->next;
526}
527
528static struct tracer_stat ksym_tracer_stats = {
529 .name = "ksym_tracer",
530 .stat_start = ksym_tracer_stat_start,
531 .stat_next = ksym_tracer_stat_next,
532 .stat_headers = ksym_tracer_stat_headers,
533 .stat_show = ksym_tracer_stat_show
534}; 500};
501#endif /* CONFIG_PROFILE_KSYM_TRACER */
535 502
536__init static int ksym_tracer_stat_init(void) 503__init static int init_ksym_trace(void)
537{ 504{
538 int ret; 505 struct dentry *d_tracer;
539 506
540 ret = register_stat_tracer(&ksym_tracer_stats); 507 d_tracer = tracing_init_dentry();
541 if (ret) {
542 printk(KERN_WARNING "Warning: could not register "
543 "ksym tracer stats\n");
544 return 1;
545 }
546 508
547 return 0; 509 trace_create_file("ksym_trace_filter", 0644, d_tracer,
510 NULL, &ksym_tracing_fops);
511
512#ifdef CONFIG_PROFILE_KSYM_TRACER
513 trace_create_file("ksym_profile", 0444, d_tracer,
514 NULL, &ksym_profile_fops);
515#endif
516
517 return register_tracer(&ksym_tracer);
548} 518}
549fs_initcall(ksym_tracer_stat_init); 519device_initcall(init_ksym_trace);
550#endif /* CONFIG_PROFILE_KSYM_TRACER */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b6c12c6a1bcd..8e46b3323cdc 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -23,13 +23,21 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
23 23
24static int next_event_type = __TRACE_LAST_TYPE + 1; 24static int next_event_type = __TRACE_LAST_TYPE + 1;
25 25
26void trace_print_seq(struct seq_file *m, struct trace_seq *s) 26int trace_print_seq(struct seq_file *m, struct trace_seq *s)
27{ 27{
28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; 28 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
29 int ret;
30
31 ret = seq_write(m, s->buffer, len);
29 32
30 seq_write(m, s->buffer, len); 33 /*
34 * Only reset this buffer if we successfully wrote to the
35 * seq_file buffer.
36 */
37 if (!ret)
38 trace_seq_init(s);
31 39
32 trace_seq_init(s); 40 return ret;
33} 41}
34 42
35enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) 43enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
@@ -85,7 +93,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
85 va_list ap; 93 va_list ap;
86 int ret; 94 int ret;
87 95
88 if (!len) 96 if (s->full || !len)
89 return 0; 97 return 0;
90 98
91 va_start(ap, fmt); 99 va_start(ap, fmt);
@@ -93,8 +101,10 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
93 va_end(ap); 101 va_end(ap);
94 102
95 /* If we can't write it all, don't bother writing anything */ 103 /* If we can't write it all, don't bother writing anything */
96 if (ret >= len) 104 if (ret >= len) {
105 s->full = 1;
97 return 0; 106 return 0;
107 }
98 108
99 s->len += ret; 109 s->len += ret;
100 110
@@ -119,14 +129,16 @@ trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
119 int len = (PAGE_SIZE - 1) - s->len; 129 int len = (PAGE_SIZE - 1) - s->len;
120 int ret; 130 int ret;
121 131
122 if (!len) 132 if (s->full || !len)
123 return 0; 133 return 0;
124 134
125 ret = vsnprintf(s->buffer + s->len, len, fmt, args); 135 ret = vsnprintf(s->buffer + s->len, len, fmt, args);
126 136
127 /* If we can't write it all, don't bother writing anything */ 137 /* If we can't write it all, don't bother writing anything */
128 if (ret >= len) 138 if (ret >= len) {
139 s->full = 1;
129 return 0; 140 return 0;
141 }
130 142
131 s->len += ret; 143 s->len += ret;
132 144
@@ -139,14 +151,16 @@ int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
139 int len = (PAGE_SIZE - 1) - s->len; 151 int len = (PAGE_SIZE - 1) - s->len;
140 int ret; 152 int ret;
141 153
142 if (!len) 154 if (s->full || !len)
143 return 0; 155 return 0;
144 156
145 ret = bstr_printf(s->buffer + s->len, len, fmt, binary); 157 ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
146 158
147 /* If we can't write it all, don't bother writing anything */ 159 /* If we can't write it all, don't bother writing anything */
148 if (ret >= len) 160 if (ret >= len) {
161 s->full = 1;
149 return 0; 162 return 0;
163 }
150 164
151 s->len += ret; 165 s->len += ret;
152 166
@@ -167,8 +181,13 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
167{ 181{
168 int len = strlen(str); 182 int len = strlen(str);
169 183
170 if (len > ((PAGE_SIZE - 1) - s->len)) 184 if (s->full)
185 return 0;
186
187 if (len > ((PAGE_SIZE - 1) - s->len)) {
188 s->full = 1;
171 return 0; 189 return 0;
190 }
172 191
173 memcpy(s->buffer + s->len, str, len); 192 memcpy(s->buffer + s->len, str, len);
174 s->len += len; 193 s->len += len;
@@ -178,9 +197,14 @@ int trace_seq_puts(struct trace_seq *s, const char *str)
178 197
179int trace_seq_putc(struct trace_seq *s, unsigned char c) 198int trace_seq_putc(struct trace_seq *s, unsigned char c)
180{ 199{
181 if (s->len >= (PAGE_SIZE - 1)) 200 if (s->full)
182 return 0; 201 return 0;
183 202
203 if (s->len >= (PAGE_SIZE - 1)) {
204 s->full = 1;
205 return 0;
206 }
207
184 s->buffer[s->len++] = c; 208 s->buffer[s->len++] = c;
185 209
186 return 1; 210 return 1;
@@ -188,9 +212,14 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
188 212
189int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
190{ 214{
191 if (len > ((PAGE_SIZE - 1) - s->len)) 215 if (s->full)
192 return 0; 216 return 0;
193 217
218 if (len > ((PAGE_SIZE - 1) - s->len)) {
219 s->full = 1;
220 return 0;
221 }
222
194 memcpy(s->buffer + s->len, mem, len); 223 memcpy(s->buffer + s->len, mem, len);
195 s->len += len; 224 s->len += len;
196 225
@@ -203,6 +232,9 @@ int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
203 const unsigned char *data = mem; 232 const unsigned char *data = mem;
204 int i, j; 233 int i, j;
205 234
235 if (s->full)
236 return 0;
237
206#ifdef __BIG_ENDIAN 238#ifdef __BIG_ENDIAN
207 for (i = 0, j = 0; i < len; i++) { 239 for (i = 0, j = 0; i < len; i++) {
208#else 240#else
@@ -220,8 +252,13 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
220{ 252{
221 void *ret; 253 void *ret;
222 254
223 if (len > ((PAGE_SIZE - 1) - s->len)) 255 if (s->full)
256 return 0;
257
258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1;
224 return NULL; 260 return NULL;
261 }
225 262
226 ret = s->buffer + s->len; 263 ret = s->buffer + s->len;
227 s->len += len; 264 s->len += len;
@@ -233,8 +270,14 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
233{ 270{
234 unsigned char *p; 271 unsigned char *p;
235 272
236 if (s->len >= (PAGE_SIZE - 1)) 273 if (s->full)
274 return 0;
275
276 if (s->len >= (PAGE_SIZE - 1)) {
277 s->full = 1;
237 return 0; 278 return 0;
279 }
280
238 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); 281 p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
239 if (!IS_ERR(p)) { 282 if (!IS_ERR(p)) {
240 p = mangle_path(s->buffer + s->len, p, "\n"); 283 p = mangle_path(s->buffer + s->len, p, "\n");
@@ -247,6 +290,7 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
247 return 1; 290 return 1;
248 } 291 }
249 292
293 s->full = 1;
250 return 0; 294 return 0;
251} 295}
252 296
@@ -373,6 +417,9 @@ int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
373 unsigned long vmstart = 0; 417 unsigned long vmstart = 0;
374 int ret = 1; 418 int ret = 1;
375 419
420 if (s->full)
421 return 0;
422
376 if (mm) { 423 if (mm) {
377 const struct vm_area_struct *vma; 424 const struct vm_area_struct *vma;
378 425
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 26185d727676..0271742abb8d 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -28,8 +28,8 @@ static int wakeup_current_cpu;
28static unsigned wakeup_prio = -1; 28static unsigned wakeup_prio = -1;
29static int wakeup_rt; 29static int wakeup_rt;
30 30
31static raw_spinlock_t wakeup_lock = 31static arch_spinlock_t wakeup_lock =
32 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 32 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
33 33
34static void __wakeup_reset(struct trace_array *tr); 34static void __wakeup_reset(struct trace_array *tr);
35 35
@@ -143,7 +143,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
143 goto out; 143 goto out;
144 144
145 local_irq_save(flags); 145 local_irq_save(flags);
146 __raw_spin_lock(&wakeup_lock); 146 arch_spin_lock(&wakeup_lock);
147 147
148 /* We could race with grabbing wakeup_lock */ 148 /* We could race with grabbing wakeup_lock */
149 if (unlikely(!tracer_enabled || next != wakeup_task)) 149 if (unlikely(!tracer_enabled || next != wakeup_task))
@@ -169,7 +169,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
169 169
170out_unlock: 170out_unlock:
171 __wakeup_reset(wakeup_trace); 171 __wakeup_reset(wakeup_trace);
172 __raw_spin_unlock(&wakeup_lock); 172 arch_spin_unlock(&wakeup_lock);
173 local_irq_restore(flags); 173 local_irq_restore(flags);
174out: 174out:
175 atomic_dec(&wakeup_trace->data[cpu]->disabled); 175 atomic_dec(&wakeup_trace->data[cpu]->disabled);
@@ -193,9 +193,9 @@ static void wakeup_reset(struct trace_array *tr)
193 tracing_reset_online_cpus(tr); 193 tracing_reset_online_cpus(tr);
194 194
195 local_irq_save(flags); 195 local_irq_save(flags);
196 __raw_spin_lock(&wakeup_lock); 196 arch_spin_lock(&wakeup_lock);
197 __wakeup_reset(tr); 197 __wakeup_reset(tr);
198 __raw_spin_unlock(&wakeup_lock); 198 arch_spin_unlock(&wakeup_lock);
199 local_irq_restore(flags); 199 local_irq_restore(flags);
200} 200}
201 201
@@ -225,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
225 goto out; 225 goto out;
226 226
227 /* interrupts should be off from try_to_wake_up */ 227 /* interrupts should be off from try_to_wake_up */
228 __raw_spin_lock(&wakeup_lock); 228 arch_spin_lock(&wakeup_lock);
229 229
230 /* check for races. */ 230 /* check for races. */
231 if (!tracer_enabled || p->prio >= wakeup_prio) 231 if (!tracer_enabled || p->prio >= wakeup_prio)
@@ -255,7 +255,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 255 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
256 256
257out_locked: 257out_locked:
258 __raw_spin_unlock(&wakeup_lock); 258 arch_spin_unlock(&wakeup_lock);
259out: 259out:
260 atomic_dec(&wakeup_trace->data[cpu]->disabled); 260 atomic_dec(&wakeup_trace->data[cpu]->disabled);
261} 261}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index dc98309e839a..280fea470d67 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
67 67
68 /* Don't allow flipping of max traces now */ 68 /* Don't allow flipping of max traces now */
69 local_irq_save(flags); 69 local_irq_save(flags);
70 __raw_spin_lock(&ftrace_max_lock); 70 arch_spin_lock(&ftrace_max_lock);
71 71
72 cnt = ring_buffer_entries(tr->buffer); 72 cnt = ring_buffer_entries(tr->buffer);
73 73
@@ -85,7 +85,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
85 break; 85 break;
86 } 86 }
87 tracing_on(); 87 tracing_on();
88 __raw_spin_unlock(&ftrace_max_lock); 88 arch_spin_unlock(&ftrace_max_lock);
89 local_irq_restore(flags); 89 local_irq_restore(flags);
90 90
91 if (count) 91 if (count)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8504ac71e4e8..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -27,8 +27,8 @@ static struct stack_trace max_stack_trace = {
27}; 27};
28 28
29static unsigned long max_stack_size; 29static unsigned long max_stack_size;
30static raw_spinlock_t max_stack_lock = 30static arch_spinlock_t max_stack_lock =
31 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 31 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
32 32
33static int stack_trace_disabled __read_mostly; 33static int stack_trace_disabled __read_mostly;
34static DEFINE_PER_CPU(int, trace_active); 34static DEFINE_PER_CPU(int, trace_active);
@@ -54,7 +54,7 @@ static inline void check_stack(void)
54 return; 54 return;
55 55
56 local_irq_save(flags); 56 local_irq_save(flags);
57 __raw_spin_lock(&max_stack_lock); 57 arch_spin_lock(&max_stack_lock);
58 58
59 /* a race could have already updated it */ 59 /* a race could have already updated it */
60 if (this_size <= max_stack_size) 60 if (this_size <= max_stack_size)
@@ -103,7 +103,7 @@ static inline void check_stack(void)
103 } 103 }
104 104
105 out: 105 out:
106 __raw_spin_unlock(&max_stack_lock); 106 arch_spin_unlock(&max_stack_lock);
107 local_irq_restore(flags); 107 local_irq_restore(flags);
108} 108}
109 109
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
174 __raw_spin_lock(&max_stack_lock); 175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 __raw_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,8 +218,14 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
210 __raw_spin_lock(&max_stack_lock); 224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
213 return SEQ_START_TOKEN; 231 return SEQ_START_TOKEN;
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
220 __raw_spin_unlock(&max_stack_lock); 238 int cpu;
239
240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 57501d90096a..cba47d7935cc 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -143,70 +143,65 @@ extern char *__bad_type_size(void);
143 #type, #name, offsetof(typeof(trace), name), \ 143 #type, #name, offsetof(typeof(trace), name), \
144 sizeof(trace.name), is_signed_type(type) 144 sizeof(trace.name), is_signed_type(type)
145 145
146int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) 146static
147int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
147{ 148{
148 int i; 149 int i;
149 int ret; 150 int pos = 0;
150 struct syscall_metadata *entry = call->data;
151 struct syscall_trace_enter trace;
152 int offset = offsetof(struct syscall_trace_enter, args);
153 151
154 ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 152 /* When len=0, we just calculate the needed length */
155 "\tsigned:%u;\n", 153#define LEN_OR_ZERO (len ? len - pos : 0)
156 SYSCALL_FIELD(int, nr));
157 if (!ret)
158 return 0;
159 154
155 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
160 for (i = 0; i < entry->nb_args; i++) { 156 for (i = 0; i < entry->nb_args; i++) {
161 ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], 157 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
162 entry->args[i]); 158 entry->args[i], sizeof(unsigned long),
163 if (!ret) 159 i == entry->nb_args - 1 ? "" : ", ");
164 return 0;
165 ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;"
166 "\tsigned:%u;\n", offset,
167 sizeof(unsigned long),
168 is_signed_type(unsigned long));
169 if (!ret)
170 return 0;
171 offset += sizeof(unsigned long);
172 } 160 }
161 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
173 162
174 trace_seq_puts(s, "\nprint fmt: \"");
175 for (i = 0; i < entry->nb_args; i++) { 163 for (i = 0; i < entry->nb_args; i++) {
176 ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], 164 pos += snprintf(buf + pos, LEN_OR_ZERO,
177 sizeof(unsigned long), 165 ", ((unsigned long)(REC->%s))", entry->args[i]);
178 i == entry->nb_args - 1 ? "" : ", ");
179 if (!ret)
180 return 0;
181 } 166 }
182 trace_seq_putc(s, '"');
183 167
184 for (i = 0; i < entry->nb_args; i++) { 168#undef LEN_OR_ZERO
185 ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))",
186 entry->args[i]);
187 if (!ret)
188 return 0;
189 }
190 169
191 return trace_seq_putc(s, '\n'); 170 /* return the length of print_fmt */
171 return pos;
192} 172}
193 173
194int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) 174static int set_syscall_print_fmt(struct ftrace_event_call *call)
195{ 175{
196 int ret; 176 char *print_fmt;
197 struct syscall_trace_exit trace; 177 int len;
178 struct syscall_metadata *entry = call->data;
198 179
199 ret = trace_seq_printf(s, 180 if (entry->enter_event != call) {
200 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" 181 call->print_fmt = "\"0x%lx\", REC->ret";
201 "\tsigned:%u;\n"
202 "\tfield:%s %s;\toffset:%zu;\tsize:%zu;"
203 "\tsigned:%u;\n",
204 SYSCALL_FIELD(int, nr),
205 SYSCALL_FIELD(long, ret));
206 if (!ret)
207 return 0; 182 return 0;
183 }
208 184
209 return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); 185 /* First: called with 0 length to calculate the needed length */
186 len = __set_enter_print_fmt(entry, NULL, 0);
187
188 print_fmt = kmalloc(len + 1, GFP_KERNEL);
189 if (!print_fmt)
190 return -ENOMEM;
191
192 /* Second: actually write the @print_fmt */
193 __set_enter_print_fmt(entry, print_fmt, len + 1);
194 call->print_fmt = print_fmt;
195
196 return 0;
197}
198
199static void free_syscall_print_fmt(struct ftrace_event_call *call)
200{
201 struct syscall_metadata *entry = call->data;
202
203 if (entry->enter_event == call)
204 kfree(call->print_fmt);
210} 205}
211 206
212int syscall_enter_define_fields(struct ftrace_event_call *call) 207int syscall_enter_define_fields(struct ftrace_event_call *call)
@@ -217,10 +212,6 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
217 int i; 212 int i;
218 int offset = offsetof(typeof(trace), args); 213 int offset = offsetof(typeof(trace), args);
219 214
220 ret = trace_define_common_fields(call);
221 if (ret)
222 return ret;
223
224 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 215 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
225 if (ret) 216 if (ret)
226 return ret; 217 return ret;
@@ -241,10 +232,6 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
241 struct syscall_trace_exit trace; 232 struct syscall_trace_exit trace;
242 int ret; 233 int ret;
243 234
244 ret = trace_define_common_fields(call);
245 if (ret)
246 return ret;
247
248 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 235 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
249 if (ret) 236 if (ret)
250 return ret; 237 return ret;
@@ -333,10 +320,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
333 mutex_lock(&syscall_trace_lock); 320 mutex_lock(&syscall_trace_lock);
334 if (!sys_refcount_enter) 321 if (!sys_refcount_enter)
335 ret = register_trace_sys_enter(ftrace_syscall_enter); 322 ret = register_trace_sys_enter(ftrace_syscall_enter);
336 if (ret) { 323 if (!ret) {
337 pr_info("event trace: Could not activate"
338 "syscall entry trace point");
339 } else {
340 set_bit(num, enabled_enter_syscalls); 324 set_bit(num, enabled_enter_syscalls);
341 sys_refcount_enter++; 325 sys_refcount_enter++;
342 } 326 }
@@ -370,10 +354,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
370 mutex_lock(&syscall_trace_lock); 354 mutex_lock(&syscall_trace_lock);
371 if (!sys_refcount_exit) 355 if (!sys_refcount_exit)
372 ret = register_trace_sys_exit(ftrace_syscall_exit); 356 ret = register_trace_sys_exit(ftrace_syscall_exit);
373 if (ret) { 357 if (!ret) {
374 pr_info("event trace: Could not activate"
375 "syscall exit trace point");
376 } else {
377 set_bit(num, enabled_exit_syscalls); 358 set_bit(num, enabled_exit_syscalls);
378 sys_refcount_exit++; 359 sys_refcount_exit++;
379 } 360 }
@@ -400,12 +381,22 @@ int init_syscall_trace(struct ftrace_event_call *call)
400{ 381{
401 int id; 382 int id;
402 383
403 id = register_ftrace_event(call->event); 384 if (set_syscall_print_fmt(call) < 0)
404 if (!id) 385 return -ENOMEM;
405 return -ENODEV; 386
406 call->id = id; 387 id = trace_event_raw_init(call);
407 INIT_LIST_HEAD(&call->fields); 388
408 return 0; 389 if (id < 0) {
390 free_syscall_print_fmt(call);
391 return id;
392 }
393
394 return id;
395}
396
397unsigned long __init arch_syscall_addr(int nr)
398{
399 return (unsigned long)sys_call_table[nr];
409} 400}
410 401
411int __init init_ftrace_syscalls(void) 402int __init init_ftrace_syscalls(void)
@@ -435,7 +426,7 @@ int __init init_ftrace_syscalls(void)
435} 426}
436core_initcall(init_ftrace_syscalls); 427core_initcall(init_ftrace_syscalls);
437 428
438#ifdef CONFIG_EVENT_PROFILE 429#ifdef CONFIG_PERF_EVENTS
439 430
440static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 431static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
441static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 432static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
@@ -447,12 +438,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
447 struct syscall_metadata *sys_data; 438 struct syscall_metadata *sys_data;
448 struct syscall_trace_enter *rec; 439 struct syscall_trace_enter *rec;
449 unsigned long flags; 440 unsigned long flags;
450 char *trace_buf;
451 char *raw_data;
452 int syscall_nr; 441 int syscall_nr;
453 int rctx; 442 int rctx;
454 int size; 443 int size;
455 int cpu;
456 444
457 syscall_nr = syscall_get_nr(current, regs); 445 syscall_nr = syscall_get_nr(current, regs);
458 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 446 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -471,37 +459,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
471 "profile buffer not large enough")) 459 "profile buffer not large enough"))
472 return; 460 return;
473 461
474 /* Protect the per cpu buffer, begin the rcu read side */ 462 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
475 local_irq_save(flags); 463 sys_data->enter_event->id, &rctx, &flags);
476 464 if (!rec)
477 rctx = perf_swevent_get_recursion_context(); 465 return;
478 if (rctx < 0)
479 goto end_recursion;
480
481 cpu = smp_processor_id();
482
483 trace_buf = rcu_dereference(perf_trace_buf);
484
485 if (!trace_buf)
486 goto end;
487
488 raw_data = per_cpu_ptr(trace_buf, cpu);
489
490 /* zero the dead bytes from align to not leak stack to user */
491 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
492 466
493 rec = (struct syscall_trace_enter *) raw_data;
494 tracing_generic_entry_update(&rec->ent, 0, 0);
495 rec->ent.type = sys_data->enter_event->id;
496 rec->nr = syscall_nr; 467 rec->nr = syscall_nr;
497 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 468 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
498 (unsigned long *)&rec->args); 469 (unsigned long *)&rec->args);
499 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); 470 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
500
501end:
502 perf_swevent_put_recursion_context(rctx);
503end_recursion:
504 local_irq_restore(flags);
505} 471}
506 472
507int prof_sysenter_enable(struct ftrace_event_call *call) 473int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -545,11 +511,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
545 struct syscall_trace_exit *rec; 511 struct syscall_trace_exit *rec;
546 unsigned long flags; 512 unsigned long flags;
547 int syscall_nr; 513 int syscall_nr;
548 char *trace_buf;
549 char *raw_data;
550 int rctx; 514 int rctx;
551 int size; 515 int size;
552 int cpu;
553 516
554 syscall_nr = syscall_get_nr(current, regs); 517 syscall_nr = syscall_get_nr(current, regs);
555 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 518 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -571,38 +534,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
571 "exit event has grown above profile buffer size")) 534 "exit event has grown above profile buffer size"))
572 return; 535 return;
573 536
574 /* Protect the per cpu buffer, begin the rcu read side */ 537 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
575 local_irq_save(flags); 538 sys_data->exit_event->id, &rctx, &flags);
576 539 if (!rec)
577 rctx = perf_swevent_get_recursion_context(); 540 return;
578 if (rctx < 0)
579 goto end_recursion;
580
581 cpu = smp_processor_id();
582
583 trace_buf = rcu_dereference(perf_trace_buf);
584
585 if (!trace_buf)
586 goto end;
587
588 raw_data = per_cpu_ptr(trace_buf, cpu);
589
590 /* zero the dead bytes from align to not leak stack to user */
591 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
592
593 rec = (struct syscall_trace_exit *)raw_data;
594 541
595 tracing_generic_entry_update(&rec->ent, 0, 0);
596 rec->ent.type = sys_data->exit_event->id;
597 rec->nr = syscall_nr; 542 rec->nr = syscall_nr;
598 rec->ret = syscall_get_return_value(current, regs); 543 rec->ret = syscall_get_return_value(current, regs);
599 544
600 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); 545 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
601
602end:
603 perf_swevent_put_recursion_context(rctx);
604end_recursion:
605 local_irq_restore(flags);
606} 546}
607 547
608int prof_sysexit_enable(struct ftrace_event_call *call) 548int prof_sysexit_enable(struct ftrace_event_call *call)
@@ -617,7 +557,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call)
617 ret = register_trace_sys_exit(prof_syscall_exit); 557 ret = register_trace_sys_exit(prof_syscall_exit);
618 if (ret) { 558 if (ret) {
619 pr_info("event trace: Could not activate" 559 pr_info("event trace: Could not activate"
620 "syscall entry trace point"); 560 "syscall exit trace point");
621 } else { 561 } else {
622 set_bit(num, enabled_prof_exit_syscalls); 562 set_bit(num, enabled_prof_exit_syscalls);
623 sys_prof_refcount_exit++; 563 sys_prof_refcount_exit++;
@@ -640,6 +580,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call)
640 mutex_unlock(&syscall_trace_lock); 580 mutex_unlock(&syscall_trace_lock);
641} 581}
642 582
643#endif 583#endif /* CONFIG_PERF_EVENTS */
644
645 584
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index f6693969287d..a7974a552ca9 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -93,6 +93,7 @@ static const struct stacktrace_ops backtrace_ops = {
93 .warning_symbol = backtrace_warning_symbol, 93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack, 94 .stack = backtrace_stack,
95 .address = backtrace_address, 95 .address = backtrace_address,
96 .walk_stack = print_context_stack,
96}; 97};
97 98
98static int 99static int
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 67e526b6ae81..dee48658805c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,116 @@ struct workqueue_struct {
68#endif 68#endif
69}; 69};
70 70
71#ifdef CONFIG_DEBUG_OBJECTS_WORK
72
73static struct debug_obj_descr work_debug_descr;
74
75/*
76 * fixup_init is called when:
77 * - an active object is initialized
78 */
79static int work_fixup_init(void *addr, enum debug_obj_state state)
80{
81 struct work_struct *work = addr;
82
83 switch (state) {
84 case ODEBUG_STATE_ACTIVE:
85 cancel_work_sync(work);
86 debug_object_init(work, &work_debug_descr);
87 return 1;
88 default:
89 return 0;
90 }
91}
92
93/*
94 * fixup_activate is called when:
95 * - an active object is activated
96 * - an unknown object is activated (might be a statically initialized object)
97 */
98static int work_fixup_activate(void *addr, enum debug_obj_state state)
99{
100 struct work_struct *work = addr;
101
102 switch (state) {
103
104 case ODEBUG_STATE_NOTAVAILABLE:
105 /*
106 * This is not really a fixup. The work struct was
107 * statically initialized. We just make sure that it
108 * is tracked in the object tracker.
109 */
110 if (test_bit(WORK_STRUCT_STATIC, work_data_bits(work))) {
111 debug_object_init(work, &work_debug_descr);
112 debug_object_activate(work, &work_debug_descr);
113 return 0;
114 }
115 WARN_ON_ONCE(1);
116 return 0;
117
118 case ODEBUG_STATE_ACTIVE:
119 WARN_ON(1);
120
121 default:
122 return 0;
123 }
124}
125
126/*
127 * fixup_free is called when:
128 * - an active object is freed
129 */
130static int work_fixup_free(void *addr, enum debug_obj_state state)
131{
132 struct work_struct *work = addr;
133
134 switch (state) {
135 case ODEBUG_STATE_ACTIVE:
136 cancel_work_sync(work);
137 debug_object_free(work, &work_debug_descr);
138 return 1;
139 default:
140 return 0;
141 }
142}
143
144static struct debug_obj_descr work_debug_descr = {
145 .name = "work_struct",
146 .fixup_init = work_fixup_init,
147 .fixup_activate = work_fixup_activate,
148 .fixup_free = work_fixup_free,
149};
150
151static inline void debug_work_activate(struct work_struct *work)
152{
153 debug_object_activate(work, &work_debug_descr);
154}
155
156static inline void debug_work_deactivate(struct work_struct *work)
157{
158 debug_object_deactivate(work, &work_debug_descr);
159}
160
161void __init_work(struct work_struct *work, int onstack)
162{
163 if (onstack)
164 debug_object_init_on_stack(work, &work_debug_descr);
165 else
166 debug_object_init(work, &work_debug_descr);
167}
168EXPORT_SYMBOL_GPL(__init_work);
169
170void destroy_work_on_stack(struct work_struct *work)
171{
172 debug_object_free(work, &work_debug_descr);
173}
174EXPORT_SYMBOL_GPL(destroy_work_on_stack);
175
176#else
177static inline void debug_work_activate(struct work_struct *work) { }
178static inline void debug_work_deactivate(struct work_struct *work) { }
179#endif
180
71/* Serializes the accesses to the list of workqueues. */ 181/* Serializes the accesses to the list of workqueues. */
72static DEFINE_SPINLOCK(workqueue_lock); 182static DEFINE_SPINLOCK(workqueue_lock);
73static LIST_HEAD(workqueues); 183static LIST_HEAD(workqueues);
@@ -145,6 +255,7 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
145{ 255{
146 unsigned long flags; 256 unsigned long flags;
147 257
258 debug_work_activate(work);
148 spin_lock_irqsave(&cwq->lock, flags); 259 spin_lock_irqsave(&cwq->lock, flags);
149 insert_work(cwq, work, &cwq->worklist); 260 insert_work(cwq, work, &cwq->worklist);
150 spin_unlock_irqrestore(&cwq->lock, flags); 261 spin_unlock_irqrestore(&cwq->lock, flags);
@@ -280,6 +391,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
280 struct lockdep_map lockdep_map = work->lockdep_map; 391 struct lockdep_map lockdep_map = work->lockdep_map;
281#endif 392#endif
282 trace_workqueue_execution(cwq->thread, work); 393 trace_workqueue_execution(cwq->thread, work);
394 debug_work_deactivate(work);
283 cwq->current_work = work; 395 cwq->current_work = work;
284 list_del_init(cwq->worklist.next); 396 list_del_init(cwq->worklist.next);
285 spin_unlock_irq(&cwq->lock); 397 spin_unlock_irq(&cwq->lock);
@@ -350,11 +462,18 @@ static void wq_barrier_func(struct work_struct *work)
350static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 462static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
351 struct wq_barrier *barr, struct list_head *head) 463 struct wq_barrier *barr, struct list_head *head)
352{ 464{
353 INIT_WORK(&barr->work, wq_barrier_func); 465 /*
466 * debugobject calls are safe here even with cwq->lock locked
467 * as we know for sure that this will not trigger any of the
468 * checks and call back into the fixup functions where we
469 * might deadlock.
470 */
471 INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
354 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 472 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
355 473
356 init_completion(&barr->done); 474 init_completion(&barr->done);
357 475
476 debug_work_activate(&barr->work);
358 insert_work(cwq, &barr->work, head); 477 insert_work(cwq, &barr->work, head);
359} 478}
360 479
@@ -372,8 +491,10 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
372 } 491 }
373 spin_unlock_irq(&cwq->lock); 492 spin_unlock_irq(&cwq->lock);
374 493
375 if (active) 494 if (active) {
376 wait_for_completion(&barr.done); 495 wait_for_completion(&barr.done);
496 destroy_work_on_stack(&barr.work);
497 }
377 498
378 return active; 499 return active;
379} 500}
@@ -451,6 +572,7 @@ out:
451 return 0; 572 return 0;
452 573
453 wait_for_completion(&barr.done); 574 wait_for_completion(&barr.done);
575 destroy_work_on_stack(&barr.work);
454 return 1; 576 return 1;
455} 577}
456EXPORT_SYMBOL_GPL(flush_work); 578EXPORT_SYMBOL_GPL(flush_work);
@@ -485,6 +607,7 @@ static int try_to_grab_pending(struct work_struct *work)
485 */ 607 */
486 smp_rmb(); 608 smp_rmb();
487 if (cwq == get_wq_data(work)) { 609 if (cwq == get_wq_data(work)) {
610 debug_work_deactivate(work);
488 list_del_init(&work->entry); 611 list_del_init(&work->entry);
489 ret = 1; 612 ret = 1;
490 } 613 }
@@ -507,8 +630,10 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
507 } 630 }
508 spin_unlock_irq(&cwq->lock); 631 spin_unlock_irq(&cwq->lock);
509 632
510 if (unlikely(running)) 633 if (unlikely(running)) {
511 wait_for_completion(&barr.done); 634 wait_for_completion(&barr.done);
635 destroy_work_on_stack(&barr.work);
636 }
512} 637}
513 638
514static void wait_on_work(struct work_struct *work) 639static void wait_on_work(struct work_struct *work)